summaryrefslogtreecommitdiff
path: root/kernel
diff options
context:
space:
mode:
Diffstat (limited to 'kernel')
-rw-r--r--kernel/Kconfig.locks2
-rw-r--r--kernel/Makefile8
-rw-r--r--kernel/audit.c8
-rw-r--r--kernel/audit_tree.c2
-rw-r--r--kernel/audit_watch.c85
-rw-r--r--kernel/auditfilter.c10
-rw-r--r--kernel/auditsc.c29
-rw-r--r--kernel/bounds.c2
-rw-r--r--kernel/capability.c112
-rw-r--r--kernel/cgroup.c678
-rw-r--r--kernel/cgroup_freezer.c26
-rw-r--r--kernel/compat.c191
-rw-r--r--kernel/cpu.c13
-rw-r--r--kernel/cpuset.c175
-rw-r--r--kernel/crash_dump.c34
-rw-r--r--kernel/cred.c16
-rw-r--r--kernel/debug/debug_core.c2
-rw-r--r--kernel/debug/gdbstub.c30
-rw-r--r--kernel/debug/kdb/kdb_main.c10
-rw-r--r--kernel/debug/kdb/kdb_support.c2
-rw-r--r--kernel/events/Makefile6
-rw-r--r--kernel/events/core.c (renamed from kernel/perf_event.c)1085
-rw-r--r--kernel/events/hw_breakpoint.c (renamed from kernel/hw_breakpoint.c)0
-rw-r--r--kernel/exit.c115
-rw-r--r--kernel/extable.c18
-rw-r--r--kernel/fork.c270
-rw-r--r--kernel/freezer.c4
-rw-r--r--kernel/futex.c160
-rw-r--r--kernel/futex_compat.c11
-rw-r--r--kernel/gcov/Kconfig2
-rw-r--r--kernel/gcov/Makefile2
-rw-r--r--kernel/groups.c2
-rw-r--r--kernel/hrtimer.c228
-rw-r--r--kernel/hung_task.c2
-rw-r--r--kernel/irq/Kconfig50
-rw-r--r--kernel/irq/Makefile1
-rw-r--r--kernel/irq/autoprobe.c52
-rw-r--r--kernel/irq/chip.c699
-rw-r--r--kernel/irq/debug.h45
-rw-r--r--kernel/irq/dummychip.c9
-rw-r--r--kernel/irq/generic-chip.c354
-rw-r--r--kernel/irq/handle.c129
-rw-r--r--kernel/irq/internals.h155
-rw-r--r--kernel/irq/irqdesc.c97
-rw-r--r--kernel/irq/manage.c632
-rw-r--r--kernel/irq/migration.c29
-rw-r--r--kernel/irq/pm.c30
-rw-r--r--kernel/irq/proc.c143
-rw-r--r--kernel/irq/resend.c16
-rw-r--r--kernel/irq/settings.h142
-rw-r--r--kernel/irq/spurious.c162
-rw-r--r--kernel/jump_label.c547
-rw-r--r--kernel/kallsyms.c58
-rw-r--r--kernel/kexec.c17
-rw-r--r--kernel/kmod.c116
-rw-r--r--kernel/ksysfs.c10
-rw-r--r--kernel/kthread.c37
-rw-r--r--kernel/latencytop.c2
-rw-r--r--kernel/lockdep.c210
-rw-r--r--kernel/lockdep_proc.c9
-rw-r--r--kernel/module.c119
-rw-r--r--kernel/mutex-debug.c2
-rw-r--r--kernel/mutex-debug.h2
-rw-r--r--kernel/mutex.c36
-rw-r--r--kernel/mutex.h2
-rw-r--r--kernel/ns_cgroup.c118
-rw-r--r--kernel/nsproxy.c50
-rw-r--r--kernel/padata.c8
-rw-r--r--kernel/panic.c10
-rw-r--r--kernel/params.c25
-rw-r--r--kernel/pid.c7
-rw-r--r--kernel/pid_namespace.c11
-rw-r--r--kernel/pm_qos_params.c57
-rw-r--r--kernel/posix-cpu-timers.c112
-rw-r--r--kernel/posix-timers.c369
-rw-r--r--kernel/power/Kconfig239
-rw-r--r--kernel/power/Makefile3
-rw-r--r--kernel/power/block_io.c2
-rw-r--r--kernel/power/hibernate.c273
-rw-r--r--kernel/power/main.c6
-rw-r--r--kernel/power/power.h4
-rw-r--r--kernel/power/snapshot.c25
-rw-r--r--kernel/power/suspend.c11
-rw-r--r--kernel/power/user.c5
-rw-r--r--kernel/printk.c261
-rw-r--r--kernel/profile.c22
-rw-r--r--kernel/ptrace.c164
-rw-r--r--kernel/rcupdate.c28
-rw-r--r--kernel/rcutiny.c46
-rw-r--r--kernel/rcutiny_plugin.h205
-rw-r--r--kernel/rcutorture.c27
-rw-r--r--kernel/rcutree.c647
-rw-r--r--kernel/rcutree.h118
-rw-r--r--kernel/rcutree_plugin.h562
-rw-r--r--kernel/rcutree_trace.c192
-rw-r--r--kernel/res_counter.c14
-rw-r--r--kernel/rtmutex-debug.c1
-rw-r--r--kernel/rtmutex-tester.c40
-rw-r--r--kernel/rtmutex.c318
-rw-r--r--kernel/rtmutex_common.h16
-rw-r--r--kernel/sched.c2159
-rw-r--r--kernel/sched_autogroup.c17
-rw-r--r--kernel/sched_autogroup.h5
-rw-r--r--kernel/sched_debug.c8
-rw-r--r--kernel/sched_fair.c601
-rw-r--r--kernel/sched_features.h6
-rw-r--r--kernel/sched_idletask.c30
-rw-r--r--kernel/sched_rt.c110
-rw-r--r--kernel/sched_stats.h4
-rw-r--r--kernel/sched_stoptask.c14
-rw-r--r--kernel/signal.c869
-rw-r--r--kernel/smp.c152
-rw-r--r--kernel/softirq.c33
-rw-r--r--kernel/stop_machine.c6
-rw-r--r--kernel/sys.c86
-rw-r--r--kernel/sys_ni.c14
-rw-r--r--kernel/sysctl.c71
-rw-r--r--kernel/sysctl_binary.c19
-rw-r--r--kernel/sysctl_check.c10
-rw-r--r--kernel/taskstats.c2
-rw-r--r--kernel/time.c35
-rw-r--r--kernel/time/Makefile3
-rw-r--r--kernel/time/alarmtimer.c702
-rw-r--r--kernel/time/clockevents.c65
-rw-r--r--kernel/time/clocksource.c42
-rw-r--r--kernel/time/jiffies.c22
-rw-r--r--kernel/time/ntp.c15
-rw-r--r--kernel/time/posix-clock.c445
-rw-r--r--kernel/time/tick-broadcast.c29
-rw-r--r--kernel/time/tick-common.c1
-rw-r--r--kernel/time/tick-internal.h9
-rw-r--r--kernel/time/tick-oneshot.c1
-rw-r--r--kernel/time/tick-sched.c1
-rw-r--r--kernel/time/timekeeping.c241
-rw-r--r--kernel/time/timer_stats.c2
-rw-r--r--kernel/timer.c42
-rw-r--r--kernel/trace/Kconfig6
-rw-r--r--kernel/trace/blktrace.c48
-rw-r--r--kernel/trace/ftrace.c1337
-rw-r--r--kernel/trace/ring_buffer.c40
-rw-r--r--kernel/trace/trace.c56
-rw-r--r--kernel/trace/trace.h58
-rw-r--r--kernel/trace/trace_clock.c2
-rw-r--r--kernel/trace/trace_entries.h8
-rw-r--r--kernel/trace/trace_events.c10
-rw-r--r--kernel/trace/trace_events_filter.c885
-rw-r--r--kernel/trace/trace_functions.c2
-rw-r--r--kernel/trace/trace_functions_graph.c2
-rw-r--r--kernel/trace/trace_irqsoff.c3
-rw-r--r--kernel/trace/trace_kprobe.c114
-rw-r--r--kernel/trace/trace_output.c66
-rw-r--r--kernel/trace/trace_printk.c120
-rw-r--r--kernel/trace/trace_sched_switch.c48
-rw-r--r--kernel/trace/trace_sched_wakeup.c1
-rw-r--r--kernel/trace/trace_selftest.c214
-rw-r--r--kernel/trace/trace_selftest_dynamic.c6
-rw-r--r--kernel/trace/trace_stack.c1
-rw-r--r--kernel/trace/trace_syscalls.c42
-rw-r--r--kernel/tracepoint.c23
-rw-r--r--kernel/uid16.c2
-rw-r--r--kernel/user-return-notifier.c2
-rw-r--r--kernel/user.c8
-rw-r--r--kernel/utsname.c51
-rw-r--r--kernel/wait.c2
-rw-r--r--kernel/watchdog.c91
-rw-r--r--kernel/workqueue.c30
166 files changed, 13905 insertions, 6492 deletions
diff --git a/kernel/Kconfig.locks b/kernel/Kconfig.locks
index 88c92fb44618..5068e2a4e75f 100644
--- a/kernel/Kconfig.locks
+++ b/kernel/Kconfig.locks
@@ -199,4 +199,4 @@ config INLINE_WRITE_UNLOCK_IRQRESTORE
def_bool !DEBUG_SPINLOCK && ARCH_INLINE_WRITE_UNLOCK_IRQRESTORE
config MUTEX_SPIN_ON_OWNER
- def_bool SMP && !DEBUG_MUTEXES && !HAVE_DEFAULT_NO_SPIN_MUTEXES
+ def_bool SMP && !DEBUG_MUTEXES
diff --git a/kernel/Makefile b/kernel/Makefile
index 353d3fe8ba33..2d64cfcc8b42 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -21,7 +21,6 @@ CFLAGS_REMOVE_mutex-debug.o = -pg
CFLAGS_REMOVE_rtmutex-debug.o = -pg
CFLAGS_REMOVE_cgroup-debug.o = -pg
CFLAGS_REMOVE_sched_clock.o = -pg
-CFLAGS_REMOVE_perf_event.o = -pg
CFLAGS_REMOVE_irq_work.o = -pg
endif
@@ -62,7 +61,6 @@ obj-$(CONFIG_COMPAT) += compat.o
obj-$(CONFIG_CGROUPS) += cgroup.o
obj-$(CONFIG_CGROUP_FREEZER) += cgroup_freezer.o
obj-$(CONFIG_CPUSETS) += cpuset.o
-obj-$(CONFIG_CGROUP_NS) += ns_cgroup.o
obj-$(CONFIG_UTS_NS) += utsname.o
obj-$(CONFIG_USER_NS) += user_namespace.o
obj-$(CONFIG_PID_NS) += pid_namespace.o
@@ -103,10 +101,12 @@ obj-$(CONFIG_RING_BUFFER) += trace/
obj-$(CONFIG_TRACEPOINTS) += trace/
obj-$(CONFIG_SMP) += sched_cpupri.o
obj-$(CONFIG_IRQ_WORK) += irq_work.o
-obj-$(CONFIG_PERF_EVENTS) += perf_event.o
-obj-$(CONFIG_HAVE_HW_BREAKPOINT) += hw_breakpoint.o
+
+obj-$(CONFIG_PERF_EVENTS) += events/
+
obj-$(CONFIG_USER_RETURN_NOTIFIER) += user-return-notifier.o
obj-$(CONFIG_PADATA) += padata.o
+obj-$(CONFIG_CRASH_DUMP) += crash_dump.o
ifneq ($(CONFIG_SCHED_OMIT_FRAME_POINTER),y)
# According to Alan Modra <alan@linuxcare.com.au>, the -fno-omit-frame-pointer is
diff --git a/kernel/audit.c b/kernel/audit.c
index e4956244ae50..939500317066 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -74,6 +74,8 @@ static int audit_initialized;
int audit_enabled;
int audit_ever_enabled;
+EXPORT_SYMBOL_GPL(audit_enabled);
+
/* Default state when kernel boots without any parameters. */
static int audit_default;
@@ -671,9 +673,9 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
pid = NETLINK_CREDS(skb)->pid;
uid = NETLINK_CREDS(skb)->uid;
- loginuid = NETLINK_CB(skb).loginuid;
- sessionid = NETLINK_CB(skb).sessionid;
- sid = NETLINK_CB(skb).sid;
+ loginuid = audit_get_loginuid(current);
+ sessionid = audit_get_sessionid(current);
+ security_task_getsecid(current, &sid);
seq = nlh->nlmsg_seq;
data = NLMSG_DATA(nlh);
diff --git a/kernel/audit_tree.c b/kernel/audit_tree.c
index 37b2bea170c8..e99dda04b126 100644
--- a/kernel/audit_tree.c
+++ b/kernel/audit_tree.c
@@ -607,7 +607,7 @@ void audit_trim_trees(void)
spin_lock(&hash_lock);
list_for_each_entry(node, &tree->chunks, list) {
struct audit_chunk *chunk = find_chunk(node);
- /* this could be NULL if the watch is dieing else where... */
+ /* this could be NULL if the watch is dying else where... */
struct inode *inode = chunk->mark.i.inode;
node->index |= 1U<<31;
if (iterate_mounts(compare_root, inode, root_mnt))
diff --git a/kernel/audit_watch.c b/kernel/audit_watch.c
index d2e3c7866460..e683869365d9 100644
--- a/kernel/audit_watch.c
+++ b/kernel/audit_watch.c
@@ -144,9 +144,9 @@ int audit_watch_compare(struct audit_watch *watch, unsigned long ino, dev_t dev)
}
/* Initialize a parent watch entry. */
-static struct audit_parent *audit_init_parent(struct nameidata *ndp)
+static struct audit_parent *audit_init_parent(struct path *path)
{
- struct inode *inode = ndp->path.dentry->d_inode;
+ struct inode *inode = path->dentry->d_inode;
struct audit_parent *parent;
int ret;
@@ -353,53 +353,40 @@ static void audit_remove_parent_watches(struct audit_parent *parent)
}
/* Get path information necessary for adding watches. */
-static int audit_get_nd(char *path, struct nameidata **ndp, struct nameidata **ndw)
+static int audit_get_nd(struct audit_watch *watch, struct path *parent)
{
- struct nameidata *ndparent, *ndwatch;
+ struct nameidata nd;
+ struct dentry *d;
int err;
- ndparent = kmalloc(sizeof(*ndparent), GFP_KERNEL);
- if (unlikely(!ndparent))
- return -ENOMEM;
+ err = kern_path_parent(watch->path, &nd);
+ if (err)
+ return err;
- ndwatch = kmalloc(sizeof(*ndwatch), GFP_KERNEL);
- if (unlikely(!ndwatch)) {
- kfree(ndparent);
- return -ENOMEM;
+ if (nd.last_type != LAST_NORM) {
+ path_put(&nd.path);
+ return -EINVAL;
}
- err = path_lookup(path, LOOKUP_PARENT, ndparent);
- if (err) {
- kfree(ndparent);
- kfree(ndwatch);
- return err;
+ mutex_lock_nested(&nd.path.dentry->d_inode->i_mutex, I_MUTEX_PARENT);
+ d = lookup_one_len(nd.last.name, nd.path.dentry, nd.last.len);
+ if (IS_ERR(d)) {
+ mutex_unlock(&nd.path.dentry->d_inode->i_mutex);
+ path_put(&nd.path);
+ return PTR_ERR(d);
}
-
- err = path_lookup(path, 0, ndwatch);
- if (err) {
- kfree(ndwatch);
- ndwatch = NULL;
+ if (d->d_inode) {
+ /* update watch filter fields */
+ watch->dev = d->d_inode->i_sb->s_dev;
+ watch->ino = d->d_inode->i_ino;
}
+ mutex_unlock(&nd.path.dentry->d_inode->i_mutex);
- *ndp = ndparent;
- *ndw = ndwatch;
-
+ *parent = nd.path;
+ dput(d);
return 0;
}
-/* Release resources used for watch path information. */
-static void audit_put_nd(struct nameidata *ndp, struct nameidata *ndw)
-{
- if (ndp) {
- path_put(&ndp->path);
- kfree(ndp);
- }
- if (ndw) {
- path_put(&ndw->path);
- kfree(ndw);
- }
-}
-
/* Associate the given rule with an existing parent.
* Caller must hold audit_filter_mutex. */
static void audit_add_to_parent(struct audit_krule *krule,
@@ -440,31 +427,24 @@ int audit_add_watch(struct audit_krule *krule, struct list_head **list)
{
struct audit_watch *watch = krule->watch;
struct audit_parent *parent;
- struct nameidata *ndp = NULL, *ndw = NULL;
+ struct path parent_path;
int h, ret = 0;
mutex_unlock(&audit_filter_mutex);
/* Avoid calling path_lookup under audit_filter_mutex. */
- ret = audit_get_nd(watch->path, &ndp, &ndw);
- if (ret) {
- /* caller expects mutex locked */
- mutex_lock(&audit_filter_mutex);
- goto error;
- }
+ ret = audit_get_nd(watch, &parent_path);
+ /* caller expects mutex locked */
mutex_lock(&audit_filter_mutex);
- /* update watch filter fields */
- if (ndw) {
- watch->dev = ndw->path.dentry->d_inode->i_sb->s_dev;
- watch->ino = ndw->path.dentry->d_inode->i_ino;
- }
+ if (ret)
+ return ret;
/* either find an old parent or attach a new one */
- parent = audit_find_parent(ndp->path.dentry->d_inode);
+ parent = audit_find_parent(parent_path.dentry->d_inode);
if (!parent) {
- parent = audit_init_parent(ndp);
+ parent = audit_init_parent(&parent_path);
if (IS_ERR(parent)) {
ret = PTR_ERR(parent);
goto error;
@@ -479,9 +459,8 @@ int audit_add_watch(struct audit_krule *krule, struct list_head **list)
h = audit_hash_ino((u32)watch->ino);
*list = &audit_inode_hash[h];
error:
- audit_put_nd(ndp, ndw); /* NULL args OK */
+ path_put(&parent_path);
return ret;
-
}
void audit_remove_watch_rule(struct audit_krule *krule)
diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c
index add2819af71b..f8277c80d678 100644
--- a/kernel/auditfilter.c
+++ b/kernel/auditfilter.c
@@ -1238,6 +1238,7 @@ static int audit_filter_user_rules(struct netlink_skb_parms *cb,
for (i = 0; i < rule->field_count; i++) {
struct audit_field *f = &rule->fields[i];
int result = 0;
+ u32 sid;
switch (f->type) {
case AUDIT_PID:
@@ -1250,19 +1251,22 @@ static int audit_filter_user_rules(struct netlink_skb_parms *cb,
result = audit_comparator(cb->creds.gid, f->op, f->val);
break;
case AUDIT_LOGINUID:
- result = audit_comparator(cb->loginuid, f->op, f->val);
+ result = audit_comparator(audit_get_loginuid(current),
+ f->op, f->val);
break;
case AUDIT_SUBJ_USER:
case AUDIT_SUBJ_ROLE:
case AUDIT_SUBJ_TYPE:
case AUDIT_SUBJ_SEN:
case AUDIT_SUBJ_CLR:
- if (f->lsm_rule)
- result = security_audit_rule_match(cb->sid,
+ if (f->lsm_rule) {
+ security_task_getsecid(current, &sid);
+ result = security_audit_rule_match(sid,
f->type,
f->op,
f->lsm_rule,
NULL);
+ }
break;
}
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index f49a0318c2ed..00d79df03e76 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -443,17 +443,25 @@ static int match_tree_refs(struct audit_context *ctx, struct audit_tree *tree)
/* Determine if any context name data matches a rule's watch data */
/* Compare a task_struct with an audit_rule. Return 1 on match, 0
- * otherwise. */
+ * otherwise.
+ *
+ * If task_creation is true, this is an explicit indication that we are
+ * filtering a task rule at task creation time. This and tsk == current are
+ * the only situations where tsk->cred may be accessed without an rcu read lock.
+ */
static int audit_filter_rules(struct task_struct *tsk,
struct audit_krule *rule,
struct audit_context *ctx,
struct audit_names *name,
- enum audit_state *state)
+ enum audit_state *state,
+ bool task_creation)
{
- const struct cred *cred = get_task_cred(tsk);
+ const struct cred *cred;
int i, j, need_sid = 1;
u32 sid;
+ cred = rcu_dereference_check(tsk->cred, tsk == current || task_creation);
+
for (i = 0; i < rule->field_count; i++) {
struct audit_field *f = &rule->fields[i];
int result = 0;
@@ -637,10 +645,8 @@ static int audit_filter_rules(struct task_struct *tsk,
break;
}
- if (!result) {
- put_cred(cred);
+ if (!result)
return 0;
- }
}
if (ctx) {
@@ -656,7 +662,6 @@ static int audit_filter_rules(struct task_struct *tsk,
case AUDIT_NEVER: *state = AUDIT_DISABLED; break;
case AUDIT_ALWAYS: *state = AUDIT_RECORD_CONTEXT; break;
}
- put_cred(cred);
return 1;
}
@@ -671,7 +676,8 @@ static enum audit_state audit_filter_task(struct task_struct *tsk, char **key)
rcu_read_lock();
list_for_each_entry_rcu(e, &audit_filter_list[AUDIT_FILTER_TASK], list) {
- if (audit_filter_rules(tsk, &e->rule, NULL, NULL, &state)) {
+ if (audit_filter_rules(tsk, &e->rule, NULL, NULL,
+ &state, true)) {
if (state == AUDIT_RECORD_CONTEXT)
*key = kstrdup(e->rule.filterkey, GFP_ATOMIC);
rcu_read_unlock();
@@ -705,7 +711,7 @@ static enum audit_state audit_filter_syscall(struct task_struct *tsk,
list_for_each_entry_rcu(e, list, list) {
if ((e->rule.mask[word] & bit) == bit &&
audit_filter_rules(tsk, &e->rule, ctx, NULL,
- &state)) {
+ &state, false)) {
rcu_read_unlock();
ctx->current_state = state;
return state;
@@ -743,7 +749,8 @@ void audit_filter_inodes(struct task_struct *tsk, struct audit_context *ctx)
list_for_each_entry_rcu(e, list, list) {
if ((e->rule.mask[word] & bit) == bit &&
- audit_filter_rules(tsk, &e->rule, ctx, n, &state)) {
+ audit_filter_rules(tsk, &e->rule, ctx, n,
+ &state, false)) {
rcu_read_unlock();
ctx->current_state = state;
return;
@@ -1011,7 +1018,7 @@ static int audit_log_pid_context(struct audit_context *context, pid_t pid,
/*
* to_send and len_sent accounting are very loose estimates. We aren't
* really worried about a hard cap to MAX_EXECVE_AUDIT_LEN so much as being
- * within about 500 bytes (next page boundry)
+ * within about 500 bytes (next page boundary)
*
* why snprintf? an int is up to 12 digits long. if we just assumed when
* logging that a[%d]= was going to be 16 characters long we would be wasting
diff --git a/kernel/bounds.c b/kernel/bounds.c
index 98a51f26c136..0c9b862292b2 100644
--- a/kernel/bounds.c
+++ b/kernel/bounds.c
@@ -9,11 +9,13 @@
#include <linux/page-flags.h>
#include <linux/mmzone.h>
#include <linux/kbuild.h>
+#include <linux/page_cgroup.h>
void foo(void)
{
/* The enum constants to put into include/generated/bounds.h */
DEFINE(NR_PAGEFLAGS, __NR_PAGEFLAGS);
DEFINE(MAX_NR_ZONES, __MAX_NR_ZONES);
+ DEFINE(NR_PCG_FLAGS, __NR_PCG_FLAGS);
/* End of constants */
}
diff --git a/kernel/capability.c b/kernel/capability.c
index 9e9385f132c8..283c529f8b1c 100644
--- a/kernel/capability.c
+++ b/kernel/capability.c
@@ -14,6 +14,7 @@
#include <linux/security.h>
#include <linux/syscalls.h>
#include <linux/pid_namespace.h>
+#include <linux/user_namespace.h>
#include <asm/uaccess.h>
/*
@@ -21,12 +22,8 @@
*/
const kernel_cap_t __cap_empty_set = CAP_EMPTY_SET;
-const kernel_cap_t __cap_full_set = CAP_FULL_SET;
-const kernel_cap_t __cap_init_eff_set = CAP_INIT_EFF_SET;
EXPORT_SYMBOL(__cap_empty_set);
-EXPORT_SYMBOL(__cap_full_set);
-EXPORT_SYMBOL(__cap_init_eff_set);
int file_caps_enabled = 1;
@@ -290,6 +287,60 @@ error:
}
/**
+ * has_capability - Does a task have a capability in init_user_ns
+ * @t: The task in question
+ * @cap: The capability to be tested for
+ *
+ * Return true if the specified task has the given superior capability
+ * currently in effect to the initial user namespace, false if not.
+ *
+ * Note that this does not set PF_SUPERPRIV on the task.
+ */
+bool has_capability(struct task_struct *t, int cap)
+{
+ int ret = security_real_capable(t, &init_user_ns, cap);
+
+ return (ret == 0);
+}
+
+/**
+ * has_capability - Does a task have a capability in a specific user ns
+ * @t: The task in question
+ * @ns: target user namespace
+ * @cap: The capability to be tested for
+ *
+ * Return true if the specified task has the given superior capability
+ * currently in effect to the specified user namespace, false if not.
+ *
+ * Note that this does not set PF_SUPERPRIV on the task.
+ */
+bool has_ns_capability(struct task_struct *t,
+ struct user_namespace *ns, int cap)
+{
+ int ret = security_real_capable(t, ns, cap);
+
+ return (ret == 0);
+}
+
+/**
+ * has_capability_noaudit - Does a task have a capability (unaudited)
+ * @t: The task in question
+ * @cap: The capability to be tested for
+ *
+ * Return true if the specified task has the given superior capability
+ * currently in effect to init_user_ns, false if not. Don't write an
+ * audit message for the check.
+ *
+ * Note that this does not set PF_SUPERPRIV on the task.
+ */
+bool has_capability_noaudit(struct task_struct *t, int cap)
+{
+ int ret = security_real_capable_noaudit(t, &init_user_ns, cap);
+
+ return (ret == 0);
+}
+
+/**
* capable - Determine if the current task has a superior capability in effect
* @cap: The capability to be tested for
*
@@ -299,17 +350,60 @@ error:
* This sets PF_SUPERPRIV on the task if the capability is available on the
* assumption that it's about to be used.
*/
-int capable(int cap)
+bool capable(int cap)
+{
+ return ns_capable(&init_user_ns, cap);
+}
+EXPORT_SYMBOL(capable);
+
+/**
+ * ns_capable - Determine if the current task has a superior capability in effect
+ * @ns: The usernamespace we want the capability in
+ * @cap: The capability to be tested for
+ *
+ * Return true if the current task has the given superior capability currently
+ * available for use, false if not.
+ *
+ * This sets PF_SUPERPRIV on the task if the capability is available on the
+ * assumption that it's about to be used.
+ */
+bool ns_capable(struct user_namespace *ns, int cap)
{
if (unlikely(!cap_valid(cap))) {
printk(KERN_CRIT "capable() called with invalid cap=%u\n", cap);
BUG();
}
- if (security_capable(current_cred(), cap) == 0) {
+ if (security_capable(ns, current_cred(), cap) == 0) {
current->flags |= PF_SUPERPRIV;
- return 1;
+ return true;
}
- return 0;
+ return false;
+}
+EXPORT_SYMBOL(ns_capable);
+
+/**
+ * task_ns_capable - Determine whether current task has a superior
+ * capability targeted at a specific task's user namespace.
+ * @t: The task whose user namespace is targeted.
+ * @cap: The capability in question.
+ *
+ * Return true if it does, false otherwise.
+ */
+bool task_ns_capable(struct task_struct *t, int cap)
+{
+ return ns_capable(task_cred_xxx(t, user)->user_ns, cap);
+}
+EXPORT_SYMBOL(task_ns_capable);
+
+/**
+ * nsown_capable - Check superior capability to one's own user_ns
+ * @cap: The capability in question
+ *
+ * Return true if the current task has the given superior capability
+ * targeted at its own user namespace.
+ */
+bool nsown_capable(int cap)
+{
+ return ns_capable(current_user_ns(), cap);
}
-EXPORT_SYMBOL(capable);
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index b24d7027b83c..2731d115d725 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -57,6 +57,7 @@
#include <linux/vmalloc.h> /* TODO: replace with more sophisticated array */
#include <linux/eventfd.h>
#include <linux/poll.h>
+#include <linux/flex_array.h> /* used in cgroup_attach_proc */
#include <asm/atomic.h>
@@ -157,7 +158,7 @@ struct css_id {
};
/*
- * cgroup_event represents events which userspace want to recieve.
+ * cgroup_event represents events which userspace want to receive.
*/
struct cgroup_event {
/*
@@ -326,12 +327,6 @@ static struct hlist_head *css_set_hash(struct cgroup_subsys_state *css[])
return &css_set_table[index];
}
-static void free_css_set_rcu(struct rcu_head *obj)
-{
- struct css_set *cg = container_of(obj, struct css_set, rcu_head);
- kfree(cg);
-}
-
/* We don't maintain the lists running through each css_set to its
* task until after the first call to cgroup_iter_start(). This
* reduces the fork()/exit() overhead for people who have cgroups
@@ -375,7 +370,7 @@ static void __put_css_set(struct css_set *cg, int taskexit)
}
write_unlock(&css_set_lock);
- call_rcu(&cg->rcu_head, free_css_set_rcu);
+ kfree_rcu(cg, rcu_head);
}
/*
@@ -812,13 +807,6 @@ static int cgroup_call_pre_destroy(struct cgroup *cgrp)
return ret;
}
-static void free_cgroup_rcu(struct rcu_head *obj)
-{
- struct cgroup *cgrp = container_of(obj, struct cgroup, rcu_head);
-
- kfree(cgrp);
-}
-
static void cgroup_diput(struct dentry *dentry, struct inode *inode)
{
/* is dentry a directory ? if so, kfree() associated cgroup */
@@ -856,7 +844,7 @@ static void cgroup_diput(struct dentry *dentry, struct inode *inode)
*/
BUG_ON(!list_empty(&cgrp->pidlists));
- call_rcu(&cgrp->rcu_head, free_cgroup_rcu);
+ kfree_rcu(cgrp, rcu_head);
}
iput(inode);
}
@@ -1748,6 +1736,76 @@ int cgroup_path(const struct cgroup *cgrp, char *buf, int buflen)
}
EXPORT_SYMBOL_GPL(cgroup_path);
+/*
+ * cgroup_task_migrate - move a task from one cgroup to another.
+ *
+ * 'guarantee' is set if the caller promises that a new css_set for the task
+ * will already exist. If not set, this function might sleep, and can fail with
+ * -ENOMEM. Otherwise, it can only fail with -ESRCH.
+ */
+static int cgroup_task_migrate(struct cgroup *cgrp, struct cgroup *oldcgrp,
+ struct task_struct *tsk, bool guarantee)
+{
+ struct css_set *oldcg;
+ struct css_set *newcg;
+
+ /*
+ * get old css_set. we need to take task_lock and refcount it, because
+ * an exiting task can change its css_set to init_css_set and drop its
+ * old one without taking cgroup_mutex.
+ */
+ task_lock(tsk);
+ oldcg = tsk->cgroups;
+ get_css_set(oldcg);
+ task_unlock(tsk);
+
+ /* locate or allocate a new css_set for this task. */
+ if (guarantee) {
+ /* we know the css_set we want already exists. */
+ struct cgroup_subsys_state *template[CGROUP_SUBSYS_COUNT];
+ read_lock(&css_set_lock);
+ newcg = find_existing_css_set(oldcg, cgrp, template);
+ BUG_ON(!newcg);
+ get_css_set(newcg);
+ read_unlock(&css_set_lock);
+ } else {
+ might_sleep();
+ /* find_css_set will give us newcg already referenced. */
+ newcg = find_css_set(oldcg, cgrp);
+ if (!newcg) {
+ put_css_set(oldcg);
+ return -ENOMEM;
+ }
+ }
+ put_css_set(oldcg);
+
+ /* if PF_EXITING is set, the tsk->cgroups pointer is no longer safe. */
+ task_lock(tsk);
+ if (tsk->flags & PF_EXITING) {
+ task_unlock(tsk);
+ put_css_set(newcg);
+ return -ESRCH;
+ }
+ rcu_assign_pointer(tsk->cgroups, newcg);
+ task_unlock(tsk);
+
+ /* Update the css_set linked lists if we're using them */
+ write_lock(&css_set_lock);
+ if (!list_empty(&tsk->cg_list))
+ list_move(&tsk->cg_list, &newcg->tasks);
+ write_unlock(&css_set_lock);
+
+ /*
+ * We just gained a reference on oldcg by taking it from the task. As
+ * trading it for newcg is protected by cgroup_mutex, we're safe to drop
+ * it here; it will be freed under RCU.
+ */
+ put_css_set(oldcg);
+
+ set_bit(CGRP_RELEASABLE, &oldcgrp->flags);
+ return 0;
+}
+
/**
* cgroup_attach_task - attach task 'tsk' to cgroup 'cgrp'
* @cgrp: the cgroup the task is attaching to
@@ -1758,11 +1816,9 @@ EXPORT_SYMBOL_GPL(cgroup_path);
*/
int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
{
- int retval = 0;
+ int retval;
struct cgroup_subsys *ss, *failed_ss = NULL;
struct cgroup *oldcgrp;
- struct css_set *cg;
- struct css_set *newcg;
struct cgroupfs_root *root = cgrp->root;
/* Nothing to do if the task is already in that cgroup */
@@ -1772,7 +1828,7 @@ int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
for_each_subsys(root, ss) {
if (ss->can_attach) {
- retval = ss->can_attach(ss, cgrp, tsk, false);
+ retval = ss->can_attach(ss, cgrp, tsk);
if (retval) {
/*
* Remember on which subsystem the can_attach()
@@ -1784,48 +1840,29 @@ int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
goto out;
}
}
+ if (ss->can_attach_task) {
+ retval = ss->can_attach_task(cgrp, tsk);
+ if (retval) {
+ failed_ss = ss;
+ goto out;
+ }
+ }
}
- task_lock(tsk);
- cg = tsk->cgroups;
- get_css_set(cg);
- task_unlock(tsk);
- /*
- * Locate or allocate a new css_set for this task,
- * based on its final set of cgroups
- */
- newcg = find_css_set(cg, cgrp);
- put_css_set(cg);
- if (!newcg) {
- retval = -ENOMEM;
- goto out;
- }
-
- task_lock(tsk);
- if (tsk->flags & PF_EXITING) {
- task_unlock(tsk);
- put_css_set(newcg);
- retval = -ESRCH;
+ retval = cgroup_task_migrate(cgrp, oldcgrp, tsk, false);
+ if (retval)
goto out;
- }
- rcu_assign_pointer(tsk->cgroups, newcg);
- task_unlock(tsk);
-
- /* Update the css_set linked lists if we're using them */
- write_lock(&css_set_lock);
- if (!list_empty(&tsk->cg_list)) {
- list_del(&tsk->cg_list);
- list_add(&tsk->cg_list, &newcg->tasks);
- }
- write_unlock(&css_set_lock);
for_each_subsys(root, ss) {
+ if (ss->pre_attach)
+ ss->pre_attach(cgrp);
+ if (ss->attach_task)
+ ss->attach_task(cgrp, tsk);
if (ss->attach)
- ss->attach(ss, cgrp, oldcgrp, tsk, false);
+ ss->attach(ss, cgrp, oldcgrp, tsk);
}
- set_bit(CGRP_RELEASABLE, &oldcgrp->flags);
+
synchronize_rcu();
- put_css_set(cg);
/*
* wake up rmdir() waiter. the rmdir should fail since the cgroup
@@ -1844,7 +1881,7 @@ out:
*/
break;
if (ss->cancel_attach)
- ss->cancel_attach(ss, cgrp, tsk, false);
+ ss->cancel_attach(ss, cgrp, tsk);
}
}
return retval;
@@ -1875,49 +1912,370 @@ int cgroup_attach_task_all(struct task_struct *from, struct task_struct *tsk)
EXPORT_SYMBOL_GPL(cgroup_attach_task_all);
/*
- * Attach task with pid 'pid' to cgroup 'cgrp'. Call with cgroup_mutex
- * held. May take task_lock of task
+ * cgroup_attach_proc works in two stages, the first of which prefetches all
+ * new css_sets needed (to make sure we have enough memory before committing
+ * to the move) and stores them in a list of entries of the following type.
+ * TODO: possible optimization: use css_set->rcu_head for chaining instead
+ */
+struct cg_list_entry {
+ struct css_set *cg;
+ struct list_head links;
+};
+
+static bool css_set_check_fetched(struct cgroup *cgrp,
+ struct task_struct *tsk, struct css_set *cg,
+ struct list_head *newcg_list)
+{
+ struct css_set *newcg;
+ struct cg_list_entry *cg_entry;
+ struct cgroup_subsys_state *template[CGROUP_SUBSYS_COUNT];
+
+ read_lock(&css_set_lock);
+ newcg = find_existing_css_set(cg, cgrp, template);
+ if (newcg)
+ get_css_set(newcg);
+ read_unlock(&css_set_lock);
+
+ /* doesn't exist at all? */
+ if (!newcg)
+ return false;
+ /* see if it's already in the list */
+ list_for_each_entry(cg_entry, newcg_list, links) {
+ if (cg_entry->cg == newcg) {
+ put_css_set(newcg);
+ return true;
+ }
+ }
+
+ /* not found */
+ put_css_set(newcg);
+ return false;
+}
+
+/*
+ * Find the new css_set and store it in the list in preparation for moving the
+ * given task to the given cgroup. Returns 0 or -ENOMEM.
+ */
+static int css_set_prefetch(struct cgroup *cgrp, struct css_set *cg,
+ struct list_head *newcg_list)
+{
+ struct css_set *newcg;
+ struct cg_list_entry *cg_entry;
+
+ /* ensure a new css_set will exist for this thread */
+ newcg = find_css_set(cg, cgrp);
+ if (!newcg)
+ return -ENOMEM;
+ /* add it to the list */
+ cg_entry = kmalloc(sizeof(struct cg_list_entry), GFP_KERNEL);
+ if (!cg_entry) {
+ put_css_set(newcg);
+ return -ENOMEM;
+ }
+ cg_entry->cg = newcg;
+ list_add(&cg_entry->links, newcg_list);
+ return 0;
+}
+
+/**
+ * cgroup_attach_proc - attach all threads in a threadgroup to a cgroup
+ * @cgrp: the cgroup to attach to
+ * @leader: the threadgroup leader task_struct of the group to be attached
+ *
+ * Call holding cgroup_mutex and the threadgroup_fork_lock of the leader. Will
+ * take task_lock of each thread in leader's threadgroup individually in turn.
+ */
+int cgroup_attach_proc(struct cgroup *cgrp, struct task_struct *leader)
+{
+ int retval, i, group_size;
+ struct cgroup_subsys *ss, *failed_ss = NULL;
+ bool cancel_failed_ss = false;
+ /* guaranteed to be initialized later, but the compiler needs this */
+ struct cgroup *oldcgrp = NULL;
+ struct css_set *oldcg;
+ struct cgroupfs_root *root = cgrp->root;
+ /* threadgroup list cursor and array */
+ struct task_struct *tsk;
+ struct flex_array *group;
+ /*
+ * we need to make sure we have css_sets for all the tasks we're
+ * going to move -before- we actually start moving them, so that in
+ * case we get an ENOMEM we can bail out before making any changes.
+ */
+ struct list_head newcg_list;
+ struct cg_list_entry *cg_entry, *temp_nobe;
+
+ /*
+ * step 0: in order to do expensive, possibly blocking operations for
+ * every thread, we cannot iterate the thread group list, since it needs
+ * rcu or tasklist locked. instead, build an array of all threads in the
+ * group - threadgroup_fork_lock prevents new threads from appearing,
+ * and if threads exit, this will just be an over-estimate.
+ */
+ group_size = get_nr_threads(leader);
+ /* flex_array supports very large thread-groups better than kmalloc. */
+ group = flex_array_alloc(sizeof(struct task_struct *), group_size,
+ GFP_KERNEL);
+ if (!group)
+ return -ENOMEM;
+ /* pre-allocate to guarantee space while iterating in rcu read-side. */
+ retval = flex_array_prealloc(group, 0, group_size - 1, GFP_KERNEL);
+ if (retval)
+ goto out_free_group_list;
+
+ /* prevent changes to the threadgroup list while we take a snapshot. */
+ rcu_read_lock();
+ if (!thread_group_leader(leader)) {
+ /*
+ * a race with de_thread from another thread's exec() may strip
+ * us of our leadership, making while_each_thread unsafe to use
+ * on this task. if this happens, there is no choice but to
+ * throw this task away and try again (from cgroup_procs_write);
+ * this is "double-double-toil-and-trouble-check locking".
+ */
+ rcu_read_unlock();
+ retval = -EAGAIN;
+ goto out_free_group_list;
+ }
+ /* take a reference on each task in the group to go in the array. */
+ tsk = leader;
+ i = 0;
+ do {
+ /* as per above, nr_threads may decrease, but not increase. */
+ BUG_ON(i >= group_size);
+ get_task_struct(tsk);
+ /*
+ * saying GFP_ATOMIC has no effect here because we did prealloc
+ * earlier, but it's good form to communicate our expectations.
+ */
+ retval = flex_array_put_ptr(group, i, tsk, GFP_ATOMIC);
+ BUG_ON(retval != 0);
+ i++;
+ } while_each_thread(leader, tsk);
+ /* remember the number of threads in the array for later. */
+ group_size = i;
+ rcu_read_unlock();
+
+ /*
+ * step 1: check that we can legitimately attach to the cgroup.
+ */
+ for_each_subsys(root, ss) {
+ if (ss->can_attach) {
+ retval = ss->can_attach(ss, cgrp, leader);
+ if (retval) {
+ failed_ss = ss;
+ goto out_cancel_attach;
+ }
+ }
+ /* a callback to be run on every thread in the threadgroup. */
+ if (ss->can_attach_task) {
+ /* run on each task in the threadgroup. */
+ for (i = 0; i < group_size; i++) {
+ tsk = flex_array_get_ptr(group, i);
+ retval = ss->can_attach_task(cgrp, tsk);
+ if (retval) {
+ failed_ss = ss;
+ cancel_failed_ss = true;
+ goto out_cancel_attach;
+ }
+ }
+ }
+ }
+
+ /*
+ * step 2: make sure css_sets exist for all threads to be migrated.
+ * we use find_css_set, which allocates a new one if necessary.
+ */
+ INIT_LIST_HEAD(&newcg_list);
+ for (i = 0; i < group_size; i++) {
+ tsk = flex_array_get_ptr(group, i);
+ /* nothing to do if this task is already in the cgroup */
+ oldcgrp = task_cgroup_from_root(tsk, root);
+ if (cgrp == oldcgrp)
+ continue;
+ /* get old css_set pointer */
+ task_lock(tsk);
+ if (tsk->flags & PF_EXITING) {
+ /* ignore this task if it's going away */
+ task_unlock(tsk);
+ continue;
+ }
+ oldcg = tsk->cgroups;
+ get_css_set(oldcg);
+ task_unlock(tsk);
+ /* see if the new one for us is already in the list? */
+ if (css_set_check_fetched(cgrp, tsk, oldcg, &newcg_list)) {
+ /* was already there, nothing to do. */
+ put_css_set(oldcg);
+ } else {
+ /* we don't already have it. get new one. */
+ retval = css_set_prefetch(cgrp, oldcg, &newcg_list);
+ put_css_set(oldcg);
+ if (retval)
+ goto out_list_teardown;
+ }
+ }
+
+ /*
+ * step 3: now that we're guaranteed success wrt the css_sets, proceed
+ * to move all tasks to the new cgroup, calling ss->attach_task for each
+ * one along the way. there are no failure cases after here, so this is
+ * the commit point.
+ */
+ for_each_subsys(root, ss) {
+ if (ss->pre_attach)
+ ss->pre_attach(cgrp);
+ }
+ for (i = 0; i < group_size; i++) {
+ tsk = flex_array_get_ptr(group, i);
+ /* leave current thread as it is if it's already there */
+ oldcgrp = task_cgroup_from_root(tsk, root);
+ if (cgrp == oldcgrp)
+ continue;
+ /* attach each task to each subsystem */
+ for_each_subsys(root, ss) {
+ if (ss->attach_task)
+ ss->attach_task(cgrp, tsk);
+ }
+ /* if the thread is PF_EXITING, it can just get skipped. */
+ retval = cgroup_task_migrate(cgrp, oldcgrp, tsk, true);
+ BUG_ON(retval != 0 && retval != -ESRCH);
+ }
+ /* nothing is sensitive to fork() after this point. */
+
+ /*
+ * step 4: do expensive, non-thread-specific subsystem callbacks.
+ * TODO: if ever a subsystem needs to know the oldcgrp for each task
+ * being moved, this call will need to be reworked to communicate that.
+ */
+ for_each_subsys(root, ss) {
+ if (ss->attach)
+ ss->attach(ss, cgrp, oldcgrp, leader);
+ }
+
+ /*
+ * step 5: success! and cleanup
+ */
+ synchronize_rcu();
+ cgroup_wakeup_rmdir_waiter(cgrp);
+ retval = 0;
+out_list_teardown:
+ /* clean up the list of prefetched css_sets. */
+ list_for_each_entry_safe(cg_entry, temp_nobe, &newcg_list, links) {
+ list_del(&cg_entry->links);
+ put_css_set(cg_entry->cg);
+ kfree(cg_entry);
+ }
+out_cancel_attach:
+ /* same deal as in cgroup_attach_task */
+ if (retval) {
+ for_each_subsys(root, ss) {
+ if (ss == failed_ss) {
+ if (cancel_failed_ss && ss->cancel_attach)
+ ss->cancel_attach(ss, cgrp, leader);
+ break;
+ }
+ if (ss->cancel_attach)
+ ss->cancel_attach(ss, cgrp, leader);
+ }
+ }
+ /* clean up the array of referenced threads in the group. */
+ for (i = 0; i < group_size; i++) {
+ tsk = flex_array_get_ptr(group, i);
+ put_task_struct(tsk);
+ }
+out_free_group_list:
+ flex_array_free(group);
+ return retval;
+}
+
+/*
+ * Find the task_struct of the task to attach by vpid and pass it along to the
+ * function to attach either it or all tasks in its threadgroup. Will take
+ * cgroup_mutex; may take task_lock of task.
*/
-static int attach_task_by_pid(struct cgroup *cgrp, u64 pid)
+static int attach_task_by_pid(struct cgroup *cgrp, u64 pid, bool threadgroup)
{
struct task_struct *tsk;
const struct cred *cred = current_cred(), *tcred;
int ret;
+ if (!cgroup_lock_live_group(cgrp))
+ return -ENODEV;
+
if (pid) {
rcu_read_lock();
tsk = find_task_by_vpid(pid);
- if (!tsk || tsk->flags & PF_EXITING) {
+ if (!tsk) {
+ rcu_read_unlock();
+ cgroup_unlock();
+ return -ESRCH;
+ }
+ if (threadgroup) {
+ /*
+ * RCU protects this access, since tsk was found in the
+ * tid map. a race with de_thread may cause group_leader
+ * to stop being the leader, but cgroup_attach_proc will
+ * detect it later.
+ */
+ tsk = tsk->group_leader;
+ } else if (tsk->flags & PF_EXITING) {
+ /* optimization for the single-task-only case */
rcu_read_unlock();
+ cgroup_unlock();
return -ESRCH;
}
+ /*
+ * even if we're attaching all tasks in the thread group, we
+ * only need to check permissions on one of them.
+ */
tcred = __task_cred(tsk);
if (cred->euid &&
cred->euid != tcred->uid &&
cred->euid != tcred->suid) {
rcu_read_unlock();
+ cgroup_unlock();
return -EACCES;
}
get_task_struct(tsk);
rcu_read_unlock();
} else {
- tsk = current;
+ if (threadgroup)
+ tsk = current->group_leader;
+ else
+ tsk = current;
get_task_struct(tsk);
}
- ret = cgroup_attach_task(cgrp, tsk);
+ if (threadgroup) {
+ threadgroup_fork_write_lock(tsk);
+ ret = cgroup_attach_proc(cgrp, tsk);
+ threadgroup_fork_write_unlock(tsk);
+ } else {
+ ret = cgroup_attach_task(cgrp, tsk);
+ }
put_task_struct(tsk);
+ cgroup_unlock();
return ret;
}
static int cgroup_tasks_write(struct cgroup *cgrp, struct cftype *cft, u64 pid)
{
+ return attach_task_by_pid(cgrp, pid, false);
+}
+
+static int cgroup_procs_write(struct cgroup *cgrp, struct cftype *cft, u64 tgid)
+{
int ret;
- if (!cgroup_lock_live_group(cgrp))
- return -ENODEV;
- ret = attach_task_by_pid(cgrp, pid);
- cgroup_unlock();
+ do {
+ /*
+ * attach_proc fails with -EAGAIN if threadgroup leadership
+ * changes in the middle of the operation, in which case we need
+ * to find the task_struct for the new leader and start over.
+ */
+ ret = attach_task_by_pid(cgrp, tgid, true);
+ } while (ret == -EAGAIN);
return ret;
}
@@ -3274,9 +3632,9 @@ static struct cftype files[] = {
{
.name = CGROUP_FILE_GENERIC_PREFIX "procs",
.open = cgroup_procs_open,
- /* .write_u64 = cgroup_procs_write, TODO */
+ .write_u64 = cgroup_procs_write,
.release = cgroup_pidlist_release,
- .mode = S_IRUGO,
+ .mode = S_IRUGO | S_IWUSR,
},
{
.name = "notify_on_release",
@@ -3655,12 +4013,12 @@ again:
spin_lock(&release_list_lock);
set_bit(CGRP_REMOVED, &cgrp->flags);
if (!list_empty(&cgrp->release_list))
- list_del(&cgrp->release_list);
+ list_del_init(&cgrp->release_list);
spin_unlock(&release_list_lock);
cgroup_lock_hierarchy(cgrp->root);
/* delete this cgroup from parent->children */
- list_del(&cgrp->sibling);
+ list_del_init(&cgrp->sibling);
cgroup_unlock_hierarchy(cgrp->root);
d = dget(cgrp->dentry);
@@ -3879,7 +4237,7 @@ void cgroup_unload_subsys(struct cgroup_subsys *ss)
subsys[ss->subsys_id] = NULL;
/* remove subsystem from rootnode's list of subsystems */
- list_del(&ss->sibling);
+ list_del_init(&ss->sibling);
/*
* disentangle the css from all css_sets attached to the dummytop. as
@@ -4230,20 +4588,8 @@ void cgroup_post_fork(struct task_struct *child)
*/
void cgroup_exit(struct task_struct *tsk, int run_callbacks)
{
- int i;
struct css_set *cg;
-
- if (run_callbacks && need_forkexit_callback) {
- /*
- * modular subsystems can't use callbacks, so no need to lock
- * the subsys array
- */
- for (i = 0; i < CGROUP_BUILTIN_SUBSYS_COUNT; i++) {
- struct cgroup_subsys *ss = subsys[i];
- if (ss->exit)
- ss->exit(ss, tsk);
- }
- }
+ int i;
/*
* Unlink from the css_set task list if necessary.
@@ -4253,7 +4599,7 @@ void cgroup_exit(struct task_struct *tsk, int run_callbacks)
if (!list_empty(&tsk->cg_list)) {
write_lock(&css_set_lock);
if (!list_empty(&tsk->cg_list))
- list_del(&tsk->cg_list);
+ list_del_init(&tsk->cg_list);
write_unlock(&css_set_lock);
}
@@ -4261,125 +4607,26 @@ void cgroup_exit(struct task_struct *tsk, int run_callbacks)
task_lock(tsk);
cg = tsk->cgroups;
tsk->cgroups = &init_css_set;
- task_unlock(tsk);
- if (cg)
- put_css_set_taskexit(cg);
-}
-
-/**
- * cgroup_clone - clone the cgroup the given subsystem is attached to
- * @tsk: the task to be moved
- * @subsys: the given subsystem
- * @nodename: the name for the new cgroup
- *
- * Duplicate the current cgroup in the hierarchy that the given
- * subsystem is attached to, and move this task into the new
- * child.
- */
-int cgroup_clone(struct task_struct *tsk, struct cgroup_subsys *subsys,
- char *nodename)
-{
- struct dentry *dentry;
- int ret = 0;
- struct cgroup *parent, *child;
- struct inode *inode;
- struct css_set *cg;
- struct cgroupfs_root *root;
- struct cgroup_subsys *ss;
-
- /* We shouldn't be called by an unregistered subsystem */
- BUG_ON(!subsys->active);
-
- /* First figure out what hierarchy and cgroup we're dealing
- * with, and pin them so we can drop cgroup_mutex */
- mutex_lock(&cgroup_mutex);
- again:
- root = subsys->root;
- if (root == &rootnode) {
- mutex_unlock(&cgroup_mutex);
- return 0;
- }
- /* Pin the hierarchy */
- if (!atomic_inc_not_zero(&root->sb->s_active)) {
- /* We race with the final deactivate_super() */
- mutex_unlock(&cgroup_mutex);
- return 0;
+ if (run_callbacks && need_forkexit_callback) {
+ /*
+ * modular subsystems can't use callbacks, so no need to lock
+ * the subsys array
+ */
+ for (i = 0; i < CGROUP_BUILTIN_SUBSYS_COUNT; i++) {
+ struct cgroup_subsys *ss = subsys[i];
+ if (ss->exit) {
+ struct cgroup *old_cgrp =
+ rcu_dereference_raw(cg->subsys[i])->cgroup;
+ struct cgroup *cgrp = task_cgroup(tsk, i);
+ ss->exit(ss, cgrp, old_cgrp, tsk);
+ }
+ }
}
-
- /* Keep the cgroup alive */
- task_lock(tsk);
- parent = task_cgroup(tsk, subsys->subsys_id);
- cg = tsk->cgroups;
- get_css_set(cg);
task_unlock(tsk);
- mutex_unlock(&cgroup_mutex);
-
- /* Now do the VFS work to create a cgroup */
- inode = parent->dentry->d_inode;
-
- /* Hold the parent directory mutex across this operation to
- * stop anyone else deleting the new cgroup */
- mutex_lock(&inode->i_mutex);
- dentry = lookup_one_len(nodename, parent->dentry, strlen(nodename));
- if (IS_ERR(dentry)) {
- printk(KERN_INFO
- "cgroup: Couldn't allocate dentry for %s: %ld\n", nodename,
- PTR_ERR(dentry));
- ret = PTR_ERR(dentry);
- goto out_release;
- }
-
- /* Create the cgroup directory, which also creates the cgroup */
- ret = vfs_mkdir(inode, dentry, 0755);
- child = __d_cgrp(dentry);
- dput(dentry);
- if (ret) {
- printk(KERN_INFO
- "Failed to create cgroup %s: %d\n", nodename,
- ret);
- goto out_release;
- }
-
- /* The cgroup now exists. Retake cgroup_mutex and check
- * that we're still in the same state that we thought we
- * were. */
- mutex_lock(&cgroup_mutex);
- if ((root != subsys->root) ||
- (parent != task_cgroup(tsk, subsys->subsys_id))) {
- /* Aargh, we raced ... */
- mutex_unlock(&inode->i_mutex);
- put_css_set(cg);
-
- deactivate_super(root->sb);
- /* The cgroup is still accessible in the VFS, but
- * we're not going to try to rmdir() it at this
- * point. */
- printk(KERN_INFO
- "Race in cgroup_clone() - leaking cgroup %s\n",
- nodename);
- goto again;
- }
-
- /* do any required auto-setup */
- for_each_subsys(root, ss) {
- if (ss->post_clone)
- ss->post_clone(ss, child);
- }
-
- /* All seems fine. Finish by moving the task into the new cgroup */
- ret = cgroup_attach_task(child, tsk);
- mutex_unlock(&cgroup_mutex);
-
- out_release:
- mutex_unlock(&inode->i_mutex);
-
- mutex_lock(&cgroup_mutex);
- put_css_set(cg);
- mutex_unlock(&cgroup_mutex);
- deactivate_super(root->sb);
- return ret;
+ if (cg)
+ put_css_set_taskexit(cg);
}
/**
@@ -4620,14 +4867,6 @@ bool css_is_ancestor(struct cgroup_subsys_state *child,
return ret;
}
-static void __free_css_id_cb(struct rcu_head *head)
-{
- struct css_id *id;
-
- id = container_of(head, struct css_id, rcu_head);
- kfree(id);
-}
-
void free_css_id(struct cgroup_subsys *ss, struct cgroup_subsys_state *css)
{
struct css_id *id = css->id;
@@ -4642,7 +4881,7 @@ void free_css_id(struct cgroup_subsys *ss, struct cgroup_subsys_state *css)
spin_lock(&ss->id_lock);
idr_remove(&ss->idr, id->id);
spin_unlock(&ss->id_lock);
- call_rcu(&id->rcu_head, __free_css_id_cb);
+ kfree_rcu(id, rcu_head);
}
EXPORT_SYMBOL_GPL(free_css_id);
@@ -4813,6 +5052,29 @@ css_get_next(struct cgroup_subsys *ss, int id,
return ret;
}
+/*
+ * get corresponding css from file open on cgroupfs directory
+ */
+struct cgroup_subsys_state *cgroup_css_from_dir(struct file *f, int id)
+{
+ struct cgroup *cgrp;
+ struct inode *inode;
+ struct cgroup_subsys_state *css;
+
+ inode = f->f_dentry->d_inode;
+ /* check in cgroup filesystem dir */
+ if (inode->i_op != &cgroup_dir_inode_operations)
+ return ERR_PTR(-EBADF);
+
+ if (id < 0 || id >= CGROUP_SUBSYS_COUNT)
+ return ERR_PTR(-EINVAL);
+
+ /* get cgroup */
+ cgrp = __d_cgrp(f->f_dentry);
+ css = cgrp->subsys[id];
+ return css ? css : ERR_PTR(-ENOENT);
+}
+
#ifdef CONFIG_CGROUP_DEBUG
static struct cgroup_subsys_state *debug_create(struct cgroup_subsys *ss,
struct cgroup *cont)
diff --git a/kernel/cgroup_freezer.c b/kernel/cgroup_freezer.c
index e7bebb7c6c38..e691818d7e45 100644
--- a/kernel/cgroup_freezer.c
+++ b/kernel/cgroup_freezer.c
@@ -160,7 +160,7 @@ static void freezer_destroy(struct cgroup_subsys *ss,
*/
static int freezer_can_attach(struct cgroup_subsys *ss,
struct cgroup *new_cgroup,
- struct task_struct *task, bool threadgroup)
+ struct task_struct *task)
{
struct freezer *freezer;
@@ -172,26 +172,17 @@ static int freezer_can_attach(struct cgroup_subsys *ss,
if (freezer->state != CGROUP_THAWED)
return -EBUSY;
+ return 0;
+}
+
+static int freezer_can_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
+{
rcu_read_lock();
- if (__cgroup_freezing_or_frozen(task)) {
+ if (__cgroup_freezing_or_frozen(tsk)) {
rcu_read_unlock();
return -EBUSY;
}
rcu_read_unlock();
-
- if (threadgroup) {
- struct task_struct *c;
-
- rcu_read_lock();
- list_for_each_entry_rcu(c, &task->thread_group, thread_group) {
- if (__cgroup_freezing_or_frozen(c)) {
- rcu_read_unlock();
- return -EBUSY;
- }
- }
- rcu_read_unlock();
- }
-
return 0;
}
@@ -390,6 +381,9 @@ struct cgroup_subsys freezer_subsys = {
.populate = freezer_populate,
.subsys_id = freezer_subsys_id,
.can_attach = freezer_can_attach,
+ .can_attach_task = freezer_can_attach_task,
+ .pre_attach = NULL,
+ .attach_task = NULL,
.attach = NULL,
.fork = freezer_fork,
.exit = NULL,
diff --git a/kernel/compat.c b/kernel/compat.c
index c9e2ec0b34a8..fc9eb093acd5 100644
--- a/kernel/compat.c
+++ b/kernel/compat.c
@@ -52,6 +52,64 @@ static int compat_put_timeval(struct compat_timeval __user *o,
put_user(i->tv_usec, &o->tv_usec)) ? -EFAULT : 0;
}
+static int compat_get_timex(struct timex *txc, struct compat_timex __user *utp)
+{
+ memset(txc, 0, sizeof(struct timex));
+
+ if (!access_ok(VERIFY_READ, utp, sizeof(struct compat_timex)) ||
+ __get_user(txc->modes, &utp->modes) ||
+ __get_user(txc->offset, &utp->offset) ||
+ __get_user(txc->freq, &utp->freq) ||
+ __get_user(txc->maxerror, &utp->maxerror) ||
+ __get_user(txc->esterror, &utp->esterror) ||
+ __get_user(txc->status, &utp->status) ||
+ __get_user(txc->constant, &utp->constant) ||
+ __get_user(txc->precision, &utp->precision) ||
+ __get_user(txc->tolerance, &utp->tolerance) ||
+ __get_user(txc->time.tv_sec, &utp->time.tv_sec) ||
+ __get_user(txc->time.tv_usec, &utp->time.tv_usec) ||
+ __get_user(txc->tick, &utp->tick) ||
+ __get_user(txc->ppsfreq, &utp->ppsfreq) ||
+ __get_user(txc->jitter, &utp->jitter) ||
+ __get_user(txc->shift, &utp->shift) ||
+ __get_user(txc->stabil, &utp->stabil) ||
+ __get_user(txc->jitcnt, &utp->jitcnt) ||
+ __get_user(txc->calcnt, &utp->calcnt) ||
+ __get_user(txc->errcnt, &utp->errcnt) ||
+ __get_user(txc->stbcnt, &utp->stbcnt))
+ return -EFAULT;
+
+ return 0;
+}
+
+static int compat_put_timex(struct compat_timex __user *utp, struct timex *txc)
+{
+ if (!access_ok(VERIFY_WRITE, utp, sizeof(struct compat_timex)) ||
+ __put_user(txc->modes, &utp->modes) ||
+ __put_user(txc->offset, &utp->offset) ||
+ __put_user(txc->freq, &utp->freq) ||
+ __put_user(txc->maxerror, &utp->maxerror) ||
+ __put_user(txc->esterror, &utp->esterror) ||
+ __put_user(txc->status, &utp->status) ||
+ __put_user(txc->constant, &utp->constant) ||
+ __put_user(txc->precision, &utp->precision) ||
+ __put_user(txc->tolerance, &utp->tolerance) ||
+ __put_user(txc->time.tv_sec, &utp->time.tv_sec) ||
+ __put_user(txc->time.tv_usec, &utp->time.tv_usec) ||
+ __put_user(txc->tick, &utp->tick) ||
+ __put_user(txc->ppsfreq, &utp->ppsfreq) ||
+ __put_user(txc->jitter, &utp->jitter) ||
+ __put_user(txc->shift, &utp->shift) ||
+ __put_user(txc->stabil, &utp->stabil) ||
+ __put_user(txc->jitcnt, &utp->jitcnt) ||
+ __put_user(txc->calcnt, &utp->calcnt) ||
+ __put_user(txc->errcnt, &utp->errcnt) ||
+ __put_user(txc->stbcnt, &utp->stbcnt) ||
+ __put_user(txc->tai, &utp->tai))
+ return -EFAULT;
+ return 0;
+}
+
asmlinkage long compat_sys_gettimeofday(struct compat_timeval __user *tv,
struct timezone __user *tz)
{
@@ -235,6 +293,8 @@ asmlinkage long compat_sys_times(struct compat_tms __user *tbuf)
return compat_jiffies_to_clock_t(jiffies);
}
+#ifdef __ARCH_WANT_SYS_SIGPENDING
+
/*
* Assumption: old_sigset_t and compat_old_sigset_t are both
* types that can be passed to put_user()/get_user().
@@ -254,6 +314,10 @@ asmlinkage long compat_sys_sigpending(compat_old_sigset_t __user *set)
return ret;
}
+#endif
+
+#ifdef __ARCH_WANT_SYS_SIGPROCMASK
+
asmlinkage long compat_sys_sigprocmask(int how, compat_old_sigset_t __user *set,
compat_old_sigset_t __user *oset)
{
@@ -275,6 +339,8 @@ asmlinkage long compat_sys_sigprocmask(int how, compat_old_sigset_t __user *set,
return ret;
}
+#endif
+
asmlinkage long compat_sys_setrlimit(unsigned int resource,
struct compat_rlimit __user *rlim)
{
@@ -617,6 +683,29 @@ long compat_sys_clock_gettime(clockid_t which_clock,
return err;
}
+long compat_sys_clock_adjtime(clockid_t which_clock,
+ struct compat_timex __user *utp)
+{
+ struct timex txc;
+ mm_segment_t oldfs;
+ int err, ret;
+
+ err = compat_get_timex(&txc, utp);
+ if (err)
+ return err;
+
+ oldfs = get_fs();
+ set_fs(KERNEL_DS);
+ ret = sys_clock_adjtime(which_clock, (struct timex __user *) &txc);
+ set_fs(oldfs);
+
+ err = compat_put_timex(utp, &txc);
+ if (err)
+ return err;
+
+ return ret;
+}
+
long compat_sys_clock_getres(clockid_t which_clock,
struct compat_timespec __user *tp)
{
@@ -809,10 +898,9 @@ compat_sys_rt_sigtimedwait (compat_sigset_t __user *uthese,
{
compat_sigset_t s32;
sigset_t s;
- int sig;
struct timespec t;
siginfo_t info;
- long ret, timeout = 0;
+ long ret;
if (sigsetsize != sizeof(sigset_t))
return -EINVAL;
@@ -820,51 +908,19 @@ compat_sys_rt_sigtimedwait (compat_sigset_t __user *uthese,
if (copy_from_user(&s32, uthese, sizeof(compat_sigset_t)))
return -EFAULT;
sigset_from_compat(&s, &s32);
- sigdelsetmask(&s,sigmask(SIGKILL)|sigmask(SIGSTOP));
- signotset(&s);
if (uts) {
- if (get_compat_timespec (&t, uts))
+ if (get_compat_timespec(&t, uts))
return -EFAULT;
- if (t.tv_nsec >= 1000000000L || t.tv_nsec < 0
- || t.tv_sec < 0)
- return -EINVAL;
}
- spin_lock_irq(&current->sighand->siglock);
- sig = dequeue_signal(current, &s, &info);
- if (!sig) {
- timeout = MAX_SCHEDULE_TIMEOUT;
- if (uts)
- timeout = timespec_to_jiffies(&t)
- +(t.tv_sec || t.tv_nsec);
- if (timeout) {
- current->real_blocked = current->blocked;
- sigandsets(&current->blocked, &current->blocked, &s);
-
- recalc_sigpending();
- spin_unlock_irq(&current->sighand->siglock);
-
- timeout = schedule_timeout_interruptible(timeout);
-
- spin_lock_irq(&current->sighand->siglock);
- sig = dequeue_signal(current, &s, &info);
- current->blocked = current->real_blocked;
- siginitset(&current->real_blocked, 0);
- recalc_sigpending();
- }
- }
- spin_unlock_irq(&current->sighand->siglock);
+ ret = do_sigtimedwait(&s, &info, uts ? &t : NULL);
- if (sig) {
- ret = sig;
- if (uinfo) {
- if (copy_siginfo_to_user32(uinfo, &info))
- ret = -EFAULT;
- }
- }else {
- ret = timeout?-EINTR:-EAGAIN;
+ if (ret > 0 && uinfo) {
+ if (copy_siginfo_to_user32(uinfo, &info))
+ ret = -EFAULT;
}
+
return ret;
}
@@ -951,58 +1007,17 @@ asmlinkage long compat_sys_rt_sigsuspend(compat_sigset_t __user *unewset, compat
asmlinkage long compat_sys_adjtimex(struct compat_timex __user *utp)
{
struct timex txc;
- int ret;
-
- memset(&txc, 0, sizeof(struct timex));
+ int err, ret;
- if (!access_ok(VERIFY_READ, utp, sizeof(struct compat_timex)) ||
- __get_user(txc.modes, &utp->modes) ||
- __get_user(txc.offset, &utp->offset) ||
- __get_user(txc.freq, &utp->freq) ||
- __get_user(txc.maxerror, &utp->maxerror) ||
- __get_user(txc.esterror, &utp->esterror) ||
- __get_user(txc.status, &utp->status) ||
- __get_user(txc.constant, &utp->constant) ||
- __get_user(txc.precision, &utp->precision) ||
- __get_user(txc.tolerance, &utp->tolerance) ||
- __get_user(txc.time.tv_sec, &utp->time.tv_sec) ||
- __get_user(txc.time.tv_usec, &utp->time.tv_usec) ||
- __get_user(txc.tick, &utp->tick) ||
- __get_user(txc.ppsfreq, &utp->ppsfreq) ||
- __get_user(txc.jitter, &utp->jitter) ||
- __get_user(txc.shift, &utp->shift) ||
- __get_user(txc.stabil, &utp->stabil) ||
- __get_user(txc.jitcnt, &utp->jitcnt) ||
- __get_user(txc.calcnt, &utp->calcnt) ||
- __get_user(txc.errcnt, &utp->errcnt) ||
- __get_user(txc.stbcnt, &utp->stbcnt))
- return -EFAULT;
+ err = compat_get_timex(&txc, utp);
+ if (err)
+ return err;
ret = do_adjtimex(&txc);
- if (!access_ok(VERIFY_WRITE, utp, sizeof(struct compat_timex)) ||
- __put_user(txc.modes, &utp->modes) ||
- __put_user(txc.offset, &utp->offset) ||
- __put_user(txc.freq, &utp->freq) ||
- __put_user(txc.maxerror, &utp->maxerror) ||
- __put_user(txc.esterror, &utp->esterror) ||
- __put_user(txc.status, &utp->status) ||
- __put_user(txc.constant, &utp->constant) ||
- __put_user(txc.precision, &utp->precision) ||
- __put_user(txc.tolerance, &utp->tolerance) ||
- __put_user(txc.time.tv_sec, &utp->time.tv_sec) ||
- __put_user(txc.time.tv_usec, &utp->time.tv_usec) ||
- __put_user(txc.tick, &utp->tick) ||
- __put_user(txc.ppsfreq, &utp->ppsfreq) ||
- __put_user(txc.jitter, &utp->jitter) ||
- __put_user(txc.shift, &utp->shift) ||
- __put_user(txc.stabil, &utp->stabil) ||
- __put_user(txc.jitcnt, &utp->jitcnt) ||
- __put_user(txc.calcnt, &utp->calcnt) ||
- __put_user(txc.errcnt, &utp->errcnt) ||
- __put_user(txc.stbcnt, &utp->stbcnt) ||
- __put_user(txc.tai, &utp->tai))
- ret = -EFAULT;
+ err = compat_put_timex(utp, &txc);
+ if (err)
+ return err;
return ret;
}
diff --git a/kernel/cpu.c b/kernel/cpu.c
index 156cc5556140..12b7458f23b1 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -126,7 +126,7 @@ static void cpu_hotplug_done(void)
#else /* #if CONFIG_HOTPLUG_CPU */
static void cpu_hotplug_begin(void) {}
static void cpu_hotplug_done(void) {}
-#endif /* #esle #if CONFIG_HOTPLUG_CPU */
+#endif /* #else #if CONFIG_HOTPLUG_CPU */
/* Need to know about CPUs going up/down? */
int __ref register_cpu_notifier(struct notifier_block *nb)
@@ -160,7 +160,6 @@ static void cpu_notify_nofail(unsigned long val, void *v)
{
BUG_ON(cpu_notify(val, v));
}
-
EXPORT_SYMBOL(register_cpu_notifier);
void __ref unregister_cpu_notifier(struct notifier_block *nb)
@@ -205,7 +204,6 @@ static int __ref take_cpu_down(void *_param)
return err;
cpu_notify(CPU_DYING | param->mod, param->hcpu);
-
return 0;
}
@@ -227,6 +225,7 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen)
return -EINVAL;
cpu_hotplug_begin();
+
err = __cpu_notify(CPU_DOWN_PREPARE | mod, hcpu, -1, &nr_calls);
if (err) {
nr_calls--;
@@ -304,7 +303,7 @@ static int __cpuinit _cpu_up(unsigned int cpu, int tasks_frozen)
ret = __cpu_notify(CPU_UP_PREPARE | mod, hcpu, -1, &nr_calls);
if (ret) {
nr_calls--;
- printk("%s: attempt to bring up CPU %u failed\n",
+ printk(KERN_WARNING "%s: attempt to bring up CPU %u failed\n",
__func__, cpu);
goto out_notify;
}
@@ -450,14 +449,14 @@ void __ref enable_nonboot_cpus(void)
if (cpumask_empty(frozen_cpus))
goto out;
- printk("Enabling non-boot CPUs ...\n");
+ printk(KERN_INFO "Enabling non-boot CPUs ...\n");
arch_enable_nonboot_cpus_begin();
for_each_cpu(cpu, frozen_cpus) {
error = _cpu_up(cpu, 1);
if (!error) {
- printk("CPU%d is up\n", cpu);
+ printk(KERN_INFO "CPU%d is up\n", cpu);
continue;
}
printk(KERN_WARNING "Error taking CPU%d up: %d\n", cpu, error);
@@ -509,7 +508,7 @@ void __cpuinit notify_cpu_starting(unsigned int cpu)
*/
/* cpu_bit_bitmap[0] is empty - so we can back into it */
-#define MASK_DECLARE_1(x) [x+1][0] = 1UL << (x)
+#define MASK_DECLARE_1(x) [x+1][0] = (1UL << (x))
#define MASK_DECLARE_2(x) MASK_DECLARE_1(x), MASK_DECLARE_1(x+1)
#define MASK_DECLARE_4(x) MASK_DECLARE_2(x), MASK_DECLARE_2(x+2)
#define MASK_DECLARE_8(x) MASK_DECLARE_4(x), MASK_DECLARE_4(x+4)
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index e92e98189032..9c9b7545c810 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -1015,17 +1015,12 @@ static void cpuset_change_nodemask(struct task_struct *p,
struct cpuset *cs;
int migrate;
const nodemask_t *oldmem = scan->data;
- NODEMASK_ALLOC(nodemask_t, newmems, GFP_KERNEL);
-
- if (!newmems)
- return;
+ static nodemask_t newmems; /* protected by cgroup_mutex */
cs = cgroup_cs(scan->cg);
- guarantee_online_mems(cs, newmems);
+ guarantee_online_mems(cs, &newmems);
- cpuset_change_task_nodemask(p, newmems);
-
- NODEMASK_FREE(newmems);
+ cpuset_change_task_nodemask(p, &newmems);
mm = get_task_mm(p);
if (!mm)
@@ -1164,7 +1159,7 @@ int current_cpuset_is_being_rebound(void)
static int update_relax_domain_level(struct cpuset *cs, s64 val)
{
#ifdef CONFIG_SMP
- if (val < -1 || val >= SD_LV_MAX)
+ if (val < -1 || val >= sched_domain_level_max)
return -EINVAL;
#endif
@@ -1372,14 +1367,10 @@ static int fmeter_getrate(struct fmeter *fmp)
return val;
}
-/* Protected by cgroup_lock */
-static cpumask_var_t cpus_attach;
-
/* Called by cgroups to determine if a cpuset is usable; cgroup_mutex held */
static int cpuset_can_attach(struct cgroup_subsys *ss, struct cgroup *cont,
- struct task_struct *tsk, bool threadgroup)
+ struct task_struct *tsk)
{
- int ret;
struct cpuset *cs = cgroup_cs(cont);
if (cpumask_empty(cs->cpus_allowed) || nodes_empty(cs->mems_allowed))
@@ -1396,29 +1387,42 @@ static int cpuset_can_attach(struct cgroup_subsys *ss, struct cgroup *cont,
if (tsk->flags & PF_THREAD_BOUND)
return -EINVAL;
- ret = security_task_setscheduler(tsk);
- if (ret)
- return ret;
- if (threadgroup) {
- struct task_struct *c;
-
- rcu_read_lock();
- list_for_each_entry_rcu(c, &tsk->thread_group, thread_group) {
- ret = security_task_setscheduler(c);
- if (ret) {
- rcu_read_unlock();
- return ret;
- }
- }
- rcu_read_unlock();
- }
return 0;
}
-static void cpuset_attach_task(struct task_struct *tsk, nodemask_t *to,
- struct cpuset *cs)
+static int cpuset_can_attach_task(struct cgroup *cgrp, struct task_struct *task)
+{
+ return security_task_setscheduler(task);
+}
+
+/*
+ * Protected by cgroup_lock. The nodemasks must be stored globally because
+ * dynamically allocating them is not allowed in pre_attach, and they must
+ * persist among pre_attach, attach_task, and attach.
+ */
+static cpumask_var_t cpus_attach;
+static nodemask_t cpuset_attach_nodemask_from;
+static nodemask_t cpuset_attach_nodemask_to;
+
+/* Set-up work for before attaching each task. */
+static void cpuset_pre_attach(struct cgroup *cont)
+{
+ struct cpuset *cs = cgroup_cs(cont);
+
+ if (cs == &top_cpuset)
+ cpumask_copy(cpus_attach, cpu_possible_mask);
+ else
+ guarantee_online_cpus(cs, cpus_attach);
+
+ guarantee_online_mems(cs, &cpuset_attach_nodemask_to);
+}
+
+/* Per-thread attachment work. */
+static void cpuset_attach_task(struct cgroup *cont, struct task_struct *tsk)
{
int err;
+ struct cpuset *cs = cgroup_cs(cont);
+
/*
* can_attach beforehand should guarantee that this doesn't fail.
* TODO: have a better way to handle failure here
@@ -1426,56 +1430,31 @@ static void cpuset_attach_task(struct task_struct *tsk, nodemask_t *to,
err = set_cpus_allowed_ptr(tsk, cpus_attach);
WARN_ON_ONCE(err);
- cpuset_change_task_nodemask(tsk, to);
+ cpuset_change_task_nodemask(tsk, &cpuset_attach_nodemask_to);
cpuset_update_task_spread_flag(cs, tsk);
-
}
static void cpuset_attach(struct cgroup_subsys *ss, struct cgroup *cont,
- struct cgroup *oldcont, struct task_struct *tsk,
- bool threadgroup)
+ struct cgroup *oldcont, struct task_struct *tsk)
{
struct mm_struct *mm;
struct cpuset *cs = cgroup_cs(cont);
struct cpuset *oldcs = cgroup_cs(oldcont);
- NODEMASK_ALLOC(nodemask_t, from, GFP_KERNEL);
- NODEMASK_ALLOC(nodemask_t, to, GFP_KERNEL);
- if (from == NULL || to == NULL)
- goto alloc_fail;
-
- if (cs == &top_cpuset) {
- cpumask_copy(cpus_attach, cpu_possible_mask);
- } else {
- guarantee_online_cpus(cs, cpus_attach);
- }
- guarantee_online_mems(cs, to);
-
- /* do per-task migration stuff possibly for each in the threadgroup */
- cpuset_attach_task(tsk, to, cs);
- if (threadgroup) {
- struct task_struct *c;
- rcu_read_lock();
- list_for_each_entry_rcu(c, &tsk->thread_group, thread_group) {
- cpuset_attach_task(c, to, cs);
- }
- rcu_read_unlock();
- }
-
- /* change mm; only needs to be done once even if threadgroup */
- *from = oldcs->mems_allowed;
- *to = cs->mems_allowed;
+ /*
+ * Change mm, possibly for multiple threads in a threadgroup. This is
+ * expensive and may sleep.
+ */
+ cpuset_attach_nodemask_from = oldcs->mems_allowed;
+ cpuset_attach_nodemask_to = cs->mems_allowed;
mm = get_task_mm(tsk);
if (mm) {
- mpol_rebind_mm(mm, to);
+ mpol_rebind_mm(mm, &cpuset_attach_nodemask_to);
if (is_memory_migrate(cs))
- cpuset_migrate_mm(mm, from, to);
+ cpuset_migrate_mm(mm, &cpuset_attach_nodemask_from,
+ &cpuset_attach_nodemask_to);
mmput(mm);
}
-
-alloc_fail:
- NODEMASK_FREE(from);
- NODEMASK_FREE(to);
}
/* The various types of files and directories in a cpuset file system */
@@ -1610,34 +1589,26 @@ out:
* across a page fault.
*/
-static int cpuset_sprintf_cpulist(char *page, struct cpuset *cs)
+static size_t cpuset_sprintf_cpulist(char *page, struct cpuset *cs)
{
- int ret;
+ size_t count;
mutex_lock(&callback_mutex);
- ret = cpulist_scnprintf(page, PAGE_SIZE, cs->cpus_allowed);
+ count = cpulist_scnprintf(page, PAGE_SIZE, cs->cpus_allowed);
mutex_unlock(&callback_mutex);
- return ret;
+ return count;
}
-static int cpuset_sprintf_memlist(char *page, struct cpuset *cs)
+static size_t cpuset_sprintf_memlist(char *page, struct cpuset *cs)
{
- NODEMASK_ALLOC(nodemask_t, mask, GFP_KERNEL);
- int retval;
-
- if (mask == NULL)
- return -ENOMEM;
+ size_t count;
mutex_lock(&callback_mutex);
- *mask = cs->mems_allowed;
+ count = nodelist_scnprintf(page, PAGE_SIZE, cs->mems_allowed);
mutex_unlock(&callback_mutex);
- retval = nodelist_scnprintf(page, PAGE_SIZE, *mask);
-
- NODEMASK_FREE(mask);
-
- return retval;
+ return count;
}
static ssize_t cpuset_common_file_read(struct cgroup *cont,
@@ -1831,10 +1802,9 @@ static int cpuset_populate(struct cgroup_subsys *ss, struct cgroup *cont)
}
/*
- * post_clone() is called at the end of cgroup_clone().
- * 'cgroup' was just created automatically as a result of
- * a cgroup_clone(), and the current task is about to
- * be moved into 'cgroup'.
+ * post_clone() is called during cgroup_create() when the
+ * clone_children mount argument was specified. The cgroup
+ * can not yet have any tasks.
*
* Currently we refuse to set up the cgroup - thereby
* refusing the task to be entered, and as a result refusing
@@ -1862,8 +1832,10 @@ static void cpuset_post_clone(struct cgroup_subsys *ss,
cs = cgroup_cs(cgroup);
parent_cs = cgroup_cs(parent);
+ mutex_lock(&callback_mutex);
cs->mems_allowed = parent_cs->mems_allowed;
cpumask_copy(cs->cpus_allowed, parent_cs->cpus_allowed);
+ mutex_unlock(&callback_mutex);
return;
}
@@ -1931,6 +1903,9 @@ struct cgroup_subsys cpuset_subsys = {
.create = cpuset_create,
.destroy = cpuset_destroy,
.can_attach = cpuset_can_attach,
+ .can_attach_task = cpuset_can_attach_task,
+ .pre_attach = cpuset_pre_attach,
+ .attach_task = cpuset_attach_task,
.attach = cpuset_attach,
.populate = cpuset_populate,
.post_clone = cpuset_post_clone,
@@ -2066,10 +2041,7 @@ static void scan_for_empty_cpusets(struct cpuset *root)
struct cpuset *cp; /* scans cpusets being updated */
struct cpuset *child; /* scans child cpusets of cp */
struct cgroup *cont;
- NODEMASK_ALLOC(nodemask_t, oldmems, GFP_KERNEL);
-
- if (oldmems == NULL)
- return;
+ static nodemask_t oldmems; /* protected by cgroup_mutex */
list_add_tail((struct list_head *)&root->stack_list, &queue);
@@ -2086,7 +2058,7 @@ static void scan_for_empty_cpusets(struct cpuset *root)
nodes_subset(cp->mems_allowed, node_states[N_HIGH_MEMORY]))
continue;
- *oldmems = cp->mems_allowed;
+ oldmems = cp->mems_allowed;
/* Remove offline cpus and mems from this cpuset. */
mutex_lock(&callback_mutex);
@@ -2102,10 +2074,9 @@ static void scan_for_empty_cpusets(struct cpuset *root)
remove_tasks_in_empty_cpuset(cp);
else {
update_tasks_cpumask(cp, NULL);
- update_tasks_nodemask(cp, oldmems, NULL);
+ update_tasks_nodemask(cp, &oldmems, NULL);
}
}
- NODEMASK_FREE(oldmems);
}
/*
@@ -2147,19 +2118,16 @@ void cpuset_update_active_cpus(void)
static int cpuset_track_online_nodes(struct notifier_block *self,
unsigned long action, void *arg)
{
- NODEMASK_ALLOC(nodemask_t, oldmems, GFP_KERNEL);
-
- if (oldmems == NULL)
- return NOTIFY_DONE;
+ static nodemask_t oldmems; /* protected by cgroup_mutex */
cgroup_lock();
switch (action) {
case MEM_ONLINE:
- *oldmems = top_cpuset.mems_allowed;
+ oldmems = top_cpuset.mems_allowed;
mutex_lock(&callback_mutex);
top_cpuset.mems_allowed = node_states[N_HIGH_MEMORY];
mutex_unlock(&callback_mutex);
- update_tasks_nodemask(&top_cpuset, oldmems, NULL);
+ update_tasks_nodemask(&top_cpuset, &oldmems, NULL);
break;
case MEM_OFFLINE:
/*
@@ -2173,7 +2141,6 @@ static int cpuset_track_online_nodes(struct notifier_block *self,
}
cgroup_unlock();
- NODEMASK_FREE(oldmems);
return NOTIFY_OK;
}
#endif
@@ -2223,7 +2190,7 @@ int cpuset_cpus_allowed_fallback(struct task_struct *tsk)
rcu_read_lock();
cs = task_cs(tsk);
if (cs)
- cpumask_copy(&tsk->cpus_allowed, cs->cpus_allowed);
+ do_set_cpus_allowed(tsk, cs->cpus_allowed);
rcu_read_unlock();
/*
@@ -2250,7 +2217,7 @@ int cpuset_cpus_allowed_fallback(struct task_struct *tsk)
* Like above we can temporary set any mask and rely on
* set_cpus_allowed_ptr() as synchronization point.
*/
- cpumask_copy(&tsk->cpus_allowed, cpu_possible_mask);
+ do_set_cpus_allowed(tsk, cpu_possible_mask);
cpu = cpumask_any(cpu_active_mask);
}
diff --git a/kernel/crash_dump.c b/kernel/crash_dump.c
new file mode 100644
index 000000000000..5f85690285d4
--- /dev/null
+++ b/kernel/crash_dump.c
@@ -0,0 +1,34 @@
+#include <linux/kernel.h>
+#include <linux/crash_dump.h>
+#include <linux/init.h>
+#include <linux/errno.h>
+#include <linux/module.h>
+
+/*
+ * If we have booted due to a crash, max_pfn will be a very low value. We need
+ * to know the amount of memory that the previous kernel used.
+ */
+unsigned long saved_max_pfn;
+
+/*
+ * stores the physical address of elf header of crash image
+ *
+ * Note: elfcorehdr_addr is not just limited to vmcore. It is also used by
+ * is_kdump_kernel() to determine if we are booting after a panic. Hence put
+ * it under CONFIG_CRASH_DUMP and not CONFIG_PROC_VMCORE.
+ */
+unsigned long long elfcorehdr_addr = ELFCORE_ADDR_MAX;
+
+/*
+ * elfcorehdr= specifies the location of elf core header stored by the crashed
+ * kernel. This option will be passed by kexec loader to the capture kernel.
+ */
+static int __init setup_elfcorehdr(char *arg)
+{
+ char *end;
+ if (!arg)
+ return -EINVAL;
+ elfcorehdr_addr = memparse(arg, &end);
+ return end > arg ? 0 : -EINVAL;
+}
+early_param("elfcorehdr", setup_elfcorehdr);
diff --git a/kernel/cred.c b/kernel/cred.c
index 3a9d6dd53a6c..174fa84eca30 100644
--- a/kernel/cred.c
+++ b/kernel/cred.c
@@ -1,4 +1,4 @@
-/* Task credentials management - see Documentation/credentials.txt
+/* Task credentials management - see Documentation/security/credentials.txt
*
* Copyright (C) 2008 Red Hat, Inc. All Rights Reserved.
* Written by David Howells (dhowells@redhat.com)
@@ -35,7 +35,7 @@ static struct kmem_cache *cred_jar;
static struct thread_group_cred init_tgcred = {
.usage = ATOMIC_INIT(2),
.tgid = 0,
- .lock = SPIN_LOCK_UNLOCKED,
+ .lock = __SPIN_LOCK_UNLOCKED(init_cred.tgcred.lock),
};
#endif
@@ -49,11 +49,12 @@ struct cred init_cred = {
.magic = CRED_MAGIC,
#endif
.securebits = SECUREBITS_DEFAULT,
- .cap_inheritable = CAP_INIT_INH_SET,
+ .cap_inheritable = CAP_EMPTY_SET,
.cap_permitted = CAP_FULL_SET,
- .cap_effective = CAP_INIT_EFF_SET,
- .cap_bset = CAP_INIT_BSET,
+ .cap_effective = CAP_FULL_SET,
+ .cap_bset = CAP_FULL_SET,
.user = INIT_USER,
+ .user_ns = &init_user_ns,
.group_info = &init_groups,
#ifdef CONFIG_KEYS
.tgcred = &init_tgcred,
@@ -410,6 +411,11 @@ int copy_creds(struct task_struct *p, unsigned long clone_flags)
goto error_put;
}
+ /* cache user_ns in cred. Doesn't need a refcount because it will
+ * stay pinned by cred->user
+ */
+ new->user_ns = new->user->user_ns;
+
#ifdef CONFIG_KEYS
/* new threads get their own thread keyrings if their parent already
* had one */
diff --git a/kernel/debug/debug_core.c b/kernel/debug/debug_core.c
index cefd4a11f6d9..bad6786dee88 100644
--- a/kernel/debug/debug_core.c
+++ b/kernel/debug/debug_core.c
@@ -538,7 +538,7 @@ return_normal:
/*
* For single stepping, try to only enter on the processor
- * that was single stepping. To gaurd against a deadlock, the
+ * that was single stepping. To guard against a deadlock, the
* kernel will only try for the value of sstep_tries before
* giving up and continuing on.
*/
diff --git a/kernel/debug/gdbstub.c b/kernel/debug/gdbstub.c
index 481a7bd2dfe7..a11db956dd62 100644
--- a/kernel/debug/gdbstub.c
+++ b/kernel/debug/gdbstub.c
@@ -1093,3 +1093,33 @@ int gdbstub_state(struct kgdb_state *ks, char *cmd)
put_packet(remcom_out_buffer);
return 0;
}
+
+/**
+ * gdbstub_exit - Send an exit message to GDB
+ * @status: The exit code to report.
+ */
+void gdbstub_exit(int status)
+{
+ unsigned char checksum, ch, buffer[3];
+ int loop;
+
+ buffer[0] = 'W';
+ buffer[1] = hex_asc_hi(status);
+ buffer[2] = hex_asc_lo(status);
+
+ dbg_io_ops->write_char('$');
+ checksum = 0;
+
+ for (loop = 0; loop < 3; loop++) {
+ ch = buffer[loop];
+ checksum += ch;
+ dbg_io_ops->write_char(ch);
+ }
+
+ dbg_io_ops->write_char('#');
+ dbg_io_ops->write_char(hex_asc_hi(checksum));
+ dbg_io_ops->write_char(hex_asc_lo(checksum));
+
+ /* make sure the output is flushed, lest the bootloader clobber it */
+ dbg_io_ops->flush();
+}
diff --git a/kernel/debug/kdb/kdb_main.c b/kernel/debug/kdb/kdb_main.c
index bd3e8e29caa3..be14779bcef6 100644
--- a/kernel/debug/kdb/kdb_main.c
+++ b/kernel/debug/kdb/kdb_main.c
@@ -78,7 +78,7 @@ static unsigned int kdb_continue_catastrophic;
static kdbtab_t *kdb_commands;
#define KDB_BASE_CMD_MAX 50
static int kdb_max_commands = KDB_BASE_CMD_MAX;
-static kdbtab_t kdb_base_commands[50];
+static kdbtab_t kdb_base_commands[KDB_BASE_CMD_MAX];
#define for_each_kdbcmd(cmd, num) \
for ((cmd) = kdb_base_commands, (num) = 0; \
num < kdb_max_commands; \
@@ -441,9 +441,9 @@ static int kdb_check_regs(void)
* symbol name, and offset to the caller.
*
* The argument may consist of a numeric value (decimal or
- * hexidecimal), a symbol name, a register name (preceeded by the
+ * hexidecimal), a symbol name, a register name (preceded by the
* percent sign), an environment variable with a numeric value
- * (preceeded by a dollar sign) or a simple arithmetic expression
+ * (preceded by a dollar sign) or a simple arithmetic expression
* consisting of a symbol name, +/-, and a numeric constant value
* (offset).
* Parameters:
@@ -1335,7 +1335,7 @@ void kdb_print_state(const char *text, int value)
* error The hardware-defined error code
* reason2 kdb's current reason code.
* Initially error but can change
- * acording to kdb state.
+ * according to kdb state.
* db_result Result code from break or debug point.
* regs The exception frame at time of fault/breakpoint.
* should always be valid.
@@ -2892,7 +2892,7 @@ static void __init kdb_inittab(void)
"Send a signal to a process", 0, KDB_REPEAT_NONE);
kdb_register_repeat("summary", kdb_summary, "",
"Summarize the system", 4, KDB_REPEAT_NONE);
- kdb_register_repeat("per_cpu", kdb_per_cpu, "",
+ kdb_register_repeat("per_cpu", kdb_per_cpu, "<sym> [<bytes>] [<cpu>]",
"Display per_cpu variables", 3, KDB_REPEAT_NONE);
kdb_register_repeat("grephelp", kdb_grep_help, "",
"Display help on | grep", 0, KDB_REPEAT_NONE);
diff --git a/kernel/debug/kdb/kdb_support.c b/kernel/debug/kdb/kdb_support.c
index 6b2485dcb050..5532dd37aa86 100644
--- a/kernel/debug/kdb/kdb_support.c
+++ b/kernel/debug/kdb/kdb_support.c
@@ -545,7 +545,7 @@ int kdb_putword(unsigned long addr, unsigned long word, size_t size)
* Mask for process state.
* Notes:
* The mask folds data from several sources into a single long value, so
- * be carefull not to overlap the bits. TASK_* bits are in the LSB,
+ * be careful not to overlap the bits. TASK_* bits are in the LSB,
* special cases like UNRUNNABLE are in the MSB. As of 2.6.10-rc1 there
* is no overlap between TASK_* and EXIT_* but that may not always be
* true, so EXIT_* bits are shifted left 16 bits before being stored in
diff --git a/kernel/events/Makefile b/kernel/events/Makefile
new file mode 100644
index 000000000000..1ce23d3d8394
--- /dev/null
+++ b/kernel/events/Makefile
@@ -0,0 +1,6 @@
+ifdef CONFIG_FUNCTION_TRACER
+CFLAGS_REMOVE_core.o = -pg
+endif
+
+obj-y := core.o
+obj-$(CONFIG_HAVE_HW_BREAKPOINT) += hw_breakpoint.o
diff --git a/kernel/perf_event.c b/kernel/events/core.c
index 656222fcf767..d863b3c057bb 100644
--- a/kernel/perf_event.c
+++ b/kernel/events/core.c
@@ -2,8 +2,8 @@
* Performance events core code:
*
* Copyright (C) 2008 Thomas Gleixner <tglx@linutronix.de>
- * Copyright (C) 2008-2009 Red Hat, Inc., Ingo Molnar
- * Copyright (C) 2008-2009 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com>
+ * Copyright (C) 2008-2011 Red Hat, Inc., Ingo Molnar
+ * Copyright (C) 2008-2011 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com>
* Copyright © 2009 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com>
*
* For licensing details see kernel-base/COPYING
@@ -38,13 +38,96 @@
#include <asm/irq_regs.h>
+struct remote_function_call {
+ struct task_struct *p;
+ int (*func)(void *info);
+ void *info;
+ int ret;
+};
+
+static void remote_function(void *data)
+{
+ struct remote_function_call *tfc = data;
+ struct task_struct *p = tfc->p;
+
+ if (p) {
+ tfc->ret = -EAGAIN;
+ if (task_cpu(p) != smp_processor_id() || !task_curr(p))
+ return;
+ }
+
+ tfc->ret = tfc->func(tfc->info);
+}
+
+/**
+ * task_function_call - call a function on the cpu on which a task runs
+ * @p: the task to evaluate
+ * @func: the function to be called
+ * @info: the function call argument
+ *
+ * Calls the function @func when the task is currently running. This might
+ * be on the current CPU, which just calls the function directly
+ *
+ * returns: @func return value, or
+ * -ESRCH - when the process isn't running
+ * -EAGAIN - when the process moved away
+ */
+static int
+task_function_call(struct task_struct *p, int (*func) (void *info), void *info)
+{
+ struct remote_function_call data = {
+ .p = p,
+ .func = func,
+ .info = info,
+ .ret = -ESRCH, /* No such (running) process */
+ };
+
+ if (task_curr(p))
+ smp_call_function_single(task_cpu(p), remote_function, &data, 1);
+
+ return data.ret;
+}
+
+/**
+ * cpu_function_call - call a function on the cpu
+ * @func: the function to be called
+ * @info: the function call argument
+ *
+ * Calls the function @func on the remote cpu.
+ *
+ * returns: @func return value or -ENXIO when the cpu is offline
+ */
+static int cpu_function_call(int cpu, int (*func) (void *info), void *info)
+{
+ struct remote_function_call data = {
+ .p = NULL,
+ .func = func,
+ .info = info,
+ .ret = -ENXIO, /* No such CPU */
+ };
+
+ smp_call_function_single(cpu, remote_function, &data, 1);
+
+ return data.ret;
+}
+
+#define PERF_FLAG_ALL (PERF_FLAG_FD_NO_GROUP |\
+ PERF_FLAG_FD_OUTPUT |\
+ PERF_FLAG_PID_CGROUP)
+
enum event_type_t {
EVENT_FLEXIBLE = 0x1,
EVENT_PINNED = 0x2,
EVENT_ALL = EVENT_FLEXIBLE | EVENT_PINNED,
};
-atomic_t perf_task_events __read_mostly;
+/*
+ * perf_sched_events : >0 events exist
+ * perf_cgroup_events: >0 per-cpu cgroup events exist on this cpu
+ */
+struct jump_label_key perf_sched_events __read_mostly;
+static DEFINE_PER_CPU(atomic_t, perf_cgroup_events);
+
static atomic_t nr_mmap_events __read_mostly;
static atomic_t nr_comm_events __read_mostly;
static atomic_t nr_task_events __read_mostly;
@@ -62,12 +145,30 @@ static struct srcu_struct pmus_srcu;
*/
int sysctl_perf_event_paranoid __read_mostly = 1;
-int sysctl_perf_event_mlock __read_mostly = 512; /* 'free' kb per user */
+/* Minimum for 512 kiB + 1 user control page */
+int sysctl_perf_event_mlock __read_mostly = 512 + (PAGE_SIZE / 1024); /* 'free' kiB per user */
/*
* max perf event sample rate
*/
-int sysctl_perf_event_sample_rate __read_mostly = 100000;
+#define DEFAULT_MAX_SAMPLE_RATE 100000
+int sysctl_perf_event_sample_rate __read_mostly = DEFAULT_MAX_SAMPLE_RATE;
+static int max_samples_per_tick __read_mostly =
+ DIV_ROUND_UP(DEFAULT_MAX_SAMPLE_RATE, HZ);
+
+int perf_proc_update_handler(struct ctl_table *table, int write,
+ void __user *buffer, size_t *lenp,
+ loff_t *ppos)
+{
+ int ret = proc_dointvec(table, write, buffer, lenp, ppos);
+
+ if (ret || !write)
+ return ret;
+
+ max_samples_per_tick = DIV_ROUND_UP(sysctl_perf_event_sample_rate, HZ);
+
+ return 0;
+}
static atomic64_t perf_event_id;
@@ -75,7 +176,11 @@ static void cpu_ctx_sched_out(struct perf_cpu_context *cpuctx,
enum event_type_t event_type);
static void cpu_ctx_sched_in(struct perf_cpu_context *cpuctx,
- enum event_type_t event_type);
+ enum event_type_t event_type,
+ struct task_struct *task);
+
+static void update_context_time(struct perf_event_context *ctx);
+static u64 perf_event_time(struct perf_event *event);
void __weak perf_event_print_debug(void) { }
@@ -89,6 +194,361 @@ static inline u64 perf_clock(void)
return local_clock();
}
+static inline struct perf_cpu_context *
+__get_cpu_context(struct perf_event_context *ctx)
+{
+ return this_cpu_ptr(ctx->pmu->pmu_cpu_context);
+}
+
+#ifdef CONFIG_CGROUP_PERF
+
+/*
+ * Must ensure cgroup is pinned (css_get) before calling
+ * this function. In other words, we cannot call this function
+ * if there is no cgroup event for the current CPU context.
+ */
+static inline struct perf_cgroup *
+perf_cgroup_from_task(struct task_struct *task)
+{
+ return container_of(task_subsys_state(task, perf_subsys_id),
+ struct perf_cgroup, css);
+}
+
+static inline bool
+perf_cgroup_match(struct perf_event *event)
+{
+ struct perf_event_context *ctx = event->ctx;
+ struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
+
+ return !event->cgrp || event->cgrp == cpuctx->cgrp;
+}
+
+static inline void perf_get_cgroup(struct perf_event *event)
+{
+ css_get(&event->cgrp->css);
+}
+
+static inline void perf_put_cgroup(struct perf_event *event)
+{
+ css_put(&event->cgrp->css);
+}
+
+static inline void perf_detach_cgroup(struct perf_event *event)
+{
+ perf_put_cgroup(event);
+ event->cgrp = NULL;
+}
+
+static inline int is_cgroup_event(struct perf_event *event)
+{
+ return event->cgrp != NULL;
+}
+
+static inline u64 perf_cgroup_event_time(struct perf_event *event)
+{
+ struct perf_cgroup_info *t;
+
+ t = per_cpu_ptr(event->cgrp->info, event->cpu);
+ return t->time;
+}
+
+static inline void __update_cgrp_time(struct perf_cgroup *cgrp)
+{
+ struct perf_cgroup_info *info;
+ u64 now;
+
+ now = perf_clock();
+
+ info = this_cpu_ptr(cgrp->info);
+
+ info->time += now - info->timestamp;
+ info->timestamp = now;
+}
+
+static inline void update_cgrp_time_from_cpuctx(struct perf_cpu_context *cpuctx)
+{
+ struct perf_cgroup *cgrp_out = cpuctx->cgrp;
+ if (cgrp_out)
+ __update_cgrp_time(cgrp_out);
+}
+
+static inline void update_cgrp_time_from_event(struct perf_event *event)
+{
+ struct perf_cgroup *cgrp;
+
+ /*
+ * ensure we access cgroup data only when needed and
+ * when we know the cgroup is pinned (css_get)
+ */
+ if (!is_cgroup_event(event))
+ return;
+
+ cgrp = perf_cgroup_from_task(current);
+ /*
+ * Do not update time when cgroup is not active
+ */
+ if (cgrp == event->cgrp)
+ __update_cgrp_time(event->cgrp);
+}
+
+static inline void
+perf_cgroup_set_timestamp(struct task_struct *task,
+ struct perf_event_context *ctx)
+{
+ struct perf_cgroup *cgrp;
+ struct perf_cgroup_info *info;
+
+ /*
+ * ctx->lock held by caller
+ * ensure we do not access cgroup data
+ * unless we have the cgroup pinned (css_get)
+ */
+ if (!task || !ctx->nr_cgroups)
+ return;
+
+ cgrp = perf_cgroup_from_task(task);
+ info = this_cpu_ptr(cgrp->info);
+ info->timestamp = ctx->timestamp;
+}
+
+#define PERF_CGROUP_SWOUT 0x1 /* cgroup switch out every event */
+#define PERF_CGROUP_SWIN 0x2 /* cgroup switch in events based on task */
+
+/*
+ * reschedule events based on the cgroup constraint of task.
+ *
+ * mode SWOUT : schedule out everything
+ * mode SWIN : schedule in based on cgroup for next
+ */
+void perf_cgroup_switch(struct task_struct *task, int mode)
+{
+ struct perf_cpu_context *cpuctx;
+ struct pmu *pmu;
+ unsigned long flags;
+
+ /*
+ * disable interrupts to avoid geting nr_cgroup
+ * changes via __perf_event_disable(). Also
+ * avoids preemption.
+ */
+ local_irq_save(flags);
+
+ /*
+ * we reschedule only in the presence of cgroup
+ * constrained events.
+ */
+ rcu_read_lock();
+
+ list_for_each_entry_rcu(pmu, &pmus, entry) {
+
+ cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
+
+ perf_pmu_disable(cpuctx->ctx.pmu);
+
+ /*
+ * perf_cgroup_events says at least one
+ * context on this CPU has cgroup events.
+ *
+ * ctx->nr_cgroups reports the number of cgroup
+ * events for a context.
+ */
+ if (cpuctx->ctx.nr_cgroups > 0) {
+
+ if (mode & PERF_CGROUP_SWOUT) {
+ cpu_ctx_sched_out(cpuctx, EVENT_ALL);
+ /*
+ * must not be done before ctxswout due
+ * to event_filter_match() in event_sched_out()
+ */
+ cpuctx->cgrp = NULL;
+ }
+
+ if (mode & PERF_CGROUP_SWIN) {
+ WARN_ON_ONCE(cpuctx->cgrp);
+ /* set cgrp before ctxsw in to
+ * allow event_filter_match() to not
+ * have to pass task around
+ */
+ cpuctx->cgrp = perf_cgroup_from_task(task);
+ cpu_ctx_sched_in(cpuctx, EVENT_ALL, task);
+ }
+ }
+
+ perf_pmu_enable(cpuctx->ctx.pmu);
+ }
+
+ rcu_read_unlock();
+
+ local_irq_restore(flags);
+}
+
+static inline void perf_cgroup_sched_out(struct task_struct *task)
+{
+ perf_cgroup_switch(task, PERF_CGROUP_SWOUT);
+}
+
+static inline void perf_cgroup_sched_in(struct task_struct *task)
+{
+ perf_cgroup_switch(task, PERF_CGROUP_SWIN);
+}
+
+static inline int perf_cgroup_connect(int fd, struct perf_event *event,
+ struct perf_event_attr *attr,
+ struct perf_event *group_leader)
+{
+ struct perf_cgroup *cgrp;
+ struct cgroup_subsys_state *css;
+ struct file *file;
+ int ret = 0, fput_needed;
+
+ file = fget_light(fd, &fput_needed);
+ if (!file)
+ return -EBADF;
+
+ css = cgroup_css_from_dir(file, perf_subsys_id);
+ if (IS_ERR(css)) {
+ ret = PTR_ERR(css);
+ goto out;
+ }
+
+ cgrp = container_of(css, struct perf_cgroup, css);
+ event->cgrp = cgrp;
+
+ /* must be done before we fput() the file */
+ perf_get_cgroup(event);
+
+ /*
+ * all events in a group must monitor
+ * the same cgroup because a task belongs
+ * to only one perf cgroup at a time
+ */
+ if (group_leader && group_leader->cgrp != cgrp) {
+ perf_detach_cgroup(event);
+ ret = -EINVAL;
+ }
+out:
+ fput_light(file, fput_needed);
+ return ret;
+}
+
+static inline void
+perf_cgroup_set_shadow_time(struct perf_event *event, u64 now)
+{
+ struct perf_cgroup_info *t;
+ t = per_cpu_ptr(event->cgrp->info, event->cpu);
+ event->shadow_ctx_time = now - t->timestamp;
+}
+
+static inline void
+perf_cgroup_defer_enabled(struct perf_event *event)
+{
+ /*
+ * when the current task's perf cgroup does not match
+ * the event's, we need to remember to call the
+ * perf_mark_enable() function the first time a task with
+ * a matching perf cgroup is scheduled in.
+ */
+ if (is_cgroup_event(event) && !perf_cgroup_match(event))
+ event->cgrp_defer_enabled = 1;
+}
+
+static inline void
+perf_cgroup_mark_enabled(struct perf_event *event,
+ struct perf_event_context *ctx)
+{
+ struct perf_event *sub;
+ u64 tstamp = perf_event_time(event);
+
+ if (!event->cgrp_defer_enabled)
+ return;
+
+ event->cgrp_defer_enabled = 0;
+
+ event->tstamp_enabled = tstamp - event->total_time_enabled;
+ list_for_each_entry(sub, &event->sibling_list, group_entry) {
+ if (sub->state >= PERF_EVENT_STATE_INACTIVE) {
+ sub->tstamp_enabled = tstamp - sub->total_time_enabled;
+ sub->cgrp_defer_enabled = 0;
+ }
+ }
+}
+#else /* !CONFIG_CGROUP_PERF */
+
+static inline bool
+perf_cgroup_match(struct perf_event *event)
+{
+ return true;
+}
+
+static inline void perf_detach_cgroup(struct perf_event *event)
+{}
+
+static inline int is_cgroup_event(struct perf_event *event)
+{
+ return 0;
+}
+
+static inline u64 perf_cgroup_event_cgrp_time(struct perf_event *event)
+{
+ return 0;
+}
+
+static inline void update_cgrp_time_from_event(struct perf_event *event)
+{
+}
+
+static inline void update_cgrp_time_from_cpuctx(struct perf_cpu_context *cpuctx)
+{
+}
+
+static inline void perf_cgroup_sched_out(struct task_struct *task)
+{
+}
+
+static inline void perf_cgroup_sched_in(struct task_struct *task)
+{
+}
+
+static inline int perf_cgroup_connect(pid_t pid, struct perf_event *event,
+ struct perf_event_attr *attr,
+ struct perf_event *group_leader)
+{
+ return -EINVAL;
+}
+
+static inline void
+perf_cgroup_set_timestamp(struct task_struct *task,
+ struct perf_event_context *ctx)
+{
+}
+
+void
+perf_cgroup_switch(struct task_struct *task, struct task_struct *next)
+{
+}
+
+static inline void
+perf_cgroup_set_shadow_time(struct perf_event *event, u64 now)
+{
+}
+
+static inline u64 perf_cgroup_event_time(struct perf_event *event)
+{
+ return 0;
+}
+
+static inline void
+perf_cgroup_defer_enabled(struct perf_event *event)
+{
+}
+
+static inline void
+perf_cgroup_mark_enabled(struct perf_event *event,
+ struct perf_event_context *ctx)
+{
+}
+#endif
+
void perf_pmu_disable(struct pmu *pmu)
{
int *count = this_cpu_ptr(pmu->pmu_disable_count);
@@ -126,14 +586,6 @@ static void get_ctx(struct perf_event_context *ctx)
WARN_ON(!atomic_inc_not_zero(&ctx->refcount));
}
-static void free_ctx(struct rcu_head *head)
-{
- struct perf_event_context *ctx;
-
- ctx = container_of(head, struct perf_event_context, rcu_head);
- kfree(ctx);
-}
-
static void put_ctx(struct perf_event_context *ctx)
{
if (atomic_dec_and_test(&ctx->refcount)) {
@@ -141,7 +593,7 @@ static void put_ctx(struct perf_event_context *ctx)
put_ctx(ctx->parent_ctx);
if (ctx->task)
put_task_struct(ctx->task);
- call_rcu(&ctx->rcu_head, free_ctx);
+ kfree_rcu(ctx, rcu_head);
}
}
@@ -254,7 +706,6 @@ static void perf_unpin_context(struct perf_event_context *ctx)
raw_spin_lock_irqsave(&ctx->lock, flags);
--ctx->pin_count;
raw_spin_unlock_irqrestore(&ctx->lock, flags);
- put_ctx(ctx);
}
/*
@@ -271,6 +722,10 @@ static void update_context_time(struct perf_event_context *ctx)
static u64 perf_event_time(struct perf_event *event)
{
struct perf_event_context *ctx = event->ctx;
+
+ if (is_cgroup_event(event))
+ return perf_cgroup_event_time(event);
+
return ctx ? ctx->time : 0;
}
@@ -285,9 +740,20 @@ static void update_event_times(struct perf_event *event)
if (event->state < PERF_EVENT_STATE_INACTIVE ||
event->group_leader->state < PERF_EVENT_STATE_INACTIVE)
return;
-
- if (ctx->is_active)
+ /*
+ * in cgroup mode, time_enabled represents
+ * the time the event was enabled AND active
+ * tasks were in the monitored cgroup. This is
+ * independent of the activity of the context as
+ * there may be a mix of cgroup and non-cgroup events.
+ *
+ * That is why we treat cgroup events differently
+ * here.
+ */
+ if (is_cgroup_event(event))
run_end = perf_event_time(event);
+ else if (ctx->is_active)
+ run_end = ctx->time;
else
run_end = event->tstamp_stopped;
@@ -299,6 +765,7 @@ static void update_event_times(struct perf_event *event)
run_end = perf_event_time(event);
event->total_time_running = run_end - event->tstamp_running;
+
}
/*
@@ -347,6 +814,9 @@ list_add_event(struct perf_event *event, struct perf_event_context *ctx)
list_add_tail(&event->group_entry, list);
}
+ if (is_cgroup_event(event))
+ ctx->nr_cgroups++;
+
list_add_rcu(&event->event_entry, &ctx->event_list);
if (!ctx->nr_events)
perf_pmu_rotate_start(ctx->pmu);
@@ -465,6 +935,7 @@ static void perf_group_attach(struct perf_event *event)
static void
list_del_event(struct perf_event *event, struct perf_event_context *ctx)
{
+ struct perf_cpu_context *cpuctx;
/*
* We can have double detach due to exit/hot-unplug + close.
*/
@@ -473,6 +944,18 @@ list_del_event(struct perf_event *event, struct perf_event_context *ctx)
event->attach_state &= ~PERF_ATTACH_CONTEXT;
+ if (is_cgroup_event(event)) {
+ ctx->nr_cgroups--;
+ cpuctx = __get_cpu_context(ctx);
+ /*
+ * if there are no more cgroup events
+ * then cler cgrp to avoid stale pointer
+ * in update_cgrp_time_from_cpuctx()
+ */
+ if (!ctx->nr_cgroups)
+ cpuctx->cgrp = NULL;
+ }
+
ctx->nr_events--;
if (event->attr.inherit_stat)
ctx->nr_stat--;
@@ -544,7 +1027,8 @@ out:
static inline int
event_filter_match(struct perf_event *event)
{
- return event->cpu == -1 || event->cpu == smp_processor_id();
+ return (event->cpu == -1 || event->cpu == smp_processor_id())
+ && perf_cgroup_match(event);
}
static void
@@ -562,7 +1046,7 @@ event_sched_out(struct perf_event *event,
*/
if (event->state == PERF_EVENT_STATE_INACTIVE
&& !event_filter_match(event)) {
- delta = ctx->time - event->tstamp_stopped;
+ delta = tstamp - event->tstamp_stopped;
event->tstamp_running += delta;
event->tstamp_stopped = tstamp;
}
@@ -606,47 +1090,30 @@ group_sched_out(struct perf_event *group_event,
cpuctx->exclusive = 0;
}
-static inline struct perf_cpu_context *
-__get_cpu_context(struct perf_event_context *ctx)
-{
- return this_cpu_ptr(ctx->pmu->pmu_cpu_context);
-}
-
/*
* Cross CPU call to remove a performance event
*
* We disable the event on the hardware level first. After that we
* remove it from the context list.
*/
-static void __perf_event_remove_from_context(void *info)
+static int __perf_remove_from_context(void *info)
{
struct perf_event *event = info;
struct perf_event_context *ctx = event->ctx;
struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
- /*
- * If this is a task context, we need to check whether it is
- * the current task context of this cpu. If not it has been
- * scheduled out before the smp call arrived.
- */
- if (ctx->task && cpuctx->task_ctx != ctx)
- return;
-
raw_spin_lock(&ctx->lock);
-
event_sched_out(event, cpuctx, ctx);
-
list_del_event(event, ctx);
-
raw_spin_unlock(&ctx->lock);
+
+ return 0;
}
/*
* Remove the event from a task's (or a CPU's) list of events.
*
- * Must be called with ctx->mutex held.
- *
* CPU events are removed with a smp call. For task events we only
* call when the task is on a CPU.
*
@@ -657,49 +1124,48 @@ static void __perf_event_remove_from_context(void *info)
* When called from perf_event_exit_task, it's OK because the
* context has been detached from its task.
*/
-static void perf_event_remove_from_context(struct perf_event *event)
+static void perf_remove_from_context(struct perf_event *event)
{
struct perf_event_context *ctx = event->ctx;
struct task_struct *task = ctx->task;
+ lockdep_assert_held(&ctx->mutex);
+
if (!task) {
/*
* Per cpu events are removed via an smp call and
* the removal is always successful.
*/
- smp_call_function_single(event->cpu,
- __perf_event_remove_from_context,
- event, 1);
+ cpu_function_call(event->cpu, __perf_remove_from_context, event);
return;
}
retry:
- task_oncpu_function_call(task, __perf_event_remove_from_context,
- event);
+ if (!task_function_call(task, __perf_remove_from_context, event))
+ return;
raw_spin_lock_irq(&ctx->lock);
/*
- * If the context is active we need to retry the smp call.
+ * If we failed to find a running task, but find the context active now
+ * that we've acquired the ctx->lock, retry.
*/
- if (ctx->nr_active && !list_empty(&event->group_entry)) {
+ if (ctx->is_active) {
raw_spin_unlock_irq(&ctx->lock);
goto retry;
}
/*
- * The lock prevents that this context is scheduled in so we
- * can remove the event safely, if the call above did not
- * succeed.
+ * Since the task isn't running, its safe to remove the event, us
+ * holding the ctx->lock ensures the task won't get scheduled in.
*/
- if (!list_empty(&event->group_entry))
- list_del_event(event, ctx);
+ list_del_event(event, ctx);
raw_spin_unlock_irq(&ctx->lock);
}
/*
* Cross CPU call to disable a performance event
*/
-static void __perf_event_disable(void *info)
+static int __perf_event_disable(void *info)
{
struct perf_event *event = info;
struct perf_event_context *ctx = event->ctx;
@@ -708,9 +1174,12 @@ static void __perf_event_disable(void *info)
/*
* If this is a per-task event, need to check whether this
* event's task is the current task on this cpu.
+ *
+ * Can trigger due to concurrent perf_event_context_sched_out()
+ * flipping contexts around.
*/
if (ctx->task && cpuctx->task_ctx != ctx)
- return;
+ return -EINVAL;
raw_spin_lock(&ctx->lock);
@@ -720,6 +1189,7 @@ static void __perf_event_disable(void *info)
*/
if (event->state >= PERF_EVENT_STATE_INACTIVE) {
update_context_time(ctx);
+ update_cgrp_time_from_event(event);
update_group_times(event);
if (event == event->group_leader)
group_sched_out(event, cpuctx, ctx);
@@ -729,6 +1199,8 @@ static void __perf_event_disable(void *info)
}
raw_spin_unlock(&ctx->lock);
+
+ return 0;
}
/*
@@ -753,13 +1225,13 @@ void perf_event_disable(struct perf_event *event)
/*
* Disable the event on the cpu that it's on
*/
- smp_call_function_single(event->cpu, __perf_event_disable,
- event, 1);
+ cpu_function_call(event->cpu, __perf_event_disable, event);
return;
}
retry:
- task_oncpu_function_call(task, __perf_event_disable, event);
+ if (!task_function_call(task, __perf_event_disable, event))
+ return;
raw_spin_lock_irq(&ctx->lock);
/*
@@ -767,6 +1239,11 @@ retry:
*/
if (event->state == PERF_EVENT_STATE_ACTIVE) {
raw_spin_unlock_irq(&ctx->lock);
+ /*
+ * Reload the task pointer, it might have been changed by
+ * a concurrent perf_event_context_sched_out().
+ */
+ task = ctx->task;
goto retry;
}
@@ -778,10 +1255,44 @@ retry:
update_group_times(event);
event->state = PERF_EVENT_STATE_OFF;
}
-
raw_spin_unlock_irq(&ctx->lock);
}
+static void perf_set_shadow_time(struct perf_event *event,
+ struct perf_event_context *ctx,
+ u64 tstamp)
+{
+ /*
+ * use the correct time source for the time snapshot
+ *
+ * We could get by without this by leveraging the
+ * fact that to get to this function, the caller
+ * has most likely already called update_context_time()
+ * and update_cgrp_time_xx() and thus both timestamp
+ * are identical (or very close). Given that tstamp is,
+ * already adjusted for cgroup, we could say that:
+ * tstamp - ctx->timestamp
+ * is equivalent to
+ * tstamp - cgrp->timestamp.
+ *
+ * Then, in perf_output_read(), the calculation would
+ * work with no changes because:
+ * - event is guaranteed scheduled in
+ * - no scheduled out in between
+ * - thus the timestamp would be the same
+ *
+ * But this is a bit hairy.
+ *
+ * So instead, we have an explicit cgroup call to remain
+ * within the time time source all along. We believe it
+ * is cleaner and simpler to understand.
+ */
+ if (is_cgroup_event(event))
+ perf_cgroup_set_shadow_time(event, tstamp);
+ else
+ event->shadow_ctx_time = tstamp - ctx->timestamp;
+}
+
#define MAX_INTERRUPTS (~0ULL)
static void perf_log_throttle(struct perf_event *event, int enable);
@@ -822,7 +1333,7 @@ event_sched_in(struct perf_event *event,
event->tstamp_running += tstamp - event->tstamp_stopped;
- event->shadow_ctx_time = tstamp - ctx->timestamp;
+ perf_set_shadow_time(event, ctx, tstamp);
if (!is_software_event(event))
cpuctx->active_oncpu++;
@@ -943,12 +1454,15 @@ static void add_event_to_ctx(struct perf_event *event,
event->tstamp_stopped = tstamp;
}
+static void perf_event_context_sched_in(struct perf_event_context *ctx,
+ struct task_struct *tsk);
+
/*
* Cross CPU call to install and enable a performance event
*
* Must be called with ctx->mutex held
*/
-static void __perf_install_in_context(void *info)
+static int __perf_install_in_context(void *info)
{
struct perf_event *event = info;
struct perf_event_context *ctx = event->ctx;
@@ -957,21 +1471,22 @@ static void __perf_install_in_context(void *info)
int err;
/*
- * If this is a task context, we need to check whether it is
- * the current task context of this cpu. If not it has been
- * scheduled out before the smp call arrived.
- * Or possibly this is the right context but it isn't
- * on this cpu because it had no events.
+ * In case we're installing a new context to an already running task,
+ * could also happen before perf_event_task_sched_in() on architectures
+ * which do context switches with IRQs enabled.
*/
- if (ctx->task && cpuctx->task_ctx != ctx) {
- if (cpuctx->task_ctx || ctx->task != current)
- return;
- cpuctx->task_ctx = ctx;
- }
+ if (ctx->task && !cpuctx->task_ctx)
+ perf_event_context_sched_in(ctx, ctx->task);
raw_spin_lock(&ctx->lock);
ctx->is_active = 1;
update_context_time(ctx);
+ /*
+ * update cgrp time only if current cgrp
+ * matches event->cgrp. Must be done before
+ * calling add_event_to_ctx()
+ */
+ update_cgrp_time_from_event(event);
add_event_to_ctx(event, ctx);
@@ -1012,6 +1527,8 @@ static void __perf_install_in_context(void *info)
unlock:
raw_spin_unlock(&ctx->lock);
+
+ return 0;
}
/*
@@ -1023,8 +1540,6 @@ unlock:
* If the event is attached to a task which is on a CPU we use a smp
* call to enable it in the task context. The task might have been
* scheduled away, but we check this in the smp call again.
- *
- * Must be called with ctx->mutex held.
*/
static void
perf_install_in_context(struct perf_event_context *ctx,
@@ -1033,6 +1548,8 @@ perf_install_in_context(struct perf_event_context *ctx,
{
struct task_struct *task = ctx->task;
+ lockdep_assert_held(&ctx->mutex);
+
event->ctx = ctx;
if (!task) {
@@ -1040,31 +1557,29 @@ perf_install_in_context(struct perf_event_context *ctx,
* Per cpu events are installed via an smp call and
* the install is always successful.
*/
- smp_call_function_single(cpu, __perf_install_in_context,
- event, 1);
+ cpu_function_call(cpu, __perf_install_in_context, event);
return;
}
retry:
- task_oncpu_function_call(task, __perf_install_in_context,
- event);
+ if (!task_function_call(task, __perf_install_in_context, event))
+ return;
raw_spin_lock_irq(&ctx->lock);
/*
- * we need to retry the smp call.
+ * If we failed to find a running task, but find the context active now
+ * that we've acquired the ctx->lock, retry.
*/
- if (ctx->is_active && list_empty(&event->group_entry)) {
+ if (ctx->is_active) {
raw_spin_unlock_irq(&ctx->lock);
goto retry;
}
/*
- * The lock prevents that this context is scheduled in so we
- * can add the event safely, if it the call above did not
- * succeed.
+ * Since the task isn't running, its safe to add the event, us holding
+ * the ctx->lock ensures the task won't get scheduled in.
*/
- if (list_empty(&event->group_entry))
- add_event_to_ctx(event, ctx);
+ add_event_to_ctx(event, ctx);
raw_spin_unlock_irq(&ctx->lock);
}
@@ -1093,7 +1608,7 @@ static void __perf_event_mark_enabled(struct perf_event *event,
/*
* Cross CPU call to enable a performance event
*/
-static void __perf_event_enable(void *info)
+static int __perf_event_enable(void *info)
{
struct perf_event *event = info;
struct perf_event_context *ctx = event->ctx;
@@ -1101,26 +1616,27 @@ static void __perf_event_enable(void *info)
struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
int err;
- /*
- * If this is a per-task event, need to check whether this
- * event's task is the current task on this cpu.
- */
- if (ctx->task && cpuctx->task_ctx != ctx) {
- if (cpuctx->task_ctx || ctx->task != current)
- return;
- cpuctx->task_ctx = ctx;
- }
+ if (WARN_ON_ONCE(!ctx->is_active))
+ return -EINVAL;
raw_spin_lock(&ctx->lock);
- ctx->is_active = 1;
update_context_time(ctx);
if (event->state >= PERF_EVENT_STATE_INACTIVE)
goto unlock;
+
+ /*
+ * set current task's cgroup time reference point
+ */
+ perf_cgroup_set_timestamp(current, ctx);
+
__perf_event_mark_enabled(event, ctx);
- if (!event_filter_match(event))
+ if (!event_filter_match(event)) {
+ if (is_cgroup_event(event))
+ perf_cgroup_defer_enabled(event);
goto unlock;
+ }
/*
* If the event is in a group and isn't the group leader,
@@ -1153,6 +1669,8 @@ static void __perf_event_enable(void *info)
unlock:
raw_spin_unlock(&ctx->lock);
+
+ return 0;
}
/*
@@ -1173,8 +1691,7 @@ void perf_event_enable(struct perf_event *event)
/*
* Enable the event on the cpu that it's on
*/
- smp_call_function_single(event->cpu, __perf_event_enable,
- event, 1);
+ cpu_function_call(event->cpu, __perf_event_enable, event);
return;
}
@@ -1193,8 +1710,15 @@ void perf_event_enable(struct perf_event *event)
event->state = PERF_EVENT_STATE_OFF;
retry:
+ if (!ctx->is_active) {
+ __perf_event_mark_enabled(event, ctx);
+ goto out;
+ }
+
raw_spin_unlock_irq(&ctx->lock);
- task_oncpu_function_call(task, __perf_event_enable, event);
+
+ if (!task_function_call(task, __perf_event_enable, event))
+ return;
raw_spin_lock_irq(&ctx->lock);
@@ -1202,15 +1726,14 @@ retry:
* If the context is active and the event is still off,
* we need to retry the cross-call.
*/
- if (ctx->is_active && event->state == PERF_EVENT_STATE_OFF)
+ if (ctx->is_active && event->state == PERF_EVENT_STATE_OFF) {
+ /*
+ * task could have been flipped by a concurrent
+ * perf_event_context_sched_out()
+ */
+ task = ctx->task;
goto retry;
-
- /*
- * Since we have the lock this context can't be scheduled
- * in, so we can change the state safely.
- */
- if (event->state == PERF_EVENT_STATE_OFF)
- __perf_event_mark_enabled(event, ctx);
+ }
out:
raw_spin_unlock_irq(&ctx->lock);
@@ -1242,6 +1765,7 @@ static void ctx_sched_out(struct perf_event_context *ctx,
if (likely(!ctx->nr_events))
goto out;
update_context_time(ctx);
+ update_cgrp_time_from_cpuctx(cpuctx);
if (!ctx->nr_active)
goto out;
@@ -1354,8 +1878,8 @@ static void perf_event_sync_stat(struct perf_event_context *ctx,
}
}
-void perf_event_context_sched_out(struct task_struct *task, int ctxn,
- struct task_struct *next)
+static void perf_event_context_sched_out(struct task_struct *task, int ctxn,
+ struct task_struct *next)
{
struct perf_event_context *ctx = task->perf_event_ctxp[ctxn];
struct perf_event_context *next_ctx;
@@ -1431,6 +1955,14 @@ void __perf_event_task_sched_out(struct task_struct *task,
for_each_task_context_nr(ctxn)
perf_event_context_sched_out(task, ctxn, next);
+
+ /*
+ * if cgroup events exist on this CPU, then we need
+ * to check if we have to switch out PMU state.
+ * cgroup event are system-wide mode only
+ */
+ if (atomic_read(&__get_cpu_var(perf_cgroup_events)))
+ perf_cgroup_sched_out(task);
}
static void task_ctx_sched_out(struct perf_event_context *ctx,
@@ -1469,6 +2001,10 @@ ctx_pinned_sched_in(struct perf_event_context *ctx,
if (!event_filter_match(event))
continue;
+ /* may need to reset tstamp_enabled */
+ if (is_cgroup_event(event))
+ perf_cgroup_mark_enabled(event, ctx);
+
if (group_can_go_on(event, cpuctx, 1))
group_sched_in(event, cpuctx, ctx);
@@ -1501,6 +2037,10 @@ ctx_flexible_sched_in(struct perf_event_context *ctx,
if (!event_filter_match(event))
continue;
+ /* may need to reset tstamp_enabled */
+ if (is_cgroup_event(event))
+ perf_cgroup_mark_enabled(event, ctx);
+
if (group_can_go_on(event, cpuctx, can_add_hw)) {
if (group_sched_in(event, cpuctx, ctx))
can_add_hw = 0;
@@ -1511,15 +2051,19 @@ ctx_flexible_sched_in(struct perf_event_context *ctx,
static void
ctx_sched_in(struct perf_event_context *ctx,
struct perf_cpu_context *cpuctx,
- enum event_type_t event_type)
+ enum event_type_t event_type,
+ struct task_struct *task)
{
+ u64 now;
+
raw_spin_lock(&ctx->lock);
ctx->is_active = 1;
if (likely(!ctx->nr_events))
goto out;
- ctx->timestamp = perf_clock();
-
+ now = perf_clock();
+ ctx->timestamp = now;
+ perf_cgroup_set_timestamp(task, ctx);
/*
* First go through the list and put on any pinned groups
* in order to give them the best chance of going on.
@@ -1536,11 +2080,12 @@ out:
}
static void cpu_ctx_sched_in(struct perf_cpu_context *cpuctx,
- enum event_type_t event_type)
+ enum event_type_t event_type,
+ struct task_struct *task)
{
struct perf_event_context *ctx = &cpuctx->ctx;
- ctx_sched_in(ctx, cpuctx, event_type);
+ ctx_sched_in(ctx, cpuctx, event_type, task);
}
static void task_ctx_sched_in(struct perf_event_context *ctx,
@@ -1548,15 +2093,16 @@ static void task_ctx_sched_in(struct perf_event_context *ctx,
{
struct perf_cpu_context *cpuctx;
- cpuctx = __get_cpu_context(ctx);
+ cpuctx = __get_cpu_context(ctx);
if (cpuctx->task_ctx == ctx)
return;
- ctx_sched_in(ctx, cpuctx, event_type);
+ ctx_sched_in(ctx, cpuctx, event_type, NULL);
cpuctx->task_ctx = ctx;
}
-void perf_event_context_sched_in(struct perf_event_context *ctx)
+static void perf_event_context_sched_in(struct perf_event_context *ctx,
+ struct task_struct *task)
{
struct perf_cpu_context *cpuctx;
@@ -1572,9 +2118,9 @@ void perf_event_context_sched_in(struct perf_event_context *ctx)
*/
cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE);
- ctx_sched_in(ctx, cpuctx, EVENT_PINNED);
- cpu_ctx_sched_in(cpuctx, EVENT_FLEXIBLE);
- ctx_sched_in(ctx, cpuctx, EVENT_FLEXIBLE);
+ ctx_sched_in(ctx, cpuctx, EVENT_PINNED, task);
+ cpu_ctx_sched_in(cpuctx, EVENT_FLEXIBLE, task);
+ ctx_sched_in(ctx, cpuctx, EVENT_FLEXIBLE, task);
cpuctx->task_ctx = ctx;
@@ -1607,8 +2153,15 @@ void __perf_event_task_sched_in(struct task_struct *task)
if (likely(!ctx))
continue;
- perf_event_context_sched_in(ctx);
+ perf_event_context_sched_in(ctx, task);
}
+ /*
+ * if cgroup events exist on this CPU, then we need
+ * to check if we have to switch in PMU state.
+ * cgroup event are system-wide mode only
+ */
+ if (atomic_read(&__get_cpu_var(perf_cgroup_events)))
+ perf_cgroup_sched_in(task);
}
static u64 perf_calculate_period(struct perf_event *event, u64 nsec, u64 count)
@@ -1638,7 +2191,7 @@ static u64 perf_calculate_period(struct perf_event *event, u64 nsec, u64 count)
* Reduce accuracy by one bit such that @a and @b converge
* to a similar magnitude.
*/
-#define REDUCE_FLS(a, b) \
+#define REDUCE_FLS(a, b) \
do { \
if (a##_fls > b##_fls) { \
a >>= 1; \
@@ -1808,7 +2361,7 @@ static void perf_rotate_context(struct perf_cpu_context *cpuctx)
if (ctx)
rotate_ctx(ctx);
- cpu_ctx_sched_in(cpuctx, EVENT_FLEXIBLE);
+ cpu_ctx_sched_in(cpuctx, EVENT_FLEXIBLE, current);
if (ctx)
task_ctx_sched_in(ctx, EVENT_FLEXIBLE);
@@ -1863,6 +2416,14 @@ static void perf_event_enable_on_exec(struct perf_event_context *ctx)
if (!ctx || !ctx->nr_events)
goto out;
+ /*
+ * We must ctxsw out cgroup events to avoid conflict
+ * when invoking perf_task_event_sched_in() later on
+ * in this function. Otherwise we end up trying to
+ * ctxswin cgroup events which are already scheduled
+ * in.
+ */
+ perf_cgroup_sched_out(current);
task_ctx_sched_out(ctx, EVENT_ALL);
raw_spin_lock(&ctx->lock);
@@ -1887,7 +2448,10 @@ static void perf_event_enable_on_exec(struct perf_event_context *ctx)
raw_spin_unlock(&ctx->lock);
- perf_event_context_sched_in(ctx);
+ /*
+ * Also calls ctxswin for cgroup events, if any:
+ */
+ perf_event_context_sched_in(ctx, ctx->task);
out:
local_irq_restore(flags);
}
@@ -1912,8 +2476,10 @@ static void __perf_event_read(void *info)
return;
raw_spin_lock(&ctx->lock);
- if (ctx->is_active)
+ if (ctx->is_active) {
update_context_time(ctx);
+ update_cgrp_time_from_event(event);
+ }
update_event_times(event);
if (event->state == PERF_EVENT_STATE_ACTIVE)
event->pmu->read(event);
@@ -1944,8 +2510,10 @@ static u64 perf_event_read(struct perf_event *event)
* (e.g., thread is blocked), in that case
* we cannot update context time
*/
- if (ctx->is_active)
+ if (ctx->is_active) {
update_context_time(ctx);
+ update_cgrp_time_from_event(event);
+ }
update_event_times(event);
raw_spin_unlock_irqrestore(&ctx->lock, flags);
}
@@ -2224,6 +2792,9 @@ errout:
}
+/*
+ * Returns a matching context with refcount and pincount.
+ */
static struct perf_event_context *
find_get_context(struct pmu *pmu, struct task_struct *task, int cpu)
{
@@ -2248,6 +2819,7 @@ find_get_context(struct pmu *pmu, struct task_struct *task, int cpu)
cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu);
ctx = &cpuctx->ctx;
get_ctx(ctx);
+ ++ctx->pin_count;
return ctx;
}
@@ -2261,6 +2833,7 @@ retry:
ctx = perf_lock_task_context(task, ctxn, &flags);
if (ctx) {
unclone_ctx(ctx);
+ ++ctx->pin_count;
raw_spin_unlock_irqrestore(&ctx->lock, flags);
}
@@ -2282,8 +2855,10 @@ retry:
err = -ESRCH;
else if (task->perf_event_ctxp[ctxn])
err = -EAGAIN;
- else
+ else {
+ ++ctx->pin_count;
rcu_assign_pointer(task->perf_event_ctxp[ctxn], ctx);
+ }
mutex_unlock(&task->perf_event_mutex);
if (unlikely(err)) {
@@ -2323,7 +2898,7 @@ static void free_event(struct perf_event *event)
if (!event->parent) {
if (event->attach_state & PERF_ATTACH_TASK)
- jump_label_dec(&perf_task_events);
+ jump_label_dec(&perf_sched_events);
if (event->attr.mmap || event->attr.mmap_data)
atomic_dec(&nr_mmap_events);
if (event->attr.comm)
@@ -2332,6 +2907,10 @@ static void free_event(struct perf_event *event)
atomic_dec(&nr_task_events);
if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN)
put_callchain_buffers();
+ if (is_cgroup_event(event)) {
+ atomic_dec(&per_cpu(perf_cgroup_events, event->cpu));
+ jump_label_dec(&perf_sched_events);
+ }
}
if (event->buffer) {
@@ -2339,6 +2918,9 @@ static void free_event(struct perf_event *event)
event->buffer = NULL;
}
+ if (is_cgroup_event(event))
+ perf_detach_cgroup(event);
+
if (event->destroy)
event->destroy(event);
@@ -4406,26 +4988,14 @@ static int __perf_event_overflow(struct perf_event *event, int nmi,
if (unlikely(!is_sampling_event(event)))
return 0;
- if (!throttle) {
- hwc->interrupts++;
- } else {
- if (hwc->interrupts != MAX_INTERRUPTS) {
- hwc->interrupts++;
- if (HZ * hwc->interrupts >
- (u64)sysctl_perf_event_sample_rate) {
- hwc->interrupts = MAX_INTERRUPTS;
- perf_log_throttle(event, 0);
- ret = 1;
- }
- } else {
- /*
- * Keep re-disabling events even though on the previous
- * pass we disabled it - just in case we raced with a
- * sched-in and the event got enabled again:
- */
+ if (unlikely(hwc->interrupts >= max_samples_per_tick)) {
+ if (throttle) {
+ hwc->interrupts = MAX_INTERRUPTS;
+ perf_log_throttle(event, 0);
ret = 1;
}
- }
+ } else
+ hwc->interrupts++;
if (event->attr.freq) {
u64 now = perf_clock();
@@ -4458,6 +5028,14 @@ static int __perf_event_overflow(struct perf_event *event, int nmi,
else
perf_event_output(event, nmi, data, regs);
+ if (event->fasync && event->pending_kill) {
+ if (nmi) {
+ event->pending_wakeup = 1;
+ irq_work_queue(&event->pending);
+ } else
+ perf_event_wakeup(event);
+ }
+
return ret;
}
@@ -4567,7 +5145,7 @@ static int perf_exclude_event(struct perf_event *event,
struct pt_regs *regs)
{
if (event->hw.state & PERF_HES_STOPPED)
- return 0;
+ return 1;
if (regs) {
if (event->attr.exclude_user && user_mode(regs))
@@ -4753,14 +5331,6 @@ swevent_hlist_deref(struct swevent_htable *swhash)
lockdep_is_held(&swhash->hlist_mutex));
}
-static void swevent_hlist_release_rcu(struct rcu_head *rcu_head)
-{
- struct swevent_hlist *hlist;
-
- hlist = container_of(rcu_head, struct swevent_hlist, rcu_head);
- kfree(hlist);
-}
-
static void swevent_hlist_release(struct swevent_htable *swhash)
{
struct swevent_hlist *hlist = swevent_hlist_deref(swhash);
@@ -4769,7 +5339,7 @@ static void swevent_hlist_release(struct swevent_htable *swhash)
return;
rcu_assign_pointer(swhash->swevent_hlist, NULL);
- call_rcu(&hlist->rcu_head, swevent_hlist_release_rcu);
+ kfree_rcu(hlist, rcu_head);
}
static void swevent_hlist_put_cpu(struct perf_event *event, int cpu)
@@ -4851,7 +5421,7 @@ fail:
return err;
}
-atomic_t perf_swevent_enabled[PERF_COUNT_SW_MAX];
+struct jump_label_key perf_swevent_enabled[PERF_COUNT_SW_MAX];
static void sw_perf_event_destroy(struct perf_event *event)
{
@@ -4923,6 +5493,8 @@ static int perf_tp_event_match(struct perf_event *event,
struct perf_sample_data *data,
struct pt_regs *regs)
{
+ if (event->hw.state & PERF_HES_STOPPED)
+ return 0;
/*
* All tracepoints are from kernel-space.
*/
@@ -5062,6 +5634,10 @@ static enum hrtimer_restart perf_swevent_hrtimer(struct hrtimer *hrtimer)
u64 period;
event = container_of(hrtimer, struct perf_event, hw.hrtimer);
+
+ if (event->state != PERF_EVENT_STATE_ACTIVE)
+ return HRTIMER_NORESTART;
+
event->pmu->read(event);
perf_sample_data_init(&data, 0);
@@ -5088,9 +5664,6 @@ static void perf_swevent_start_hrtimer(struct perf_event *event)
if (!is_sampling_event(event))
return;
- hrtimer_init(&hwc->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
- hwc->hrtimer.function = perf_swevent_hrtimer;
-
period = local64_read(&hwc->period_left);
if (period) {
if (period < 0)
@@ -5117,6 +5690,30 @@ static void perf_swevent_cancel_hrtimer(struct perf_event *event)
}
}
+static void perf_swevent_init_hrtimer(struct perf_event *event)
+{
+ struct hw_perf_event *hwc = &event->hw;
+
+ if (!is_sampling_event(event))
+ return;
+
+ hrtimer_init(&hwc->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
+ hwc->hrtimer.function = perf_swevent_hrtimer;
+
+ /*
+ * Since hrtimers have a fixed rate, we can do a static freq->period
+ * mapping and avoid the whole period adjust feedback stuff.
+ */
+ if (event->attr.freq) {
+ long freq = event->attr.sample_freq;
+
+ event->attr.sample_period = NSEC_PER_SEC / freq;
+ hwc->sample_period = event->attr.sample_period;
+ local64_set(&hwc->period_left, hwc->sample_period);
+ event->attr.freq = 0;
+ }
+}
+
/*
* Software event: cpu wall time clock
*/
@@ -5169,6 +5766,8 @@ static int cpu_clock_event_init(struct perf_event *event)
if (event->attr.config != PERF_COUNT_SW_CPU_CLOCK)
return -ENOENT;
+ perf_swevent_init_hrtimer(event);
+
return 0;
}
@@ -5224,16 +5823,9 @@ static void task_clock_event_del(struct perf_event *event, int flags)
static void task_clock_event_read(struct perf_event *event)
{
- u64 time;
-
- if (!in_nmi()) {
- update_context_time(event->ctx);
- time = event->ctx->time;
- } else {
- u64 now = perf_clock();
- u64 delta = now - event->ctx->timestamp;
- time = event->ctx->time + delta;
- }
+ u64 now = perf_clock();
+ u64 delta = now - event->ctx->timestamp;
+ u64 time = event->ctx->time + delta;
task_clock_event_update(event, time);
}
@@ -5246,6 +5838,8 @@ static int task_clock_event_init(struct perf_event *event)
if (event->attr.config != PERF_COUNT_SW_TASK_CLOCK)
return -ENOENT;
+ perf_swevent_init_hrtimer(event);
+
return 0;
}
@@ -5517,17 +6111,22 @@ struct pmu *perf_init_event(struct perf_event *event)
{
struct pmu *pmu = NULL;
int idx;
+ int ret;
idx = srcu_read_lock(&pmus_srcu);
rcu_read_lock();
pmu = idr_find(&pmu_idr, event->attr.type);
rcu_read_unlock();
- if (pmu)
+ if (pmu) {
+ ret = pmu->event_init(event);
+ if (ret)
+ pmu = ERR_PTR(ret);
goto unlock;
+ }
list_for_each_entry_rcu(pmu, &pmus, entry) {
- int ret = pmu->event_init(event);
+ ret = pmu->event_init(event);
if (!ret)
goto unlock;
@@ -5653,7 +6252,7 @@ done:
if (!event->parent) {
if (event->attach_state & PERF_ATTACH_TASK)
- jump_label_inc(&perf_task_events);
+ jump_label_inc(&perf_sched_events);
if (event->attr.mmap || event->attr.mmap_data)
atomic_inc(&nr_mmap_events);
if (event->attr.comm)
@@ -5828,7 +6427,7 @@ SYSCALL_DEFINE5(perf_event_open,
int err;
/* for future expandability... */
- if (flags & ~(PERF_FLAG_FD_NO_GROUP | PERF_FLAG_FD_OUTPUT))
+ if (flags & ~PERF_FLAG_ALL)
return -EINVAL;
err = perf_copy_attr(attr_uptr, &attr);
@@ -5845,6 +6444,15 @@ SYSCALL_DEFINE5(perf_event_open,
return -EINVAL;
}
+ /*
+ * In cgroup mode, the pid argument is used to pass the fd
+ * opened to the cgroup directory in cgroupfs. The cpu argument
+ * designates the cpu on which to monitor threads from that
+ * cgroup.
+ */
+ if ((flags & PERF_FLAG_PID_CGROUP) && (pid == -1 || cpu == -1))
+ return -EINVAL;
+
event_fd = get_unused_fd_flags(O_RDWR);
if (event_fd < 0)
return event_fd;
@@ -5862,7 +6470,7 @@ SYSCALL_DEFINE5(perf_event_open,
group_leader = NULL;
}
- if (pid != -1) {
+ if (pid != -1 && !(flags & PERF_FLAG_PID_CGROUP)) {
task = find_lively_task_by_vpid(pid);
if (IS_ERR(task)) {
err = PTR_ERR(task);
@@ -5876,6 +6484,19 @@ SYSCALL_DEFINE5(perf_event_open,
goto err_task;
}
+ if (flags & PERF_FLAG_PID_CGROUP) {
+ err = perf_cgroup_connect(pid, event, &attr, group_leader);
+ if (err)
+ goto err_alloc;
+ /*
+ * one more event:
+ * - that has cgroup constraint on event->cpu
+ * - that may need work on context switch
+ */
+ atomic_inc(&per_cpu(perf_cgroup_events, event->cpu));
+ jump_label_inc(&perf_sched_events);
+ }
+
/*
* Special case software events and allow them to be part of
* any hardware group.
@@ -5914,6 +6535,11 @@ SYSCALL_DEFINE5(perf_event_open,
goto err_alloc;
}
+ if (task) {
+ put_task_struct(task);
+ task = NULL;
+ }
+
/*
* Look up the group leader (we will attach this event to it):
*/
@@ -5961,10 +6587,10 @@ SYSCALL_DEFINE5(perf_event_open,
struct perf_event_context *gctx = group_leader->ctx;
mutex_lock(&gctx->mutex);
- perf_event_remove_from_context(group_leader);
+ perf_remove_from_context(group_leader);
list_for_each_entry(sibling, &group_leader->sibling_list,
group_entry) {
- perf_event_remove_from_context(sibling);
+ perf_remove_from_context(sibling);
put_ctx(gctx);
}
mutex_unlock(&gctx->mutex);
@@ -5987,6 +6613,7 @@ SYSCALL_DEFINE5(perf_event_open,
perf_install_in_context(ctx, event, cpu);
++ctx->generation;
+ perf_unpin_context(ctx);
mutex_unlock(&ctx->mutex);
event->owner = current;
@@ -6012,6 +6639,7 @@ SYSCALL_DEFINE5(perf_event_open,
return event_fd;
err_context:
+ perf_unpin_context(ctx);
put_ctx(ctx);
err_alloc:
free_event(event);
@@ -6062,6 +6690,7 @@ perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu,
mutex_lock(&ctx->mutex);
perf_install_in_context(ctx, event, cpu);
++ctx->generation;
+ perf_unpin_context(ctx);
mutex_unlock(&ctx->mutex);
return event;
@@ -6113,17 +6742,20 @@ __perf_event_exit_task(struct perf_event *child_event,
struct perf_event_context *child_ctx,
struct task_struct *child)
{
- struct perf_event *parent_event;
+ if (child_event->parent) {
+ raw_spin_lock_irq(&child_ctx->lock);
+ perf_group_detach(child_event);
+ raw_spin_unlock_irq(&child_ctx->lock);
+ }
- perf_event_remove_from_context(child_event);
+ perf_remove_from_context(child_event);
- parent_event = child_event->parent;
/*
- * It can happen that parent exits first, and has events
+ * It can happen that the parent exits first, and has events
* that are still around due to the child reference. These
- * events need to be zapped - but otherwise linger.
+ * events need to be zapped.
*/
- if (parent_event) {
+ if (child_event->parent) {
sync_child_event(child_event, child);
free_event(child_event);
}
@@ -6422,7 +7054,7 @@ inherit_task_group(struct perf_event *event, struct task_struct *parent,
return 0;
}
- child_ctx = child->perf_event_ctxp[ctxn];
+ child_ctx = child->perf_event_ctxp[ctxn];
if (!child_ctx) {
/*
* This is executed from the parent task context, so
@@ -6537,6 +7169,7 @@ int perf_event_init_context(struct task_struct *child, int ctxn)
mutex_unlock(&parent_ctx->mutex);
perf_unpin_context(parent_ctx);
+ put_ctx(parent_ctx);
return ret;
}
@@ -6606,9 +7239,9 @@ static void __perf_event_exit_context(void *__info)
perf_pmu_rotate_stop(ctx->pmu);
list_for_each_entry_safe(event, tmp, &ctx->pinned_groups, group_entry)
- __perf_event_remove_from_context(event);
+ __perf_remove_from_context(event);
list_for_each_entry_safe(event, tmp, &ctx->flexible_groups, group_entry)
- __perf_event_remove_from_context(event);
+ __perf_remove_from_context(event);
}
static void perf_event_exit_cpu_context(int cpu)
@@ -6732,3 +7365,83 @@ unlock:
return ret;
}
device_initcall(perf_event_sysfs_init);
+
+#ifdef CONFIG_CGROUP_PERF
+static struct cgroup_subsys_state *perf_cgroup_create(
+ struct cgroup_subsys *ss, struct cgroup *cont)
+{
+ struct perf_cgroup *jc;
+
+ jc = kzalloc(sizeof(*jc), GFP_KERNEL);
+ if (!jc)
+ return ERR_PTR(-ENOMEM);
+
+ jc->info = alloc_percpu(struct perf_cgroup_info);
+ if (!jc->info) {
+ kfree(jc);
+ return ERR_PTR(-ENOMEM);
+ }
+
+ return &jc->css;
+}
+
+static void perf_cgroup_destroy(struct cgroup_subsys *ss,
+ struct cgroup *cont)
+{
+ struct perf_cgroup *jc;
+ jc = container_of(cgroup_subsys_state(cont, perf_subsys_id),
+ struct perf_cgroup, css);
+ free_percpu(jc->info);
+ kfree(jc);
+}
+
+static int __perf_cgroup_move(void *info)
+{
+ struct task_struct *task = info;
+ perf_cgroup_switch(task, PERF_CGROUP_SWOUT | PERF_CGROUP_SWIN);
+ return 0;
+}
+
+static void perf_cgroup_move(struct task_struct *task)
+{
+ task_function_call(task, __perf_cgroup_move, task);
+}
+
+static void perf_cgroup_attach(struct cgroup_subsys *ss, struct cgroup *cgrp,
+ struct cgroup *old_cgrp, struct task_struct *task,
+ bool threadgroup)
+{
+ perf_cgroup_move(task);
+ if (threadgroup) {
+ struct task_struct *c;
+ rcu_read_lock();
+ list_for_each_entry_rcu(c, &task->thread_group, thread_group) {
+ perf_cgroup_move(c);
+ }
+ rcu_read_unlock();
+ }
+}
+
+static void perf_cgroup_exit(struct cgroup_subsys *ss, struct cgroup *cgrp,
+ struct cgroup *old_cgrp, struct task_struct *task)
+{
+ /*
+ * cgroup_exit() is called in the copy_process() failure path.
+ * Ignore this case since the task hasn't ran yet, this avoids
+ * trying to poke a half freed task state from generic code.
+ */
+ if (!(task->flags & PF_EXITING))
+ return;
+
+ perf_cgroup_move(task);
+}
+
+struct cgroup_subsys perf_subsys = {
+ .name = "perf_event",
+ .subsys_id = perf_subsys_id,
+ .create = perf_cgroup_create,
+ .destroy = perf_cgroup_destroy,
+ .exit = perf_cgroup_exit,
+ .attach = perf_cgroup_attach,
+};
+#endif /* CONFIG_CGROUP_PERF */
diff --git a/kernel/hw_breakpoint.c b/kernel/events/hw_breakpoint.c
index 086adf25a55e..086adf25a55e 100644
--- a/kernel/hw_breakpoint.c
+++ b/kernel/events/hw_breakpoint.c
diff --git a/kernel/exit.c b/kernel/exit.c
index f9a45ebcc7b1..20a406471525 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -841,7 +841,7 @@ static void exit_notify(struct task_struct *tsk, int group_dead)
/* Let father know we died
*
* Thread signals are configurable, but you aren't going to use
- * that to send signals to arbitary processes.
+ * that to send signals to arbitrary processes.
* That stops right now.
*
* If the parent exec id doesn't match the exec id we saved
@@ -908,6 +908,7 @@ NORET_TYPE void do_exit(long code)
profile_task_exit(tsk);
WARN_ON(atomic_read(&tsk->fs_excl));
+ WARN_ON(blk_needs_flush_plug(tsk));
if (unlikely(in_interrupt()))
panic("Aiee, killing interrupt handler!");
@@ -1015,7 +1016,7 @@ NORET_TYPE void do_exit(long code)
/*
* FIXME: do that only when needed, using sched_exit tracepoint
*/
- flush_ptrace_hw_breakpoint(tsk);
+ ptrace_put_breakpoints(tsk);
exit_notify(tsk, group_dead);
#ifdef CONFIG_NUMA
@@ -1376,11 +1377,23 @@ static int *task_stopped_code(struct task_struct *p, bool ptrace)
return NULL;
}
-/*
- * Handle sys_wait4 work for one task in state TASK_STOPPED. We hold
- * read_lock(&tasklist_lock) on entry. If we return zero, we still hold
- * the lock and this task is uninteresting. If we return nonzero, we have
- * released the lock and the system call should return.
+/**
+ * wait_task_stopped - Wait for %TASK_STOPPED or %TASK_TRACED
+ * @wo: wait options
+ * @ptrace: is the wait for ptrace
+ * @p: task to wait for
+ *
+ * Handle sys_wait4() work for %p in state %TASK_STOPPED or %TASK_TRACED.
+ *
+ * CONTEXT:
+ * read_lock(&tasklist_lock), which is released if return value is
+ * non-zero. Also, grabs and releases @p->sighand->siglock.
+ *
+ * RETURNS:
+ * 0 if wait condition didn't exist and search for other wait conditions
+ * should continue. Non-zero return, -errno on failure and @p's pid on
+ * success, implies that tasklist_lock is released and wait condition
+ * search should terminate.
*/
static int wait_task_stopped(struct wait_opts *wo,
int ptrace, struct task_struct *p)
@@ -1396,6 +1409,9 @@ static int wait_task_stopped(struct wait_opts *wo,
if (!ptrace && !(wo->wo_flags & WUNTRACED))
return 0;
+ if (!task_stopped_code(p, ptrace))
+ return 0;
+
exit_code = 0;
spin_lock_irq(&p->sighand->siglock);
@@ -1537,33 +1553,84 @@ static int wait_consider_task(struct wait_opts *wo, int ptrace,
return 0;
}
- if (likely(!ptrace) && unlikely(task_ptrace(p))) {
+ /* dead body doesn't have much to contribute */
+ if (p->exit_state == EXIT_DEAD)
+ return 0;
+
+ /* slay zombie? */
+ if (p->exit_state == EXIT_ZOMBIE) {
+ /*
+ * A zombie ptracee is only visible to its ptracer.
+ * Notification and reaping will be cascaded to the real
+ * parent when the ptracer detaches.
+ */
+ if (likely(!ptrace) && unlikely(task_ptrace(p))) {
+ /* it will become visible, clear notask_error */
+ wo->notask_error = 0;
+ return 0;
+ }
+
+ /* we don't reap group leaders with subthreads */
+ if (!delay_group_leader(p))
+ return wait_task_zombie(wo, p);
+
/*
- * This child is hidden by ptrace.
- * We aren't allowed to see it now, but eventually we will.
+ * Allow access to stopped/continued state via zombie by
+ * falling through. Clearing of notask_error is complex.
+ *
+ * When !@ptrace:
+ *
+ * If WEXITED is set, notask_error should naturally be
+ * cleared. If not, subset of WSTOPPED|WCONTINUED is set,
+ * so, if there are live subthreads, there are events to
+ * wait for. If all subthreads are dead, it's still safe
+ * to clear - this function will be called again in finite
+ * amount time once all the subthreads are released and
+ * will then return without clearing.
+ *
+ * When @ptrace:
+ *
+ * Stopped state is per-task and thus can't change once the
+ * target task dies. Only continued and exited can happen.
+ * Clear notask_error if WCONTINUED | WEXITED.
+ */
+ if (likely(!ptrace) || (wo->wo_flags & (WCONTINUED | WEXITED)))
+ wo->notask_error = 0;
+ } else {
+ /*
+ * If @p is ptraced by a task in its real parent's group,
+ * hide group stop/continued state when looking at @p as
+ * the real parent; otherwise, a single stop can be
+ * reported twice as group and ptrace stops.
+ *
+ * If a ptracer wants to distinguish the two events for its
+ * own children, it should create a separate process which
+ * takes the role of real parent.
+ */
+ if (likely(!ptrace) && task_ptrace(p) &&
+ same_thread_group(p->parent, p->real_parent))
+ return 0;
+
+ /*
+ * @p is alive and it's gonna stop, continue or exit, so
+ * there always is something to wait for.
*/
wo->notask_error = 0;
- return 0;
}
- if (p->exit_state == EXIT_DEAD)
- return 0;
-
/*
- * We don't reap group leaders with subthreads.
+ * Wait for stopped. Depending on @ptrace, different stopped state
+ * is used and the two don't interact with each other.
*/
- if (p->exit_state == EXIT_ZOMBIE && !delay_group_leader(p))
- return wait_task_zombie(wo, p);
+ ret = wait_task_stopped(wo, ptrace, p);
+ if (ret)
+ return ret;
/*
- * It's stopped or running now, so it might
- * later continue, exit, or stop again.
+ * Wait for continued. There's only one continued state and the
+ * ptracer can consume it which can confuse the real parent. Don't
+ * use WCONTINUED from ptracer. You don't need or want it.
*/
- wo->notask_error = 0;
-
- if (task_stopped_code(p, ptrace))
- return wait_task_stopped(wo, ptrace, p);
-
return wait_task_continued(wo, p);
}
diff --git a/kernel/extable.c b/kernel/extable.c
index 7f8f263f8524..5339705b8241 100644
--- a/kernel/extable.c
+++ b/kernel/extable.c
@@ -72,6 +72,24 @@ int core_kernel_text(unsigned long addr)
return 0;
}
+/**
+ * core_kernel_data - tell if addr points to kernel data
+ * @addr: address to test
+ *
+ * Returns true if @addr passed in is from the core kernel data
+ * section.
+ *
+ * Note: On some archs it may return true for core RODATA, and false
+ * for others. But will always be true for core RW data.
+ */
+int core_kernel_data(unsigned long addr)
+{
+ if (addr >= (unsigned long)_sdata &&
+ addr < (unsigned long)_edata)
+ return 1;
+ return 0;
+}
+
int __kernel_text_address(unsigned long addr)
{
if (core_kernel_text(addr))
diff --git a/kernel/fork.c b/kernel/fork.c
index 25e429152ddc..ca406d916713 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -40,6 +40,7 @@
#include <linux/tracehook.h>
#include <linux/futex.h>
#include <linux/compat.h>
+#include <linux/kthread.h>
#include <linux/task_io_accounting_ops.h>
#include <linux/rcupdate.h>
#include <linux/ptrace.h>
@@ -58,7 +59,6 @@
#include <linux/taskstats_kern.h>
#include <linux/random.h>
#include <linux/tty.h>
-#include <linux/proc_fs.h>
#include <linux/blkdev.h>
#include <linux/fs_struct.h>
#include <linux/magic.h>
@@ -109,20 +109,25 @@ int nr_processes(void)
}
#ifndef __HAVE_ARCH_TASK_STRUCT_ALLOCATOR
-# define alloc_task_struct() kmem_cache_alloc(task_struct_cachep, GFP_KERNEL)
-# define free_task_struct(tsk) kmem_cache_free(task_struct_cachep, (tsk))
+# define alloc_task_struct_node(node) \
+ kmem_cache_alloc_node(task_struct_cachep, GFP_KERNEL, node)
+# define free_task_struct(tsk) \
+ kmem_cache_free(task_struct_cachep, (tsk))
static struct kmem_cache *task_struct_cachep;
#endif
#ifndef __HAVE_ARCH_THREAD_INFO_ALLOCATOR
-static inline struct thread_info *alloc_thread_info(struct task_struct *tsk)
+static struct thread_info *alloc_thread_info_node(struct task_struct *tsk,
+ int node)
{
#ifdef CONFIG_DEBUG_STACK_USAGE
gfp_t mask = GFP_KERNEL | __GFP_ZERO;
#else
gfp_t mask = GFP_KERNEL;
#endif
- return (struct thread_info *)__get_free_pages(mask, THREAD_SIZE_ORDER);
+ struct page *page = alloc_pages_node(node, mask, THREAD_SIZE_ORDER);
+
+ return page ? page_address(page) : NULL;
}
static inline void free_thread_info(struct thread_info *ti)
@@ -193,6 +198,7 @@ void __put_task_struct(struct task_struct *tsk)
if (!profile_handoff_task(tsk))
free_task(tsk);
}
+EXPORT_SYMBOL_GPL(__put_task_struct);
/*
* macro override instead of weak attribute alias, to workaround
@@ -248,16 +254,16 @@ static struct task_struct *dup_task_struct(struct task_struct *orig)
struct task_struct *tsk;
struct thread_info *ti;
unsigned long *stackend;
-
+ int node = tsk_fork_get_node(orig);
int err;
prepare_to_copy(orig);
- tsk = alloc_task_struct();
+ tsk = alloc_task_struct_node(node);
if (!tsk)
return NULL;
- ti = alloc_thread_info(tsk);
+ ti = alloc_thread_info_node(tsk, node);
if (!ti) {
free_task_struct(tsk);
return NULL;
@@ -376,15 +382,14 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
get_file(file);
if (tmp->vm_flags & VM_DENYWRITE)
atomic_dec(&inode->i_writecount);
- spin_lock(&mapping->i_mmap_lock);
+ mutex_lock(&mapping->i_mmap_mutex);
if (tmp->vm_flags & VM_SHARED)
mapping->i_mmap_writable++;
- tmp->vm_truncate_count = mpnt->vm_truncate_count;
flush_dcache_mmap_lock(mapping);
/* insert tmp into the share list, just after mpnt */
vma_prio_tree_add(tmp, mpnt);
flush_dcache_mmap_unlock(mapping);
- spin_unlock(&mapping->i_mmap_lock);
+ mutex_unlock(&mapping->i_mmap_mutex);
}
/*
@@ -479,6 +484,20 @@ static void mm_init_aio(struct mm_struct *mm)
#endif
}
+int mm_init_cpumask(struct mm_struct *mm, struct mm_struct *oldmm)
+{
+#ifdef CONFIG_CPUMASK_OFFSTACK
+ if (!alloc_cpumask_var(&mm->cpu_vm_mask_var, GFP_KERNEL))
+ return -ENOMEM;
+
+ if (oldmm)
+ cpumask_copy(mm_cpumask(mm), mm_cpumask(oldmm));
+ else
+ memset(mm_cpumask(mm), 0, cpumask_size());
+#endif
+ return 0;
+}
+
static struct mm_struct * mm_init(struct mm_struct * mm, struct task_struct *p)
{
atomic_set(&mm->mm_users, 1);
@@ -515,10 +534,20 @@ struct mm_struct * mm_alloc(void)
struct mm_struct * mm;
mm = allocate_mm();
- if (mm) {
- memset(mm, 0, sizeof(*mm));
- mm = mm_init(mm, current);
+ if (!mm)
+ return NULL;
+
+ memset(mm, 0, sizeof(*mm));
+ mm = mm_init(mm, current);
+ if (!mm)
+ return NULL;
+
+ if (mm_init_cpumask(mm, NULL)) {
+ mm_free_pgd(mm);
+ free_mm(mm);
+ return NULL;
}
+
return mm;
}
@@ -530,6 +559,7 @@ struct mm_struct * mm_alloc(void)
void __mmdrop(struct mm_struct *mm)
{
BUG_ON(mm == &init_mm);
+ free_cpumask_var(mm->cpu_vm_mask_var);
mm_free_pgd(mm);
destroy_context(mm);
mmu_notifier_mm_destroy(mm);
@@ -566,6 +596,57 @@ void mmput(struct mm_struct *mm)
}
EXPORT_SYMBOL_GPL(mmput);
+/*
+ * We added or removed a vma mapping the executable. The vmas are only mapped
+ * during exec and are not mapped with the mmap system call.
+ * Callers must hold down_write() on the mm's mmap_sem for these
+ */
+void added_exe_file_vma(struct mm_struct *mm)
+{
+ mm->num_exe_file_vmas++;
+}
+
+void removed_exe_file_vma(struct mm_struct *mm)
+{
+ mm->num_exe_file_vmas--;
+ if ((mm->num_exe_file_vmas == 0) && mm->exe_file){
+ fput(mm->exe_file);
+ mm->exe_file = NULL;
+ }
+
+}
+
+void set_mm_exe_file(struct mm_struct *mm, struct file *new_exe_file)
+{
+ if (new_exe_file)
+ get_file(new_exe_file);
+ if (mm->exe_file)
+ fput(mm->exe_file);
+ mm->exe_file = new_exe_file;
+ mm->num_exe_file_vmas = 0;
+}
+
+struct file *get_mm_exe_file(struct mm_struct *mm)
+{
+ struct file *exe_file;
+
+ /* We need mmap_sem to protect against races with removal of
+ * VM_EXECUTABLE vmas */
+ down_read(&mm->mmap_sem);
+ exe_file = mm->exe_file;
+ if (exe_file)
+ get_file(exe_file);
+ up_read(&mm->mmap_sem);
+ return exe_file;
+}
+
+static void dup_mm_exe_file(struct mm_struct *oldmm, struct mm_struct *newmm)
+{
+ /* It's safe to write the exe_file pointer without exe_file_lock because
+ * this is called during fork when the task is not yet in /proc */
+ newmm->exe_file = get_mm_exe_file(oldmm);
+}
+
/**
* get_task_mm - acquire a reference to the task's mm
*
@@ -684,6 +765,9 @@ struct mm_struct *dup_mm(struct task_struct *tsk)
if (!mm_init(mm, tsk))
goto fail_nomem;
+ if (mm_init_cpumask(mm, oldmm))
+ goto fail_nocpumask;
+
if (init_new_context(tsk, mm))
goto fail_nocontext;
@@ -710,6 +794,9 @@ fail_nomem:
return NULL;
fail_nocontext:
+ free_cpumask_var(mm->cpu_vm_mask_var);
+
+fail_nocpumask:
/*
* If init_new_context() failed, we cannot use mmput() to free the mm
* because it calls destroy_context()
@@ -920,6 +1007,10 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk)
tty_audit_fork(sig);
sched_autogroup_fork(sig);
+#ifdef CONFIG_CGROUPS
+ init_rwsem(&sig->threadgroup_fork_lock);
+#endif
+
sig->oom_adj = current->signal->oom_adj;
sig->oom_score_adj = current->signal->oom_score_adj;
sig->oom_score_adj_min = current->signal->oom_score_adj_min;
@@ -1096,12 +1187,13 @@ static struct task_struct *copy_process(unsigned long clone_flags,
posix_cpu_timers_init(p);
- p->lock_depth = -1; /* -1 = no lock */
do_posix_clock_monotonic_gettime(&p->start_time);
p->real_start_time = p->start_time;
monotonic_to_bootbased(&p->real_start_time);
p->io_context = NULL;
p->audit_context = NULL;
+ if (clone_flags & CLONE_THREAD)
+ threadgroup_fork_read_lock(current);
cgroup_fork(p);
#ifdef CONFIG_NUMA
p->mempolicy = mpol_dup(p->mempolicy);
@@ -1146,7 +1238,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
#endif
/* Perform scheduler related setup. Assign this task to a CPU. */
- sched_fork(p, clone_flags);
+ sched_fork(p);
retval = perf_event_init_task(p);
if (retval)
@@ -1180,12 +1272,6 @@ static struct task_struct *copy_process(unsigned long clone_flags,
pid = alloc_pid(p->nsproxy->pid_ns);
if (!pid)
goto bad_fork_cleanup_io;
-
- if (clone_flags & CLONE_NEWPID) {
- retval = pid_ns_prepare_proc(p->nsproxy->pid_ns);
- if (retval < 0)
- goto bad_fork_free_pid;
- }
}
p->pid = pid_nr(pid);
@@ -1193,17 +1279,14 @@ static struct task_struct *copy_process(unsigned long clone_flags,
if (clone_flags & CLONE_THREAD)
p->tgid = current->tgid;
- if (current->nsproxy != p->nsproxy) {
- retval = ns_cgroup_clone(p, pid);
- if (retval)
- goto bad_fork_free_pid;
- }
-
p->set_child_tid = (clone_flags & CLONE_CHILD_SETTID) ? child_tidptr : NULL;
/*
* Clear TID on mm_release()?
*/
p->clear_child_tid = (clone_flags & CLONE_CHILD_CLEARTID) ? child_tidptr: NULL;
+#ifdef CONFIG_BLOCK
+ p->plug = NULL;
+#endif
#ifdef CONFIG_FUTEX
p->robust_list = NULL;
#ifdef CONFIG_COMPAT
@@ -1289,7 +1372,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
tracehook_finish_clone(p, clone_flags, trace);
if (thread_group_leader(p)) {
- if (clone_flags & CLONE_NEWPID)
+ if (is_child_reaper(pid))
p->nsproxy->pid_ns->child_reaper = p;
p->signal->leader_pid = pid;
@@ -1309,6 +1392,8 @@ static struct task_struct *copy_process(unsigned long clone_flags,
write_unlock_irq(&tasklist_lock);
proc_fork_connector(p);
cgroup_post_fork(p);
+ if (clone_flags & CLONE_THREAD)
+ threadgroup_fork_read_unlock(current);
perf_event_fork(p);
return p;
@@ -1347,6 +1432,8 @@ bad_fork_cleanup_policy:
mpol_put(p->mempolicy);
bad_fork_cleanup_cgroup:
#endif
+ if (clone_flags & CLONE_THREAD)
+ threadgroup_fork_read_unlock(current);
cgroup_exit(p, cgroup_callbacks_done);
delayacct_tsk_free(p);
module_put(task_thread_info(p)->exec_domain->module);
@@ -1460,7 +1547,7 @@ long do_fork(unsigned long clone_flags,
*/
p->flags &= ~PF_STARTING;
- wake_up_new_task(p, clone_flags);
+ wake_up_new_task(p);
tracehook_report_clone_complete(trace, regs,
clone_flags, nr, p);
@@ -1512,38 +1599,24 @@ void __init proc_caches_init(void)
}
/*
- * Check constraints on flags passed to the unshare system call and
- * force unsharing of additional process context as appropriate.
+ * Check constraints on flags passed to the unshare system call.
*/
-static void check_unshare_flags(unsigned long *flags_ptr)
+static int check_unshare_flags(unsigned long unshare_flags)
{
+ if (unshare_flags & ~(CLONE_THREAD|CLONE_FS|CLONE_NEWNS|CLONE_SIGHAND|
+ CLONE_VM|CLONE_FILES|CLONE_SYSVSEM|
+ CLONE_NEWUTS|CLONE_NEWIPC|CLONE_NEWNET))
+ return -EINVAL;
/*
- * If unsharing a thread from a thread group, must also
- * unshare vm.
- */
- if (*flags_ptr & CLONE_THREAD)
- *flags_ptr |= CLONE_VM;
-
- /*
- * If unsharing vm, must also unshare signal handlers.
- */
- if (*flags_ptr & CLONE_VM)
- *flags_ptr |= CLONE_SIGHAND;
-
- /*
- * If unsharing namespace, must also unshare filesystem information.
+ * Not implemented, but pretend it works if there is nothing to
+ * unshare. Note that unsharing CLONE_THREAD or CLONE_SIGHAND
+ * needs to unshare vm.
*/
- if (*flags_ptr & CLONE_NEWNS)
- *flags_ptr |= CLONE_FS;
-}
-
-/*
- * Unsharing of tasks created with CLONE_THREAD is not supported yet
- */
-static int unshare_thread(unsigned long unshare_flags)
-{
- if (unshare_flags & CLONE_THREAD)
- return -EINVAL;
+ if (unshare_flags & (CLONE_THREAD | CLONE_SIGHAND | CLONE_VM)) {
+ /* FIXME: get_task_mm() increments ->mm_users */
+ if (atomic_read(&current->mm->mm_users) > 1)
+ return -EINVAL;
+ }
return 0;
}
@@ -1570,34 +1643,6 @@ static int unshare_fs(unsigned long unshare_flags, struct fs_struct **new_fsp)
}
/*
- * Unsharing of sighand is not supported yet
- */
-static int unshare_sighand(unsigned long unshare_flags, struct sighand_struct **new_sighp)
-{
- struct sighand_struct *sigh = current->sighand;
-
- if ((unshare_flags & CLONE_SIGHAND) && atomic_read(&sigh->count) > 1)
- return -EINVAL;
- else
- return 0;
-}
-
-/*
- * Unshare vm if it is being shared
- */
-static int unshare_vm(unsigned long unshare_flags, struct mm_struct **new_mmp)
-{
- struct mm_struct *mm = current->mm;
-
- if ((unshare_flags & CLONE_VM) &&
- (mm && atomic_read(&mm->mm_users) > 1)) {
- return -EINVAL;
- }
-
- return 0;
-}
-
-/*
* Unshare file descriptor table if it is being shared
*/
static int unshare_fd(unsigned long unshare_flags, struct files_struct **new_fdp)
@@ -1625,45 +1670,37 @@ static int unshare_fd(unsigned long unshare_flags, struct files_struct **new_fdp
*/
SYSCALL_DEFINE1(unshare, unsigned long, unshare_flags)
{
- int err = 0;
struct fs_struct *fs, *new_fs = NULL;
- struct sighand_struct *new_sigh = NULL;
- struct mm_struct *mm, *new_mm = NULL, *active_mm = NULL;
struct files_struct *fd, *new_fd = NULL;
struct nsproxy *new_nsproxy = NULL;
int do_sysvsem = 0;
+ int err;
- check_unshare_flags(&unshare_flags);
-
- /* Return -EINVAL for all unsupported flags */
- err = -EINVAL;
- if (unshare_flags & ~(CLONE_THREAD|CLONE_FS|CLONE_NEWNS|CLONE_SIGHAND|
- CLONE_VM|CLONE_FILES|CLONE_SYSVSEM|
- CLONE_NEWUTS|CLONE_NEWIPC|CLONE_NEWNET))
+ err = check_unshare_flags(unshare_flags);
+ if (err)
goto bad_unshare_out;
/*
+ * If unsharing namespace, must also unshare filesystem information.
+ */
+ if (unshare_flags & CLONE_NEWNS)
+ unshare_flags |= CLONE_FS;
+ /*
* CLONE_NEWIPC must also detach from the undolist: after switching
* to a new ipc namespace, the semaphore arrays from the old
* namespace are unreachable.
*/
if (unshare_flags & (CLONE_NEWIPC|CLONE_SYSVSEM))
do_sysvsem = 1;
- if ((err = unshare_thread(unshare_flags)))
- goto bad_unshare_out;
if ((err = unshare_fs(unshare_flags, &new_fs)))
- goto bad_unshare_cleanup_thread;
- if ((err = unshare_sighand(unshare_flags, &new_sigh)))
- goto bad_unshare_cleanup_fs;
- if ((err = unshare_vm(unshare_flags, &new_mm)))
- goto bad_unshare_cleanup_sigh;
+ goto bad_unshare_out;
if ((err = unshare_fd(unshare_flags, &new_fd)))
- goto bad_unshare_cleanup_vm;
+ goto bad_unshare_cleanup_fs;
if ((err = unshare_nsproxy_namespaces(unshare_flags, &new_nsproxy,
new_fs)))
goto bad_unshare_cleanup_fd;
- if (new_fs || new_mm || new_fd || do_sysvsem || new_nsproxy) {
+ if (new_fs || new_fd || do_sysvsem || new_nsproxy) {
if (do_sysvsem) {
/*
* CLONE_SYSVSEM is equivalent to sys_exit().
@@ -1689,19 +1726,6 @@ SYSCALL_DEFINE1(unshare, unsigned long, unshare_flags)
spin_unlock(&fs->lock);
}
- if (new_mm) {
- mm = current->mm;
- active_mm = current->active_mm;
- current->mm = new_mm;
- current->active_mm = new_mm;
- if (current->signal->oom_score_adj == OOM_SCORE_ADJ_MIN) {
- atomic_dec(&mm->oom_disable_count);
- atomic_inc(&new_mm->oom_disable_count);
- }
- activate_mm(active_mm, new_mm);
- new_mm = mm;
- }
-
if (new_fd) {
fd = current->files;
current->files = new_fd;
@@ -1718,20 +1742,10 @@ bad_unshare_cleanup_fd:
if (new_fd)
put_files_struct(new_fd);
-bad_unshare_cleanup_vm:
- if (new_mm)
- mmput(new_mm);
-
-bad_unshare_cleanup_sigh:
- if (new_sigh)
- if (atomic_dec_and_test(&new_sigh->count))
- kmem_cache_free(sighand_cachep, new_sigh);
-
bad_unshare_cleanup_fs:
if (new_fs)
free_fs_struct(new_fs);
-bad_unshare_cleanup_thread:
bad_unshare_out:
return err;
}
diff --git a/kernel/freezer.c b/kernel/freezer.c
index 66ecd2ead215..7b01de98bb6a 100644
--- a/kernel/freezer.c
+++ b/kernel/freezer.c
@@ -17,7 +17,7 @@ static inline void frozen_process(void)
{
if (!unlikely(current->flags & PF_NOFREEZE)) {
current->flags |= PF_FROZEN;
- wmb();
+ smp_wmb();
}
clear_freeze_flag(current);
}
@@ -93,7 +93,7 @@ bool freeze_task(struct task_struct *p, bool sig_only)
* the task as frozen and next clears its TIF_FREEZE.
*/
if (!freezing(p)) {
- rmb();
+ smp_rmb();
if (frozen(p))
return false;
diff --git a/kernel/futex.c b/kernel/futex.c
index b766d28accd6..fe28dc282eae 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -381,15 +381,16 @@ static struct futex_q *futex_top_waiter(struct futex_hash_bucket *hb,
return NULL;
}
-static u32 cmpxchg_futex_value_locked(u32 __user *uaddr, u32 uval, u32 newval)
+static int cmpxchg_futex_value_locked(u32 *curval, u32 __user *uaddr,
+ u32 uval, u32 newval)
{
- u32 curval;
+ int ret;
pagefault_disable();
- curval = futex_atomic_cmpxchg_inatomic(uaddr, uval, newval);
+ ret = futex_atomic_cmpxchg_inatomic(curval, uaddr, uval, newval);
pagefault_enable();
- return curval;
+ return ret;
}
static int get_futex_value_locked(u32 *dest, u32 __user *from)
@@ -674,7 +675,7 @@ static int futex_lock_pi_atomic(u32 __user *uaddr, struct futex_hash_bucket *hb,
struct task_struct *task, int set_waiters)
{
int lock_taken, ret, ownerdied = 0;
- u32 uval, newval, curval;
+ u32 uval, newval, curval, vpid = task_pid_vnr(task);
retry:
ret = lock_taken = 0;
@@ -684,19 +685,17 @@ retry:
* (by doing a 0 -> TID atomic cmpxchg), while holding all
* the locks. It will most likely not succeed.
*/
- newval = task_pid_vnr(task);
+ newval = vpid;
if (set_waiters)
newval |= FUTEX_WAITERS;
- curval = cmpxchg_futex_value_locked(uaddr, 0, newval);
-
- if (unlikely(curval == -EFAULT))
+ if (unlikely(cmpxchg_futex_value_locked(&curval, uaddr, 0, newval)))
return -EFAULT;
/*
* Detect deadlocks.
*/
- if ((unlikely((curval & FUTEX_TID_MASK) == task_pid_vnr(task))))
+ if ((unlikely((curval & FUTEX_TID_MASK) == vpid)))
return -EDEADLK;
/*
@@ -723,14 +722,12 @@ retry:
*/
if (unlikely(ownerdied || !(curval & FUTEX_TID_MASK))) {
/* Keep the OWNER_DIED bit */
- newval = (curval & ~FUTEX_TID_MASK) | task_pid_vnr(task);
+ newval = (curval & ~FUTEX_TID_MASK) | vpid;
ownerdied = 0;
lock_taken = 1;
}
- curval = cmpxchg_futex_value_locked(uaddr, uval, newval);
-
- if (unlikely(curval == -EFAULT))
+ if (unlikely(cmpxchg_futex_value_locked(&curval, uaddr, uval, newval)))
return -EFAULT;
if (unlikely(curval != uval))
goto retry;
@@ -775,6 +772,24 @@ retry:
return ret;
}
+/**
+ * __unqueue_futex() - Remove the futex_q from its futex_hash_bucket
+ * @q: The futex_q to unqueue
+ *
+ * The q->lock_ptr must not be NULL and must be held by the caller.
+ */
+static void __unqueue_futex(struct futex_q *q)
+{
+ struct futex_hash_bucket *hb;
+
+ if (WARN_ON_SMP(!q->lock_ptr || !spin_is_locked(q->lock_ptr))
+ || WARN_ON(plist_node_empty(&q->list)))
+ return;
+
+ hb = container_of(q->lock_ptr, struct futex_hash_bucket, lock);
+ plist_del(&q->list, &hb->chain);
+}
+
/*
* The hash bucket lock must be held when this is called.
* Afterwards, the futex_q must not be accessed.
@@ -792,7 +807,7 @@ static void wake_futex(struct futex_q *q)
*/
get_task_struct(p);
- plist_del(&q->list, &q->list.plist);
+ __unqueue_futex(q);
/*
* The waiting task can free the futex_q as soon as
* q->lock_ptr = NULL is written, without taking any locks. A
@@ -843,9 +858,7 @@ static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this)
newval = FUTEX_WAITERS | task_pid_vnr(new_owner);
- curval = cmpxchg_futex_value_locked(uaddr, uval, newval);
-
- if (curval == -EFAULT)
+ if (cmpxchg_futex_value_locked(&curval, uaddr, uval, newval))
ret = -EFAULT;
else if (curval != uval)
ret = -EINVAL;
@@ -880,10 +893,8 @@ static int unlock_futex_pi(u32 __user *uaddr, u32 uval)
* There is no waiter, so we unlock the futex. The owner died
* bit has not to be preserved here. We are the owner:
*/
- oldval = cmpxchg_futex_value_locked(uaddr, uval, 0);
-
- if (oldval == -EFAULT)
- return oldval;
+ if (cmpxchg_futex_value_locked(&oldval, uaddr, uval, 0))
+ return -EFAULT;
if (oldval != uval)
return -EAGAIN;
@@ -1071,9 +1082,6 @@ void requeue_futex(struct futex_q *q, struct futex_hash_bucket *hb1,
plist_del(&q->list, &hb1->chain);
plist_add(&q->list, &hb2->chain);
q->lock_ptr = &hb2->lock;
-#ifdef CONFIG_DEBUG_PI_LIST
- q->list.plist.spinlock = &hb2->lock;
-#endif
}
get_futex_key_refs(key2);
q->key = *key2;
@@ -1100,16 +1108,12 @@ void requeue_pi_wake_futex(struct futex_q *q, union futex_key *key,
get_futex_key_refs(key);
q->key = *key;
- WARN_ON(plist_node_empty(&q->list));
- plist_del(&q->list, &q->list.plist);
+ __unqueue_futex(q);
WARN_ON(!q->rt_waiter);
q->rt_waiter = NULL;
q->lock_ptr = &hb->lock;
-#ifdef CONFIG_DEBUG_PI_LIST
- q->list.plist.spinlock = &hb->lock;
-#endif
wake_up_state(q->task, TASK_NORMAL);
}
@@ -1457,9 +1461,6 @@ static inline void queue_me(struct futex_q *q, struct futex_hash_bucket *hb)
prio = min(current->normal_prio, MAX_RT_PRIO);
plist_node_init(&q->list, prio);
-#ifdef CONFIG_DEBUG_PI_LIST
- q->list.plist.spinlock = &hb->lock;
-#endif
plist_add(&q->list, &hb->chain);
q->task = current;
spin_unlock(&hb->lock);
@@ -1504,8 +1505,7 @@ retry:
spin_unlock(lock_ptr);
goto retry;
}
- WARN_ON(plist_node_empty(&q->list));
- plist_del(&q->list, &q->list.plist);
+ __unqueue_futex(q);
BUG_ON(q->pi_state);
@@ -1525,8 +1525,7 @@ retry:
static void unqueue_me_pi(struct futex_q *q)
__releases(q->lock_ptr)
{
- WARN_ON(plist_node_empty(&q->list));
- plist_del(&q->list, &q->list.plist);
+ __unqueue_futex(q);
BUG_ON(!q->pi_state);
free_pi_state(q->pi_state);
@@ -1556,10 +1555,10 @@ static int fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q,
/*
* We are here either because we stole the rtmutex from the
- * pending owner or we are the pending owner which failed to
- * get the rtmutex. We have to replace the pending owner TID
- * in the user space variable. This must be atomic as we have
- * to preserve the owner died bit here.
+ * previous highest priority waiter or we are the highest priority
+ * waiter but failed to get the rtmutex the first time.
+ * We have to replace the newowner TID in the user space variable.
+ * This must be atomic as we have to preserve the owner died bit here.
*
* Note: We write the user space value _before_ changing the pi_state
* because we can fault here. Imagine swapped out pages or a fork
@@ -1578,9 +1577,7 @@ retry:
while (1) {
newval = (uval & FUTEX_OWNER_DIED) | newtid;
- curval = cmpxchg_futex_value_locked(uaddr, uval, newval);
-
- if (curval == -EFAULT)
+ if (cmpxchg_futex_value_locked(&curval, uaddr, uval, newval))
goto handle_fault;
if (curval == uval)
break;
@@ -1608,8 +1605,8 @@ retry:
/*
* To handle the page fault we need to drop the hash bucket
- * lock here. That gives the other task (either the pending
- * owner itself or the task which stole the rtmutex) the
+ * lock here. That gives the other task (either the highest priority
+ * waiter itself or the task which stole the rtmutex) the
* chance to try the fixup of the pi_state. So once we are
* back from handling the fault we need to check the pi_state
* after reacquiring the hash bucket lock and before trying to
@@ -1685,18 +1682,20 @@ static int fixup_owner(u32 __user *uaddr, struct futex_q *q, int locked)
/*
* pi_state is incorrect, some other task did a lock steal and
* we returned due to timeout or signal without taking the
- * rt_mutex. Too late. We can access the rt_mutex_owner without
- * locking, as the other task is now blocked on the hash bucket
- * lock. Fix the state up.
+ * rt_mutex. Too late.
*/
+ raw_spin_lock(&q->pi_state->pi_mutex.wait_lock);
owner = rt_mutex_owner(&q->pi_state->pi_mutex);
+ if (!owner)
+ owner = rt_mutex_next_owner(&q->pi_state->pi_mutex);
+ raw_spin_unlock(&q->pi_state->pi_mutex.wait_lock);
ret = fixup_pi_state_owner(uaddr, q, owner);
goto out;
}
/*
* Paranoia check. If we did not take the lock, then we should not be
- * the owner, nor the pending owner, of the rt_mutex.
+ * the owner of the rt_mutex.
*/
if (rt_mutex_owner(&q->pi_state->pi_mutex) == current)
printk(KERN_ERR "fixup_owner: ret = %d pi-mutex: %p "
@@ -1781,13 +1780,14 @@ static int futex_wait_setup(u32 __user *uaddr, u32 val, unsigned int flags,
*
* The basic logical guarantee of a futex is that it blocks ONLY
* if cond(var) is known to be true at the time of blocking, for
- * any cond. If we queued after testing *uaddr, that would open
- * a race condition where we could block indefinitely with
+ * any cond. If we locked the hash-bucket after testing *uaddr, that
+ * would open a race condition where we could block indefinitely with
* cond(var) false, which would violate the guarantee.
*
- * A consequence is that futex_wait() can return zero and absorb
- * a wakeup when *uaddr != val on entry to the syscall. This is
- * rare, but normal.
+ * On the other hand, we insert q and release the hash-bucket only
+ * after testing *uaddr. This guarantees that futex_wait() will NOT
+ * absorb a wakeup if *uaddr does not match the desired values
+ * while the syscall executes.
*/
retry:
ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &q->key);
@@ -1886,7 +1886,7 @@ retry:
restart->futex.val = val;
restart->futex.time = abs_time->tv64;
restart->futex.bitset = bitset;
- restart->futex.flags = flags;
+ restart->futex.flags = flags | FLAGS_HAS_TIMEOUT;
ret = -ERESTART_RESTARTBLOCK;
@@ -2046,9 +2046,9 @@ static int futex_unlock_pi(u32 __user *uaddr, unsigned int flags)
{
struct futex_hash_bucket *hb;
struct futex_q *this, *next;
- u32 uval;
struct plist_head *head;
union futex_key key = FUTEX_KEY_INIT;
+ u32 uval, vpid = task_pid_vnr(current);
int ret;
retry:
@@ -2057,7 +2057,7 @@ retry:
/*
* We release only a lock we actually own:
*/
- if ((uval & FUTEX_TID_MASK) != task_pid_vnr(current))
+ if ((uval & FUTEX_TID_MASK) != vpid)
return -EPERM;
ret = get_futex_key(uaddr, flags & FLAGS_SHARED, &key);
@@ -2072,17 +2072,14 @@ retry:
* again. If it succeeds then we can return without waking
* anyone else up:
*/
- if (!(uval & FUTEX_OWNER_DIED))
- uval = cmpxchg_futex_value_locked(uaddr, task_pid_vnr(current), 0);
-
-
- if (unlikely(uval == -EFAULT))
+ if (!(uval & FUTEX_OWNER_DIED) &&
+ cmpxchg_futex_value_locked(&uval, uaddr, vpid, 0))
goto pi_faulted;
/*
* Rare case: we managed to release the lock atomically,
* no need to wake anyone else up:
*/
- if (unlikely(uval == task_pid_vnr(current)))
+ if (unlikely(uval == vpid))
goto out_unlock;
/*
@@ -2167,7 +2164,7 @@ int handle_early_requeue_pi_wakeup(struct futex_hash_bucket *hb,
* We were woken prior to requeue by a timeout or a signal.
* Unqueue the futex_q and determine which it was.
*/
- plist_del(&q->list, &q->list.plist);
+ plist_del(&q->list, &hb->chain);
/* Handle spurious wakeups gracefully */
ret = -EWOULDBLOCK;
@@ -2421,10 +2418,19 @@ SYSCALL_DEFINE3(get_robust_list, int, pid,
goto err_unlock;
ret = -EPERM;
pcred = __task_cred(p);
+ /* If victim is in different user_ns, then uids are not
+ comparable, so we must have CAP_SYS_PTRACE */
+ if (cred->user->user_ns != pcred->user->user_ns) {
+ if (!ns_capable(pcred->user->user_ns, CAP_SYS_PTRACE))
+ goto err_unlock;
+ goto ok;
+ }
+ /* If victim is in same user_ns, then uids are comparable */
if (cred->euid != pcred->euid &&
cred->euid != pcred->uid &&
- !capable(CAP_SYS_PTRACE))
+ !ns_capable(pcred->user->user_ns, CAP_SYS_PTRACE))
goto err_unlock;
+ok:
head = p->robust_list;
rcu_read_unlock();
}
@@ -2463,11 +2469,20 @@ retry:
* userspace.
*/
mval = (uval & FUTEX_WAITERS) | FUTEX_OWNER_DIED;
- nval = futex_atomic_cmpxchg_inatomic(uaddr, uval, mval);
-
- if (nval == -EFAULT)
- return -1;
-
+ /*
+ * We are not holding a lock here, but we want to have
+ * the pagefault_disable/enable() protection because
+ * we want to handle the fault gracefully. If the
+ * access fails we try to fault in the futex with R/W
+ * verification via get_user_pages. get_user() above
+ * does not guarantee R/W access. If that fails we
+ * give up and leave the futex locked.
+ */
+ if (cmpxchg_futex_value_locked(&nval, uaddr, uval, mval)) {
+ if (fault_in_user_writeable(uaddr))
+ return -1;
+ goto retry;
+ }
if (nval != uval)
goto retry;
@@ -2678,8 +2693,7 @@ static int __init futex_init(void)
* implementation, the non-functional ones will return
* -ENOSYS.
*/
- curval = cmpxchg_futex_value_locked(NULL, 0, 0);
- if (curval == -EFAULT)
+ if (cmpxchg_futex_value_locked(&curval, NULL, 0, 0) == -EFAULT)
futex_cmpxchg_enabled = 1;
for (i = 0; i < ARRAY_SIZE(futex_queues); i++) {
diff --git a/kernel/futex_compat.c b/kernel/futex_compat.c
index a7934ac75e5b..5f9e689dc8f0 100644
--- a/kernel/futex_compat.c
+++ b/kernel/futex_compat.c
@@ -153,10 +153,19 @@ compat_sys_get_robust_list(int pid, compat_uptr_t __user *head_ptr,
goto err_unlock;
ret = -EPERM;
pcred = __task_cred(p);
+ /* If victim is in different user_ns, then uids are not
+ comparable, so we must have CAP_SYS_PTRACE */
+ if (cred->user->user_ns != pcred->user->user_ns) {
+ if (!ns_capable(pcred->user->user_ns, CAP_SYS_PTRACE))
+ goto err_unlock;
+ goto ok;
+ }
+ /* If victim is in same user_ns, then uids are comparable */
if (cred->euid != pcred->euid &&
cred->euid != pcred->uid &&
- !capable(CAP_SYS_PTRACE))
+ !ns_capable(pcred->user->user_ns, CAP_SYS_PTRACE))
goto err_unlock;
+ok:
head = p->compat_robust_list;
rcu_read_unlock();
}
diff --git a/kernel/gcov/Kconfig b/kernel/gcov/Kconfig
index 70a298d6da71..b8cadf70b1fb 100644
--- a/kernel/gcov/Kconfig
+++ b/kernel/gcov/Kconfig
@@ -34,7 +34,7 @@ config GCOV_KERNEL
config GCOV_PROFILE_ALL
bool "Profile entire Kernel"
depends on GCOV_KERNEL
- depends on S390 || X86 || (PPC && EXPERIMENTAL) || MICROBLAZE
+ depends on SUPERH || S390 || X86 || (PPC && EXPERIMENTAL) || MICROBLAZE
default n
---help---
This options activates profiling for the entire kernel.
diff --git a/kernel/gcov/Makefile b/kernel/gcov/Makefile
index 3f761001d517..e97ca59e2520 100644
--- a/kernel/gcov/Makefile
+++ b/kernel/gcov/Makefile
@@ -1,3 +1,3 @@
-EXTRA_CFLAGS := -DSRCTREE='"$(srctree)"' -DOBJTREE='"$(objtree)"'
+ccflags-y := -DSRCTREE='"$(srctree)"' -DOBJTREE='"$(objtree)"'
obj-$(CONFIG_GCOV_KERNEL) := base.o fs.o gcc_3_4.o
diff --git a/kernel/groups.c b/kernel/groups.c
index 253dc0f35cf4..1cc476d52dd3 100644
--- a/kernel/groups.c
+++ b/kernel/groups.c
@@ -233,7 +233,7 @@ SYSCALL_DEFINE2(setgroups, int, gidsetsize, gid_t __user *, grouplist)
struct group_info *group_info;
int retval;
- if (!capable(CAP_SETGID))
+ if (!nsown_capable(CAP_SETGID))
return -EPERM;
if ((unsigned)gidsetsize > NGROUPS_MAX)
return -EINVAL;
diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c
index 0c8d7c048615..a9205e32a059 100644
--- a/kernel/hrtimer.c
+++ b/kernel/hrtimer.c
@@ -53,11 +53,10 @@
/*
* The timer bases:
*
- * Note: If we want to add new timer bases, we have to skip the two
- * clock ids captured by the cpu-timers. We do this by holding empty
- * entries rather than doing math adjustment of the clock ids.
- * This ensures that we capture erroneous accesses to these clock ids
- * rather than moving them into the range of valid clock id's.
+ * There are more clockids then hrtimer bases. Thus, we index
+ * into the timer bases by the hrtimer_base_type enum. When trying
+ * to reach a base using a clockid, hrtimer_clockid_to_base()
+ * is used to convert from clockid to the proper hrtimer_base_type.
*/
DEFINE_PER_CPU(struct hrtimer_cpu_base, hrtimer_bases) =
{
@@ -65,39 +64,55 @@ DEFINE_PER_CPU(struct hrtimer_cpu_base, hrtimer_bases) =
.clock_base =
{
{
- .index = CLOCK_REALTIME,
+ .index = HRTIMER_BASE_MONOTONIC,
+ .clockid = CLOCK_MONOTONIC,
+ .get_time = &ktime_get,
+ .resolution = KTIME_LOW_RES,
+ },
+ {
+ .index = HRTIMER_BASE_REALTIME,
+ .clockid = CLOCK_REALTIME,
.get_time = &ktime_get_real,
.resolution = KTIME_LOW_RES,
},
{
- .index = CLOCK_MONOTONIC,
- .get_time = &ktime_get,
+ .index = HRTIMER_BASE_BOOTTIME,
+ .clockid = CLOCK_BOOTTIME,
+ .get_time = &ktime_get_boottime,
.resolution = KTIME_LOW_RES,
},
}
};
+static const int hrtimer_clock_to_base_table[MAX_CLOCKS] = {
+ [CLOCK_REALTIME] = HRTIMER_BASE_REALTIME,
+ [CLOCK_MONOTONIC] = HRTIMER_BASE_MONOTONIC,
+ [CLOCK_BOOTTIME] = HRTIMER_BASE_BOOTTIME,
+};
+
+static inline int hrtimer_clockid_to_base(clockid_t clock_id)
+{
+ return hrtimer_clock_to_base_table[clock_id];
+}
+
+
/*
* Get the coarse grained time at the softirq based on xtime and
* wall_to_monotonic.
*/
static void hrtimer_get_softirq_time(struct hrtimer_cpu_base *base)
{
- ktime_t xtim, tomono;
- struct timespec xts, tom;
- unsigned long seq;
+ ktime_t xtim, mono, boot;
+ struct timespec xts, tom, slp;
- do {
- seq = read_seqbegin(&xtime_lock);
- xts = __current_kernel_time();
- tom = __get_wall_to_monotonic();
- } while (read_seqretry(&xtime_lock, seq));
+ get_xtime_and_monotonic_and_sleep_offset(&xts, &tom, &slp);
xtim = timespec_to_ktime(xts);
- tomono = timespec_to_ktime(tom);
- base->clock_base[CLOCK_REALTIME].softirq_time = xtim;
- base->clock_base[CLOCK_MONOTONIC].softirq_time =
- ktime_add(xtim, tomono);
+ mono = ktime_add(xtim, timespec_to_ktime(tom));
+ boot = ktime_add(mono, timespec_to_ktime(slp));
+ base->clock_base[HRTIMER_BASE_REALTIME].softirq_time = xtim;
+ base->clock_base[HRTIMER_BASE_MONOTONIC].softirq_time = mono;
+ base->clock_base[HRTIMER_BASE_BOOTTIME].softirq_time = boot;
}
/*
@@ -184,10 +199,11 @@ switch_hrtimer_base(struct hrtimer *timer, struct hrtimer_clock_base *base,
struct hrtimer_cpu_base *new_cpu_base;
int this_cpu = smp_processor_id();
int cpu = hrtimer_get_target(this_cpu, pinned);
+ int basenum = base->index;
again:
new_cpu_base = &per_cpu(hrtimer_bases, cpu);
- new_base = &new_cpu_base->clock_base[base->index];
+ new_base = &new_cpu_base->clock_base[basenum];
if (base != new_base) {
/*
@@ -334,6 +350,11 @@ EXPORT_SYMBOL_GPL(ktime_add_safe);
static struct debug_obj_descr hrtimer_debug_descr;
+static void *hrtimer_debug_hint(void *addr)
+{
+ return ((struct hrtimer *) addr)->function;
+}
+
/*
* fixup_init is called when:
* - an active object is initialized
@@ -393,6 +414,7 @@ static int hrtimer_fixup_free(void *addr, enum debug_obj_state state)
static struct debug_obj_descr hrtimer_debug_descr = {
.name = "hrtimer",
+ .debug_hint = hrtimer_debug_hint,
.fixup_init = hrtimer_fixup_init,
.fixup_activate = hrtimer_fixup_activate,
.fixup_free = hrtimer_fixup_free,
@@ -602,67 +624,6 @@ static int hrtimer_reprogram(struct hrtimer *timer,
return res;
}
-
-/*
- * Retrigger next event is called after clock was set
- *
- * Called with interrupts disabled via on_each_cpu()
- */
-static void retrigger_next_event(void *arg)
-{
- struct hrtimer_cpu_base *base;
- struct timespec realtime_offset, wtm;
- unsigned long seq;
-
- if (!hrtimer_hres_active())
- return;
-
- do {
- seq = read_seqbegin(&xtime_lock);
- wtm = __get_wall_to_monotonic();
- } while (read_seqretry(&xtime_lock, seq));
- set_normalized_timespec(&realtime_offset, -wtm.tv_sec, -wtm.tv_nsec);
-
- base = &__get_cpu_var(hrtimer_bases);
-
- /* Adjust CLOCK_REALTIME offset */
- raw_spin_lock(&base->lock);
- base->clock_base[CLOCK_REALTIME].offset =
- timespec_to_ktime(realtime_offset);
-
- hrtimer_force_reprogram(base, 0);
- raw_spin_unlock(&base->lock);
-}
-
-/*
- * Clock realtime was set
- *
- * Change the offset of the realtime clock vs. the monotonic
- * clock.
- *
- * We might have to reprogram the high resolution timer interrupt. On
- * SMP we call the architecture specific code to retrigger _all_ high
- * resolution timer interrupts. On UP we just disable interrupts and
- * call the high resolution interrupt code.
- */
-void clock_was_set(void)
-{
- /* Retrigger the CPU local events everywhere */
- on_each_cpu(retrigger_next_event, NULL, 1);
-}
-
-/*
- * During resume we might have to reprogram the high resolution timer
- * interrupt (on the local CPU):
- */
-void hres_timers_resume(void)
-{
- WARN_ONCE(!irqs_disabled(),
- KERN_INFO "hres_timers_resume() called with IRQs enabled!");
-
- retrigger_next_event(NULL);
-}
-
/*
* Initialize the high resolution related parts of cpu_base
*/
@@ -673,14 +634,6 @@ static inline void hrtimer_init_hres(struct hrtimer_cpu_base *base)
}
/*
- * Initialize the high resolution related parts of a hrtimer
- */
-static inline void hrtimer_init_timer_hres(struct hrtimer *timer)
-{
-}
-
-
-/*
* When High resolution timers are active, try to reprogram. Note, that in case
* the state has HRTIMER_STATE_CALLBACK set, no reprogramming and no expiry
* check happens. The timer gets enqueued into the rbtree. The reprogramming
@@ -705,11 +658,39 @@ static inline int hrtimer_enqueue_reprogram(struct hrtimer *timer,
}
/*
+ * Retrigger next event is called after clock was set
+ *
+ * Called with interrupts disabled via on_each_cpu()
+ */
+static void retrigger_next_event(void *arg)
+{
+ struct hrtimer_cpu_base *base = &__get_cpu_var(hrtimer_bases);
+ struct timespec realtime_offset, xtim, wtm, sleep;
+
+ if (!hrtimer_hres_active())
+ return;
+
+ /* Optimized out for !HIGH_RES */
+ get_xtime_and_monotonic_and_sleep_offset(&xtim, &wtm, &sleep);
+ set_normalized_timespec(&realtime_offset, -wtm.tv_sec, -wtm.tv_nsec);
+
+ /* Adjust CLOCK_REALTIME offset */
+ raw_spin_lock(&base->lock);
+ base->clock_base[HRTIMER_BASE_REALTIME].offset =
+ timespec_to_ktime(realtime_offset);
+ base->clock_base[HRTIMER_BASE_BOOTTIME].offset =
+ timespec_to_ktime(sleep);
+
+ hrtimer_force_reprogram(base, 0);
+ raw_spin_unlock(&base->lock);
+}
+
+/*
* Switch to high resolution mode
*/
static int hrtimer_switch_to_hres(void)
{
- int cpu = smp_processor_id();
+ int i, cpu = smp_processor_id();
struct hrtimer_cpu_base *base = &per_cpu(hrtimer_bases, cpu);
unsigned long flags;
@@ -725,8 +706,8 @@ static int hrtimer_switch_to_hres(void)
return 0;
}
base->hres_active = 1;
- base->clock_base[CLOCK_REALTIME].resolution = KTIME_HIGH_RES;
- base->clock_base[CLOCK_MONOTONIC].resolution = KTIME_HIGH_RES;
+ for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++)
+ base->clock_base[i].resolution = KTIME_HIGH_RES;
tick_setup_sched_timer();
@@ -750,10 +731,43 @@ static inline int hrtimer_enqueue_reprogram(struct hrtimer *timer,
return 0;
}
static inline void hrtimer_init_hres(struct hrtimer_cpu_base *base) { }
-static inline void hrtimer_init_timer_hres(struct hrtimer *timer) { }
+static inline void retrigger_next_event(void *arg) { }
#endif /* CONFIG_HIGH_RES_TIMERS */
+/*
+ * Clock realtime was set
+ *
+ * Change the offset of the realtime clock vs. the monotonic
+ * clock.
+ *
+ * We might have to reprogram the high resolution timer interrupt. On
+ * SMP we call the architecture specific code to retrigger _all_ high
+ * resolution timer interrupts. On UP we just disable interrupts and
+ * call the high resolution interrupt code.
+ */
+void clock_was_set(void)
+{
+#ifdef CONFIG_HIGH_RES_TIMERS
+ /* Retrigger the CPU local events everywhere */
+ on_each_cpu(retrigger_next_event, NULL, 1);
+#endif
+ timerfd_clock_was_set();
+}
+
+/*
+ * During resume we might have to reprogram the high resolution timer
+ * interrupt (on the local CPU):
+ */
+void hrtimers_resume(void)
+{
+ WARN_ONCE(!irqs_disabled(),
+ KERN_INFO "hrtimers_resume() called with IRQs enabled!");
+
+ retrigger_next_event(NULL);
+ timerfd_clock_was_set();
+}
+
static inline void timer_stats_hrtimer_set_start_info(struct hrtimer *timer)
{
#ifdef CONFIG_TIMER_STATS
@@ -846,6 +860,7 @@ static int enqueue_hrtimer(struct hrtimer *timer,
debug_activate(timer);
timerqueue_add(&base->active, &timer->node);
+ base->cpu_base->active_bases |= 1 << base->index;
/*
* HRTIMER_STATE_ENQUEUED is or'ed to the current state to preserve the
@@ -887,6 +902,8 @@ static void __remove_hrtimer(struct hrtimer *timer,
#endif
}
timerqueue_del(&base->active, &timer->node);
+ if (!timerqueue_getnext(&base->active))
+ base->cpu_base->active_bases &= ~(1 << base->index);
out:
timer->state = newstate;
}
@@ -1121,6 +1138,7 @@ static void __hrtimer_init(struct hrtimer *timer, clockid_t clock_id,
enum hrtimer_mode mode)
{
struct hrtimer_cpu_base *cpu_base;
+ int base;
memset(timer, 0, sizeof(struct hrtimer));
@@ -1129,8 +1147,8 @@ static void __hrtimer_init(struct hrtimer *timer, clockid_t clock_id,
if (clock_id == CLOCK_REALTIME && mode != HRTIMER_MODE_ABS)
clock_id = CLOCK_MONOTONIC;
- timer->base = &cpu_base->clock_base[clock_id];
- hrtimer_init_timer_hres(timer);
+ base = hrtimer_clockid_to_base(clock_id);
+ timer->base = &cpu_base->clock_base[base];
timerqueue_init(&timer->node);
#ifdef CONFIG_TIMER_STATS
@@ -1165,9 +1183,10 @@ EXPORT_SYMBOL_GPL(hrtimer_init);
int hrtimer_get_res(const clockid_t which_clock, struct timespec *tp)
{
struct hrtimer_cpu_base *cpu_base;
+ int base = hrtimer_clockid_to_base(which_clock);
cpu_base = &__raw_get_cpu_var(hrtimer_bases);
- *tp = ktime_to_timespec(cpu_base->clock_base[which_clock].resolution);
+ *tp = ktime_to_timespec(cpu_base->clock_base[base].resolution);
return 0;
}
@@ -1222,7 +1241,6 @@ static void __run_hrtimer(struct hrtimer *timer, ktime_t *now)
void hrtimer_interrupt(struct clock_event_device *dev)
{
struct hrtimer_cpu_base *cpu_base = &__get_cpu_var(hrtimer_bases);
- struct hrtimer_clock_base *base;
ktime_t expires_next, now, entry_time, delta;
int i, retries = 0;
@@ -1244,12 +1262,15 @@ retry:
*/
cpu_base->expires_next.tv64 = KTIME_MAX;
- base = cpu_base->clock_base;
-
for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) {
- ktime_t basenow;
+ struct hrtimer_clock_base *base;
struct timerqueue_node *node;
+ ktime_t basenow;
+
+ if (!(cpu_base->active_bases & (1 << i)))
+ continue;
+ base = cpu_base->clock_base + i;
basenow = ktime_add(now, base->offset);
while ((node = timerqueue_getnext(&base->active))) {
@@ -1282,7 +1303,6 @@ retry:
__run_hrtimer(timer, &basenow);
}
- base++;
}
/*
@@ -1513,7 +1533,7 @@ long __sched hrtimer_nanosleep_restart(struct restart_block *restart)
struct timespec __user *rmtp;
int ret = 0;
- hrtimer_init_on_stack(&t.timer, restart->nanosleep.index,
+ hrtimer_init_on_stack(&t.timer, restart->nanosleep.clockid,
HRTIMER_MODE_ABS);
hrtimer_set_expires_tv64(&t.timer, restart->nanosleep.expires);
@@ -1565,7 +1585,7 @@ long hrtimer_nanosleep(struct timespec *rqtp, struct timespec __user *rmtp,
restart = &current_thread_info()->restart_block;
restart->fn = hrtimer_nanosleep_restart;
- restart->nanosleep.index = t.timer.base->index;
+ restart->nanosleep.clockid = t.timer.base->clockid;
restart->nanosleep.rmtp = rmtp;
restart->nanosleep.expires = hrtimer_get_expires_tv64(&t.timer);
diff --git a/kernel/hung_task.c b/kernel/hung_task.c
index 53ead174da2f..ea640120ab86 100644
--- a/kernel/hung_task.c
+++ b/kernel/hung_task.c
@@ -33,7 +33,7 @@ unsigned long __read_mostly sysctl_hung_task_check_count = PID_MAX_LIMIT;
/*
* Zero means infinite timeout - no checking done:
*/
-unsigned long __read_mostly sysctl_hung_task_timeout_secs = 120;
+unsigned long __read_mostly sysctl_hung_task_timeout_secs = CONFIG_DEFAULT_HUNG_TASK_TIMEOUT;
unsigned long __read_mostly sysctl_hung_task_warnings = 10;
diff --git a/kernel/irq/Kconfig b/kernel/irq/Kconfig
index 8e42fec7686d..d1d051b38e0b 100644
--- a/kernel/irq/Kconfig
+++ b/kernel/irq/Kconfig
@@ -1,5 +1,6 @@
+# Select this to activate the generic irq options below
config HAVE_GENERIC_HARDIRQS
- def_bool n
+ bool
if HAVE_GENERIC_HARDIRQS
menu "IRQ subsystem"
@@ -9,28 +10,51 @@ menu "IRQ subsystem"
config GENERIC_HARDIRQS
def_bool y
-# Select this to disable the deprecated stuff
-config GENERIC_HARDIRQS_NO_DEPRECATED
- def_bool n
-
# Options selectable by the architecture code
+
+# Make sparse irq Kconfig switch below available
config HAVE_SPARSE_IRQ
- def_bool n
+ bool
+# Enable the generic irq autoprobe mechanism
config GENERIC_IRQ_PROBE
- def_bool n
+ bool
+
+# Use the generic /proc/interrupts implementation
+config GENERIC_IRQ_SHOW
+ bool
+# Print level/edge extra information
+config GENERIC_IRQ_SHOW_LEVEL
+ bool
+
+# Support for delayed migration from interrupt context
config GENERIC_PENDING_IRQ
- def_bool n
+ bool
+# Alpha specific irq affinity mechanism
config AUTO_IRQ_AFFINITY
- def_bool n
-
-config IRQ_PER_CPU
- def_bool n
+ bool
+# Tasklet based software resend for pending interrupts on enable_irq()
config HARDIRQS_SW_RESEND
- def_bool n
+ bool
+
+# Preflow handler support for fasteoi (sparc64)
+config IRQ_PREFLOW_FASTEOI
+ bool
+
+# Edge style eoi based handler (cell)
+config IRQ_EDGE_EOI_HANDLER
+ bool
+
+# Generic configurable interrupt chip implementation
+config GENERIC_IRQ_CHIP
+ bool
+
+# Support forced irq threading
+config IRQ_FORCED_THREADING
+ bool
config SPARSE_IRQ
bool "Support sparse irq numbering"
diff --git a/kernel/irq/Makefile b/kernel/irq/Makefile
index 54329cd7b3ee..73290056cfb6 100644
--- a/kernel/irq/Makefile
+++ b/kernel/irq/Makefile
@@ -1,5 +1,6 @@
obj-y := irqdesc.o handle.o manage.o spurious.o resend.o chip.o dummychip.o devres.o
+obj-$(CONFIG_GENERIC_IRQ_CHIP) += generic-chip.o
obj-$(CONFIG_GENERIC_IRQ_PROBE) += autoprobe.o
obj-$(CONFIG_PROC_FS) += proc.o
obj-$(CONFIG_GENERIC_PENDING_IRQ) += migration.o
diff --git a/kernel/irq/autoprobe.c b/kernel/irq/autoprobe.c
index 505798f86c36..342d8f44e401 100644
--- a/kernel/irq/autoprobe.c
+++ b/kernel/irq/autoprobe.c
@@ -17,7 +17,7 @@
/*
* Autodetection depends on the fact that any interrupt that
* comes in on to an unassigned handler will get stuck with
- * "IRQ_WAITING" cleared and the interrupt disabled.
+ * "IRQS_WAITING" cleared and the interrupt disabled.
*/
static DEFINE_MUTEX(probing_active);
@@ -32,7 +32,6 @@ unsigned long probe_irq_on(void)
{
struct irq_desc *desc;
unsigned long mask = 0;
- unsigned int status;
int i;
/*
@@ -46,13 +45,7 @@ unsigned long probe_irq_on(void)
*/
for_each_irq_desc_reverse(i, desc) {
raw_spin_lock_irq(&desc->lock);
- if (!desc->action && !(desc->status & IRQ_NOPROBE)) {
- /*
- * An old-style architecture might still have
- * the handle_bad_irq handler there:
- */
- compat_irq_chip_set_default_handler(desc);
-
+ if (!desc->action && irq_settings_can_probe(desc)) {
/*
* Some chips need to know about probing in
* progress:
@@ -60,7 +53,7 @@ unsigned long probe_irq_on(void)
if (desc->irq_data.chip->irq_set_type)
desc->irq_data.chip->irq_set_type(&desc->irq_data,
IRQ_TYPE_PROBE);
- desc->irq_data.chip->irq_startup(&desc->irq_data);
+ irq_startup(desc);
}
raw_spin_unlock_irq(&desc->lock);
}
@@ -75,10 +68,10 @@ unsigned long probe_irq_on(void)
*/
for_each_irq_desc_reverse(i, desc) {
raw_spin_lock_irq(&desc->lock);
- if (!desc->action && !(desc->status & IRQ_NOPROBE)) {
- desc->status |= IRQ_AUTODETECT | IRQ_WAITING;
- if (desc->irq_data.chip->irq_startup(&desc->irq_data))
- desc->status |= IRQ_PENDING;
+ if (!desc->action && irq_settings_can_probe(desc)) {
+ desc->istate |= IRQS_AUTODETECT | IRQS_WAITING;
+ if (irq_startup(desc))
+ desc->istate |= IRQS_PENDING;
}
raw_spin_unlock_irq(&desc->lock);
}
@@ -93,13 +86,12 @@ unsigned long probe_irq_on(void)
*/
for_each_irq_desc(i, desc) {
raw_spin_lock_irq(&desc->lock);
- status = desc->status;
- if (status & IRQ_AUTODETECT) {
+ if (desc->istate & IRQS_AUTODETECT) {
/* It triggered already - consider it spurious. */
- if (!(status & IRQ_WAITING)) {
- desc->status = status & ~IRQ_AUTODETECT;
- desc->irq_data.chip->irq_shutdown(&desc->irq_data);
+ if (!(desc->istate & IRQS_WAITING)) {
+ desc->istate &= ~IRQS_AUTODETECT;
+ irq_shutdown(desc);
} else
if (i < 32)
mask |= 1 << i;
@@ -125,20 +117,18 @@ EXPORT_SYMBOL(probe_irq_on);
*/
unsigned int probe_irq_mask(unsigned long val)
{
- unsigned int status, mask = 0;
+ unsigned int mask = 0;
struct irq_desc *desc;
int i;
for_each_irq_desc(i, desc) {
raw_spin_lock_irq(&desc->lock);
- status = desc->status;
-
- if (status & IRQ_AUTODETECT) {
- if (i < 16 && !(status & IRQ_WAITING))
+ if (desc->istate & IRQS_AUTODETECT) {
+ if (i < 16 && !(desc->istate & IRQS_WAITING))
mask |= 1 << i;
- desc->status = status & ~IRQ_AUTODETECT;
- desc->irq_data.chip->irq_shutdown(&desc->irq_data);
+ desc->istate &= ~IRQS_AUTODETECT;
+ irq_shutdown(desc);
}
raw_spin_unlock_irq(&desc->lock);
}
@@ -169,20 +159,18 @@ int probe_irq_off(unsigned long val)
{
int i, irq_found = 0, nr_of_irqs = 0;
struct irq_desc *desc;
- unsigned int status;
for_each_irq_desc(i, desc) {
raw_spin_lock_irq(&desc->lock);
- status = desc->status;
- if (status & IRQ_AUTODETECT) {
- if (!(status & IRQ_WAITING)) {
+ if (desc->istate & IRQS_AUTODETECT) {
+ if (!(desc->istate & IRQS_WAITING)) {
if (!nr_of_irqs)
irq_found = i;
nr_of_irqs++;
}
- desc->status = status & ~IRQ_AUTODETECT;
- desc->irq_data.chip->irq_shutdown(&desc->irq_data);
+ desc->istate &= ~IRQS_AUTODETECT;
+ irq_shutdown(desc);
}
raw_spin_unlock_irq(&desc->lock);
}
diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c
index baa5c4acad83..d5a3009da71a 100644
--- a/kernel/irq/chip.c
+++ b/kernel/irq/chip.c
@@ -19,140 +19,115 @@
#include "internals.h"
/**
- * set_irq_chip - set the irq chip for an irq
+ * irq_set_chip - set the irq chip for an irq
* @irq: irq number
* @chip: pointer to irq chip description structure
*/
-int set_irq_chip(unsigned int irq, struct irq_chip *chip)
+int irq_set_chip(unsigned int irq, struct irq_chip *chip)
{
- struct irq_desc *desc = irq_to_desc(irq);
unsigned long flags;
+ struct irq_desc *desc = irq_get_desc_lock(irq, &flags);
- if (!desc) {
- WARN(1, KERN_ERR "Trying to install chip for IRQ%d\n", irq);
+ if (!desc)
return -EINVAL;
- }
if (!chip)
chip = &no_irq_chip;
- raw_spin_lock_irqsave(&desc->lock, flags);
- irq_chip_set_defaults(chip);
desc->irq_data.chip = chip;
- raw_spin_unlock_irqrestore(&desc->lock, flags);
-
+ irq_put_desc_unlock(desc, flags);
+ /*
+ * For !CONFIG_SPARSE_IRQ make the irq show up in
+ * allocated_irqs. For the CONFIG_SPARSE_IRQ case, it is
+ * already marked, and this call is harmless.
+ */
+ irq_reserve_irq(irq);
return 0;
}
-EXPORT_SYMBOL(set_irq_chip);
+EXPORT_SYMBOL(irq_set_chip);
/**
- * set_irq_type - set the irq trigger type for an irq
+ * irq_set_type - set the irq trigger type for an irq
* @irq: irq number
* @type: IRQ_TYPE_{LEVEL,EDGE}_* value - see include/linux/irq.h
*/
-int set_irq_type(unsigned int irq, unsigned int type)
+int irq_set_irq_type(unsigned int irq, unsigned int type)
{
- struct irq_desc *desc = irq_to_desc(irq);
unsigned long flags;
- int ret = -ENXIO;
+ struct irq_desc *desc = irq_get_desc_buslock(irq, &flags);
+ int ret = 0;
- if (!desc) {
- printk(KERN_ERR "Trying to set irq type for IRQ%d\n", irq);
- return -ENODEV;
- }
+ if (!desc)
+ return -EINVAL;
type &= IRQ_TYPE_SENSE_MASK;
- if (type == IRQ_TYPE_NONE)
- return 0;
-
- raw_spin_lock_irqsave(&desc->lock, flags);
- ret = __irq_set_trigger(desc, irq, type);
- raw_spin_unlock_irqrestore(&desc->lock, flags);
+ if (type != IRQ_TYPE_NONE)
+ ret = __irq_set_trigger(desc, irq, type);
+ irq_put_desc_busunlock(desc, flags);
return ret;
}
-EXPORT_SYMBOL(set_irq_type);
+EXPORT_SYMBOL(irq_set_irq_type);
/**
- * set_irq_data - set irq type data for an irq
+ * irq_set_handler_data - set irq handler data for an irq
* @irq: Interrupt number
* @data: Pointer to interrupt specific data
*
* Set the hardware irq controller data for an irq
*/
-int set_irq_data(unsigned int irq, void *data)
+int irq_set_handler_data(unsigned int irq, void *data)
{
- struct irq_desc *desc = irq_to_desc(irq);
unsigned long flags;
+ struct irq_desc *desc = irq_get_desc_lock(irq, &flags);
- if (!desc) {
- printk(KERN_ERR
- "Trying to install controller data for IRQ%d\n", irq);
+ if (!desc)
return -EINVAL;
- }
-
- raw_spin_lock_irqsave(&desc->lock, flags);
desc->irq_data.handler_data = data;
- raw_spin_unlock_irqrestore(&desc->lock, flags);
+ irq_put_desc_unlock(desc, flags);
return 0;
}
-EXPORT_SYMBOL(set_irq_data);
+EXPORT_SYMBOL(irq_set_handler_data);
/**
- * set_irq_msi - set MSI descriptor data for an irq
+ * irq_set_msi_desc - set MSI descriptor data for an irq
* @irq: Interrupt number
* @entry: Pointer to MSI descriptor data
*
* Set the MSI descriptor entry for an irq
*/
-int set_irq_msi(unsigned int irq, struct msi_desc *entry)
+int irq_set_msi_desc(unsigned int irq, struct msi_desc *entry)
{
- struct irq_desc *desc = irq_to_desc(irq);
unsigned long flags;
+ struct irq_desc *desc = irq_get_desc_lock(irq, &flags);
- if (!desc) {
- printk(KERN_ERR
- "Trying to install msi data for IRQ%d\n", irq);
+ if (!desc)
return -EINVAL;
- }
-
- raw_spin_lock_irqsave(&desc->lock, flags);
desc->irq_data.msi_desc = entry;
if (entry)
entry->irq = irq;
- raw_spin_unlock_irqrestore(&desc->lock, flags);
+ irq_put_desc_unlock(desc, flags);
return 0;
}
/**
- * set_irq_chip_data - set irq chip data for an irq
+ * irq_set_chip_data - set irq chip data for an irq
* @irq: Interrupt number
* @data: Pointer to chip specific data
*
* Set the hardware irq chip data for an irq
*/
-int set_irq_chip_data(unsigned int irq, void *data)
+int irq_set_chip_data(unsigned int irq, void *data)
{
- struct irq_desc *desc = irq_to_desc(irq);
unsigned long flags;
+ struct irq_desc *desc = irq_get_desc_lock(irq, &flags);
- if (!desc) {
- printk(KERN_ERR
- "Trying to install chip data for IRQ%d\n", irq);
- return -EINVAL;
- }
-
- if (!desc->irq_data.chip) {
- printk(KERN_ERR "BUG: bad set_irq_chip_data(IRQ#%d)\n", irq);
+ if (!desc)
return -EINVAL;
- }
-
- raw_spin_lock_irqsave(&desc->lock, flags);
desc->irq_data.chip_data = data;
- raw_spin_unlock_irqrestore(&desc->lock, flags);
-
+ irq_put_desc_unlock(desc, flags);
return 0;
}
-EXPORT_SYMBOL(set_irq_chip_data);
+EXPORT_SYMBOL(irq_set_chip_data);
struct irq_data *irq_get_irq_data(unsigned int irq)
{
@@ -162,221 +137,71 @@ struct irq_data *irq_get_irq_data(unsigned int irq)
}
EXPORT_SYMBOL_GPL(irq_get_irq_data);
-/**
- * set_irq_nested_thread - Set/Reset the IRQ_NESTED_THREAD flag of an irq
- *
- * @irq: Interrupt number
- * @nest: 0 to clear / 1 to set the IRQ_NESTED_THREAD flag
- *
- * The IRQ_NESTED_THREAD flag indicates that on
- * request_threaded_irq() no separate interrupt thread should be
- * created for the irq as the handler are called nested in the
- * context of a demultiplexing interrupt handler thread.
- */
-void set_irq_nested_thread(unsigned int irq, int nest)
-{
- struct irq_desc *desc = irq_to_desc(irq);
- unsigned long flags;
-
- if (!desc)
- return;
-
- raw_spin_lock_irqsave(&desc->lock, flags);
- if (nest)
- desc->status |= IRQ_NESTED_THREAD;
- else
- desc->status &= ~IRQ_NESTED_THREAD;
- raw_spin_unlock_irqrestore(&desc->lock, flags);
-}
-EXPORT_SYMBOL_GPL(set_irq_nested_thread);
-
-/*
- * default enable function
- */
-static void default_enable(struct irq_data *data)
+static void irq_state_clr_disabled(struct irq_desc *desc)
{
- struct irq_desc *desc = irq_data_to_desc(data);
-
- desc->irq_data.chip->irq_unmask(&desc->irq_data);
- desc->status &= ~IRQ_MASKED;
+ irqd_clear(&desc->irq_data, IRQD_IRQ_DISABLED);
}
-/*
- * default disable function
- */
-static void default_disable(struct irq_data *data)
-{
-}
-
-/*
- * default startup function
- */
-static unsigned int default_startup(struct irq_data *data)
+static void irq_state_set_disabled(struct irq_desc *desc)
{
- struct irq_desc *desc = irq_data_to_desc(data);
-
- desc->irq_data.chip->irq_enable(data);
- return 0;
+ irqd_set(&desc->irq_data, IRQD_IRQ_DISABLED);
}
-/*
- * default shutdown function
- */
-static void default_shutdown(struct irq_data *data)
+static void irq_state_clr_masked(struct irq_desc *desc)
{
- struct irq_desc *desc = irq_data_to_desc(data);
-
- desc->irq_data.chip->irq_mask(&desc->irq_data);
- desc->status |= IRQ_MASKED;
+ irqd_clear(&desc->irq_data, IRQD_IRQ_MASKED);
}
-#ifndef CONFIG_GENERIC_HARDIRQS_NO_DEPRECATED
-/* Temporary migration helpers */
-static void compat_irq_mask(struct irq_data *data)
+static void irq_state_set_masked(struct irq_desc *desc)
{
- data->chip->mask(data->irq);
+ irqd_set(&desc->irq_data, IRQD_IRQ_MASKED);
}
-static void compat_irq_unmask(struct irq_data *data)
+int irq_startup(struct irq_desc *desc)
{
- data->chip->unmask(data->irq);
-}
+ irq_state_clr_disabled(desc);
+ desc->depth = 0;
-static void compat_irq_ack(struct irq_data *data)
-{
- data->chip->ack(data->irq);
-}
-
-static void compat_irq_mask_ack(struct irq_data *data)
-{
- data->chip->mask_ack(data->irq);
-}
-
-static void compat_irq_eoi(struct irq_data *data)
-{
- data->chip->eoi(data->irq);
-}
-
-static void compat_irq_enable(struct irq_data *data)
-{
- data->chip->enable(data->irq);
-}
-
-static void compat_irq_disable(struct irq_data *data)
-{
- data->chip->disable(data->irq);
-}
-
-static void compat_irq_shutdown(struct irq_data *data)
-{
- data->chip->shutdown(data->irq);
-}
-
-static unsigned int compat_irq_startup(struct irq_data *data)
-{
- return data->chip->startup(data->irq);
-}
-
-static int compat_irq_set_affinity(struct irq_data *data,
- const struct cpumask *dest, bool force)
-{
- return data->chip->set_affinity(data->irq, dest);
-}
-
-static int compat_irq_set_type(struct irq_data *data, unsigned int type)
-{
- return data->chip->set_type(data->irq, type);
-}
-
-static int compat_irq_set_wake(struct irq_data *data, unsigned int on)
-{
- return data->chip->set_wake(data->irq, on);
-}
+ if (desc->irq_data.chip->irq_startup) {
+ int ret = desc->irq_data.chip->irq_startup(&desc->irq_data);
+ irq_state_clr_masked(desc);
+ return ret;
+ }
-static int compat_irq_retrigger(struct irq_data *data)
-{
- return data->chip->retrigger(data->irq);
+ irq_enable(desc);
+ return 0;
}
-static void compat_bus_lock(struct irq_data *data)
+void irq_shutdown(struct irq_desc *desc)
{
- data->chip->bus_lock(data->irq);
+ irq_state_set_disabled(desc);
+ desc->depth = 1;
+ if (desc->irq_data.chip->irq_shutdown)
+ desc->irq_data.chip->irq_shutdown(&desc->irq_data);
+ if (desc->irq_data.chip->irq_disable)
+ desc->irq_data.chip->irq_disable(&desc->irq_data);
+ else
+ desc->irq_data.chip->irq_mask(&desc->irq_data);
+ irq_state_set_masked(desc);
}
-static void compat_bus_sync_unlock(struct irq_data *data)
+void irq_enable(struct irq_desc *desc)
{
- data->chip->bus_sync_unlock(data->irq);
+ irq_state_clr_disabled(desc);
+ if (desc->irq_data.chip->irq_enable)
+ desc->irq_data.chip->irq_enable(&desc->irq_data);
+ else
+ desc->irq_data.chip->irq_unmask(&desc->irq_data);
+ irq_state_clr_masked(desc);
}
-#endif
-/*
- * Fixup enable/disable function pointers
- */
-void irq_chip_set_defaults(struct irq_chip *chip)
+void irq_disable(struct irq_desc *desc)
{
-#ifndef CONFIG_GENERIC_HARDIRQS_NO_DEPRECATED
- /*
- * Compat fixup functions need to be before we set the
- * defaults for enable/disable/startup/shutdown
- */
- if (chip->enable)
- chip->irq_enable = compat_irq_enable;
- if (chip->disable)
- chip->irq_disable = compat_irq_disable;
- if (chip->shutdown)
- chip->irq_shutdown = compat_irq_shutdown;
- if (chip->startup)
- chip->irq_startup = compat_irq_startup;
-#endif
- /*
- * The real defaults
- */
- if (!chip->irq_enable)
- chip->irq_enable = default_enable;
- if (!chip->irq_disable)
- chip->irq_disable = default_disable;
- if (!chip->irq_startup)
- chip->irq_startup = default_startup;
- /*
- * We use chip->irq_disable, when the user provided its own. When
- * we have default_disable set for chip->irq_disable, then we need
- * to use default_shutdown, otherwise the irq line is not
- * disabled on free_irq():
- */
- if (!chip->irq_shutdown)
- chip->irq_shutdown = chip->irq_disable != default_disable ?
- chip->irq_disable : default_shutdown;
-
-#ifndef CONFIG_GENERIC_HARDIRQS_NO_DEPRECATED
- if (!chip->end)
- chip->end = dummy_irq_chip.end;
-
- /*
- * Now fix up the remaining compat handlers
- */
- if (chip->bus_lock)
- chip->irq_bus_lock = compat_bus_lock;
- if (chip->bus_sync_unlock)
- chip->irq_bus_sync_unlock = compat_bus_sync_unlock;
- if (chip->mask)
- chip->irq_mask = compat_irq_mask;
- if (chip->unmask)
- chip->irq_unmask = compat_irq_unmask;
- if (chip->ack)
- chip->irq_ack = compat_irq_ack;
- if (chip->mask_ack)
- chip->irq_mask_ack = compat_irq_mask_ack;
- if (chip->eoi)
- chip->irq_eoi = compat_irq_eoi;
- if (chip->set_affinity)
- chip->irq_set_affinity = compat_irq_set_affinity;
- if (chip->set_type)
- chip->irq_set_type = compat_irq_set_type;
- if (chip->set_wake)
- chip->irq_set_wake = compat_irq_set_wake;
- if (chip->retrigger)
- chip->irq_retrigger = compat_irq_retrigger;
-#endif
+ irq_state_set_disabled(desc);
+ if (desc->irq_data.chip->irq_disable) {
+ desc->irq_data.chip->irq_disable(&desc->irq_data);
+ irq_state_set_masked(desc);
+ }
}
static inline void mask_ack_irq(struct irq_desc *desc)
@@ -388,22 +213,22 @@ static inline void mask_ack_irq(struct irq_desc *desc)
if (desc->irq_data.chip->irq_ack)
desc->irq_data.chip->irq_ack(&desc->irq_data);
}
- desc->status |= IRQ_MASKED;
+ irq_state_set_masked(desc);
}
-static inline void mask_irq(struct irq_desc *desc)
+void mask_irq(struct irq_desc *desc)
{
if (desc->irq_data.chip->irq_mask) {
desc->irq_data.chip->irq_mask(&desc->irq_data);
- desc->status |= IRQ_MASKED;
+ irq_state_set_masked(desc);
}
}
-static inline void unmask_irq(struct irq_desc *desc)
+void unmask_irq(struct irq_desc *desc)
{
if (desc->irq_data.chip->irq_unmask) {
desc->irq_data.chip->irq_unmask(&desc->irq_data);
- desc->status &= ~IRQ_MASKED;
+ irq_state_clr_masked(desc);
}
}
@@ -428,10 +253,10 @@ void handle_nested_irq(unsigned int irq)
kstat_incr_irqs_this_cpu(irq, desc);
action = desc->action;
- if (unlikely(!action || (desc->status & IRQ_DISABLED)))
+ if (unlikely(!action || irqd_irq_disabled(&desc->irq_data)))
goto out_unlock;
- desc->status |= IRQ_INPROGRESS;
+ irqd_set(&desc->irq_data, IRQD_IRQ_INPROGRESS);
raw_spin_unlock_irq(&desc->lock);
action_ret = action->thread_fn(action->irq, action->dev_id);
@@ -439,13 +264,20 @@ void handle_nested_irq(unsigned int irq)
note_interrupt(irq, desc, action_ret);
raw_spin_lock_irq(&desc->lock);
- desc->status &= ~IRQ_INPROGRESS;
+ irqd_clear(&desc->irq_data, IRQD_IRQ_INPROGRESS);
out_unlock:
raw_spin_unlock_irq(&desc->lock);
}
EXPORT_SYMBOL_GPL(handle_nested_irq);
+static bool irq_check_poll(struct irq_desc *desc)
+{
+ if (!(desc->istate & IRQS_POLL_INPROGRESS))
+ return false;
+ return irq_wait_for_poll(desc);
+}
+
/**
* handle_simple_irq - Simple and software-decoded IRQs.
* @irq: the interrupt number
@@ -461,32 +293,24 @@ EXPORT_SYMBOL_GPL(handle_nested_irq);
void
handle_simple_irq(unsigned int irq, struct irq_desc *desc)
{
- struct irqaction *action;
- irqreturn_t action_ret;
-
raw_spin_lock(&desc->lock);
- if (unlikely(desc->status & IRQ_INPROGRESS))
- goto out_unlock;
- desc->status &= ~(IRQ_REPLAY | IRQ_WAITING);
+ if (unlikely(irqd_irq_inprogress(&desc->irq_data)))
+ if (!irq_check_poll(desc))
+ goto out_unlock;
+
+ desc->istate &= ~(IRQS_REPLAY | IRQS_WAITING);
kstat_incr_irqs_this_cpu(irq, desc);
- action = desc->action;
- if (unlikely(!action || (desc->status & IRQ_DISABLED)))
+ if (unlikely(!desc->action || irqd_irq_disabled(&desc->irq_data)))
goto out_unlock;
- desc->status |= IRQ_INPROGRESS;
- raw_spin_unlock(&desc->lock);
+ handle_irq_event(desc);
- action_ret = handle_IRQ_event(irq, action);
- if (!noirqdebug)
- note_interrupt(irq, desc, action_ret);
-
- raw_spin_lock(&desc->lock);
- desc->status &= ~IRQ_INPROGRESS;
out_unlock:
raw_spin_unlock(&desc->lock);
}
+EXPORT_SYMBOL_GPL(handle_simple_irq);
/**
* handle_level_irq - Level type irq handler
@@ -501,42 +325,42 @@ out_unlock:
void
handle_level_irq(unsigned int irq, struct irq_desc *desc)
{
- struct irqaction *action;
- irqreturn_t action_ret;
-
raw_spin_lock(&desc->lock);
mask_ack_irq(desc);
- if (unlikely(desc->status & IRQ_INPROGRESS))
- goto out_unlock;
- desc->status &= ~(IRQ_REPLAY | IRQ_WAITING);
+ if (unlikely(irqd_irq_inprogress(&desc->irq_data)))
+ if (!irq_check_poll(desc))
+ goto out_unlock;
+
+ desc->istate &= ~(IRQS_REPLAY | IRQS_WAITING);
kstat_incr_irqs_this_cpu(irq, desc);
/*
* If its disabled or no action available
* keep it masked and get out of here
*/
- action = desc->action;
- if (unlikely(!action || (desc->status & IRQ_DISABLED)))
+ if (unlikely(!desc->action || irqd_irq_disabled(&desc->irq_data)))
goto out_unlock;
- desc->status |= IRQ_INPROGRESS;
- raw_spin_unlock(&desc->lock);
-
- action_ret = handle_IRQ_event(irq, action);
- if (!noirqdebug)
- note_interrupt(irq, desc, action_ret);
-
- raw_spin_lock(&desc->lock);
- desc->status &= ~IRQ_INPROGRESS;
+ handle_irq_event(desc);
- if (!(desc->status & (IRQ_DISABLED | IRQ_ONESHOT)))
+ if (!irqd_irq_disabled(&desc->irq_data) && !(desc->istate & IRQS_ONESHOT))
unmask_irq(desc);
out_unlock:
raw_spin_unlock(&desc->lock);
}
EXPORT_SYMBOL_GPL(handle_level_irq);
+#ifdef CONFIG_IRQ_PREFLOW_FASTEOI
+static inline void preflow_handler(struct irq_desc *desc)
+{
+ if (desc->preflow_handler)
+ desc->preflow_handler(&desc->irq_data);
+}
+#else
+static inline void preflow_handler(struct irq_desc *desc) { }
+#endif
+
/**
* handle_fasteoi_irq - irq handler for transparent controllers
* @irq: the interrupt number
@@ -550,42 +374,40 @@ EXPORT_SYMBOL_GPL(handle_level_irq);
void
handle_fasteoi_irq(unsigned int irq, struct irq_desc *desc)
{
- struct irqaction *action;
- irqreturn_t action_ret;
-
raw_spin_lock(&desc->lock);
- if (unlikely(desc->status & IRQ_INPROGRESS))
- goto out;
+ if (unlikely(irqd_irq_inprogress(&desc->irq_data)))
+ if (!irq_check_poll(desc))
+ goto out;
- desc->status &= ~(IRQ_REPLAY | IRQ_WAITING);
+ desc->istate &= ~(IRQS_REPLAY | IRQS_WAITING);
kstat_incr_irqs_this_cpu(irq, desc);
/*
* If its disabled or no action available
* then mask it and get out of here:
*/
- action = desc->action;
- if (unlikely(!action || (desc->status & IRQ_DISABLED))) {
- desc->status |= IRQ_PENDING;
+ if (unlikely(!desc->action || irqd_irq_disabled(&desc->irq_data))) {
+ desc->istate |= IRQS_PENDING;
mask_irq(desc);
goto out;
}
- desc->status |= IRQ_INPROGRESS;
- desc->status &= ~IRQ_PENDING;
- raw_spin_unlock(&desc->lock);
+ if (desc->istate & IRQS_ONESHOT)
+ mask_irq(desc);
- action_ret = handle_IRQ_event(irq, action);
- if (!noirqdebug)
- note_interrupt(irq, desc, action_ret);
+ preflow_handler(desc);
+ handle_irq_event(desc);
- raw_spin_lock(&desc->lock);
- desc->status &= ~IRQ_INPROGRESS;
-out:
+out_eoi:
desc->irq_data.chip->irq_eoi(&desc->irq_data);
-
+out_unlock:
raw_spin_unlock(&desc->lock);
+ return;
+out:
+ if (!(desc->irq_data.chip->flags & IRQCHIP_EOI_IF_HANDLED))
+ goto out_eoi;
+ goto out_unlock;
}
/**
@@ -594,7 +416,7 @@ out:
* @desc: the interrupt description structure for this irq
*
* Interrupt occures on the falling and/or rising edge of a hardware
- * signal. The occurence is latched into the irq controller hardware
+ * signal. The occurrence is latched into the irq controller hardware
* and must be acked in order to be reenabled. After the ack another
* interrupt can happen on the same source even before the first one
* is handled by the associated event handler. If this happens it
@@ -609,32 +431,27 @@ handle_edge_irq(unsigned int irq, struct irq_desc *desc)
{
raw_spin_lock(&desc->lock);
- desc->status &= ~(IRQ_REPLAY | IRQ_WAITING);
-
+ desc->istate &= ~(IRQS_REPLAY | IRQS_WAITING);
/*
* If we're currently running this IRQ, or its disabled,
* we shouldn't process the IRQ. Mark it pending, handle
* the necessary masking and go out
*/
- if (unlikely((desc->status & (IRQ_INPROGRESS | IRQ_DISABLED)) ||
- !desc->action)) {
- desc->status |= (IRQ_PENDING | IRQ_MASKED);
- mask_ack_irq(desc);
- goto out_unlock;
+ if (unlikely(irqd_irq_disabled(&desc->irq_data) ||
+ irqd_irq_inprogress(&desc->irq_data) || !desc->action)) {
+ if (!irq_check_poll(desc)) {
+ desc->istate |= IRQS_PENDING;
+ mask_ack_irq(desc);
+ goto out_unlock;
+ }
}
kstat_incr_irqs_this_cpu(irq, desc);
/* Start handling the irq */
desc->irq_data.chip->irq_ack(&desc->irq_data);
- /* Mark the IRQ currently in progress.*/
- desc->status |= IRQ_INPROGRESS;
-
do {
- struct irqaction *action = desc->action;
- irqreturn_t action_ret;
-
- if (unlikely(!action)) {
+ if (unlikely(!desc->action)) {
mask_irq(desc);
goto out_unlock;
}
@@ -644,26 +461,66 @@ handle_edge_irq(unsigned int irq, struct irq_desc *desc)
* one, we could have masked the irq.
* Renable it, if it was not disabled in meantime.
*/
- if (unlikely((desc->status &
- (IRQ_PENDING | IRQ_MASKED | IRQ_DISABLED)) ==
- (IRQ_PENDING | IRQ_MASKED))) {
- unmask_irq(desc);
+ if (unlikely(desc->istate & IRQS_PENDING)) {
+ if (!irqd_irq_disabled(&desc->irq_data) &&
+ irqd_irq_masked(&desc->irq_data))
+ unmask_irq(desc);
}
- desc->status &= ~IRQ_PENDING;
- raw_spin_unlock(&desc->lock);
- action_ret = handle_IRQ_event(irq, action);
- if (!noirqdebug)
- note_interrupt(irq, desc, action_ret);
- raw_spin_lock(&desc->lock);
+ handle_irq_event(desc);
- } while ((desc->status & (IRQ_PENDING | IRQ_DISABLED)) == IRQ_PENDING);
+ } while ((desc->istate & IRQS_PENDING) &&
+ !irqd_irq_disabled(&desc->irq_data));
- desc->status &= ~IRQ_INPROGRESS;
out_unlock:
raw_spin_unlock(&desc->lock);
}
+#ifdef CONFIG_IRQ_EDGE_EOI_HANDLER
+/**
+ * handle_edge_eoi_irq - edge eoi type IRQ handler
+ * @irq: the interrupt number
+ * @desc: the interrupt description structure for this irq
+ *
+ * Similar as the above handle_edge_irq, but using eoi and w/o the
+ * mask/unmask logic.
+ */
+void handle_edge_eoi_irq(unsigned int irq, struct irq_desc *desc)
+{
+ struct irq_chip *chip = irq_desc_get_chip(desc);
+
+ raw_spin_lock(&desc->lock);
+
+ desc->istate &= ~(IRQS_REPLAY | IRQS_WAITING);
+ /*
+ * If we're currently running this IRQ, or its disabled,
+ * we shouldn't process the IRQ. Mark it pending, handle
+ * the necessary masking and go out
+ */
+ if (unlikely(irqd_irq_disabled(&desc->irq_data) ||
+ irqd_irq_inprogress(&desc->irq_data) || !desc->action)) {
+ if (!irq_check_poll(desc)) {
+ desc->istate |= IRQS_PENDING;
+ goto out_eoi;
+ }
+ }
+ kstat_incr_irqs_this_cpu(irq, desc);
+
+ do {
+ if (unlikely(!desc->action))
+ goto out_eoi;
+
+ handle_irq_event(desc);
+
+ } while ((desc->istate & IRQS_PENDING) &&
+ !irqd_irq_disabled(&desc->irq_data));
+
+out_eoi:
+ chip->irq_eoi(&desc->irq_data);
+ raw_spin_unlock(&desc->lock);
+}
+#endif
+
/**
* handle_percpu_irq - Per CPU local irq handler
* @irq: the interrupt number
@@ -674,103 +531,147 @@ out_unlock:
void
handle_percpu_irq(unsigned int irq, struct irq_desc *desc)
{
- irqreturn_t action_ret;
+ struct irq_chip *chip = irq_desc_get_chip(desc);
kstat_incr_irqs_this_cpu(irq, desc);
- if (desc->irq_data.chip->irq_ack)
- desc->irq_data.chip->irq_ack(&desc->irq_data);
+ if (chip->irq_ack)
+ chip->irq_ack(&desc->irq_data);
- action_ret = handle_IRQ_event(irq, desc->action);
- if (!noirqdebug)
- note_interrupt(irq, desc, action_ret);
+ handle_irq_event_percpu(desc, desc->action);
- if (desc->irq_data.chip->irq_eoi)
- desc->irq_data.chip->irq_eoi(&desc->irq_data);
+ if (chip->irq_eoi)
+ chip->irq_eoi(&desc->irq_data);
}
void
-__set_irq_handler(unsigned int irq, irq_flow_handler_t handle, int is_chained,
+__irq_set_handler(unsigned int irq, irq_flow_handler_t handle, int is_chained,
const char *name)
{
- struct irq_desc *desc = irq_to_desc(irq);
unsigned long flags;
+ struct irq_desc *desc = irq_get_desc_buslock(irq, &flags);
- if (!desc) {
- printk(KERN_ERR
- "Trying to install type control for IRQ%d\n", irq);
+ if (!desc)
return;
- }
- if (!handle)
+ if (!handle) {
handle = handle_bad_irq;
- else if (desc->irq_data.chip == &no_irq_chip) {
- printk(KERN_WARNING "Trying to install %sinterrupt handler "
- "for IRQ%d\n", is_chained ? "chained " : "", irq);
- /*
- * Some ARM implementations install a handler for really dumb
- * interrupt hardware without setting an irq_chip. This worked
- * with the ARM no_irq_chip but the check in setup_irq would
- * prevent us to setup the interrupt at all. Switch it to
- * dummy_irq_chip for easy transition.
- */
- desc->irq_data.chip = &dummy_irq_chip;
+ } else {
+ if (WARN_ON(desc->irq_data.chip == &no_irq_chip))
+ goto out;
}
- chip_bus_lock(desc);
- raw_spin_lock_irqsave(&desc->lock, flags);
-
/* Uninstall? */
if (handle == handle_bad_irq) {
if (desc->irq_data.chip != &no_irq_chip)
mask_ack_irq(desc);
- desc->status |= IRQ_DISABLED;
+ irq_state_set_disabled(desc);
desc->depth = 1;
}
desc->handle_irq = handle;
desc->name = name;
if (handle != handle_bad_irq && is_chained) {
- desc->status &= ~IRQ_DISABLED;
- desc->status |= IRQ_NOREQUEST | IRQ_NOPROBE;
- desc->depth = 0;
- desc->irq_data.chip->irq_startup(&desc->irq_data);
+ irq_settings_set_noprobe(desc);
+ irq_settings_set_norequest(desc);
+ irq_settings_set_nothread(desc);
+ irq_startup(desc);
}
- raw_spin_unlock_irqrestore(&desc->lock, flags);
- chip_bus_sync_unlock(desc);
-}
-EXPORT_SYMBOL_GPL(__set_irq_handler);
-
-void
-set_irq_chip_and_handler(unsigned int irq, struct irq_chip *chip,
- irq_flow_handler_t handle)
-{
- set_irq_chip(irq, chip);
- __set_irq_handler(irq, handle, 0, NULL);
+out:
+ irq_put_desc_busunlock(desc, flags);
}
+EXPORT_SYMBOL_GPL(__irq_set_handler);
void
-set_irq_chip_and_handler_name(unsigned int irq, struct irq_chip *chip,
+irq_set_chip_and_handler_name(unsigned int irq, struct irq_chip *chip,
irq_flow_handler_t handle, const char *name)
{
- set_irq_chip(irq, chip);
- __set_irq_handler(irq, handle, 0, name);
+ irq_set_chip(irq, chip);
+ __irq_set_handler(irq, handle, 0, name);
}
void irq_modify_status(unsigned int irq, unsigned long clr, unsigned long set)
{
- struct irq_desc *desc = irq_to_desc(irq);
unsigned long flags;
+ struct irq_desc *desc = irq_get_desc_lock(irq, &flags);
if (!desc)
return;
+ irq_settings_clr_and_set(desc, clr, set);
+
+ irqd_clear(&desc->irq_data, IRQD_NO_BALANCING | IRQD_PER_CPU |
+ IRQD_TRIGGER_MASK | IRQD_LEVEL | IRQD_MOVE_PCNTXT);
+ if (irq_settings_has_no_balance_set(desc))
+ irqd_set(&desc->irq_data, IRQD_NO_BALANCING);
+ if (irq_settings_is_per_cpu(desc))
+ irqd_set(&desc->irq_data, IRQD_PER_CPU);
+ if (irq_settings_can_move_pcntxt(desc))
+ irqd_set(&desc->irq_data, IRQD_MOVE_PCNTXT);
+ if (irq_settings_is_level(desc))
+ irqd_set(&desc->irq_data, IRQD_LEVEL);
+
+ irqd_set(&desc->irq_data, irq_settings_get_trigger_mask(desc));
+
+ irq_put_desc_unlock(desc, flags);
+}
+EXPORT_SYMBOL_GPL(irq_modify_status);
+
+/**
+ * irq_cpu_online - Invoke all irq_cpu_online functions.
+ *
+ * Iterate through all irqs and invoke the chip.irq_cpu_online()
+ * for each.
+ */
+void irq_cpu_online(void)
+{
+ struct irq_desc *desc;
+ struct irq_chip *chip;
+ unsigned long flags;
+ unsigned int irq;
+
+ for_each_active_irq(irq) {
+ desc = irq_to_desc(irq);
+ if (!desc)
+ continue;
- /* Sanitize flags */
- set &= IRQF_MODIFY_MASK;
- clr &= IRQF_MODIFY_MASK;
+ raw_spin_lock_irqsave(&desc->lock, flags);
- raw_spin_lock_irqsave(&desc->lock, flags);
- desc->status &= ~clr;
- desc->status |= set;
- raw_spin_unlock_irqrestore(&desc->lock, flags);
+ chip = irq_data_get_irq_chip(&desc->irq_data);
+ if (chip && chip->irq_cpu_online &&
+ (!(chip->flags & IRQCHIP_ONOFFLINE_ENABLED) ||
+ !irqd_irq_disabled(&desc->irq_data)))
+ chip->irq_cpu_online(&desc->irq_data);
+
+ raw_spin_unlock_irqrestore(&desc->lock, flags);
+ }
+}
+
+/**
+ * irq_cpu_offline - Invoke all irq_cpu_offline functions.
+ *
+ * Iterate through all irqs and invoke the chip.irq_cpu_offline()
+ * for each.
+ */
+void irq_cpu_offline(void)
+{
+ struct irq_desc *desc;
+ struct irq_chip *chip;
+ unsigned long flags;
+ unsigned int irq;
+
+ for_each_active_irq(irq) {
+ desc = irq_to_desc(irq);
+ if (!desc)
+ continue;
+
+ raw_spin_lock_irqsave(&desc->lock, flags);
+
+ chip = irq_data_get_irq_chip(&desc->irq_data);
+ if (chip && chip->irq_cpu_offline &&
+ (!(chip->flags & IRQCHIP_ONOFFLINE_ENABLED) ||
+ !irqd_irq_disabled(&desc->irq_data)))
+ chip->irq_cpu_offline(&desc->irq_data);
+
+ raw_spin_unlock_irqrestore(&desc->lock, flags);
+ }
}
diff --git a/kernel/irq/debug.h b/kernel/irq/debug.h
new file mode 100644
index 000000000000..97a8bfadc88a
--- /dev/null
+++ b/kernel/irq/debug.h
@@ -0,0 +1,45 @@
+/*
+ * Debugging printout:
+ */
+
+#include <linux/kallsyms.h>
+
+#define P(f) if (desc->status_use_accessors & f) printk("%14s set\n", #f)
+#define PS(f) if (desc->istate & f) printk("%14s set\n", #f)
+/* FIXME */
+#define PD(f) do { } while (0)
+
+static inline void print_irq_desc(unsigned int irq, struct irq_desc *desc)
+{
+ printk("irq %d, desc: %p, depth: %d, count: %d, unhandled: %d\n",
+ irq, desc, desc->depth, desc->irq_count, desc->irqs_unhandled);
+ printk("->handle_irq(): %p, ", desc->handle_irq);
+ print_symbol("%s\n", (unsigned long)desc->handle_irq);
+ printk("->irq_data.chip(): %p, ", desc->irq_data.chip);
+ print_symbol("%s\n", (unsigned long)desc->irq_data.chip);
+ printk("->action(): %p\n", desc->action);
+ if (desc->action) {
+ printk("->action->handler(): %p, ", desc->action->handler);
+ print_symbol("%s\n", (unsigned long)desc->action->handler);
+ }
+
+ P(IRQ_LEVEL);
+ P(IRQ_PER_CPU);
+ P(IRQ_NOPROBE);
+ P(IRQ_NOREQUEST);
+ P(IRQ_NOTHREAD);
+ P(IRQ_NOAUTOEN);
+
+ PS(IRQS_AUTODETECT);
+ PS(IRQS_REPLAY);
+ PS(IRQS_WAITING);
+ PS(IRQS_PENDING);
+
+ PD(IRQS_INPROGRESS);
+ PD(IRQS_DISABLED);
+ PD(IRQS_MASKED);
+}
+
+#undef P
+#undef PS
+#undef PD
diff --git a/kernel/irq/dummychip.c b/kernel/irq/dummychip.c
index 20dc5474947e..b5fcd96c7102 100644
--- a/kernel/irq/dummychip.c
+++ b/kernel/irq/dummychip.c
@@ -31,13 +31,6 @@ static unsigned int noop_ret(struct irq_data *data)
return 0;
}
-#ifndef CONFIG_GENERIC_HARDIRQS_NO_DEPRECATED
-static void compat_noop(unsigned int irq) { }
-#define END_INIT .end = compat_noop
-#else
-#define END_INIT
-#endif
-
/*
* Generic no controller implementation
*/
@@ -48,7 +41,6 @@ struct irq_chip no_irq_chip = {
.irq_enable = noop,
.irq_disable = noop,
.irq_ack = ack_bad,
- END_INIT
};
/*
@@ -64,5 +56,4 @@ struct irq_chip dummy_irq_chip = {
.irq_ack = noop,
.irq_mask = noop,
.irq_unmask = noop,
- END_INIT
};
diff --git a/kernel/irq/generic-chip.c b/kernel/irq/generic-chip.c
new file mode 100644
index 000000000000..31a9db711906
--- /dev/null
+++ b/kernel/irq/generic-chip.c
@@ -0,0 +1,354 @@
+/*
+ * Library implementing the most common irq chip callback functions
+ *
+ * Copyright (C) 2011, Thomas Gleixner
+ */
+#include <linux/io.h>
+#include <linux/irq.h>
+#include <linux/slab.h>
+#include <linux/interrupt.h>
+#include <linux/kernel_stat.h>
+#include <linux/syscore_ops.h>
+
+#include "internals.h"
+
+static LIST_HEAD(gc_list);
+static DEFINE_RAW_SPINLOCK(gc_lock);
+
+static inline struct irq_chip_regs *cur_regs(struct irq_data *d)
+{
+ return &container_of(d->chip, struct irq_chip_type, chip)->regs;
+}
+
+/**
+ * irq_gc_noop - NOOP function
+ * @d: irq_data
+ */
+void irq_gc_noop(struct irq_data *d)
+{
+}
+
+/**
+ * irq_gc_mask_disable_reg - Mask chip via disable register
+ * @d: irq_data
+ *
+ * Chip has separate enable/disable registers instead of a single mask
+ * register.
+ */
+void irq_gc_mask_disable_reg(struct irq_data *d)
+{
+ struct irq_chip_generic *gc = irq_data_get_irq_chip_data(d);
+ u32 mask = 1 << (d->irq - gc->irq_base);
+
+ irq_gc_lock(gc);
+ irq_reg_writel(mask, gc->reg_base + cur_regs(d)->disable);
+ gc->mask_cache &= ~mask;
+ irq_gc_unlock(gc);
+}
+
+/**
+ * irq_gc_mask_set_mask_bit - Mask chip via setting bit in mask register
+ * @d: irq_data
+ *
+ * Chip has a single mask register. Values of this register are cached
+ * and protected by gc->lock
+ */
+void irq_gc_mask_set_bit(struct irq_data *d)
+{
+ struct irq_chip_generic *gc = irq_data_get_irq_chip_data(d);
+ u32 mask = 1 << (d->irq - gc->irq_base);
+
+ irq_gc_lock(gc);
+ gc->mask_cache |= mask;
+ irq_reg_writel(gc->mask_cache, gc->reg_base + cur_regs(d)->mask);
+ irq_gc_unlock(gc);
+}
+
+/**
+ * irq_gc_mask_set_mask_bit - Mask chip via clearing bit in mask register
+ * @d: irq_data
+ *
+ * Chip has a single mask register. Values of this register are cached
+ * and protected by gc->lock
+ */
+void irq_gc_mask_clr_bit(struct irq_data *d)
+{
+ struct irq_chip_generic *gc = irq_data_get_irq_chip_data(d);
+ u32 mask = 1 << (d->irq - gc->irq_base);
+
+ irq_gc_lock(gc);
+ gc->mask_cache &= ~mask;
+ irq_reg_writel(gc->mask_cache, gc->reg_base + cur_regs(d)->mask);
+ irq_gc_unlock(gc);
+}
+
+/**
+ * irq_gc_unmask_enable_reg - Unmask chip via enable register
+ * @d: irq_data
+ *
+ * Chip has separate enable/disable registers instead of a single mask
+ * register.
+ */
+void irq_gc_unmask_enable_reg(struct irq_data *d)
+{
+ struct irq_chip_generic *gc = irq_data_get_irq_chip_data(d);
+ u32 mask = 1 << (d->irq - gc->irq_base);
+
+ irq_gc_lock(gc);
+ irq_reg_writel(mask, gc->reg_base + cur_regs(d)->enable);
+ gc->mask_cache |= mask;
+ irq_gc_unlock(gc);
+}
+
+/**
+ * irq_gc_ack - Ack pending interrupt
+ * @d: irq_data
+ */
+void irq_gc_ack(struct irq_data *d)
+{
+ struct irq_chip_generic *gc = irq_data_get_irq_chip_data(d);
+ u32 mask = 1 << (d->irq - gc->irq_base);
+
+ irq_gc_lock(gc);
+ irq_reg_writel(mask, gc->reg_base + cur_regs(d)->ack);
+ irq_gc_unlock(gc);
+}
+
+/**
+ * irq_gc_mask_disable_reg_and_ack- Mask and ack pending interrupt
+ * @d: irq_data
+ */
+void irq_gc_mask_disable_reg_and_ack(struct irq_data *d)
+{
+ struct irq_chip_generic *gc = irq_data_get_irq_chip_data(d);
+ u32 mask = 1 << (d->irq - gc->irq_base);
+
+ irq_gc_lock(gc);
+ irq_reg_writel(mask, gc->reg_base + cur_regs(d)->mask);
+ irq_reg_writel(mask, gc->reg_base + cur_regs(d)->ack);
+ irq_gc_unlock(gc);
+}
+
+/**
+ * irq_gc_eoi - EOI interrupt
+ * @d: irq_data
+ */
+void irq_gc_eoi(struct irq_data *d)
+{
+ struct irq_chip_generic *gc = irq_data_get_irq_chip_data(d);
+ u32 mask = 1 << (d->irq - gc->irq_base);
+
+ irq_gc_lock(gc);
+ irq_reg_writel(mask, gc->reg_base + cur_regs(d)->eoi);
+ irq_gc_unlock(gc);
+}
+
+/**
+ * irq_gc_set_wake - Set/clr wake bit for an interrupt
+ * @d: irq_data
+ *
+ * For chips where the wake from suspend functionality is not
+ * configured in a separate register and the wakeup active state is
+ * just stored in a bitmask.
+ */
+int irq_gc_set_wake(struct irq_data *d, unsigned int on)
+{
+ struct irq_chip_generic *gc = irq_data_get_irq_chip_data(d);
+ u32 mask = 1 << (d->irq - gc->irq_base);
+
+ if (!(mask & gc->wake_enabled))
+ return -EINVAL;
+
+ irq_gc_lock(gc);
+ if (on)
+ gc->wake_active |= mask;
+ else
+ gc->wake_active &= ~mask;
+ irq_gc_unlock(gc);
+ return 0;
+}
+
+/**
+ * irq_alloc_generic_chip - Allocate a generic chip and initialize it
+ * @name: Name of the irq chip
+ * @num_ct: Number of irq_chip_type instances associated with this
+ * @irq_base: Interrupt base nr for this chip
+ * @reg_base: Register base address (virtual)
+ * @handler: Default flow handler associated with this chip
+ *
+ * Returns an initialized irq_chip_generic structure. The chip defaults
+ * to the primary (index 0) irq_chip_type and @handler
+ */
+struct irq_chip_generic *
+irq_alloc_generic_chip(const char *name, int num_ct, unsigned int irq_base,
+ void __iomem *reg_base, irq_flow_handler_t handler)
+{
+ struct irq_chip_generic *gc;
+ unsigned long sz = sizeof(*gc) + num_ct * sizeof(struct irq_chip_type);
+
+ gc = kzalloc(sz, GFP_KERNEL);
+ if (gc) {
+ raw_spin_lock_init(&gc->lock);
+ gc->num_ct = num_ct;
+ gc->irq_base = irq_base;
+ gc->reg_base = reg_base;
+ gc->chip_types->chip.name = name;
+ gc->chip_types->handler = handler;
+ }
+ return gc;
+}
+
+/*
+ * Separate lockdep class for interrupt chip which can nest irq_desc
+ * lock.
+ */
+static struct lock_class_key irq_nested_lock_class;
+
+/**
+ * irq_setup_generic_chip - Setup a range of interrupts with a generic chip
+ * @gc: Generic irq chip holding all data
+ * @msk: Bitmask holding the irqs to initialize relative to gc->irq_base
+ * @flags: Flags for initialization
+ * @clr: IRQ_* bits to clear
+ * @set: IRQ_* bits to set
+ *
+ * Set up max. 32 interrupts starting from gc->irq_base. Note, this
+ * initializes all interrupts to the primary irq_chip_type and its
+ * associated handler.
+ */
+void irq_setup_generic_chip(struct irq_chip_generic *gc, u32 msk,
+ enum irq_gc_flags flags, unsigned int clr,
+ unsigned int set)
+{
+ struct irq_chip_type *ct = gc->chip_types;
+ unsigned int i;
+
+ raw_spin_lock(&gc_lock);
+ list_add_tail(&gc->list, &gc_list);
+ raw_spin_unlock(&gc_lock);
+
+ /* Init mask cache ? */
+ if (flags & IRQ_GC_INIT_MASK_CACHE)
+ gc->mask_cache = irq_reg_readl(gc->reg_base + ct->regs.mask);
+
+ for (i = gc->irq_base; msk; msk >>= 1, i++) {
+ if (!msk & 0x01)
+ continue;
+
+ if (flags & IRQ_GC_INIT_NESTED_LOCK)
+ irq_set_lockdep_class(i, &irq_nested_lock_class);
+
+ irq_set_chip_and_handler(i, &ct->chip, ct->handler);
+ irq_set_chip_data(i, gc);
+ irq_modify_status(i, clr, set);
+ }
+ gc->irq_cnt = i - gc->irq_base;
+}
+
+/**
+ * irq_setup_alt_chip - Switch to alternative chip
+ * @d: irq_data for this interrupt
+ * @type Flow type to be initialized
+ *
+ * Only to be called from chip->irq_set_type() callbacks.
+ */
+int irq_setup_alt_chip(struct irq_data *d, unsigned int type)
+{
+ struct irq_chip_generic *gc = irq_data_get_irq_chip_data(d);
+ struct irq_chip_type *ct = gc->chip_types;
+ unsigned int i;
+
+ for (i = 0; i < gc->num_ct; i++, ct++) {
+ if (ct->type & type) {
+ d->chip = &ct->chip;
+ irq_data_to_desc(d)->handle_irq = ct->handler;
+ return 0;
+ }
+ }
+ return -EINVAL;
+}
+
+/**
+ * irq_remove_generic_chip - Remove a chip
+ * @gc: Generic irq chip holding all data
+ * @msk: Bitmask holding the irqs to initialize relative to gc->irq_base
+ * @clr: IRQ_* bits to clear
+ * @set: IRQ_* bits to set
+ *
+ * Remove up to 32 interrupts starting from gc->irq_base.
+ */
+void irq_remove_generic_chip(struct irq_chip_generic *gc, u32 msk,
+ unsigned int clr, unsigned int set)
+{
+ unsigned int i = gc->irq_base;
+
+ raw_spin_lock(&gc_lock);
+ list_del(&gc->list);
+ raw_spin_unlock(&gc_lock);
+
+ for (; msk; msk >>= 1, i++) {
+ if (!msk & 0x01)
+ continue;
+
+ /* Remove handler first. That will mask the irq line */
+ irq_set_handler(i, NULL);
+ irq_set_chip(i, &no_irq_chip);
+ irq_set_chip_data(i, NULL);
+ irq_modify_status(i, clr, set);
+ }
+}
+
+#ifdef CONFIG_PM
+static int irq_gc_suspend(void)
+{
+ struct irq_chip_generic *gc;
+
+ list_for_each_entry(gc, &gc_list, list) {
+ struct irq_chip_type *ct = gc->chip_types;
+
+ if (ct->chip.irq_suspend)
+ ct->chip.irq_suspend(irq_get_irq_data(gc->irq_base));
+ }
+ return 0;
+}
+
+static void irq_gc_resume(void)
+{
+ struct irq_chip_generic *gc;
+
+ list_for_each_entry(gc, &gc_list, list) {
+ struct irq_chip_type *ct = gc->chip_types;
+
+ if (ct->chip.irq_resume)
+ ct->chip.irq_resume(irq_get_irq_data(gc->irq_base));
+ }
+}
+#else
+#define irq_gc_suspend NULL
+#define irq_gc_resume NULL
+#endif
+
+static void irq_gc_shutdown(void)
+{
+ struct irq_chip_generic *gc;
+
+ list_for_each_entry(gc, &gc_list, list) {
+ struct irq_chip_type *ct = gc->chip_types;
+
+ if (ct->chip.irq_pm_shutdown)
+ ct->chip.irq_pm_shutdown(irq_get_irq_data(gc->irq_base));
+ }
+}
+
+static struct syscore_ops irq_gc_syscore_ops = {
+ .suspend = irq_gc_suspend,
+ .resume = irq_gc_resume,
+ .shutdown = irq_gc_shutdown,
+};
+
+static int __init irq_gc_init_ops(void)
+{
+ register_syscore_ops(&irq_gc_syscore_ops);
+ return 0;
+}
+device_initcall(irq_gc_init_ops);
diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c
index 3540a7190122..90cb55f6d7eb 100644
--- a/kernel/irq/handle.c
+++ b/kernel/irq/handle.c
@@ -51,30 +51,92 @@ static void warn_no_thread(unsigned int irq, struct irqaction *action)
"but no thread function available.", irq, action->name);
}
-/**
- * handle_IRQ_event - irq action chain handler
- * @irq: the interrupt number
- * @action: the interrupt action chain for this irq
- *
- * Handles the action chain of an irq event
- */
-irqreturn_t handle_IRQ_event(unsigned int irq, struct irqaction *action)
+static void irq_wake_thread(struct irq_desc *desc, struct irqaction *action)
+{
+ /*
+ * Wake up the handler thread for this action. In case the
+ * thread crashed and was killed we just pretend that we
+ * handled the interrupt. The hardirq handler has disabled the
+ * device interrupt, so no irq storm is lurking. If the
+ * RUNTHREAD bit is already set, nothing to do.
+ */
+ if (test_bit(IRQTF_DIED, &action->thread_flags) ||
+ test_and_set_bit(IRQTF_RUNTHREAD, &action->thread_flags))
+ return;
+
+ /*
+ * It's safe to OR the mask lockless here. We have only two
+ * places which write to threads_oneshot: This code and the
+ * irq thread.
+ *
+ * This code is the hard irq context and can never run on two
+ * cpus in parallel. If it ever does we have more serious
+ * problems than this bitmask.
+ *
+ * The irq threads of this irq which clear their "running" bit
+ * in threads_oneshot are serialized via desc->lock against
+ * each other and they are serialized against this code by
+ * IRQS_INPROGRESS.
+ *
+ * Hard irq handler:
+ *
+ * spin_lock(desc->lock);
+ * desc->state |= IRQS_INPROGRESS;
+ * spin_unlock(desc->lock);
+ * set_bit(IRQTF_RUNTHREAD, &action->thread_flags);
+ * desc->threads_oneshot |= mask;
+ * spin_lock(desc->lock);
+ * desc->state &= ~IRQS_INPROGRESS;
+ * spin_unlock(desc->lock);
+ *
+ * irq thread:
+ *
+ * again:
+ * spin_lock(desc->lock);
+ * if (desc->state & IRQS_INPROGRESS) {
+ * spin_unlock(desc->lock);
+ * while(desc->state & IRQS_INPROGRESS)
+ * cpu_relax();
+ * goto again;
+ * }
+ * if (!test_bit(IRQTF_RUNTHREAD, &action->thread_flags))
+ * desc->threads_oneshot &= ~mask;
+ * spin_unlock(desc->lock);
+ *
+ * So either the thread waits for us to clear IRQS_INPROGRESS
+ * or we are waiting in the flow handler for desc->lock to be
+ * released before we reach this point. The thread also checks
+ * IRQTF_RUNTHREAD under desc->lock. If set it leaves
+ * threads_oneshot untouched and runs the thread another time.
+ */
+ desc->threads_oneshot |= action->thread_mask;
+ wake_up_process(action->thread);
+}
+
+irqreturn_t
+handle_irq_event_percpu(struct irq_desc *desc, struct irqaction *action)
{
- irqreturn_t ret, retval = IRQ_NONE;
- unsigned int status = 0;
+ irqreturn_t retval = IRQ_NONE;
+ unsigned int random = 0, irq = desc->irq_data.irq;
do {
+ irqreturn_t res;
+
trace_irq_handler_entry(irq, action);
- ret = action->handler(irq, action->dev_id);
- trace_irq_handler_exit(irq, action, ret);
+ res = action->handler(irq, action->dev_id);
+ trace_irq_handler_exit(irq, action, res);
+
+ if (WARN_ONCE(!irqs_disabled(),"irq %u handler %pF enabled interrupts\n",
+ irq, action->handler))
+ local_irq_disable();
- switch (ret) {
+ switch (res) {
case IRQ_WAKE_THREAD:
/*
* Set result to handled so the spurious check
* does not trigger.
*/
- ret = IRQ_HANDLED;
+ res = IRQ_HANDLED;
/*
* Catch drivers which return WAKE_THREAD but
@@ -85,36 +147,41 @@ irqreturn_t handle_IRQ_event(unsigned int irq, struct irqaction *action)
break;
}
- /*
- * Wake up the handler thread for this
- * action. In case the thread crashed and was
- * killed we just pretend that we handled the
- * interrupt. The hardirq handler above has
- * disabled the device interrupt, so no irq
- * storm is lurking.
- */
- if (likely(!test_bit(IRQTF_DIED,
- &action->thread_flags))) {
- set_bit(IRQTF_RUNTHREAD, &action->thread_flags);
- wake_up_process(action->thread);
- }
+ irq_wake_thread(desc, action);
/* Fall through to add to randomness */
case IRQ_HANDLED:
- status |= action->flags;
+ random |= action->flags;
break;
default:
break;
}
- retval |= ret;
+ retval |= res;
action = action->next;
} while (action);
- if (status & IRQF_SAMPLE_RANDOM)
+ if (random & IRQF_SAMPLE_RANDOM)
add_interrupt_randomness(irq);
- local_irq_disable();
+ if (!noirqdebug)
+ note_interrupt(irq, desc, retval);
return retval;
}
+
+irqreturn_t handle_irq_event(struct irq_desc *desc)
+{
+ struct irqaction *action = desc->action;
+ irqreturn_t ret;
+
+ desc->istate &= ~IRQS_PENDING;
+ irqd_set(&desc->irq_data, IRQD_IRQ_INPROGRESS);
+ raw_spin_unlock(&desc->lock);
+
+ ret = handle_irq_event_percpu(desc, action);
+
+ raw_spin_lock(&desc->lock);
+ irqd_clear(&desc->irq_data, IRQD_IRQ_INPROGRESS);
+ return ret;
+}
diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h
index 99c3bc8a6fb4..6546431447d7 100644
--- a/kernel/irq/internals.h
+++ b/kernel/irq/internals.h
@@ -1,5 +1,9 @@
/*
* IRQ subsystem internal functions and variables:
+ *
+ * Do not ever include this file from anything else than
+ * kernel/irq/. Do not even think about using any information outside
+ * of this file for your non core code.
*/
#include <linux/irqdesc.h>
@@ -9,25 +13,75 @@
# define IRQ_BITMAP_BITS NR_IRQS
#endif
+#define istate core_internal_state__do_not_mess_with_it
+
extern int noirqdebug;
-#define irq_data_to_desc(data) container_of(data, struct irq_desc, irq_data)
+/*
+ * Bits used by threaded handlers:
+ * IRQTF_RUNTHREAD - signals that the interrupt handler thread should run
+ * IRQTF_DIED - handler thread died
+ * IRQTF_WARNED - warning "IRQ_WAKE_THREAD w/o thread_fn" has been printed
+ * IRQTF_AFFINITY - irq thread is requested to adjust affinity
+ * IRQTF_FORCED_THREAD - irq action is force threaded
+ */
+enum {
+ IRQTF_RUNTHREAD,
+ IRQTF_DIED,
+ IRQTF_WARNED,
+ IRQTF_AFFINITY,
+ IRQTF_FORCED_THREAD,
+};
-/* Set default functions for irq_chip structures: */
-extern void irq_chip_set_defaults(struct irq_chip *chip);
+/*
+ * Bit masks for desc->state
+ *
+ * IRQS_AUTODETECT - autodetection in progress
+ * IRQS_SPURIOUS_DISABLED - was disabled due to spurious interrupt
+ * detection
+ * IRQS_POLL_INPROGRESS - polling in progress
+ * IRQS_ONESHOT - irq is not unmasked in primary handler
+ * IRQS_REPLAY - irq is replayed
+ * IRQS_WAITING - irq is waiting
+ * IRQS_PENDING - irq is pending and replayed later
+ * IRQS_SUSPENDED - irq is suspended
+ */
+enum {
+ IRQS_AUTODETECT = 0x00000001,
+ IRQS_SPURIOUS_DISABLED = 0x00000002,
+ IRQS_POLL_INPROGRESS = 0x00000008,
+ IRQS_ONESHOT = 0x00000020,
+ IRQS_REPLAY = 0x00000040,
+ IRQS_WAITING = 0x00000080,
+ IRQS_PENDING = 0x00000200,
+ IRQS_SUSPENDED = 0x00000800,
+};
+
+#include "debug.h"
+#include "settings.h"
-/* Set default handler: */
-extern void compat_irq_chip_set_default_handler(struct irq_desc *desc);
+#define irq_data_to_desc(data) container_of(data, struct irq_desc, irq_data)
extern int __irq_set_trigger(struct irq_desc *desc, unsigned int irq,
unsigned long flags);
extern void __disable_irq(struct irq_desc *desc, unsigned int irq, bool susp);
extern void __enable_irq(struct irq_desc *desc, unsigned int irq, bool resume);
+extern int irq_startup(struct irq_desc *desc);
+extern void irq_shutdown(struct irq_desc *desc);
+extern void irq_enable(struct irq_desc *desc);
+extern void irq_disable(struct irq_desc *desc);
+extern void mask_irq(struct irq_desc *desc);
+extern void unmask_irq(struct irq_desc *desc);
+
extern void init_kstat_irqs(struct irq_desc *desc, int node, int nr);
+irqreturn_t handle_irq_event_percpu(struct irq_desc *desc, struct irqaction *action);
+irqreturn_t handle_irq_event(struct irq_desc *desc);
+
/* Resending of interrupts :*/
void check_irq_resend(struct irq_desc *desc, unsigned int irq);
+bool irq_wait_for_poll(struct irq_desc *desc);
#ifdef CONFIG_PROC_FS
extern void register_irq_proc(unsigned int irq, struct irq_desc *desc);
@@ -43,20 +97,10 @@ static inline void unregister_handler_proc(unsigned int irq,
struct irqaction *action) { }
#endif
-extern int irq_select_affinity_usr(unsigned int irq);
+extern int irq_select_affinity_usr(unsigned int irq, struct cpumask *mask);
extern void irq_set_thread_affinity(struct irq_desc *desc);
-#ifndef CONFIG_GENERIC_HARDIRQS_NO_DEPRECATED
-static inline void irq_end(unsigned int irq, struct irq_desc *desc)
-{
- if (desc->irq_data.chip && desc->irq_data.chip->end)
- desc->irq_data.chip->end(irq);
-}
-#else
-static inline void irq_end(unsigned int irq, struct irq_desc *desc) { }
-#endif
-
/* Inline functions for support of irq chips on slow busses */
static inline void chip_bus_lock(struct irq_desc *desc)
{
@@ -70,43 +114,58 @@ static inline void chip_bus_sync_unlock(struct irq_desc *desc)
desc->irq_data.chip->irq_bus_sync_unlock(&desc->irq_data);
}
+struct irq_desc *
+__irq_get_desc_lock(unsigned int irq, unsigned long *flags, bool bus);
+void __irq_put_desc_unlock(struct irq_desc *desc, unsigned long flags, bool bus);
+
+static inline struct irq_desc *
+irq_get_desc_buslock(unsigned int irq, unsigned long *flags)
+{
+ return __irq_get_desc_lock(irq, flags, true);
+}
+
+static inline void
+irq_put_desc_busunlock(struct irq_desc *desc, unsigned long flags)
+{
+ __irq_put_desc_unlock(desc, flags, true);
+}
+
+static inline struct irq_desc *
+irq_get_desc_lock(unsigned int irq, unsigned long *flags)
+{
+ return __irq_get_desc_lock(irq, flags, false);
+}
+
+static inline void
+irq_put_desc_unlock(struct irq_desc *desc, unsigned long flags)
+{
+ __irq_put_desc_unlock(desc, flags, false);
+}
+
/*
- * Debugging printout:
+ * Manipulation functions for irq_data.state
*/
+static inline void irqd_set_move_pending(struct irq_data *d)
+{
+ d->state_use_accessors |= IRQD_SETAFFINITY_PENDING;
+}
-#include <linux/kallsyms.h>
-
-#define P(f) if (desc->status & f) printk("%14s set\n", #f)
+static inline void irqd_clr_move_pending(struct irq_data *d)
+{
+ d->state_use_accessors &= ~IRQD_SETAFFINITY_PENDING;
+}
-static inline void print_irq_desc(unsigned int irq, struct irq_desc *desc)
+static inline void irqd_clear(struct irq_data *d, unsigned int mask)
{
- printk("irq %d, desc: %p, depth: %d, count: %d, unhandled: %d\n",
- irq, desc, desc->depth, desc->irq_count, desc->irqs_unhandled);
- printk("->handle_irq(): %p, ", desc->handle_irq);
- print_symbol("%s\n", (unsigned long)desc->handle_irq);
- printk("->irq_data.chip(): %p, ", desc->irq_data.chip);
- print_symbol("%s\n", (unsigned long)desc->irq_data.chip);
- printk("->action(): %p\n", desc->action);
- if (desc->action) {
- printk("->action->handler(): %p, ", desc->action->handler);
- print_symbol("%s\n", (unsigned long)desc->action->handler);
- }
-
- P(IRQ_INPROGRESS);
- P(IRQ_DISABLED);
- P(IRQ_PENDING);
- P(IRQ_REPLAY);
- P(IRQ_AUTODETECT);
- P(IRQ_WAITING);
- P(IRQ_LEVEL);
- P(IRQ_MASKED);
-#ifdef CONFIG_IRQ_PER_CPU
- P(IRQ_PER_CPU);
-#endif
- P(IRQ_NOPROBE);
- P(IRQ_NOREQUEST);
- P(IRQ_NOAUTOEN);
+ d->state_use_accessors &= ~mask;
}
-#undef P
+static inline void irqd_set(struct irq_data *d, unsigned int mask)
+{
+ d->state_use_accessors |= mask;
+}
+static inline bool irqd_has_set(struct irq_data *d, unsigned int mask)
+{
+ return d->state_use_accessors & mask;
+}
diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c
index 2039bea31bdf..886e80347b32 100644
--- a/kernel/irq/irqdesc.c
+++ b/kernel/irq/irqdesc.c
@@ -22,7 +22,7 @@
*/
static struct lock_class_key irq_desc_lock_class;
-#if defined(CONFIG_SMP) && defined(CONFIG_GENERIC_HARDIRQS)
+#if defined(CONFIG_SMP)
static void __init init_irq_default_affinity(void)
{
alloc_cpumask_var(&irq_default_affinity, GFP_NOWAIT);
@@ -79,7 +79,8 @@ static void desc_set_defaults(unsigned int irq, struct irq_desc *desc, int node)
desc->irq_data.chip_data = NULL;
desc->irq_data.handler_data = NULL;
desc->irq_data.msi_desc = NULL;
- desc->status = IRQ_DEFAULT_INIT_FLAGS;
+ irq_settings_clr_and_set(desc, ~0, _IRQ_DEFAULT_INIT_FLAGS);
+ irqd_set(&desc->irq_data, IRQD_IRQ_DISABLED);
desc->handle_irq = handle_bad_irq;
desc->depth = 1;
desc->irq_count = 0;
@@ -197,13 +198,12 @@ err:
return -ENOMEM;
}
-struct irq_desc * __ref irq_to_desc_alloc_node(unsigned int irq, int node)
+static int irq_expand_nr_irqs(unsigned int nr)
{
- int res = irq_alloc_descs(irq, irq, 1, node);
-
- if (res == -EEXIST || res == irq)
- return irq_to_desc(irq);
- return NULL;
+ if (nr > IRQ_BITMAP_BITS)
+ return -ENOMEM;
+ nr_irqs = nr;
+ return 0;
}
int __init early_irq_init(void)
@@ -238,7 +238,6 @@ int __init early_irq_init(void)
struct irq_desc irq_desc[NR_IRQS] __cacheline_aligned_in_smp = {
[0 ... NR_IRQS-1] = {
- .status = IRQ_DEFAULT_INIT_FLAGS,
.handle_irq = handle_bad_irq,
.depth = 1,
.lock = __RAW_SPIN_LOCK_UNLOCKED(irq_desc->lock),
@@ -260,8 +259,8 @@ int __init early_irq_init(void)
for (i = 0; i < count; i++) {
desc[i].irq_data.irq = i;
desc[i].irq_data.chip = &no_irq_chip;
- /* TODO : do this allocation on-demand ... */
desc[i].kstat_irqs = alloc_percpu(unsigned int);
+ irq_settings_clr_and_set(desc, ~0, _IRQ_DEFAULT_INIT_FLAGS);
alloc_masks(desc + i, GFP_KERNEL, node);
desc_smp_init(desc + i, node);
lockdep_set_class(&desc[i].lock, &irq_desc_lock_class);
@@ -274,11 +273,6 @@ struct irq_desc *irq_to_desc(unsigned int irq)
return (irq < NR_IRQS) ? irq_desc + irq : NULL;
}
-struct irq_desc *irq_to_desc_alloc_node(unsigned int irq, int node)
-{
- return irq_to_desc(irq);
-}
-
static void free_desc(unsigned int irq)
{
dynamic_irq_cleanup(irq);
@@ -286,26 +280,32 @@ static void free_desc(unsigned int irq)
static inline int alloc_descs(unsigned int start, unsigned int cnt, int node)
{
-#if defined(CONFIG_KSTAT_IRQS_ONDEMAND)
- struct irq_desc *desc;
- unsigned int i;
-
- for (i = 0; i < cnt; i++) {
- desc = irq_to_desc(start + i);
- if (desc && !desc->kstat_irqs) {
- unsigned int __percpu *stats = alloc_percpu(unsigned int);
-
- if (!stats)
- return -1;
- if (cmpxchg(&desc->kstat_irqs, NULL, stats) != NULL)
- free_percpu(stats);
- }
- }
-#endif
return start;
}
+
+static int irq_expand_nr_irqs(unsigned int nr)
+{
+ return -ENOMEM;
+}
+
#endif /* !CONFIG_SPARSE_IRQ */
+/**
+ * generic_handle_irq - Invoke the handler for a particular irq
+ * @irq: The irq number to handle
+ *
+ */
+int generic_handle_irq(unsigned int irq)
+{
+ struct irq_desc *desc = irq_to_desc(irq);
+
+ if (!desc)
+ return -EINVAL;
+ generic_handle_irq_desc(irq, desc);
+ return 0;
+}
+EXPORT_SYMBOL_GPL(generic_handle_irq);
+
/* Dynamic interrupt handling */
/**
@@ -327,6 +327,7 @@ void irq_free_descs(unsigned int from, unsigned int cnt)
bitmap_clear(allocated_irqs, from, cnt);
mutex_unlock(&sparse_irq_lock);
}
+EXPORT_SYMBOL_GPL(irq_free_descs);
/**
* irq_alloc_descs - allocate and initialize a range of irq descriptors
@@ -347,14 +348,17 @@ irq_alloc_descs(int irq, unsigned int from, unsigned int cnt, int node)
mutex_lock(&sparse_irq_lock);
- start = bitmap_find_next_zero_area(allocated_irqs, nr_irqs, from, cnt, 0);
+ start = bitmap_find_next_zero_area(allocated_irqs, IRQ_BITMAP_BITS,
+ from, cnt, 0);
ret = -EEXIST;
if (irq >=0 && start != irq)
goto err;
- ret = -ENOMEM;
- if (start >= nr_irqs)
- goto err;
+ if (start + cnt > nr_irqs) {
+ ret = irq_expand_nr_irqs(start + cnt);
+ if (ret)
+ goto err;
+ }
bitmap_set(allocated_irqs, start, cnt);
mutex_unlock(&sparse_irq_lock);
@@ -364,6 +368,7 @@ err:
mutex_unlock(&sparse_irq_lock);
return ret;
}
+EXPORT_SYMBOL_GPL(irq_alloc_descs);
/**
* irq_reserve_irqs - mark irqs allocated
@@ -401,6 +406,26 @@ unsigned int irq_get_next_irq(unsigned int offset)
return find_next_bit(allocated_irqs, nr_irqs, offset);
}
+struct irq_desc *
+__irq_get_desc_lock(unsigned int irq, unsigned long *flags, bool bus)
+{
+ struct irq_desc *desc = irq_to_desc(irq);
+
+ if (desc) {
+ if (bus)
+ chip_bus_lock(desc);
+ raw_spin_lock_irqsave(&desc->lock, *flags);
+ }
+ return desc;
+}
+
+void __irq_put_desc_unlock(struct irq_desc *desc, unsigned long flags, bool bus)
+{
+ raw_spin_unlock_irqrestore(&desc->lock, flags);
+ if (bus)
+ chip_bus_sync_unlock(desc);
+}
+
/**
* dynamic_irq_cleanup - cleanup a dynamically allocated irq
* @irq: irq number to initialize
@@ -423,7 +448,6 @@ unsigned int kstat_irqs_cpu(unsigned int irq, int cpu)
*per_cpu_ptr(desc->kstat_irqs, cpu) : 0;
}
-#ifdef CONFIG_GENERIC_HARDIRQS
unsigned int kstat_irqs(unsigned int irq)
{
struct irq_desc *desc = irq_to_desc(irq);
@@ -436,4 +460,3 @@ unsigned int kstat_irqs(unsigned int irq)
sum += *per_cpu_ptr(desc->kstat_irqs, cpu);
return sum;
}
-#endif /* CONFIG_GENERIC_HARDIRQS */
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index 9033c1c70828..f7ce0021e1c4 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -17,6 +17,17 @@
#include "internals.h"
+#ifdef CONFIG_IRQ_FORCED_THREADING
+__read_mostly bool force_irqthreads;
+
+static int __init setup_forced_irqthreads(char *arg)
+{
+ force_irqthreads = true;
+ return 0;
+}
+early_param("threadirqs", setup_forced_irqthreads);
+#endif
+
/**
* synchronize_irq - wait for pending IRQ handlers (on other CPUs)
* @irq: interrupt number to wait for
@@ -30,7 +41,7 @@
void synchronize_irq(unsigned int irq)
{
struct irq_desc *desc = irq_to_desc(irq);
- unsigned int status;
+ bool inprogress;
if (!desc)
return;
@@ -42,16 +53,16 @@ void synchronize_irq(unsigned int irq)
* Wait until we're out of the critical section. This might
* give the wrong answer due to the lack of memory barriers.
*/
- while (desc->status & IRQ_INPROGRESS)
+ while (irqd_irq_inprogress(&desc->irq_data))
cpu_relax();
/* Ok, that indicated we're done: double-check carefully. */
raw_spin_lock_irqsave(&desc->lock, flags);
- status = desc->status;
+ inprogress = irqd_irq_inprogress(&desc->irq_data);
raw_spin_unlock_irqrestore(&desc->lock, flags);
/* Oops, that failed? */
- } while (status & IRQ_INPROGRESS);
+ } while (inprogress);
/*
* We made sure that no hardirq handler is running. Now verify
@@ -73,8 +84,8 @@ int irq_can_set_affinity(unsigned int irq)
{
struct irq_desc *desc = irq_to_desc(irq);
- if (CHECK_IRQ_PER_CPU(desc->status) || !desc->irq_data.chip ||
- !desc->irq_data.chip->irq_set_affinity)
+ if (!desc || !irqd_can_balance(&desc->irq_data) ||
+ !desc->irq_data.chip || !desc->irq_data.chip->irq_set_affinity)
return 0;
return 1;
@@ -100,67 +111,180 @@ void irq_set_thread_affinity(struct irq_desc *desc)
}
}
+#ifdef CONFIG_GENERIC_PENDING_IRQ
+static inline bool irq_can_move_pcntxt(struct irq_data *data)
+{
+ return irqd_can_move_in_process_context(data);
+}
+static inline bool irq_move_pending(struct irq_data *data)
+{
+ return irqd_is_setaffinity_pending(data);
+}
+static inline void
+irq_copy_pending(struct irq_desc *desc, const struct cpumask *mask)
+{
+ cpumask_copy(desc->pending_mask, mask);
+}
+static inline void
+irq_get_pending(struct cpumask *mask, struct irq_desc *desc)
+{
+ cpumask_copy(mask, desc->pending_mask);
+}
+#else
+static inline bool irq_can_move_pcntxt(struct irq_data *data) { return true; }
+static inline bool irq_move_pending(struct irq_data *data) { return false; }
+static inline void
+irq_copy_pending(struct irq_desc *desc, const struct cpumask *mask) { }
+static inline void
+irq_get_pending(struct cpumask *mask, struct irq_desc *desc) { }
+#endif
+
+int __irq_set_affinity_locked(struct irq_data *data, const struct cpumask *mask)
+{
+ struct irq_chip *chip = irq_data_get_irq_chip(data);
+ struct irq_desc *desc = irq_data_to_desc(data);
+ int ret = 0;
+
+ if (!chip || !chip->irq_set_affinity)
+ return -EINVAL;
+
+ if (irq_can_move_pcntxt(data)) {
+ ret = chip->irq_set_affinity(data, mask, false);
+ switch (ret) {
+ case IRQ_SET_MASK_OK:
+ cpumask_copy(data->affinity, mask);
+ case IRQ_SET_MASK_OK_NOCOPY:
+ irq_set_thread_affinity(desc);
+ ret = 0;
+ }
+ } else {
+ irqd_set_move_pending(data);
+ irq_copy_pending(desc, mask);
+ }
+
+ if (desc->affinity_notify) {
+ kref_get(&desc->affinity_notify->kref);
+ schedule_work(&desc->affinity_notify->work);
+ }
+ irqd_set(data, IRQD_AFFINITY_SET);
+
+ return ret;
+}
+
/**
* irq_set_affinity - Set the irq affinity of a given irq
* @irq: Interrupt to set affinity
- * @cpumask: cpumask
+ * @mask: cpumask
*
*/
-int irq_set_affinity(unsigned int irq, const struct cpumask *cpumask)
+int irq_set_affinity(unsigned int irq, const struct cpumask *mask)
{
struct irq_desc *desc = irq_to_desc(irq);
- struct irq_chip *chip = desc->irq_data.chip;
unsigned long flags;
+ int ret;
- if (!chip->irq_set_affinity)
+ if (!desc)
return -EINVAL;
raw_spin_lock_irqsave(&desc->lock, flags);
-
-#ifdef CONFIG_GENERIC_PENDING_IRQ
- if (desc->status & IRQ_MOVE_PCNTXT) {
- if (!chip->irq_set_affinity(&desc->irq_data, cpumask, false)) {
- cpumask_copy(desc->irq_data.affinity, cpumask);
- irq_set_thread_affinity(desc);
- }
- }
- else {
- desc->status |= IRQ_MOVE_PENDING;
- cpumask_copy(desc->pending_mask, cpumask);
- }
-#else
- if (!chip->irq_set_affinity(&desc->irq_data, cpumask, false)) {
- cpumask_copy(desc->irq_data.affinity, cpumask);
- irq_set_thread_affinity(desc);
- }
-#endif
- desc->status |= IRQ_AFFINITY_SET;
+ ret = __irq_set_affinity_locked(irq_desc_get_irq_data(desc), mask);
raw_spin_unlock_irqrestore(&desc->lock, flags);
- return 0;
+ return ret;
}
int irq_set_affinity_hint(unsigned int irq, const struct cpumask *m)
{
+ unsigned long flags;
+ struct irq_desc *desc = irq_get_desc_lock(irq, &flags);
+
+ if (!desc)
+ return -EINVAL;
+ desc->affinity_hint = m;
+ irq_put_desc_unlock(desc, flags);
+ return 0;
+}
+EXPORT_SYMBOL_GPL(irq_set_affinity_hint);
+
+static void irq_affinity_notify(struct work_struct *work)
+{
+ struct irq_affinity_notify *notify =
+ container_of(work, struct irq_affinity_notify, work);
+ struct irq_desc *desc = irq_to_desc(notify->irq);
+ cpumask_var_t cpumask;
+ unsigned long flags;
+
+ if (!desc || !alloc_cpumask_var(&cpumask, GFP_KERNEL))
+ goto out;
+
+ raw_spin_lock_irqsave(&desc->lock, flags);
+ if (irq_move_pending(&desc->irq_data))
+ irq_get_pending(cpumask, desc);
+ else
+ cpumask_copy(cpumask, desc->irq_data.affinity);
+ raw_spin_unlock_irqrestore(&desc->lock, flags);
+
+ notify->notify(notify, cpumask);
+
+ free_cpumask_var(cpumask);
+out:
+ kref_put(&notify->kref, notify->release);
+}
+
+/**
+ * irq_set_affinity_notifier - control notification of IRQ affinity changes
+ * @irq: Interrupt for which to enable/disable notification
+ * @notify: Context for notification, or %NULL to disable
+ * notification. Function pointers must be initialised;
+ * the other fields will be initialised by this function.
+ *
+ * Must be called in process context. Notification may only be enabled
+ * after the IRQ is allocated and must be disabled before the IRQ is
+ * freed using free_irq().
+ */
+int
+irq_set_affinity_notifier(unsigned int irq, struct irq_affinity_notify *notify)
+{
struct irq_desc *desc = irq_to_desc(irq);
+ struct irq_affinity_notify *old_notify;
unsigned long flags;
+ /* The release function is promised process context */
+ might_sleep();
+
if (!desc)
return -EINVAL;
+ /* Complete initialisation of *notify */
+ if (notify) {
+ notify->irq = irq;
+ kref_init(&notify->kref);
+ INIT_WORK(&notify->work, irq_affinity_notify);
+ }
+
raw_spin_lock_irqsave(&desc->lock, flags);
- desc->affinity_hint = m;
+ old_notify = desc->affinity_notify;
+ desc->affinity_notify = notify;
raw_spin_unlock_irqrestore(&desc->lock, flags);
+ if (old_notify)
+ kref_put(&old_notify->kref, old_notify->release);
+
return 0;
}
-EXPORT_SYMBOL_GPL(irq_set_affinity_hint);
+EXPORT_SYMBOL_GPL(irq_set_affinity_notifier);
#ifndef CONFIG_AUTO_IRQ_AFFINITY
/*
* Generic version of the affinity autoselector.
*/
-static int setup_affinity(unsigned int irq, struct irq_desc *desc)
+static int
+setup_affinity(unsigned int irq, struct irq_desc *desc, struct cpumask *mask)
{
+ struct irq_chip *chip = irq_desc_get_chip(desc);
+ struct cpumask *set = irq_default_affinity;
+ int ret;
+
+ /* Excludes PER_CPU and NO_BALANCE interrupts */
if (!irq_can_set_affinity(irq))
return 0;
@@ -168,22 +292,27 @@ static int setup_affinity(unsigned int irq, struct irq_desc *desc)
* Preserve an userspace affinity setup, but make sure that
* one of the targets is online.
*/
- if (desc->status & (IRQ_AFFINITY_SET | IRQ_NO_BALANCING)) {
- if (cpumask_any_and(desc->irq_data.affinity, cpu_online_mask)
- < nr_cpu_ids)
- goto set_affinity;
+ if (irqd_has_set(&desc->irq_data, IRQD_AFFINITY_SET)) {
+ if (cpumask_intersects(desc->irq_data.affinity,
+ cpu_online_mask))
+ set = desc->irq_data.affinity;
else
- desc->status &= ~IRQ_AFFINITY_SET;
+ irqd_clear(&desc->irq_data, IRQD_AFFINITY_SET);
}
- cpumask_and(desc->irq_data.affinity, cpu_online_mask, irq_default_affinity);
-set_affinity:
- desc->irq_data.chip->irq_set_affinity(&desc->irq_data, desc->irq_data.affinity, false);
-
+ cpumask_and(mask, cpu_online_mask, set);
+ ret = chip->irq_set_affinity(&desc->irq_data, mask, false);
+ switch (ret) {
+ case IRQ_SET_MASK_OK:
+ cpumask_copy(desc->irq_data.affinity, mask);
+ case IRQ_SET_MASK_OK_NOCOPY:
+ irq_set_thread_affinity(desc);
+ }
return 0;
}
#else
-static inline int setup_affinity(unsigned int irq, struct irq_desc *d)
+static inline int
+setup_affinity(unsigned int irq, struct irq_desc *d, struct cpumask *mask)
{
return irq_select_affinity(irq);
}
@@ -192,23 +321,21 @@ static inline int setup_affinity(unsigned int irq, struct irq_desc *d)
/*
* Called when affinity is set via /proc/irq
*/
-int irq_select_affinity_usr(unsigned int irq)
+int irq_select_affinity_usr(unsigned int irq, struct cpumask *mask)
{
struct irq_desc *desc = irq_to_desc(irq);
unsigned long flags;
int ret;
raw_spin_lock_irqsave(&desc->lock, flags);
- ret = setup_affinity(irq, desc);
- if (!ret)
- irq_set_thread_affinity(desc);
+ ret = setup_affinity(irq, desc, mask);
raw_spin_unlock_irqrestore(&desc->lock, flags);
-
return ret;
}
#else
-static inline int setup_affinity(unsigned int irq, struct irq_desc *desc)
+static inline int
+setup_affinity(unsigned int irq, struct irq_desc *desc, struct cpumask *mask)
{
return 0;
}
@@ -219,13 +346,23 @@ void __disable_irq(struct irq_desc *desc, unsigned int irq, bool suspend)
if (suspend) {
if (!desc->action || (desc->action->flags & IRQF_NO_SUSPEND))
return;
- desc->status |= IRQ_SUSPENDED;
+ desc->istate |= IRQS_SUSPENDED;
}
- if (!desc->depth++) {
- desc->status |= IRQ_DISABLED;
- desc->irq_data.chip->irq_disable(&desc->irq_data);
- }
+ if (!desc->depth++)
+ irq_disable(desc);
+}
+
+static int __disable_irq_nosync(unsigned int irq)
+{
+ unsigned long flags;
+ struct irq_desc *desc = irq_get_desc_buslock(irq, &flags);
+
+ if (!desc)
+ return -EINVAL;
+ __disable_irq(desc, irq, false);
+ irq_put_desc_busunlock(desc, flags);
+ return 0;
}
/**
@@ -241,17 +378,7 @@ void __disable_irq(struct irq_desc *desc, unsigned int irq, bool suspend)
*/
void disable_irq_nosync(unsigned int irq)
{
- struct irq_desc *desc = irq_to_desc(irq);
- unsigned long flags;
-
- if (!desc)
- return;
-
- chip_bus_lock(desc);
- raw_spin_lock_irqsave(&desc->lock, flags);
- __disable_irq(desc, irq, false);
- raw_spin_unlock_irqrestore(&desc->lock, flags);
- chip_bus_sync_unlock(desc);
+ __disable_irq_nosync(irq);
}
EXPORT_SYMBOL(disable_irq_nosync);
@@ -269,21 +396,24 @@ EXPORT_SYMBOL(disable_irq_nosync);
*/
void disable_irq(unsigned int irq)
{
- struct irq_desc *desc = irq_to_desc(irq);
-
- if (!desc)
- return;
-
- disable_irq_nosync(irq);
- if (desc->action)
+ if (!__disable_irq_nosync(irq))
synchronize_irq(irq);
}
EXPORT_SYMBOL(disable_irq);
void __enable_irq(struct irq_desc *desc, unsigned int irq, bool resume)
{
- if (resume)
- desc->status &= ~IRQ_SUSPENDED;
+ if (resume) {
+ if (!(desc->istate & IRQS_SUSPENDED)) {
+ if (!desc->action)
+ return;
+ if (!(desc->action->flags & IRQF_FORCE_RESUME))
+ return;
+ /* Pretend that it got disabled ! */
+ desc->depth++;
+ }
+ desc->istate &= ~IRQS_SUSPENDED;
+ }
switch (desc->depth) {
case 0:
@@ -291,12 +421,11 @@ void __enable_irq(struct irq_desc *desc, unsigned int irq, bool resume)
WARN(1, KERN_WARNING "Unbalanced enable for IRQ %d\n", irq);
break;
case 1: {
- unsigned int status = desc->status & ~IRQ_DISABLED;
-
- if (desc->status & IRQ_SUSPENDED)
+ if (desc->istate & IRQS_SUSPENDED)
goto err_out;
/* Prevent probing on this irq: */
- desc->status = status | IRQ_NOPROBE;
+ irq_settings_set_noprobe(desc);
+ irq_enable(desc);
check_irq_resend(desc, irq);
/* fall-through */
}
@@ -318,21 +447,18 @@ void __enable_irq(struct irq_desc *desc, unsigned int irq, bool resume)
*/
void enable_irq(unsigned int irq)
{
- struct irq_desc *desc = irq_to_desc(irq);
unsigned long flags;
+ struct irq_desc *desc = irq_get_desc_buslock(irq, &flags);
if (!desc)
return;
+ if (WARN(!desc->irq_data.chip,
+ KERN_ERR "enable_irq before setup/request_irq: irq %u\n", irq))
+ goto out;
- if (WARN(!desc->irq_data.chip || !desc->irq_data.chip->irq_enable,
- KERN_ERR "enable_irq before setup/request_irq: irq %u\n", irq))
- return;
-
- chip_bus_lock(desc);
- raw_spin_lock_irqsave(&desc->lock, flags);
__enable_irq(desc, irq, false);
- raw_spin_unlock_irqrestore(&desc->lock, flags);
- chip_bus_sync_unlock(desc);
+out:
+ irq_put_desc_busunlock(desc, flags);
}
EXPORT_SYMBOL(enable_irq);
@@ -348,7 +474,7 @@ static int set_irq_wake_real(unsigned int irq, unsigned int on)
}
/**
- * set_irq_wake - control irq power management wakeup
+ * irq_set_irq_wake - control irq power management wakeup
* @irq: interrupt to control
* @on: enable/disable power management wakeup
*
@@ -359,23 +485,22 @@ static int set_irq_wake_real(unsigned int irq, unsigned int on)
* Wakeup mode lets this IRQ wake the system from sleep
* states like "suspend to RAM".
*/
-int set_irq_wake(unsigned int irq, unsigned int on)
+int irq_set_irq_wake(unsigned int irq, unsigned int on)
{
- struct irq_desc *desc = irq_to_desc(irq);
unsigned long flags;
+ struct irq_desc *desc = irq_get_desc_buslock(irq, &flags);
int ret = 0;
/* wakeup-capable irqs can be shared between drivers that
* don't need to have the same sleep mode behaviors.
*/
- raw_spin_lock_irqsave(&desc->lock, flags);
if (on) {
if (desc->wake_depth++ == 0) {
ret = set_irq_wake_real(irq, on);
if (ret)
desc->wake_depth = 0;
else
- desc->status |= IRQ_WAKEUP;
+ irqd_set(&desc->irq_data, IRQD_WAKEUP_STATE);
}
} else {
if (desc->wake_depth == 0) {
@@ -385,14 +510,13 @@ int set_irq_wake(unsigned int irq, unsigned int on)
if (ret)
desc->wake_depth = 1;
else
- desc->status &= ~IRQ_WAKEUP;
+ irqd_clear(&desc->irq_data, IRQD_WAKEUP_STATE);
}
}
-
- raw_spin_unlock_irqrestore(&desc->lock, flags);
+ irq_put_desc_busunlock(desc, flags);
return ret;
}
-EXPORT_SYMBOL(set_irq_wake);
+EXPORT_SYMBOL(irq_set_irq_wake);
/*
* Internal function that tells the architecture code whether a
@@ -401,43 +525,27 @@ EXPORT_SYMBOL(set_irq_wake);
*/
int can_request_irq(unsigned int irq, unsigned long irqflags)
{
- struct irq_desc *desc = irq_to_desc(irq);
- struct irqaction *action;
unsigned long flags;
+ struct irq_desc *desc = irq_get_desc_lock(irq, &flags);
+ int canrequest = 0;
if (!desc)
return 0;
- if (desc->status & IRQ_NOREQUEST)
- return 0;
-
- raw_spin_lock_irqsave(&desc->lock, flags);
- action = desc->action;
- if (action)
- if (irqflags & action->flags & IRQF_SHARED)
- action = NULL;
-
- raw_spin_unlock_irqrestore(&desc->lock, flags);
-
- return !action;
-}
-
-void compat_irq_chip_set_default_handler(struct irq_desc *desc)
-{
- /*
- * If the architecture still has not overriden
- * the flow handler then zap the default. This
- * should catch incorrect flow-type setting.
- */
- if (desc->handle_irq == &handle_bad_irq)
- desc->handle_irq = NULL;
+ if (irq_settings_can_request(desc)) {
+ if (desc->action)
+ if (irqflags & desc->action->flags & IRQF_SHARED)
+ canrequest =1;
+ }
+ irq_put_desc_unlock(desc, flags);
+ return canrequest;
}
int __irq_set_trigger(struct irq_desc *desc, unsigned int irq,
unsigned long flags)
{
- int ret;
struct irq_chip *chip = desc->irq_data.chip;
+ int ret, unmask = 0;
if (!chip || !chip->irq_set_type) {
/*
@@ -449,23 +557,41 @@ int __irq_set_trigger(struct irq_desc *desc, unsigned int irq,
return 0;
}
+ flags &= IRQ_TYPE_SENSE_MASK;
+
+ if (chip->flags & IRQCHIP_SET_TYPE_MASKED) {
+ if (!irqd_irq_masked(&desc->irq_data))
+ mask_irq(desc);
+ if (!irqd_irq_disabled(&desc->irq_data))
+ unmask = 1;
+ }
+
/* caller masked out all except trigger mode flags */
ret = chip->irq_set_type(&desc->irq_data, flags);
- if (ret)
+ switch (ret) {
+ case IRQ_SET_MASK_OK:
+ irqd_clear(&desc->irq_data, IRQD_TRIGGER_MASK);
+ irqd_set(&desc->irq_data, flags);
+
+ case IRQ_SET_MASK_OK_NOCOPY:
+ flags = irqd_get_trigger_type(&desc->irq_data);
+ irq_settings_set_trigger_mask(desc, flags);
+ irqd_clear(&desc->irq_data, IRQD_LEVEL);
+ irq_settings_clr_level(desc);
+ if (flags & IRQ_TYPE_LEVEL_MASK) {
+ irq_settings_set_level(desc);
+ irqd_set(&desc->irq_data, IRQD_LEVEL);
+ }
+
+ ret = 0;
+ break;
+ default:
pr_err("setting trigger mode %lu for irq %u failed (%pF)\n",
flags, irq, chip->irq_set_type);
- else {
- if (flags & (IRQ_TYPE_LEVEL_LOW | IRQ_TYPE_LEVEL_HIGH))
- flags |= IRQ_LEVEL;
- /* note that IRQF_TRIGGER_MASK == IRQ_TYPE_SENSE_MASK */
- desc->status &= ~(IRQ_LEVEL | IRQ_TYPE_SENSE_MASK);
- desc->status |= flags;
-
- if (chip != desc->irq_data.chip)
- irq_chip_set_defaults(desc->irq_data.chip);
}
-
+ if (unmask)
+ unmask_irq(desc);
return ret;
}
@@ -509,8 +635,11 @@ static int irq_wait_for_interrupt(struct irqaction *action)
* handler finished. unmask if the interrupt has not been disabled and
* is marked MASKED.
*/
-static void irq_finalize_oneshot(unsigned int irq, struct irq_desc *desc)
+static void irq_finalize_oneshot(struct irq_desc *desc,
+ struct irqaction *action, bool force)
{
+ if (!(desc->istate & IRQS_ONESHOT))
+ return;
again:
chip_bus_lock(desc);
raw_spin_lock_irq(&desc->lock);
@@ -522,26 +651,42 @@ again:
* The thread is faster done than the hard interrupt handler
* on the other CPU. If we unmask the irq line then the
* interrupt can come in again and masks the line, leaves due
- * to IRQ_INPROGRESS and the irq line is masked forever.
+ * to IRQS_INPROGRESS and the irq line is masked forever.
+ *
+ * This also serializes the state of shared oneshot handlers
+ * versus "desc->threads_onehsot |= action->thread_mask;" in
+ * irq_wake_thread(). See the comment there which explains the
+ * serialization.
*/
- if (unlikely(desc->status & IRQ_INPROGRESS)) {
+ if (unlikely(irqd_irq_inprogress(&desc->irq_data))) {
raw_spin_unlock_irq(&desc->lock);
chip_bus_sync_unlock(desc);
cpu_relax();
goto again;
}
- if (!(desc->status & IRQ_DISABLED) && (desc->status & IRQ_MASKED)) {
- desc->status &= ~IRQ_MASKED;
- desc->irq_data.chip->irq_unmask(&desc->irq_data);
- }
+ /*
+ * Now check again, whether the thread should run. Otherwise
+ * we would clear the threads_oneshot bit of this thread which
+ * was just set.
+ */
+ if (!force && test_bit(IRQTF_RUNTHREAD, &action->thread_flags))
+ goto out_unlock;
+
+ desc->threads_oneshot &= ~action->thread_mask;
+
+ if (!desc->threads_oneshot && !irqd_irq_disabled(&desc->irq_data) &&
+ irqd_irq_masked(&desc->irq_data))
+ unmask_irq(desc);
+
+out_unlock:
raw_spin_unlock_irq(&desc->lock);
chip_bus_sync_unlock(desc);
}
#ifdef CONFIG_SMP
/*
- * Check whether we need to change the affinity of the interrupt thread.
+ * Check whether we need to chasnge the affinity of the interrupt thread.
*/
static void
irq_thread_check_affinity(struct irq_desc *desc, struct irqaction *action)
@@ -573,6 +718,32 @@ irq_thread_check_affinity(struct irq_desc *desc, struct irqaction *action) { }
#endif
/*
+ * Interrupts which are not explicitely requested as threaded
+ * interrupts rely on the implicit bh/preempt disable of the hard irq
+ * context. So we need to disable bh here to avoid deadlocks and other
+ * side effects.
+ */
+static void
+irq_forced_thread_fn(struct irq_desc *desc, struct irqaction *action)
+{
+ local_bh_disable();
+ action->thread_fn(action->irq, action->dev_id);
+ irq_finalize_oneshot(desc, action, false);
+ local_bh_enable();
+}
+
+/*
+ * Interrupts explicitely requested as threaded interupts want to be
+ * preemtible - many of them need to sleep and wait for slow busses to
+ * complete.
+ */
+static void irq_thread_fn(struct irq_desc *desc, struct irqaction *action)
+{
+ action->thread_fn(action->irq, action->dev_id);
+ irq_finalize_oneshot(desc, action, false);
+}
+
+/*
* Interrupt handler thread
*/
static int irq_thread(void *data)
@@ -582,7 +753,14 @@ static int irq_thread(void *data)
};
struct irqaction *action = data;
struct irq_desc *desc = irq_to_desc(action->irq);
- int wake, oneshot = desc->status & IRQ_ONESHOT;
+ void (*handler_fn)(struct irq_desc *desc, struct irqaction *action);
+ int wake;
+
+ if (force_irqthreads & test_bit(IRQTF_FORCED_THREAD,
+ &action->thread_flags))
+ handler_fn = irq_forced_thread_fn;
+ else
+ handler_fn = irq_thread_fn;
sched_setscheduler(current, SCHED_FIFO, &param);
current->irqaction = action;
@@ -594,23 +772,19 @@ static int irq_thread(void *data)
atomic_inc(&desc->threads_active);
raw_spin_lock_irq(&desc->lock);
- if (unlikely(desc->status & IRQ_DISABLED)) {
+ if (unlikely(irqd_irq_disabled(&desc->irq_data))) {
/*
* CHECKME: We might need a dedicated
* IRQ_THREAD_PENDING flag here, which
* retriggers the thread in check_irq_resend()
- * but AFAICT IRQ_PENDING should be fine as it
+ * but AFAICT IRQS_PENDING should be fine as it
* retriggers the interrupt itself --- tglx
*/
- desc->status |= IRQ_PENDING;
+ desc->istate |= IRQS_PENDING;
raw_spin_unlock_irq(&desc->lock);
} else {
raw_spin_unlock_irq(&desc->lock);
-
- action->thread_fn(action->irq, action->dev_id);
-
- if (oneshot)
- irq_finalize_oneshot(action->irq, desc);
+ handler_fn(desc, action);
}
wake = atomic_dec_and_test(&desc->threads_active);
@@ -619,6 +793,9 @@ static int irq_thread(void *data)
wake_up(&desc->wait_for_threads);
}
+ /* Prevent a stale desc->threads_oneshot */
+ irq_finalize_oneshot(desc, action, true);
+
/*
* Clear irqaction. Otherwise exit_irq_thread() would make
* fuzz about an active irq thread going into nirvana.
@@ -633,6 +810,7 @@ static int irq_thread(void *data)
void exit_irq_thread(void)
{
struct task_struct *tsk = current;
+ struct irq_desc *desc;
if (!tsk->irqaction)
return;
@@ -641,6 +819,14 @@ void exit_irq_thread(void)
"exiting task \"%s\" (%d) is an active IRQ thread (irq %d)\n",
tsk->comm ? tsk->comm : "", tsk->pid, tsk->irqaction->irq);
+ desc = irq_to_desc(tsk->irqaction->irq);
+
+ /*
+ * Prevent a stale desc->threads_oneshot. Must be called
+ * before setting the IRQTF_DIED flag.
+ */
+ irq_finalize_oneshot(desc, tsk->irqaction, true);
+
/*
* Set the THREAD DIED flag to prevent further wakeups of the
* soon to be gone threaded handler.
@@ -648,6 +834,22 @@ void exit_irq_thread(void)
set_bit(IRQTF_DIED, &tsk->irqaction->flags);
}
+static void irq_setup_forced_threading(struct irqaction *new)
+{
+ if (!force_irqthreads)
+ return;
+ if (new->flags & (IRQF_NO_THREAD | IRQF_PERCPU | IRQF_ONESHOT))
+ return;
+
+ new->flags |= IRQF_ONESHOT;
+
+ if (!new->thread_fn) {
+ set_bit(IRQTF_FORCED_THREAD, &new->thread_flags);
+ new->thread_fn = new->handler;
+ new->handler = irq_default_primary_handler;
+ }
+}
+
/*
* Internal function to register an irqaction - typically used to
* allocate special interrupts that are part of the architecture.
@@ -657,9 +859,9 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
{
struct irqaction *old, **old_ptr;
const char *old_name = NULL;
- unsigned long flags;
- int nested, shared = 0;
- int ret;
+ unsigned long flags, thread_mask = 0;
+ int ret, nested, shared = 0;
+ cpumask_var_t mask;
if (!desc)
return -EINVAL;
@@ -683,15 +885,11 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
rand_initialize_irq(irq);
}
- /* Oneshot interrupts are not allowed with shared */
- if ((new->flags & IRQF_ONESHOT) && (new->flags & IRQF_SHARED))
- return -EINVAL;
-
/*
* Check whether the interrupt nests into another interrupt
* thread.
*/
- nested = desc->status & IRQ_NESTED_THREAD;
+ nested = irq_settings_is_nested_thread(desc);
if (nested) {
if (!new->thread_fn)
return -EINVAL;
@@ -701,6 +899,9 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
* dummy function which warns when called.
*/
new->handler = irq_nested_primary_handler;
+ } else {
+ if (irq_settings_can_thread(desc))
+ irq_setup_forced_threading(new);
}
/*
@@ -724,6 +925,11 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
new->thread = t;
}
+ if (!alloc_cpumask_var(&mask, GFP_KERNEL)) {
+ ret = -ENOMEM;
+ goto out_thread;
+ }
+
/*
* The following block of code has to be executed atomically
*/
@@ -735,32 +941,41 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
* Can't share interrupts unless both agree to and are
* the same type (level, edge, polarity). So both flag
* fields must have IRQF_SHARED set and the bits which
- * set the trigger type must match.
+ * set the trigger type must match. Also all must
+ * agree on ONESHOT.
*/
if (!((old->flags & new->flags) & IRQF_SHARED) ||
- ((old->flags ^ new->flags) & IRQF_TRIGGER_MASK)) {
+ ((old->flags ^ new->flags) & IRQF_TRIGGER_MASK) ||
+ ((old->flags ^ new->flags) & IRQF_ONESHOT)) {
old_name = old->name;
goto mismatch;
}
-#if defined(CONFIG_IRQ_PER_CPU)
/* All handlers must agree on per-cpuness */
if ((old->flags & IRQF_PERCPU) !=
(new->flags & IRQF_PERCPU))
goto mismatch;
-#endif
/* add new interrupt at end of irq queue */
do {
+ thread_mask |= old->thread_mask;
old_ptr = &old->next;
old = *old_ptr;
} while (old);
shared = 1;
}
- if (!shared) {
- irq_chip_set_defaults(desc->irq_data.chip);
+ /*
+ * Setup the thread mask for this irqaction. Unlikely to have
+ * 32 resp 64 irqs sharing one line, but who knows.
+ */
+ if (new->flags & IRQF_ONESHOT && thread_mask == ~0UL) {
+ ret = -EBUSY;
+ goto out_mask;
+ }
+ new->thread_mask = 1 << ffz(thread_mask);
+ if (!shared) {
init_waitqueue_head(&desc->wait_for_threads);
/* Setup the type (level, edge polarity) if configured: */
@@ -769,42 +984,44 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
new->flags & IRQF_TRIGGER_MASK);
if (ret)
- goto out_thread;
- } else
- compat_irq_chip_set_default_handler(desc);
-#if defined(CONFIG_IRQ_PER_CPU)
- if (new->flags & IRQF_PERCPU)
- desc->status |= IRQ_PER_CPU;
-#endif
+ goto out_mask;
+ }
- desc->status &= ~(IRQ_AUTODETECT | IRQ_WAITING | IRQ_ONESHOT |
- IRQ_INPROGRESS | IRQ_SPURIOUS_DISABLED);
+ desc->istate &= ~(IRQS_AUTODETECT | IRQS_SPURIOUS_DISABLED | \
+ IRQS_ONESHOT | IRQS_WAITING);
+ irqd_clear(&desc->irq_data, IRQD_IRQ_INPROGRESS);
+
+ if (new->flags & IRQF_PERCPU) {
+ irqd_set(&desc->irq_data, IRQD_PER_CPU);
+ irq_settings_set_per_cpu(desc);
+ }
if (new->flags & IRQF_ONESHOT)
- desc->status |= IRQ_ONESHOT;
+ desc->istate |= IRQS_ONESHOT;
- if (!(desc->status & IRQ_NOAUTOEN)) {
- desc->depth = 0;
- desc->status &= ~IRQ_DISABLED;
- desc->irq_data.chip->irq_startup(&desc->irq_data);
- } else
+ if (irq_settings_can_autoenable(desc))
+ irq_startup(desc);
+ else
/* Undo nested disables: */
desc->depth = 1;
/* Exclude IRQ from balancing if requested */
- if (new->flags & IRQF_NOBALANCING)
- desc->status |= IRQ_NO_BALANCING;
+ if (new->flags & IRQF_NOBALANCING) {
+ irq_settings_set_no_balancing(desc);
+ irqd_set(&desc->irq_data, IRQD_NO_BALANCING);
+ }
/* Set default affinity mask once everything is setup */
- setup_affinity(irq, desc);
-
- } else if ((new->flags & IRQF_TRIGGER_MASK)
- && (new->flags & IRQF_TRIGGER_MASK)
- != (desc->status & IRQ_TYPE_SENSE_MASK)) {
- /* hope the handler works with the actual trigger mode... */
- pr_warning("IRQ %d uses trigger mode %d; requested %d\n",
- irq, (int)(desc->status & IRQ_TYPE_SENSE_MASK),
- (int)(new->flags & IRQF_TRIGGER_MASK));
+ setup_affinity(irq, desc, mask);
+
+ } else if (new->flags & IRQF_TRIGGER_MASK) {
+ unsigned int nmsk = new->flags & IRQF_TRIGGER_MASK;
+ unsigned int omsk = irq_settings_get_trigger_mask(desc);
+
+ if (nmsk != omsk)
+ /* hope the handler works with current trigger mode */
+ pr_warning("IRQ %d uses trigger mode %u; requested %u\n",
+ irq, nmsk, omsk);
}
new->irq = irq;
@@ -818,8 +1035,8 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
* Check whether we disabled the irq via the spurious handler
* before. Reenable it and give it another chance.
*/
- if (shared && (desc->status & IRQ_SPURIOUS_DISABLED)) {
- desc->status &= ~IRQ_SPURIOUS_DISABLED;
+ if (shared && (desc->istate & IRQS_SPURIOUS_DISABLED)) {
+ desc->istate &= ~IRQS_SPURIOUS_DISABLED;
__enable_irq(desc, irq, false);
}
@@ -835,6 +1052,7 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
register_irq_proc(irq, desc);
new->dir = NULL;
register_handler_proc(irq, new);
+ free_cpumask_var(mask);
return 0;
@@ -849,8 +1067,11 @@ mismatch:
#endif
ret = -EBUSY;
-out_thread:
+out_mask:
raw_spin_unlock_irqrestore(&desc->lock, flags);
+ free_cpumask_var(mask);
+
+out_thread:
if (new->thread) {
struct task_struct *t = new->thread;
@@ -871,9 +1092,14 @@ out_thread:
*/
int setup_irq(unsigned int irq, struct irqaction *act)
{
+ int retval;
struct irq_desc *desc = irq_to_desc(irq);
- return __setup_irq(irq, desc, act);
+ chip_bus_lock(desc);
+ retval = __setup_irq(irq, desc, act);
+ chip_bus_sync_unlock(desc);
+
+ return retval;
}
EXPORT_SYMBOL_GPL(setup_irq);
@@ -924,13 +1150,8 @@ static struct irqaction *__free_irq(unsigned int irq, void *dev_id)
#endif
/* If this was the last handler, shut down the IRQ line: */
- if (!desc->action) {
- desc->status |= IRQ_DISABLED;
- if (desc->irq_data.chip->irq_shutdown)
- desc->irq_data.chip->irq_shutdown(&desc->irq_data);
- else
- desc->irq_data.chip->irq_disable(&desc->irq_data);
- }
+ if (!desc->action)
+ irq_shutdown(desc);
#ifdef CONFIG_SMP
/* make sure affinity_hint is cleaned up */
@@ -1004,6 +1225,11 @@ void free_irq(unsigned int irq, void *dev_id)
if (!desc)
return;
+#ifdef CONFIG_SMP
+ if (WARN_ON(desc->affinity_notify))
+ desc->affinity_notify = NULL;
+#endif
+
chip_bus_lock(desc);
kfree(__free_irq(irq, dev_id));
chip_bus_sync_unlock(desc);
@@ -1074,7 +1300,7 @@ int request_threaded_irq(unsigned int irq, irq_handler_t handler,
if (!desc)
return -EINVAL;
- if (desc->status & IRQ_NOREQUEST)
+ if (!irq_settings_can_request(desc))
return -EINVAL;
if (!handler) {
@@ -1149,7 +1375,7 @@ int request_any_context_irq(unsigned int irq, irq_handler_t handler,
if (!desc)
return -EINVAL;
- if (desc->status & IRQ_NESTED_THREAD) {
+ if (irq_settings_is_nested_thread(desc)) {
ret = request_threaded_irq(irq, NULL, handler,
flags, name, dev_id);
return !ret ? IRQC_IS_NESTED : ret;
diff --git a/kernel/irq/migration.c b/kernel/irq/migration.c
index 441fd629ff04..47420908fba0 100644
--- a/kernel/irq/migration.c
+++ b/kernel/irq/migration.c
@@ -4,23 +4,23 @@
#include "internals.h"
-void move_masked_irq(int irq)
+void irq_move_masked_irq(struct irq_data *idata)
{
- struct irq_desc *desc = irq_to_desc(irq);
- struct irq_chip *chip = desc->irq_data.chip;
+ struct irq_desc *desc = irq_data_to_desc(idata);
+ struct irq_chip *chip = idata->chip;
- if (likely(!(desc->status & IRQ_MOVE_PENDING)))
+ if (likely(!irqd_is_setaffinity_pending(&desc->irq_data)))
return;
/*
* Paranoia: cpu-local interrupts shouldn't be calling in here anyway.
*/
- if (CHECK_IRQ_PER_CPU(desc->status)) {
+ if (!irqd_can_balance(&desc->irq_data)) {
WARN_ON(1);
return;
}
- desc->status &= ~IRQ_MOVE_PENDING;
+ irqd_clr_move_pending(&desc->irq_data);
if (unlikely(cpumask_empty(desc->pending_mask)))
return;
@@ -35,7 +35,7 @@ void move_masked_irq(int irq)
* do the disable, re-program, enable sequence.
* This is *not* particularly important for level triggered
* but in a edge trigger case, we might be setting rte
- * when an active trigger is comming in. This could
+ * when an active trigger is coming in. This could
* cause some ioapics to mal-function.
* Being paranoid i guess!
*
@@ -53,15 +53,14 @@ void move_masked_irq(int irq)
cpumask_clear(desc->pending_mask);
}
-void move_native_irq(int irq)
+void irq_move_irq(struct irq_data *idata)
{
- struct irq_desc *desc = irq_to_desc(irq);
bool masked;
- if (likely(!(desc->status & IRQ_MOVE_PENDING)))
+ if (likely(!irqd_is_setaffinity_pending(idata)))
return;
- if (unlikely(desc->status & IRQ_DISABLED))
+ if (unlikely(irqd_irq_disabled(idata)))
return;
/*
@@ -69,10 +68,10 @@ void move_native_irq(int irq)
* threaded interrupt with ONESHOT set, we can end up with an
* interrupt storm.
*/
- masked = desc->status & IRQ_MASKED;
+ masked = irqd_irq_masked(idata);
if (!masked)
- desc->irq_data.chip->irq_mask(&desc->irq_data);
- move_masked_irq(irq);
+ idata->chip->irq_mask(idata);
+ irq_move_masked_irq(idata);
if (!masked)
- desc->irq_data.chip->irq_unmask(&desc->irq_data);
+ idata->chip->irq_unmask(idata);
}
diff --git a/kernel/irq/pm.c b/kernel/irq/pm.c
index 0d4005d85b03..f76fc00c9877 100644
--- a/kernel/irq/pm.c
+++ b/kernel/irq/pm.c
@@ -18,7 +18,7 @@
* During system-wide suspend or hibernation device drivers need to be prevented
* from receiving interrupts and this function is provided for this purpose.
* It marks all interrupt lines in use, except for the timer ones, as disabled
- * and sets the IRQ_SUSPENDED flag for each of them.
+ * and sets the IRQS_SUSPENDED flag for each of them.
*/
void suspend_device_irqs(void)
{
@@ -34,7 +34,7 @@ void suspend_device_irqs(void)
}
for_each_irq_desc(irq, desc)
- if (desc->status & IRQ_SUSPENDED)
+ if (desc->istate & IRQS_SUSPENDED)
synchronize_irq(irq);
}
EXPORT_SYMBOL_GPL(suspend_device_irqs);
@@ -43,7 +43,7 @@ EXPORT_SYMBOL_GPL(suspend_device_irqs);
* resume_device_irqs - enable interrupt lines disabled by suspend_device_irqs()
*
* Enable all interrupt lines previously disabled by suspend_device_irqs() that
- * have the IRQ_SUSPENDED flag set.
+ * have the IRQS_SUSPENDED flag set.
*/
void resume_device_irqs(void)
{
@@ -53,9 +53,6 @@ void resume_device_irqs(void)
for_each_irq_desc(irq, desc) {
unsigned long flags;
- if (!(desc->status & IRQ_SUSPENDED))
- continue;
-
raw_spin_lock_irqsave(&desc->lock, flags);
__enable_irq(desc, irq, true);
raw_spin_unlock_irqrestore(&desc->lock, flags);
@@ -71,9 +68,24 @@ int check_wakeup_irqs(void)
struct irq_desc *desc;
int irq;
- for_each_irq_desc(irq, desc)
- if ((desc->status & IRQ_WAKEUP) && (desc->status & IRQ_PENDING))
- return -EBUSY;
+ for_each_irq_desc(irq, desc) {
+ if (irqd_is_wakeup_set(&desc->irq_data)) {
+ if (desc->istate & IRQS_PENDING)
+ return -EBUSY;
+ continue;
+ }
+ /*
+ * Check the non wakeup interrupts whether they need
+ * to be masked before finally going into suspend
+ * state. That's for hardware which has no wakeup
+ * source configuration facility. The chip
+ * implementation indicates that with
+ * IRQCHIP_MASK_ON_SUSPEND.
+ */
+ if (desc->istate & IRQS_SUSPENDED &&
+ irq_desc_get_chip(desc)->flags & IRQCHIP_MASK_ON_SUSPEND)
+ mask_irq(desc);
+ }
return 0;
}
diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c
index 6c8a2a9f8a7b..4bd4faa6323a 100644
--- a/kernel/irq/proc.c
+++ b/kernel/irq/proc.c
@@ -11,6 +11,7 @@
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/interrupt.h>
+#include <linux/kernel_stat.h>
#include "internals.h"
@@ -18,16 +19,19 @@ static struct proc_dir_entry *root_irq_dir;
#ifdef CONFIG_SMP
-static int irq_affinity_proc_show(struct seq_file *m, void *v)
+static int show_irq_affinity(int type, struct seq_file *m, void *v)
{
struct irq_desc *desc = irq_to_desc((long)m->private);
const struct cpumask *mask = desc->irq_data.affinity;
#ifdef CONFIG_GENERIC_PENDING_IRQ
- if (desc->status & IRQ_MOVE_PENDING)
+ if (irqd_is_setaffinity_pending(&desc->irq_data))
mask = desc->pending_mask;
#endif
- seq_cpumask(m, mask);
+ if (type)
+ seq_cpumask_list(m, mask);
+ else
+ seq_cpumask(m, mask);
seq_putc(m, '\n');
return 0;
}
@@ -58,21 +62,34 @@ static int irq_affinity_hint_proc_show(struct seq_file *m, void *v)
#endif
int no_irq_affinity;
-static ssize_t irq_affinity_proc_write(struct file *file,
+static int irq_affinity_proc_show(struct seq_file *m, void *v)
+{
+ return show_irq_affinity(0, m, v);
+}
+
+static int irq_affinity_list_proc_show(struct seq_file *m, void *v)
+{
+ return show_irq_affinity(1, m, v);
+}
+
+
+static ssize_t write_irq_affinity(int type, struct file *file,
const char __user *buffer, size_t count, loff_t *pos)
{
unsigned int irq = (int)(long)PDE(file->f_path.dentry->d_inode)->data;
cpumask_var_t new_value;
int err;
- if (!irq_to_desc(irq)->irq_data.chip->irq_set_affinity || no_irq_affinity ||
- irq_balancing_disabled(irq))
+ if (!irq_can_set_affinity(irq) || no_irq_affinity)
return -EIO;
if (!alloc_cpumask_var(&new_value, GFP_KERNEL))
return -ENOMEM;
- err = cpumask_parse_user(buffer, count, new_value);
+ if (type)
+ err = cpumask_parselist_user(buffer, count, new_value);
+ else
+ err = cpumask_parse_user(buffer, count, new_value);
if (err)
goto free_cpumask;
@@ -89,7 +106,7 @@ static ssize_t irq_affinity_proc_write(struct file *file,
if (!cpumask_intersects(new_value, cpu_online_mask)) {
/* Special case for empty set - allow the architecture
code to set default SMP affinity. */
- err = irq_select_affinity_usr(irq) ? -EINVAL : count;
+ err = irq_select_affinity_usr(irq, new_value) ? -EINVAL : count;
} else {
irq_set_affinity(irq, new_value);
err = count;
@@ -100,11 +117,28 @@ free_cpumask:
return err;
}
+static ssize_t irq_affinity_proc_write(struct file *file,
+ const char __user *buffer, size_t count, loff_t *pos)
+{
+ return write_irq_affinity(0, file, buffer, count, pos);
+}
+
+static ssize_t irq_affinity_list_proc_write(struct file *file,
+ const char __user *buffer, size_t count, loff_t *pos)
+{
+ return write_irq_affinity(1, file, buffer, count, pos);
+}
+
static int irq_affinity_proc_open(struct inode *inode, struct file *file)
{
return single_open(file, irq_affinity_proc_show, PDE(inode)->data);
}
+static int irq_affinity_list_proc_open(struct inode *inode, struct file *file)
+{
+ return single_open(file, irq_affinity_list_proc_show, PDE(inode)->data);
+}
+
static int irq_affinity_hint_proc_open(struct inode *inode, struct file *file)
{
return single_open(file, irq_affinity_hint_proc_show, PDE(inode)->data);
@@ -125,6 +159,14 @@ static const struct file_operations irq_affinity_hint_proc_fops = {
.release = single_release,
};
+static const struct file_operations irq_affinity_list_proc_fops = {
+ .open = irq_affinity_list_proc_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = single_release,
+ .write = irq_affinity_list_proc_write,
+};
+
static int default_affinity_show(struct seq_file *m, void *v)
{
seq_cpumask(m, irq_default_affinity);
@@ -289,6 +331,10 @@ void register_irq_proc(unsigned int irq, struct irq_desc *desc)
proc_create_data("affinity_hint", 0400, desc->dir,
&irq_affinity_hint_proc_fops, (void *)(long)irq);
+ /* create /proc/irq/<irq>/smp_affinity_list */
+ proc_create_data("smp_affinity_list", 0600, desc->dir,
+ &irq_affinity_list_proc_fops, (void *)(long)irq);
+
proc_create_data("node", 0444, desc->dir,
&irq_node_proc_fops, (void *)(long)irq);
#endif
@@ -306,6 +352,7 @@ void unregister_irq_proc(unsigned int irq, struct irq_desc *desc)
#ifdef CONFIG_SMP
remove_proc_entry("smp_affinity", desc->dir);
remove_proc_entry("affinity_hint", desc->dir);
+ remove_proc_entry("smp_affinity_list", desc->dir);
remove_proc_entry("node", desc->dir);
#endif
remove_proc_entry("spurious", desc->dir);
@@ -357,3 +404,83 @@ void init_irq_proc(void)
}
}
+#ifdef CONFIG_GENERIC_IRQ_SHOW
+
+int __weak arch_show_interrupts(struct seq_file *p, int prec)
+{
+ return 0;
+}
+
+#ifndef ACTUAL_NR_IRQS
+# define ACTUAL_NR_IRQS nr_irqs
+#endif
+
+int show_interrupts(struct seq_file *p, void *v)
+{
+ static int prec;
+
+ unsigned long flags, any_count = 0;
+ int i = *(loff_t *) v, j;
+ struct irqaction *action;
+ struct irq_desc *desc;
+
+ if (i > ACTUAL_NR_IRQS)
+ return 0;
+
+ if (i == ACTUAL_NR_IRQS)
+ return arch_show_interrupts(p, prec);
+
+ /* print header and calculate the width of the first column */
+ if (i == 0) {
+ for (prec = 3, j = 1000; prec < 10 && j <= nr_irqs; ++prec)
+ j *= 10;
+
+ seq_printf(p, "%*s", prec + 8, "");
+ for_each_online_cpu(j)
+ seq_printf(p, "CPU%-8d", j);
+ seq_putc(p, '\n');
+ }
+
+ desc = irq_to_desc(i);
+ if (!desc)
+ return 0;
+
+ raw_spin_lock_irqsave(&desc->lock, flags);
+ for_each_online_cpu(j)
+ any_count |= kstat_irqs_cpu(i, j);
+ action = desc->action;
+ if (!action && !any_count)
+ goto out;
+
+ seq_printf(p, "%*d: ", prec, i);
+ for_each_online_cpu(j)
+ seq_printf(p, "%10u ", kstat_irqs_cpu(i, j));
+
+ if (desc->irq_data.chip) {
+ if (desc->irq_data.chip->irq_print_chip)
+ desc->irq_data.chip->irq_print_chip(&desc->irq_data, p);
+ else if (desc->irq_data.chip->name)
+ seq_printf(p, " %8s", desc->irq_data.chip->name);
+ else
+ seq_printf(p, " %8s", "-");
+ } else {
+ seq_printf(p, " %8s", "None");
+ }
+#ifdef CONFIG_GENERIC_IRQ_SHOW_LEVEL
+ seq_printf(p, " %-8s", irqd_is_level_type(&desc->irq_data) ? "Level" : "Edge");
+#endif
+ if (desc->name)
+ seq_printf(p, "-%-8s", desc->name);
+
+ if (action) {
+ seq_printf(p, " %s", action->name);
+ while ((action = action->next) != NULL)
+ seq_printf(p, ", %s", action->name);
+ }
+
+ seq_putc(p, '\n');
+out:
+ raw_spin_unlock_irqrestore(&desc->lock, flags);
+ return 0;
+}
+#endif
diff --git a/kernel/irq/resend.c b/kernel/irq/resend.c
index dc49358b73fa..14dd5761e8c9 100644
--- a/kernel/irq/resend.c
+++ b/kernel/irq/resend.c
@@ -55,20 +55,18 @@ static DECLARE_TASKLET(resend_tasklet, resend_irqs, 0);
*/
void check_irq_resend(struct irq_desc *desc, unsigned int irq)
{
- unsigned int status = desc->status;
-
- /*
- * Make sure the interrupt is enabled, before resending it:
- */
- desc->irq_data.chip->irq_enable(&desc->irq_data);
-
/*
* We do not resend level type interrupts. Level type
* interrupts are resent by hardware when they are still
* active.
*/
- if ((status & (IRQ_LEVEL | IRQ_PENDING | IRQ_REPLAY)) == IRQ_PENDING) {
- desc->status = (status & ~IRQ_PENDING) | IRQ_REPLAY;
+ if (irq_settings_is_level(desc))
+ return;
+ if (desc->istate & IRQS_REPLAY)
+ return;
+ if (desc->istate & IRQS_PENDING) {
+ desc->istate &= ~IRQS_PENDING;
+ desc->istate |= IRQS_REPLAY;
if (!desc->irq_data.chip->irq_retrigger ||
!desc->irq_data.chip->irq_retrigger(&desc->irq_data)) {
diff --git a/kernel/irq/settings.h b/kernel/irq/settings.h
new file mode 100644
index 000000000000..f1667833d444
--- /dev/null
+++ b/kernel/irq/settings.h
@@ -0,0 +1,142 @@
+/*
+ * Internal header to deal with irq_desc->status which will be renamed
+ * to irq_desc->settings.
+ */
+enum {
+ _IRQ_DEFAULT_INIT_FLAGS = IRQ_DEFAULT_INIT_FLAGS,
+ _IRQ_PER_CPU = IRQ_PER_CPU,
+ _IRQ_LEVEL = IRQ_LEVEL,
+ _IRQ_NOPROBE = IRQ_NOPROBE,
+ _IRQ_NOREQUEST = IRQ_NOREQUEST,
+ _IRQ_NOTHREAD = IRQ_NOTHREAD,
+ _IRQ_NOAUTOEN = IRQ_NOAUTOEN,
+ _IRQ_MOVE_PCNTXT = IRQ_MOVE_PCNTXT,
+ _IRQ_NO_BALANCING = IRQ_NO_BALANCING,
+ _IRQ_NESTED_THREAD = IRQ_NESTED_THREAD,
+ _IRQF_MODIFY_MASK = IRQF_MODIFY_MASK,
+};
+
+#define IRQ_PER_CPU GOT_YOU_MORON
+#define IRQ_NO_BALANCING GOT_YOU_MORON
+#define IRQ_LEVEL GOT_YOU_MORON
+#define IRQ_NOPROBE GOT_YOU_MORON
+#define IRQ_NOREQUEST GOT_YOU_MORON
+#define IRQ_NOTHREAD GOT_YOU_MORON
+#define IRQ_NOAUTOEN GOT_YOU_MORON
+#define IRQ_NESTED_THREAD GOT_YOU_MORON
+#undef IRQF_MODIFY_MASK
+#define IRQF_MODIFY_MASK GOT_YOU_MORON
+
+static inline void
+irq_settings_clr_and_set(struct irq_desc *desc, u32 clr, u32 set)
+{
+ desc->status_use_accessors &= ~(clr & _IRQF_MODIFY_MASK);
+ desc->status_use_accessors |= (set & _IRQF_MODIFY_MASK);
+}
+
+static inline bool irq_settings_is_per_cpu(struct irq_desc *desc)
+{
+ return desc->status_use_accessors & _IRQ_PER_CPU;
+}
+
+static inline void irq_settings_set_per_cpu(struct irq_desc *desc)
+{
+ desc->status_use_accessors |= _IRQ_PER_CPU;
+}
+
+static inline void irq_settings_set_no_balancing(struct irq_desc *desc)
+{
+ desc->status_use_accessors |= _IRQ_NO_BALANCING;
+}
+
+static inline bool irq_settings_has_no_balance_set(struct irq_desc *desc)
+{
+ return desc->status_use_accessors & _IRQ_NO_BALANCING;
+}
+
+static inline u32 irq_settings_get_trigger_mask(struct irq_desc *desc)
+{
+ return desc->status_use_accessors & IRQ_TYPE_SENSE_MASK;
+}
+
+static inline void
+irq_settings_set_trigger_mask(struct irq_desc *desc, u32 mask)
+{
+ desc->status_use_accessors &= ~IRQ_TYPE_SENSE_MASK;
+ desc->status_use_accessors |= mask & IRQ_TYPE_SENSE_MASK;
+}
+
+static inline bool irq_settings_is_level(struct irq_desc *desc)
+{
+ return desc->status_use_accessors & _IRQ_LEVEL;
+}
+
+static inline void irq_settings_clr_level(struct irq_desc *desc)
+{
+ desc->status_use_accessors &= ~_IRQ_LEVEL;
+}
+
+static inline void irq_settings_set_level(struct irq_desc *desc)
+{
+ desc->status_use_accessors |= _IRQ_LEVEL;
+}
+
+static inline bool irq_settings_can_request(struct irq_desc *desc)
+{
+ return !(desc->status_use_accessors & _IRQ_NOREQUEST);
+}
+
+static inline void irq_settings_clr_norequest(struct irq_desc *desc)
+{
+ desc->status_use_accessors &= ~_IRQ_NOREQUEST;
+}
+
+static inline void irq_settings_set_norequest(struct irq_desc *desc)
+{
+ desc->status_use_accessors |= _IRQ_NOREQUEST;
+}
+
+static inline bool irq_settings_can_thread(struct irq_desc *desc)
+{
+ return !(desc->status_use_accessors & _IRQ_NOTHREAD);
+}
+
+static inline void irq_settings_clr_nothread(struct irq_desc *desc)
+{
+ desc->status_use_accessors &= ~_IRQ_NOTHREAD;
+}
+
+static inline void irq_settings_set_nothread(struct irq_desc *desc)
+{
+ desc->status_use_accessors |= _IRQ_NOTHREAD;
+}
+
+static inline bool irq_settings_can_probe(struct irq_desc *desc)
+{
+ return !(desc->status_use_accessors & _IRQ_NOPROBE);
+}
+
+static inline void irq_settings_clr_noprobe(struct irq_desc *desc)
+{
+ desc->status_use_accessors &= ~_IRQ_NOPROBE;
+}
+
+static inline void irq_settings_set_noprobe(struct irq_desc *desc)
+{
+ desc->status_use_accessors |= _IRQ_NOPROBE;
+}
+
+static inline bool irq_settings_can_move_pcntxt(struct irq_desc *desc)
+{
+ return desc->status_use_accessors & _IRQ_MOVE_PCNTXT;
+}
+
+static inline bool irq_settings_can_autoenable(struct irq_desc *desc)
+{
+ return !(desc->status_use_accessors & _IRQ_NOAUTOEN);
+}
+
+static inline bool irq_settings_is_nested_thread(struct irq_desc *desc)
+{
+ return desc->status_use_accessors & _IRQ_NESTED_THREAD;
+}
diff --git a/kernel/irq/spurious.c b/kernel/irq/spurious.c
index 3089d3b9d5f3..dfbd550401b2 100644
--- a/kernel/irq/spurious.c
+++ b/kernel/irq/spurious.c
@@ -21,70 +21,93 @@ static int irqfixup __read_mostly;
#define POLL_SPURIOUS_IRQ_INTERVAL (HZ/10)
static void poll_spurious_irqs(unsigned long dummy);
static DEFINE_TIMER(poll_spurious_irq_timer, poll_spurious_irqs, 0, 0);
+static int irq_poll_cpu;
+static atomic_t irq_poll_active;
+
+/*
+ * We wait here for a poller to finish.
+ *
+ * If the poll runs on this CPU, then we yell loudly and return
+ * false. That will leave the interrupt line disabled in the worst
+ * case, but it should never happen.
+ *
+ * We wait until the poller is done and then recheck disabled and
+ * action (about to be disabled). Only if it's still active, we return
+ * true and let the handler run.
+ */
+bool irq_wait_for_poll(struct irq_desc *desc)
+{
+ if (WARN_ONCE(irq_poll_cpu == smp_processor_id(),
+ "irq poll in progress on cpu %d for irq %d\n",
+ smp_processor_id(), desc->irq_data.irq))
+ return false;
+
+#ifdef CONFIG_SMP
+ do {
+ raw_spin_unlock(&desc->lock);
+ while (irqd_irq_inprogress(&desc->irq_data))
+ cpu_relax();
+ raw_spin_lock(&desc->lock);
+ } while (irqd_irq_inprogress(&desc->irq_data));
+ /* Might have been disabled in meantime */
+ return !irqd_irq_disabled(&desc->irq_data) && desc->action;
+#else
+ return false;
+#endif
+}
+
/*
* Recovery handler for misrouted interrupts.
*/
-static int try_one_irq(int irq, struct irq_desc *desc)
+static int try_one_irq(int irq, struct irq_desc *desc, bool force)
{
+ irqreturn_t ret = IRQ_NONE;
struct irqaction *action;
- int ok = 0, work = 0;
raw_spin_lock(&desc->lock);
- /* Already running on another processor */
- if (desc->status & IRQ_INPROGRESS) {
- /*
- * Already running: If it is shared get the other
- * CPU to go looking for our mystery interrupt too
- */
- if (desc->action && (desc->action->flags & IRQF_SHARED))
- desc->status |= IRQ_PENDING;
- raw_spin_unlock(&desc->lock);
- return ok;
- }
- /* Honour the normal IRQ locking */
- desc->status |= IRQ_INPROGRESS;
- action = desc->action;
- raw_spin_unlock(&desc->lock);
- while (action) {
- /* Only shared IRQ handlers are safe to call */
- if (action->flags & IRQF_SHARED) {
- if (action->handler(irq, action->dev_id) ==
- IRQ_HANDLED)
- ok = 1;
- }
- action = action->next;
- }
- local_irq_disable();
- /* Now clean up the flags */
- raw_spin_lock(&desc->lock);
- action = desc->action;
+ /* PER_CPU and nested thread interrupts are never polled */
+ if (irq_settings_is_per_cpu(desc) || irq_settings_is_nested_thread(desc))
+ goto out;
/*
- * While we were looking for a fixup someone queued a real
- * IRQ clashing with our walk:
+ * Do not poll disabled interrupts unless the spurious
+ * disabled poller asks explicitely.
*/
- while ((desc->status & IRQ_PENDING) && action) {
+ if (irqd_irq_disabled(&desc->irq_data) && !force)
+ goto out;
+
+ /*
+ * All handlers must agree on IRQF_SHARED, so we test just the
+ * first. Check for action->next as well.
+ */
+ action = desc->action;
+ if (!action || !(action->flags & IRQF_SHARED) ||
+ (action->flags & __IRQF_TIMER) || !action->next)
+ goto out;
+
+ /* Already running on another processor */
+ if (irqd_irq_inprogress(&desc->irq_data)) {
/*
- * Perform real IRQ processing for the IRQ we deferred
+ * Already running: If it is shared get the other
+ * CPU to go looking for our mystery interrupt too
*/
- work = 1;
- raw_spin_unlock(&desc->lock);
- handle_IRQ_event(irq, action);
- raw_spin_lock(&desc->lock);
- desc->status &= ~IRQ_PENDING;
+ desc->istate |= IRQS_PENDING;
+ goto out;
}
- desc->status &= ~IRQ_INPROGRESS;
- /*
- * If we did actual work for the real IRQ line we must let the
- * IRQ controller clean up too
- */
- if (work)
- irq_end(irq, desc);
- raw_spin_unlock(&desc->lock);
- return ok;
+ /* Mark it poll in progress */
+ desc->istate |= IRQS_POLL_INPROGRESS;
+ do {
+ if (handle_irq_event(desc) == IRQ_HANDLED)
+ ret = IRQ_HANDLED;
+ action = desc->action;
+ } while ((desc->istate & IRQS_PENDING) && action);
+ desc->istate &= ~IRQS_POLL_INPROGRESS;
+out:
+ raw_spin_unlock(&desc->lock);
+ return ret == IRQ_HANDLED;
}
static int misrouted_irq(int irq)
@@ -92,6 +115,11 @@ static int misrouted_irq(int irq)
struct irq_desc *desc;
int i, ok = 0;
+ if (atomic_inc_return(&irq_poll_active) == 1)
+ goto out;
+
+ irq_poll_cpu = smp_processor_id();
+
for_each_irq_desc(i, desc) {
if (!i)
continue;
@@ -99,9 +127,11 @@ static int misrouted_irq(int irq)
if (i == irq) /* Already tried */
continue;
- if (try_one_irq(i, desc))
+ if (try_one_irq(i, desc, false))
ok = 1;
}
+out:
+ atomic_dec(&irq_poll_active);
/* So the caller can adjust the irq error counts */
return ok;
}
@@ -111,23 +141,28 @@ static void poll_spurious_irqs(unsigned long dummy)
struct irq_desc *desc;
int i;
+ if (atomic_inc_return(&irq_poll_active) != 1)
+ goto out;
+ irq_poll_cpu = smp_processor_id();
+
for_each_irq_desc(i, desc) {
- unsigned int status;
+ unsigned int state;
if (!i)
continue;
/* Racy but it doesn't matter */
- status = desc->status;
+ state = desc->istate;
barrier();
- if (!(status & IRQ_SPURIOUS_DISABLED))
+ if (!(state & IRQS_SPURIOUS_DISABLED))
continue;
local_irq_disable();
- try_one_irq(i, desc);
+ try_one_irq(i, desc, true);
local_irq_enable();
}
-
+out:
+ atomic_dec(&irq_poll_active);
mod_timer(&poll_spurious_irq_timer,
jiffies + POLL_SPURIOUS_IRQ_INTERVAL);
}
@@ -139,15 +174,13 @@ static void poll_spurious_irqs(unsigned long dummy)
*
* (The other 100-of-100,000 interrupts may have been a correctly
* functioning device sharing an IRQ with the failing one)
- *
- * Called under desc->lock
*/
-
static void
__report_bad_irq(unsigned int irq, struct irq_desc *desc,
irqreturn_t action_ret)
{
struct irqaction *action;
+ unsigned long flags;
if (action_ret != IRQ_HANDLED && action_ret != IRQ_NONE) {
printk(KERN_ERR "irq event %d: bogus return value %x\n",
@@ -159,6 +192,13 @@ __report_bad_irq(unsigned int irq, struct irq_desc *desc,
dump_stack();
printk(KERN_ERR "handlers:\n");
+ /*
+ * We need to take desc->lock here. note_interrupt() is called
+ * w/o desc->lock held, but IRQ_PROGRESS set. We might race
+ * with something else removing an action. It's ok to take
+ * desc->lock here. See synchronize_irq().
+ */
+ raw_spin_lock_irqsave(&desc->lock, flags);
action = desc->action;
while (action) {
printk(KERN_ERR "[<%p>]", action->handler);
@@ -167,6 +207,7 @@ __report_bad_irq(unsigned int irq, struct irq_desc *desc,
printk("\n");
action = action->next;
}
+ raw_spin_unlock_irqrestore(&desc->lock, flags);
}
static void
@@ -218,6 +259,9 @@ try_misrouted_irq(unsigned int irq, struct irq_desc *desc,
void note_interrupt(unsigned int irq, struct irq_desc *desc,
irqreturn_t action_ret)
{
+ if (desc->istate & IRQS_POLL_INPROGRESS)
+ return;
+
if (unlikely(action_ret != IRQ_HANDLED)) {
/*
* If we are seeing only the odd spurious IRQ caused by
@@ -254,9 +298,9 @@ void note_interrupt(unsigned int irq, struct irq_desc *desc,
* Now kill the IRQ
*/
printk(KERN_EMERG "Disabling IRQ #%d\n", irq);
- desc->status |= IRQ_DISABLED | IRQ_SPURIOUS_DISABLED;
+ desc->istate |= IRQS_SPURIOUS_DISABLED;
desc->depth++;
- desc->irq_data.chip->irq_disable(&desc->irq_data);
+ irq_disable(desc);
mod_timer(&poll_spurious_irq_timer,
jiffies + POLL_SPURIOUS_IRQ_INTERVAL);
diff --git a/kernel/jump_label.c b/kernel/jump_label.c
index 3b79bd938330..fa27e750dbc0 100644
--- a/kernel/jump_label.c
+++ b/kernel/jump_label.c
@@ -2,43 +2,23 @@
* jump label support
*
* Copyright (C) 2009 Jason Baron <jbaron@redhat.com>
+ * Copyright (C) 2011 Peter Zijlstra <pzijlstr@redhat.com>
*
*/
-#include <linux/jump_label.h>
#include <linux/memory.h>
#include <linux/uaccess.h>
#include <linux/module.h>
#include <linux/list.h>
-#include <linux/jhash.h>
#include <linux/slab.h>
#include <linux/sort.h>
#include <linux/err.h>
+#include <linux/jump_label.h>
#ifdef HAVE_JUMP_LABEL
-#define JUMP_LABEL_HASH_BITS 6
-#define JUMP_LABEL_TABLE_SIZE (1 << JUMP_LABEL_HASH_BITS)
-static struct hlist_head jump_label_table[JUMP_LABEL_TABLE_SIZE];
-
/* mutex to protect coming/going of the the jump_label table */
static DEFINE_MUTEX(jump_label_mutex);
-struct jump_label_entry {
- struct hlist_node hlist;
- struct jump_entry *table;
- int nr_entries;
- /* hang modules off here */
- struct hlist_head modules;
- unsigned long key;
-};
-
-struct jump_label_module_entry {
- struct hlist_node hlist;
- struct jump_entry *table;
- int nr_entries;
- struct module *mod;
-};
-
void jump_label_lock(void)
{
mutex_lock(&jump_label_mutex);
@@ -49,6 +29,11 @@ void jump_label_unlock(void)
mutex_unlock(&jump_label_mutex);
}
+bool jump_label_enabled(struct jump_label_key *key)
+{
+ return !!atomic_read(&key->enabled);
+}
+
static int jump_label_cmp(const void *a, const void *b)
{
const struct jump_entry *jea = a;
@@ -64,7 +49,7 @@ static int jump_label_cmp(const void *a, const void *b)
}
static void
-sort_jump_label_entries(struct jump_entry *start, struct jump_entry *stop)
+jump_label_sort_entries(struct jump_entry *start, struct jump_entry *stop)
{
unsigned long size;
@@ -73,118 +58,25 @@ sort_jump_label_entries(struct jump_entry *start, struct jump_entry *stop)
sort(start, size, sizeof(struct jump_entry), jump_label_cmp, NULL);
}
-static struct jump_label_entry *get_jump_label_entry(jump_label_t key)
-{
- struct hlist_head *head;
- struct hlist_node *node;
- struct jump_label_entry *e;
- u32 hash = jhash((void *)&key, sizeof(jump_label_t), 0);
-
- head = &jump_label_table[hash & (JUMP_LABEL_TABLE_SIZE - 1)];
- hlist_for_each_entry(e, node, head, hlist) {
- if (key == e->key)
- return e;
- }
- return NULL;
-}
+static void jump_label_update(struct jump_label_key *key, int enable);
-static struct jump_label_entry *
-add_jump_label_entry(jump_label_t key, int nr_entries, struct jump_entry *table)
+void jump_label_inc(struct jump_label_key *key)
{
- struct hlist_head *head;
- struct jump_label_entry *e;
- u32 hash;
-
- e = get_jump_label_entry(key);
- if (e)
- return ERR_PTR(-EEXIST);
-
- e = kmalloc(sizeof(struct jump_label_entry), GFP_KERNEL);
- if (!e)
- return ERR_PTR(-ENOMEM);
-
- hash = jhash((void *)&key, sizeof(jump_label_t), 0);
- head = &jump_label_table[hash & (JUMP_LABEL_TABLE_SIZE - 1)];
- e->key = key;
- e->table = table;
- e->nr_entries = nr_entries;
- INIT_HLIST_HEAD(&(e->modules));
- hlist_add_head(&e->hlist, head);
- return e;
-}
+ if (atomic_inc_not_zero(&key->enabled))
+ return;
-static int
-build_jump_label_hashtable(struct jump_entry *start, struct jump_entry *stop)
-{
- struct jump_entry *iter, *iter_begin;
- struct jump_label_entry *entry;
- int count;
-
- sort_jump_label_entries(start, stop);
- iter = start;
- while (iter < stop) {
- entry = get_jump_label_entry(iter->key);
- if (!entry) {
- iter_begin = iter;
- count = 0;
- while ((iter < stop) &&
- (iter->key == iter_begin->key)) {
- iter++;
- count++;
- }
- entry = add_jump_label_entry(iter_begin->key,
- count, iter_begin);
- if (IS_ERR(entry))
- return PTR_ERR(entry);
- } else {
- WARN_ONCE(1, KERN_ERR "build_jump_hashtable: unexpected entry!\n");
- return -1;
- }
- }
- return 0;
+ jump_label_lock();
+ if (atomic_add_return(1, &key->enabled) == 1)
+ jump_label_update(key, JUMP_LABEL_ENABLE);
+ jump_label_unlock();
}
-/***
- * jump_label_update - update jump label text
- * @key - key value associated with a a jump label
- * @type - enum set to JUMP_LABEL_ENABLE or JUMP_LABEL_DISABLE
- *
- * Will enable/disable the jump for jump label @key, depending on the
- * value of @type.
- *
- */
-
-void jump_label_update(unsigned long key, enum jump_label_type type)
+void jump_label_dec(struct jump_label_key *key)
{
- struct jump_entry *iter;
- struct jump_label_entry *entry;
- struct hlist_node *module_node;
- struct jump_label_module_entry *e_module;
- int count;
+ if (!atomic_dec_and_mutex_lock(&key->enabled, &jump_label_mutex))
+ return;
- jump_label_lock();
- entry = get_jump_label_entry((jump_label_t)key);
- if (entry) {
- count = entry->nr_entries;
- iter = entry->table;
- while (count--) {
- if (kernel_text_address(iter->code))
- arch_jump_label_transform(iter, type);
- iter++;
- }
- /* eanble/disable jump labels in modules */
- hlist_for_each_entry(e_module, module_node, &(entry->modules),
- hlist) {
- count = e_module->nr_entries;
- iter = e_module->table;
- while (count--) {
- if (iter->key &&
- kernel_text_address(iter->code))
- arch_jump_label_transform(iter, type);
- iter++;
- }
- }
- }
+ jump_label_update(key, JUMP_LABEL_DISABLE);
jump_label_unlock();
}
@@ -197,77 +89,36 @@ static int addr_conflict(struct jump_entry *entry, void *start, void *end)
return 0;
}
-#ifdef CONFIG_MODULES
-
-static int module_conflict(void *start, void *end)
-{
- struct hlist_head *head;
- struct hlist_node *node, *node_next, *module_node, *module_node_next;
- struct jump_label_entry *e;
- struct jump_label_module_entry *e_module;
- struct jump_entry *iter;
- int i, count;
- int conflict = 0;
-
- for (i = 0; i < JUMP_LABEL_TABLE_SIZE; i++) {
- head = &jump_label_table[i];
- hlist_for_each_entry_safe(e, node, node_next, head, hlist) {
- hlist_for_each_entry_safe(e_module, module_node,
- module_node_next,
- &(e->modules), hlist) {
- count = e_module->nr_entries;
- iter = e_module->table;
- while (count--) {
- if (addr_conflict(iter, start, end)) {
- conflict = 1;
- goto out;
- }
- iter++;
- }
- }
- }
- }
-out:
- return conflict;
-}
-
-#endif
-
-/***
- * jump_label_text_reserved - check if addr range is reserved
- * @start: start text addr
- * @end: end text addr
- *
- * checks if the text addr located between @start and @end
- * overlaps with any of the jump label patch addresses. Code
- * that wants to modify kernel text should first verify that
- * it does not overlap with any of the jump label addresses.
- * Caller must hold jump_label_mutex.
- *
- * returns 1 if there is an overlap, 0 otherwise
- */
-int jump_label_text_reserved(void *start, void *end)
+static int __jump_label_text_reserved(struct jump_entry *iter_start,
+ struct jump_entry *iter_stop, void *start, void *end)
{
struct jump_entry *iter;
- struct jump_entry *iter_start = __start___jump_table;
- struct jump_entry *iter_stop = __start___jump_table;
- int conflict = 0;
iter = iter_start;
while (iter < iter_stop) {
- if (addr_conflict(iter, start, end)) {
- conflict = 1;
- goto out;
- }
+ if (addr_conflict(iter, start, end))
+ return 1;
iter++;
}
- /* now check modules */
-#ifdef CONFIG_MODULES
- conflict = module_conflict(start, end);
-#endif
-out:
- return conflict;
+ return 0;
+}
+
+static void __jump_label_update(struct jump_label_key *key,
+ struct jump_entry *entry,
+ struct jump_entry *stop, int enable)
+{
+ for (; (entry < stop) &&
+ (entry->key == (jump_label_t)(unsigned long)key);
+ entry++) {
+ /*
+ * entry->code set to 0 invalidates module init text sections
+ * kernel_text_address() verifies we are not in core kernel
+ * init code, see jump_label_invalidate_module_init().
+ */
+ if (entry->code && kernel_text_address(entry->code))
+ arch_jump_label_transform(entry, enable);
+ }
}
/*
@@ -277,145 +128,181 @@ void __weak arch_jump_label_text_poke_early(jump_label_t addr)
{
}
-static __init int init_jump_label(void)
+static __init int jump_label_init(void)
{
- int ret;
struct jump_entry *iter_start = __start___jump_table;
struct jump_entry *iter_stop = __stop___jump_table;
+ struct jump_label_key *key = NULL;
struct jump_entry *iter;
jump_label_lock();
- ret = build_jump_label_hashtable(__start___jump_table,
- __stop___jump_table);
- iter = iter_start;
- while (iter < iter_stop) {
+ jump_label_sort_entries(iter_start, iter_stop);
+
+ for (iter = iter_start; iter < iter_stop; iter++) {
arch_jump_label_text_poke_early(iter->code);
- iter++;
+ if (iter->key == (jump_label_t)(unsigned long)key)
+ continue;
+
+ key = (struct jump_label_key *)(unsigned long)iter->key;
+ atomic_set(&key->enabled, 0);
+ key->entries = iter;
+#ifdef CONFIG_MODULES
+ key->next = NULL;
+#endif
}
jump_label_unlock();
- return ret;
+
+ return 0;
}
-early_initcall(init_jump_label);
+early_initcall(jump_label_init);
#ifdef CONFIG_MODULES
-static struct jump_label_module_entry *
-add_jump_label_module_entry(struct jump_label_entry *entry,
- struct jump_entry *iter_begin,
- int count, struct module *mod)
+struct jump_label_mod {
+ struct jump_label_mod *next;
+ struct jump_entry *entries;
+ struct module *mod;
+};
+
+static int __jump_label_mod_text_reserved(void *start, void *end)
{
- struct jump_label_module_entry *e;
-
- e = kmalloc(sizeof(struct jump_label_module_entry), GFP_KERNEL);
- if (!e)
- return ERR_PTR(-ENOMEM);
- e->mod = mod;
- e->nr_entries = count;
- e->table = iter_begin;
- hlist_add_head(&e->hlist, &entry->modules);
- return e;
+ struct module *mod;
+
+ mod = __module_text_address((unsigned long)start);
+ if (!mod)
+ return 0;
+
+ WARN_ON_ONCE(__module_text_address((unsigned long)end) != mod);
+
+ return __jump_label_text_reserved(mod->jump_entries,
+ mod->jump_entries + mod->num_jump_entries,
+ start, end);
}
-static int add_jump_label_module(struct module *mod)
+static void __jump_label_mod_update(struct jump_label_key *key, int enable)
{
- struct jump_entry *iter, *iter_begin;
- struct jump_label_entry *entry;
- struct jump_label_module_entry *module_entry;
- int count;
+ struct jump_label_mod *mod = key->next;
- /* if the module doesn't have jump label entries, just return */
- if (!mod->num_jump_entries)
- return 0;
+ while (mod) {
+ struct module *m = mod->mod;
- sort_jump_label_entries(mod->jump_entries,
- mod->jump_entries + mod->num_jump_entries);
- iter = mod->jump_entries;
- while (iter < mod->jump_entries + mod->num_jump_entries) {
- entry = get_jump_label_entry(iter->key);
- iter_begin = iter;
- count = 0;
- while ((iter < mod->jump_entries + mod->num_jump_entries) &&
- (iter->key == iter_begin->key)) {
- iter++;
- count++;
- }
- if (!entry) {
- entry = add_jump_label_entry(iter_begin->key, 0, NULL);
- if (IS_ERR(entry))
- return PTR_ERR(entry);
- }
- module_entry = add_jump_label_module_entry(entry, iter_begin,
- count, mod);
- if (IS_ERR(module_entry))
- return PTR_ERR(module_entry);
+ __jump_label_update(key, mod->entries,
+ m->jump_entries + m->num_jump_entries,
+ enable);
+ mod = mod->next;
}
- return 0;
}
-static void remove_jump_label_module(struct module *mod)
+/***
+ * apply_jump_label_nops - patch module jump labels with arch_get_jump_label_nop()
+ * @mod: module to patch
+ *
+ * Allow for run-time selection of the optimal nops. Before the module
+ * loads patch these with arch_get_jump_label_nop(), which is specified by
+ * the arch specific jump label code.
+ */
+void jump_label_apply_nops(struct module *mod)
{
- struct hlist_head *head;
- struct hlist_node *node, *node_next, *module_node, *module_node_next;
- struct jump_label_entry *e;
- struct jump_label_module_entry *e_module;
- int i;
+ struct jump_entry *iter_start = mod->jump_entries;
+ struct jump_entry *iter_stop = iter_start + mod->num_jump_entries;
+ struct jump_entry *iter;
/* if the module doesn't have jump label entries, just return */
- if (!mod->num_jump_entries)
+ if (iter_start == iter_stop)
return;
- for (i = 0; i < JUMP_LABEL_TABLE_SIZE; i++) {
- head = &jump_label_table[i];
- hlist_for_each_entry_safe(e, node, node_next, head, hlist) {
- hlist_for_each_entry_safe(e_module, module_node,
- module_node_next,
- &(e->modules), hlist) {
- if (e_module->mod == mod) {
- hlist_del(&e_module->hlist);
- kfree(e_module);
- }
- }
- if (hlist_empty(&e->modules) && (e->nr_entries == 0)) {
- hlist_del(&e->hlist);
- kfree(e);
- }
+ for (iter = iter_start; iter < iter_stop; iter++)
+ arch_jump_label_text_poke_early(iter->code);
+}
+
+static int jump_label_add_module(struct module *mod)
+{
+ struct jump_entry *iter_start = mod->jump_entries;
+ struct jump_entry *iter_stop = iter_start + mod->num_jump_entries;
+ struct jump_entry *iter;
+ struct jump_label_key *key = NULL;
+ struct jump_label_mod *jlm;
+
+ /* if the module doesn't have jump label entries, just return */
+ if (iter_start == iter_stop)
+ return 0;
+
+ jump_label_sort_entries(iter_start, iter_stop);
+
+ for (iter = iter_start; iter < iter_stop; iter++) {
+ if (iter->key == (jump_label_t)(unsigned long)key)
+ continue;
+
+ key = (struct jump_label_key *)(unsigned long)iter->key;
+
+ if (__module_address(iter->key) == mod) {
+ atomic_set(&key->enabled, 0);
+ key->entries = iter;
+ key->next = NULL;
+ continue;
}
+
+ jlm = kzalloc(sizeof(struct jump_label_mod), GFP_KERNEL);
+ if (!jlm)
+ return -ENOMEM;
+
+ jlm->mod = mod;
+ jlm->entries = iter;
+ jlm->next = key->next;
+ key->next = jlm;
+
+ if (jump_label_enabled(key))
+ __jump_label_update(key, iter, iter_stop,
+ JUMP_LABEL_ENABLE);
}
+
+ return 0;
}
-static void remove_jump_label_module_init(struct module *mod)
+static void jump_label_del_module(struct module *mod)
{
- struct hlist_head *head;
- struct hlist_node *node, *node_next, *module_node, *module_node_next;
- struct jump_label_entry *e;
- struct jump_label_module_entry *e_module;
+ struct jump_entry *iter_start = mod->jump_entries;
+ struct jump_entry *iter_stop = iter_start + mod->num_jump_entries;
struct jump_entry *iter;
- int i, count;
+ struct jump_label_key *key = NULL;
+ struct jump_label_mod *jlm, **prev;
- /* if the module doesn't have jump label entries, just return */
- if (!mod->num_jump_entries)
- return;
+ for (iter = iter_start; iter < iter_stop; iter++) {
+ if (iter->key == (jump_label_t)(unsigned long)key)
+ continue;
+
+ key = (struct jump_label_key *)(unsigned long)iter->key;
+
+ if (__module_address(iter->key) == mod)
+ continue;
+
+ prev = &key->next;
+ jlm = key->next;
+
+ while (jlm && jlm->mod != mod) {
+ prev = &jlm->next;
+ jlm = jlm->next;
+ }
- for (i = 0; i < JUMP_LABEL_TABLE_SIZE; i++) {
- head = &jump_label_table[i];
- hlist_for_each_entry_safe(e, node, node_next, head, hlist) {
- hlist_for_each_entry_safe(e_module, module_node,
- module_node_next,
- &(e->modules), hlist) {
- if (e_module->mod != mod)
- continue;
- count = e_module->nr_entries;
- iter = e_module->table;
- while (count--) {
- if (within_module_init(iter->code, mod))
- iter->key = 0;
- iter++;
- }
- }
+ if (jlm) {
+ *prev = jlm->next;
+ kfree(jlm);
}
}
}
+static void jump_label_invalidate_module_init(struct module *mod)
+{
+ struct jump_entry *iter_start = mod->jump_entries;
+ struct jump_entry *iter_stop = iter_start + mod->num_jump_entries;
+ struct jump_entry *iter;
+
+ for (iter = iter_start; iter < iter_stop; iter++) {
+ if (within_module_init(iter->code, mod))
+ iter->code = 0;
+ }
+}
+
static int
jump_label_module_notify(struct notifier_block *self, unsigned long val,
void *data)
@@ -426,59 +313,77 @@ jump_label_module_notify(struct notifier_block *self, unsigned long val,
switch (val) {
case MODULE_STATE_COMING:
jump_label_lock();
- ret = add_jump_label_module(mod);
+ ret = jump_label_add_module(mod);
if (ret)
- remove_jump_label_module(mod);
+ jump_label_del_module(mod);
jump_label_unlock();
break;
case MODULE_STATE_GOING:
jump_label_lock();
- remove_jump_label_module(mod);
+ jump_label_del_module(mod);
jump_label_unlock();
break;
case MODULE_STATE_LIVE:
jump_label_lock();
- remove_jump_label_module_init(mod);
+ jump_label_invalidate_module_init(mod);
jump_label_unlock();
break;
}
- return ret;
-}
-/***
- * apply_jump_label_nops - patch module jump labels with arch_get_jump_label_nop()
- * @mod: module to patch
- *
- * Allow for run-time selection of the optimal nops. Before the module
- * loads patch these with arch_get_jump_label_nop(), which is specified by
- * the arch specific jump label code.
- */
-void jump_label_apply_nops(struct module *mod)
-{
- struct jump_entry *iter;
-
- /* if the module doesn't have jump label entries, just return */
- if (!mod->num_jump_entries)
- return;
-
- iter = mod->jump_entries;
- while (iter < mod->jump_entries + mod->num_jump_entries) {
- arch_jump_label_text_poke_early(iter->code);
- iter++;
- }
+ return notifier_from_errno(ret);
}
struct notifier_block jump_label_module_nb = {
.notifier_call = jump_label_module_notify,
- .priority = 0,
+ .priority = 1, /* higher than tracepoints */
};
-static __init int init_jump_label_module(void)
+static __init int jump_label_init_module(void)
{
return register_module_notifier(&jump_label_module_nb);
}
-early_initcall(init_jump_label_module);
+early_initcall(jump_label_init_module);
#endif /* CONFIG_MODULES */
+/***
+ * jump_label_text_reserved - check if addr range is reserved
+ * @start: start text addr
+ * @end: end text addr
+ *
+ * checks if the text addr located between @start and @end
+ * overlaps with any of the jump label patch addresses. Code
+ * that wants to modify kernel text should first verify that
+ * it does not overlap with any of the jump label addresses.
+ * Caller must hold jump_label_mutex.
+ *
+ * returns 1 if there is an overlap, 0 otherwise
+ */
+int jump_label_text_reserved(void *start, void *end)
+{
+ int ret = __jump_label_text_reserved(__start___jump_table,
+ __stop___jump_table, start, end);
+
+ if (ret)
+ return ret;
+
+#ifdef CONFIG_MODULES
+ ret = __jump_label_mod_text_reserved(start, end);
+#endif
+ return ret;
+}
+
+static void jump_label_update(struct jump_label_key *key, int enable)
+{
+ struct jump_entry *entry = key->entries;
+
+ /* if there are no users, entry can be NULL */
+ if (entry)
+ __jump_label_update(key, entry, __stop___jump_table, enable);
+
+#ifdef CONFIG_MODULES
+ __jump_label_mod_update(key, enable);
+#endif
+}
+
#endif
diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c
index 6f6d091b5757..079f1d39a8b8 100644
--- a/kernel/kallsyms.c
+++ b/kernel/kallsyms.c
@@ -64,14 +64,14 @@ static inline int is_kernel_text(unsigned long addr)
if ((addr >= (unsigned long)_stext && addr <= (unsigned long)_etext) ||
arch_is_kernel_text(addr))
return 1;
- return in_gate_area_no_task(addr);
+ return in_gate_area_no_mm(addr);
}
static inline int is_kernel(unsigned long addr)
{
if (addr >= (unsigned long)_stext && addr <= (unsigned long)_end)
return 1;
- return in_gate_area_no_task(addr);
+ return in_gate_area_no_mm(addr);
}
static int is_ksym_addr(unsigned long addr)
@@ -342,13 +342,15 @@ int lookup_symbol_attrs(unsigned long addr, unsigned long *size,
}
/* Look up a kernel symbol and return it in a text buffer. */
-int sprint_symbol(char *buffer, unsigned long address)
+static int __sprint_symbol(char *buffer, unsigned long address,
+ int symbol_offset)
{
char *modname;
const char *name;
unsigned long offset, size;
int len;
+ address += symbol_offset;
name = kallsyms_lookup(address, &size, &offset, &modname, buffer);
if (!name)
return sprintf(buffer, "0x%lx", address);
@@ -357,17 +359,53 @@ int sprint_symbol(char *buffer, unsigned long address)
strcpy(buffer, name);
len = strlen(buffer);
buffer += len;
+ offset -= symbol_offset;
if (modname)
- len += sprintf(buffer, "+%#lx/%#lx [%s]",
- offset, size, modname);
+ len += sprintf(buffer, "+%#lx/%#lx [%s]", offset, size, modname);
else
len += sprintf(buffer, "+%#lx/%#lx", offset, size);
return len;
}
+
+/**
+ * sprint_symbol - Look up a kernel symbol and return it in a text buffer
+ * @buffer: buffer to be stored
+ * @address: address to lookup
+ *
+ * This function looks up a kernel symbol with @address and stores its name,
+ * offset, size and module name to @buffer if possible. If no symbol was found,
+ * just saves its @address as is.
+ *
+ * This function returns the number of bytes stored in @buffer.
+ */
+int sprint_symbol(char *buffer, unsigned long address)
+{
+ return __sprint_symbol(buffer, address, 0);
+}
+
EXPORT_SYMBOL_GPL(sprint_symbol);
+/**
+ * sprint_backtrace - Look up a backtrace symbol and return it in a text buffer
+ * @buffer: buffer to be stored
+ * @address: address to lookup
+ *
+ * This function is for stack backtrace and does the same thing as
+ * sprint_symbol() but with modified/decreased @address. If there is a
+ * tail-call to the function marked "noreturn", gcc optimized out code after
+ * the call so that the stack-saved return address could point outside of the
+ * caller. This function ensures that kallsyms will find the original caller
+ * by decreasing @address.
+ *
+ * This function returns the number of bytes stored in @buffer.
+ */
+int sprint_backtrace(char *buffer, unsigned long address)
+{
+ return __sprint_symbol(buffer, address, -1);
+}
+
/* Look up a kernel symbol and print it to the kernel messages. */
void __print_symbol(const char *fmt, unsigned long address)
{
@@ -477,13 +515,11 @@ static int s_show(struct seq_file *m, void *p)
*/
type = iter->exported ? toupper(iter->type) :
tolower(iter->type);
- seq_printf(m, "%0*lx %c %s\t[%s]\n",
- (int)(2 * sizeof(void *)),
- iter->value, type, iter->name, iter->module_name);
+ seq_printf(m, "%pK %c %s\t[%s]\n", (void *)iter->value,
+ type, iter->name, iter->module_name);
} else
- seq_printf(m, "%0*lx %c %s\n",
- (int)(2 * sizeof(void *)),
- iter->value, iter->type, iter->name);
+ seq_printf(m, "%pK %c %s\n", (void *)iter->value,
+ iter->type, iter->name);
return 0;
}
diff --git a/kernel/kexec.c b/kernel/kexec.c
index ec19b92c7ebd..8d814cbc8109 100644
--- a/kernel/kexec.c
+++ b/kernel/kexec.c
@@ -33,6 +33,7 @@
#include <linux/vmalloc.h>
#include <linux/swap.h>
#include <linux/kmsg_dump.h>
+#include <linux/syscore_ops.h>
#include <asm/page.h>
#include <asm/uaccess.h>
@@ -144,7 +145,7 @@ static int do_kimage_alloc(struct kimage **rimage, unsigned long entry,
/* Initialize the list of destination pages */
INIT_LIST_HEAD(&image->dest_pages);
- /* Initialize the list of unuseable pages */
+ /* Initialize the list of unusable pages */
INIT_LIST_HEAD(&image->unuseable_pages);
/* Read in the segments */
@@ -454,7 +455,7 @@ static struct page *kimage_alloc_normal_control_pages(struct kimage *image,
/* Deal with the destination pages I have inadvertently allocated.
*
* Ideally I would convert multi-page allocations into single
- * page allocations, and add everyting to image->dest_pages.
+ * page allocations, and add everything to image->dest_pages.
*
* For now it is simpler to just free the pages.
*/
@@ -602,7 +603,7 @@ static void kimage_free_extra_pages(struct kimage *image)
/* Walk through and free any extra destination pages I may have */
kimage_free_page_list(&image->dest_pages);
- /* Walk through and free any unuseable pages I have cached */
+ /* Walk through and free any unusable pages I have cached */
kimage_free_page_list(&image->unuseable_pages);
}
@@ -1099,7 +1100,8 @@ size_t crash_get_memory_size(void)
return size;
}
-static void free_reserved_phys_range(unsigned long begin, unsigned long end)
+void __weak crash_free_reserved_phys_range(unsigned long begin,
+ unsigned long end)
{
unsigned long addr;
@@ -1135,7 +1137,7 @@ int crash_shrink_memory(unsigned long new_size)
start = roundup(start, PAGE_SIZE);
end = roundup(start + new_size, PAGE_SIZE);
- free_reserved_phys_range(end, crashk_res.end);
+ crash_free_reserved_phys_range(end, crashk_res.end);
if ((start == end) && (crashk_res.parent != NULL))
release_resource(&crashk_res);
@@ -1529,8 +1531,7 @@ int kernel_kexec(void)
if (error)
goto Enable_cpus;
local_irq_disable();
- /* Suspend system devices */
- error = sysdev_suspend(PMSG_FREEZE);
+ error = syscore_suspend();
if (error)
goto Enable_irqs;
} else
@@ -1545,7 +1546,7 @@ int kernel_kexec(void)
#ifdef CONFIG_KEXEC_JUMP
if (kexec_image->preserve_context) {
- sysdev_resume();
+ syscore_resume();
Enable_irqs:
local_irq_enable();
Enable_cpus:
diff --git a/kernel/kmod.c b/kernel/kmod.c
index 9cd0591c96a2..ad6a81c58b44 100644
--- a/kernel/kmod.c
+++ b/kernel/kmod.c
@@ -25,6 +25,7 @@
#include <linux/kmod.h>
#include <linux/slab.h>
#include <linux/completion.h>
+#include <linux/cred.h>
#include <linux/file.h>
#include <linux/fdtable.h>
#include <linux/workqueue.h>
@@ -43,6 +44,13 @@ extern int max_threads;
static struct workqueue_struct *khelper_wq;
+#define CAP_BSET (void *)1
+#define CAP_PI (void *)2
+
+static kernel_cap_t usermodehelper_bset = CAP_FULL_SET;
+static kernel_cap_t usermodehelper_inheritable = CAP_FULL_SET;
+static DEFINE_SPINLOCK(umh_sysctl_lock);
+
#ifdef CONFIG_MODULES
/*
@@ -132,6 +140,7 @@ EXPORT_SYMBOL(__request_module);
static int ____call_usermodehelper(void *data)
{
struct subprocess_info *sub_info = data;
+ struct cred *new;
int retval;
spin_lock_irq(&current->sighand->siglock);
@@ -153,6 +162,19 @@ static int ____call_usermodehelper(void *data)
goto fail;
}
+ retval = -ENOMEM;
+ new = prepare_kernel_cred(current);
+ if (!new)
+ goto fail;
+
+ spin_lock(&umh_sysctl_lock);
+ new->cap_bset = cap_intersect(usermodehelper_bset, new->cap_bset);
+ new->cap_inheritable = cap_intersect(usermodehelper_inheritable,
+ new->cap_inheritable);
+ spin_unlock(&umh_sysctl_lock);
+
+ commit_creds(new);
+
retval = kernel_execve(sub_info->path,
(const char *const *)sub_info->argv,
(const char *const *)sub_info->envp);
@@ -245,7 +267,6 @@ static void __call_usermodehelper(struct work_struct *work)
}
}
-#ifdef CONFIG_PM_SLEEP
/*
* If set, call_usermodehelper_exec() will exit immediately returning -EBUSY
* (used for preventing user land processes from being created after the user
@@ -301,6 +322,15 @@ void usermodehelper_enable(void)
usermodehelper_disabled = 0;
}
+/**
+ * usermodehelper_is_disabled - check if new helpers are allowed to be started
+ */
+bool usermodehelper_is_disabled(void)
+{
+ return usermodehelper_disabled;
+}
+EXPORT_SYMBOL_GPL(usermodehelper_is_disabled);
+
static void helper_lock(void)
{
atomic_inc(&running_helpers);
@@ -312,12 +342,6 @@ static void helper_unlock(void)
if (atomic_dec_and_test(&running_helpers))
wake_up(&running_helpers_waitq);
}
-#else /* CONFIG_PM_SLEEP */
-#define usermodehelper_disabled 0
-
-static inline void helper_lock(void) {}
-static inline void helper_unlock(void) {}
-#endif /* CONFIG_PM_SLEEP */
/**
* call_usermodehelper_setup - prepare to call a usermode helper
@@ -418,6 +442,84 @@ unlock:
}
EXPORT_SYMBOL(call_usermodehelper_exec);
+static int proc_cap_handler(struct ctl_table *table, int write,
+ void __user *buffer, size_t *lenp, loff_t *ppos)
+{
+ struct ctl_table t;
+ unsigned long cap_array[_KERNEL_CAPABILITY_U32S];
+ kernel_cap_t new_cap;
+ int err, i;
+
+ if (write && (!capable(CAP_SETPCAP) ||
+ !capable(CAP_SYS_MODULE)))
+ return -EPERM;
+
+ /*
+ * convert from the global kernel_cap_t to the ulong array to print to
+ * userspace if this is a read.
+ */
+ spin_lock(&umh_sysctl_lock);
+ for (i = 0; i < _KERNEL_CAPABILITY_U32S; i++) {
+ if (table->data == CAP_BSET)
+ cap_array[i] = usermodehelper_bset.cap[i];
+ else if (table->data == CAP_PI)
+ cap_array[i] = usermodehelper_inheritable.cap[i];
+ else
+ BUG();
+ }
+ spin_unlock(&umh_sysctl_lock);
+
+ t = *table;
+ t.data = &cap_array;
+
+ /*
+ * actually read or write and array of ulongs from userspace. Remember
+ * these are least significant 32 bits first
+ */
+ err = proc_doulongvec_minmax(&t, write, buffer, lenp, ppos);
+ if (err < 0)
+ return err;
+
+ /*
+ * convert from the sysctl array of ulongs to the kernel_cap_t
+ * internal representation
+ */
+ for (i = 0; i < _KERNEL_CAPABILITY_U32S; i++)
+ new_cap.cap[i] = cap_array[i];
+
+ /*
+ * Drop everything not in the new_cap (but don't add things)
+ */
+ spin_lock(&umh_sysctl_lock);
+ if (write) {
+ if (table->data == CAP_BSET)
+ usermodehelper_bset = cap_intersect(usermodehelper_bset, new_cap);
+ if (table->data == CAP_PI)
+ usermodehelper_inheritable = cap_intersect(usermodehelper_inheritable, new_cap);
+ }
+ spin_unlock(&umh_sysctl_lock);
+
+ return 0;
+}
+
+struct ctl_table usermodehelper_table[] = {
+ {
+ .procname = "bset",
+ .data = CAP_BSET,
+ .maxlen = _KERNEL_CAPABILITY_U32S * sizeof(unsigned long),
+ .mode = 0600,
+ .proc_handler = proc_cap_handler,
+ },
+ {
+ .procname = "inheritable",
+ .data = CAP_PI,
+ .maxlen = _KERNEL_CAPABILITY_U32S * sizeof(unsigned long),
+ .mode = 0600,
+ .proc_handler = proc_cap_handler,
+ },
+ { }
+};
+
void __init usermodehelper_init(void)
{
khelper_wq = create_singlethread_workqueue("khelper");
diff --git a/kernel/ksysfs.c b/kernel/ksysfs.c
index 0b624e791805..3b053c04dd86 100644
--- a/kernel/ksysfs.c
+++ b/kernel/ksysfs.c
@@ -16,6 +16,7 @@
#include <linux/kexec.h>
#include <linux/profile.h>
#include <linux/sched.h>
+#include <linux/capability.h>
#define KERNEL_ATTR_RO(_name) \
static struct kobj_attribute _name##_attr = __ATTR_RO(_name)
@@ -131,6 +132,14 @@ KERNEL_ATTR_RO(vmcoreinfo);
#endif /* CONFIG_KEXEC */
+/* whether file capabilities are enabled */
+static ssize_t fscaps_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ return sprintf(buf, "%d\n", file_caps_enabled);
+}
+KERNEL_ATTR_RO(fscaps);
+
/*
* Make /sys/kernel/notes give the raw contents of our kernel .notes section.
*/
@@ -158,6 +167,7 @@ struct kobject *kernel_kobj;
EXPORT_SYMBOL_GPL(kernel_kobj);
static struct attribute * kernel_attrs[] = {
+ &fscaps_attr.attr,
#if defined(CONFIG_HOTPLUG)
&uevent_seqnum_attr.attr,
&uevent_helper_attr.attr,
diff --git a/kernel/kthread.c b/kernel/kthread.c
index c55afba990a3..4ba7cccb4994 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -27,6 +27,7 @@ struct kthread_create_info
/* Information passed to kthread() from kthreadd. */
int (*threadfn)(void *data);
void *data;
+ int node;
/* Result passed back to kthread_create() from kthreadd. */
struct task_struct *result;
@@ -98,10 +99,23 @@ static int kthread(void *_create)
do_exit(ret);
}
+/* called from do_fork() to get node information for about to be created task */
+int tsk_fork_get_node(struct task_struct *tsk)
+{
+#ifdef CONFIG_NUMA
+ if (tsk == kthreadd_task)
+ return tsk->pref_node_fork;
+#endif
+ return numa_node_id();
+}
+
static void create_kthread(struct kthread_create_info *create)
{
int pid;
+#ifdef CONFIG_NUMA
+ current->pref_node_fork = create->node;
+#endif
/* We want our own signal handler (we take no signals by default). */
pid = kernel_thread(kthread, create, CLONE_FS | CLONE_FILES | SIGCHLD);
if (pid < 0) {
@@ -111,33 +125,38 @@ static void create_kthread(struct kthread_create_info *create)
}
/**
- * kthread_create - create a kthread.
+ * kthread_create_on_node - create a kthread.
* @threadfn: the function to run until signal_pending(current).
* @data: data ptr for @threadfn.
+ * @node: memory node number.
* @namefmt: printf-style name for the thread.
*
* Description: This helper function creates and names a kernel
* thread. The thread will be stopped: use wake_up_process() to start
* it. See also kthread_run().
*
+ * If thread is going to be bound on a particular cpu, give its node
+ * in @node, to get NUMA affinity for kthread stack, or else give -1.
* When woken, the thread will run @threadfn() with @data as its
* argument. @threadfn() can either call do_exit() directly if it is a
- * standalone thread for which noone will call kthread_stop(), or
+ * standalone thread for which no one will call kthread_stop(), or
* return when 'kthread_should_stop()' is true (which means
* kthread_stop() has been called). The return value should be zero
* or a negative error number; it will be passed to kthread_stop().
*
* Returns a task_struct or ERR_PTR(-ENOMEM).
*/
-struct task_struct *kthread_create(int (*threadfn)(void *data),
- void *data,
- const char namefmt[],
- ...)
+struct task_struct *kthread_create_on_node(int (*threadfn)(void *data),
+ void *data,
+ int node,
+ const char namefmt[],
+ ...)
{
struct kthread_create_info create;
create.threadfn = threadfn;
create.data = data;
+ create.node = node;
init_completion(&create.done);
spin_lock(&kthread_create_lock);
@@ -164,7 +183,7 @@ struct task_struct *kthread_create(int (*threadfn)(void *data),
}
return create.result;
}
-EXPORT_SYMBOL(kthread_create);
+EXPORT_SYMBOL(kthread_create_on_node);
/**
* kthread_bind - bind a just-created kthread to a cpu.
@@ -183,8 +202,8 @@ void kthread_bind(struct task_struct *p, unsigned int cpu)
return;
}
- p->cpus_allowed = cpumask_of_cpu(cpu);
- p->rt.nr_cpus_allowed = 1;
+ /* It's safe because the task is inactive. */
+ do_set_cpus_allowed(p, cpumask_of(cpu));
p->flags |= PF_THREAD_BOUND;
}
EXPORT_SYMBOL(kthread_bind);
diff --git a/kernel/latencytop.c b/kernel/latencytop.c
index ee74b35e528d..376066e10413 100644
--- a/kernel/latencytop.c
+++ b/kernel/latencytop.c
@@ -153,7 +153,7 @@ static inline void store_stacktrace(struct task_struct *tsk,
}
/**
- * __account_scheduler_latency - record an occured latency
+ * __account_scheduler_latency - record an occurred latency
* @tsk - the task struct of the task hitting the latency
* @usecs - the duration of the latency in microseconds
* @inter - 1 if the sleep was interruptible, 0 if uninterruptible
diff --git a/kernel/lockdep.c b/kernel/lockdep.c
index 0d2058da80f5..63437d065ac8 100644
--- a/kernel/lockdep.c
+++ b/kernel/lockdep.c
@@ -490,6 +490,18 @@ void get_usage_chars(struct lock_class *class, char usage[LOCK_USAGE_CHARS])
usage[i] = '\0';
}
+static int __print_lock_name(struct lock_class *class)
+{
+ char str[KSYM_NAME_LEN];
+ const char *name;
+
+ name = class->name;
+ if (!name)
+ name = __get_key_name(class->key, str);
+
+ return printk("%s", name);
+}
+
static void print_lock_name(struct lock_class *class)
{
char str[KSYM_NAME_LEN], usage[LOCK_USAGE_CHARS];
@@ -1053,6 +1065,56 @@ print_circular_bug_entry(struct lock_list *target, int depth)
return 0;
}
+static void
+print_circular_lock_scenario(struct held_lock *src,
+ struct held_lock *tgt,
+ struct lock_list *prt)
+{
+ struct lock_class *source = hlock_class(src);
+ struct lock_class *target = hlock_class(tgt);
+ struct lock_class *parent = prt->class;
+
+ /*
+ * A direct locking problem where unsafe_class lock is taken
+ * directly by safe_class lock, then all we need to show
+ * is the deadlock scenario, as it is obvious that the
+ * unsafe lock is taken under the safe lock.
+ *
+ * But if there is a chain instead, where the safe lock takes
+ * an intermediate lock (middle_class) where this lock is
+ * not the same as the safe lock, then the lock chain is
+ * used to describe the problem. Otherwise we would need
+ * to show a different CPU case for each link in the chain
+ * from the safe_class lock to the unsafe_class lock.
+ */
+ if (parent != source) {
+ printk("Chain exists of:\n ");
+ __print_lock_name(source);
+ printk(" --> ");
+ __print_lock_name(parent);
+ printk(" --> ");
+ __print_lock_name(target);
+ printk("\n\n");
+ }
+
+ printk(" Possible unsafe locking scenario:\n\n");
+ printk(" CPU0 CPU1\n");
+ printk(" ---- ----\n");
+ printk(" lock(");
+ __print_lock_name(target);
+ printk(");\n");
+ printk(" lock(");
+ __print_lock_name(parent);
+ printk(");\n");
+ printk(" lock(");
+ __print_lock_name(target);
+ printk(");\n");
+ printk(" lock(");
+ __print_lock_name(source);
+ printk(");\n");
+ printk("\n *** DEADLOCK ***\n\n");
+}
+
/*
* When a circular dependency is detected, print the
* header first:
@@ -1096,6 +1158,7 @@ static noinline int print_circular_bug(struct lock_list *this,
{
struct task_struct *curr = current;
struct lock_list *parent;
+ struct lock_list *first_parent;
int depth;
if (!debug_locks_off_graph_unlock() || debug_locks_silent)
@@ -1109,6 +1172,7 @@ static noinline int print_circular_bug(struct lock_list *this,
print_circular_bug_header(target, depth, check_src, check_tgt);
parent = get_lock_parent(target);
+ first_parent = parent;
while (parent) {
print_circular_bug_entry(parent, --depth);
@@ -1116,6 +1180,9 @@ static noinline int print_circular_bug(struct lock_list *this,
}
printk("\nother info that might help us debug this:\n\n");
+ print_circular_lock_scenario(check_src, check_tgt,
+ first_parent);
+
lockdep_print_held_locks(curr);
printk("\nstack backtrace:\n");
@@ -1314,7 +1381,7 @@ print_shortest_lock_dependencies(struct lock_list *leaf,
printk("\n");
if (depth == 0 && (entry != root)) {
- printk("lockdep:%s bad BFS generated tree\n", __func__);
+ printk("lockdep:%s bad path found in chain graph\n", __func__);
break;
}
@@ -1325,6 +1392,62 @@ print_shortest_lock_dependencies(struct lock_list *leaf,
return;
}
+static void
+print_irq_lock_scenario(struct lock_list *safe_entry,
+ struct lock_list *unsafe_entry,
+ struct lock_class *prev_class,
+ struct lock_class *next_class)
+{
+ struct lock_class *safe_class = safe_entry->class;
+ struct lock_class *unsafe_class = unsafe_entry->class;
+ struct lock_class *middle_class = prev_class;
+
+ if (middle_class == safe_class)
+ middle_class = next_class;
+
+ /*
+ * A direct locking problem where unsafe_class lock is taken
+ * directly by safe_class lock, then all we need to show
+ * is the deadlock scenario, as it is obvious that the
+ * unsafe lock is taken under the safe lock.
+ *
+ * But if there is a chain instead, where the safe lock takes
+ * an intermediate lock (middle_class) where this lock is
+ * not the same as the safe lock, then the lock chain is
+ * used to describe the problem. Otherwise we would need
+ * to show a different CPU case for each link in the chain
+ * from the safe_class lock to the unsafe_class lock.
+ */
+ if (middle_class != unsafe_class) {
+ printk("Chain exists of:\n ");
+ __print_lock_name(safe_class);
+ printk(" --> ");
+ __print_lock_name(middle_class);
+ printk(" --> ");
+ __print_lock_name(unsafe_class);
+ printk("\n\n");
+ }
+
+ printk(" Possible interrupt unsafe locking scenario:\n\n");
+ printk(" CPU0 CPU1\n");
+ printk(" ---- ----\n");
+ printk(" lock(");
+ __print_lock_name(unsafe_class);
+ printk(");\n");
+ printk(" local_irq_disable();\n");
+ printk(" lock(");
+ __print_lock_name(safe_class);
+ printk(");\n");
+ printk(" lock(");
+ __print_lock_name(middle_class);
+ printk(");\n");
+ printk(" <Interrupt>\n");
+ printk(" lock(");
+ __print_lock_name(safe_class);
+ printk(");\n");
+ printk("\n *** DEADLOCK ***\n\n");
+}
+
static int
print_bad_irq_dependency(struct task_struct *curr,
struct lock_list *prev_root,
@@ -1376,6 +1499,9 @@ print_bad_irq_dependency(struct task_struct *curr,
print_stack_trace(forwards_entry->class->usage_traces + bit2, 1);
printk("\nother info that might help us debug this:\n\n");
+ print_irq_lock_scenario(backwards_entry, forwards_entry,
+ hlock_class(prev), hlock_class(next));
+
lockdep_print_held_locks(curr);
printk("\nthe dependencies between %s-irq-safe lock", irqclass);
@@ -1539,6 +1665,26 @@ static inline void inc_chains(void)
#endif
+static void
+print_deadlock_scenario(struct held_lock *nxt,
+ struct held_lock *prv)
+{
+ struct lock_class *next = hlock_class(nxt);
+ struct lock_class *prev = hlock_class(prv);
+
+ printk(" Possible unsafe locking scenario:\n\n");
+ printk(" CPU0\n");
+ printk(" ----\n");
+ printk(" lock(");
+ __print_lock_name(prev);
+ printk(");\n");
+ printk(" lock(");
+ __print_lock_name(next);
+ printk(");\n");
+ printk("\n *** DEADLOCK ***\n\n");
+ printk(" May be due to missing lock nesting notation\n\n");
+}
+
static int
print_deadlock_bug(struct task_struct *curr, struct held_lock *prev,
struct held_lock *next)
@@ -1557,6 +1703,7 @@ print_deadlock_bug(struct task_struct *curr, struct held_lock *prev,
print_lock(prev);
printk("\nother info that might help us debug this:\n");
+ print_deadlock_scenario(next, prev);
lockdep_print_held_locks(curr);
printk("\nstack backtrace:\n");
@@ -1826,7 +1973,7 @@ static inline int lookup_chain_cache(struct task_struct *curr,
struct list_head *hash_head = chainhashentry(chain_key);
struct lock_chain *chain;
struct held_lock *hlock_curr, *hlock_next;
- int i, j, n, cn;
+ int i, j;
if (DEBUG_LOCKS_WARN_ON(!irqs_disabled()))
return 0;
@@ -1886,15 +2033,9 @@ cache_hit:
}
i++;
chain->depth = curr->lockdep_depth + 1 - i;
- cn = nr_chain_hlocks;
- while (cn + chain->depth <= MAX_LOCKDEP_CHAIN_HLOCKS) {
- n = cmpxchg(&nr_chain_hlocks, cn, cn + chain->depth);
- if (n == cn)
- break;
- cn = n;
- }
- if (likely(cn + chain->depth <= MAX_LOCKDEP_CHAIN_HLOCKS)) {
- chain->base = cn;
+ if (likely(nr_chain_hlocks + chain->depth <= MAX_LOCKDEP_CHAIN_HLOCKS)) {
+ chain->base = nr_chain_hlocks;
+ nr_chain_hlocks += chain->depth;
for (j = 0; j < chain->depth - 1; j++, i++) {
int lock_id = curr->held_locks[i].class_idx - 1;
chain_hlocks[chain->base + j] = lock_id;
@@ -2011,6 +2152,24 @@ static void check_chain_key(struct task_struct *curr)
#endif
}
+static void
+print_usage_bug_scenario(struct held_lock *lock)
+{
+ struct lock_class *class = hlock_class(lock);
+
+ printk(" Possible unsafe locking scenario:\n\n");
+ printk(" CPU0\n");
+ printk(" ----\n");
+ printk(" lock(");
+ __print_lock_name(class);
+ printk(");\n");
+ printk(" <Interrupt>\n");
+ printk(" lock(");
+ __print_lock_name(class);
+ printk(");\n");
+ printk("\n *** DEADLOCK ***\n\n");
+}
+
static int
print_usage_bug(struct task_struct *curr, struct held_lock *this,
enum lock_usage_bit prev_bit, enum lock_usage_bit new_bit)
@@ -2039,6 +2198,8 @@ print_usage_bug(struct task_struct *curr, struct held_lock *this,
print_irqtrace_events(curr);
printk("\nother info that might help us debug this:\n");
+ print_usage_bug_scenario(this);
+
lockdep_print_held_locks(curr);
printk("\nstack backtrace:\n");
@@ -2073,6 +2234,10 @@ print_irq_inversion_bug(struct task_struct *curr,
struct held_lock *this, int forwards,
const char *irqclass)
{
+ struct lock_list *entry = other;
+ struct lock_list *middle = NULL;
+ int depth;
+
if (!debug_locks_off_graph_unlock() || debug_locks_silent)
return 0;
@@ -2091,6 +2256,25 @@ print_irq_inversion_bug(struct task_struct *curr,
printk("\n\nand interrupts could create inverse lock ordering between them.\n\n");
printk("\nother info that might help us debug this:\n");
+
+ /* Find a middle lock (if one exists) */
+ depth = get_lock_depth(other);
+ do {
+ if (depth == 0 && (entry != root)) {
+ printk("lockdep:%s bad path found in chain graph\n", __func__);
+ break;
+ }
+ middle = entry;
+ entry = get_lock_parent(entry);
+ depth--;
+ } while (entry && entry != root && (depth >= 0));
+ if (forwards)
+ print_irq_lock_scenario(root, other,
+ middle ? middle->class : root->class, other->class);
+ else
+ print_irq_lock_scenario(other, root,
+ middle ? middle->class : other->class, root->class);
+
lockdep_print_held_locks(curr);
printk("\nthe shortest dependencies between 2nd lock and 1st lock:\n");
@@ -2309,7 +2493,7 @@ void trace_hardirqs_on_caller(unsigned long ip)
if (unlikely(curr->hardirqs_enabled)) {
/*
* Neither irq nor preemption are disabled here
- * so this is racy by nature but loosing one hit
+ * so this is racy by nature but losing one hit
* in a stat is not a big deal.
*/
__debug_atomic_inc(redundant_hardirqs_on);
@@ -2620,7 +2804,7 @@ static int mark_lock(struct task_struct *curr, struct held_lock *this,
if (!graph_lock())
return 0;
/*
- * Make sure we didnt race:
+ * Make sure we didn't race:
*/
if (unlikely(hlock_class(this)->usage_mask & new_mask)) {
graph_unlock();
diff --git a/kernel/lockdep_proc.c b/kernel/lockdep_proc.c
index 1969d2fc4b36..71edd2f60c02 100644
--- a/kernel/lockdep_proc.c
+++ b/kernel/lockdep_proc.c
@@ -225,7 +225,7 @@ static int lockdep_stats_show(struct seq_file *m, void *v)
nr_irq_read_safe = 0, nr_irq_read_unsafe = 0,
nr_softirq_read_safe = 0, nr_softirq_read_unsafe = 0,
nr_hardirq_read_safe = 0, nr_hardirq_read_unsafe = 0,
- sum_forward_deps = 0, factor = 0;
+ sum_forward_deps = 0;
list_for_each_entry(class, &all_lock_classes, lock_entry) {
@@ -283,13 +283,6 @@ static int lockdep_stats_show(struct seq_file *m, void *v)
nr_hardirq_unsafe * nr_hardirq_safe +
nr_list_entries);
- /*
- * Estimated factor between direct and indirect
- * dependencies:
- */
- if (nr_list_entries)
- factor = sum_forward_deps / nr_list_entries;
-
#ifdef CONFIG_PROVE_LOCKING
seq_printf(m, " dependency chains: %11lu [max: %lu]\n",
nr_lock_chains, MAX_LOCKDEP_CHAINS);
diff --git a/kernel/module.c b/kernel/module.c
index efa290ea94bf..795bdc7f5c3f 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -57,6 +57,7 @@
#include <linux/kmemleak.h>
#include <linux/jump_label.h>
#include <linux/pfn.h>
+#include <linux/bsearch.h>
#define CREATE_TRACE_POINTS
#include <trace/events/module.h>
@@ -240,23 +241,24 @@ static bool each_symbol_in_section(const struct symsearch *arr,
struct module *owner,
bool (*fn)(const struct symsearch *syms,
struct module *owner,
- unsigned int symnum, void *data),
+ void *data),
void *data)
{
- unsigned int i, j;
+ unsigned int j;
for (j = 0; j < arrsize; j++) {
- for (i = 0; i < arr[j].stop - arr[j].start; i++)
- if (fn(&arr[j], owner, i, data))
- return true;
+ if (fn(&arr[j], owner, data))
+ return true;
}
return false;
}
/* Returns true as soon as fn returns true, otherwise false. */
-bool each_symbol(bool (*fn)(const struct symsearch *arr, struct module *owner,
- unsigned int symnum, void *data), void *data)
+bool each_symbol_section(bool (*fn)(const struct symsearch *arr,
+ struct module *owner,
+ void *data),
+ void *data)
{
struct module *mod;
static const struct symsearch arr[] = {
@@ -309,7 +311,7 @@ bool each_symbol(bool (*fn)(const struct symsearch *arr, struct module *owner,
}
return false;
}
-EXPORT_SYMBOL_GPL(each_symbol);
+EXPORT_SYMBOL_GPL(each_symbol_section);
struct find_symbol_arg {
/* Input */
@@ -323,15 +325,12 @@ struct find_symbol_arg {
const struct kernel_symbol *sym;
};
-static bool find_symbol_in_section(const struct symsearch *syms,
- struct module *owner,
- unsigned int symnum, void *data)
+static bool check_symbol(const struct symsearch *syms,
+ struct module *owner,
+ unsigned int symnum, void *data)
{
struct find_symbol_arg *fsa = data;
- if (strcmp(syms->start[symnum].name, fsa->name) != 0)
- return false;
-
if (!fsa->gplok) {
if (syms->licence == GPL_ONLY)
return false;
@@ -365,6 +364,30 @@ static bool find_symbol_in_section(const struct symsearch *syms,
return true;
}
+static int cmp_name(const void *va, const void *vb)
+{
+ const char *a;
+ const struct kernel_symbol *b;
+ a = va; b = vb;
+ return strcmp(a, b->name);
+}
+
+static bool find_symbol_in_section(const struct symsearch *syms,
+ struct module *owner,
+ void *data)
+{
+ struct find_symbol_arg *fsa = data;
+ struct kernel_symbol *sym;
+
+ sym = bsearch(fsa->name, syms->start, syms->stop - syms->start,
+ sizeof(struct kernel_symbol), cmp_name);
+
+ if (sym != NULL && check_symbol(syms, owner, sym - syms->start, data))
+ return true;
+
+ return false;
+}
+
/* Find a symbol and return it, along with, (optional) crc and
* (optional) module which owns it. Needs preempt disabled or module_mutex. */
const struct kernel_symbol *find_symbol(const char *name,
@@ -379,7 +402,7 @@ const struct kernel_symbol *find_symbol(const char *name,
fsa.gplok = gplok;
fsa.warn = warn;
- if (each_symbol(find_symbol_in_section, &fsa)) {
+ if (each_symbol_section(find_symbol_in_section, &fsa)) {
if (owner)
*owner = fsa.owner;
if (crc)
@@ -809,7 +832,7 @@ SYSCALL_DEFINE2(delete_module, const char __user *, name_user,
wait_for_zero_refcount(mod);
mutex_unlock(&module_mutex);
- /* Final destruction now noone is using it. */
+ /* Final destruction now no one is using it. */
if (mod->exit != NULL)
mod->exit();
blocking_notifier_call_chain(&module_notify_list,
@@ -1168,7 +1191,7 @@ static ssize_t module_sect_show(struct module_attribute *mattr,
{
struct module_sect_attr *sattr =
container_of(mattr, struct module_sect_attr, mattr);
- return sprintf(buf, "0x%lx\n", sattr->address);
+ return sprintf(buf, "0x%pK\n", (void *)sattr->address);
}
static void free_sect_attrs(struct module_sect_attrs *sect_attrs)
@@ -1607,27 +1630,28 @@ static void set_section_ro_nx(void *base,
}
}
-/* Setting memory back to RW+NX before releasing it */
-void unset_section_ro_nx(struct module *mod, void *module_region)
+static void unset_module_core_ro_nx(struct module *mod)
{
- unsigned long total_pages;
-
- if (mod->module_core == module_region) {
- /* Set core as NX+RW */
- total_pages = MOD_NUMBER_OF_PAGES(mod->module_core, mod->core_size);
- set_memory_nx((unsigned long)mod->module_core, total_pages);
- set_memory_rw((unsigned long)mod->module_core, total_pages);
+ set_page_attributes(mod->module_core + mod->core_text_size,
+ mod->module_core + mod->core_size,
+ set_memory_x);
+ set_page_attributes(mod->module_core,
+ mod->module_core + mod->core_ro_size,
+ set_memory_rw);
+}
- } else if (mod->module_init == module_region) {
- /* Set init as NX+RW */
- total_pages = MOD_NUMBER_OF_PAGES(mod->module_init, mod->init_size);
- set_memory_nx((unsigned long)mod->module_init, total_pages);
- set_memory_rw((unsigned long)mod->module_init, total_pages);
- }
+static void unset_module_init_ro_nx(struct module *mod)
+{
+ set_page_attributes(mod->module_init + mod->init_text_size,
+ mod->module_init + mod->init_size,
+ set_memory_x);
+ set_page_attributes(mod->module_init,
+ mod->module_init + mod->init_ro_size,
+ set_memory_rw);
}
/* Iterate through all modules and set each module's text as RW */
-void set_all_modules_text_rw()
+void set_all_modules_text_rw(void)
{
struct module *mod;
@@ -1648,7 +1672,7 @@ void set_all_modules_text_rw()
}
/* Iterate through all modules and set each module's text as RO */
-void set_all_modules_text_ro()
+void set_all_modules_text_ro(void)
{
struct module *mod;
@@ -1669,7 +1693,8 @@ void set_all_modules_text_ro()
}
#else
static inline void set_section_ro_nx(void *base, unsigned long text_size, unsigned long ro_size, unsigned long total_size) { }
-static inline void unset_section_ro_nx(struct module *mod, void *module_region) { }
+static void unset_module_core_ro_nx(struct module *mod) { }
+static void unset_module_init_ro_nx(struct module *mod) { }
#endif
/* Free a module, remove from lists, etc. */
@@ -1696,7 +1721,7 @@ static void free_module(struct module *mod)
destroy_params(mod->kp, mod->num_kp);
/* This may be NULL, but that's OK */
- unset_section_ro_nx(mod, mod->module_init);
+ unset_module_init_ro_nx(mod);
module_free(mod, mod->module_init);
kfree(mod->args);
percpu_modfree(mod);
@@ -1705,7 +1730,7 @@ static void free_module(struct module *mod)
lockdep_free_key_range(mod->module_core, mod->core_size);
/* Finally, free the core (containing the module structure) */
- unset_section_ro_nx(mod, mod->module_core);
+ unset_module_core_ro_nx(mod);
module_free(mod, mod->module_core);
#ifdef CONFIG_MPU
@@ -2030,11 +2055,8 @@ static const struct kernel_symbol *lookup_symbol(const char *name,
const struct kernel_symbol *start,
const struct kernel_symbol *stop)
{
- const struct kernel_symbol *ks = start;
- for (; ks < stop; ks++)
- if (strcmp(ks->name, name) == 0)
- return ks;
- return NULL;
+ return bsearch(name, start, stop - start,
+ sizeof(struct kernel_symbol), cmp_name);
}
static int is_exported(const char *name, unsigned long value,
@@ -2777,7 +2799,7 @@ static struct module *load_module(void __user *umod,
mod->state = MODULE_STATE_COMING;
/* Now sew it into the lists so we can get lockdep and oops
- * info during argument parsing. Noone should access us, since
+ * info during argument parsing. No one should access us, since
* strong_try_module_get() will fail.
* lockdep/oops can run asynchronous, so use the RCU list insertion
* function to insert in a way safe to concurrent readers.
@@ -2790,7 +2812,7 @@ static struct module *load_module(void __user *umod,
}
/* This has to be done once we're sure module name is unique. */
- if (!mod->taints)
+ if (!mod->taints || mod->taints == (1U<<TAINT_CRAP))
dynamic_debug_setup(info.debug, info.num_debug);
/* Find duplicate symbols */
@@ -2827,7 +2849,7 @@ static struct module *load_module(void __user *umod,
module_bug_cleanup(mod);
ddebug:
- if (!mod->taints)
+ if (!mod->taints || mod->taints == (1U<<TAINT_CRAP))
dynamic_debug_remove(info.debug);
unlock:
mutex_unlock(&module_mutex);
@@ -2931,10 +2953,11 @@ SYSCALL_DEFINE3(init_module, void __user *, umod,
mod->symtab = mod->core_symtab;
mod->strtab = mod->core_strtab;
#endif
- unset_section_ro_nx(mod, mod->module_init);
+ unset_module_init_ro_nx(mod);
module_free(mod, mod->module_init);
mod->module_init = NULL;
mod->init_size = 0;
+ mod->init_ro_size = 0;
mod->init_text_size = 0;
mutex_unlock(&module_mutex);
@@ -2971,7 +2994,7 @@ static const char *get_ksymbol(struct module *mod,
else
nextval = (unsigned long)mod->module_core+mod->core_text_size;
- /* Scan for closest preceeding symbol, and next symbol. (ELF
+ /* Scan for closest preceding symbol, and next symbol. (ELF
starts real symbols at 1). */
for (i = 1; i < mod->num_symtab; i++) {
if (mod->symtab[i].st_shndx == SHN_UNDEF)
@@ -3224,7 +3247,7 @@ static int m_show(struct seq_file *m, void *p)
mod->state == MODULE_STATE_COMING ? "Loading":
"Live");
/* Used by oprofile and other similar tools. */
- seq_printf(m, " 0x%p", mod->module_core);
+ seq_printf(m, " 0x%pK", mod->module_core);
/* Taints info */
if (mod->taints)
diff --git a/kernel/mutex-debug.c b/kernel/mutex-debug.c
index ec815a960b5d..73da83aff418 100644
--- a/kernel/mutex-debug.c
+++ b/kernel/mutex-debug.c
@@ -75,7 +75,7 @@ void debug_mutex_unlock(struct mutex *lock)
return;
DEBUG_LOCKS_WARN_ON(lock->magic != lock);
- DEBUG_LOCKS_WARN_ON(lock->owner != current_thread_info());
+ DEBUG_LOCKS_WARN_ON(lock->owner != current);
DEBUG_LOCKS_WARN_ON(!lock->wait_list.prev && !lock->wait_list.next);
mutex_clear_owner(lock);
}
diff --git a/kernel/mutex-debug.h b/kernel/mutex-debug.h
index 57d527a16f9d..0799fd3e4cfa 100644
--- a/kernel/mutex-debug.h
+++ b/kernel/mutex-debug.h
@@ -29,7 +29,7 @@ extern void debug_mutex_init(struct mutex *lock, const char *name,
static inline void mutex_set_owner(struct mutex *lock)
{
- lock->owner = current_thread_info();
+ lock->owner = current;
}
static inline void mutex_clear_owner(struct mutex *lock)
diff --git a/kernel/mutex.c b/kernel/mutex.c
index a5889fb28ecf..d607ed5dd441 100644
--- a/kernel/mutex.c
+++ b/kernel/mutex.c
@@ -131,14 +131,14 @@ EXPORT_SYMBOL(mutex_unlock);
*/
static inline int __sched
__mutex_lock_common(struct mutex *lock, long state, unsigned int subclass,
- unsigned long ip)
+ struct lockdep_map *nest_lock, unsigned long ip)
{
struct task_struct *task = current;
struct mutex_waiter waiter;
unsigned long flags;
preempt_disable();
- mutex_acquire(&lock->dep_map, subclass, 0, ip);
+ mutex_acquire_nest(&lock->dep_map, subclass, 0, nest_lock, ip);
#ifdef CONFIG_MUTEX_SPIN_ON_OWNER
/*
@@ -160,14 +160,7 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass,
*/
for (;;) {
- struct thread_info *owner;
-
- /*
- * If we own the BKL, then don't spin. The owner of
- * the mutex might be waiting on us to release the BKL.
- */
- if (unlikely(current->lock_depth >= 0))
- break;
+ struct task_struct *owner;
/*
* If there's an owner, wait for it to either
@@ -245,7 +238,7 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass,
}
__set_task_state(task, state);
- /* didnt get the lock, go to sleep: */
+ /* didn't get the lock, go to sleep: */
spin_unlock_mutex(&lock->wait_lock, flags);
preempt_enable_no_resched();
schedule();
@@ -276,16 +269,25 @@ void __sched
mutex_lock_nested(struct mutex *lock, unsigned int subclass)
{
might_sleep();
- __mutex_lock_common(lock, TASK_UNINTERRUPTIBLE, subclass, _RET_IP_);
+ __mutex_lock_common(lock, TASK_UNINTERRUPTIBLE, subclass, NULL, _RET_IP_);
}
EXPORT_SYMBOL_GPL(mutex_lock_nested);
+void __sched
+_mutex_lock_nest_lock(struct mutex *lock, struct lockdep_map *nest)
+{
+ might_sleep();
+ __mutex_lock_common(lock, TASK_UNINTERRUPTIBLE, 0, nest, _RET_IP_);
+}
+
+EXPORT_SYMBOL_GPL(_mutex_lock_nest_lock);
+
int __sched
mutex_lock_killable_nested(struct mutex *lock, unsigned int subclass)
{
might_sleep();
- return __mutex_lock_common(lock, TASK_KILLABLE, subclass, _RET_IP_);
+ return __mutex_lock_common(lock, TASK_KILLABLE, subclass, NULL, _RET_IP_);
}
EXPORT_SYMBOL_GPL(mutex_lock_killable_nested);
@@ -294,7 +296,7 @@ mutex_lock_interruptible_nested(struct mutex *lock, unsigned int subclass)
{
might_sleep();
return __mutex_lock_common(lock, TASK_INTERRUPTIBLE,
- subclass, _RET_IP_);
+ subclass, NULL, _RET_IP_);
}
EXPORT_SYMBOL_GPL(mutex_lock_interruptible_nested);
@@ -400,7 +402,7 @@ __mutex_lock_slowpath(atomic_t *lock_count)
{
struct mutex *lock = container_of(lock_count, struct mutex, count);
- __mutex_lock_common(lock, TASK_UNINTERRUPTIBLE, 0, _RET_IP_);
+ __mutex_lock_common(lock, TASK_UNINTERRUPTIBLE, 0, NULL, _RET_IP_);
}
static noinline int __sched
@@ -408,7 +410,7 @@ __mutex_lock_killable_slowpath(atomic_t *lock_count)
{
struct mutex *lock = container_of(lock_count, struct mutex, count);
- return __mutex_lock_common(lock, TASK_KILLABLE, 0, _RET_IP_);
+ return __mutex_lock_common(lock, TASK_KILLABLE, 0, NULL, _RET_IP_);
}
static noinline int __sched
@@ -416,7 +418,7 @@ __mutex_lock_interruptible_slowpath(atomic_t *lock_count)
{
struct mutex *lock = container_of(lock_count, struct mutex, count);
- return __mutex_lock_common(lock, TASK_INTERRUPTIBLE, 0, _RET_IP_);
+ return __mutex_lock_common(lock, TASK_INTERRUPTIBLE, 0, NULL, _RET_IP_);
}
#endif
diff --git a/kernel/mutex.h b/kernel/mutex.h
index 67578ca48f94..4115fbf83b12 100644
--- a/kernel/mutex.h
+++ b/kernel/mutex.h
@@ -19,7 +19,7 @@
#ifdef CONFIG_SMP
static inline void mutex_set_owner(struct mutex *lock)
{
- lock->owner = current_thread_info();
+ lock->owner = current;
}
static inline void mutex_clear_owner(struct mutex *lock)
diff --git a/kernel/ns_cgroup.c b/kernel/ns_cgroup.c
deleted file mode 100644
index 2c98ad94ba0e..000000000000
--- a/kernel/ns_cgroup.c
+++ /dev/null
@@ -1,118 +0,0 @@
-/*
- * ns_cgroup.c - namespace cgroup subsystem
- *
- * Copyright 2006, 2007 IBM Corp
- */
-
-#include <linux/module.h>
-#include <linux/cgroup.h>
-#include <linux/fs.h>
-#include <linux/proc_fs.h>
-#include <linux/slab.h>
-#include <linux/nsproxy.h>
-
-struct ns_cgroup {
- struct cgroup_subsys_state css;
-};
-
-struct cgroup_subsys ns_subsys;
-
-static inline struct ns_cgroup *cgroup_to_ns(
- struct cgroup *cgroup)
-{
- return container_of(cgroup_subsys_state(cgroup, ns_subsys_id),
- struct ns_cgroup, css);
-}
-
-int ns_cgroup_clone(struct task_struct *task, struct pid *pid)
-{
- char name[PROC_NUMBUF];
-
- snprintf(name, PROC_NUMBUF, "%d", pid_vnr(pid));
- return cgroup_clone(task, &ns_subsys, name);
-}
-
-/*
- * Rules:
- * 1. you can only enter a cgroup which is a descendant of your current
- * cgroup
- * 2. you can only place another process into a cgroup if
- * a. you have CAP_SYS_ADMIN
- * b. your cgroup is an ancestor of task's destination cgroup
- * (hence either you are in the same cgroup as task, or in an
- * ancestor cgroup thereof)
- */
-static int ns_can_attach(struct cgroup_subsys *ss, struct cgroup *new_cgroup,
- struct task_struct *task, bool threadgroup)
-{
- if (current != task) {
- if (!capable(CAP_SYS_ADMIN))
- return -EPERM;
-
- if (!cgroup_is_descendant(new_cgroup, current))
- return -EPERM;
- }
-
- if (!cgroup_is_descendant(new_cgroup, task))
- return -EPERM;
-
- if (threadgroup) {
- struct task_struct *c;
- rcu_read_lock();
- list_for_each_entry_rcu(c, &task->thread_group, thread_group) {
- if (!cgroup_is_descendant(new_cgroup, c)) {
- rcu_read_unlock();
- return -EPERM;
- }
- }
- rcu_read_unlock();
- }
-
- return 0;
-}
-
-/*
- * Rules: you can only create a cgroup if
- * 1. you are capable(CAP_SYS_ADMIN)
- * 2. the target cgroup is a descendant of your own cgroup
- */
-static struct cgroup_subsys_state *ns_create(struct cgroup_subsys *ss,
- struct cgroup *cgroup)
-{
- struct ns_cgroup *ns_cgroup;
-
- if (!capable(CAP_SYS_ADMIN))
- return ERR_PTR(-EPERM);
- if (!cgroup_is_descendant(cgroup, current))
- return ERR_PTR(-EPERM);
- if (test_bit(CGRP_CLONE_CHILDREN, &cgroup->flags)) {
- printk("ns_cgroup can't be created with parent "
- "'clone_children' set.\n");
- return ERR_PTR(-EINVAL);
- }
-
- printk_once("ns_cgroup deprecated: consider using the "
- "'clone_children' flag without the ns_cgroup.\n");
-
- ns_cgroup = kzalloc(sizeof(*ns_cgroup), GFP_KERNEL);
- if (!ns_cgroup)
- return ERR_PTR(-ENOMEM);
- return &ns_cgroup->css;
-}
-
-static void ns_destroy(struct cgroup_subsys *ss,
- struct cgroup *cgroup)
-{
- struct ns_cgroup *ns_cgroup;
-
- ns_cgroup = cgroup_to_ns(cgroup);
- kfree(ns_cgroup);
-}
-
-struct cgroup_subsys ns_subsys = {
- .name = "ns",
- .can_attach = ns_can_attach,
- .create = ns_create,
- .destroy = ns_destroy,
- .subsys_id = ns_subsys_id,
-};
diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c
index f74e6c00e26d..d6a00f3de15d 100644
--- a/kernel/nsproxy.c
+++ b/kernel/nsproxy.c
@@ -22,6 +22,9 @@
#include <linux/pid_namespace.h>
#include <net/net_namespace.h>
#include <linux/ipc_namespace.h>
+#include <linux/proc_fs.h>
+#include <linux/file.h>
+#include <linux/syscalls.h>
static struct kmem_cache *nsproxy_cachep;
@@ -69,13 +72,13 @@ static struct nsproxy *create_new_namespaces(unsigned long flags,
goto out_ns;
}
- new_nsp->uts_ns = copy_utsname(flags, tsk->nsproxy->uts_ns);
+ new_nsp->uts_ns = copy_utsname(flags, tsk);
if (IS_ERR(new_nsp->uts_ns)) {
err = PTR_ERR(new_nsp->uts_ns);
goto out_uts;
}
- new_nsp->ipc_ns = copy_ipcs(flags, tsk->nsproxy->ipc_ns);
+ new_nsp->ipc_ns = copy_ipcs(flags, tsk);
if (IS_ERR(new_nsp->ipc_ns)) {
err = PTR_ERR(new_nsp->ipc_ns);
goto out_ipc;
@@ -198,10 +201,6 @@ int unshare_nsproxy_namespaces(unsigned long unshare_flags,
goto out;
}
- err = ns_cgroup_clone(current, task_pid(current));
- if (err)
- put_nsproxy(*new_nsp);
-
out:
return err;
}
@@ -233,6 +232,45 @@ void exit_task_namespaces(struct task_struct *p)
switch_task_namespaces(p, NULL);
}
+SYSCALL_DEFINE2(setns, int, fd, int, nstype)
+{
+ const struct proc_ns_operations *ops;
+ struct task_struct *tsk = current;
+ struct nsproxy *new_nsproxy;
+ struct proc_inode *ei;
+ struct file *file;
+ int err;
+
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ file = proc_ns_fget(fd);
+ if (IS_ERR(file))
+ return PTR_ERR(file);
+
+ err = -EINVAL;
+ ei = PROC_I(file->f_dentry->d_inode);
+ ops = ei->ns_ops;
+ if (nstype && (ops->type != nstype))
+ goto out;
+
+ new_nsproxy = create_new_namespaces(0, tsk, tsk->fs);
+ if (IS_ERR(new_nsproxy)) {
+ err = PTR_ERR(new_nsproxy);
+ goto out;
+ }
+
+ err = ops->install(new_nsproxy, ei->ns);
+ if (err) {
+ free_nsproxy(new_nsproxy);
+ goto out;
+ }
+ switch_task_namespaces(tsk, new_nsproxy);
+out:
+ fput(file);
+ return err;
+}
+
static int __init nsproxy_cache_init(void)
{
nsproxy_cachep = KMEM_CACHE(nsproxy, SLAB_PANIC);
diff --git a/kernel/padata.c b/kernel/padata.c
index 751019415d23..b91941df5e63 100644
--- a/kernel/padata.c
+++ b/kernel/padata.c
@@ -262,7 +262,7 @@ static void padata_reorder(struct parallel_data *pd)
/*
* This cpu has to do the parallel processing of the next
* object. It's waiting in the cpu's parallelization queue,
- * so exit imediately.
+ * so exit immediately.
*/
if (PTR_ERR(padata) == -ENODATA) {
del_timer(&pd->timer);
@@ -284,7 +284,7 @@ static void padata_reorder(struct parallel_data *pd)
/*
* The next object that needs serialization might have arrived to
* the reorder queues in the meantime, we will be called again
- * from the timer function if noone else cares for it.
+ * from the timer function if no one else cares for it.
*/
if (atomic_read(&pd->reorder_objects)
&& !(pinst->flags & PADATA_RESET))
@@ -515,7 +515,7 @@ static void __padata_stop(struct padata_instance *pinst)
put_online_cpus();
}
-/* Replace the internal control stucture with a new one. */
+/* Replace the internal control structure with a new one. */
static void padata_replace(struct padata_instance *pinst,
struct parallel_data *pd_new)
{
@@ -768,7 +768,7 @@ static int __padata_remove_cpu(struct padata_instance *pinst, int cpu)
}
/**
- * padata_remove_cpu - remove a cpu from the one or both(serial and paralell)
+ * padata_remove_cpu - remove a cpu from the one or both(serial and parallel)
* padata cpumasks.
*
* @pinst: padata instance
diff --git a/kernel/panic.c b/kernel/panic.c
index 991bb87a1704..69231670eb95 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -433,3 +433,13 @@ EXPORT_SYMBOL(__stack_chk_fail);
core_param(panic, panic_timeout, int, 0644);
core_param(pause_on_oops, pause_on_oops, int, 0644);
+
+static int __init oops_setup(char *s)
+{
+ if (!s)
+ return -EINVAL;
+ if (!strcmp(s, "panic"))
+ panic_on_oops = 1;
+ return 0;
+}
+early_param("oops", oops_setup);
diff --git a/kernel/params.c b/kernel/params.c
index 0da1411222b9..ed72e1330862 100644
--- a/kernel/params.c
+++ b/kernel/params.c
@@ -95,7 +95,7 @@ static int parse_one(char *param,
/* Find parameter */
for (i = 0; i < num_params; i++) {
if (parameq(param, params[i].name)) {
- /* Noone handled NULL, so do it here. */
+ /* No one handled NULL, so do it here. */
if (!val && params[i].ops->set != param_set_bool)
return -EINVAL;
DEBUGP("They are equal! Calling %p\n",
@@ -297,21 +297,15 @@ EXPORT_SYMBOL(param_ops_charp);
int param_set_bool(const char *val, const struct kernel_param *kp)
{
bool v;
+ int ret;
/* No equals means "set"... */
if (!val) val = "1";
/* One of =[yYnN01] */
- switch (val[0]) {
- case 'y': case 'Y': case '1':
- v = true;
- break;
- case 'n': case 'N': case '0':
- v = false;
- break;
- default:
- return -EINVAL;
- }
+ ret = strtobool(val, &v);
+ if (ret)
+ return ret;
if (kp->flags & KPARAM_ISBOOL)
*(bool *)kp->arg = v;
@@ -821,15 +815,18 @@ ssize_t __modver_version_show(struct module_attribute *mattr,
return sprintf(buf, "%s\n", vattr->version);
}
-extern struct module_version_attribute __start___modver[], __stop___modver[];
+extern const struct module_version_attribute *__start___modver[];
+extern const struct module_version_attribute *__stop___modver[];
static void __init version_sysfs_builtin(void)
{
- const struct module_version_attribute *vattr;
+ const struct module_version_attribute **p;
struct module_kobject *mk;
int err;
- for (vattr = __start___modver; vattr < __stop___modver; vattr++) {
+ for (p = __start___modver; p < __stop___modver; p++) {
+ const struct module_version_attribute *vattr = *p;
+
mk = locate_module_kobject(vattr->module_name);
if (mk) {
err = sysfs_create_file(&mk->kobj, &vattr->mattr.attr);
diff --git a/kernel/pid.c b/kernel/pid.c
index 39b65b69584f..57a8346a270e 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -217,11 +217,14 @@ static int alloc_pidmap(struct pid_namespace *pid_ns)
return -1;
}
-int next_pidmap(struct pid_namespace *pid_ns, int last)
+int next_pidmap(struct pid_namespace *pid_ns, unsigned int last)
{
int offset;
struct pidmap *map, *end;
+ if (last >= PID_MAX_LIMIT)
+ return -1;
+
offset = (last + 1) & BITS_PER_PAGE_MASK;
map = &pid_ns->pidmap[(last + 1)/BITS_PER_PAGE];
end = &pid_ns->pidmap[PIDMAP_ENTRIES];
@@ -435,6 +438,7 @@ struct pid *get_task_pid(struct task_struct *task, enum pid_type type)
rcu_read_unlock();
return pid;
}
+EXPORT_SYMBOL_GPL(get_task_pid);
struct task_struct *get_pid_task(struct pid *pid, enum pid_type type)
{
@@ -446,6 +450,7 @@ struct task_struct *get_pid_task(struct pid *pid, enum pid_type type)
rcu_read_unlock();
return result;
}
+EXPORT_SYMBOL_GPL(get_pid_task);
struct pid *find_get_pid(pid_t nr)
{
diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c
index a5aff94e1f0b..e9c9adc84ca6 100644
--- a/kernel/pid_namespace.c
+++ b/kernel/pid_namespace.c
@@ -14,6 +14,7 @@
#include <linux/err.h>
#include <linux/acct.h>
#include <linux/slab.h>
+#include <linux/proc_fs.h>
#define BITS_PER_PAGE (PAGE_SIZE*8)
@@ -72,7 +73,7 @@ static struct pid_namespace *create_pid_namespace(struct pid_namespace *parent_p
{
struct pid_namespace *ns;
unsigned int level = parent_pid_ns->level + 1;
- int i;
+ int i, err = -ENOMEM;
ns = kmem_cache_zalloc(pid_ns_cachep, GFP_KERNEL);
if (ns == NULL)
@@ -96,14 +97,20 @@ static struct pid_namespace *create_pid_namespace(struct pid_namespace *parent_p
for (i = 1; i < PIDMAP_ENTRIES; i++)
atomic_set(&ns->pidmap[i].nr_free, BITS_PER_PAGE);
+ err = pid_ns_prepare_proc(ns);
+ if (err)
+ goto out_put_parent_pid_ns;
+
return ns;
+out_put_parent_pid_ns:
+ put_pid_ns(parent_pid_ns);
out_free_map:
kfree(ns->pidmap[0].page);
out_free:
kmem_cache_free(pid_ns_cachep, ns);
out:
- return ERR_PTR(-ENOMEM);
+ return ERR_PTR(err);
}
static void destroy_pid_namespace(struct pid_namespace *ns)
diff --git a/kernel/pm_qos_params.c b/kernel/pm_qos_params.c
index 6a8fad82a3ad..6824ca7d4d0c 100644
--- a/kernel/pm_qos_params.c
+++ b/kernel/pm_qos_params.c
@@ -40,6 +40,7 @@
#include <linux/string.h>
#include <linux/platform_device.h>
#include <linux/init.h>
+#include <linux/kernel.h>
#include <linux/uaccess.h>
@@ -112,11 +113,14 @@ static struct pm_qos_object *pm_qos_array[] = {
static ssize_t pm_qos_power_write(struct file *filp, const char __user *buf,
size_t count, loff_t *f_pos);
+static ssize_t pm_qos_power_read(struct file *filp, char __user *buf,
+ size_t count, loff_t *f_pos);
static int pm_qos_power_open(struct inode *inode, struct file *filp);
static int pm_qos_power_release(struct inode *inode, struct file *filp);
static const struct file_operations pm_qos_power_fops = {
.write = pm_qos_power_write,
+ .read = pm_qos_power_read,
.open = pm_qos_power_open,
.release = pm_qos_power_release,
.llseek = noop_llseek,
@@ -389,28 +393,61 @@ static int pm_qos_power_release(struct inode *inode, struct file *filp)
}
+static ssize_t pm_qos_power_read(struct file *filp, char __user *buf,
+ size_t count, loff_t *f_pos)
+{
+ s32 value;
+ unsigned long flags;
+ struct pm_qos_object *o;
+ struct pm_qos_request_list *pm_qos_req = filp->private_data;
+
+ if (!pm_qos_req)
+ return -EINVAL;
+ if (!pm_qos_request_active(pm_qos_req))
+ return -EINVAL;
+
+ o = pm_qos_array[pm_qos_req->pm_qos_class];
+ spin_lock_irqsave(&pm_qos_lock, flags);
+ value = pm_qos_get_value(o);
+ spin_unlock_irqrestore(&pm_qos_lock, flags);
+
+ return simple_read_from_buffer(buf, count, f_pos, &value, sizeof(s32));
+}
+
static ssize_t pm_qos_power_write(struct file *filp, const char __user *buf,
size_t count, loff_t *f_pos)
{
s32 value;
- int x;
- char ascii_value[11];
struct pm_qos_request_list *pm_qos_req;
if (count == sizeof(s32)) {
if (copy_from_user(&value, buf, sizeof(s32)))
return -EFAULT;
- } else if (count == 11) { /* len('0x12345678/0') */
- if (copy_from_user(ascii_value, buf, 11))
+ } else if (count <= 11) { /* ASCII perhaps? */
+ char ascii_value[11];
+ unsigned long int ulval;
+ int ret;
+
+ if (copy_from_user(ascii_value, buf, count))
return -EFAULT;
- if (strlen(ascii_value) != 10)
- return -EINVAL;
- x = sscanf(ascii_value, "%x", &value);
- if (x != 1)
+
+ if (count > 10) {
+ if (ascii_value[10] == '\n')
+ ascii_value[10] = '\0';
+ else
+ return -EINVAL;
+ } else {
+ ascii_value[count] = '\0';
+ }
+ ret = strict_strtoul(ascii_value, 16, &ulval);
+ if (ret) {
+ pr_debug("%s, 0x%lx, 0x%x\n", ascii_value, ulval, ret);
return -EINVAL;
- pr_debug("%s, %d, 0x%x\n", ascii_value, x, value);
- } else
+ }
+ value = (s32)lower_32_bits(ulval);
+ } else {
return -EINVAL;
+ }
pm_qos_req = filp->private_data;
pm_qos_update_request(pm_qos_req, value);
diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c
index 05bb7173850e..58f405b581e7 100644
--- a/kernel/posix-cpu-timers.c
+++ b/kernel/posix-cpu-timers.c
@@ -176,7 +176,8 @@ static inline cputime_t virt_ticks(struct task_struct *p)
return p->utime;
}
-int posix_cpu_clock_getres(const clockid_t which_clock, struct timespec *tp)
+static int
+posix_cpu_clock_getres(const clockid_t which_clock, struct timespec *tp)
{
int error = check_clock(which_clock);
if (!error) {
@@ -194,7 +195,8 @@ int posix_cpu_clock_getres(const clockid_t which_clock, struct timespec *tp)
return error;
}
-int posix_cpu_clock_set(const clockid_t which_clock, const struct timespec *tp)
+static int
+posix_cpu_clock_set(const clockid_t which_clock, const struct timespec *tp)
{
/*
* You can never reset a CPU clock, but we check for other errors
@@ -317,7 +319,7 @@ static int cpu_clock_sample_group(const clockid_t which_clock,
}
-int posix_cpu_clock_get(const clockid_t which_clock, struct timespec *tp)
+static int posix_cpu_clock_get(const clockid_t which_clock, struct timespec *tp)
{
const pid_t pid = CPUCLOCK_PID(which_clock);
int error = -EINVAL;
@@ -379,7 +381,7 @@ int posix_cpu_clock_get(const clockid_t which_clock, struct timespec *tp)
* This is called from sys_timer_create() and do_cpu_nanosleep() with the
* new timer already all-zeros initialized.
*/
-int posix_cpu_timer_create(struct k_itimer *new_timer)
+static int posix_cpu_timer_create(struct k_itimer *new_timer)
{
int ret = 0;
const pid_t pid = CPUCLOCK_PID(new_timer->it_clock);
@@ -425,7 +427,7 @@ int posix_cpu_timer_create(struct k_itimer *new_timer)
* If we return TIMER_RETRY, it's necessary to release the timer's lock
* and try again. (This happens when the timer is in the middle of firing.)
*/
-int posix_cpu_timer_del(struct k_itimer *timer)
+static int posix_cpu_timer_del(struct k_itimer *timer)
{
struct task_struct *p = timer->it.cpu.task;
int ret = 0;
@@ -665,8 +667,8 @@ static int cpu_timer_sample_group(const clockid_t which_clock,
* If we return TIMER_RETRY, it's necessary to release the timer's lock
* and try again. (This happens when the timer is in the middle of firing.)
*/
-int posix_cpu_timer_set(struct k_itimer *timer, int flags,
- struct itimerspec *new, struct itimerspec *old)
+static int posix_cpu_timer_set(struct k_itimer *timer, int flags,
+ struct itimerspec *new, struct itimerspec *old)
{
struct task_struct *p = timer->it.cpu.task;
union cpu_time_count old_expires, new_expires, old_incr, val;
@@ -820,7 +822,7 @@ int posix_cpu_timer_set(struct k_itimer *timer, int flags,
return ret;
}
-void posix_cpu_timer_get(struct k_itimer *timer, struct itimerspec *itp)
+static void posix_cpu_timer_get(struct k_itimer *timer, struct itimerspec *itp)
{
union cpu_time_count now;
struct task_struct *p = timer->it.cpu.task;
@@ -1345,7 +1347,7 @@ void run_posix_cpu_timers(struct task_struct *tsk)
/*
* Now that all the timers on our list have the firing flag,
- * noone will touch their list entries but us. We'll take
+ * no one will touch their list entries but us. We'll take
* each timer's lock before clearing its firing flag, so no
* timer call will interfere.
*/
@@ -1481,11 +1483,13 @@ static int do_cpu_nanosleep(const clockid_t which_clock, int flags,
return error;
}
-int posix_cpu_nsleep(const clockid_t which_clock, int flags,
- struct timespec *rqtp, struct timespec __user *rmtp)
+static long posix_cpu_nsleep_restart(struct restart_block *restart_block);
+
+static int posix_cpu_nsleep(const clockid_t which_clock, int flags,
+ struct timespec *rqtp, struct timespec __user *rmtp)
{
struct restart_block *restart_block =
- &current_thread_info()->restart_block;
+ &current_thread_info()->restart_block;
struct itimerspec it;
int error;
@@ -1501,56 +1505,47 @@ int posix_cpu_nsleep(const clockid_t which_clock, int flags,
if (error == -ERESTART_RESTARTBLOCK) {
- if (flags & TIMER_ABSTIME)
+ if (flags & TIMER_ABSTIME)
return -ERESTARTNOHAND;
/*
- * Report back to the user the time still remaining.
- */
- if (rmtp != NULL && copy_to_user(rmtp, &it.it_value, sizeof *rmtp))
+ * Report back to the user the time still remaining.
+ */
+ if (rmtp && copy_to_user(rmtp, &it.it_value, sizeof *rmtp))
return -EFAULT;
restart_block->fn = posix_cpu_nsleep_restart;
- restart_block->arg0 = which_clock;
- restart_block->arg1 = (unsigned long) rmtp;
- restart_block->arg2 = rqtp->tv_sec;
- restart_block->arg3 = rqtp->tv_nsec;
+ restart_block->nanosleep.clockid = which_clock;
+ restart_block->nanosleep.rmtp = rmtp;
+ restart_block->nanosleep.expires = timespec_to_ns(rqtp);
}
return error;
}
-long posix_cpu_nsleep_restart(struct restart_block *restart_block)
+static long posix_cpu_nsleep_restart(struct restart_block *restart_block)
{
- clockid_t which_clock = restart_block->arg0;
- struct timespec __user *rmtp;
+ clockid_t which_clock = restart_block->nanosleep.clockid;
struct timespec t;
struct itimerspec it;
int error;
- rmtp = (struct timespec __user *) restart_block->arg1;
- t.tv_sec = restart_block->arg2;
- t.tv_nsec = restart_block->arg3;
+ t = ns_to_timespec(restart_block->nanosleep.expires);
- restart_block->fn = do_no_restart_syscall;
error = do_cpu_nanosleep(which_clock, TIMER_ABSTIME, &t, &it);
if (error == -ERESTART_RESTARTBLOCK) {
+ struct timespec __user *rmtp = restart_block->nanosleep.rmtp;
/*
- * Report back to the user the time still remaining.
- */
- if (rmtp != NULL && copy_to_user(rmtp, &it.it_value, sizeof *rmtp))
+ * Report back to the user the time still remaining.
+ */
+ if (rmtp && copy_to_user(rmtp, &it.it_value, sizeof *rmtp))
return -EFAULT;
- restart_block->fn = posix_cpu_nsleep_restart;
- restart_block->arg0 = which_clock;
- restart_block->arg1 = (unsigned long) rmtp;
- restart_block->arg2 = t.tv_sec;
- restart_block->arg3 = t.tv_nsec;
+ restart_block->nanosleep.expires = timespec_to_ns(&t);
}
return error;
}
-
#define PROCESS_CLOCK MAKE_PROCESS_CPUCLOCK(0, CPUCLOCK_SCHED)
#define THREAD_CLOCK MAKE_THREAD_CPUCLOCK(0, CPUCLOCK_SCHED)
@@ -1594,38 +1589,37 @@ static int thread_cpu_timer_create(struct k_itimer *timer)
timer->it_clock = THREAD_CLOCK;
return posix_cpu_timer_create(timer);
}
-static int thread_cpu_nsleep(const clockid_t which_clock, int flags,
- struct timespec *rqtp, struct timespec __user *rmtp)
-{
- return -EINVAL;
-}
-static long thread_cpu_nsleep_restart(struct restart_block *restart_block)
-{
- return -EINVAL;
-}
+
+struct k_clock clock_posix_cpu = {
+ .clock_getres = posix_cpu_clock_getres,
+ .clock_set = posix_cpu_clock_set,
+ .clock_get = posix_cpu_clock_get,
+ .timer_create = posix_cpu_timer_create,
+ .nsleep = posix_cpu_nsleep,
+ .nsleep_restart = posix_cpu_nsleep_restart,
+ .timer_set = posix_cpu_timer_set,
+ .timer_del = posix_cpu_timer_del,
+ .timer_get = posix_cpu_timer_get,
+};
static __init int init_posix_cpu_timers(void)
{
struct k_clock process = {
- .clock_getres = process_cpu_clock_getres,
- .clock_get = process_cpu_clock_get,
- .clock_set = do_posix_clock_nosettime,
- .timer_create = process_cpu_timer_create,
- .nsleep = process_cpu_nsleep,
- .nsleep_restart = process_cpu_nsleep_restart,
+ .clock_getres = process_cpu_clock_getres,
+ .clock_get = process_cpu_clock_get,
+ .timer_create = process_cpu_timer_create,
+ .nsleep = process_cpu_nsleep,
+ .nsleep_restart = process_cpu_nsleep_restart,
};
struct k_clock thread = {
- .clock_getres = thread_cpu_clock_getres,
- .clock_get = thread_cpu_clock_get,
- .clock_set = do_posix_clock_nosettime,
- .timer_create = thread_cpu_timer_create,
- .nsleep = thread_cpu_nsleep,
- .nsleep_restart = thread_cpu_nsleep_restart,
+ .clock_getres = thread_cpu_clock_getres,
+ .clock_get = thread_cpu_clock_get,
+ .timer_create = thread_cpu_timer_create,
};
struct timespec ts;
- register_posix_clock(CLOCK_PROCESS_CPUTIME_ID, &process);
- register_posix_clock(CLOCK_THREAD_CPUTIME_ID, &thread);
+ posix_timers_register_clock(CLOCK_PROCESS_CPUTIME_ID, &process);
+ posix_timers_register_clock(CLOCK_THREAD_CPUTIME_ID, &thread);
cputime_to_timespec(cputime_one_jiffy, &ts);
onecputick = ts.tv_nsec;
diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c
index 93bd2eb2bc53..4556182527f3 100644
--- a/kernel/posix-timers.c
+++ b/kernel/posix-timers.c
@@ -41,6 +41,7 @@
#include <linux/init.h>
#include <linux/compiler.h>
#include <linux/idr.h>
+#include <linux/posix-clock.h>
#include <linux/posix-timers.h>
#include <linux/syscalls.h>
#include <linux/wait.h>
@@ -81,6 +82,14 @@ static DEFINE_SPINLOCK(idr_lock);
#error "SIGEV_THREAD_ID must not share bit with other SIGEV values!"
#endif
+/*
+ * parisc wants ENOTSUP instead of EOPNOTSUPP
+ */
+#ifndef ENOTSUP
+# define ENANOSLEEP_NOTSUP EOPNOTSUPP
+#else
+# define ENANOSLEEP_NOTSUP ENOTSUP
+#endif
/*
* The timer ID is turned into a timer address by idr_find().
@@ -94,11 +103,7 @@ static DEFINE_SPINLOCK(idr_lock);
/*
* CLOCKs: The POSIX standard calls for a couple of clocks and allows us
* to implement others. This structure defines the various
- * clocks and allows the possibility of adding others. We
- * provide an interface to add clocks to the table and expect
- * the "arch" code to add at least one clock that is high
- * resolution. Here we define the standard CLOCK_REALTIME as a
- * 1/HZ resolution clock.
+ * clocks.
*
* RESOLUTION: Clock resolution is used to round up timer and interval
* times, NOT to report clock times, which are reported with as
@@ -108,20 +113,13 @@ static DEFINE_SPINLOCK(idr_lock);
* necessary code is written. The standard says we should say
* something about this issue in the documentation...
*
- * FUNCTIONS: The CLOCKs structure defines possible functions to handle
- * various clock functions. For clocks that use the standard
- * system timer code these entries should be NULL. This will
- * allow dispatch without the overhead of indirect function
- * calls. CLOCKS that depend on other sources (e.g. WWV or GPS)
- * must supply functions here, even if the function just returns
- * ENOSYS. The standard POSIX timer management code assumes the
- * following: 1.) The k_itimer struct (sched.h) is used for the
- * timer. 2.) The list, it_lock, it_clock, it_id and it_pid
- * fields are not modified by timer code.
+ * FUNCTIONS: The CLOCKs structure defines possible functions to
+ * handle various clock functions.
*
- * At this time all functions EXCEPT clock_nanosleep can be
- * redirected by the CLOCKS structure. Clock_nanosleep is in
- * there, but the code ignores it.
+ * The standard POSIX timer management code assumes the
+ * following: 1.) The k_itimer struct (sched.h) is used for
+ * the timer. 2.) The list, it_lock, it_clock, it_id and
+ * it_pid fields are not modified by timer code.
*
* Permissions: It is assumed that the clock_settime() function defined
* for each clock will take care of permission checks. Some
@@ -138,6 +136,7 @@ static struct k_clock posix_clocks[MAX_CLOCKS];
*/
static int common_nsleep(const clockid_t, int flags, struct timespec *t,
struct timespec __user *rmtp);
+static int common_timer_create(struct k_itimer *new_timer);
static void common_timer_get(struct k_itimer *, struct itimerspec *);
static int common_timer_set(struct k_itimer *, int,
struct itimerspec *, struct itimerspec *);
@@ -158,76 +157,24 @@ static inline void unlock_timer(struct k_itimer *timr, unsigned long flags)
spin_unlock_irqrestore(&timr->it_lock, flags);
}
-/*
- * Call the k_clock hook function if non-null, or the default function.
- */
-#define CLOCK_DISPATCH(clock, call, arglist) \
- ((clock) < 0 ? posix_cpu_##call arglist : \
- (posix_clocks[clock].call != NULL \
- ? (*posix_clocks[clock].call) arglist : common_##call arglist))
-
-/*
- * Default clock hook functions when the struct k_clock passed
- * to register_posix_clock leaves a function pointer null.
- *
- * The function common_CALL is the default implementation for
- * the function pointer CALL in struct k_clock.
- */
-
-static inline int common_clock_getres(const clockid_t which_clock,
- struct timespec *tp)
-{
- tp->tv_sec = 0;
- tp->tv_nsec = posix_clocks[which_clock].res;
- return 0;
-}
-
-/*
- * Get real time for posix timers
- */
-static int common_clock_get(clockid_t which_clock, struct timespec *tp)
+/* Get clock_realtime */
+static int posix_clock_realtime_get(clockid_t which_clock, struct timespec *tp)
{
ktime_get_real_ts(tp);
return 0;
}
-static inline int common_clock_set(const clockid_t which_clock,
- struct timespec *tp)
+/* Set clock_realtime */
+static int posix_clock_realtime_set(const clockid_t which_clock,
+ const struct timespec *tp)
{
return do_sys_settimeofday(tp, NULL);
}
-static int common_timer_create(struct k_itimer *new_timer)
-{
- hrtimer_init(&new_timer->it.real.timer, new_timer->it_clock, 0);
- return 0;
-}
-
-static int no_timer_create(struct k_itimer *new_timer)
-{
- return -EOPNOTSUPP;
-}
-
-static int no_nsleep(const clockid_t which_clock, int flags,
- struct timespec *tsave, struct timespec __user *rmtp)
-{
- return -EOPNOTSUPP;
-}
-
-/*
- * Return nonzero if we know a priori this clockid_t value is bogus.
- */
-static inline int invalid_clockid(const clockid_t which_clock)
+static int posix_clock_realtime_adj(const clockid_t which_clock,
+ struct timex *t)
{
- if (which_clock < 0) /* CPU clock, posix_cpu_* will check it */
- return 0;
- if ((unsigned) which_clock >= MAX_CLOCKS)
- return 1;
- if (posix_clocks[which_clock].clock_getres != NULL)
- return 0;
- if (posix_clocks[which_clock].res != 0)
- return 0;
- return 1;
+ return do_adjtimex(t);
}
/*
@@ -240,7 +187,7 @@ static int posix_ktime_get_ts(clockid_t which_clock, struct timespec *tp)
}
/*
- * Get monotonic time for posix timers
+ * Get monotonic-raw time for posix timers
*/
static int posix_get_monotonic_raw(clockid_t which_clock, struct timespec *tp)
{
@@ -267,46 +214,70 @@ static int posix_get_coarse_res(const clockid_t which_clock, struct timespec *tp
*tp = ktime_to_timespec(KTIME_LOW_RES);
return 0;
}
+
+static int posix_get_boottime(const clockid_t which_clock, struct timespec *tp)
+{
+ get_monotonic_boottime(tp);
+ return 0;
+}
+
+
/*
* Initialize everything, well, just everything in Posix clocks/timers ;)
*/
static __init int init_posix_timers(void)
{
struct k_clock clock_realtime = {
- .clock_getres = hrtimer_get_res,
+ .clock_getres = hrtimer_get_res,
+ .clock_get = posix_clock_realtime_get,
+ .clock_set = posix_clock_realtime_set,
+ .clock_adj = posix_clock_realtime_adj,
+ .nsleep = common_nsleep,
+ .nsleep_restart = hrtimer_nanosleep_restart,
+ .timer_create = common_timer_create,
+ .timer_set = common_timer_set,
+ .timer_get = common_timer_get,
+ .timer_del = common_timer_del,
};
struct k_clock clock_monotonic = {
- .clock_getres = hrtimer_get_res,
- .clock_get = posix_ktime_get_ts,
- .clock_set = do_posix_clock_nosettime,
+ .clock_getres = hrtimer_get_res,
+ .clock_get = posix_ktime_get_ts,
+ .nsleep = common_nsleep,
+ .nsleep_restart = hrtimer_nanosleep_restart,
+ .timer_create = common_timer_create,
+ .timer_set = common_timer_set,
+ .timer_get = common_timer_get,
+ .timer_del = common_timer_del,
};
struct k_clock clock_monotonic_raw = {
- .clock_getres = hrtimer_get_res,
- .clock_get = posix_get_monotonic_raw,
- .clock_set = do_posix_clock_nosettime,
- .timer_create = no_timer_create,
- .nsleep = no_nsleep,
+ .clock_getres = hrtimer_get_res,
+ .clock_get = posix_get_monotonic_raw,
};
struct k_clock clock_realtime_coarse = {
- .clock_getres = posix_get_coarse_res,
- .clock_get = posix_get_realtime_coarse,
- .clock_set = do_posix_clock_nosettime,
- .timer_create = no_timer_create,
- .nsleep = no_nsleep,
+ .clock_getres = posix_get_coarse_res,
+ .clock_get = posix_get_realtime_coarse,
};
struct k_clock clock_monotonic_coarse = {
- .clock_getres = posix_get_coarse_res,
- .clock_get = posix_get_monotonic_coarse,
- .clock_set = do_posix_clock_nosettime,
- .timer_create = no_timer_create,
- .nsleep = no_nsleep,
+ .clock_getres = posix_get_coarse_res,
+ .clock_get = posix_get_monotonic_coarse,
+ };
+ struct k_clock clock_boottime = {
+ .clock_getres = hrtimer_get_res,
+ .clock_get = posix_get_boottime,
+ .nsleep = common_nsleep,
+ .nsleep_restart = hrtimer_nanosleep_restart,
+ .timer_create = common_timer_create,
+ .timer_set = common_timer_set,
+ .timer_get = common_timer_get,
+ .timer_del = common_timer_del,
};
- register_posix_clock(CLOCK_REALTIME, &clock_realtime);
- register_posix_clock(CLOCK_MONOTONIC, &clock_monotonic);
- register_posix_clock(CLOCK_MONOTONIC_RAW, &clock_monotonic_raw);
- register_posix_clock(CLOCK_REALTIME_COARSE, &clock_realtime_coarse);
- register_posix_clock(CLOCK_MONOTONIC_COARSE, &clock_monotonic_coarse);
+ posix_timers_register_clock(CLOCK_REALTIME, &clock_realtime);
+ posix_timers_register_clock(CLOCK_MONOTONIC, &clock_monotonic);
+ posix_timers_register_clock(CLOCK_MONOTONIC_RAW, &clock_monotonic_raw);
+ posix_timers_register_clock(CLOCK_REALTIME_COARSE, &clock_realtime_coarse);
+ posix_timers_register_clock(CLOCK_MONOTONIC_COARSE, &clock_monotonic_coarse);
+ posix_timers_register_clock(CLOCK_BOOTTIME, &clock_boottime);
posix_timers_cache = kmem_cache_create("posix_timers_cache",
sizeof (struct k_itimer), 0, SLAB_PANIC,
@@ -342,7 +313,7 @@ static void schedule_next_timer(struct k_itimer *timr)
* restarted (i.e. we have flagged this in the sys_private entry of the
* info block).
*
- * To protect aginst the timer going away while the interrupt is queued,
+ * To protect against the timer going away while the interrupt is queued,
* we require that the it_requeue_pending flag be set.
*/
void do_schedule_next_timer(struct siginfo *info)
@@ -482,17 +453,29 @@ static struct pid *good_sigevent(sigevent_t * event)
return task_pid(rtn);
}
-void register_posix_clock(const clockid_t clock_id, struct k_clock *new_clock)
+void posix_timers_register_clock(const clockid_t clock_id,
+ struct k_clock *new_clock)
{
if ((unsigned) clock_id >= MAX_CLOCKS) {
- printk("POSIX clock register failed for clock_id %d\n",
+ printk(KERN_WARNING "POSIX clock register failed for clock_id %d\n",
+ clock_id);
+ return;
+ }
+
+ if (!new_clock->clock_get) {
+ printk(KERN_WARNING "POSIX clock id %d lacks clock_get()\n",
+ clock_id);
+ return;
+ }
+ if (!new_clock->clock_getres) {
+ printk(KERN_WARNING "POSIX clock id %d lacks clock_getres()\n",
clock_id);
return;
}
posix_clocks[clock_id] = *new_clock;
}
-EXPORT_SYMBOL_GPL(register_posix_clock);
+EXPORT_SYMBOL_GPL(posix_timers_register_clock);
static struct k_itimer * alloc_posix_timer(void)
{
@@ -508,6 +491,13 @@ static struct k_itimer * alloc_posix_timer(void)
return tmr;
}
+static void k_itimer_rcu_free(struct rcu_head *head)
+{
+ struct k_itimer *tmr = container_of(head, struct k_itimer, it.rcu);
+
+ kmem_cache_free(posix_timers_cache, tmr);
+}
+
#define IT_ID_SET 1
#define IT_ID_NOT_SET 0
static void release_posix_timer(struct k_itimer *tmr, int it_id_set)
@@ -520,7 +510,24 @@ static void release_posix_timer(struct k_itimer *tmr, int it_id_set)
}
put_pid(tmr->it_pid);
sigqueue_free(tmr->sigq);
- kmem_cache_free(posix_timers_cache, tmr);
+ call_rcu(&tmr->it.rcu, k_itimer_rcu_free);
+}
+
+static struct k_clock *clockid_to_kclock(const clockid_t id)
+{
+ if (id < 0)
+ return (id & CLOCKFD_MASK) == CLOCKFD ?
+ &clock_posix_dynamic : &clock_posix_cpu;
+
+ if (id >= MAX_CLOCKS || !posix_clocks[id].clock_getres)
+ return NULL;
+ return &posix_clocks[id];
+}
+
+static int common_timer_create(struct k_itimer *new_timer)
+{
+ hrtimer_init(&new_timer->it.real.timer, new_timer->it_clock, 0);
+ return 0;
}
/* Create a POSIX.1b interval timer. */
@@ -529,13 +536,16 @@ SYSCALL_DEFINE3(timer_create, const clockid_t, which_clock,
struct sigevent __user *, timer_event_spec,
timer_t __user *, created_timer_id)
{
+ struct k_clock *kc = clockid_to_kclock(which_clock);
struct k_itimer *new_timer;
int error, new_timer_id;
sigevent_t event;
int it_id_set = IT_ID_NOT_SET;
- if (invalid_clockid(which_clock))
+ if (!kc)
return -EINVAL;
+ if (!kc->timer_create)
+ return -EOPNOTSUPP;
new_timer = alloc_posix_timer();
if (unlikely(!new_timer))
@@ -597,7 +607,7 @@ SYSCALL_DEFINE3(timer_create, const clockid_t, which_clock,
goto out;
}
- error = CLOCK_DISPATCH(which_clock, timer_create, (new_timer));
+ error = kc->timer_create(new_timer);
if (error)
goto out;
@@ -607,7 +617,7 @@ SYSCALL_DEFINE3(timer_create, const clockid_t, which_clock,
spin_unlock_irq(&current->sighand->siglock);
return 0;
- /*
+ /*
* In the case of the timer belonging to another task, after
* the task is unlocked, the timer is owned by the other task
* and may cease to exist at any time. Don't use or modify
@@ -628,22 +638,18 @@ out:
static struct k_itimer *__lock_timer(timer_t timer_id, unsigned long *flags)
{
struct k_itimer *timr;
- /*
- * Watch out here. We do a irqsave on the idr_lock and pass the
- * flags part over to the timer lock. Must not let interrupts in
- * while we are moving the lock.
- */
- spin_lock_irqsave(&idr_lock, *flags);
+
+ rcu_read_lock();
timr = idr_find(&posix_timers_id, (int)timer_id);
if (timr) {
- spin_lock(&timr->it_lock);
+ spin_lock_irqsave(&timr->it_lock, *flags);
if (timr->it_signal == current->signal) {
- spin_unlock(&idr_lock);
+ rcu_read_unlock();
return timr;
}
- spin_unlock(&timr->it_lock);
+ spin_unlock_irqrestore(&timr->it_lock, *flags);
}
- spin_unlock_irqrestore(&idr_lock, *flags);
+ rcu_read_unlock();
return NULL;
}
@@ -709,22 +715,28 @@ common_timer_get(struct k_itimer *timr, struct itimerspec *cur_setting)
SYSCALL_DEFINE2(timer_gettime, timer_t, timer_id,
struct itimerspec __user *, setting)
{
- struct k_itimer *timr;
struct itimerspec cur_setting;
+ struct k_itimer *timr;
+ struct k_clock *kc;
unsigned long flags;
+ int ret = 0;
timr = lock_timer(timer_id, &flags);
if (!timr)
return -EINVAL;
- CLOCK_DISPATCH(timr->it_clock, timer_get, (timr, &cur_setting));
+ kc = clockid_to_kclock(timr->it_clock);
+ if (WARN_ON_ONCE(!kc || !kc->timer_get))
+ ret = -EINVAL;
+ else
+ kc->timer_get(timr, &cur_setting);
unlock_timer(timr, flags);
- if (copy_to_user(setting, &cur_setting, sizeof (cur_setting)))
+ if (!ret && copy_to_user(setting, &cur_setting, sizeof (cur_setting)))
return -EFAULT;
- return 0;
+ return ret;
}
/*
@@ -813,6 +825,7 @@ SYSCALL_DEFINE4(timer_settime, timer_t, timer_id, int, flags,
int error = 0;
unsigned long flag;
struct itimerspec *rtn = old_setting ? &old_spec : NULL;
+ struct k_clock *kc;
if (!new_setting)
return -EINVAL;
@@ -828,8 +841,11 @@ retry:
if (!timr)
return -EINVAL;
- error = CLOCK_DISPATCH(timr->it_clock, timer_set,
- (timr, flags, &new_spec, rtn));
+ kc = clockid_to_kclock(timr->it_clock);
+ if (WARN_ON_ONCE(!kc || !kc->timer_set))
+ error = -EINVAL;
+ else
+ error = kc->timer_set(timr, flags, &new_spec, rtn);
unlock_timer(timr, flag);
if (error == TIMER_RETRY) {
@@ -844,7 +860,7 @@ retry:
return error;
}
-static inline int common_timer_del(struct k_itimer *timer)
+static int common_timer_del(struct k_itimer *timer)
{
timer->it.real.interval.tv64 = 0;
@@ -855,7 +871,11 @@ static inline int common_timer_del(struct k_itimer *timer)
static inline int timer_delete_hook(struct k_itimer *timer)
{
- return CLOCK_DISPATCH(timer->it_clock, timer_del, (timer));
+ struct k_clock *kc = clockid_to_kclock(timer->it_clock);
+
+ if (WARN_ON_ONCE(!kc || !kc->timer_del))
+ return -EINVAL;
+ return kc->timer_del(timer);
}
/* Delete a POSIX.1b interval timer. */
@@ -927,69 +947,76 @@ void exit_itimers(struct signal_struct *sig)
}
}
-/* Not available / possible... functions */
-int do_posix_clock_nosettime(const clockid_t clockid, struct timespec *tp)
-{
- return -EINVAL;
-}
-EXPORT_SYMBOL_GPL(do_posix_clock_nosettime);
-
-int do_posix_clock_nonanosleep(const clockid_t clock, int flags,
- struct timespec *t, struct timespec __user *r)
-{
-#ifndef ENOTSUP
- return -EOPNOTSUPP; /* aka ENOTSUP in userland for POSIX */
-#else /* parisc does define it separately. */
- return -ENOTSUP;
-#endif
-}
-EXPORT_SYMBOL_GPL(do_posix_clock_nonanosleep);
-
SYSCALL_DEFINE2(clock_settime, const clockid_t, which_clock,
const struct timespec __user *, tp)
{
+ struct k_clock *kc = clockid_to_kclock(which_clock);
struct timespec new_tp;
- if (invalid_clockid(which_clock))
+ if (!kc || !kc->clock_set)
return -EINVAL;
+
if (copy_from_user(&new_tp, tp, sizeof (*tp)))
return -EFAULT;
- return CLOCK_DISPATCH(which_clock, clock_set, (which_clock, &new_tp));
+ return kc->clock_set(which_clock, &new_tp);
}
SYSCALL_DEFINE2(clock_gettime, const clockid_t, which_clock,
struct timespec __user *,tp)
{
+ struct k_clock *kc = clockid_to_kclock(which_clock);
struct timespec kernel_tp;
int error;
- if (invalid_clockid(which_clock))
+ if (!kc)
return -EINVAL;
- error = CLOCK_DISPATCH(which_clock, clock_get,
- (which_clock, &kernel_tp));
+
+ error = kc->clock_get(which_clock, &kernel_tp);
+
if (!error && copy_to_user(tp, &kernel_tp, sizeof (kernel_tp)))
error = -EFAULT;
return error;
+}
+
+SYSCALL_DEFINE2(clock_adjtime, const clockid_t, which_clock,
+ struct timex __user *, utx)
+{
+ struct k_clock *kc = clockid_to_kclock(which_clock);
+ struct timex ktx;
+ int err;
+
+ if (!kc)
+ return -EINVAL;
+ if (!kc->clock_adj)
+ return -EOPNOTSUPP;
+ if (copy_from_user(&ktx, utx, sizeof(ktx)))
+ return -EFAULT;
+
+ err = kc->clock_adj(which_clock, &ktx);
+
+ if (!err && copy_to_user(utx, &ktx, sizeof(ktx)))
+ return -EFAULT;
+
+ return err;
}
SYSCALL_DEFINE2(clock_getres, const clockid_t, which_clock,
struct timespec __user *, tp)
{
+ struct k_clock *kc = clockid_to_kclock(which_clock);
struct timespec rtn_tp;
int error;
- if (invalid_clockid(which_clock))
+ if (!kc)
return -EINVAL;
- error = CLOCK_DISPATCH(which_clock, clock_getres,
- (which_clock, &rtn_tp));
+ error = kc->clock_getres(which_clock, &rtn_tp);
- if (!error && tp && copy_to_user(tp, &rtn_tp, sizeof (rtn_tp))) {
+ if (!error && tp && copy_to_user(tp, &rtn_tp, sizeof (rtn_tp)))
error = -EFAULT;
- }
return error;
}
@@ -1009,10 +1036,13 @@ SYSCALL_DEFINE4(clock_nanosleep, const clockid_t, which_clock, int, flags,
const struct timespec __user *, rqtp,
struct timespec __user *, rmtp)
{
+ struct k_clock *kc = clockid_to_kclock(which_clock);
struct timespec t;
- if (invalid_clockid(which_clock))
+ if (!kc)
return -EINVAL;
+ if (!kc->nsleep)
+ return -ENANOSLEEP_NOTSUP;
if (copy_from_user(&t, rqtp, sizeof (struct timespec)))
return -EFAULT;
@@ -1020,27 +1050,20 @@ SYSCALL_DEFINE4(clock_nanosleep, const clockid_t, which_clock, int, flags,
if (!timespec_valid(&t))
return -EINVAL;
- return CLOCK_DISPATCH(which_clock, nsleep,
- (which_clock, flags, &t, rmtp));
-}
-
-/*
- * nanosleep_restart for monotonic and realtime clocks
- */
-static int common_nsleep_restart(struct restart_block *restart_block)
-{
- return hrtimer_nanosleep_restart(restart_block);
+ return kc->nsleep(which_clock, flags, &t, rmtp);
}
/*
* This will restart clock_nanosleep. This is required only by
* compat_clock_nanosleep_restart for now.
*/
-long
-clock_nanosleep_restart(struct restart_block *restart_block)
+long clock_nanosleep_restart(struct restart_block *restart_block)
{
- clockid_t which_clock = restart_block->arg0;
+ clockid_t which_clock = restart_block->nanosleep.clockid;
+ struct k_clock *kc = clockid_to_kclock(which_clock);
+
+ if (WARN_ON_ONCE(!kc || !kc->nsleep_restart))
+ return -EINVAL;
- return CLOCK_DISPATCH(which_clock, nsleep_restart,
- (restart_block));
+ return kc->nsleep_restart(restart_block);
}
diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig
index 265729966ece..87f4d24b55b0 100644
--- a/kernel/power/Kconfig
+++ b/kernel/power/Kconfig
@@ -1,125 +1,12 @@
-config PM
- bool "Power Management support"
- depends on !IA64_HP_SIM
- ---help---
- "Power Management" means that parts of your computer are shut
- off or put into a power conserving "sleep" mode if they are not
- being used. There are two competing standards for doing this: APM
- and ACPI. If you want to use either one, say Y here and then also
- to the requisite support below.
-
- Power Management is most important for battery powered laptop
- computers; if you have a laptop, check out the Linux Laptop home
- page on the WWW at <http://www.linux-on-laptops.com/> or
- Tuxmobil - Linux on Mobile Computers at <http://www.tuxmobil.org/>
- and the Battery Powered Linux mini-HOWTO, available from
- <http://www.tldp.org/docs.html#howto>.
-
- Note that, even if you say N here, Linux on the x86 architecture
- will issue the hlt instruction if nothing is to be done, thereby
- sending the processor to sleep and saving power.
-
-config PM_DEBUG
- bool "Power Management Debug Support"
- depends on PM
- ---help---
- This option enables various debugging support in the Power Management
- code. This is helpful when debugging and reporting PM bugs, like
- suspend support.
-
-config PM_ADVANCED_DEBUG
- bool "Extra PM attributes in sysfs for low-level debugging/testing"
- depends on PM_DEBUG
- default n
- ---help---
- Add extra sysfs attributes allowing one to access some Power Management
- fields of device objects from user space. If you are not a kernel
- developer interested in debugging/testing Power Management, say "no".
-
-config PM_VERBOSE
- bool "Verbose Power Management debugging"
- depends on PM_DEBUG
- default n
- ---help---
- This option enables verbose messages from the Power Management code.
-
-config CAN_PM_TRACE
- def_bool y
- depends on PM_DEBUG && PM_SLEEP && EXPERIMENTAL
-
-config PM_TRACE
- bool
- help
- This enables code to save the last PM event point across
- reboot. The architecture needs to support this, x86 for
- example does by saving things in the RTC, see below.
-
- The architecture specific code must provide the extern
- functions from <linux/resume-trace.h> as well as the
- <asm/resume-trace.h> header with a TRACE_RESUME() macro.
-
- The way the information is presented is architecture-
- dependent, x86 will print the information during a
- late_initcall.
-
-config PM_TRACE_RTC
- bool "Suspend/resume event tracing"
- depends on CAN_PM_TRACE
- depends on X86
- select PM_TRACE
- default n
- ---help---
- This enables some cheesy code to save the last PM event point in the
- RTC across reboots, so that you can debug a machine that just hangs
- during suspend (or more commonly, during resume).
-
- To use this debugging feature you should attempt to suspend the
- machine, reboot it and then run
-
- dmesg -s 1000000 | grep 'hash matches'
-
- CAUTION: this option will cause your machine's real-time clock to be
- set to an invalid time after a resume.
-
-config PM_SLEEP_SMP
- bool
- depends on SMP
- depends on ARCH_SUSPEND_POSSIBLE || ARCH_HIBERNATION_POSSIBLE
- depends on PM_SLEEP
- select HOTPLUG
- select HOTPLUG_CPU
- default y
-
-config PM_SLEEP
- bool
- depends on SUSPEND || HIBERNATION || XEN_SAVE_RESTORE
- default y
-
-config PM_SLEEP_ADVANCED_DEBUG
- bool
- depends on PM_ADVANCED_DEBUG
- default n
-
config SUSPEND
bool "Suspend to RAM and standby"
- depends on PM && ARCH_SUSPEND_POSSIBLE
+ depends on ARCH_SUSPEND_POSSIBLE
default y
---help---
Allow the system to enter sleep states in which main memory is
powered and thus its contents are preserved, such as the
suspend-to-RAM state (e.g. the ACPI S3 state).
-config PM_TEST_SUSPEND
- bool "Test suspend/resume and wakealarm during bootup"
- depends on SUSPEND && PM_DEBUG && RTC_CLASS=y
- ---help---
- This option will let you suspend your machine during bootup, and
- make it wake up a few seconds later using an RTC wakeup alarm.
- Enable this with a kernel parameter like "test_suspend=mem".
-
- You probably want to have your system's RTC driver statically
- linked, ensuring that it's available when this test runs.
-
config SUSPEND_FREEZER
bool "Enable freezer for suspend to RAM/standby" \
if ARCH_WANTS_FREEZER_CONTROL || BROKEN
@@ -131,9 +18,13 @@ config SUSPEND_FREEZER
Turning OFF this setting is NOT recommended! If in doubt, say Y.
+config HIBERNATE_CALLBACKS
+ bool
+
config HIBERNATION
bool "Hibernation (aka 'suspend to disk')"
- depends on PM && SWAP && ARCH_HIBERNATION_POSSIBLE
+ depends on SWAP && ARCH_HIBERNATION_POSSIBLE
+ select HIBERNATE_CALLBACKS
select LZO_COMPRESS
select LZO_DECOMPRESS
---help---
@@ -196,6 +87,100 @@ config PM_STD_PARTITION
suspended image to. It will simply pick the first available swap
device.
+config PM_SLEEP
+ def_bool y
+ depends on SUSPEND || HIBERNATE_CALLBACKS
+
+config PM_SLEEP_SMP
+ def_bool y
+ depends on SMP
+ depends on ARCH_SUSPEND_POSSIBLE || ARCH_HIBERNATION_POSSIBLE
+ depends on PM_SLEEP
+ select HOTPLUG
+ select HOTPLUG_CPU
+
+config PM_RUNTIME
+ bool "Run-time PM core functionality"
+ depends on !IA64_HP_SIM
+ ---help---
+ Enable functionality allowing I/O devices to be put into energy-saving
+ (low power) states at run time (or autosuspended) after a specified
+ period of inactivity and woken up in response to a hardware-generated
+ wake-up event or a driver's request.
+
+ Hardware support is generally required for this functionality to work
+ and the bus type drivers of the buses the devices are on are
+ responsible for the actual handling of the autosuspend requests and
+ wake-up events.
+
+config PM
+ def_bool y
+ depends on PM_SLEEP || PM_RUNTIME
+
+config PM_DEBUG
+ bool "Power Management Debug Support"
+ depends on PM
+ ---help---
+ This option enables various debugging support in the Power Management
+ code. This is helpful when debugging and reporting PM bugs, like
+ suspend support.
+
+config PM_ADVANCED_DEBUG
+ bool "Extra PM attributes in sysfs for low-level debugging/testing"
+ depends on PM_DEBUG
+ ---help---
+ Add extra sysfs attributes allowing one to access some Power Management
+ fields of device objects from user space. If you are not a kernel
+ developer interested in debugging/testing Power Management, say "no".
+
+config PM_TEST_SUSPEND
+ bool "Test suspend/resume and wakealarm during bootup"
+ depends on SUSPEND && PM_DEBUG && RTC_CLASS=y
+ ---help---
+ This option will let you suspend your machine during bootup, and
+ make it wake up a few seconds later using an RTC wakeup alarm.
+ Enable this with a kernel parameter like "test_suspend=mem".
+
+ You probably want to have your system's RTC driver statically
+ linked, ensuring that it's available when this test runs.
+
+config CAN_PM_TRACE
+ def_bool y
+ depends on PM_DEBUG && PM_SLEEP
+
+config PM_TRACE
+ bool
+ help
+ This enables code to save the last PM event point across
+ reboot. The architecture needs to support this, x86 for
+ example does by saving things in the RTC, see below.
+
+ The architecture specific code must provide the extern
+ functions from <linux/resume-trace.h> as well as the
+ <asm/resume-trace.h> header with a TRACE_RESUME() macro.
+
+ The way the information is presented is architecture-
+ dependent, x86 will print the information during a
+ late_initcall.
+
+config PM_TRACE_RTC
+ bool "Suspend/resume event tracing"
+ depends on CAN_PM_TRACE
+ depends on X86
+ select PM_TRACE
+ ---help---
+ This enables some cheesy code to save the last PM event point in the
+ RTC across reboots, so that you can debug a machine that just hangs
+ during suspend (or more commonly, during resume).
+
+ To use this debugging feature you should attempt to suspend the
+ machine, reboot it and then run
+
+ dmesg -s 1000000 | grep 'hash matches'
+
+ CAUTION: this option will cause your machine's real-time clock to be
+ set to an invalid time after a resume.
+
config APM_EMULATION
tristate "Advanced Power Management Emulation"
depends on PM && SYS_SUPPORTS_APM_EMULATION
@@ -222,31 +207,11 @@ config APM_EMULATION
anything, try disabling/enabling this option (or disabling/enabling
APM in your BIOS).
-config PM_RUNTIME
- bool "Run-time PM core functionality"
- depends on PM
- ---help---
- Enable functionality allowing I/O devices to be put into energy-saving
- (low power) states at run time (or autosuspended) after a specified
- period of inactivity and woken up in response to a hardware-generated
- wake-up event or a driver's request.
-
- Hardware support is generally required for this functionality to work
- and the bus type drivers of the buses the devices are on are
- responsible for the actual handling of the autosuspend requests and
- wake-up events.
-
-config PM_OPS
- bool
- depends on PM_SLEEP || PM_RUNTIME
- default y
-
config ARCH_HAS_OPP
bool
config PM_OPP
bool "Operating Performance Point (OPP) Layer library"
- depends on PM
depends on ARCH_HAS_OPP
---help---
SOCs have a standard set of tuples consisting of frequency and
@@ -258,3 +223,7 @@ config PM_OPP
representing individual voltage domains and provides SOC
implementations a ready to use framework to manage OPPs.
For more information, read <file:Documentation/power/opp.txt>
+
+config PM_RUNTIME_CLK
+ def_bool y
+ depends on PM_RUNTIME && HAVE_CLK
diff --git a/kernel/power/Makefile b/kernel/power/Makefile
index c350e18b53e3..c5ebc6a90643 100644
--- a/kernel/power/Makefile
+++ b/kernel/power/Makefile
@@ -1,4 +1,5 @@
-ccflags-$(CONFIG_PM_DEBUG) := -DDEBUG
+
+ccflags-$(CONFIG_PM_DEBUG) := -DDEBUG
obj-$(CONFIG_PM) += main.o
obj-$(CONFIG_PM_SLEEP) += console.o
diff --git a/kernel/power/block_io.c b/kernel/power/block_io.c
index 83bbc7c02df9..d09dd10c5a5e 100644
--- a/kernel/power/block_io.c
+++ b/kernel/power/block_io.c
@@ -28,7 +28,7 @@
static int submit(int rw, struct block_device *bdev, sector_t sector,
struct page *page, struct bio **bio_chain)
{
- const int bio_rw = rw | REQ_SYNC | REQ_UNPLUG;
+ const int bio_rw = rw | REQ_SYNC;
struct bio *bio;
bio = bio_alloc(__GFP_WAIT | __GFP_HIGH, 1);
diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c
index 1832bd264219..8f7b1db1ece1 100644
--- a/kernel/power/hibernate.c
+++ b/kernel/power/hibernate.c
@@ -23,8 +23,8 @@
#include <linux/cpu.h>
#include <linux/freezer.h>
#include <linux/gfp.h>
+#include <linux/syscore_ops.h>
#include <scsi/scsi_scan.h>
-#include <asm/suspend.h>
#include "power.h"
@@ -54,10 +54,9 @@ static int hibernation_mode = HIBERNATION_SHUTDOWN;
static const struct platform_hibernation_ops *hibernation_ops;
/**
- * hibernation_set_ops - set the global hibernate operations
- * @ops: the hibernation operations to use in subsequent hibernation transitions
+ * hibernation_set_ops - Set the global hibernate operations.
+ * @ops: Hibernation operations to use in subsequent hibernation transitions.
*/
-
void hibernation_set_ops(const struct platform_hibernation_ops *ops)
{
if (ops && !(ops->begin && ops->end && ops->pre_snapshot
@@ -114,10 +113,9 @@ static int hibernation_test(int level) { return 0; }
#endif /* !CONFIG_PM_DEBUG */
/**
- * platform_begin - tell the platform driver that we're starting
- * hibernation
+ * platform_begin - Call platform to start hibernation.
+ * @platform_mode: Whether or not to use the platform driver.
*/
-
static int platform_begin(int platform_mode)
{
return (platform_mode && hibernation_ops) ?
@@ -125,10 +123,9 @@ static int platform_begin(int platform_mode)
}
/**
- * platform_end - tell the platform driver that we've entered the
- * working state
+ * platform_end - Call platform to finish transition to the working state.
+ * @platform_mode: Whether or not to use the platform driver.
*/
-
static void platform_end(int platform_mode)
{
if (platform_mode && hibernation_ops)
@@ -136,8 +133,11 @@ static void platform_end(int platform_mode)
}
/**
- * platform_pre_snapshot - prepare the machine for hibernation using the
- * platform driver if so configured and return an error code if it fails
+ * platform_pre_snapshot - Call platform to prepare the machine for hibernation.
+ * @platform_mode: Whether or not to use the platform driver.
+ *
+ * Use the platform driver to prepare the system for creating a hibernate image,
+ * if so configured, and return an error code if that fails.
*/
static int platform_pre_snapshot(int platform_mode)
@@ -147,10 +147,14 @@ static int platform_pre_snapshot(int platform_mode)
}
/**
- * platform_leave - prepare the machine for switching to the normal mode
- * of operation using the platform driver (called with interrupts disabled)
+ * platform_leave - Call platform to prepare a transition to the working state.
+ * @platform_mode: Whether or not to use the platform driver.
+ *
+ * Use the platform driver prepare to prepare the machine for switching to the
+ * normal mode of operation.
+ *
+ * This routine is called on one CPU with interrupts disabled.
*/
-
static void platform_leave(int platform_mode)
{
if (platform_mode && hibernation_ops)
@@ -158,10 +162,14 @@ static void platform_leave(int platform_mode)
}
/**
- * platform_finish - switch the machine to the normal mode of operation
- * using the platform driver (must be called after platform_prepare())
+ * platform_finish - Call platform to switch the system to the working state.
+ * @platform_mode: Whether or not to use the platform driver.
+ *
+ * Use the platform driver to switch the machine to the normal mode of
+ * operation.
+ *
+ * This routine must be called after platform_prepare().
*/
-
static void platform_finish(int platform_mode)
{
if (platform_mode && hibernation_ops)
@@ -169,11 +177,15 @@ static void platform_finish(int platform_mode)
}
/**
- * platform_pre_restore - prepare the platform for the restoration from a
- * hibernation image. If the restore fails after this function has been
- * called, platform_restore_cleanup() must be called.
+ * platform_pre_restore - Prepare for hibernate image restoration.
+ * @platform_mode: Whether or not to use the platform driver.
+ *
+ * Use the platform driver to prepare the system for resume from a hibernation
+ * image.
+ *
+ * If the restore fails after this function has been called,
+ * platform_restore_cleanup() must be called.
*/
-
static int platform_pre_restore(int platform_mode)
{
return (platform_mode && hibernation_ops) ?
@@ -181,12 +193,16 @@ static int platform_pre_restore(int platform_mode)
}
/**
- * platform_restore_cleanup - switch the platform to the normal mode of
- * operation after a failing restore. If platform_pre_restore() has been
- * called before the failing restore, this function must be called too,
- * regardless of the result of platform_pre_restore().
+ * platform_restore_cleanup - Switch to the working state after failing restore.
+ * @platform_mode: Whether or not to use the platform driver.
+ *
+ * Use the platform driver to switch the system to the normal mode of operation
+ * after a failing restore.
+ *
+ * If platform_pre_restore() has been called before the failing restore, this
+ * function must be called too, regardless of the result of
+ * platform_pre_restore().
*/
-
static void platform_restore_cleanup(int platform_mode)
{
if (platform_mode && hibernation_ops)
@@ -194,10 +210,9 @@ static void platform_restore_cleanup(int platform_mode)
}
/**
- * platform_recover - recover the platform from a failure to suspend
- * devices.
+ * platform_recover - Recover from a failure to suspend devices.
+ * @platform_mode: Whether or not to use the platform driver.
*/
-
static void platform_recover(int platform_mode)
{
if (platform_mode && hibernation_ops && hibernation_ops->recover)
@@ -205,13 +220,12 @@ static void platform_recover(int platform_mode)
}
/**
- * swsusp_show_speed - print the time elapsed between two events.
- * @start: Starting event.
- * @stop: Final event.
- * @nr_pages - number of pages processed between @start and @stop
- * @msg - introductory message to print
+ * swsusp_show_speed - Print time elapsed between two events during hibernation.
+ * @start: Starting event.
+ * @stop: Final event.
+ * @nr_pages: Number of memory pages processed between @start and @stop.
+ * @msg: Additional diagnostic message to print.
*/
-
void swsusp_show_speed(struct timeval *start, struct timeval *stop,
unsigned nr_pages, char *msg)
{
@@ -234,25 +248,18 @@ void swsusp_show_speed(struct timeval *start, struct timeval *stop,
}
/**
- * create_image - freeze devices that need to be frozen with interrupts
- * off, create the hibernation image and thaw those devices. Control
- * reappears in this routine after a restore.
+ * create_image - Create a hibernation image.
+ * @platform_mode: Whether or not to use the platform driver.
+ *
+ * Execute device drivers' .freeze_noirq() callbacks, create a hibernation image
+ * and execute the drivers' .thaw_noirq() callbacks.
+ *
+ * Control reappears in this routine after the subsequent restore.
*/
-
static int create_image(int platform_mode)
{
int error;
- error = arch_prepare_suspend();
- if (error)
- return error;
-
- /* At this point, dpm_suspend_start() has been called, but *not*
- * dpm_suspend_noirq(). We *must* call dpm_suspend_noirq() now.
- * Otherwise, drivers for some devices (e.g. interrupt controllers)
- * become desynchronized with the actual state of the hardware
- * at resume time, and evil weirdness ensues.
- */
error = dpm_suspend_noirq(PMSG_FREEZE);
if (error) {
printk(KERN_ERR "PM: Some devices failed to power down, "
@@ -271,7 +278,7 @@ static int create_image(int platform_mode)
local_irq_disable();
- error = sysdev_suspend(PMSG_FREEZE);
+ error = syscore_suspend();
if (error) {
printk(KERN_ERR "PM: Some system devices failed to power down, "
"aborting hibernation\n");
@@ -295,10 +302,7 @@ static int create_image(int platform_mode)
}
Power_up:
- sysdev_resume();
- /* NOTE: dpm_resume_noirq() is just a resume() for devices
- * that suspended with irqs off ... no overall powerup.
- */
+ syscore_resume();
Enable_irqs:
local_irq_enable();
@@ -316,30 +320,32 @@ static int create_image(int platform_mode)
}
/**
- * hibernation_snapshot - quiesce devices and create the hibernation
- * snapshot image.
- * @platform_mode - if set, use the platform driver, if available, to
- * prepare the platform firmware for the power transition.
+ * hibernation_snapshot - Quiesce devices and create a hibernation image.
+ * @platform_mode: If set, use platform driver to prepare for the transition.
*
- * Must be called with pm_mutex held
+ * This routine must be called with pm_mutex held.
*/
-
int hibernation_snapshot(int platform_mode)
{
+ pm_message_t msg = PMSG_RECOVER;
int error;
error = platform_begin(platform_mode);
if (error)
goto Close;
+ error = dpm_prepare(PMSG_FREEZE);
+ if (error)
+ goto Complete_devices;
+
/* Preallocate image memory before shutting down devices. */
error = hibernate_preallocate_memory();
if (error)
- goto Close;
+ goto Complete_devices;
suspend_console();
pm_restrict_gfp_mask();
- error = dpm_suspend_start(PMSG_FREEZE);
+ error = dpm_suspend(PMSG_FREEZE);
if (error)
goto Recover_platform;
@@ -357,13 +363,17 @@ int hibernation_snapshot(int platform_mode)
if (error || !in_suspend)
swsusp_free();
- dpm_resume_end(in_suspend ?
- (error ? PMSG_RECOVER : PMSG_THAW) : PMSG_RESTORE);
+ msg = in_suspend ? (error ? PMSG_RECOVER : PMSG_THAW) : PMSG_RESTORE;
+ dpm_resume(msg);
if (error || !in_suspend)
pm_restore_gfp_mask();
resume_console();
+
+ Complete_devices:
+ dpm_complete(msg);
+
Close:
platform_end(platform_mode);
return error;
@@ -374,13 +384,14 @@ int hibernation_snapshot(int platform_mode)
}
/**
- * resume_target_kernel - prepare devices that need to be suspended with
- * interrupts off, restore the contents of highmem that have not been
- * restored yet from the image and run the low level code that will restore
- * the remaining contents of memory and switch to the just restored target
- * kernel.
+ * resume_target_kernel - Restore system state from a hibernation image.
+ * @platform_mode: Whether or not to use the platform driver.
+ *
+ * Execute device drivers' .freeze_noirq() callbacks, restore the contents of
+ * highmem that have not been restored yet from the image and run the low-level
+ * code that will restore the remaining contents of memory and switch to the
+ * just restored target kernel.
*/
-
static int resume_target_kernel(bool platform_mode)
{
int error;
@@ -402,34 +413,36 @@ static int resume_target_kernel(bool platform_mode)
local_irq_disable();
- error = sysdev_suspend(PMSG_QUIESCE);
+ error = syscore_suspend();
if (error)
goto Enable_irqs;
- /* We'll ignore saved state, but this gets preempt count (etc) right */
save_processor_state();
error = restore_highmem();
if (!error) {
error = swsusp_arch_resume();
/*
* The code below is only ever reached in case of a failure.
- * Otherwise execution continues at place where
- * swsusp_arch_suspend() was called
+ * Otherwise, execution continues at the place where
+ * swsusp_arch_suspend() was called.
*/
BUG_ON(!error);
- /* This call to restore_highmem() undos the previous one */
+ /*
+ * This call to restore_highmem() reverts the changes made by
+ * the previous one.
+ */
restore_highmem();
}
/*
* The only reason why swsusp_arch_resume() can fail is memory being
* very tight, so we have to free it as soon as we can to avoid
- * subsequent failures
+ * subsequent failures.
*/
swsusp_free();
restore_processor_state();
touch_softlockup_watchdog();
- sysdev_resume();
+ syscore_resume();
Enable_irqs:
local_irq_enable();
@@ -446,14 +459,12 @@ static int resume_target_kernel(bool platform_mode)
}
/**
- * hibernation_restore - quiesce devices and restore the hibernation
- * snapshot image. If successful, control returns in hibernation_snaphot()
- * @platform_mode - if set, use the platform driver, if available, to
- * prepare the platform firmware for the transition.
+ * hibernation_restore - Quiesce devices and restore from a hibernation image.
+ * @platform_mode: If set, use platform driver to prepare for the transition.
*
- * Must be called with pm_mutex held
+ * This routine must be called with pm_mutex held. If it is successful, control
+ * reappears in the restored target kernel in hibernation_snaphot().
*/
-
int hibernation_restore(int platform_mode)
{
int error;
@@ -473,10 +484,8 @@ int hibernation_restore(int platform_mode)
}
/**
- * hibernation_platform_enter - enter the hibernation state using the
- * platform driver (if available)
+ * hibernation_platform_enter - Power off the system using the platform driver.
*/
-
int hibernation_platform_enter(void)
{
int error;
@@ -515,7 +524,7 @@ int hibernation_platform_enter(void)
goto Platform_finish;
local_irq_disable();
- sysdev_suspend(PMSG_HIBERNATE);
+ syscore_suspend();
if (pm_wakeup_pending()) {
error = -EAGAIN;
goto Power_up;
@@ -526,7 +535,7 @@ int hibernation_platform_enter(void)
while (1);
Power_up:
- sysdev_resume();
+ syscore_resume();
local_irq_enable();
enable_nonboot_cpus();
@@ -547,12 +556,12 @@ int hibernation_platform_enter(void)
}
/**
- * power_down - Shut the machine down for hibernation.
+ * power_down - Shut the machine down for hibernation.
*
- * Use the platform driver, if configured so; otherwise try
- * to power off or reboot.
+ * Use the platform driver, if configured, to put the system into the sleep
+ * state corresponding to hibernation, or try to power it off or reboot,
+ * depending on the value of hibernation_mode.
*/
-
static void power_down(void)
{
switch (hibernation_mode) {
@@ -589,9 +598,8 @@ static int prepare_processes(void)
}
/**
- * hibernate - The granpappy of the built-in hibernation management
+ * hibernate - Carry out system hibernation, including saving the image.
*/
-
int hibernate(void)
{
int error;
@@ -669,17 +677,20 @@ int hibernate(void)
/**
- * software_resume - Resume from a saved image.
+ * software_resume - Resume from a saved hibernation image.
*
- * Called as a late_initcall (so all devices are discovered and
- * initialized), we call swsusp to see if we have a saved image or not.
- * If so, we quiesce devices, the restore the saved image. We will
- * return above (in hibernate() ) if everything goes well.
- * Otherwise, we fail gracefully and return to the normally
- * scheduled program.
+ * This routine is called as a late initcall, when all devices have been
+ * discovered and initialized already.
*
+ * The image reading code is called to see if there is a hibernation image
+ * available for reading. If that is the case, devices are quiesced and the
+ * contents of memory is restored from the saved image.
+ *
+ * If this is successful, control reappears in the restored target kernel in
+ * hibernation_snaphot() which returns to hibernate(). Otherwise, the routine
+ * attempts to recover gracefully and make the kernel return to the normal mode
+ * of operation.
*/
-
static int software_resume(void)
{
int error;
@@ -809,21 +820,17 @@ static const char * const hibernation_modes[] = {
[HIBERNATION_TESTPROC] = "testproc",
};
-/**
- * disk - Control hibernation mode
- *
- * Suspend-to-disk can be handled in several ways. We have a few options
- * for putting the system to sleep - using the platform driver (e.g. ACPI
- * or other hibernation_ops), powering off the system or rebooting the
- * system (for testing) as well as the two test modes.
+/*
+ * /sys/power/disk - Control hibernation mode.
*
- * The system can support 'platform', and that is known a priori (and
- * encoded by the presence of hibernation_ops). However, the user may
- * choose 'shutdown' or 'reboot' as alternatives, as well as one fo the
- * test modes, 'test' or 'testproc'.
+ * Hibernation can be handled in several ways. There are a few different ways
+ * to put the system into the sleep state: using the platform driver (e.g. ACPI
+ * or other hibernation_ops), powering it off or rebooting it (for testing
+ * mostly), or using one of the two available test modes.
*
- * show() will display what the mode is currently set to.
- * store() will accept one of
+ * The sysfs file /sys/power/disk provides an interface for selecting the
+ * hibernation mode to use. Reading from this file causes the available modes
+ * to be printed. There are 5 modes that can be supported:
*
* 'platform'
* 'shutdown'
@@ -831,8 +838,14 @@ static const char * const hibernation_modes[] = {
* 'test'
* 'testproc'
*
- * It will only change to 'platform' if the system
- * supports it (as determined by having hibernation_ops).
+ * If a platform hibernation driver is in use, 'platform' will be supported
+ * and will be used by default. Otherwise, 'shutdown' will be used by default.
+ * The selected option (i.e. the one corresponding to the current value of
+ * hibernation_mode) is enclosed by a square bracket.
+ *
+ * To select a given hibernation mode it is necessary to write the mode's
+ * string representation (as returned by reading from /sys/power/disk) back
+ * into /sys/power/disk.
*/
static ssize_t disk_show(struct kobject *kobj, struct kobj_attribute *attr,
@@ -865,7 +878,6 @@ static ssize_t disk_show(struct kobject *kobj, struct kobj_attribute *attr,
return buf-start;
}
-
static ssize_t disk_store(struct kobject *kobj, struct kobj_attribute *attr,
const char *buf, size_t n)
{
@@ -967,10 +979,33 @@ static ssize_t image_size_store(struct kobject *kobj, struct kobj_attribute *att
power_attr(image_size);
+static ssize_t reserved_size_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ return sprintf(buf, "%lu\n", reserved_size);
+}
+
+static ssize_t reserved_size_store(struct kobject *kobj,
+ struct kobj_attribute *attr,
+ const char *buf, size_t n)
+{
+ unsigned long size;
+
+ if (sscanf(buf, "%lu", &size) == 1) {
+ reserved_size = size;
+ return n;
+ }
+
+ return -EINVAL;
+}
+
+power_attr(reserved_size);
+
static struct attribute * g[] = {
&disk_attr.attr,
&resume_attr.attr,
&image_size_attr.attr,
+ &reserved_size_attr.attr,
NULL,
};
diff --git a/kernel/power/main.c b/kernel/power/main.c
index 701853042c28..2981af4ce7cb 100644
--- a/kernel/power/main.c
+++ b/kernel/power/main.c
@@ -17,9 +17,6 @@
DEFINE_MUTEX(pm_mutex);
-unsigned int pm_flags;
-EXPORT_SYMBOL(pm_flags);
-
#ifdef CONFIG_PM_SLEEP
/* Routines for PM-transition notifications */
@@ -227,7 +224,7 @@ power_attr(state);
* writing to 'state'. It first should read from 'wakeup_count' and store
* the read value. Then, after carrying out its own preparations for the system
* transition to a sleep state, it should write the stored value to
- * 'wakeup_count'. If that fails, at least one wakeup event has occured since
+ * 'wakeup_count'. If that fails, at least one wakeup event has occurred since
* 'wakeup_count' was read and 'state' should not be written to. Otherwise, it
* is allowed to write to 'state', but the transition will be aborted if there
* are any wakeup events detected after 'wakeup_count' was written to.
@@ -340,6 +337,7 @@ static int __init pm_init(void)
if (error)
return error;
hibernate_image_size_init();
+ hibernate_reserved_size_init();
power_kobj = kobject_create_and_add("power", NULL);
if (!power_kobj)
return -ENOMEM;
diff --git a/kernel/power/power.h b/kernel/power/power.h
index 03634be55f62..9a00a0a26280 100644
--- a/kernel/power/power.h
+++ b/kernel/power/power.h
@@ -15,6 +15,7 @@ struct swsusp_info {
#ifdef CONFIG_HIBERNATION
/* kernel/power/snapshot.c */
+extern void __init hibernate_reserved_size_init(void);
extern void __init hibernate_image_size_init(void);
#ifdef CONFIG_ARCH_HIBERNATION_HEADER
@@ -55,6 +56,7 @@ extern int hibernation_platform_enter(void);
#else /* !CONFIG_HIBERNATION */
+static inline void hibernate_reserved_size_init(void) {}
static inline void hibernate_image_size_init(void) {}
#endif /* !CONFIG_HIBERNATION */
@@ -72,6 +74,8 @@ static struct kobj_attribute _name##_attr = { \
/* Preferred image size in bytes (default 500 MB) */
extern unsigned long image_size;
+/* Size of memory reserved for drivers (default SPARE_PAGES x PAGE_SIZE) */
+extern unsigned long reserved_size;
extern int in_suspend;
extern dev_t swsusp_resume_device;
extern sector_t swsusp_resume_block;
diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c
index 64db648ff911..ace55889f702 100644
--- a/kernel/power/snapshot.c
+++ b/kernel/power/snapshot.c
@@ -41,6 +41,18 @@ static void swsusp_set_page_forbidden(struct page *);
static void swsusp_unset_page_forbidden(struct page *);
/*
+ * Number of bytes to reserve for memory allocations made by device drivers
+ * from their ->freeze() and ->freeze_noirq() callbacks so that they don't
+ * cause image creation to fail (tunable via /sys/power/reserved_size).
+ */
+unsigned long reserved_size;
+
+void __init hibernate_reserved_size_init(void)
+{
+ reserved_size = SPARE_PAGES * PAGE_SIZE;
+}
+
+/*
* Preferred image size in bytes (tunable via /sys/power/image_size).
* When it is set to N, swsusp will do its best to ensure the image
* size will not exceed N bytes, but if that is impossible, it will
@@ -1263,11 +1275,13 @@ static unsigned long minimum_image_size(unsigned long saveable)
* frame in use. We also need a number of page frames to be free during
* hibernation for allocations made while saving the image and for device
* drivers, in case they need to allocate memory from their hibernation
- * callbacks (these two numbers are given by PAGES_FOR_IO and SPARE_PAGES,
- * respectively, both of which are rough estimates). To make this happen, we
- * compute the total number of available page frames and allocate at least
+ * callbacks (these two numbers are given by PAGES_FOR_IO (which is a rough
+ * estimate) and reserverd_size divided by PAGE_SIZE (which is tunable through
+ * /sys/power/reserved_size, respectively). To make this happen, we compute the
+ * total number of available page frames and allocate at least
*
- * ([page frames total] + PAGES_FOR_IO + [metadata pages]) / 2 + 2 * SPARE_PAGES
+ * ([page frames total] + PAGES_FOR_IO + [metadata pages]) / 2
+ * + 2 * DIV_ROUND_UP(reserved_size, PAGE_SIZE)
*
* of them, which corresponds to the maximum size of a hibernation image.
*
@@ -1322,7 +1336,8 @@ int hibernate_preallocate_memory(void)
count -= totalreserve_pages;
/* Compute the maximum number of saveable pages to leave in memory. */
- max_size = (count - (size + PAGES_FOR_IO)) / 2 - 2 * SPARE_PAGES;
+ max_size = (count - (size + PAGES_FOR_IO)) / 2
+ - 2 * DIV_ROUND_UP(reserved_size, PAGE_SIZE);
/* Compute the desired number of image pages specified by image_size. */
size = DIV_ROUND_UP(image_size, PAGE_SIZE);
if (size > max_size)
diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c
index de6f86bfa303..1c41ba215419 100644
--- a/kernel/power/suspend.c
+++ b/kernel/power/suspend.c
@@ -22,6 +22,7 @@
#include <linux/mm.h>
#include <linux/slab.h>
#include <linux/suspend.h>
+#include <linux/syscore_ops.h>
#include <trace/events/power.h>
#include "power.h"
@@ -162,13 +163,13 @@ static int suspend_enter(suspend_state_t state)
arch_suspend_disable_irqs();
BUG_ON(!irqs_disabled());
- error = sysdev_suspend(PMSG_SUSPEND);
+ error = syscore_suspend();
if (!error) {
if (!(suspend_test(TEST_CORE) || pm_wakeup_pending())) {
error = suspend_ops->enter(state);
events_check_enabled = false;
}
- sysdev_resume();
+ syscore_resume();
}
arch_suspend_enable_irqs();
@@ -209,7 +210,6 @@ int suspend_devices_and_enter(suspend_state_t state)
goto Close;
}
suspend_console();
- pm_restrict_gfp_mask();
suspend_test_start();
error = dpm_suspend_start(PMSG_SUSPEND);
if (error) {
@@ -220,13 +220,12 @@ int suspend_devices_and_enter(suspend_state_t state)
if (suspend_test(TEST_DEVICES))
goto Recover_platform;
- suspend_enter(state);
+ error = suspend_enter(state);
Resume_devices:
suspend_test_start();
dpm_resume_end(PMSG_RESUME);
suspend_test_finish("resume devices");
- pm_restore_gfp_mask();
resume_console();
Close:
if (suspend_ops->end)
@@ -287,7 +286,9 @@ int enter_state(suspend_state_t state)
goto Finish;
pr_debug("PM: Entering %s sleep\n", pm_states[state]);
+ pm_restrict_gfp_mask();
error = suspend_devices_and_enter(state);
+ pm_restore_gfp_mask();
Finish:
pr_debug("PM: Finishing wakeup.\n");
diff --git a/kernel/power/user.c b/kernel/power/user.c
index c36c3b9e8a84..7d02d33be699 100644
--- a/kernel/power/user.c
+++ b/kernel/power/user.c
@@ -135,8 +135,10 @@ static int snapshot_release(struct inode *inode, struct file *filp)
free_basic_memory_bitmaps();
data = filp->private_data;
free_all_swap_pages(data->swap);
- if (data->frozen)
+ if (data->frozen) {
+ pm_restore_gfp_mask();
thaw_processes();
+ }
pm_notifier_call_chain(data->mode == O_RDONLY ?
PM_POST_HIBERNATION : PM_POST_RESTORE);
atomic_inc(&snapshot_device_available);
@@ -379,6 +381,7 @@ static long snapshot_ioctl(struct file *filp, unsigned int cmd,
* PM_HIBERNATION_PREPARE
*/
error = suspend_devices_and_enter(PM_SUSPEND_MEM);
+ data->ready = 0;
break;
case SNAPSHOT_PLATFORM_SUPPORT:
diff --git a/kernel/printk.c b/kernel/printk.c
index 36231525e22f..35185392173f 100644
--- a/kernel/printk.c
+++ b/kernel/printk.c
@@ -31,6 +31,7 @@
#include <linux/smp.h>
#include <linux/security.h>
#include <linux/bootmem.h>
+#include <linux/memblock.h>
#include <linux/syscalls.h>
#include <linux/kexec.h>
#include <linux/kdb.h>
@@ -53,7 +54,7 @@ void asmlinkage __attribute__((weak)) early_printk(const char *fmt, ...)
#define __LOG_BUF_LEN (1 << CONFIG_LOG_BUF_SHIFT)
/* printk's without a loglevel use this.. */
-#define DEFAULT_MESSAGE_LOGLEVEL 4 /* KERN_WARNING */
+#define DEFAULT_MESSAGE_LOGLEVEL CONFIG_DEFAULT_MESSAGE_LOGLEVEL
/* We show everything that is MORE important than this.. */
#define MINIMUM_CONSOLE_LOGLEVEL 1 /* Minimum loglevel we let people use */
@@ -113,6 +114,11 @@ static unsigned con_start; /* Index into log_buf: next char to be sent to consol
static unsigned log_end; /* Index into log_buf: most-recently-written-char + 1 */
/*
+ * If exclusive_console is non-NULL then only this console is to be printed to.
+ */
+static struct console *exclusive_console;
+
+/*
* Array of consoles built from command line options (console=)
*/
struct console_cmdline
@@ -162,46 +168,74 @@ void log_buf_kexec_setup(void)
}
#endif
+/* requested log_buf_len from kernel cmdline */
+static unsigned long __initdata new_log_buf_len;
+
+/* save requested log_buf_len since it's too early to process it */
static int __init log_buf_len_setup(char *str)
{
unsigned size = memparse(str, &str);
- unsigned long flags;
if (size)
size = roundup_pow_of_two(size);
- if (size > log_buf_len) {
- unsigned start, dest_idx, offset;
- char *new_log_buf;
+ if (size > log_buf_len)
+ new_log_buf_len = size;
- new_log_buf = alloc_bootmem(size);
- if (!new_log_buf) {
- printk(KERN_WARNING "log_buf_len: allocation failed\n");
- goto out;
- }
+ return 0;
+}
+early_param("log_buf_len", log_buf_len_setup);
- spin_lock_irqsave(&logbuf_lock, flags);
- log_buf_len = size;
- log_buf = new_log_buf;
-
- offset = start = min(con_start, log_start);
- dest_idx = 0;
- while (start != log_end) {
- log_buf[dest_idx] = __log_buf[start & (__LOG_BUF_LEN - 1)];
- start++;
- dest_idx++;
- }
- log_start -= offset;
- con_start -= offset;
- log_end -= offset;
- spin_unlock_irqrestore(&logbuf_lock, flags);
+void __init setup_log_buf(int early)
+{
+ unsigned long flags;
+ unsigned start, dest_idx, offset;
+ char *new_log_buf;
+ int free;
+
+ if (!new_log_buf_len)
+ return;
+
+ if (early) {
+ unsigned long mem;
+
+ mem = memblock_alloc(new_log_buf_len, PAGE_SIZE);
+ if (mem == MEMBLOCK_ERROR)
+ return;
+ new_log_buf = __va(mem);
+ } else {
+ new_log_buf = alloc_bootmem_nopanic(new_log_buf_len);
+ }
- printk(KERN_NOTICE "log_buf_len: %d\n", log_buf_len);
+ if (unlikely(!new_log_buf)) {
+ pr_err("log_buf_len: %ld bytes not available\n",
+ new_log_buf_len);
+ return;
}
-out:
- return 1;
-}
-__setup("log_buf_len=", log_buf_len_setup);
+ spin_lock_irqsave(&logbuf_lock, flags);
+ log_buf_len = new_log_buf_len;
+ log_buf = new_log_buf;
+ new_log_buf_len = 0;
+ free = __LOG_BUF_LEN - log_end;
+
+ offset = start = min(con_start, log_start);
+ dest_idx = 0;
+ while (start != log_end) {
+ unsigned log_idx_mask = start & (__LOG_BUF_LEN - 1);
+
+ log_buf[dest_idx] = __log_buf[log_idx_mask];
+ start++;
+ dest_idx++;
+ }
+ log_start -= offset;
+ con_start -= offset;
+ log_end -= offset;
+ spin_unlock_irqrestore(&logbuf_lock, flags);
+
+ pr_info("log_buf_len: %d\n", log_buf_len);
+ pr_info("early log buf free: %d(%d%%)\n",
+ free, (free * 100) / __LOG_BUF_LEN);
+}
#ifdef CONFIG_BOOT_PRINTK_DELAY
@@ -476,6 +510,8 @@ static void __call_console_drivers(unsigned start, unsigned end)
struct console *con;
for_each_console(con) {
+ if (exclusive_console && con != exclusive_console)
+ continue;
if ((con->flags & CON_ENABLED) && con->write &&
(cpu_online(smp_processor_id()) ||
(con->flags & CON_ANYTIME)))
@@ -515,6 +551,71 @@ static void _call_console_drivers(unsigned start,
}
/*
+ * Parse the syslog header <[0-9]*>. The decimal value represents 32bit, the
+ * lower 3 bit are the log level, the rest are the log facility. In case
+ * userspace passes usual userspace syslog messages to /dev/kmsg or
+ * /dev/ttyprintk, the log prefix might contain the facility. Printk needs
+ * to extract the correct log level for in-kernel processing, and not mangle
+ * the original value.
+ *
+ * If a prefix is found, the length of the prefix is returned. If 'level' is
+ * passed, it will be filled in with the log level without a possible facility
+ * value. If 'special' is passed, the special printk prefix chars are accepted
+ * and returned. If no valid header is found, 0 is returned and the passed
+ * variables are not touched.
+ */
+static size_t log_prefix(const char *p, unsigned int *level, char *special)
+{
+ unsigned int lev = 0;
+ char sp = '\0';
+ size_t len;
+
+ if (p[0] != '<' || !p[1])
+ return 0;
+ if (p[2] == '>') {
+ /* usual single digit level number or special char */
+ switch (p[1]) {
+ case '0' ... '7':
+ lev = p[1] - '0';
+ break;
+ case 'c': /* KERN_CONT */
+ case 'd': /* KERN_DEFAULT */
+ sp = p[1];
+ break;
+ default:
+ return 0;
+ }
+ len = 3;
+ } else {
+ /* multi digit including the level and facility number */
+ char *endp = NULL;
+
+ if (p[1] < '0' && p[1] > '9')
+ return 0;
+
+ lev = (simple_strtoul(&p[1], &endp, 10) & 7);
+ if (endp == NULL || endp[0] != '>')
+ return 0;
+ len = (endp + 1) - p;
+ }
+
+ /* do not accept special char if not asked for */
+ if (sp && !special)
+ return 0;
+
+ if (special) {
+ *special = sp;
+ /* return special char, do not touch level */
+ if (sp)
+ return len;
+ }
+
+ if (level)
+ *level = lev;
+ return len;
+}
+
+/*
* Call the console drivers, asking them to write out
* log_buf[start] to log_buf[end - 1].
* The console_lock must be held.
@@ -529,13 +630,9 @@ static void call_console_drivers(unsigned start, unsigned end)
cur_index = start;
start_print = start;
while (cur_index != end) {
- if (msg_level < 0 && ((end - cur_index) > 2) &&
- LOG_BUF(cur_index + 0) == '<' &&
- LOG_BUF(cur_index + 1) >= '0' &&
- LOG_BUF(cur_index + 1) <= '7' &&
- LOG_BUF(cur_index + 2) == '>') {
- msg_level = LOG_BUF(cur_index + 1) - '0';
- cur_index += 3;
+ if (msg_level < 0 && ((end - cur_index) > 2)) {
+ /* strip log prefix */
+ cur_index += log_prefix(&LOG_BUF(cur_index), &msg_level, NULL);
start_print = cur_index;
}
while (cur_index != end) {
@@ -733,6 +830,8 @@ asmlinkage int vprintk(const char *fmt, va_list args)
unsigned long flags;
int this_cpu;
char *p;
+ size_t plen;
+ char special;
boot_delay_msec();
printk_delay();
@@ -773,45 +872,52 @@ asmlinkage int vprintk(const char *fmt, va_list args)
printed_len += vscnprintf(printk_buf + printed_len,
sizeof(printk_buf) - printed_len, fmt, args);
-
p = printk_buf;
- /* Do we have a loglevel in the string? */
- if (p[0] == '<') {
- unsigned char c = p[1];
- if (c && p[2] == '>') {
- switch (c) {
- case '0' ... '7': /* loglevel */
- current_log_level = c - '0';
- /* Fallthrough - make sure we're on a new line */
- case 'd': /* KERN_DEFAULT */
- if (!new_text_line) {
- emit_log_char('\n');
- new_text_line = 1;
- }
- /* Fallthrough - skip the loglevel */
- case 'c': /* KERN_CONT */
- p += 3;
- break;
+ /* Read log level and handle special printk prefix */
+ plen = log_prefix(p, &current_log_level, &special);
+ if (plen) {
+ p += plen;
+
+ switch (special) {
+ case 'c': /* Strip <c> KERN_CONT, continue line */
+ plen = 0;
+ break;
+ case 'd': /* Strip <d> KERN_DEFAULT, start new line */
+ plen = 0;
+ default:
+ if (!new_text_line) {
+ emit_log_char('\n');
+ new_text_line = 1;
}
}
}
/*
- * Copy the output into log_buf. If the caller didn't provide
- * appropriate log level tags, we insert them here
+ * Copy the output into log_buf. If the caller didn't provide
+ * the appropriate log prefix, we insert them here
*/
- for ( ; *p; p++) {
+ for (; *p; p++) {
if (new_text_line) {
- /* Always output the token */
- emit_log_char('<');
- emit_log_char(current_log_level + '0');
- emit_log_char('>');
- printed_len += 3;
new_text_line = 0;
+ if (plen) {
+ /* Copy original log prefix */
+ int i;
+
+ for (i = 0; i < plen; i++)
+ emit_log_char(printk_buf[i]);
+ printed_len += plen;
+ } else {
+ /* Add log prefix */
+ emit_log_char('<');
+ emit_log_char(current_log_level + '0');
+ emit_log_char('>');
+ printed_len += 3;
+ }
+
if (printk_time) {
- /* Follow the token with the time */
+ /* Add the current time stamp */
char tbuf[50], *tp;
unsigned tlen;
unsigned long long t;
@@ -1160,6 +1266,11 @@ void console_unlock(void)
local_irq_restore(flags);
}
console_locked = 0;
+
+ /* Release the exclusive_console once it is used */
+ if (unlikely(exclusive_console))
+ exclusive_console = NULL;
+
up(&console_sem);
spin_unlock_irqrestore(&logbuf_lock, flags);
if (wake_klogd)
@@ -1246,6 +1357,18 @@ void console_start(struct console *console)
}
EXPORT_SYMBOL(console_start);
+static int __read_mostly keep_bootcon;
+
+static int __init keep_bootcon_setup(char *str)
+{
+ keep_bootcon = 1;
+ printk(KERN_INFO "debug: skip boot console de-registration.\n");
+
+ return 0;
+}
+
+early_param("keep_bootcon", keep_bootcon_setup);
+
/*
* The console driver calls this routine during kernel initialization
* to register the console printing procedure with printk() and to
@@ -1382,6 +1505,12 @@ void register_console(struct console *newcon)
spin_lock_irqsave(&logbuf_lock, flags);
con_start = log_start;
spin_unlock_irqrestore(&logbuf_lock, flags);
+ /*
+ * We're about to replay the log buffer. Only do this to the
+ * just-registered console to avoid excessive message spam to
+ * the already-registered consoles.
+ */
+ exclusive_console = newcon;
}
console_unlock();
console_sysfs_notify();
@@ -1393,7 +1522,9 @@ void register_console(struct console *newcon)
* users know there might be something in the kernel's log buffer that
* went to the bootconsole (that they do not see on the real console)
*/
- if (bcon && ((newcon->flags & (CON_CONSDEV | CON_BOOT)) == CON_CONSDEV)) {
+ if (bcon &&
+ ((newcon->flags & (CON_CONSDEV | CON_BOOT)) == CON_CONSDEV) &&
+ !keep_bootcon) {
/* we need to iterate through twice, to make sure we print
* everything out, before we unregister the console(s)
*/
diff --git a/kernel/profile.c b/kernel/profile.c
index 66f841b7fbd3..961b389fe52f 100644
--- a/kernel/profile.c
+++ b/kernel/profile.c
@@ -126,11 +126,9 @@ int __ref profile_init(void)
if (prof_buffer)
return 0;
- prof_buffer = vmalloc(buffer_bytes);
- if (prof_buffer) {
- memset(prof_buffer, 0, buffer_bytes);
+ prof_buffer = vzalloc(buffer_bytes);
+ if (prof_buffer)
return 0;
- }
free_cpumask_var(prof_cpu_mask);
return -ENOMEM;
@@ -305,14 +303,12 @@ static void profile_discard_flip_buffers(void)
mutex_unlock(&profile_flip_mutex);
}
-void profile_hits(int type, void *__pc, unsigned int nr_hits)
+static void do_profile_hits(int type, void *__pc, unsigned int nr_hits)
{
unsigned long primary, secondary, flags, pc = (unsigned long)__pc;
int i, j, cpu;
struct profile_hit *hits;
- if (prof_on != type || !prof_buffer)
- return;
pc = min((pc - (unsigned long)_stext) >> prof_shift, prof_len - 1);
i = primary = (pc & (NR_PROFILE_GRP - 1)) << PROFILE_GRPSHIFT;
secondary = (~(pc << 1) & (NR_PROFILE_GRP - 1)) << PROFILE_GRPSHIFT;
@@ -419,16 +415,20 @@ out_free:
#define profile_discard_flip_buffers() do { } while (0)
#define profile_cpu_callback NULL
-void profile_hits(int type, void *__pc, unsigned int nr_hits)
+static void do_profile_hits(int type, void *__pc, unsigned int nr_hits)
{
unsigned long pc;
-
- if (prof_on != type || !prof_buffer)
- return;
pc = ((unsigned long)__pc - (unsigned long)_stext) >> prof_shift;
atomic_add(nr_hits, &prof_buffer[min(pc, prof_len - 1)]);
}
#endif /* !CONFIG_SMP */
+
+void profile_hits(int type, void *__pc, unsigned int nr_hits)
+{
+ if (prof_on != type || !prof_buffer)
+ return;
+ do_profile_hits(type, __pc, nr_hits);
+}
EXPORT_SYMBOL_GPL(profile_hits);
void profile_tick(int type)
diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index e2302e40b360..2df115790cd9 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -22,6 +22,7 @@
#include <linux/syscalls.h>
#include <linux/uaccess.h>
#include <linux/regset.h>
+#include <linux/hw_breakpoint.h>
/*
@@ -37,35 +38,33 @@ void __ptrace_link(struct task_struct *child, struct task_struct *new_parent)
child->parent = new_parent;
}
-/*
- * Turn a tracing stop into a normal stop now, since with no tracer there
- * would be no way to wake it up with SIGCONT or SIGKILL. If there was a
- * signal sent that would resume the child, but didn't because it was in
- * TASK_TRACED, resume it now.
- * Requires that irqs be disabled.
- */
-static void ptrace_untrace(struct task_struct *child)
-{
- spin_lock(&child->sighand->siglock);
- if (task_is_traced(child)) {
- /*
- * If the group stop is completed or in progress,
- * this thread was already counted as stopped.
- */
- if (child->signal->flags & SIGNAL_STOP_STOPPED ||
- child->signal->group_stop_count)
- __set_task_state(child, TASK_STOPPED);
- else
- signal_wake_up(child, 1);
- }
- spin_unlock(&child->sighand->siglock);
-}
-
-/*
- * unptrace a task: move it back to its original parent and
- * remove it from the ptrace list.
+/**
+ * __ptrace_unlink - unlink ptracee and restore its execution state
+ * @child: ptracee to be unlinked
*
- * Must be called with the tasklist lock write-held.
+ * Remove @child from the ptrace list, move it back to the original parent,
+ * and restore the execution state so that it conforms to the group stop
+ * state.
+ *
+ * Unlinking can happen via two paths - explicit PTRACE_DETACH or ptracer
+ * exiting. For PTRACE_DETACH, unless the ptracee has been killed between
+ * ptrace_check_attach() and here, it's guaranteed to be in TASK_TRACED.
+ * If the ptracer is exiting, the ptracee can be in any state.
+ *
+ * After detach, the ptracee should be in a state which conforms to the
+ * group stop. If the group is stopped or in the process of stopping, the
+ * ptracee should be put into TASK_STOPPED; otherwise, it should be woken
+ * up from TASK_TRACED.
+ *
+ * If the ptracee is in TASK_TRACED and needs to be moved to TASK_STOPPED,
+ * it goes through TRACED -> RUNNING -> STOPPED transition which is similar
+ * to but in the opposite direction of what happens while attaching to a
+ * stopped task. However, in this direction, the intermediate RUNNING
+ * state is not hidden even from the current ptracer and if it immediately
+ * re-attaches and performs a WNOHANG wait(2), it may fail.
+ *
+ * CONTEXT:
+ * write_lock_irq(tasklist_lock)
*/
void __ptrace_unlink(struct task_struct *child)
{
@@ -75,8 +74,27 @@ void __ptrace_unlink(struct task_struct *child)
child->parent = child->real_parent;
list_del_init(&child->ptrace_entry);
- if (task_is_traced(child))
- ptrace_untrace(child);
+ spin_lock(&child->sighand->siglock);
+
+ /*
+ * Reinstate GROUP_STOP_PENDING if group stop is in effect and
+ * @child isn't dead.
+ */
+ if (!(child->flags & PF_EXITING) &&
+ (child->signal->flags & SIGNAL_STOP_STOPPED ||
+ child->signal->group_stop_count))
+ child->group_stop |= GROUP_STOP_PENDING;
+
+ /*
+ * If transition to TASK_STOPPED is pending or in TASK_TRACED, kick
+ * @child in the butt. Note that @resume should be used iff @child
+ * is in TASK_TRACED; otherwise, we might unduly disrupt
+ * TASK_KILLABLE sleeps.
+ */
+ if (child->group_stop & GROUP_STOP_PENDING || task_is_traced(child))
+ signal_wake_up(child, task_is_traced(child));
+
+ spin_unlock(&child->sighand->siglock);
}
/*
@@ -95,16 +113,14 @@ int ptrace_check_attach(struct task_struct *child, int kill)
*/
read_lock(&tasklist_lock);
if ((child->ptrace & PT_PTRACED) && child->parent == current) {
- ret = 0;
/*
* child->sighand can't be NULL, release_task()
* does ptrace_unlink() before __exit_signal().
*/
spin_lock_irq(&child->sighand->siglock);
- if (task_is_stopped(child))
- child->state = TASK_TRACED;
- else if (!task_is_traced(child) && !kill)
- ret = -ESRCH;
+ WARN_ON_ONCE(task_is_stopped(child));
+ if (task_is_traced(child) || kill)
+ ret = 0;
spin_unlock_irq(&child->sighand->siglock);
}
read_unlock(&tasklist_lock);
@@ -134,21 +150,24 @@ int __ptrace_may_access(struct task_struct *task, unsigned int mode)
return 0;
rcu_read_lock();
tcred = __task_cred(task);
- if ((cred->uid != tcred->euid ||
- cred->uid != tcred->suid ||
- cred->uid != tcred->uid ||
- cred->gid != tcred->egid ||
- cred->gid != tcred->sgid ||
- cred->gid != tcred->gid) &&
- !capable(CAP_SYS_PTRACE)) {
- rcu_read_unlock();
- return -EPERM;
- }
+ if (cred->user->user_ns == tcred->user->user_ns &&
+ (cred->uid == tcred->euid &&
+ cred->uid == tcred->suid &&
+ cred->uid == tcred->uid &&
+ cred->gid == tcred->egid &&
+ cred->gid == tcred->sgid &&
+ cred->gid == tcred->gid))
+ goto ok;
+ if (ns_capable(tcred->user->user_ns, CAP_SYS_PTRACE))
+ goto ok;
+ rcu_read_unlock();
+ return -EPERM;
+ok:
rcu_read_unlock();
smp_rmb();
if (task->mm)
dumpable = get_dumpable(task->mm);
- if (!dumpable && !capable(CAP_SYS_PTRACE))
+ if (!dumpable && !task_ns_capable(task, CAP_SYS_PTRACE))
return -EPERM;
return security_ptrace_access_check(task, mode);
@@ -165,6 +184,7 @@ bool ptrace_may_access(struct task_struct *task, unsigned int mode)
static int ptrace_attach(struct task_struct *task)
{
+ bool wait_trap = false;
int retval;
audit_ptrace(task);
@@ -198,18 +218,48 @@ static int ptrace_attach(struct task_struct *task)
goto unlock_tasklist;
task->ptrace = PT_PTRACED;
- if (capable(CAP_SYS_PTRACE))
+ if (task_ns_capable(task, CAP_SYS_PTRACE))
task->ptrace |= PT_PTRACE_CAP;
__ptrace_link(task, current);
send_sig_info(SIGSTOP, SEND_SIG_FORCED, task);
+ spin_lock(&task->sighand->siglock);
+
+ /*
+ * If the task is already STOPPED, set GROUP_STOP_PENDING and
+ * TRAPPING, and kick it so that it transits to TRACED. TRAPPING
+ * will be cleared if the child completes the transition or any
+ * event which clears the group stop states happens. We'll wait
+ * for the transition to complete before returning from this
+ * function.
+ *
+ * This hides STOPPED -> RUNNING -> TRACED transition from the
+ * attaching thread but a different thread in the same group can
+ * still observe the transient RUNNING state. IOW, if another
+ * thread's WNOHANG wait(2) on the stopped tracee races against
+ * ATTACH, the wait(2) may fail due to the transient RUNNING.
+ *
+ * The following task_is_stopped() test is safe as both transitions
+ * in and out of STOPPED are protected by siglock.
+ */
+ if (task_is_stopped(task)) {
+ task->group_stop |= GROUP_STOP_PENDING | GROUP_STOP_TRAPPING;
+ signal_wake_up(task, 1);
+ wait_trap = true;
+ }
+
+ spin_unlock(&task->sighand->siglock);
+
retval = 0;
unlock_tasklist:
write_unlock_irq(&tasklist_lock);
unlock_creds:
mutex_unlock(&task->signal->cred_guard_mutex);
out:
+ if (wait_trap)
+ wait_event(current->signal->wait_chldexit,
+ !(task->group_stop & GROUP_STOP_TRAPPING));
return retval;
}
@@ -312,8 +362,6 @@ static int ptrace_detach(struct task_struct *child, unsigned int data)
if (child->ptrace) {
child->exit_code = data;
dead = __ptrace_detach(current, child);
- if (!child->exit_state)
- wake_up_state(child, TASK_TRACED | TASK_STOPPED);
}
write_unlock_irq(&tasklist_lock);
@@ -514,7 +562,7 @@ static int ptrace_resume(struct task_struct *child, long request,
}
child->exit_code = data;
- wake_up_process(child);
+ wake_up_state(child, __TASK_TRACED);
return 0;
}
@@ -876,3 +924,19 @@ asmlinkage long compat_sys_ptrace(compat_long_t request, compat_long_t pid,
return ret;
}
#endif /* CONFIG_COMPAT */
+
+#ifdef CONFIG_HAVE_HW_BREAKPOINT
+int ptrace_get_breakpoints(struct task_struct *tsk)
+{
+ if (atomic_inc_not_zero(&tsk->ptrace_bp_refcnt))
+ return 0;
+
+ return -1;
+}
+
+void ptrace_put_breakpoints(struct task_struct *tsk)
+{
+ if (atomic_dec_and_test(&tsk->ptrace_bp_refcnt))
+ flush_ptrace_hw_breakpoint(tsk);
+}
+#endif /* CONFIG_HAVE_HW_BREAKPOINT */
diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c
index a23a57a976d1..7784bd216b6a 100644
--- a/kernel/rcupdate.c
+++ b/kernel/rcupdate.c
@@ -142,10 +142,17 @@ static int rcuhead_fixup_init(void *addr, enum debug_obj_state state)
* Ensure that queued callbacks are all executed.
* If we detect that we are nested in a RCU read-side critical
* section, we should simply fail, otherwise we would deadlock.
+ * In !PREEMPT configurations, there is no way to tell if we are
+ * in a RCU read-side critical section or not, so we never
+ * attempt any fixup and just print a warning.
*/
+#ifndef CONFIG_PREEMPT
+ WARN_ON_ONCE(1);
+ return 0;
+#endif
if (rcu_preempt_depth() != 0 || preempt_count() != 0 ||
irqs_disabled()) {
- WARN_ON(1);
+ WARN_ON_ONCE(1);
return 0;
}
rcu_barrier();
@@ -184,10 +191,17 @@ static int rcuhead_fixup_activate(void *addr, enum debug_obj_state state)
* Ensure that queued callbacks are all executed.
* If we detect that we are nested in a RCU read-side critical
* section, we should simply fail, otherwise we would deadlock.
+ * In !PREEMPT configurations, there is no way to tell if we are
+ * in a RCU read-side critical section or not, so we never
+ * attempt any fixup and just print a warning.
*/
+#ifndef CONFIG_PREEMPT
+ WARN_ON_ONCE(1);
+ return 0;
+#endif
if (rcu_preempt_depth() != 0 || preempt_count() != 0 ||
irqs_disabled()) {
- WARN_ON(1);
+ WARN_ON_ONCE(1);
return 0;
}
rcu_barrier();
@@ -214,14 +228,17 @@ static int rcuhead_fixup_free(void *addr, enum debug_obj_state state)
* Ensure that queued callbacks are all executed.
* If we detect that we are nested in a RCU read-side critical
* section, we should simply fail, otherwise we would deadlock.
+ * In !PREEMPT configurations, there is no way to tell if we are
+ * in a RCU read-side critical section or not, so we never
+ * attempt any fixup and just print a warning.
*/
#ifndef CONFIG_PREEMPT
- WARN_ON(1);
+ WARN_ON_ONCE(1);
return 0;
-#else
+#endif
if (rcu_preempt_depth() != 0 || preempt_count() != 0 ||
irqs_disabled()) {
- WARN_ON(1);
+ WARN_ON_ONCE(1);
return 0;
}
rcu_barrier();
@@ -229,7 +246,6 @@ static int rcuhead_fixup_free(void *addr, enum debug_obj_state state)
rcu_barrier_bh();
debug_object_free(head, &rcuhead_debug_descr);
return 1;
-#endif
default:
return 0;
}
diff --git a/kernel/rcutiny.c b/kernel/rcutiny.c
index 0c343b9a46d5..7bbac7d0f5ab 100644
--- a/kernel/rcutiny.c
+++ b/kernel/rcutiny.c
@@ -35,15 +35,16 @@
#include <linux/init.h>
#include <linux/time.h>
#include <linux/cpu.h>
+#include <linux/prefetch.h>
/* Controls for rcu_kthread() kthread, replacing RCU_SOFTIRQ used previously. */
static struct task_struct *rcu_kthread_task;
static DECLARE_WAIT_QUEUE_HEAD(rcu_kthread_wq);
static unsigned long have_rcu_kthread_work;
-static void invoke_rcu_kthread(void);
/* Forward declarations for rcutiny_plugin.h. */
struct rcu_ctrlblk;
+static void invoke_rcu_kthread(void);
static void rcu_process_callbacks(struct rcu_ctrlblk *rcp);
static int rcu_kthread(void *arg);
static void __call_rcu(struct rcu_head *head,
@@ -79,36 +80,45 @@ void rcu_exit_nohz(void)
#endif /* #ifdef CONFIG_NO_HZ */
/*
- * Helper function for rcu_qsctr_inc() and rcu_bh_qsctr_inc().
- * Also disable irqs to avoid confusion due to interrupt handlers
+ * Helper function for rcu_sched_qs() and rcu_bh_qs().
+ * Also irqs are disabled to avoid confusion due to interrupt handlers
* invoking call_rcu().
*/
static int rcu_qsctr_help(struct rcu_ctrlblk *rcp)
{
- unsigned long flags;
-
- local_irq_save(flags);
if (rcp->rcucblist != NULL &&
rcp->donetail != rcp->curtail) {
rcp->donetail = rcp->curtail;
- local_irq_restore(flags);
return 1;
}
- local_irq_restore(flags);
return 0;
}
/*
+ * Wake up rcu_kthread() to process callbacks now eligible for invocation
+ * or to boost readers.
+ */
+static void invoke_rcu_kthread(void)
+{
+ have_rcu_kthread_work = 1;
+ wake_up(&rcu_kthread_wq);
+}
+
+/*
* Record an rcu quiescent state. And an rcu_bh quiescent state while we
* are at it, given that any rcu quiescent state is also an rcu_bh
* quiescent state. Use "+" instead of "||" to defeat short circuiting.
*/
void rcu_sched_qs(int cpu)
{
+ unsigned long flags;
+
+ local_irq_save(flags);
if (rcu_qsctr_help(&rcu_sched_ctrlblk) +
rcu_qsctr_help(&rcu_bh_ctrlblk))
invoke_rcu_kthread();
+ local_irq_restore(flags);
}
/*
@@ -116,8 +126,12 @@ void rcu_sched_qs(int cpu)
*/
void rcu_bh_qs(int cpu)
{
+ unsigned long flags;
+
+ local_irq_save(flags);
if (rcu_qsctr_help(&rcu_bh_ctrlblk))
invoke_rcu_kthread();
+ local_irq_restore(flags);
}
/*
@@ -167,7 +181,7 @@ static void rcu_process_callbacks(struct rcu_ctrlblk *rcp)
prefetch(next);
debug_rcu_head_unqueue(list);
local_bh_disable();
- list->func(list);
+ __rcu_reclaim(list);
local_bh_enable();
list = next;
RCU_TRACE(cb_count++);
@@ -208,20 +222,6 @@ static int rcu_kthread(void *arg)
}
/*
- * Wake up rcu_kthread() to process callbacks now eligible for invocation
- * or to boost readers.
- */
-static void invoke_rcu_kthread(void)
-{
- unsigned long flags;
-
- local_irq_save(flags);
- have_rcu_kthread_work = 1;
- wake_up(&rcu_kthread_wq);
- local_irq_restore(flags);
-}
-
-/*
* Wait for a grace period to elapse. But it is illegal to invoke
* synchronize_sched() from within an RCU read-side critical section.
* Therefore, any legal call to synchronize_sched() is a quiescent
diff --git a/kernel/rcutiny_plugin.h b/kernel/rcutiny_plugin.h
index 015abaea962a..f259c676195f 100644
--- a/kernel/rcutiny_plugin.h
+++ b/kernel/rcutiny_plugin.h
@@ -100,23 +100,28 @@ struct rcu_preempt_ctrlblk {
u8 completed; /* Last grace period completed. */
/* If all three are equal, RCU is idle. */
#ifdef CONFIG_RCU_BOOST
- s8 boosted_this_gp; /* Has boosting already happened? */
unsigned long boost_time; /* When to start boosting (jiffies) */
#endif /* #ifdef CONFIG_RCU_BOOST */
#ifdef CONFIG_RCU_TRACE
unsigned long n_grace_periods;
#ifdef CONFIG_RCU_BOOST
unsigned long n_tasks_boosted;
+ /* Total number of tasks boosted. */
unsigned long n_exp_boosts;
+ /* Number of tasks boosted for expedited GP. */
unsigned long n_normal_boosts;
- unsigned long n_normal_balk_blkd_tasks;
- unsigned long n_normal_balk_gp_tasks;
- unsigned long n_normal_balk_boost_tasks;
- unsigned long n_normal_balk_boosted;
- unsigned long n_normal_balk_notyet;
- unsigned long n_normal_balk_nos;
- unsigned long n_exp_balk_blkd_tasks;
- unsigned long n_exp_balk_nos;
+ /* Number of tasks boosted for normal GP. */
+ unsigned long n_balk_blkd_tasks;
+ /* Refused to boost: no blocked tasks. */
+ unsigned long n_balk_exp_gp_tasks;
+ /* Refused to boost: nothing blocking GP. */
+ unsigned long n_balk_boost_tasks;
+ /* Refused to boost: already boosting. */
+ unsigned long n_balk_notyet;
+ /* Refused to boost: not yet time. */
+ unsigned long n_balk_nos;
+ /* Refused to boost: not sure why, though. */
+ /* This can happen due to race conditions. */
#endif /* #ifdef CONFIG_RCU_BOOST */
#endif /* #ifdef CONFIG_RCU_TRACE */
};
@@ -201,7 +206,6 @@ static struct list_head *rcu_next_node_entry(struct task_struct *t)
#ifdef CONFIG_RCU_BOOST
static void rcu_initiate_boost_trace(void);
-static void rcu_initiate_exp_boost_trace(void);
#endif /* #ifdef CONFIG_RCU_BOOST */
/*
@@ -219,41 +223,21 @@ static void show_tiny_preempt_stats(struct seq_file *m)
"N."[!rcu_preempt_ctrlblk.gp_tasks],
"E."[!rcu_preempt_ctrlblk.exp_tasks]);
#ifdef CONFIG_RCU_BOOST
- seq_printf(m, " ttb=%c btg=",
- "B."[!rcu_preempt_ctrlblk.boost_tasks]);
- switch (rcu_preempt_ctrlblk.boosted_this_gp) {
- case -1:
- seq_puts(m, "exp");
- break;
- case 0:
- seq_puts(m, "no");
- break;
- case 1:
- seq_puts(m, "begun");
- break;
- case 2:
- seq_puts(m, "done");
- break;
- default:
- seq_printf(m, "?%d?", rcu_preempt_ctrlblk.boosted_this_gp);
- }
- seq_printf(m, " ntb=%lu neb=%lu nnb=%lu j=%04x bt=%04x\n",
+ seq_printf(m, "%sttb=%c ntb=%lu neb=%lu nnb=%lu j=%04x bt=%04x\n",
+ " ",
+ "B."[!rcu_preempt_ctrlblk.boost_tasks],
rcu_preempt_ctrlblk.n_tasks_boosted,
rcu_preempt_ctrlblk.n_exp_boosts,
rcu_preempt_ctrlblk.n_normal_boosts,
(int)(jiffies & 0xffff),
(int)(rcu_preempt_ctrlblk.boost_time & 0xffff));
- seq_printf(m, " %s: nt=%lu gt=%lu bt=%lu b=%lu ny=%lu nos=%lu\n",
- "normal balk",
- rcu_preempt_ctrlblk.n_normal_balk_blkd_tasks,
- rcu_preempt_ctrlblk.n_normal_balk_gp_tasks,
- rcu_preempt_ctrlblk.n_normal_balk_boost_tasks,
- rcu_preempt_ctrlblk.n_normal_balk_boosted,
- rcu_preempt_ctrlblk.n_normal_balk_notyet,
- rcu_preempt_ctrlblk.n_normal_balk_nos);
- seq_printf(m, " exp balk: bt=%lu nos=%lu\n",
- rcu_preempt_ctrlblk.n_exp_balk_blkd_tasks,
- rcu_preempt_ctrlblk.n_exp_balk_nos);
+ seq_printf(m, "%s: nt=%lu egt=%lu bt=%lu ny=%lu nos=%lu\n",
+ " balk",
+ rcu_preempt_ctrlblk.n_balk_blkd_tasks,
+ rcu_preempt_ctrlblk.n_balk_exp_gp_tasks,
+ rcu_preempt_ctrlblk.n_balk_boost_tasks,
+ rcu_preempt_ctrlblk.n_balk_notyet,
+ rcu_preempt_ctrlblk.n_balk_nos);
#endif /* #ifdef CONFIG_RCU_BOOST */
}
@@ -271,25 +255,59 @@ static int rcu_boost(void)
{
unsigned long flags;
struct rt_mutex mtx;
- struct list_head *np;
struct task_struct *t;
+ struct list_head *tb;
- if (rcu_preempt_ctrlblk.boost_tasks == NULL)
+ if (rcu_preempt_ctrlblk.boost_tasks == NULL &&
+ rcu_preempt_ctrlblk.exp_tasks == NULL)
return 0; /* Nothing to boost. */
+
raw_local_irq_save(flags);
- rcu_preempt_ctrlblk.boosted_this_gp++;
- t = container_of(rcu_preempt_ctrlblk.boost_tasks, struct task_struct,
- rcu_node_entry);
- np = rcu_next_node_entry(t);
+
+ /*
+ * Recheck with irqs disabled: all tasks in need of boosting
+ * might exit their RCU read-side critical sections on their own
+ * if we are preempted just before disabling irqs.
+ */
+ if (rcu_preempt_ctrlblk.boost_tasks == NULL &&
+ rcu_preempt_ctrlblk.exp_tasks == NULL) {
+ raw_local_irq_restore(flags);
+ return 0;
+ }
+
+ /*
+ * Preferentially boost tasks blocking expedited grace periods.
+ * This cannot starve the normal grace periods because a second
+ * expedited grace period must boost all blocked tasks, including
+ * those blocking the pre-existing normal grace period.
+ */
+ if (rcu_preempt_ctrlblk.exp_tasks != NULL) {
+ tb = rcu_preempt_ctrlblk.exp_tasks;
+ RCU_TRACE(rcu_preempt_ctrlblk.n_exp_boosts++);
+ } else {
+ tb = rcu_preempt_ctrlblk.boost_tasks;
+ RCU_TRACE(rcu_preempt_ctrlblk.n_normal_boosts++);
+ }
+ RCU_TRACE(rcu_preempt_ctrlblk.n_tasks_boosted++);
+
+ /*
+ * We boost task t by manufacturing an rt_mutex that appears to
+ * be held by task t. We leave a pointer to that rt_mutex where
+ * task t can find it, and task t will release the mutex when it
+ * exits its outermost RCU read-side critical section. Then
+ * simply acquiring this artificial rt_mutex will boost task
+ * t's priority. (Thanks to tglx for suggesting this approach!)
+ */
+ t = container_of(tb, struct task_struct, rcu_node_entry);
rt_mutex_init_proxy_locked(&mtx, t);
t->rcu_boost_mutex = &mtx;
t->rcu_read_unlock_special |= RCU_READ_UNLOCK_BOOSTED;
raw_local_irq_restore(flags);
rt_mutex_lock(&mtx);
- RCU_TRACE(rcu_preempt_ctrlblk.n_tasks_boosted++);
- rcu_preempt_ctrlblk.boosted_this_gp++;
- rt_mutex_unlock(&mtx);
- return rcu_preempt_ctrlblk.boost_tasks != NULL;
+ rt_mutex_unlock(&mtx); /* Keep lockdep happy. */
+
+ return rcu_preempt_ctrlblk.boost_tasks != NULL ||
+ rcu_preempt_ctrlblk.exp_tasks != NULL;
}
/*
@@ -304,42 +322,25 @@ static int rcu_boost(void)
*/
static int rcu_initiate_boost(void)
{
- if (!rcu_preempt_blocked_readers_cgp()) {
- RCU_TRACE(rcu_preempt_ctrlblk.n_normal_balk_blkd_tasks++);
+ if (!rcu_preempt_blocked_readers_cgp() &&
+ rcu_preempt_ctrlblk.exp_tasks == NULL) {
+ RCU_TRACE(rcu_preempt_ctrlblk.n_balk_exp_gp_tasks++);
return 0;
}
- if (rcu_preempt_ctrlblk.gp_tasks != NULL &&
- rcu_preempt_ctrlblk.boost_tasks == NULL &&
- rcu_preempt_ctrlblk.boosted_this_gp == 0 &&
- ULONG_CMP_GE(jiffies, rcu_preempt_ctrlblk.boost_time)) {
- rcu_preempt_ctrlblk.boost_tasks = rcu_preempt_ctrlblk.gp_tasks;
+ if (rcu_preempt_ctrlblk.exp_tasks != NULL ||
+ (rcu_preempt_ctrlblk.gp_tasks != NULL &&
+ rcu_preempt_ctrlblk.boost_tasks == NULL &&
+ ULONG_CMP_GE(jiffies, rcu_preempt_ctrlblk.boost_time))) {
+ if (rcu_preempt_ctrlblk.exp_tasks == NULL)
+ rcu_preempt_ctrlblk.boost_tasks =
+ rcu_preempt_ctrlblk.gp_tasks;
invoke_rcu_kthread();
- RCU_TRACE(rcu_preempt_ctrlblk.n_normal_boosts++);
} else
RCU_TRACE(rcu_initiate_boost_trace());
return 1;
}
-/*
- * Initiate boosting for an expedited grace period.
- */
-static void rcu_initiate_expedited_boost(void)
-{
- unsigned long flags;
-
- raw_local_irq_save(flags);
- if (!list_empty(&rcu_preempt_ctrlblk.blkd_tasks)) {
- rcu_preempt_ctrlblk.boost_tasks =
- rcu_preempt_ctrlblk.blkd_tasks.next;
- rcu_preempt_ctrlblk.boosted_this_gp = -1;
- invoke_rcu_kthread();
- RCU_TRACE(rcu_preempt_ctrlblk.n_exp_boosts++);
- } else
- RCU_TRACE(rcu_initiate_exp_boost_trace());
- raw_local_irq_restore(flags);
-}
-
-#define RCU_BOOST_DELAY_JIFFIES DIV_ROUND_UP(CONFIG_RCU_BOOST_DELAY * HZ, 1000);
+#define RCU_BOOST_DELAY_JIFFIES DIV_ROUND_UP(CONFIG_RCU_BOOST_DELAY * HZ, 1000)
/*
* Do priority-boost accounting for the start of a new grace period.
@@ -347,8 +348,6 @@ static void rcu_initiate_expedited_boost(void)
static void rcu_preempt_boost_start_gp(void)
{
rcu_preempt_ctrlblk.boost_time = jiffies + RCU_BOOST_DELAY_JIFFIES;
- if (rcu_preempt_ctrlblk.boosted_this_gp > 0)
- rcu_preempt_ctrlblk.boosted_this_gp = 0;
}
#else /* #ifdef CONFIG_RCU_BOOST */
@@ -372,13 +371,6 @@ static int rcu_initiate_boost(void)
}
/*
- * If there is no RCU priority boosting, we don't initiate expedited boosting.
- */
-static void rcu_initiate_expedited_boost(void)
-{
-}
-
-/*
* If there is no RCU priority boosting, nothing to do at grace-period start.
*/
static void rcu_preempt_boost_start_gp(void)
@@ -418,7 +410,7 @@ static void rcu_preempt_cpu_qs(void)
if (!rcu_preempt_gp_in_progress())
return;
/*
- * Check up on boosting. If there are no readers blocking the
+ * Check up on boosting. If there are readers blocking the
* current grace period, leave.
*/
if (rcu_initiate_boost())
@@ -578,7 +570,7 @@ static void rcu_read_unlock_special(struct task_struct *t)
empty = !rcu_preempt_blocked_readers_cgp();
empty_exp = rcu_preempt_ctrlblk.exp_tasks == NULL;
np = rcu_next_node_entry(t);
- list_del(&t->rcu_node_entry);
+ list_del_init(&t->rcu_node_entry);
if (&t->rcu_node_entry == rcu_preempt_ctrlblk.gp_tasks)
rcu_preempt_ctrlblk.gp_tasks = np;
if (&t->rcu_node_entry == rcu_preempt_ctrlblk.exp_tasks)
@@ -587,7 +579,6 @@ static void rcu_read_unlock_special(struct task_struct *t)
if (&t->rcu_node_entry == rcu_preempt_ctrlblk.boost_tasks)
rcu_preempt_ctrlblk.boost_tasks = np;
#endif /* #ifdef CONFIG_RCU_BOOST */
- INIT_LIST_HEAD(&t->rcu_node_entry);
/*
* If this was the last task on the current list, and if
@@ -812,13 +803,16 @@ void synchronize_rcu_expedited(void)
rpcp->exp_tasks = rpcp->blkd_tasks.next;
if (rpcp->exp_tasks == &rpcp->blkd_tasks)
rpcp->exp_tasks = NULL;
- local_irq_restore(flags);
/* Wait for tail of ->blkd_tasks list to drain. */
- if (rcu_preempted_readers_exp())
- rcu_initiate_expedited_boost();
+ if (!rcu_preempted_readers_exp())
+ local_irq_restore(flags);
+ else {
+ rcu_initiate_boost();
+ local_irq_restore(flags);
wait_event(sync_rcu_preempt_exp_wq,
!rcu_preempted_readers_exp());
+ }
/* Clean up and exit. */
barrier(); /* ensure expedited GP seen before counter increment. */
@@ -852,7 +846,7 @@ void exit_rcu(void)
if (t->rcu_read_lock_nesting == 0)
return;
t->rcu_read_lock_nesting = 1;
- rcu_read_unlock();
+ __rcu_read_unlock();
}
#else /* #ifdef CONFIG_TINY_PREEMPT_RCU */
@@ -931,24 +925,17 @@ void __init rcu_scheduler_starting(void)
static void rcu_initiate_boost_trace(void)
{
- if (rcu_preempt_ctrlblk.gp_tasks == NULL)
- rcu_preempt_ctrlblk.n_normal_balk_gp_tasks++;
+ if (list_empty(&rcu_preempt_ctrlblk.blkd_tasks))
+ rcu_preempt_ctrlblk.n_balk_blkd_tasks++;
+ else if (rcu_preempt_ctrlblk.gp_tasks == NULL &&
+ rcu_preempt_ctrlblk.exp_tasks == NULL)
+ rcu_preempt_ctrlblk.n_balk_exp_gp_tasks++;
else if (rcu_preempt_ctrlblk.boost_tasks != NULL)
- rcu_preempt_ctrlblk.n_normal_balk_boost_tasks++;
- else if (rcu_preempt_ctrlblk.boosted_this_gp != 0)
- rcu_preempt_ctrlblk.n_normal_balk_boosted++;
+ rcu_preempt_ctrlblk.n_balk_boost_tasks++;
else if (!ULONG_CMP_GE(jiffies, rcu_preempt_ctrlblk.boost_time))
- rcu_preempt_ctrlblk.n_normal_balk_notyet++;
- else
- rcu_preempt_ctrlblk.n_normal_balk_nos++;
-}
-
-static void rcu_initiate_exp_boost_trace(void)
-{
- if (list_empty(&rcu_preempt_ctrlblk.blkd_tasks))
- rcu_preempt_ctrlblk.n_exp_balk_blkd_tasks++;
+ rcu_preempt_ctrlblk.n_balk_notyet++;
else
- rcu_preempt_ctrlblk.n_exp_balk_nos++;
+ rcu_preempt_ctrlblk.n_balk_nos++;
}
#endif /* #ifdef CONFIG_RCU_BOOST */
diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c
index 89613f97ff26..2e138db03382 100644
--- a/kernel/rcutorture.c
+++ b/kernel/rcutorture.c
@@ -47,7 +47,6 @@
#include <linux/srcu.h>
#include <linux/slab.h>
#include <asm/byteorder.h>
-#include <linux/sched.h>
MODULE_LICENSE("GPL");
MODULE_AUTHOR("Paul E. McKenney <paulmck@us.ibm.com> and "
@@ -132,7 +131,7 @@ struct rcu_torture {
static LIST_HEAD(rcu_torture_freelist);
static struct rcu_torture __rcu *rcu_torture_current;
-static long rcu_torture_current_version;
+static unsigned long rcu_torture_current_version;
static struct rcu_torture rcu_tortures[10 * RCU_TORTURE_PIPE_LEN];
static DEFINE_SPINLOCK(rcu_torture_lock);
static DEFINE_PER_CPU(long [RCU_TORTURE_PIPE_LEN + 1], rcu_torture_count) =
@@ -147,8 +146,6 @@ static atomic_t n_rcu_torture_mberror;
static atomic_t n_rcu_torture_error;
static long n_rcu_torture_boost_ktrerror;
static long n_rcu_torture_boost_rterror;
-static long n_rcu_torture_boost_allocerror;
-static long n_rcu_torture_boost_afferror;
static long n_rcu_torture_boost_failure;
static long n_rcu_torture_boosts;
static long n_rcu_torture_timers;
@@ -164,11 +161,11 @@ static int stutter_pause_test;
#endif
int rcutorture_runnable = RCUTORTURE_RUNNABLE_INIT;
-#ifdef CONFIG_RCU_BOOST
+#if defined(CONFIG_RCU_BOOST) && !defined(CONFIG_HOTPLUG_CPU)
#define rcu_can_boost() 1
-#else /* #ifdef CONFIG_RCU_BOOST */
+#else /* #if defined(CONFIG_RCU_BOOST) && !defined(CONFIG_HOTPLUG_CPU) */
#define rcu_can_boost() 0
-#endif /* #else #ifdef CONFIG_RCU_BOOST */
+#endif /* #else #if defined(CONFIG_RCU_BOOST) && !defined(CONFIG_HOTPLUG_CPU) */
static unsigned long boost_starttime; /* jiffies of next boost test start. */
DEFINE_MUTEX(boost_mutex); /* protect setting boost_starttime */
@@ -752,6 +749,7 @@ static int rcu_torture_boost(void *arg)
n_rcu_torture_boost_rterror++;
}
+ init_rcu_head_on_stack(&rbi.rcu);
/* Each pass through the following loop does one boost-test cycle. */
do {
/* Wait for the next test interval. */
@@ -811,6 +809,7 @@ checkwait: rcu_stutter_wait("rcu_torture_boost");
/* Clean up and exit. */
VERBOSE_PRINTK_STRING("rcu_torture_boost task stopping");
+ destroy_rcu_head_on_stack(&rbi.rcu);
rcutorture_shutdown_absorb("rcu_torture_boost");
while (!kthread_should_stop() || rbi.inflight)
schedule_timeout_uninterruptible(1);
@@ -887,7 +886,7 @@ rcu_torture_writer(void *arg)
old_rp->rtort_pipe_count++;
cur_ops->deferred_free(old_rp);
}
- rcu_torture_current_version++;
+ rcutorture_record_progress(++rcu_torture_current_version);
oldbatch = cur_ops->completed();
rcu_stutter_wait("rcu_torture_writer");
} while (!kthread_should_stop() && fullstop == FULLSTOP_DONTSTOP);
@@ -1067,8 +1066,8 @@ rcu_torture_printk(char *page)
}
cnt += sprintf(&page[cnt], "%s%s ", torture_type, TORTURE_FLAG);
cnt += sprintf(&page[cnt],
- "rtc: %p ver: %ld tfle: %d rta: %d rtaf: %d rtf: %d "
- "rtmbe: %d rtbke: %ld rtbre: %ld rtbae: %ld rtbafe: %ld "
+ "rtc: %p ver: %lu tfle: %d rta: %d rtaf: %d rtf: %d "
+ "rtmbe: %d rtbke: %ld rtbre: %ld "
"rtbf: %ld rtb: %ld nt: %ld",
rcu_torture_current,
rcu_torture_current_version,
@@ -1079,16 +1078,12 @@ rcu_torture_printk(char *page)
atomic_read(&n_rcu_torture_mberror),
n_rcu_torture_boost_ktrerror,
n_rcu_torture_boost_rterror,
- n_rcu_torture_boost_allocerror,
- n_rcu_torture_boost_afferror,
n_rcu_torture_boost_failure,
n_rcu_torture_boosts,
n_rcu_torture_timers);
if (atomic_read(&n_rcu_torture_mberror) != 0 ||
n_rcu_torture_boost_ktrerror != 0 ||
n_rcu_torture_boost_rterror != 0 ||
- n_rcu_torture_boost_allocerror != 0 ||
- n_rcu_torture_boost_afferror != 0 ||
n_rcu_torture_boost_failure != 0)
cnt += sprintf(&page[cnt], " !!!");
cnt += sprintf(&page[cnt], "\n%s%s ", torture_type, TORTURE_FLAG);
@@ -1332,6 +1327,7 @@ rcu_torture_cleanup(void)
int i;
mutex_lock(&fullstop_mutex);
+ rcutorture_record_test_transition();
if (fullstop == FULLSTOP_SHUTDOWN) {
printk(KERN_WARNING /* but going down anyway, so... */
"Concurrent 'rmmod rcutorture' and shutdown illegal!\n");
@@ -1487,8 +1483,6 @@ rcu_torture_init(void)
atomic_set(&n_rcu_torture_error, 0);
n_rcu_torture_boost_ktrerror = 0;
n_rcu_torture_boost_rterror = 0;
- n_rcu_torture_boost_allocerror = 0;
- n_rcu_torture_boost_afferror = 0;
n_rcu_torture_boost_failure = 0;
n_rcu_torture_boosts = 0;
for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++)
@@ -1625,6 +1619,7 @@ rcu_torture_init(void)
}
}
register_reboot_notifier(&rcutorture_shutdown_nb);
+ rcutorture_record_test_transition();
mutex_unlock(&fullstop_mutex);
return 0;
diff --git a/kernel/rcutree.c b/kernel/rcutree.c
index dd4aea806f8e..77a7671dd147 100644
--- a/kernel/rcutree.c
+++ b/kernel/rcutree.c
@@ -36,7 +36,7 @@
#include <linux/interrupt.h>
#include <linux/sched.h>
#include <linux/nmi.h>
-#include <asm/atomic.h>
+#include <linux/atomic.h>
#include <linux/bitops.h>
#include <linux/module.h>
#include <linux/completion.h>
@@ -47,6 +47,9 @@
#include <linux/mutex.h>
#include <linux/time.h>
#include <linux/kernel_stat.h>
+#include <linux/wait.h>
+#include <linux/kthread.h>
+#include <linux/prefetch.h>
#include "rcutree.h"
@@ -79,10 +82,40 @@ DEFINE_PER_CPU(struct rcu_data, rcu_sched_data);
struct rcu_state rcu_bh_state = RCU_STATE_INITIALIZER(rcu_bh_state);
DEFINE_PER_CPU(struct rcu_data, rcu_bh_data);
+static struct rcu_state *rcu_state;
+
int rcu_scheduler_active __read_mostly;
EXPORT_SYMBOL_GPL(rcu_scheduler_active);
/*
+ * Control variables for per-CPU and per-rcu_node kthreads. These
+ * handle all flavors of RCU.
+ */
+static DEFINE_PER_CPU(struct task_struct *, rcu_cpu_kthread_task);
+DEFINE_PER_CPU(unsigned int, rcu_cpu_kthread_status);
+DEFINE_PER_CPU(int, rcu_cpu_kthread_cpu);
+DEFINE_PER_CPU(unsigned int, rcu_cpu_kthread_loops);
+DEFINE_PER_CPU(char, rcu_cpu_has_work);
+static char rcu_kthreads_spawnable;
+
+static void rcu_node_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu);
+static void invoke_rcu_cpu_kthread(void);
+
+#define RCU_KTHREAD_PRIO 1 /* RT priority for per-CPU kthreads. */
+
+/*
+ * Track the rcutorture test sequence number and the update version
+ * number within a given test. The rcutorture_testseq is incremented
+ * on every rcutorture module load and unload, so has an odd value
+ * when a test is running. The rcutorture_vernum is set to zero
+ * when rcutorture starts and is incremented on each rcutorture update.
+ * These variables enable correlating rcutorture output with the
+ * RCU tracing information.
+ */
+unsigned long rcutorture_testseq;
+unsigned long rcutorture_vernum;
+
+/*
* Return true if an RCU grace period is in progress. The ACCESS_ONCE()s
* permit this function to be invoked without holding the root rcu_node
* structure's ->lock, but of course results can be subject to change.
@@ -124,11 +157,12 @@ void rcu_note_context_switch(int cpu)
rcu_sched_qs(cpu);
rcu_preempt_note_context_switch(cpu);
}
+EXPORT_SYMBOL_GPL(rcu_note_context_switch);
#ifdef CONFIG_NO_HZ
DEFINE_PER_CPU(struct rcu_dynticks, rcu_dynticks) = {
.dynticks_nesting = 1,
- .dynticks = 1,
+ .dynticks = ATOMIC_INIT(1),
};
#endif /* #ifdef CONFIG_NO_HZ */
@@ -140,10 +174,8 @@ module_param(blimit, int, 0);
module_param(qhimark, int, 0);
module_param(qlowmark, int, 0);
-#ifdef CONFIG_RCU_CPU_STALL_DETECTOR
-int rcu_cpu_stall_suppress __read_mostly = RCU_CPU_STALL_SUPPRESS_INIT;
+int rcu_cpu_stall_suppress __read_mostly;
module_param(rcu_cpu_stall_suppress, int, 0644);
-#endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */
static void force_quiescent_state(struct rcu_state *rsp, int relaxed);
static int rcu_pending(int cpu);
@@ -176,6 +208,31 @@ void rcu_bh_force_quiescent_state(void)
EXPORT_SYMBOL_GPL(rcu_bh_force_quiescent_state);
/*
+ * Record the number of times rcutorture tests have been initiated and
+ * terminated. This information allows the debugfs tracing stats to be
+ * correlated to the rcutorture messages, even when the rcutorture module
+ * is being repeatedly loaded and unloaded. In other words, we cannot
+ * store this state in rcutorture itself.
+ */
+void rcutorture_record_test_transition(void)
+{
+ rcutorture_testseq++;
+ rcutorture_vernum = 0;
+}
+EXPORT_SYMBOL_GPL(rcutorture_record_test_transition);
+
+/*
+ * Record the number of writer passes through the current rcutorture test.
+ * This is also used to correlate debugfs tracing stats with the rcutorture
+ * messages.
+ */
+void rcutorture_record_progress(unsigned long vernum)
+{
+ rcutorture_vernum++;
+}
+EXPORT_SYMBOL_GPL(rcutorture_record_progress);
+
+/*
* Force a quiescent state for RCU-sched.
*/
void rcu_sched_force_quiescent_state(void)
@@ -234,8 +291,8 @@ static int rcu_implicit_offline_qs(struct rcu_data *rdp)
return 1;
}
- /* If preemptable RCU, no point in sending reschedule IPI. */
- if (rdp->preemptable)
+ /* If preemptible RCU, no point in sending reschedule IPI. */
+ if (rdp->preemptible)
return 0;
/* The CPU is online, so send it a reschedule IPI. */
@@ -264,13 +321,25 @@ void rcu_enter_nohz(void)
unsigned long flags;
struct rcu_dynticks *rdtp;
- smp_mb(); /* CPUs seeing ++ must see prior RCU read-side crit sects */
local_irq_save(flags);
rdtp = &__get_cpu_var(rcu_dynticks);
- rdtp->dynticks++;
- rdtp->dynticks_nesting--;
- WARN_ON_ONCE(rdtp->dynticks & 0x1);
+ if (--rdtp->dynticks_nesting) {
+ local_irq_restore(flags);
+ return;
+ }
+ /* CPUs seeing atomic_inc() must see prior RCU read-side crit sects */
+ smp_mb__before_atomic_inc(); /* See above. */
+ atomic_inc(&rdtp->dynticks);
+ smp_mb__after_atomic_inc(); /* Force ordering with next sojourn. */
+ WARN_ON_ONCE(atomic_read(&rdtp->dynticks) & 0x1);
local_irq_restore(flags);
+
+ /* If the interrupt queued a callback, get out of dyntick mode. */
+ if (in_irq() &&
+ (__get_cpu_var(rcu_sched_data).nxtlist ||
+ __get_cpu_var(rcu_bh_data).nxtlist ||
+ rcu_preempt_needs_cpu(smp_processor_id())))
+ set_need_resched();
}
/*
@@ -286,11 +355,16 @@ void rcu_exit_nohz(void)
local_irq_save(flags);
rdtp = &__get_cpu_var(rcu_dynticks);
- rdtp->dynticks++;
- rdtp->dynticks_nesting++;
- WARN_ON_ONCE(!(rdtp->dynticks & 0x1));
+ if (rdtp->dynticks_nesting++) {
+ local_irq_restore(flags);
+ return;
+ }
+ smp_mb__before_atomic_inc(); /* Force ordering w/previous sojourn. */
+ atomic_inc(&rdtp->dynticks);
+ /* CPUs seeing atomic_inc() must see later RCU read-side crit sects */
+ smp_mb__after_atomic_inc(); /* See above. */
+ WARN_ON_ONCE(!(atomic_read(&rdtp->dynticks) & 0x1));
local_irq_restore(flags);
- smp_mb(); /* CPUs seeing ++ must see later RCU read-side crit sects */
}
/**
@@ -304,11 +378,15 @@ void rcu_nmi_enter(void)
{
struct rcu_dynticks *rdtp = &__get_cpu_var(rcu_dynticks);
- if (rdtp->dynticks & 0x1)
+ if (rdtp->dynticks_nmi_nesting == 0 &&
+ (atomic_read(&rdtp->dynticks) & 0x1))
return;
- rdtp->dynticks_nmi++;
- WARN_ON_ONCE(!(rdtp->dynticks_nmi & 0x1));
- smp_mb(); /* CPUs seeing ++ must see later RCU read-side crit sects */
+ rdtp->dynticks_nmi_nesting++;
+ smp_mb__before_atomic_inc(); /* Force delay from prior write. */
+ atomic_inc(&rdtp->dynticks);
+ /* CPUs seeing atomic_inc() must see later RCU read-side crit sects */
+ smp_mb__after_atomic_inc(); /* See above. */
+ WARN_ON_ONCE(!(atomic_read(&rdtp->dynticks) & 0x1));
}
/**
@@ -322,11 +400,14 @@ void rcu_nmi_exit(void)
{
struct rcu_dynticks *rdtp = &__get_cpu_var(rcu_dynticks);
- if (rdtp->dynticks & 0x1)
+ if (rdtp->dynticks_nmi_nesting == 0 ||
+ --rdtp->dynticks_nmi_nesting != 0)
return;
- smp_mb(); /* CPUs seeing ++ must see prior RCU read-side crit sects */
- rdtp->dynticks_nmi++;
- WARN_ON_ONCE(rdtp->dynticks_nmi & 0x1);
+ /* CPUs seeing atomic_inc() must see prior RCU read-side crit sects */
+ smp_mb__before_atomic_inc(); /* See above. */
+ atomic_inc(&rdtp->dynticks);
+ smp_mb__after_atomic_inc(); /* Force delay to next write. */
+ WARN_ON_ONCE(atomic_read(&rdtp->dynticks) & 0x1);
}
/**
@@ -337,13 +418,7 @@ void rcu_nmi_exit(void)
*/
void rcu_irq_enter(void)
{
- struct rcu_dynticks *rdtp = &__get_cpu_var(rcu_dynticks);
-
- if (rdtp->dynticks_nesting++)
- return;
- rdtp->dynticks++;
- WARN_ON_ONCE(!(rdtp->dynticks & 0x1));
- smp_mb(); /* CPUs seeing ++ must see later RCU read-side crit sects */
+ rcu_exit_nohz();
}
/**
@@ -355,18 +430,7 @@ void rcu_irq_enter(void)
*/
void rcu_irq_exit(void)
{
- struct rcu_dynticks *rdtp = &__get_cpu_var(rcu_dynticks);
-
- if (--rdtp->dynticks_nesting)
- return;
- smp_mb(); /* CPUs seeing ++ must see prior RCU read-side crit sects */
- rdtp->dynticks++;
- WARN_ON_ONCE(rdtp->dynticks & 0x1);
-
- /* If the interrupt queued a callback, get out of dyntick mode. */
- if (__this_cpu_read(rcu_sched_data.nxtlist) ||
- __this_cpu_read(rcu_bh_data.nxtlist))
- set_need_resched();
+ rcu_enter_nohz();
}
#ifdef CONFIG_SMP
@@ -378,19 +442,8 @@ void rcu_irq_exit(void)
*/
static int dyntick_save_progress_counter(struct rcu_data *rdp)
{
- int ret;
- int snap;
- int snap_nmi;
-
- snap = rdp->dynticks->dynticks;
- snap_nmi = rdp->dynticks->dynticks_nmi;
- smp_mb(); /* Order sampling of snap with end of grace period. */
- rdp->dynticks_snap = snap;
- rdp->dynticks_nmi_snap = snap_nmi;
- ret = ((snap & 0x1) == 0) && ((snap_nmi & 0x1) == 0);
- if (ret)
- rdp->dynticks_fqs++;
- return ret;
+ rdp->dynticks_snap = atomic_add_return(0, &rdp->dynticks->dynticks);
+ return 0;
}
/*
@@ -401,16 +454,11 @@ static int dyntick_save_progress_counter(struct rcu_data *rdp)
*/
static int rcu_implicit_dynticks_qs(struct rcu_data *rdp)
{
- long curr;
- long curr_nmi;
- long snap;
- long snap_nmi;
+ unsigned long curr;
+ unsigned long snap;
- curr = rdp->dynticks->dynticks;
- snap = rdp->dynticks_snap;
- curr_nmi = rdp->dynticks->dynticks_nmi;
- snap_nmi = rdp->dynticks_nmi_snap;
- smp_mb(); /* force ordering with cpu entering/leaving dynticks. */
+ curr = (unsigned long)atomic_add_return(0, &rdp->dynticks->dynticks);
+ snap = (unsigned long)rdp->dynticks_snap;
/*
* If the CPU passed through or entered a dynticks idle phase with
@@ -420,8 +468,7 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp)
* read-side critical section that started before the beginning
* of the current RCU grace period.
*/
- if ((curr != snap || (curr & 0x1) == 0) &&
- (curr_nmi != snap_nmi || (curr_nmi & 0x1) == 0)) {
+ if ((curr & 0x1) == 0 || ULONG_CMP_GE(curr, snap + 2)) {
rdp->dynticks_fqs++;
return 1;
}
@@ -450,8 +497,6 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp)
#endif /* #else #ifdef CONFIG_NO_HZ */
-#ifdef CONFIG_RCU_CPU_STALL_DETECTOR
-
int rcu_cpu_stall_suppress __read_mostly;
static void record_gp_stall_check_time(struct rcu_state *rsp)
@@ -537,21 +582,24 @@ static void print_cpu_stall(struct rcu_state *rsp)
static void check_cpu_stall(struct rcu_state *rsp, struct rcu_data *rdp)
{
- long delta;
+ unsigned long j;
+ unsigned long js;
struct rcu_node *rnp;
if (rcu_cpu_stall_suppress)
return;
- delta = jiffies - ACCESS_ONCE(rsp->jiffies_stall);
+ j = ACCESS_ONCE(jiffies);
+ js = ACCESS_ONCE(rsp->jiffies_stall);
rnp = rdp->mynode;
- if ((ACCESS_ONCE(rnp->qsmask) & rdp->grpmask) && delta >= 0) {
+ if ((ACCESS_ONCE(rnp->qsmask) & rdp->grpmask) && ULONG_CMP_GE(j, js)) {
/* We haven't checked in, so go dump stack. */
print_cpu_stall(rsp);
- } else if (rcu_gp_in_progress(rsp) && delta >= RCU_STALL_RAT_DELAY) {
+ } else if (rcu_gp_in_progress(rsp) &&
+ ULONG_CMP_GE(j, js + RCU_STALL_RAT_DELAY)) {
- /* They had two time units to dump stack, so complain. */
+ /* They had a few time units to dump stack, so complain. */
print_other_cpu_stall(rsp);
}
}
@@ -587,26 +635,6 @@ static void __init check_cpu_stall_init(void)
atomic_notifier_chain_register(&panic_notifier_list, &rcu_panic_block);
}
-#else /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */
-
-static void record_gp_stall_check_time(struct rcu_state *rsp)
-{
-}
-
-static void check_cpu_stall(struct rcu_state *rsp, struct rcu_data *rdp)
-{
-}
-
-void rcu_cpu_stall_reset(void)
-{
-}
-
-static void __init check_cpu_stall_init(void)
-{
-}
-
-#endif /* #else #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */
-
/*
* Update CPU-local rcu_data state to record the newly noticed grace period.
* This is used both when we started the grace period and when we notice
@@ -809,6 +837,7 @@ rcu_start_gp(struct rcu_state *rsp, unsigned long flags)
rnp->completed = rsp->completed;
rsp->signaled = RCU_SIGNAL_INIT; /* force_quiescent_state OK. */
rcu_start_gp_per_cpu(rsp, rnp, rdp);
+ rcu_preempt_boost_start_gp(rnp);
raw_spin_unlock_irqrestore(&rnp->lock, flags);
return;
}
@@ -844,6 +873,7 @@ rcu_start_gp(struct rcu_state *rsp, unsigned long flags)
rnp->completed = rsp->completed;
if (rnp == rdp->mynode)
rcu_start_gp_per_cpu(rsp, rnp, rdp);
+ rcu_preempt_boost_start_gp(rnp);
raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */
}
@@ -864,7 +894,18 @@ rcu_start_gp(struct rcu_state *rsp, unsigned long flags)
static void rcu_report_qs_rsp(struct rcu_state *rsp, unsigned long flags)
__releases(rcu_get_root(rsp)->lock)
{
+ unsigned long gp_duration;
+
WARN_ON_ONCE(!rcu_gp_in_progress(rsp));
+
+ /*
+ * Ensure that all grace-period and pre-grace-period activity
+ * is seen before the assignment to rsp->completed.
+ */
+ smp_mb(); /* See above block comment. */
+ gp_duration = jiffies - rsp->gp_start;
+ if (gp_duration > rsp->gp_max)
+ rsp->gp_max = gp_duration;
rsp->completed = rsp->gpnum;
rsp->signaled = RCU_GP_IDLE;
rcu_start_gp(rsp, flags); /* releases root node's rnp->lock. */
@@ -894,7 +935,7 @@ rcu_report_qs_rnp(unsigned long mask, struct rcu_state *rsp,
return;
}
rnp->qsmask &= ~mask;
- if (rnp->qsmask != 0 || rcu_preempted_readers(rnp)) {
+ if (rnp->qsmask != 0 || rcu_preempt_blocked_readers_cgp(rnp)) {
/* Other bits still set at this level, so done. */
raw_spin_unlock_irqrestore(&rnp->lock, flags);
@@ -1037,6 +1078,8 @@ static void rcu_send_cbs_to_online(struct rcu_state *rsp)
/*
* Remove the outgoing CPU from the bitmasks in the rcu_node hierarchy
* and move all callbacks from the outgoing CPU to the current one.
+ * There can only be one CPU hotplug operation at a time, so no other
+ * CPU can be attempting to update rcu_cpu_kthread_task.
*/
static void __rcu_offline_cpu(int cpu, struct rcu_state *rsp)
{
@@ -1045,6 +1088,14 @@ static void __rcu_offline_cpu(int cpu, struct rcu_state *rsp)
int need_report = 0;
struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu);
struct rcu_node *rnp;
+ struct task_struct *t;
+
+ /* Stop the CPU's kthread. */
+ t = per_cpu(rcu_cpu_kthread_task, cpu);
+ if (t != NULL) {
+ per_cpu(rcu_cpu_kthread_task, cpu) = NULL;
+ kthread_stop(t);
+ }
/* Exclude any attempts to start a new grace period. */
raw_spin_lock_irqsave(&rsp->onofflock, flags);
@@ -1082,6 +1133,7 @@ static void __rcu_offline_cpu(int cpu, struct rcu_state *rsp)
raw_spin_unlock_irqrestore(&rnp->lock, flags);
if (need_report & RCU_OFL_TASKS_EXP_GP)
rcu_report_exp_rnp(rsp, rnp);
+ rcu_node_kthread_setaffinity(rnp, -1);
}
/*
@@ -1143,7 +1195,7 @@ static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp)
next = list->next;
prefetch(next);
debug_rcu_head_unqueue(list);
- list->func(list);
+ __rcu_reclaim(list);
list = next;
if (++count >= rdp->blimit)
break;
@@ -1179,7 +1231,7 @@ static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp)
/* Re-raise the RCU softirq if there are callbacks remaining. */
if (cpu_has_callbacks_ready_to_invoke(rdp))
- raise_softirq(RCU_SOFTIRQ);
+ invoke_rcu_cpu_kthread();
}
/*
@@ -1225,7 +1277,7 @@ void rcu_check_callbacks(int cpu, int user)
}
rcu_preempt_check_callbacks(cpu);
if (rcu_pending(cpu))
- raise_softirq(RCU_SOFTIRQ);
+ invoke_rcu_cpu_kthread();
}
#ifdef CONFIG_SMP
@@ -1233,6 +1285,8 @@ void rcu_check_callbacks(int cpu, int user)
/*
* Scan the leaf rcu_node structures, processing dyntick state for any that
* have not yet encountered a quiescent state, using the function specified.
+ * Also initiate boosting for any threads blocked on the root rcu_node.
+ *
* The caller must have suppressed start of new grace periods.
*/
static void force_qs_rnp(struct rcu_state *rsp, int (*f)(struct rcu_data *))
@@ -1251,7 +1305,7 @@ static void force_qs_rnp(struct rcu_state *rsp, int (*f)(struct rcu_data *))
return;
}
if (rnp->qsmask == 0) {
- raw_spin_unlock_irqrestore(&rnp->lock, flags);
+ rcu_initiate_boost(rnp, flags); /* releases rnp->lock */
continue;
}
cpu = rnp->grplo;
@@ -1269,6 +1323,11 @@ static void force_qs_rnp(struct rcu_state *rsp, int (*f)(struct rcu_data *))
}
raw_spin_unlock_irqrestore(&rnp->lock, flags);
}
+ rnp = rcu_get_root(rsp);
+ if (rnp->qsmask == 0) {
+ raw_spin_lock_irqsave(&rnp->lock, flags);
+ rcu_initiate_boost(rnp, flags); /* releases rnp->lock. */
+ }
}
/*
@@ -1389,31 +1448,347 @@ __rcu_process_callbacks(struct rcu_state *rsp, struct rcu_data *rdp)
/*
* Do softirq processing for the current CPU.
*/
-static void rcu_process_callbacks(struct softirq_action *unused)
+static void rcu_process_callbacks(void)
{
- /*
- * Memory references from any prior RCU read-side critical sections
- * executed by the interrupted code must be seen before any RCU
- * grace-period manipulations below.
- */
- smp_mb(); /* See above block comment. */
-
__rcu_process_callbacks(&rcu_sched_state,
&__get_cpu_var(rcu_sched_data));
__rcu_process_callbacks(&rcu_bh_state, &__get_cpu_var(rcu_bh_data));
rcu_preempt_process_callbacks();
- /*
- * Memory references from any later RCU read-side critical sections
- * executed by the interrupted code must be seen after any RCU
- * grace-period manipulations above.
- */
- smp_mb(); /* See above block comment. */
-
/* If we are last CPU on way to dyntick-idle mode, accelerate it. */
rcu_needs_cpu_flush();
}
+/*
+ * Wake up the current CPU's kthread. This replaces raise_softirq()
+ * in earlier versions of RCU. Note that because we are running on
+ * the current CPU with interrupts disabled, the rcu_cpu_kthread_task
+ * cannot disappear out from under us.
+ */
+static void invoke_rcu_cpu_kthread(void)
+{
+ unsigned long flags;
+
+ local_irq_save(flags);
+ __this_cpu_write(rcu_cpu_has_work, 1);
+ if (__this_cpu_read(rcu_cpu_kthread_task) == NULL) {
+ local_irq_restore(flags);
+ return;
+ }
+ wake_up_process(__this_cpu_read(rcu_cpu_kthread_task));
+ local_irq_restore(flags);
+}
+
+/*
+ * Wake up the specified per-rcu_node-structure kthread.
+ * Because the per-rcu_node kthreads are immortal, we don't need
+ * to do anything to keep them alive.
+ */
+static void invoke_rcu_node_kthread(struct rcu_node *rnp)
+{
+ struct task_struct *t;
+
+ t = rnp->node_kthread_task;
+ if (t != NULL)
+ wake_up_process(t);
+}
+
+/*
+ * Set the specified CPU's kthread to run RT or not, as specified by
+ * the to_rt argument. The CPU-hotplug locks are held, so the task
+ * is not going away.
+ */
+static void rcu_cpu_kthread_setrt(int cpu, int to_rt)
+{
+ int policy;
+ struct sched_param sp;
+ struct task_struct *t;
+
+ t = per_cpu(rcu_cpu_kthread_task, cpu);
+ if (t == NULL)
+ return;
+ if (to_rt) {
+ policy = SCHED_FIFO;
+ sp.sched_priority = RCU_KTHREAD_PRIO;
+ } else {
+ policy = SCHED_NORMAL;
+ sp.sched_priority = 0;
+ }
+ sched_setscheduler_nocheck(t, policy, &sp);
+}
+
+/*
+ * Timer handler to initiate the waking up of per-CPU kthreads that
+ * have yielded the CPU due to excess numbers of RCU callbacks.
+ * We wake up the per-rcu_node kthread, which in turn will wake up
+ * the booster kthread.
+ */
+static void rcu_cpu_kthread_timer(unsigned long arg)
+{
+ struct rcu_data *rdp = per_cpu_ptr(rcu_state->rda, arg);
+ struct rcu_node *rnp = rdp->mynode;
+
+ atomic_or(rdp->grpmask, &rnp->wakemask);
+ invoke_rcu_node_kthread(rnp);
+}
+
+/*
+ * Drop to non-real-time priority and yield, but only after posting a
+ * timer that will cause us to regain our real-time priority if we
+ * remain preempted. Either way, we restore our real-time priority
+ * before returning.
+ */
+static void rcu_yield(void (*f)(unsigned long), unsigned long arg)
+{
+ struct sched_param sp;
+ struct timer_list yield_timer;
+
+ setup_timer_on_stack(&yield_timer, f, arg);
+ mod_timer(&yield_timer, jiffies + 2);
+ sp.sched_priority = 0;
+ sched_setscheduler_nocheck(current, SCHED_NORMAL, &sp);
+ set_user_nice(current, 19);
+ schedule();
+ sp.sched_priority = RCU_KTHREAD_PRIO;
+ sched_setscheduler_nocheck(current, SCHED_FIFO, &sp);
+ del_timer(&yield_timer);
+}
+
+/*
+ * Handle cases where the rcu_cpu_kthread() ends up on the wrong CPU.
+ * This can happen while the corresponding CPU is either coming online
+ * or going offline. We cannot wait until the CPU is fully online
+ * before starting the kthread, because the various notifier functions
+ * can wait for RCU grace periods. So we park rcu_cpu_kthread() until
+ * the corresponding CPU is online.
+ *
+ * Return 1 if the kthread needs to stop, 0 otherwise.
+ *
+ * Caller must disable bh. This function can momentarily enable it.
+ */
+static int rcu_cpu_kthread_should_stop(int cpu)
+{
+ while (cpu_is_offline(cpu) ||
+ !cpumask_equal(&current->cpus_allowed, cpumask_of(cpu)) ||
+ smp_processor_id() != cpu) {
+ if (kthread_should_stop())
+ return 1;
+ per_cpu(rcu_cpu_kthread_status, cpu) = RCU_KTHREAD_OFFCPU;
+ per_cpu(rcu_cpu_kthread_cpu, cpu) = raw_smp_processor_id();
+ local_bh_enable();
+ schedule_timeout_uninterruptible(1);
+ if (!cpumask_equal(&current->cpus_allowed, cpumask_of(cpu)))
+ set_cpus_allowed_ptr(current, cpumask_of(cpu));
+ local_bh_disable();
+ }
+ per_cpu(rcu_cpu_kthread_cpu, cpu) = cpu;
+ return 0;
+}
+
+/*
+ * Per-CPU kernel thread that invokes RCU callbacks. This replaces the
+ * earlier RCU softirq.
+ */
+static int rcu_cpu_kthread(void *arg)
+{
+ int cpu = (int)(long)arg;
+ unsigned long flags;
+ int spincnt = 0;
+ unsigned int *statusp = &per_cpu(rcu_cpu_kthread_status, cpu);
+ char work;
+ char *workp = &per_cpu(rcu_cpu_has_work, cpu);
+
+ for (;;) {
+ *statusp = RCU_KTHREAD_WAITING;
+ rcu_wait(*workp != 0 || kthread_should_stop());
+ local_bh_disable();
+ if (rcu_cpu_kthread_should_stop(cpu)) {
+ local_bh_enable();
+ break;
+ }
+ *statusp = RCU_KTHREAD_RUNNING;
+ per_cpu(rcu_cpu_kthread_loops, cpu)++;
+ local_irq_save(flags);
+ work = *workp;
+ *workp = 0;
+ local_irq_restore(flags);
+ if (work)
+ rcu_process_callbacks();
+ local_bh_enable();
+ if (*workp != 0)
+ spincnt++;
+ else
+ spincnt = 0;
+ if (spincnt > 10) {
+ *statusp = RCU_KTHREAD_YIELDING;
+ rcu_yield(rcu_cpu_kthread_timer, (unsigned long)cpu);
+ spincnt = 0;
+ }
+ }
+ *statusp = RCU_KTHREAD_STOPPED;
+ return 0;
+}
+
+/*
+ * Spawn a per-CPU kthread, setting up affinity and priority.
+ * Because the CPU hotplug lock is held, no other CPU will be attempting
+ * to manipulate rcu_cpu_kthread_task. There might be another CPU
+ * attempting to access it during boot, but the locking in kthread_bind()
+ * will enforce sufficient ordering.
+ */
+static int __cpuinit rcu_spawn_one_cpu_kthread(int cpu)
+{
+ struct sched_param sp;
+ struct task_struct *t;
+
+ if (!rcu_kthreads_spawnable ||
+ per_cpu(rcu_cpu_kthread_task, cpu) != NULL)
+ return 0;
+ t = kthread_create(rcu_cpu_kthread, (void *)(long)cpu, "rcuc%d", cpu);
+ if (IS_ERR(t))
+ return PTR_ERR(t);
+ kthread_bind(t, cpu);
+ set_task_state(t, TASK_INTERRUPTIBLE);
+ per_cpu(rcu_cpu_kthread_cpu, cpu) = cpu;
+ WARN_ON_ONCE(per_cpu(rcu_cpu_kthread_task, cpu) != NULL);
+ per_cpu(rcu_cpu_kthread_task, cpu) = t;
+ sp.sched_priority = RCU_KTHREAD_PRIO;
+ sched_setscheduler_nocheck(t, SCHED_FIFO, &sp);
+ return 0;
+}
+
+/*
+ * Per-rcu_node kthread, which is in charge of waking up the per-CPU
+ * kthreads when needed. We ignore requests to wake up kthreads
+ * for offline CPUs, which is OK because force_quiescent_state()
+ * takes care of this case.
+ */
+static int rcu_node_kthread(void *arg)
+{
+ int cpu;
+ unsigned long flags;
+ unsigned long mask;
+ struct rcu_node *rnp = (struct rcu_node *)arg;
+ struct sched_param sp;
+ struct task_struct *t;
+
+ for (;;) {
+ rnp->node_kthread_status = RCU_KTHREAD_WAITING;
+ rcu_wait(atomic_read(&rnp->wakemask) != 0);
+ rnp->node_kthread_status = RCU_KTHREAD_RUNNING;
+ raw_spin_lock_irqsave(&rnp->lock, flags);
+ mask = atomic_xchg(&rnp->wakemask, 0);
+ rcu_initiate_boost(rnp, flags); /* releases rnp->lock. */
+ for (cpu = rnp->grplo; cpu <= rnp->grphi; cpu++, mask >>= 1) {
+ if ((mask & 0x1) == 0)
+ continue;
+ preempt_disable();
+ t = per_cpu(rcu_cpu_kthread_task, cpu);
+ if (!cpu_online(cpu) || t == NULL) {
+ preempt_enable();
+ continue;
+ }
+ per_cpu(rcu_cpu_has_work, cpu) = 1;
+ sp.sched_priority = RCU_KTHREAD_PRIO;
+ sched_setscheduler_nocheck(t, SCHED_FIFO, &sp);
+ preempt_enable();
+ }
+ }
+ /* NOTREACHED */
+ rnp->node_kthread_status = RCU_KTHREAD_STOPPED;
+ return 0;
+}
+
+/*
+ * Set the per-rcu_node kthread's affinity to cover all CPUs that are
+ * served by the rcu_node in question. The CPU hotplug lock is still
+ * held, so the value of rnp->qsmaskinit will be stable.
+ *
+ * We don't include outgoingcpu in the affinity set, use -1 if there is
+ * no outgoing CPU. If there are no CPUs left in the affinity set,
+ * this function allows the kthread to execute on any CPU.
+ */
+static void rcu_node_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu)
+{
+ cpumask_var_t cm;
+ int cpu;
+ unsigned long mask = rnp->qsmaskinit;
+
+ if (rnp->node_kthread_task == NULL)
+ return;
+ if (!alloc_cpumask_var(&cm, GFP_KERNEL))
+ return;
+ cpumask_clear(cm);
+ for (cpu = rnp->grplo; cpu <= rnp->grphi; cpu++, mask >>= 1)
+ if ((mask & 0x1) && cpu != outgoingcpu)
+ cpumask_set_cpu(cpu, cm);
+ if (cpumask_weight(cm) == 0) {
+ cpumask_setall(cm);
+ for (cpu = rnp->grplo; cpu <= rnp->grphi; cpu++)
+ cpumask_clear_cpu(cpu, cm);
+ WARN_ON_ONCE(cpumask_weight(cm) == 0);
+ }
+ set_cpus_allowed_ptr(rnp->node_kthread_task, cm);
+ rcu_boost_kthread_setaffinity(rnp, cm);
+ free_cpumask_var(cm);
+}
+
+/*
+ * Spawn a per-rcu_node kthread, setting priority and affinity.
+ * Called during boot before online/offline can happen, or, if
+ * during runtime, with the main CPU-hotplug locks held. So only
+ * one of these can be executing at a time.
+ */
+static int __cpuinit rcu_spawn_one_node_kthread(struct rcu_state *rsp,
+ struct rcu_node *rnp)
+{
+ unsigned long flags;
+ int rnp_index = rnp - &rsp->node[0];
+ struct sched_param sp;
+ struct task_struct *t;
+
+ if (!rcu_kthreads_spawnable ||
+ rnp->qsmaskinit == 0)
+ return 0;
+ if (rnp->node_kthread_task == NULL) {
+ t = kthread_create(rcu_node_kthread, (void *)rnp,
+ "rcun%d", rnp_index);
+ if (IS_ERR(t))
+ return PTR_ERR(t);
+ raw_spin_lock_irqsave(&rnp->lock, flags);
+ set_task_state(t, TASK_INTERRUPTIBLE);
+ rnp->node_kthread_task = t;
+ raw_spin_unlock_irqrestore(&rnp->lock, flags);
+ sp.sched_priority = 99;
+ sched_setscheduler_nocheck(t, SCHED_FIFO, &sp);
+ }
+ return rcu_spawn_one_boost_kthread(rsp, rnp, rnp_index);
+}
+
+/*
+ * Spawn all kthreads -- called as soon as the scheduler is running.
+ */
+static int __init rcu_spawn_kthreads(void)
+{
+ int cpu;
+ struct rcu_node *rnp;
+
+ rcu_kthreads_spawnable = 1;
+ for_each_possible_cpu(cpu) {
+ per_cpu(rcu_cpu_has_work, cpu) = 0;
+ if (cpu_online(cpu))
+ (void)rcu_spawn_one_cpu_kthread(cpu);
+ }
+ rnp = rcu_get_root(rcu_state);
+ (void)rcu_spawn_one_node_kthread(rcu_state, rnp);
+ if (NUM_RCU_NODES > 1) {
+ rcu_for_each_leaf_node(rcu_state, rnp)
+ (void)rcu_spawn_one_node_kthread(rcu_state, rnp);
+ }
+ return 0;
+}
+early_initcall(rcu_spawn_kthreads);
+
static void
__call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu),
struct rcu_state *rsp)
@@ -1439,6 +1814,13 @@ __call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu),
/* Add the callback to our list. */
*rdp->nxttail[RCU_NEXT_TAIL] = head;
rdp->nxttail[RCU_NEXT_TAIL] = &head->next;
+ rdp->qlen++;
+
+ /* If interrupts were disabled, don't dive into RCU core. */
+ if (irqs_disabled_flags(flags)) {
+ local_irq_restore(flags);
+ return;
+ }
/*
* Force the grace period if too many callbacks or too long waiting.
@@ -1447,7 +1829,7 @@ __call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu),
* invoking force_quiescent_state() if the newly enqueued callback
* is the only one waiting for a grace period to complete.
*/
- if (unlikely(++rdp->qlen > rdp->qlen_last_fqs_check + qhimark)) {
+ if (unlikely(rdp->qlen > rdp->qlen_last_fqs_check + qhimark)) {
/* Are we ignoring a completed grace period? */
rcu_process_gp_end(rsp, rdp);
@@ -1583,7 +1965,7 @@ static int __rcu_pending(struct rcu_state *rsp, struct rcu_data *rdp)
* or RCU-bh, force a local reschedule.
*/
rdp->n_rp_qs_pending++;
- if (!rdp->preemptable &&
+ if (!rdp->preemptible &&
ULONG_CMP_LT(ACCESS_ONCE(rsp->jiffies_force_qs) - 1,
jiffies))
set_need_resched();
@@ -1760,7 +2142,7 @@ rcu_boot_init_percpu_data(int cpu, struct rcu_state *rsp)
* that this CPU cannot possibly have any RCU callbacks in flight yet.
*/
static void __cpuinit
-rcu_init_percpu_data(int cpu, struct rcu_state *rsp, int preemptable)
+rcu_init_percpu_data(int cpu, struct rcu_state *rsp, int preemptible)
{
unsigned long flags;
unsigned long mask;
@@ -1772,7 +2154,7 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp, int preemptable)
rdp->passed_quiesc = 0; /* We could be racing with new GP, */
rdp->qs_pending = 1; /* so set up to respond to current GP. */
rdp->beenonline = 1; /* We have now been online. */
- rdp->preemptable = preemptable;
+ rdp->preemptible = preemptible;
rdp->qlen_last_fqs_check = 0;
rdp->n_force_qs_snap = rsp->n_force_qs;
rdp->blimit = blimit;
@@ -1813,6 +2195,19 @@ static void __cpuinit rcu_online_cpu(int cpu)
rcu_preempt_init_percpu_data(cpu);
}
+static void __cpuinit rcu_online_kthreads(int cpu)
+{
+ struct rcu_data *rdp = per_cpu_ptr(rcu_state->rda, cpu);
+ struct rcu_node *rnp = rdp->mynode;
+
+ /* Fire up the incoming CPU's kthread and leaf rcu_node kthread. */
+ if (rcu_kthreads_spawnable) {
+ (void)rcu_spawn_one_cpu_kthread(cpu);
+ if (rnp->node_kthread_task == NULL)
+ (void)rcu_spawn_one_node_kthread(rcu_state, rnp);
+ }
+}
+
/*
* Handle CPU online/offline notification events.
*/
@@ -1820,11 +2215,23 @@ static int __cpuinit rcu_cpu_notify(struct notifier_block *self,
unsigned long action, void *hcpu)
{
long cpu = (long)hcpu;
+ struct rcu_data *rdp = per_cpu_ptr(rcu_state->rda, cpu);
+ struct rcu_node *rnp = rdp->mynode;
switch (action) {
case CPU_UP_PREPARE:
case CPU_UP_PREPARE_FROZEN:
rcu_online_cpu(cpu);
+ rcu_online_kthreads(cpu);
+ break;
+ case CPU_ONLINE:
+ case CPU_DOWN_FAILED:
+ rcu_node_kthread_setaffinity(rnp, -1);
+ rcu_cpu_kthread_setrt(cpu, 1);
+ break;
+ case CPU_DOWN_PREPARE:
+ rcu_node_kthread_setaffinity(rnp, cpu);
+ rcu_cpu_kthread_setrt(cpu, 0);
break;
case CPU_DYING:
case CPU_DYING_FROZEN:
@@ -1943,10 +2350,7 @@ static void __init rcu_init_one(struct rcu_state *rsp,
j / rsp->levelspread[i - 1];
}
rnp->level = i;
- INIT_LIST_HEAD(&rnp->blocked_tasks[0]);
- INIT_LIST_HEAD(&rnp->blocked_tasks[1]);
- INIT_LIST_HEAD(&rnp->blocked_tasks[2]);
- INIT_LIST_HEAD(&rnp->blocked_tasks[3]);
+ INIT_LIST_HEAD(&rnp->blkd_tasks);
}
}
@@ -1968,7 +2372,6 @@ void __init rcu_init(void)
rcu_init_one(&rcu_sched_state, &rcu_sched_data);
rcu_init_one(&rcu_bh_state, &rcu_bh_data);
__rcu_init_preempt();
- open_softirq(RCU_SOFTIRQ, rcu_process_callbacks);
/*
* We don't need protection against CPU-hotplug here because
diff --git a/kernel/rcutree.h b/kernel/rcutree.h
index e8f057e44e3e..7b9a08b4aaea 100644
--- a/kernel/rcutree.h
+++ b/kernel/rcutree.h
@@ -84,13 +84,19 @@
* Dynticks per-CPU state.
*/
struct rcu_dynticks {
- int dynticks_nesting; /* Track nesting level, sort of. */
- int dynticks; /* Even value for dynticks-idle, else odd. */
- int dynticks_nmi; /* Even value for either dynticks-idle or */
- /* not in nmi handler, else odd. So this */
- /* remains even for nmi from irq handler. */
+ int dynticks_nesting; /* Track irq/process nesting level. */
+ int dynticks_nmi_nesting; /* Track NMI nesting level. */
+ atomic_t dynticks; /* Even value for dynticks-idle, else odd. */
};
+/* RCU's kthread states for tracing. */
+#define RCU_KTHREAD_STOPPED 0
+#define RCU_KTHREAD_RUNNING 1
+#define RCU_KTHREAD_WAITING 2
+#define RCU_KTHREAD_OFFCPU 3
+#define RCU_KTHREAD_YIELDING 4
+#define RCU_KTHREAD_MAX 4
+
/*
* Definition for node within the RCU grace-period-detection hierarchy.
*/
@@ -109,10 +115,13 @@ struct rcu_node {
/* an rcu_data structure, otherwise, each */
/* bit corresponds to a child rcu_node */
/* structure. */
- unsigned long expmask; /* Groups that have ->blocked_tasks[] */
+ unsigned long expmask; /* Groups that have ->blkd_tasks */
/* elements that need to drain to allow the */
/* current expedited grace period to */
/* complete (only for TREE_PREEMPT_RCU). */
+ atomic_t wakemask; /* CPUs whose kthread needs to be awakened. */
+ /* Since this has meaning only for leaf */
+ /* rcu_node structures, 32 bits suffices. */
unsigned long qsmaskinit;
/* Per-GP initial value for qsmask & expmask. */
unsigned long grpmask; /* Mask to apply to parent qsmask. */
@@ -122,11 +131,62 @@ struct rcu_node {
u8 grpnum; /* CPU/group number for next level up. */
u8 level; /* root is at level 0. */
struct rcu_node *parent;
- struct list_head blocked_tasks[4];
- /* Tasks blocked in RCU read-side critsect. */
- /* Grace period number (->gpnum) x blocked */
- /* by tasks on the (x & 0x1) element of the */
- /* blocked_tasks[] array. */
+ struct list_head blkd_tasks;
+ /* Tasks blocked in RCU read-side critical */
+ /* section. Tasks are placed at the head */
+ /* of this list and age towards the tail. */
+ struct list_head *gp_tasks;
+ /* Pointer to the first task blocking the */
+ /* current grace period, or NULL if there */
+ /* is no such task. */
+ struct list_head *exp_tasks;
+ /* Pointer to the first task blocking the */
+ /* current expedited grace period, or NULL */
+ /* if there is no such task. If there */
+ /* is no current expedited grace period, */
+ /* then there can cannot be any such task. */
+#ifdef CONFIG_RCU_BOOST
+ struct list_head *boost_tasks;
+ /* Pointer to first task that needs to be */
+ /* priority boosted, or NULL if no priority */
+ /* boosting is needed for this rcu_node */
+ /* structure. If there are no tasks */
+ /* queued on this rcu_node structure that */
+ /* are blocking the current grace period, */
+ /* there can be no such task. */
+ unsigned long boost_time;
+ /* When to start boosting (jiffies). */
+ struct task_struct *boost_kthread_task;
+ /* kthread that takes care of priority */
+ /* boosting for this rcu_node structure. */
+ unsigned int boost_kthread_status;
+ /* State of boost_kthread_task for tracing. */
+ unsigned long n_tasks_boosted;
+ /* Total number of tasks boosted. */
+ unsigned long n_exp_boosts;
+ /* Number of tasks boosted for expedited GP. */
+ unsigned long n_normal_boosts;
+ /* Number of tasks boosted for normal GP. */
+ unsigned long n_balk_blkd_tasks;
+ /* Refused to boost: no blocked tasks. */
+ unsigned long n_balk_exp_gp_tasks;
+ /* Refused to boost: nothing blocking GP. */
+ unsigned long n_balk_boost_tasks;
+ /* Refused to boost: already boosting. */
+ unsigned long n_balk_notblocked;
+ /* Refused to boost: RCU RS CS still running. */
+ unsigned long n_balk_notyet;
+ /* Refused to boost: not yet time. */
+ unsigned long n_balk_nos;
+ /* Refused to boost: not sure why, though. */
+ /* This can happen due to race conditions. */
+#endif /* #ifdef CONFIG_RCU_BOOST */
+ struct task_struct *node_kthread_task;
+ /* kthread that takes care of this rcu_node */
+ /* structure, for example, awakening the */
+ /* per-CPU kthreads as needed. */
+ unsigned int node_kthread_status;
+ /* State of node_kthread_task for tracing. */
} ____cacheline_internodealigned_in_smp;
/*
@@ -175,7 +235,7 @@ struct rcu_data {
bool passed_quiesc; /* User-mode/idle loop etc. */
bool qs_pending; /* Core waits for quiesc state. */
bool beenonline; /* CPU online at least once. */
- bool preemptable; /* Preemptable RCU? */
+ bool preemptible; /* Preemptible RCU? */
struct rcu_node *mynode; /* This CPU's leaf of hierarchy */
unsigned long grpmask; /* Mask to apply to leaf qsmask. */
@@ -218,7 +278,6 @@ struct rcu_data {
/* 3) dynticks interface. */
struct rcu_dynticks *dynticks; /* Shared per-CPU dynticks state. */
int dynticks_snap; /* Per-GP tracking for dynticks. */
- int dynticks_nmi_snap; /* Per-GP tracking for dynticks_nmi. */
#endif /* #ifdef CONFIG_NO_HZ */
/* 4) reasons this CPU needed to be kicked by force_quiescent_state */
@@ -254,7 +313,6 @@ struct rcu_data {
#endif /* #else #ifdef CONFIG_NO_HZ */
#define RCU_JIFFIES_TILL_FORCE_QS 3 /* for rsp->jiffies_force_qs */
-#ifdef CONFIG_RCU_CPU_STALL_DETECTOR
#ifdef CONFIG_PROVE_RCU
#define RCU_STALL_DELAY_DELTA (5 * HZ)
@@ -272,13 +330,16 @@ struct rcu_data {
/* scheduling clock irq */
/* before ratting on them. */
-#ifdef CONFIG_RCU_CPU_STALL_DETECTOR_RUNNABLE
-#define RCU_CPU_STALL_SUPPRESS_INIT 0
-#else
-#define RCU_CPU_STALL_SUPPRESS_INIT 1
-#endif
-
-#endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */
+#define rcu_wait(cond) \
+do { \
+ for (;;) { \
+ set_current_state(TASK_INTERRUPTIBLE); \
+ if (cond) \
+ break; \
+ schedule(); \
+ } \
+ __set_current_state(TASK_RUNNING); \
+} while (0)
/*
* RCU global state, including node hierarchy. This hierarchy is
@@ -325,12 +386,12 @@ struct rcu_state {
/* due to lock unavailable. */
unsigned long n_force_qs_ngp; /* Number of calls leaving */
/* due to no GP active. */
-#ifdef CONFIG_RCU_CPU_STALL_DETECTOR
unsigned long gp_start; /* Time at which GP started, */
/* but in jiffies. */
unsigned long jiffies_stall; /* Time at which to check */
/* for CPU stalls. */
-#endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */
+ unsigned long gp_max; /* Maximum GP duration in */
+ /* jiffies. */
char *name; /* Name of structure. */
};
@@ -361,16 +422,14 @@ DECLARE_PER_CPU(struct rcu_data, rcu_preempt_data);
static void rcu_bootup_announce(void);
long rcu_batches_completed(void);
static void rcu_preempt_note_context_switch(int cpu);
-static int rcu_preempted_readers(struct rcu_node *rnp);
+static int rcu_preempt_blocked_readers_cgp(struct rcu_node *rnp);
#ifdef CONFIG_HOTPLUG_CPU
static void rcu_report_unblock_qs_rnp(struct rcu_node *rnp,
unsigned long flags);
#endif /* #ifdef CONFIG_HOTPLUG_CPU */
-#ifdef CONFIG_RCU_CPU_STALL_DETECTOR
static void rcu_print_detail_task_stall(struct rcu_state *rsp);
static void rcu_print_task_stall(struct rcu_node *rnp);
static void rcu_preempt_stall_reset(void);
-#endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */
static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp);
#ifdef CONFIG_HOTPLUG_CPU
static int rcu_preempt_offline_tasks(struct rcu_state *rsp,
@@ -390,5 +449,12 @@ static void __cpuinit rcu_preempt_init_percpu_data(int cpu);
static void rcu_preempt_send_cbs_to_online(void);
static void __init __rcu_init_preempt(void);
static void rcu_needs_cpu_flush(void);
+static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags);
+static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp,
+ cpumask_var_t cm);
+static void rcu_preempt_boost_start_gp(struct rcu_node *rnp);
+static int __cpuinit rcu_spawn_one_boost_kthread(struct rcu_state *rsp,
+ struct rcu_node *rnp,
+ int rnp_index);
#endif /* #ifndef RCU_TREE_NONCORE */
diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h
index a3638710dc67..a767b7dac365 100644
--- a/kernel/rcutree_plugin.h
+++ b/kernel/rcutree_plugin.h
@@ -1,7 +1,7 @@
/*
* Read-Copy Update mechanism for mutual exclusion (tree-based version)
* Internal non-public definitions that provide either classic
- * or preemptable semantics.
+ * or preemptible semantics.
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
@@ -54,10 +54,6 @@ static void __init rcu_bootup_announce_oddness(void)
#ifdef CONFIG_RCU_TORTURE_TEST_RUNNABLE
printk(KERN_INFO "\tRCU torture testing starts during boot.\n");
#endif
-#ifndef CONFIG_RCU_CPU_STALL_DETECTOR
- printk(KERN_INFO
- "\tRCU-based detection of stalled CPUs is disabled.\n");
-#endif
#if defined(CONFIG_TREE_PREEMPT_RCU) && !defined(CONFIG_RCU_CPU_STALL_VERBOSE)
printk(KERN_INFO "\tVerbose stalled-CPUs detection is disabled.\n");
#endif
@@ -70,6 +66,7 @@ static void __init rcu_bootup_announce_oddness(void)
struct rcu_state rcu_preempt_state = RCU_STATE_INITIALIZER(rcu_preempt_state);
DEFINE_PER_CPU(struct rcu_data, rcu_preempt_data);
+static struct rcu_state *rcu_state = &rcu_preempt_state;
static int rcu_preempted_readers_exp(struct rcu_node *rnp);
@@ -78,7 +75,7 @@ static int rcu_preempted_readers_exp(struct rcu_node *rnp);
*/
static void __init rcu_bootup_announce(void)
{
- printk(KERN_INFO "Preemptable hierarchical RCU implementation.\n");
+ printk(KERN_INFO "Preemptible hierarchical RCU implementation.\n");
rcu_bootup_announce_oddness();
}
@@ -111,7 +108,7 @@ void rcu_force_quiescent_state(void)
EXPORT_SYMBOL_GPL(rcu_force_quiescent_state);
/*
- * Record a preemptable-RCU quiescent state for the specified CPU. Note
+ * Record a preemptible-RCU quiescent state for the specified CPU. Note
* that this just means that the task currently running on the CPU is
* not in a quiescent state. There might be any number of tasks blocked
* while in an RCU read-side critical section.
@@ -134,12 +131,12 @@ static void rcu_preempt_qs(int cpu)
* We have entered the scheduler, and the current task might soon be
* context-switched away from. If this task is in an RCU read-side
* critical section, we will no longer be able to rely on the CPU to
- * record that fact, so we enqueue the task on the appropriate entry
- * of the blocked_tasks[] array. The task will dequeue itself when
- * it exits the outermost enclosing RCU read-side critical section.
- * Therefore, the current grace period cannot be permitted to complete
- * until the blocked_tasks[] entry indexed by the low-order bit of
- * rnp->gpnum empties.
+ * record that fact, so we enqueue the task on the blkd_tasks list.
+ * The task will dequeue itself when it exits the outermost enclosing
+ * RCU read-side critical section. Therefore, the current grace period
+ * cannot be permitted to complete until the blkd_tasks list entries
+ * predating the current grace period drain, in other words, until
+ * rnp->gp_tasks becomes NULL.
*
* Caller must disable preemption.
*/
@@ -147,7 +144,6 @@ static void rcu_preempt_note_context_switch(int cpu)
{
struct task_struct *t = current;
unsigned long flags;
- int phase;
struct rcu_data *rdp;
struct rcu_node *rnp;
@@ -169,15 +165,30 @@ static void rcu_preempt_note_context_switch(int cpu)
* (i.e., this CPU has not yet passed through a quiescent
* state for the current grace period), then as long
* as that task remains queued, the current grace period
- * cannot end.
+ * cannot end. Note that there is some uncertainty as
+ * to exactly when the current grace period started.
+ * We take a conservative approach, which can result
+ * in unnecessarily waiting on tasks that started very
+ * slightly after the current grace period began. C'est
+ * la vie!!!
*
* But first, note that the current CPU must still be
* on line!
*/
WARN_ON_ONCE((rdp->grpmask & rnp->qsmaskinit) == 0);
WARN_ON_ONCE(!list_empty(&t->rcu_node_entry));
- phase = (rnp->gpnum + !(rnp->qsmask & rdp->grpmask)) & 0x1;
- list_add(&t->rcu_node_entry, &rnp->blocked_tasks[phase]);
+ if ((rnp->qsmask & rdp->grpmask) && rnp->gp_tasks != NULL) {
+ list_add(&t->rcu_node_entry, rnp->gp_tasks->prev);
+ rnp->gp_tasks = &t->rcu_node_entry;
+#ifdef CONFIG_RCU_BOOST
+ if (rnp->boost_tasks != NULL)
+ rnp->boost_tasks = rnp->gp_tasks;
+#endif /* #ifdef CONFIG_RCU_BOOST */
+ } else {
+ list_add(&t->rcu_node_entry, &rnp->blkd_tasks);
+ if (rnp->qsmask & rdp->grpmask)
+ rnp->gp_tasks = &t->rcu_node_entry;
+ }
raw_spin_unlock_irqrestore(&rnp->lock, flags);
}
@@ -196,7 +207,7 @@ static void rcu_preempt_note_context_switch(int cpu)
}
/*
- * Tree-preemptable RCU implementation for rcu_read_lock().
+ * Tree-preemptible RCU implementation for rcu_read_lock().
* Just increment ->rcu_read_lock_nesting, shared state will be updated
* if we block.
*/
@@ -212,12 +223,9 @@ EXPORT_SYMBOL_GPL(__rcu_read_lock);
* for the specified rcu_node structure. If the caller needs a reliable
* answer, it must hold the rcu_node's ->lock.
*/
-static int rcu_preempted_readers(struct rcu_node *rnp)
+static int rcu_preempt_blocked_readers_cgp(struct rcu_node *rnp)
{
- int phase = rnp->gpnum & 0x1;
-
- return !list_empty(&rnp->blocked_tasks[phase]) ||
- !list_empty(&rnp->blocked_tasks[phase + 2]);
+ return rnp->gp_tasks != NULL;
}
/*
@@ -233,7 +241,7 @@ static void rcu_report_unblock_qs_rnp(struct rcu_node *rnp, unsigned long flags)
unsigned long mask;
struct rcu_node *rnp_p;
- if (rnp->qsmask != 0 || rcu_preempted_readers(rnp)) {
+ if (rnp->qsmask != 0 || rcu_preempt_blocked_readers_cgp(rnp)) {
raw_spin_unlock_irqrestore(&rnp->lock, flags);
return; /* Still need more quiescent states! */
}
@@ -257,6 +265,21 @@ static void rcu_report_unblock_qs_rnp(struct rcu_node *rnp, unsigned long flags)
}
/*
+ * Advance a ->blkd_tasks-list pointer to the next entry, instead
+ * returning NULL if at the end of the list.
+ */
+static struct list_head *rcu_next_node_entry(struct task_struct *t,
+ struct rcu_node *rnp)
+{
+ struct list_head *np;
+
+ np = t->rcu_node_entry.next;
+ if (np == &rnp->blkd_tasks)
+ np = NULL;
+ return np;
+}
+
+/*
* Handle special cases during rcu_read_unlock(), such as needing to
* notify RCU core processing or task having blocked during the RCU
* read-side critical section.
@@ -266,6 +289,7 @@ static void rcu_read_unlock_special(struct task_struct *t)
int empty;
int empty_exp;
unsigned long flags;
+ struct list_head *np;
struct rcu_node *rnp;
int special;
@@ -306,10 +330,19 @@ static void rcu_read_unlock_special(struct task_struct *t)
break;
raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */
}
- empty = !rcu_preempted_readers(rnp);
+ empty = !rcu_preempt_blocked_readers_cgp(rnp);
empty_exp = !rcu_preempted_readers_exp(rnp);
smp_mb(); /* ensure expedited fastpath sees end of RCU c-s. */
+ np = rcu_next_node_entry(t, rnp);
list_del_init(&t->rcu_node_entry);
+ if (&t->rcu_node_entry == rnp->gp_tasks)
+ rnp->gp_tasks = np;
+ if (&t->rcu_node_entry == rnp->exp_tasks)
+ rnp->exp_tasks = np;
+#ifdef CONFIG_RCU_BOOST
+ if (&t->rcu_node_entry == rnp->boost_tasks)
+ rnp->boost_tasks = np;
+#endif /* #ifdef CONFIG_RCU_BOOST */
t->rcu_blocked_node = NULL;
/*
@@ -322,6 +355,15 @@ static void rcu_read_unlock_special(struct task_struct *t)
else
rcu_report_unblock_qs_rnp(rnp, flags);
+#ifdef CONFIG_RCU_BOOST
+ /* Unboost if we were boosted. */
+ if (special & RCU_READ_UNLOCK_BOOSTED) {
+ t->rcu_read_unlock_special &= ~RCU_READ_UNLOCK_BOOSTED;
+ rt_mutex_unlock(t->rcu_boost_mutex);
+ t->rcu_boost_mutex = NULL;
+ }
+#endif /* #ifdef CONFIG_RCU_BOOST */
+
/*
* If this was the last task on the expedited lists,
* then we need to report up the rcu_node hierarchy.
@@ -334,7 +376,7 @@ static void rcu_read_unlock_special(struct task_struct *t)
}
/*
- * Tree-preemptable RCU implementation for rcu_read_unlock().
+ * Tree-preemptible RCU implementation for rcu_read_unlock().
* Decrement ->rcu_read_lock_nesting. If the result is zero (outermost
* rcu_read_unlock()) and ->rcu_read_unlock_special is non-zero, then
* invoke rcu_read_unlock_special() to clean up after a context switch
@@ -356,8 +398,6 @@ void __rcu_read_unlock(void)
}
EXPORT_SYMBOL_GPL(__rcu_read_unlock);
-#ifdef CONFIG_RCU_CPU_STALL_DETECTOR
-
#ifdef CONFIG_RCU_CPU_STALL_VERBOSE
/*
@@ -367,18 +407,16 @@ EXPORT_SYMBOL_GPL(__rcu_read_unlock);
static void rcu_print_detail_task_stall_rnp(struct rcu_node *rnp)
{
unsigned long flags;
- struct list_head *lp;
- int phase;
struct task_struct *t;
- if (rcu_preempted_readers(rnp)) {
- raw_spin_lock_irqsave(&rnp->lock, flags);
- phase = rnp->gpnum & 0x1;
- lp = &rnp->blocked_tasks[phase];
- list_for_each_entry(t, lp, rcu_node_entry)
- sched_show_task(t);
- raw_spin_unlock_irqrestore(&rnp->lock, flags);
- }
+ if (!rcu_preempt_blocked_readers_cgp(rnp))
+ return;
+ raw_spin_lock_irqsave(&rnp->lock, flags);
+ t = list_entry(rnp->gp_tasks,
+ struct task_struct, rcu_node_entry);
+ list_for_each_entry_continue(t, &rnp->blkd_tasks, rcu_node_entry)
+ sched_show_task(t);
+ raw_spin_unlock_irqrestore(&rnp->lock, flags);
}
/*
@@ -408,16 +446,14 @@ static void rcu_print_detail_task_stall(struct rcu_state *rsp)
*/
static void rcu_print_task_stall(struct rcu_node *rnp)
{
- struct list_head *lp;
- int phase;
struct task_struct *t;
- if (rcu_preempted_readers(rnp)) {
- phase = rnp->gpnum & 0x1;
- lp = &rnp->blocked_tasks[phase];
- list_for_each_entry(t, lp, rcu_node_entry)
- printk(" P%d", t->pid);
- }
+ if (!rcu_preempt_blocked_readers_cgp(rnp))
+ return;
+ t = list_entry(rnp->gp_tasks,
+ struct task_struct, rcu_node_entry);
+ list_for_each_entry_continue(t, &rnp->blkd_tasks, rcu_node_entry)
+ printk(" P%d", t->pid);
}
/*
@@ -430,18 +466,21 @@ static void rcu_preempt_stall_reset(void)
rcu_preempt_state.jiffies_stall = jiffies + ULONG_MAX / 2;
}
-#endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */
-
/*
* Check that the list of blocked tasks for the newly completed grace
* period is in fact empty. It is a serious bug to complete a grace
* period that still has RCU readers blocked! This function must be
* invoked -before- updating this rnp's ->gpnum, and the rnp's ->lock
* must be held by the caller.
+ *
+ * Also, if there are blocked tasks on the list, they automatically
+ * block the newly created grace period, so set up ->gp_tasks accordingly.
*/
static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp)
{
- WARN_ON_ONCE(rcu_preempted_readers(rnp));
+ WARN_ON_ONCE(rcu_preempt_blocked_readers_cgp(rnp));
+ if (!list_empty(&rnp->blkd_tasks))
+ rnp->gp_tasks = rnp->blkd_tasks.next;
WARN_ON_ONCE(rnp->qsmask);
}
@@ -465,50 +504,68 @@ static int rcu_preempt_offline_tasks(struct rcu_state *rsp,
struct rcu_node *rnp,
struct rcu_data *rdp)
{
- int i;
struct list_head *lp;
struct list_head *lp_root;
int retval = 0;
struct rcu_node *rnp_root = rcu_get_root(rsp);
- struct task_struct *tp;
+ struct task_struct *t;
if (rnp == rnp_root) {
WARN_ONCE(1, "Last CPU thought to be offlined?");
return 0; /* Shouldn't happen: at least one CPU online. */
}
- WARN_ON_ONCE(rnp != rdp->mynode &&
- (!list_empty(&rnp->blocked_tasks[0]) ||
- !list_empty(&rnp->blocked_tasks[1]) ||
- !list_empty(&rnp->blocked_tasks[2]) ||
- !list_empty(&rnp->blocked_tasks[3])));
+
+ /* If we are on an internal node, complain bitterly. */
+ WARN_ON_ONCE(rnp != rdp->mynode);
/*
- * Move tasks up to root rcu_node. Rely on the fact that the
- * root rcu_node can be at most one ahead of the rest of the
- * rcu_nodes in terms of gp_num value. This fact allows us to
- * move the blocked_tasks[] array directly, element by element.
+ * Move tasks up to root rcu_node. Don't try to get fancy for
+ * this corner-case operation -- just put this node's tasks
+ * at the head of the root node's list, and update the root node's
+ * ->gp_tasks and ->exp_tasks pointers to those of this node's,
+ * if non-NULL. This might result in waiting for more tasks than
+ * absolutely necessary, but this is a good performance/complexity
+ * tradeoff.
*/
- if (rcu_preempted_readers(rnp))
+ if (rcu_preempt_blocked_readers_cgp(rnp))
retval |= RCU_OFL_TASKS_NORM_GP;
if (rcu_preempted_readers_exp(rnp))
retval |= RCU_OFL_TASKS_EXP_GP;
- for (i = 0; i < 4; i++) {
- lp = &rnp->blocked_tasks[i];
- lp_root = &rnp_root->blocked_tasks[i];
- while (!list_empty(lp)) {
- tp = list_entry(lp->next, typeof(*tp), rcu_node_entry);
- raw_spin_lock(&rnp_root->lock); /* irqs already disabled */
- list_del(&tp->rcu_node_entry);
- tp->rcu_blocked_node = rnp_root;
- list_add(&tp->rcu_node_entry, lp_root);
- raw_spin_unlock(&rnp_root->lock); /* irqs remain disabled */
- }
+ lp = &rnp->blkd_tasks;
+ lp_root = &rnp_root->blkd_tasks;
+ while (!list_empty(lp)) {
+ t = list_entry(lp->next, typeof(*t), rcu_node_entry);
+ raw_spin_lock(&rnp_root->lock); /* irqs already disabled */
+ list_del(&t->rcu_node_entry);
+ t->rcu_blocked_node = rnp_root;
+ list_add(&t->rcu_node_entry, lp_root);
+ if (&t->rcu_node_entry == rnp->gp_tasks)
+ rnp_root->gp_tasks = rnp->gp_tasks;
+ if (&t->rcu_node_entry == rnp->exp_tasks)
+ rnp_root->exp_tasks = rnp->exp_tasks;
+#ifdef CONFIG_RCU_BOOST
+ if (&t->rcu_node_entry == rnp->boost_tasks)
+ rnp_root->boost_tasks = rnp->boost_tasks;
+#endif /* #ifdef CONFIG_RCU_BOOST */
+ raw_spin_unlock(&rnp_root->lock); /* irqs still disabled */
}
+
+#ifdef CONFIG_RCU_BOOST
+ /* In case root is being boosted and leaf is not. */
+ raw_spin_lock(&rnp_root->lock); /* irqs already disabled */
+ if (rnp_root->boost_tasks != NULL &&
+ rnp_root->boost_tasks != rnp_root->gp_tasks)
+ rnp_root->boost_tasks = rnp_root->gp_tasks;
+ raw_spin_unlock(&rnp_root->lock); /* irqs still disabled */
+#endif /* #ifdef CONFIG_RCU_BOOST */
+
+ rnp->gp_tasks = NULL;
+ rnp->exp_tasks = NULL;
return retval;
}
/*
- * Do CPU-offline processing for preemptable RCU.
+ * Do CPU-offline processing for preemptible RCU.
*/
static void rcu_preempt_offline_cpu(int cpu)
{
@@ -537,7 +594,7 @@ static void rcu_preempt_check_callbacks(int cpu)
}
/*
- * Process callbacks for preemptable RCU.
+ * Process callbacks for preemptible RCU.
*/
static void rcu_preempt_process_callbacks(void)
{
@@ -546,7 +603,7 @@ static void rcu_preempt_process_callbacks(void)
}
/*
- * Queue a preemptable-RCU callback for invocation after a grace period.
+ * Queue a preemptible-RCU callback for invocation after a grace period.
*/
void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu))
{
@@ -594,8 +651,7 @@ static DEFINE_MUTEX(sync_rcu_preempt_exp_mutex);
*/
static int rcu_preempted_readers_exp(struct rcu_node *rnp)
{
- return !list_empty(&rnp->blocked_tasks[2]) ||
- !list_empty(&rnp->blocked_tasks[3]);
+ return rnp->exp_tasks != NULL;
}
/*
@@ -655,13 +711,17 @@ static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp)
static void
sync_rcu_preempt_exp_init(struct rcu_state *rsp, struct rcu_node *rnp)
{
- int must_wait;
+ unsigned long flags;
+ int must_wait = 0;
- raw_spin_lock(&rnp->lock); /* irqs already disabled */
- list_splice_init(&rnp->blocked_tasks[0], &rnp->blocked_tasks[2]);
- list_splice_init(&rnp->blocked_tasks[1], &rnp->blocked_tasks[3]);
- must_wait = rcu_preempted_readers_exp(rnp);
- raw_spin_unlock(&rnp->lock); /* irqs remain disabled */
+ raw_spin_lock_irqsave(&rnp->lock, flags);
+ if (list_empty(&rnp->blkd_tasks))
+ raw_spin_unlock_irqrestore(&rnp->lock, flags);
+ else {
+ rnp->exp_tasks = rnp->blkd_tasks.next;
+ rcu_initiate_boost(rnp, flags); /* releases rnp->lock */
+ must_wait = 1;
+ }
if (!must_wait)
rcu_report_exp_rnp(rsp, rnp);
}
@@ -669,9 +729,7 @@ sync_rcu_preempt_exp_init(struct rcu_state *rsp, struct rcu_node *rnp)
/*
* Wait for an rcu-preempt grace period, but expedite it. The basic idea
* is to invoke synchronize_sched_expedited() to push all the tasks to
- * the ->blocked_tasks[] lists, move all entries from the first set of
- * ->blocked_tasks[] lists to the second set, and finally wait for this
- * second set to drain.
+ * the ->blkd_tasks lists and wait for this list to drain.
*/
void synchronize_rcu_expedited(void)
{
@@ -703,7 +761,7 @@ void synchronize_rcu_expedited(void)
if ((ACCESS_ONCE(sync_rcu_preempt_exp_count) - snap) > 0)
goto unlock_mb_ret; /* Others did our work for us. */
- /* force all RCU readers onto blocked_tasks[]. */
+ /* force all RCU readers onto ->blkd_tasks lists. */
synchronize_sched_expedited();
raw_spin_lock_irqsave(&rsp->onofflock, flags);
@@ -715,7 +773,7 @@ void synchronize_rcu_expedited(void)
raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */
}
- /* Snapshot current state of ->blocked_tasks[] lists. */
+ /* Snapshot current state of ->blkd_tasks lists. */
rcu_for_each_leaf_node(rsp, rnp)
sync_rcu_preempt_exp_init(rsp, rnp);
if (NUM_RCU_NODES > 1)
@@ -723,7 +781,7 @@ void synchronize_rcu_expedited(void)
raw_spin_unlock_irqrestore(&rsp->onofflock, flags);
- /* Wait for snapshotted ->blocked_tasks[] lists to drain. */
+ /* Wait for snapshotted ->blkd_tasks lists to drain. */
rnp = rcu_get_root(rsp);
wait_event(sync_rcu_preempt_exp_wq,
sync_rcu_preempt_exp_done(rnp));
@@ -739,7 +797,7 @@ mb_ret:
EXPORT_SYMBOL_GPL(synchronize_rcu_expedited);
/*
- * Check to see if there is any immediate preemptable-RCU-related work
+ * Check to see if there is any immediate preemptible-RCU-related work
* to be done.
*/
static int rcu_preempt_pending(int cpu)
@@ -749,7 +807,7 @@ static int rcu_preempt_pending(int cpu)
}
/*
- * Does preemptable RCU need the CPU to stay out of dynticks mode?
+ * Does preemptible RCU need the CPU to stay out of dynticks mode?
*/
static int rcu_preempt_needs_cpu(int cpu)
{
@@ -766,7 +824,7 @@ void rcu_barrier(void)
EXPORT_SYMBOL_GPL(rcu_barrier);
/*
- * Initialize preemptable RCU's per-CPU data.
+ * Initialize preemptible RCU's per-CPU data.
*/
static void __cpuinit rcu_preempt_init_percpu_data(int cpu)
{
@@ -774,7 +832,7 @@ static void __cpuinit rcu_preempt_init_percpu_data(int cpu)
}
/*
- * Move preemptable RCU's callbacks from dying CPU to other online CPU.
+ * Move preemptible RCU's callbacks from dying CPU to other online CPU.
*/
static void rcu_preempt_send_cbs_to_online(void)
{
@@ -782,7 +840,7 @@ static void rcu_preempt_send_cbs_to_online(void)
}
/*
- * Initialize preemptable RCU's state structures.
+ * Initialize preemptible RCU's state structures.
*/
static void __init __rcu_init_preempt(void)
{
@@ -790,7 +848,7 @@ static void __init __rcu_init_preempt(void)
}
/*
- * Check for a task exiting while in a preemptable-RCU read-side
+ * Check for a task exiting while in a preemptible-RCU read-side
* critical section, clean up if so. No need to issue warnings,
* as debug_check_no_locks_held() already does this if lockdep
* is enabled.
@@ -802,11 +860,13 @@ void exit_rcu(void)
if (t->rcu_read_lock_nesting == 0)
return;
t->rcu_read_lock_nesting = 1;
- rcu_read_unlock();
+ __rcu_read_unlock();
}
#else /* #ifdef CONFIG_TREE_PREEMPT_RCU */
+static struct rcu_state *rcu_state = &rcu_sched_state;
+
/*
* Tell them what RCU they are running.
*/
@@ -836,7 +896,7 @@ void rcu_force_quiescent_state(void)
EXPORT_SYMBOL_GPL(rcu_force_quiescent_state);
/*
- * Because preemptable RCU does not exist, we never have to check for
+ * Because preemptible RCU does not exist, we never have to check for
* CPUs being in quiescent states.
*/
static void rcu_preempt_note_context_switch(int cpu)
@@ -844,10 +904,10 @@ static void rcu_preempt_note_context_switch(int cpu)
}
/*
- * Because preemptable RCU does not exist, there are never any preempted
+ * Because preemptible RCU does not exist, there are never any preempted
* RCU readers.
*/
-static int rcu_preempted_readers(struct rcu_node *rnp)
+static int rcu_preempt_blocked_readers_cgp(struct rcu_node *rnp)
{
return 0;
}
@@ -862,10 +922,8 @@ static void rcu_report_unblock_qs_rnp(struct rcu_node *rnp, unsigned long flags)
#endif /* #ifdef CONFIG_HOTPLUG_CPU */
-#ifdef CONFIG_RCU_CPU_STALL_DETECTOR
-
/*
- * Because preemptable RCU does not exist, we never have to check for
+ * Because preemptible RCU does not exist, we never have to check for
* tasks blocked within RCU read-side critical sections.
*/
static void rcu_print_detail_task_stall(struct rcu_state *rsp)
@@ -873,7 +931,7 @@ static void rcu_print_detail_task_stall(struct rcu_state *rsp)
}
/*
- * Because preemptable RCU does not exist, we never have to check for
+ * Because preemptible RCU does not exist, we never have to check for
* tasks blocked within RCU read-side critical sections.
*/
static void rcu_print_task_stall(struct rcu_node *rnp)
@@ -888,10 +946,8 @@ static void rcu_preempt_stall_reset(void)
{
}
-#endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */
-
/*
- * Because there is no preemptable RCU, there can be no readers blocked,
+ * Because there is no preemptible RCU, there can be no readers blocked,
* so there is no need to check for blocked tasks. So check only for
* bogus qsmask values.
*/
@@ -903,7 +959,7 @@ static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp)
#ifdef CONFIG_HOTPLUG_CPU
/*
- * Because preemptable RCU does not exist, it never needs to migrate
+ * Because preemptible RCU does not exist, it never needs to migrate
* tasks that were blocked within RCU read-side critical sections, and
* such non-existent tasks cannot possibly have been blocking the current
* grace period.
@@ -916,7 +972,7 @@ static int rcu_preempt_offline_tasks(struct rcu_state *rsp,
}
/*
- * Because preemptable RCU does not exist, it never needs CPU-offline
+ * Because preemptible RCU does not exist, it never needs CPU-offline
* processing.
*/
static void rcu_preempt_offline_cpu(int cpu)
@@ -926,7 +982,7 @@ static void rcu_preempt_offline_cpu(int cpu)
#endif /* #ifdef CONFIG_HOTPLUG_CPU */
/*
- * Because preemptable RCU does not exist, it never has any callbacks
+ * Because preemptible RCU does not exist, it never has any callbacks
* to check.
*/
static void rcu_preempt_check_callbacks(int cpu)
@@ -934,7 +990,7 @@ static void rcu_preempt_check_callbacks(int cpu)
}
/*
- * Because preemptable RCU does not exist, it never has any callbacks
+ * Because preemptible RCU does not exist, it never has any callbacks
* to process.
*/
static void rcu_preempt_process_callbacks(void)
@@ -943,7 +999,7 @@ static void rcu_preempt_process_callbacks(void)
/*
* Wait for an rcu-preempt grace period, but make it happen quickly.
- * But because preemptable RCU does not exist, map to rcu-sched.
+ * But because preemptible RCU does not exist, map to rcu-sched.
*/
void synchronize_rcu_expedited(void)
{
@@ -954,7 +1010,7 @@ EXPORT_SYMBOL_GPL(synchronize_rcu_expedited);
#ifdef CONFIG_HOTPLUG_CPU
/*
- * Because preemptable RCU does not exist, there is never any need to
+ * Because preemptible RCU does not exist, there is never any need to
* report on tasks preempted in RCU read-side critical sections during
* expedited RCU grace periods.
*/
@@ -966,7 +1022,7 @@ static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp)
#endif /* #ifdef CONFIG_HOTPLUG_CPU */
/*
- * Because preemptable RCU does not exist, it never has any work to do.
+ * Because preemptible RCU does not exist, it never has any work to do.
*/
static int rcu_preempt_pending(int cpu)
{
@@ -974,7 +1030,7 @@ static int rcu_preempt_pending(int cpu)
}
/*
- * Because preemptable RCU does not exist, it never needs any CPU.
+ * Because preemptible RCU does not exist, it never needs any CPU.
*/
static int rcu_preempt_needs_cpu(int cpu)
{
@@ -982,7 +1038,7 @@ static int rcu_preempt_needs_cpu(int cpu)
}
/*
- * Because preemptable RCU does not exist, rcu_barrier() is just
+ * Because preemptible RCU does not exist, rcu_barrier() is just
* another name for rcu_barrier_sched().
*/
void rcu_barrier(void)
@@ -992,7 +1048,7 @@ void rcu_barrier(void)
EXPORT_SYMBOL_GPL(rcu_barrier);
/*
- * Because preemptable RCU does not exist, there is no per-CPU
+ * Because preemptible RCU does not exist, there is no per-CPU
* data to initialize.
*/
static void __cpuinit rcu_preempt_init_percpu_data(int cpu)
@@ -1000,14 +1056,14 @@ static void __cpuinit rcu_preempt_init_percpu_data(int cpu)
}
/*
- * Because there is no preemptable RCU, there are no callbacks to move.
+ * Because there is no preemptible RCU, there are no callbacks to move.
*/
static void rcu_preempt_send_cbs_to_online(void)
{
}
/*
- * Because preemptable RCU does not exist, it need not be initialized.
+ * Because preemptible RCU does not exist, it need not be initialized.
*/
static void __init __rcu_init_preempt(void)
{
@@ -1015,6 +1071,263 @@ static void __init __rcu_init_preempt(void)
#endif /* #else #ifdef CONFIG_TREE_PREEMPT_RCU */
+#ifdef CONFIG_RCU_BOOST
+
+#include "rtmutex_common.h"
+
+#ifdef CONFIG_RCU_TRACE
+
+static void rcu_initiate_boost_trace(struct rcu_node *rnp)
+{
+ if (list_empty(&rnp->blkd_tasks))
+ rnp->n_balk_blkd_tasks++;
+ else if (rnp->exp_tasks == NULL && rnp->gp_tasks == NULL)
+ rnp->n_balk_exp_gp_tasks++;
+ else if (rnp->gp_tasks != NULL && rnp->boost_tasks != NULL)
+ rnp->n_balk_boost_tasks++;
+ else if (rnp->gp_tasks != NULL && rnp->qsmask != 0)
+ rnp->n_balk_notblocked++;
+ else if (rnp->gp_tasks != NULL &&
+ ULONG_CMP_LT(jiffies, rnp->boost_time))
+ rnp->n_balk_notyet++;
+ else
+ rnp->n_balk_nos++;
+}
+
+#else /* #ifdef CONFIG_RCU_TRACE */
+
+static void rcu_initiate_boost_trace(struct rcu_node *rnp)
+{
+}
+
+#endif /* #else #ifdef CONFIG_RCU_TRACE */
+
+/*
+ * Carry out RCU priority boosting on the task indicated by ->exp_tasks
+ * or ->boost_tasks, advancing the pointer to the next task in the
+ * ->blkd_tasks list.
+ *
+ * Note that irqs must be enabled: boosting the task can block.
+ * Returns 1 if there are more tasks needing to be boosted.
+ */
+static int rcu_boost(struct rcu_node *rnp)
+{
+ unsigned long flags;
+ struct rt_mutex mtx;
+ struct task_struct *t;
+ struct list_head *tb;
+
+ if (rnp->exp_tasks == NULL && rnp->boost_tasks == NULL)
+ return 0; /* Nothing left to boost. */
+
+ raw_spin_lock_irqsave(&rnp->lock, flags);
+
+ /*
+ * Recheck under the lock: all tasks in need of boosting
+ * might exit their RCU read-side critical sections on their own.
+ */
+ if (rnp->exp_tasks == NULL && rnp->boost_tasks == NULL) {
+ raw_spin_unlock_irqrestore(&rnp->lock, flags);
+ return 0;
+ }
+
+ /*
+ * Preferentially boost tasks blocking expedited grace periods.
+ * This cannot starve the normal grace periods because a second
+ * expedited grace period must boost all blocked tasks, including
+ * those blocking the pre-existing normal grace period.
+ */
+ if (rnp->exp_tasks != NULL) {
+ tb = rnp->exp_tasks;
+ rnp->n_exp_boosts++;
+ } else {
+ tb = rnp->boost_tasks;
+ rnp->n_normal_boosts++;
+ }
+ rnp->n_tasks_boosted++;
+
+ /*
+ * We boost task t by manufacturing an rt_mutex that appears to
+ * be held by task t. We leave a pointer to that rt_mutex where
+ * task t can find it, and task t will release the mutex when it
+ * exits its outermost RCU read-side critical section. Then
+ * simply acquiring this artificial rt_mutex will boost task
+ * t's priority. (Thanks to tglx for suggesting this approach!)
+ *
+ * Note that task t must acquire rnp->lock to remove itself from
+ * the ->blkd_tasks list, which it will do from exit() if from
+ * nowhere else. We therefore are guaranteed that task t will
+ * stay around at least until we drop rnp->lock. Note that
+ * rnp->lock also resolves races between our priority boosting
+ * and task t's exiting its outermost RCU read-side critical
+ * section.
+ */
+ t = container_of(tb, struct task_struct, rcu_node_entry);
+ rt_mutex_init_proxy_locked(&mtx, t);
+ t->rcu_boost_mutex = &mtx;
+ t->rcu_read_unlock_special |= RCU_READ_UNLOCK_BOOSTED;
+ raw_spin_unlock_irqrestore(&rnp->lock, flags);
+ rt_mutex_lock(&mtx); /* Side effect: boosts task t's priority. */
+ rt_mutex_unlock(&mtx); /* Keep lockdep happy. */
+
+ return rnp->exp_tasks != NULL || rnp->boost_tasks != NULL;
+}
+
+/*
+ * Timer handler to initiate waking up of boost kthreads that
+ * have yielded the CPU due to excessive numbers of tasks to
+ * boost. We wake up the per-rcu_node kthread, which in turn
+ * will wake up the booster kthread.
+ */
+static void rcu_boost_kthread_timer(unsigned long arg)
+{
+ invoke_rcu_node_kthread((struct rcu_node *)arg);
+}
+
+/*
+ * Priority-boosting kthread. One per leaf rcu_node and one for the
+ * root rcu_node.
+ */
+static int rcu_boost_kthread(void *arg)
+{
+ struct rcu_node *rnp = (struct rcu_node *)arg;
+ int spincnt = 0;
+ int more2boost;
+
+ for (;;) {
+ rnp->boost_kthread_status = RCU_KTHREAD_WAITING;
+ rcu_wait(rnp->boost_tasks || rnp->exp_tasks);
+ rnp->boost_kthread_status = RCU_KTHREAD_RUNNING;
+ more2boost = rcu_boost(rnp);
+ if (more2boost)
+ spincnt++;
+ else
+ spincnt = 0;
+ if (spincnt > 10) {
+ rcu_yield(rcu_boost_kthread_timer, (unsigned long)rnp);
+ spincnt = 0;
+ }
+ }
+ /* NOTREACHED */
+ return 0;
+}
+
+/*
+ * Check to see if it is time to start boosting RCU readers that are
+ * blocking the current grace period, and, if so, tell the per-rcu_node
+ * kthread to start boosting them. If there is an expedited grace
+ * period in progress, it is always time to boost.
+ *
+ * The caller must hold rnp->lock, which this function releases,
+ * but irqs remain disabled. The ->boost_kthread_task is immortal,
+ * so we don't need to worry about it going away.
+ */
+static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags)
+{
+ struct task_struct *t;
+
+ if (!rcu_preempt_blocked_readers_cgp(rnp) && rnp->exp_tasks == NULL) {
+ rnp->n_balk_exp_gp_tasks++;
+ raw_spin_unlock_irqrestore(&rnp->lock, flags);
+ return;
+ }
+ if (rnp->exp_tasks != NULL ||
+ (rnp->gp_tasks != NULL &&
+ rnp->boost_tasks == NULL &&
+ rnp->qsmask == 0 &&
+ ULONG_CMP_GE(jiffies, rnp->boost_time))) {
+ if (rnp->exp_tasks == NULL)
+ rnp->boost_tasks = rnp->gp_tasks;
+ raw_spin_unlock_irqrestore(&rnp->lock, flags);
+ t = rnp->boost_kthread_task;
+ if (t != NULL)
+ wake_up_process(t);
+ } else {
+ rcu_initiate_boost_trace(rnp);
+ raw_spin_unlock_irqrestore(&rnp->lock, flags);
+ }
+}
+
+/*
+ * Set the affinity of the boost kthread. The CPU-hotplug locks are
+ * held, so no one should be messing with the existence of the boost
+ * kthread.
+ */
+static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp,
+ cpumask_var_t cm)
+{
+ struct task_struct *t;
+
+ t = rnp->boost_kthread_task;
+ if (t != NULL)
+ set_cpus_allowed_ptr(rnp->boost_kthread_task, cm);
+}
+
+#define RCU_BOOST_DELAY_JIFFIES DIV_ROUND_UP(CONFIG_RCU_BOOST_DELAY * HZ, 1000)
+
+/*
+ * Do priority-boost accounting for the start of a new grace period.
+ */
+static void rcu_preempt_boost_start_gp(struct rcu_node *rnp)
+{
+ rnp->boost_time = jiffies + RCU_BOOST_DELAY_JIFFIES;
+}
+
+/*
+ * Create an RCU-boost kthread for the specified node if one does not
+ * already exist. We only create this kthread for preemptible RCU.
+ * Returns zero if all is well, a negated errno otherwise.
+ */
+static int __cpuinit rcu_spawn_one_boost_kthread(struct rcu_state *rsp,
+ struct rcu_node *rnp,
+ int rnp_index)
+{
+ unsigned long flags;
+ struct sched_param sp;
+ struct task_struct *t;
+
+ if (&rcu_preempt_state != rsp)
+ return 0;
+ if (rnp->boost_kthread_task != NULL)
+ return 0;
+ t = kthread_create(rcu_boost_kthread, (void *)rnp,
+ "rcub%d", rnp_index);
+ if (IS_ERR(t))
+ return PTR_ERR(t);
+ raw_spin_lock_irqsave(&rnp->lock, flags);
+ set_task_state(t, TASK_INTERRUPTIBLE);
+ rnp->boost_kthread_task = t;
+ raw_spin_unlock_irqrestore(&rnp->lock, flags);
+ sp.sched_priority = RCU_KTHREAD_PRIO;
+ sched_setscheduler_nocheck(t, SCHED_FIFO, &sp);
+ return 0;
+}
+
+#else /* #ifdef CONFIG_RCU_BOOST */
+
+static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags)
+{
+ raw_spin_unlock_irqrestore(&rnp->lock, flags);
+}
+
+static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp,
+ cpumask_var_t cm)
+{
+}
+
+static void rcu_preempt_boost_start_gp(struct rcu_node *rnp)
+{
+}
+
+static int __cpuinit rcu_spawn_one_boost_kthread(struct rcu_state *rsp,
+ struct rcu_node *rnp,
+ int rnp_index)
+{
+ return 0;
+}
+
+#endif /* #else #ifdef CONFIG_RCU_BOOST */
+
#ifndef CONFIG_SMP
void synchronize_sched_expedited(void)
@@ -1187,14 +1500,13 @@ static DEFINE_PER_CPU(unsigned long, rcu_dyntick_holdoff);
*
* Because it is not legal to invoke rcu_process_callbacks() with irqs
* disabled, we do one pass of force_quiescent_state(), then do a
- * raise_softirq() to cause rcu_process_callbacks() to be invoked later.
- * The per-cpu rcu_dyntick_drain variable controls the sequencing.
+ * invoke_rcu_cpu_kthread() to cause rcu_process_callbacks() to be invoked
+ * later. The per-cpu rcu_dyntick_drain variable controls the sequencing.
*/
int rcu_needs_cpu(int cpu)
{
int c = 0;
int snap;
- int snap_nmi;
int thatcpu;
/* Check for being in the holdoff period. */
@@ -1205,10 +1517,10 @@ int rcu_needs_cpu(int cpu)
for_each_online_cpu(thatcpu) {
if (thatcpu == cpu)
continue;
- snap = per_cpu(rcu_dynticks, thatcpu).dynticks;
- snap_nmi = per_cpu(rcu_dynticks, thatcpu).dynticks_nmi;
+ snap = atomic_add_return(0, &per_cpu(rcu_dynticks,
+ thatcpu).dynticks);
smp_mb(); /* Order sampling of snap with end of grace period. */
- if (((snap & 0x1) != 0) || ((snap_nmi & 0x1) != 0)) {
+ if ((snap & 0x1) != 0) {
per_cpu(rcu_dyntick_drain, cpu) = 0;
per_cpu(rcu_dyntick_holdoff, cpu) = jiffies - 1;
return rcu_needs_cpu_quick_check(cpu);
@@ -1239,7 +1551,7 @@ int rcu_needs_cpu(int cpu)
/* If RCU callbacks are still pending, RCU still needs this CPU. */
if (c)
- raise_softirq(RCU_SOFTIRQ);
+ invoke_rcu_cpu_kthread();
return c;
}
diff --git a/kernel/rcutree_trace.c b/kernel/rcutree_trace.c
index c8e97853b970..9678cc3650f5 100644
--- a/kernel/rcutree_trace.c
+++ b/kernel/rcutree_trace.c
@@ -46,6 +46,18 @@
#define RCU_TREE_NONCORE
#include "rcutree.h"
+DECLARE_PER_CPU(unsigned int, rcu_cpu_kthread_status);
+DECLARE_PER_CPU(unsigned int, rcu_cpu_kthread_cpu);
+DECLARE_PER_CPU(unsigned int, rcu_cpu_kthread_loops);
+DECLARE_PER_CPU(char, rcu_cpu_has_work);
+
+static char convert_kthread_status(unsigned int kthread_status)
+{
+ if (kthread_status > RCU_KTHREAD_MAX)
+ return '?';
+ return "SRWOY"[kthread_status];
+}
+
static void print_one_rcu_data(struct seq_file *m, struct rcu_data *rdp)
{
if (!rdp->beenonline)
@@ -57,14 +69,28 @@ static void print_one_rcu_data(struct seq_file *m, struct rcu_data *rdp)
rdp->passed_quiesc, rdp->passed_quiesc_completed,
rdp->qs_pending);
#ifdef CONFIG_NO_HZ
- seq_printf(m, " dt=%d/%d dn=%d df=%lu",
- rdp->dynticks->dynticks,
+ seq_printf(m, " dt=%d/%d/%d df=%lu",
+ atomic_read(&rdp->dynticks->dynticks),
rdp->dynticks->dynticks_nesting,
- rdp->dynticks->dynticks_nmi,
+ rdp->dynticks->dynticks_nmi_nesting,
rdp->dynticks_fqs);
#endif /* #ifdef CONFIG_NO_HZ */
seq_printf(m, " of=%lu ri=%lu", rdp->offline_fqs, rdp->resched_ipi);
- seq_printf(m, " ql=%ld b=%ld", rdp->qlen, rdp->blimit);
+ seq_printf(m, " ql=%ld qs=%c%c%c%c kt=%d/%c/%d ktl=%x b=%ld",
+ rdp->qlen,
+ ".N"[rdp->nxttail[RCU_NEXT_READY_TAIL] !=
+ rdp->nxttail[RCU_NEXT_TAIL]],
+ ".R"[rdp->nxttail[RCU_WAIT_TAIL] !=
+ rdp->nxttail[RCU_NEXT_READY_TAIL]],
+ ".W"[rdp->nxttail[RCU_DONE_TAIL] !=
+ rdp->nxttail[RCU_WAIT_TAIL]],
+ ".D"[&rdp->nxtlist != rdp->nxttail[RCU_DONE_TAIL]],
+ per_cpu(rcu_cpu_has_work, rdp->cpu),
+ convert_kthread_status(per_cpu(rcu_cpu_kthread_status,
+ rdp->cpu)),
+ per_cpu(rcu_cpu_kthread_cpu, rdp->cpu),
+ per_cpu(rcu_cpu_kthread_loops, rdp->cpu) & 0xffff,
+ rdp->blimit);
seq_printf(m, " ci=%lu co=%lu ca=%lu\n",
rdp->n_cbs_invoked, rdp->n_cbs_orphaned, rdp->n_cbs_adopted);
}
@@ -115,13 +141,24 @@ static void print_one_rcu_data_csv(struct seq_file *m, struct rcu_data *rdp)
rdp->qs_pending);
#ifdef CONFIG_NO_HZ
seq_printf(m, ",%d,%d,%d,%lu",
- rdp->dynticks->dynticks,
+ atomic_read(&rdp->dynticks->dynticks),
rdp->dynticks->dynticks_nesting,
- rdp->dynticks->dynticks_nmi,
+ rdp->dynticks->dynticks_nmi_nesting,
rdp->dynticks_fqs);
#endif /* #ifdef CONFIG_NO_HZ */
seq_printf(m, ",%lu,%lu", rdp->offline_fqs, rdp->resched_ipi);
- seq_printf(m, ",%ld,%ld", rdp->qlen, rdp->blimit);
+ seq_printf(m, ",%ld,\"%c%c%c%c\",%d,\"%c\",%ld", rdp->qlen,
+ ".N"[rdp->nxttail[RCU_NEXT_READY_TAIL] !=
+ rdp->nxttail[RCU_NEXT_TAIL]],
+ ".R"[rdp->nxttail[RCU_WAIT_TAIL] !=
+ rdp->nxttail[RCU_NEXT_READY_TAIL]],
+ ".W"[rdp->nxttail[RCU_DONE_TAIL] !=
+ rdp->nxttail[RCU_WAIT_TAIL]],
+ ".D"[&rdp->nxtlist != rdp->nxttail[RCU_DONE_TAIL]],
+ per_cpu(rcu_cpu_has_work, rdp->cpu),
+ convert_kthread_status(per_cpu(rcu_cpu_kthread_status,
+ rdp->cpu)),
+ rdp->blimit);
seq_printf(m, ",%lu,%lu,%lu\n",
rdp->n_cbs_invoked, rdp->n_cbs_orphaned, rdp->n_cbs_adopted);
}
@@ -130,7 +167,7 @@ static int show_rcudata_csv(struct seq_file *m, void *unused)
{
seq_puts(m, "\"CPU\",\"Online?\",\"c\",\"g\",\"pq\",\"pqc\",\"pq\",");
#ifdef CONFIG_NO_HZ
- seq_puts(m, "\"dt\",\"dt nesting\",\"dn\",\"df\",");
+ seq_puts(m, "\"dt\",\"dt nesting\",\"dt NMI nesting\",\"df\",");
#endif /* #ifdef CONFIG_NO_HZ */
seq_puts(m, "\"of\",\"ri\",\"ql\",\"b\",\"ci\",\"co\",\"ca\"\n");
#ifdef CONFIG_TREE_PREEMPT_RCU
@@ -157,11 +194,76 @@ static const struct file_operations rcudata_csv_fops = {
.release = single_release,
};
+#ifdef CONFIG_RCU_BOOST
+
+static void print_one_rcu_node_boost(struct seq_file *m, struct rcu_node *rnp)
+{
+ seq_printf(m, "%d:%d tasks=%c%c%c%c kt=%c ntb=%lu neb=%lu nnb=%lu "
+ "j=%04x bt=%04x\n",
+ rnp->grplo, rnp->grphi,
+ "T."[list_empty(&rnp->blkd_tasks)],
+ "N."[!rnp->gp_tasks],
+ "E."[!rnp->exp_tasks],
+ "B."[!rnp->boost_tasks],
+ convert_kthread_status(rnp->boost_kthread_status),
+ rnp->n_tasks_boosted, rnp->n_exp_boosts,
+ rnp->n_normal_boosts,
+ (int)(jiffies & 0xffff),
+ (int)(rnp->boost_time & 0xffff));
+ seq_printf(m, "%s: nt=%lu egt=%lu bt=%lu nb=%lu ny=%lu nos=%lu\n",
+ " balk",
+ rnp->n_balk_blkd_tasks,
+ rnp->n_balk_exp_gp_tasks,
+ rnp->n_balk_boost_tasks,
+ rnp->n_balk_notblocked,
+ rnp->n_balk_notyet,
+ rnp->n_balk_nos);
+}
+
+static int show_rcu_node_boost(struct seq_file *m, void *unused)
+{
+ struct rcu_node *rnp;
+
+ rcu_for_each_leaf_node(&rcu_preempt_state, rnp)
+ print_one_rcu_node_boost(m, rnp);
+ return 0;
+}
+
+static int rcu_node_boost_open(struct inode *inode, struct file *file)
+{
+ return single_open(file, show_rcu_node_boost, NULL);
+}
+
+static const struct file_operations rcu_node_boost_fops = {
+ .owner = THIS_MODULE,
+ .open = rcu_node_boost_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = single_release,
+};
+
+/*
+ * Create the rcuboost debugfs entry. Standard error return.
+ */
+static int rcu_boost_trace_create_file(struct dentry *rcudir)
+{
+ return !debugfs_create_file("rcuboost", 0444, rcudir, NULL,
+ &rcu_node_boost_fops);
+}
+
+#else /* #ifdef CONFIG_RCU_BOOST */
+
+static int rcu_boost_trace_create_file(struct dentry *rcudir)
+{
+ return 0; /* There cannot be an error if we didn't create it! */
+}
+
+#endif /* #else #ifdef CONFIG_RCU_BOOST */
+
static void print_one_rcu_state(struct seq_file *m, struct rcu_state *rsp)
{
unsigned long gpnum;
int level = 0;
- int phase;
struct rcu_node *rnp;
gpnum = rsp->gpnum;
@@ -178,13 +280,11 @@ static void print_one_rcu_state(struct seq_file *m, struct rcu_state *rsp)
seq_puts(m, "\n");
level = rnp->level;
}
- phase = gpnum & 0x1;
- seq_printf(m, "%lx/%lx %c%c>%c%c %d:%d ^%d ",
+ seq_printf(m, "%lx/%lx %c%c>%c %d:%d ^%d ",
rnp->qsmask, rnp->qsmaskinit,
- "T."[list_empty(&rnp->blocked_tasks[phase])],
- "E."[list_empty(&rnp->blocked_tasks[phase + 2])],
- "T."[list_empty(&rnp->blocked_tasks[!phase])],
- "E."[list_empty(&rnp->blocked_tasks[!phase + 2])],
+ ".G"[rnp->gp_tasks != NULL],
+ ".E"[rnp->exp_tasks != NULL],
+ ".T"[!list_empty(&rnp->blkd_tasks)],
rnp->grplo, rnp->grphi, rnp->grpnum);
}
seq_puts(m, "\n");
@@ -216,16 +316,35 @@ static const struct file_operations rcuhier_fops = {
.release = single_release,
};
+static void show_one_rcugp(struct seq_file *m, struct rcu_state *rsp)
+{
+ unsigned long flags;
+ unsigned long completed;
+ unsigned long gpnum;
+ unsigned long gpage;
+ unsigned long gpmax;
+ struct rcu_node *rnp = &rsp->node[0];
+
+ raw_spin_lock_irqsave(&rnp->lock, flags);
+ completed = rsp->completed;
+ gpnum = rsp->gpnum;
+ if (rsp->completed == rsp->gpnum)
+ gpage = 0;
+ else
+ gpage = jiffies - rsp->gp_start;
+ gpmax = rsp->gp_max;
+ raw_spin_unlock_irqrestore(&rnp->lock, flags);
+ seq_printf(m, "%s: completed=%ld gpnum=%lu age=%ld max=%ld\n",
+ rsp->name, completed, gpnum, gpage, gpmax);
+}
+
static int show_rcugp(struct seq_file *m, void *unused)
{
#ifdef CONFIG_TREE_PREEMPT_RCU
- seq_printf(m, "rcu_preempt: completed=%ld gpnum=%lu\n",
- rcu_preempt_state.completed, rcu_preempt_state.gpnum);
+ show_one_rcugp(m, &rcu_preempt_state);
#endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */
- seq_printf(m, "rcu_sched: completed=%ld gpnum=%lu\n",
- rcu_sched_state.completed, rcu_sched_state.gpnum);
- seq_printf(m, "rcu_bh: completed=%ld gpnum=%lu\n",
- rcu_bh_state.completed, rcu_bh_state.gpnum);
+ show_one_rcugp(m, &rcu_sched_state);
+ show_one_rcugp(m, &rcu_bh_state);
return 0;
}
@@ -298,6 +417,29 @@ static const struct file_operations rcu_pending_fops = {
.release = single_release,
};
+static int show_rcutorture(struct seq_file *m, void *unused)
+{
+ seq_printf(m, "rcutorture test sequence: %lu %s\n",
+ rcutorture_testseq >> 1,
+ (rcutorture_testseq & 0x1) ? "(test in progress)" : "");
+ seq_printf(m, "rcutorture update version number: %lu\n",
+ rcutorture_vernum);
+ return 0;
+}
+
+static int rcutorture_open(struct inode *inode, struct file *file)
+{
+ return single_open(file, show_rcutorture, NULL);
+}
+
+static const struct file_operations rcutorture_fops = {
+ .owner = THIS_MODULE,
+ .open = rcutorture_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = single_release,
+};
+
static struct dentry *rcudir;
static int __init rcutree_trace_init(void)
@@ -318,6 +460,9 @@ static int __init rcutree_trace_init(void)
if (!retval)
goto free_out;
+ if (rcu_boost_trace_create_file(rcudir))
+ goto free_out;
+
retval = debugfs_create_file("rcugp", 0444, rcudir, NULL, &rcugp_fops);
if (!retval)
goto free_out;
@@ -331,6 +476,11 @@ static int __init rcutree_trace_init(void)
NULL, &rcu_pending_fops);
if (!retval)
goto free_out;
+
+ retval = debugfs_create_file("rcutorture", 0444, rcudir,
+ NULL, &rcutorture_fops);
+ if (!retval)
+ goto free_out;
return 0;
free_out:
debugfs_remove_recursive(rcudir);
diff --git a/kernel/res_counter.c b/kernel/res_counter.c
index c7eaa37a768b..34683efa2cce 100644
--- a/kernel/res_counter.c
+++ b/kernel/res_counter.c
@@ -126,10 +126,24 @@ ssize_t res_counter_read(struct res_counter *counter, int member,
pos, buf, s - buf);
}
+#if BITS_PER_LONG == 32
+u64 res_counter_read_u64(struct res_counter *counter, int member)
+{
+ unsigned long flags;
+ u64 ret;
+
+ spin_lock_irqsave(&counter->lock, flags);
+ ret = *res_counter_member(counter, member);
+ spin_unlock_irqrestore(&counter->lock, flags);
+
+ return ret;
+}
+#else
u64 res_counter_read_u64(struct res_counter *counter, int member)
{
return *res_counter_member(counter, member);
}
+#endif
int res_counter_memparse_write_strategy(const char *buf,
unsigned long long *res)
diff --git a/kernel/rtmutex-debug.c b/kernel/rtmutex-debug.c
index ddabb54bb5c8..3c7cbc2c33be 100644
--- a/kernel/rtmutex-debug.c
+++ b/kernel/rtmutex-debug.c
@@ -215,7 +215,6 @@ void debug_rt_mutex_free_waiter(struct rt_mutex_waiter *waiter)
put_pid(waiter->deadlock_task_pid);
TRACE_WARN_ON(!plist_node_empty(&waiter->list_entry));
TRACE_WARN_ON(!plist_node_empty(&waiter->pi_list_entry));
- TRACE_WARN_ON(waiter->task);
memset(waiter, 0x22, sizeof(*waiter));
}
diff --git a/kernel/rtmutex-tester.c b/kernel/rtmutex-tester.c
index 66cb89bc5ef1..5c9ccd380966 100644
--- a/kernel/rtmutex-tester.c
+++ b/kernel/rtmutex-tester.c
@@ -9,7 +9,6 @@
#include <linux/kthread.h>
#include <linux/module.h>
#include <linux/sched.h>
-#include <linux/smp_lock.h>
#include <linux/spinlock.h>
#include <linux/sysdev.h>
#include <linux/timer.h>
@@ -27,7 +26,6 @@ struct test_thread_data {
int opcode;
int opdata;
int mutexes[MAX_RT_TEST_MUTEXES];
- int bkl;
int event;
struct sys_device sysdev;
};
@@ -46,9 +44,8 @@ enum test_opcodes {
RTTEST_LOCKINTNOWAIT, /* 6 Lock interruptible no wait in wakeup, data = lockindex */
RTTEST_LOCKCONT, /* 7 Continue locking after the wakeup delay */
RTTEST_UNLOCK, /* 8 Unlock, data = lockindex */
- RTTEST_LOCKBKL, /* 9 Lock BKL */
- RTTEST_UNLOCKBKL, /* 10 Unlock BKL */
- RTTEST_SIGNAL, /* 11 Signal other test thread, data = thread id */
+ /* 9, 10 - reserved for BKL commemoration */
+ RTTEST_SIGNAL = 11, /* 11 Signal other test thread, data = thread id */
RTTEST_RESETEVENT = 98, /* 98 Reset event counter */
RTTEST_RESET = 99, /* 99 Reset all pending operations */
};
@@ -74,13 +71,6 @@ static int handle_op(struct test_thread_data *td, int lockwakeup)
td->mutexes[i] = 0;
}
}
-
- if (!lockwakeup && td->bkl == 4) {
-#ifdef CONFIG_LOCK_KERNEL
- unlock_kernel();
-#endif
- td->bkl = 0;
- }
return 0;
case RTTEST_RESETEVENT:
@@ -131,25 +121,6 @@ static int handle_op(struct test_thread_data *td, int lockwakeup)
td->mutexes[id] = 0;
return 0;
- case RTTEST_LOCKBKL:
- if (td->bkl)
- return 0;
- td->bkl = 1;
-#ifdef CONFIG_LOCK_KERNEL
- lock_kernel();
-#endif
- td->bkl = 4;
- return 0;
-
- case RTTEST_UNLOCKBKL:
- if (td->bkl != 4)
- break;
-#ifdef CONFIG_LOCK_KERNEL
- unlock_kernel();
-#endif
- td->bkl = 0;
- return 0;
-
default:
break;
}
@@ -196,7 +167,6 @@ void schedule_rt_mutex_test(struct rt_mutex *mutex)
td->event = atomic_add_return(1, &rttest_event);
break;
- case RTTEST_LOCKBKL:
default:
break;
}
@@ -229,8 +199,6 @@ void schedule_rt_mutex_test(struct rt_mutex *mutex)
td->event = atomic_add_return(1, &rttest_event);
return;
- case RTTEST_LOCKBKL:
- return;
default:
return;
}
@@ -380,11 +348,11 @@ static ssize_t sysfs_test_status(struct sys_device *dev, struct sysdev_attribute
spin_lock(&rttest_lock);
curr += sprintf(curr,
- "O: %4d, E:%8d, S: 0x%08lx, P: %4d, N: %4d, B: %p, K: %d, M:",
+ "O: %4d, E:%8d, S: 0x%08lx, P: %4d, N: %4d, B: %p, M:",
td->opcode, td->event, tsk->state,
(MAX_RT_PRIO - 1) - tsk->prio,
(MAX_RT_PRIO - 1) - tsk->normal_prio,
- tsk->pi_blocked_on, td->bkl);
+ tsk->pi_blocked_on);
for (i = MAX_RT_TEST_MUTEXES - 1; i >=0 ; i--)
curr += sprintf(curr, "%d", td->mutexes[i]);
diff --git a/kernel/rtmutex.c b/kernel/rtmutex.c
index a9604815786a..ab449117aaf2 100644
--- a/kernel/rtmutex.c
+++ b/kernel/rtmutex.c
@@ -20,41 +20,34 @@
/*
* lock->owner state tracking:
*
- * lock->owner holds the task_struct pointer of the owner. Bit 0 and 1
- * are used to keep track of the "owner is pending" and "lock has
- * waiters" state.
+ * lock->owner holds the task_struct pointer of the owner. Bit 0
+ * is used to keep track of the "lock has waiters" state.
*
- * owner bit1 bit0
- * NULL 0 0 lock is free (fast acquire possible)
- * NULL 0 1 invalid state
- * NULL 1 0 Transitional State*
- * NULL 1 1 invalid state
- * taskpointer 0 0 lock is held (fast release possible)
- * taskpointer 0 1 task is pending owner
- * taskpointer 1 0 lock is held and has waiters
- * taskpointer 1 1 task is pending owner and lock has more waiters
- *
- * Pending ownership is assigned to the top (highest priority)
- * waiter of the lock, when the lock is released. The thread is woken
- * up and can now take the lock. Until the lock is taken (bit 0
- * cleared) a competing higher priority thread can steal the lock
- * which puts the woken up thread back on the waiters list.
+ * owner bit0
+ * NULL 0 lock is free (fast acquire possible)
+ * NULL 1 lock is free and has waiters and the top waiter
+ * is going to take the lock*
+ * taskpointer 0 lock is held (fast release possible)
+ * taskpointer 1 lock is held and has waiters**
*
* The fast atomic compare exchange based acquire and release is only
- * possible when bit 0 and 1 of lock->owner are 0.
+ * possible when bit 0 of lock->owner is 0.
+ *
+ * (*) It also can be a transitional state when grabbing the lock
+ * with ->wait_lock is held. To prevent any fast path cmpxchg to the lock,
+ * we need to set the bit0 before looking at the lock, and the owner may be
+ * NULL in this small time, hence this can be a transitional state.
*
- * (*) There's a small time where the owner can be NULL and the
- * "lock has waiters" bit is set. This can happen when grabbing the lock.
- * To prevent a cmpxchg of the owner releasing the lock, we need to set this
- * bit before looking at the lock, hence the reason this is a transitional
- * state.
+ * (**) There is a small time when bit 0 is set but there are no
+ * waiters. This can happen when grabbing the lock in the slow path.
+ * To prevent a cmpxchg of the owner releasing the lock, we need to
+ * set this bit before looking at the lock.
*/
static void
-rt_mutex_set_owner(struct rt_mutex *lock, struct task_struct *owner,
- unsigned long mask)
+rt_mutex_set_owner(struct rt_mutex *lock, struct task_struct *owner)
{
- unsigned long val = (unsigned long)owner | mask;
+ unsigned long val = (unsigned long)owner;
if (rt_mutex_has_waiters(lock))
val |= RT_MUTEX_HAS_WAITERS;
@@ -203,15 +196,14 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task,
* reached or the state of the chain has changed while we
* dropped the locks.
*/
- if (!waiter || !waiter->task)
+ if (!waiter)
goto out_unlock_pi;
/*
* Check the orig_waiter state. After we dropped the locks,
- * the previous owner of the lock might have released the lock
- * and made us the pending owner:
+ * the previous owner of the lock might have released the lock.
*/
- if (orig_waiter && !orig_waiter->task)
+ if (orig_waiter && !rt_mutex_owner(orig_lock))
goto out_unlock_pi;
/*
@@ -254,6 +246,17 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task,
/* Release the task */
raw_spin_unlock_irqrestore(&task->pi_lock, flags);
+ if (!rt_mutex_owner(lock)) {
+ /*
+ * If the requeue above changed the top waiter, then we need
+ * to wake the new top waiter up to try to get the lock.
+ */
+
+ if (top_waiter != rt_mutex_top_waiter(lock))
+ wake_up_process(rt_mutex_top_waiter(lock)->task);
+ raw_spin_unlock(&lock->wait_lock);
+ goto out_put_task;
+ }
put_task_struct(task);
/* Grab the next task */
@@ -296,78 +299,16 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task,
}
/*
- * Optimization: check if we can steal the lock from the
- * assigned pending owner [which might not have taken the
- * lock yet]:
- */
-static inline int try_to_steal_lock(struct rt_mutex *lock,
- struct task_struct *task)
-{
- struct task_struct *pendowner = rt_mutex_owner(lock);
- struct rt_mutex_waiter *next;
- unsigned long flags;
-
- if (!rt_mutex_owner_pending(lock))
- return 0;
-
- if (pendowner == task)
- return 1;
-
- raw_spin_lock_irqsave(&pendowner->pi_lock, flags);
- if (task->prio >= pendowner->prio) {
- raw_spin_unlock_irqrestore(&pendowner->pi_lock, flags);
- return 0;
- }
-
- /*
- * Check if a waiter is enqueued on the pending owners
- * pi_waiters list. Remove it and readjust pending owners
- * priority.
- */
- if (likely(!rt_mutex_has_waiters(lock))) {
- raw_spin_unlock_irqrestore(&pendowner->pi_lock, flags);
- return 1;
- }
-
- /* No chain handling, pending owner is not blocked on anything: */
- next = rt_mutex_top_waiter(lock);
- plist_del(&next->pi_list_entry, &pendowner->pi_waiters);
- __rt_mutex_adjust_prio(pendowner);
- raw_spin_unlock_irqrestore(&pendowner->pi_lock, flags);
-
- /*
- * We are going to steal the lock and a waiter was
- * enqueued on the pending owners pi_waiters queue. So
- * we have to enqueue this waiter into
- * task->pi_waiters list. This covers the case,
- * where task is boosted because it holds another
- * lock and gets unboosted because the booster is
- * interrupted, so we would delay a waiter with higher
- * priority as task->normal_prio.
- *
- * Note: in the rare case of a SCHED_OTHER task changing
- * its priority and thus stealing the lock, next->task
- * might be task:
- */
- if (likely(next->task != task)) {
- raw_spin_lock_irqsave(&task->pi_lock, flags);
- plist_add(&next->pi_list_entry, &task->pi_waiters);
- __rt_mutex_adjust_prio(task);
- raw_spin_unlock_irqrestore(&task->pi_lock, flags);
- }
- return 1;
-}
-
-/*
* Try to take an rt-mutex
*
- * This fails
- * - when the lock has a real owner
- * - when a different pending owner exists and has higher priority than current
- *
* Must be called with lock->wait_lock held.
+ *
+ * @lock: the lock to be acquired.
+ * @task: the task which wants to acquire the lock
+ * @waiter: the waiter that is queued to the lock's wait list. (could be NULL)
*/
-static int try_to_take_rt_mutex(struct rt_mutex *lock)
+static int try_to_take_rt_mutex(struct rt_mutex *lock, struct task_struct *task,
+ struct rt_mutex_waiter *waiter)
{
/*
* We have to be careful here if the atomic speedups are
@@ -390,15 +331,52 @@ static int try_to_take_rt_mutex(struct rt_mutex *lock)
*/
mark_rt_mutex_waiters(lock);
- if (rt_mutex_owner(lock) && !try_to_steal_lock(lock, current))
+ if (rt_mutex_owner(lock))
return 0;
+ /*
+ * It will get the lock because of one of these conditions:
+ * 1) there is no waiter
+ * 2) higher priority than waiters
+ * 3) it is top waiter
+ */
+ if (rt_mutex_has_waiters(lock)) {
+ if (task->prio >= rt_mutex_top_waiter(lock)->list_entry.prio) {
+ if (!waiter || waiter != rt_mutex_top_waiter(lock))
+ return 0;
+ }
+ }
+
+ if (waiter || rt_mutex_has_waiters(lock)) {
+ unsigned long flags;
+ struct rt_mutex_waiter *top;
+
+ raw_spin_lock_irqsave(&task->pi_lock, flags);
+
+ /* remove the queued waiter. */
+ if (waiter) {
+ plist_del(&waiter->list_entry, &lock->wait_list);
+ task->pi_blocked_on = NULL;
+ }
+
+ /*
+ * We have to enqueue the top waiter(if it exists) into
+ * task->pi_waiters list.
+ */
+ if (rt_mutex_has_waiters(lock)) {
+ top = rt_mutex_top_waiter(lock);
+ top->pi_list_entry.prio = top->list_entry.prio;
+ plist_add(&top->pi_list_entry, &task->pi_waiters);
+ }
+ raw_spin_unlock_irqrestore(&task->pi_lock, flags);
+ }
+
/* We got the lock. */
debug_rt_mutex_lock(lock);
- rt_mutex_set_owner(lock, current, 0);
+ rt_mutex_set_owner(lock, task);
- rt_mutex_deadlock_account_lock(lock, current);
+ rt_mutex_deadlock_account_lock(lock, task);
return 1;
}
@@ -436,6 +414,9 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock,
raw_spin_unlock_irqrestore(&task->pi_lock, flags);
+ if (!owner)
+ return 0;
+
if (waiter == rt_mutex_top_waiter(lock)) {
raw_spin_lock_irqsave(&owner->pi_lock, flags);
plist_del(&top_waiter->pi_list_entry, &owner->pi_waiters);
@@ -472,21 +453,18 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock,
/*
* Wake up the next waiter on the lock.
*
- * Remove the top waiter from the current tasks waiter list and from
- * the lock waiter list. Set it as pending owner. Then wake it up.
+ * Remove the top waiter from the current tasks waiter list and wake it up.
*
* Called with lock->wait_lock held.
*/
static void wakeup_next_waiter(struct rt_mutex *lock)
{
struct rt_mutex_waiter *waiter;
- struct task_struct *pendowner;
unsigned long flags;
raw_spin_lock_irqsave(&current->pi_lock, flags);
waiter = rt_mutex_top_waiter(lock);
- plist_del(&waiter->list_entry, &lock->wait_list);
/*
* Remove it from current->pi_waiters. We do not adjust a
@@ -495,43 +473,19 @@ static void wakeup_next_waiter(struct rt_mutex *lock)
* lock->wait_lock.
*/
plist_del(&waiter->pi_list_entry, &current->pi_waiters);
- pendowner = waiter->task;
- waiter->task = NULL;
- rt_mutex_set_owner(lock, pendowner, RT_MUTEX_OWNER_PENDING);
+ rt_mutex_set_owner(lock, NULL);
raw_spin_unlock_irqrestore(&current->pi_lock, flags);
- /*
- * Clear the pi_blocked_on variable and enqueue a possible
- * waiter into the pi_waiters list of the pending owner. This
- * prevents that in case the pending owner gets unboosted a
- * waiter with higher priority than pending-owner->normal_prio
- * is blocked on the unboosted (pending) owner.
- */
- raw_spin_lock_irqsave(&pendowner->pi_lock, flags);
-
- WARN_ON(!pendowner->pi_blocked_on);
- WARN_ON(pendowner->pi_blocked_on != waiter);
- WARN_ON(pendowner->pi_blocked_on->lock != lock);
-
- pendowner->pi_blocked_on = NULL;
-
- if (rt_mutex_has_waiters(lock)) {
- struct rt_mutex_waiter *next;
-
- next = rt_mutex_top_waiter(lock);
- plist_add(&next->pi_list_entry, &pendowner->pi_waiters);
- }
- raw_spin_unlock_irqrestore(&pendowner->pi_lock, flags);
-
- wake_up_process(pendowner);
+ wake_up_process(waiter->task);
}
/*
- * Remove a waiter from a lock
+ * Remove a waiter from a lock and give up
*
- * Must be called with lock->wait_lock held
+ * Must be called with lock->wait_lock held and
+ * have just failed to try_to_take_rt_mutex().
*/
static void remove_waiter(struct rt_mutex *lock,
struct rt_mutex_waiter *waiter)
@@ -543,11 +497,13 @@ static void remove_waiter(struct rt_mutex *lock,
raw_spin_lock_irqsave(&current->pi_lock, flags);
plist_del(&waiter->list_entry, &lock->wait_list);
- waiter->task = NULL;
current->pi_blocked_on = NULL;
raw_spin_unlock_irqrestore(&current->pi_lock, flags);
- if (first && owner != current) {
+ if (!owner)
+ return;
+
+ if (first) {
raw_spin_lock_irqsave(&owner->pi_lock, flags);
@@ -614,21 +570,19 @@ void rt_mutex_adjust_pi(struct task_struct *task)
* or TASK_UNINTERRUPTIBLE)
* @timeout: the pre-initialized and started timer, or NULL for none
* @waiter: the pre-initialized rt_mutex_waiter
- * @detect_deadlock: passed to task_blocks_on_rt_mutex
*
* lock->wait_lock must be held by the caller.
*/
static int __sched
__rt_mutex_slowlock(struct rt_mutex *lock, int state,
struct hrtimer_sleeper *timeout,
- struct rt_mutex_waiter *waiter,
- int detect_deadlock)
+ struct rt_mutex_waiter *waiter)
{
int ret = 0;
for (;;) {
/* Try to acquire the lock: */
- if (try_to_take_rt_mutex(lock))
+ if (try_to_take_rt_mutex(lock, current, waiter))
break;
/*
@@ -645,39 +599,11 @@ __rt_mutex_slowlock(struct rt_mutex *lock, int state,
break;
}
- /*
- * waiter->task is NULL the first time we come here and
- * when we have been woken up by the previous owner
- * but the lock got stolen by a higher prio task.
- */
- if (!waiter->task) {
- ret = task_blocks_on_rt_mutex(lock, waiter, current,
- detect_deadlock);
- /*
- * If we got woken up by the owner then start loop
- * all over without going into schedule to try
- * to get the lock now:
- */
- if (unlikely(!waiter->task)) {
- /*
- * Reset the return value. We might
- * have returned with -EDEADLK and the
- * owner released the lock while we
- * were walking the pi chain.
- */
- ret = 0;
- continue;
- }
- if (unlikely(ret))
- break;
- }
-
raw_spin_unlock(&lock->wait_lock);
debug_rt_mutex_print_deadlock(waiter);
- if (waiter->task)
- schedule_rt_mutex(lock);
+ schedule_rt_mutex(lock);
raw_spin_lock(&lock->wait_lock);
set_current_state(state);
@@ -698,12 +624,11 @@ rt_mutex_slowlock(struct rt_mutex *lock, int state,
int ret = 0;
debug_rt_mutex_init_waiter(&waiter);
- waiter.task = NULL;
raw_spin_lock(&lock->wait_lock);
/* Try to acquire the lock again: */
- if (try_to_take_rt_mutex(lock)) {
+ if (try_to_take_rt_mutex(lock, current, NULL)) {
raw_spin_unlock(&lock->wait_lock);
return 0;
}
@@ -717,12 +642,14 @@ rt_mutex_slowlock(struct rt_mutex *lock, int state,
timeout->task = NULL;
}
- ret = __rt_mutex_slowlock(lock, state, timeout, &waiter,
- detect_deadlock);
+ ret = task_blocks_on_rt_mutex(lock, &waiter, current, detect_deadlock);
+
+ if (likely(!ret))
+ ret = __rt_mutex_slowlock(lock, state, timeout, &waiter);
set_current_state(TASK_RUNNING);
- if (unlikely(waiter.task))
+ if (unlikely(ret))
remove_waiter(lock, &waiter);
/*
@@ -737,14 +664,6 @@ rt_mutex_slowlock(struct rt_mutex *lock, int state,
if (unlikely(timeout))
hrtimer_cancel(&timeout->timer);
- /*
- * Readjust priority, when we did not get the lock. We might
- * have been the pending owner and boosted. Since we did not
- * take the lock, the PI boost has to go.
- */
- if (unlikely(ret))
- rt_mutex_adjust_prio(current);
-
debug_rt_mutex_free_waiter(&waiter);
return ret;
@@ -762,7 +681,7 @@ rt_mutex_slowtrylock(struct rt_mutex *lock)
if (likely(rt_mutex_owner(lock) != current)) {
- ret = try_to_take_rt_mutex(lock);
+ ret = try_to_take_rt_mutex(lock, current, NULL);
/*
* try_to_take_rt_mutex() sets the lock waiters
* bit unconditionally. Clean this up.
@@ -992,7 +911,7 @@ void rt_mutex_init_proxy_locked(struct rt_mutex *lock,
{
__rt_mutex_init(lock, NULL);
debug_rt_mutex_proxy_lock(lock, proxy_owner);
- rt_mutex_set_owner(lock, proxy_owner, 0);
+ rt_mutex_set_owner(lock, proxy_owner);
rt_mutex_deadlock_account_lock(lock, proxy_owner);
}
@@ -1008,7 +927,7 @@ void rt_mutex_proxy_unlock(struct rt_mutex *lock,
struct task_struct *proxy_owner)
{
debug_rt_mutex_proxy_unlock(lock);
- rt_mutex_set_owner(lock, NULL, 0);
+ rt_mutex_set_owner(lock, NULL);
rt_mutex_deadlock_account_unlock(proxy_owner);
}
@@ -1034,20 +953,14 @@ int rt_mutex_start_proxy_lock(struct rt_mutex *lock,
raw_spin_lock(&lock->wait_lock);
- mark_rt_mutex_waiters(lock);
-
- if (!rt_mutex_owner(lock) || try_to_steal_lock(lock, task)) {
- /* We got the lock for task. */
- debug_rt_mutex_lock(lock);
- rt_mutex_set_owner(lock, task, 0);
+ if (try_to_take_rt_mutex(lock, task, NULL)) {
raw_spin_unlock(&lock->wait_lock);
- rt_mutex_deadlock_account_lock(lock, task);
return 1;
}
ret = task_blocks_on_rt_mutex(lock, waiter, task, detect_deadlock);
- if (ret && !waiter->task) {
+ if (ret && !rt_mutex_owner(lock)) {
/*
* Reset the return value. We might have
* returned with -EDEADLK and the owner
@@ -1056,6 +969,10 @@ int rt_mutex_start_proxy_lock(struct rt_mutex *lock,
*/
ret = 0;
}
+
+ if (unlikely(ret))
+ remove_waiter(lock, waiter);
+
raw_spin_unlock(&lock->wait_lock);
debug_rt_mutex_print_deadlock(waiter);
@@ -1110,12 +1027,11 @@ int rt_mutex_finish_proxy_lock(struct rt_mutex *lock,
set_current_state(TASK_INTERRUPTIBLE);
- ret = __rt_mutex_slowlock(lock, TASK_INTERRUPTIBLE, to, waiter,
- detect_deadlock);
+ ret = __rt_mutex_slowlock(lock, TASK_INTERRUPTIBLE, to, waiter);
set_current_state(TASK_RUNNING);
- if (unlikely(waiter->task))
+ if (unlikely(ret))
remove_waiter(lock, waiter);
/*
@@ -1126,13 +1042,5 @@ int rt_mutex_finish_proxy_lock(struct rt_mutex *lock,
raw_spin_unlock(&lock->wait_lock);
- /*
- * Readjust priority, when we did not get the lock. We might have been
- * the pending owner and boosted. Since we did not take the lock, the
- * PI boost has to go.
- */
- if (unlikely(ret))
- rt_mutex_adjust_prio(current);
-
return ret;
}
diff --git a/kernel/rtmutex_common.h b/kernel/rtmutex_common.h
index 97a2f81866af..53a66c85261b 100644
--- a/kernel/rtmutex_common.h
+++ b/kernel/rtmutex_common.h
@@ -91,9 +91,8 @@ task_top_pi_waiter(struct task_struct *p)
/*
* lock->owner state tracking:
*/
-#define RT_MUTEX_OWNER_PENDING 1UL
-#define RT_MUTEX_HAS_WAITERS 2UL
-#define RT_MUTEX_OWNER_MASKALL 3UL
+#define RT_MUTEX_HAS_WAITERS 1UL
+#define RT_MUTEX_OWNER_MASKALL 1UL
static inline struct task_struct *rt_mutex_owner(struct rt_mutex *lock)
{
@@ -101,17 +100,6 @@ static inline struct task_struct *rt_mutex_owner(struct rt_mutex *lock)
((unsigned long)lock->owner & ~RT_MUTEX_OWNER_MASKALL);
}
-static inline struct task_struct *rt_mutex_real_owner(struct rt_mutex *lock)
-{
- return (struct task_struct *)
- ((unsigned long)lock->owner & ~RT_MUTEX_HAS_WAITERS);
-}
-
-static inline unsigned long rt_mutex_owner_pending(struct rt_mutex *lock)
-{
- return (unsigned long)lock->owner & RT_MUTEX_OWNER_PENDING;
-}
-
/*
* PI-futex support (proxy locking functions, etc.):
*/
diff --git a/kernel/sched.c b/kernel/sched.c
index 42eab5a8437d..cbb3a0eee58e 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -32,7 +32,6 @@
#include <linux/init.h>
#include <linux/uaccess.h>
#include <linux/highmem.h>
-#include <linux/smp_lock.h>
#include <asm/mmu_context.h>
#include <linux/interrupt.h>
#include <linux/capability.h>
@@ -232,7 +231,7 @@ static void destroy_rt_bandwidth(struct rt_bandwidth *rt_b)
#endif
/*
- * sched_domains_mutex serializes calls to arch_init_sched_domains,
+ * sched_domains_mutex serializes calls to init_sched_domains,
* detach_destroy_domains and partition_sched_domains.
*/
static DEFINE_MUTEX(sched_domains_mutex);
@@ -294,7 +293,7 @@ static DEFINE_SPINLOCK(task_group_lock);
* limitation from this.)
*/
#define MIN_SHARES 2
-#define MAX_SHARES (1UL << 18)
+#define MAX_SHARES (1UL << (18 + SCHED_LOAD_RESOLUTION))
static int root_task_group_load = ROOT_TASK_GROUP_LOAD;
#endif
@@ -313,6 +312,9 @@ struct cfs_rq {
u64 exec_clock;
u64 min_vruntime;
+#ifndef CONFIG_64BIT
+ u64 min_vruntime_copy;
+#endif
struct rb_root tasks_timeline;
struct rb_node *rb_leftmost;
@@ -324,9 +326,11 @@ struct cfs_rq {
* 'curr' points to currently running entity on this cfs_rq.
* It is set to NULL otherwise (i.e when none are currently running).
*/
- struct sched_entity *curr, *next, *last;
+ struct sched_entity *curr, *next, *last, *skip;
+#ifdef CONFIG_SCHED_DEBUG
unsigned int nr_spread_over;
+#endif
#ifdef CONFIG_FAIR_GROUP_SCHED
struct rq *rq; /* cpu runqueue to which this cfs_rq is attached */
@@ -418,6 +422,7 @@ struct rt_rq {
*/
struct root_domain {
atomic_t refcount;
+ struct rcu_head rcu;
cpumask_var_t span;
cpumask_var_t online;
@@ -461,7 +466,7 @@ struct rq {
u64 nohz_stamp;
unsigned char nohz_balance_kick;
#endif
- unsigned int skip_clock_update;
+ int skip_clock_update;
/* capture load from *all* tasks on this cpu: */
struct load_weight load;
@@ -554,6 +559,10 @@ struct rq {
unsigned int ttwu_count;
unsigned int ttwu_local;
#endif
+
+#ifdef CONFIG_SMP
+ struct task_struct *wake_list;
+#endif
};
static DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);
@@ -572,7 +581,7 @@ static inline int cpu_of(struct rq *rq)
#define rcu_dereference_check_sched_domain(p) \
rcu_dereference_check((p), \
- rcu_read_lock_sched_held() || \
+ rcu_read_lock_held() || \
lockdep_is_held(&sched_domains_mutex))
/*
@@ -597,7 +606,7 @@ static inline int cpu_of(struct rq *rq)
* Return the group to which this tasks belongs.
*
* We use task_subsys_state_check() and extend the RCU verification
- * with lockdep_is_held(&task_rq(p)->lock) because cpu_cgroup_attach()
+ * with lockdep_is_held(&p->pi_lock) because cpu_cgroup_attach()
* holds that lock for each task it moves into the cgroup. Therefore
* by holding that lock, we pin the task to the current cgroup.
*/
@@ -606,11 +615,8 @@ static inline struct task_group *task_group(struct task_struct *p)
struct task_group *tg;
struct cgroup_subsys_state *css;
- if (p->flags & PF_EXITING)
- return &root_task_group;
-
css = task_subsys_state_check(p, cpu_cgroup_subsys_id,
- lockdep_is_held(&task_rq(p)->lock));
+ lockdep_is_held(&p->pi_lock));
tg = container_of(css, struct task_group, css);
return autogroup_task_group(p, tg);
@@ -646,7 +652,7 @@ static void update_rq_clock(struct rq *rq)
{
s64 delta;
- if (rq->skip_clock_update)
+ if (rq->skip_clock_update > 0)
return;
delta = sched_clock_cpu(cpu_of(rq)) - rq->clock;
@@ -664,10 +670,9 @@ static void update_rq_clock(struct rq *rq)
#endif
/**
- * runqueue_is_locked
+ * runqueue_is_locked - Returns true if the current cpu runqueue is locked
* @cpu: the processor in question.
*
- * Returns true if the current cpu runqueue is locked.
* This interface allows printk to be called with the runqueue lock
* held and know whether or not it is OK to wake up the klogd.
*/
@@ -843,18 +848,39 @@ static inline int task_current(struct rq *rq, struct task_struct *p)
return rq->curr == p;
}
-#ifndef __ARCH_WANT_UNLOCKED_CTXSW
static inline int task_running(struct rq *rq, struct task_struct *p)
{
+#ifdef CONFIG_SMP
+ return p->on_cpu;
+#else
return task_current(rq, p);
+#endif
}
+#ifndef __ARCH_WANT_UNLOCKED_CTXSW
static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next)
{
+#ifdef CONFIG_SMP
+ /*
+ * We can optimise this out completely for !SMP, because the
+ * SMP rebalancing from interrupt is the only thing that cares
+ * here.
+ */
+ next->on_cpu = 1;
+#endif
}
static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev)
{
+#ifdef CONFIG_SMP
+ /*
+ * After ->on_cpu is cleared, the task can be moved to a different CPU.
+ * We must ensure this doesn't happen until the switch is completely
+ * finished.
+ */
+ smp_wmb();
+ prev->on_cpu = 0;
+#endif
#ifdef CONFIG_DEBUG_SPINLOCK
/* this is a valid case when another task releases the spinlock */
rq->lock.owner = current;
@@ -870,15 +896,6 @@ static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev)
}
#else /* __ARCH_WANT_UNLOCKED_CTXSW */
-static inline int task_running(struct rq *rq, struct task_struct *p)
-{
-#ifdef CONFIG_SMP
- return p->oncpu;
-#else
- return task_current(rq, p);
-#endif
-}
-
static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next)
{
#ifdef CONFIG_SMP
@@ -887,7 +904,7 @@ static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next)
* SMP rebalancing from interrupt is the only thing that cares
* here.
*/
- next->oncpu = 1;
+ next->on_cpu = 1;
#endif
#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW
raw_spin_unlock_irq(&rq->lock);
@@ -900,12 +917,12 @@ static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev)
{
#ifdef CONFIG_SMP
/*
- * After ->oncpu is cleared, the task can be moved to a different CPU.
+ * After ->on_cpu is cleared, the task can be moved to a different CPU.
* We must ensure this doesn't happen until the switch is completely
* finished.
*/
smp_wmb();
- prev->oncpu = 0;
+ prev->on_cpu = 0;
#endif
#ifndef __ARCH_WANT_INTERRUPTS_ON_CTXSW
local_irq_enable();
@@ -914,23 +931,15 @@ static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev)
#endif /* __ARCH_WANT_UNLOCKED_CTXSW */
/*
- * Check whether the task is waking, we use this to synchronize ->cpus_allowed
- * against ttwu().
- */
-static inline int task_is_waking(struct task_struct *p)
-{
- return unlikely(p->state == TASK_WAKING);
-}
-
-/*
- * __task_rq_lock - lock the runqueue a given task resides on.
- * Must be called interrupts disabled.
+ * __task_rq_lock - lock the rq @p resides on.
*/
static inline struct rq *__task_rq_lock(struct task_struct *p)
__acquires(rq->lock)
{
struct rq *rq;
+ lockdep_assert_held(&p->pi_lock);
+
for (;;) {
rq = task_rq(p);
raw_spin_lock(&rq->lock);
@@ -941,22 +950,22 @@ static inline struct rq *__task_rq_lock(struct task_struct *p)
}
/*
- * task_rq_lock - lock the runqueue a given task resides on and disable
- * interrupts. Note the ordering: we can safely lookup the task_rq without
- * explicitly disabling preemption.
+ * task_rq_lock - lock p->pi_lock and lock the rq @p resides on.
*/
static struct rq *task_rq_lock(struct task_struct *p, unsigned long *flags)
+ __acquires(p->pi_lock)
__acquires(rq->lock)
{
struct rq *rq;
for (;;) {
- local_irq_save(*flags);
+ raw_spin_lock_irqsave(&p->pi_lock, *flags);
rq = task_rq(p);
raw_spin_lock(&rq->lock);
if (likely(rq == task_rq(p)))
return rq;
- raw_spin_unlock_irqrestore(&rq->lock, *flags);
+ raw_spin_unlock(&rq->lock);
+ raw_spin_unlock_irqrestore(&p->pi_lock, *flags);
}
}
@@ -966,10 +975,13 @@ static void __task_rq_unlock(struct rq *rq)
raw_spin_unlock(&rq->lock);
}
-static inline void task_rq_unlock(struct rq *rq, unsigned long *flags)
+static inline void
+task_rq_unlock(struct rq *rq, struct task_struct *p, unsigned long *flags)
__releases(rq->lock)
+ __releases(p->pi_lock)
{
- raw_spin_unlock_irqrestore(&rq->lock, *flags);
+ raw_spin_unlock(&rq->lock);
+ raw_spin_unlock_irqrestore(&p->pi_lock, *flags);
}
/*
@@ -1198,11 +1210,17 @@ int get_nohz_timer_target(void)
int i;
struct sched_domain *sd;
+ rcu_read_lock();
for_each_domain(cpu, sd) {
- for_each_cpu(i, sched_domain_span(sd))
- if (!idle_cpu(i))
- return i;
+ for_each_cpu(i, sched_domain_span(sd)) {
+ if (!idle_cpu(i)) {
+ cpu = i;
+ goto unlock;
+ }
+ }
}
+unlock:
+ rcu_read_unlock();
return cpu;
}
/*
@@ -1312,15 +1330,27 @@ calc_delta_mine(unsigned long delta_exec, unsigned long weight,
{
u64 tmp;
+ /*
+ * weight can be less than 2^SCHED_LOAD_RESOLUTION for task group sched
+ * entities since MIN_SHARES = 2. Treat weight as 1 if less than
+ * 2^SCHED_LOAD_RESOLUTION.
+ */
+ if (likely(weight > (1UL << SCHED_LOAD_RESOLUTION)))
+ tmp = (u64)delta_exec * scale_load_down(weight);
+ else
+ tmp = (u64)delta_exec;
+
if (!lw->inv_weight) {
- if (BITS_PER_LONG > 32 && unlikely(lw->weight >= WMULT_CONST))
+ unsigned long w = scale_load_down(lw->weight);
+
+ if (BITS_PER_LONG > 32 && unlikely(w >= WMULT_CONST))
lw->inv_weight = 1;
+ else if (unlikely(!w))
+ lw->inv_weight = WMULT_CONST;
else
- lw->inv_weight = 1 + (WMULT_CONST-lw->weight/2)
- / (lw->weight+1);
+ lw->inv_weight = WMULT_CONST / w;
}
- tmp = (u64)delta_exec * weight;
/*
* Check whether we'd overflow the 64-bit multiplication:
*/
@@ -1686,6 +1716,39 @@ static void double_rq_unlock(struct rq *rq1, struct rq *rq2)
__release(rq2->lock);
}
+#else /* CONFIG_SMP */
+
+/*
+ * double_rq_lock - safely lock two runqueues
+ *
+ * Note this does not disable interrupts like task_rq_lock,
+ * you need to do so manually before calling.
+ */
+static void double_rq_lock(struct rq *rq1, struct rq *rq2)
+ __acquires(rq1->lock)
+ __acquires(rq2->lock)
+{
+ BUG_ON(!irqs_disabled());
+ BUG_ON(rq1 != rq2);
+ raw_spin_lock(&rq1->lock);
+ __acquire(rq2->lock); /* Fake it out ;) */
+}
+
+/*
+ * double_rq_unlock - safely unlock two runqueues
+ *
+ * Note this does not restore interrupts like task_rq_unlock,
+ * you need to do so manually after calling.
+ */
+static void double_rq_unlock(struct rq *rq1, struct rq *rq2)
+ __releases(rq1->lock)
+ __releases(rq2->lock)
+{
+ BUG_ON(rq1 != rq2);
+ raw_spin_unlock(&rq1->lock);
+ __release(rq2->lock);
+}
+
#endif
static void calc_load_account_idle(struct rq *this_rq);
@@ -1727,17 +1790,20 @@ static void dec_nr_running(struct rq *rq)
static void set_load_weight(struct task_struct *p)
{
+ int prio = p->static_prio - MAX_RT_PRIO;
+ struct load_weight *load = &p->se.load;
+
/*
* SCHED_IDLE tasks get minimal weight:
*/
if (p->policy == SCHED_IDLE) {
- p->se.load.weight = WEIGHT_IDLEPRIO;
- p->se.load.inv_weight = WMULT_IDLEPRIO;
+ load->weight = scale_load(WEIGHT_IDLEPRIO);
+ load->inv_weight = WMULT_IDLEPRIO;
return;
}
- p->se.load.weight = prio_to_weight[p->static_prio - MAX_RT_PRIO];
- p->se.load.inv_weight = prio_to_wmult[p->static_prio - MAX_RT_PRIO];
+ load->weight = scale_load(prio_to_weight[prio]);
+ load->inv_weight = prio_to_wmult[prio];
}
static void enqueue_task(struct rq *rq, struct task_struct *p, int flags)
@@ -1745,7 +1811,6 @@ static void enqueue_task(struct rq *rq, struct task_struct *p, int flags)
update_rq_clock(rq);
sched_info_queued(p);
p->sched_class->enqueue_task(rq, p, flags);
- p->se.on_rq = 1;
}
static void dequeue_task(struct rq *rq, struct task_struct *p, int flags)
@@ -1753,7 +1818,6 @@ static void dequeue_task(struct rq *rq, struct task_struct *p, int flags)
update_rq_clock(rq);
sched_info_dequeued(p);
p->sched_class->dequeue_task(rq, p, flags);
- p->se.on_rq = 0;
}
/*
@@ -1880,7 +1944,7 @@ void account_system_vtime(struct task_struct *curr)
*/
if (hardirq_count())
__this_cpu_add(cpu_hardirq_time, delta);
- else if (in_serving_softirq() && !(curr->flags & PF_KSOFTIRQD))
+ else if (in_serving_softirq() && curr != this_cpu_ksoftirqd())
__this_cpu_add(cpu_softirq_time, delta);
irq_time_write_end();
@@ -1920,8 +1984,40 @@ static void update_rq_clock_task(struct rq *rq, s64 delta)
sched_rt_avg_update(rq, irq_delta);
}
+static int irqtime_account_hi_update(void)
+{
+ struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
+ unsigned long flags;
+ u64 latest_ns;
+ int ret = 0;
+
+ local_irq_save(flags);
+ latest_ns = this_cpu_read(cpu_hardirq_time);
+ if (cputime64_gt(nsecs_to_cputime64(latest_ns), cpustat->irq))
+ ret = 1;
+ local_irq_restore(flags);
+ return ret;
+}
+
+static int irqtime_account_si_update(void)
+{
+ struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
+ unsigned long flags;
+ u64 latest_ns;
+ int ret = 0;
+
+ local_irq_save(flags);
+ latest_ns = this_cpu_read(cpu_softirq_time);
+ if (cputime64_gt(nsecs_to_cputime64(latest_ns), cpustat->softirq))
+ ret = 1;
+ local_irq_restore(flags);
+ return ret;
+}
+
#else /* CONFIG_IRQ_TIME_ACCOUNTING */
+#define sched_clock_irqtime (0)
+
static void update_rq_clock_task(struct rq *rq, s64 delta)
{
rq->clock_task += delta;
@@ -2025,14 +2121,14 @@ inline int task_curr(const struct task_struct *p)
static inline void check_class_changed(struct rq *rq, struct task_struct *p,
const struct sched_class *prev_class,
- int oldprio, int running)
+ int oldprio)
{
if (prev_class != p->sched_class) {
if (prev_class->switched_from)
- prev_class->switched_from(rq, p, running);
- p->sched_class->switched_to(rq, p, running);
- } else
- p->sched_class->prio_changed(rq, p, oldprio, running);
+ prev_class->switched_from(rq, p);
+ p->sched_class->switched_to(rq, p);
+ } else if (oldprio != p->prio)
+ p->sched_class->prio_changed(rq, p, oldprio);
}
static void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags)
@@ -2056,7 +2152,7 @@ static void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags)
* A queue event has occurred, and we're going to schedule. In
* this case, we can save a useless back to back clock update.
*/
- if (rq->curr->se.on_rq && test_tsk_need_resched(rq->curr))
+ if (rq->curr->on_rq && test_tsk_need_resched(rq->curr))
rq->skip_clock_update = 1;
}
@@ -2102,6 +2198,11 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
*/
WARN_ON_ONCE(p->state != TASK_RUNNING && p->state != TASK_WAKING &&
!(task_thread_info(p)->preempt_count & PREEMPT_ACTIVE));
+
+#ifdef CONFIG_LOCKDEP
+ WARN_ON_ONCE(debug_locks && !(lockdep_is_held(&p->pi_lock) ||
+ lockdep_is_held(&task_rq(p)->lock)));
+#endif
#endif
trace_sched_migrate_task(p, new_cpu);
@@ -2122,19 +2223,6 @@ struct migration_arg {
static int migration_cpu_stop(void *data);
/*
- * The task's runqueue lock must be held.
- * Returns true if you have to wait for migration thread.
- */
-static bool migrate_task(struct task_struct *p, struct rq *rq)
-{
- /*
- * If the task is not on a runqueue (and not running), then
- * the next wake-up will properly place the task.
- */
- return p->se.on_rq || task_running(rq, p);
-}
-
-/*
* wait_task_inactive - wait for a thread to unschedule.
*
* If @match_state is nonzero, it's the @p->state value just checked and
@@ -2191,11 +2279,11 @@ unsigned long wait_task_inactive(struct task_struct *p, long match_state)
rq = task_rq_lock(p, &flags);
trace_sched_wait_task(p);
running = task_running(rq, p);
- on_rq = p->se.on_rq;
+ on_rq = p->on_rq;
ncsw = 0;
if (!match_state || p->state == match_state)
ncsw = p->nvcsw | LONG_MIN; /* sets MSB */
- task_rq_unlock(rq, &flags);
+ task_rq_unlock(rq, p, &flags);
/*
* If it changed from the expected state, bail out now.
@@ -2224,7 +2312,10 @@ unsigned long wait_task_inactive(struct task_struct *p, long match_state)
* yield - it could be a while.
*/
if (unlikely(on_rq)) {
- schedule_timeout_uninterruptible(1);
+ ktime_t to = ktime_set(0, NSEC_PER_SEC/HZ);
+
+ set_current_state(TASK_UNINTERRUPTIBLE);
+ schedule_hrtimeout(&to, HRTIMER_MODE_REL);
continue;
}
@@ -2246,7 +2337,7 @@ unsigned long wait_task_inactive(struct task_struct *p, long match_state)
* Cause a process which is running on another CPU to enter
* kernel-mode, without any delay. (to get signals handled.)
*
- * NOTE: this function doesnt have to take the runqueue lock,
+ * NOTE: this function doesn't have to take the runqueue lock,
* because all it wants to ensure is that the remote task enters
* the kernel. If the IPI races and the task has been migrated
* to another CPU then no harm is done and the purpose has been
@@ -2265,30 +2356,9 @@ void kick_process(struct task_struct *p)
EXPORT_SYMBOL_GPL(kick_process);
#endif /* CONFIG_SMP */
-/**
- * task_oncpu_function_call - call a function on the cpu on which a task runs
- * @p: the task to evaluate
- * @func: the function to be called
- * @info: the function call argument
- *
- * Calls the function @func when the task is currently running. This might
- * be on the current CPU, which just calls the function directly
- */
-void task_oncpu_function_call(struct task_struct *p,
- void (*func) (void *info), void *info)
-{
- int cpu;
-
- preempt_disable();
- cpu = task_cpu(p);
- if (task_curr(p))
- smp_call_function_single(cpu, func, info, 1);
- preempt_enable();
-}
-
#ifdef CONFIG_SMP
/*
- * ->cpus_allowed is protected by either TASK_WAKING or rq->lock held.
+ * ->cpus_allowed is protected by both rq->lock and p->pi_lock
*/
static int select_fallback_rq(int cpu, struct task_struct *p)
{
@@ -2321,12 +2391,12 @@ static int select_fallback_rq(int cpu, struct task_struct *p)
}
/*
- * The caller (fork, wakeup) owns TASK_WAKING, ->cpus_allowed is stable.
+ * The caller (fork, wakeup) owns p->pi_lock, ->cpus_allowed is stable.
*/
static inline
-int select_task_rq(struct rq *rq, struct task_struct *p, int sd_flags, int wake_flags)
+int select_task_rq(struct task_struct *p, int sd_flags, int wake_flags)
{
- int cpu = p->sched_class->select_task_rq(rq, p, sd_flags, wake_flags);
+ int cpu = p->sched_class->select_task_rq(p, sd_flags, wake_flags);
/*
* In order not to call set_task_cpu() on a blocking task we need
@@ -2352,27 +2422,62 @@ static void update_avg(u64 *avg, u64 sample)
}
#endif
-static inline void ttwu_activate(struct task_struct *p, struct rq *rq,
- bool is_sync, bool is_migrate, bool is_local,
- unsigned long en_flags)
+static void
+ttwu_stat(struct task_struct *p, int cpu, int wake_flags)
{
+#ifdef CONFIG_SCHEDSTATS
+ struct rq *rq = this_rq();
+
+#ifdef CONFIG_SMP
+ int this_cpu = smp_processor_id();
+
+ if (cpu == this_cpu) {
+ schedstat_inc(rq, ttwu_local);
+ schedstat_inc(p, se.statistics.nr_wakeups_local);
+ } else {
+ struct sched_domain *sd;
+
+ schedstat_inc(p, se.statistics.nr_wakeups_remote);
+ rcu_read_lock();
+ for_each_domain(this_cpu, sd) {
+ if (cpumask_test_cpu(cpu, sched_domain_span(sd))) {
+ schedstat_inc(sd, ttwu_wake_remote);
+ break;
+ }
+ }
+ rcu_read_unlock();
+ }
+#endif /* CONFIG_SMP */
+
+ schedstat_inc(rq, ttwu_count);
schedstat_inc(p, se.statistics.nr_wakeups);
- if (is_sync)
+
+ if (wake_flags & WF_SYNC)
schedstat_inc(p, se.statistics.nr_wakeups_sync);
- if (is_migrate)
+
+ if (cpu != task_cpu(p))
schedstat_inc(p, se.statistics.nr_wakeups_migrate);
- if (is_local)
- schedstat_inc(p, se.statistics.nr_wakeups_local);
- else
- schedstat_inc(p, se.statistics.nr_wakeups_remote);
+#endif /* CONFIG_SCHEDSTATS */
+}
+
+static void ttwu_activate(struct rq *rq, struct task_struct *p, int en_flags)
+{
activate_task(rq, p, en_flags);
+ p->on_rq = 1;
+
+ /* if a worker is waking up, notify workqueue */
+ if (p->flags & PF_WQ_WORKER)
+ wq_worker_waking_up(p, cpu_of(rq));
}
-static inline void ttwu_post_activation(struct task_struct *p, struct rq *rq,
- int wake_flags, bool success)
+/*
+ * Mark the task runnable and perform wakeup-preemption.
+ */
+static void
+ttwu_do_wakeup(struct rq *rq, struct task_struct *p, int wake_flags)
{
- trace_sched_wakeup(p, success);
+ trace_sched_wakeup(p, true);
check_preempt_curr(rq, p, wake_flags);
p->state = TASK_RUNNING;
@@ -2391,9 +2496,118 @@ static inline void ttwu_post_activation(struct task_struct *p, struct rq *rq,
rq->idle_stamp = 0;
}
#endif
- /* if a worker is waking up, notify workqueue */
- if ((p->flags & PF_WQ_WORKER) && success)
- wq_worker_waking_up(p, cpu_of(rq));
+}
+
+static void
+ttwu_do_activate(struct rq *rq, struct task_struct *p, int wake_flags)
+{
+#ifdef CONFIG_SMP
+ if (p->sched_contributes_to_load)
+ rq->nr_uninterruptible--;
+#endif
+
+ ttwu_activate(rq, p, ENQUEUE_WAKEUP | ENQUEUE_WAKING);
+ ttwu_do_wakeup(rq, p, wake_flags);
+}
+
+/*
+ * Called in case the task @p isn't fully descheduled from its runqueue,
+ * in this case we must do a remote wakeup. Its a 'light' wakeup though,
+ * since all we need to do is flip p->state to TASK_RUNNING, since
+ * the task is still ->on_rq.
+ */
+static int ttwu_remote(struct task_struct *p, int wake_flags)
+{
+ struct rq *rq;
+ int ret = 0;
+
+ rq = __task_rq_lock(p);
+ if (p->on_rq) {
+ ttwu_do_wakeup(rq, p, wake_flags);
+ ret = 1;
+ }
+ __task_rq_unlock(rq);
+
+ return ret;
+}
+
+#ifdef CONFIG_SMP
+static void sched_ttwu_pending(void)
+{
+ struct rq *rq = this_rq();
+ struct task_struct *list = xchg(&rq->wake_list, NULL);
+
+ if (!list)
+ return;
+
+ raw_spin_lock(&rq->lock);
+
+ while (list) {
+ struct task_struct *p = list;
+ list = list->wake_entry;
+ ttwu_do_activate(rq, p, 0);
+ }
+
+ raw_spin_unlock(&rq->lock);
+}
+
+void scheduler_ipi(void)
+{
+ sched_ttwu_pending();
+}
+
+static void ttwu_queue_remote(struct task_struct *p, int cpu)
+{
+ struct rq *rq = cpu_rq(cpu);
+ struct task_struct *next = rq->wake_list;
+
+ for (;;) {
+ struct task_struct *old = next;
+
+ p->wake_entry = next;
+ next = cmpxchg(&rq->wake_list, old, p);
+ if (next == old)
+ break;
+ }
+
+ if (!next)
+ smp_send_reschedule(cpu);
+}
+
+#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW
+static int ttwu_activate_remote(struct task_struct *p, int wake_flags)
+{
+ struct rq *rq;
+ int ret = 0;
+
+ rq = __task_rq_lock(p);
+ if (p->on_cpu) {
+ ttwu_activate(rq, p, ENQUEUE_WAKEUP);
+ ttwu_do_wakeup(rq, p, wake_flags);
+ ret = 1;
+ }
+ __task_rq_unlock(rq);
+
+ return ret;
+
+}
+#endif /* __ARCH_WANT_INTERRUPTS_ON_CTXSW */
+#endif /* CONFIG_SMP */
+
+static void ttwu_queue(struct task_struct *p, int cpu)
+{
+ struct rq *rq = cpu_rq(cpu);
+
+#if defined(CONFIG_SMP)
+ if (sched_feat(TTWU_QUEUE) && cpu != smp_processor_id()) {
+ ttwu_queue_remote(p, cpu);
+ return;
+ }
+#endif
+
+ raw_spin_lock(&rq->lock);
+ ttwu_do_activate(rq, p, 0);
+ raw_spin_unlock(&rq->lock);
}
/**
@@ -2411,92 +2625,64 @@ static inline void ttwu_post_activation(struct task_struct *p, struct rq *rq,
* Returns %true if @p was woken up, %false if it was already running
* or @state didn't match @p's state.
*/
-static int try_to_wake_up(struct task_struct *p, unsigned int state,
- int wake_flags)
+static int
+try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
{
- int cpu, orig_cpu, this_cpu, success = 0;
unsigned long flags;
- unsigned long en_flags = ENQUEUE_WAKEUP;
- struct rq *rq;
-
- this_cpu = get_cpu();
+ int cpu, success = 0;
smp_wmb();
- rq = task_rq_lock(p, &flags);
+ raw_spin_lock_irqsave(&p->pi_lock, flags);
if (!(p->state & state))
goto out;
- if (p->se.on_rq)
- goto out_running;
-
+ success = 1; /* we're going to change ->state */
cpu = task_cpu(p);
- orig_cpu = cpu;
-#ifdef CONFIG_SMP
- if (unlikely(task_running(rq, p)))
- goto out_activate;
+ if (p->on_rq && ttwu_remote(p, wake_flags))
+ goto stat;
+#ifdef CONFIG_SMP
/*
- * In order to handle concurrent wakeups and release the rq->lock
- * we put the task in TASK_WAKING state.
- *
- * First fix up the nr_uninterruptible count:
+ * If the owning (remote) cpu is still in the middle of schedule() with
+ * this task as prev, wait until its done referencing the task.
*/
- if (task_contributes_to_load(p)) {
- if (likely(cpu_online(orig_cpu)))
- rq->nr_uninterruptible--;
- else
- this_rq()->nr_uninterruptible--;
- }
- p->state = TASK_WAKING;
-
- if (p->sched_class->task_waking) {
- p->sched_class->task_waking(rq, p);
- en_flags |= ENQUEUE_WAKING;
+ while (p->on_cpu) {
+#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW
+ /*
+ * In case the architecture enables interrupts in
+ * context_switch(), we cannot busy wait, since that
+ * would lead to deadlocks when an interrupt hits and
+ * tries to wake up @prev. So bail and do a complete
+ * remote wakeup.
+ */
+ if (ttwu_activate_remote(p, wake_flags))
+ goto stat;
+#else
+ cpu_relax();
+#endif
}
-
- cpu = select_task_rq(rq, p, SD_BALANCE_WAKE, wake_flags);
- if (cpu != orig_cpu)
- set_task_cpu(p, cpu);
- __task_rq_unlock(rq);
-
- rq = cpu_rq(cpu);
- raw_spin_lock(&rq->lock);
-
/*
- * We migrated the task without holding either rq->lock, however
- * since the task is not on the task list itself, nobody else
- * will try and migrate the task, hence the rq should match the
- * cpu we just moved it to.
+ * Pairs with the smp_wmb() in finish_lock_switch().
*/
- WARN_ON(task_cpu(p) != cpu);
- WARN_ON(p->state != TASK_WAKING);
+ smp_rmb();
-#ifdef CONFIG_SCHEDSTATS
- schedstat_inc(rq, ttwu_count);
- if (cpu == this_cpu)
- schedstat_inc(rq, ttwu_local);
- else {
- struct sched_domain *sd;
- for_each_domain(this_cpu, sd) {
- if (cpumask_test_cpu(cpu, sched_domain_span(sd))) {
- schedstat_inc(sd, ttwu_wake_remote);
- break;
- }
- }
- }
-#endif /* CONFIG_SCHEDSTATS */
+ p->sched_contributes_to_load = !!task_contributes_to_load(p);
+ p->state = TASK_WAKING;
+
+ if (p->sched_class->task_waking)
+ p->sched_class->task_waking(p);
-out_activate:
+ cpu = select_task_rq(p, SD_BALANCE_WAKE, wake_flags);
+ if (task_cpu(p) != cpu)
+ set_task_cpu(p, cpu);
#endif /* CONFIG_SMP */
- ttwu_activate(p, rq, wake_flags & WF_SYNC, orig_cpu != cpu,
- cpu == this_cpu, en_flags);
- success = 1;
-out_running:
- ttwu_post_activation(p, rq, wake_flags, success);
+
+ ttwu_queue(p, cpu);
+stat:
+ ttwu_stat(p, cpu, wake_flags);
out:
- task_rq_unlock(rq, &flags);
- put_cpu();
+ raw_spin_unlock_irqrestore(&p->pi_lock, flags);
return success;
}
@@ -2505,31 +2691,34 @@ out:
* try_to_wake_up_local - try to wake up a local task with rq lock held
* @p: the thread to be awakened
*
- * Put @p on the run-queue if it's not already there. The caller must
+ * Put @p on the run-queue if it's not already there. The caller must
* ensure that this_rq() is locked, @p is bound to this_rq() and not
- * the current task. this_rq() stays locked over invocation.
+ * the current task.
*/
static void try_to_wake_up_local(struct task_struct *p)
{
struct rq *rq = task_rq(p);
- bool success = false;
BUG_ON(rq != this_rq());
BUG_ON(p == current);
lockdep_assert_held(&rq->lock);
+ if (!raw_spin_trylock(&p->pi_lock)) {
+ raw_spin_unlock(&rq->lock);
+ raw_spin_lock(&p->pi_lock);
+ raw_spin_lock(&rq->lock);
+ }
+
if (!(p->state & TASK_NORMAL))
- return;
+ goto out;
- if (!p->se.on_rq) {
- if (likely(!task_running(rq, p))) {
- schedstat_inc(rq, ttwu_count);
- schedstat_inc(rq, ttwu_local);
- }
- ttwu_activate(p, rq, false, false, true, ENQUEUE_WAKEUP);
- success = true;
- }
- ttwu_post_activation(p, rq, 0, success);
+ if (!p->on_rq)
+ ttwu_activate(rq, p, ENQUEUE_WAKEUP);
+
+ ttwu_do_wakeup(rq, p, 0);
+ ttwu_stat(p, smp_processor_id(), 0);
+out:
+ raw_spin_unlock(&p->pi_lock);
}
/**
@@ -2562,18 +2751,21 @@ int wake_up_state(struct task_struct *p, unsigned int state)
*/
static void __sched_fork(struct task_struct *p)
{
+ p->on_rq = 0;
+
+ p->se.on_rq = 0;
p->se.exec_start = 0;
p->se.sum_exec_runtime = 0;
p->se.prev_sum_exec_runtime = 0;
p->se.nr_migrations = 0;
+ p->se.vruntime = 0;
+ INIT_LIST_HEAD(&p->se.group_node);
#ifdef CONFIG_SCHEDSTATS
memset(&p->se.statistics, 0, sizeof(p->se.statistics));
#endif
INIT_LIST_HEAD(&p->rt.run_list);
- p->se.on_rq = 0;
- INIT_LIST_HEAD(&p->se.group_node);
#ifdef CONFIG_PREEMPT_NOTIFIERS
INIT_HLIST_HEAD(&p->preempt_notifiers);
@@ -2583,8 +2775,9 @@ static void __sched_fork(struct task_struct *p)
/*
* fork()/clone()-time setup:
*/
-void sched_fork(struct task_struct *p, int clone_flags)
+void sched_fork(struct task_struct *p)
{
+ unsigned long flags;
int cpu = get_cpu();
__sched_fork(p);
@@ -2635,16 +2828,16 @@ void sched_fork(struct task_struct *p, int clone_flags)
*
* Silence PROVE_RCU.
*/
- rcu_read_lock();
+ raw_spin_lock_irqsave(&p->pi_lock, flags);
set_task_cpu(p, cpu);
- rcu_read_unlock();
+ raw_spin_unlock_irqrestore(&p->pi_lock, flags);
#if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT)
if (likely(sched_info_on()))
memset(&p->sched_info, 0, sizeof(p->sched_info));
#endif
-#if defined(CONFIG_SMP) && defined(__ARCH_WANT_UNLOCKED_CTXSW)
- p->oncpu = 0;
+#if defined(CONFIG_SMP)
+ p->on_cpu = 0;
#endif
#ifdef CONFIG_PREEMPT
/* Want to start with kernel preemption disabled. */
@@ -2664,41 +2857,31 @@ void sched_fork(struct task_struct *p, int clone_flags)
* that must be done for every newly created context, then puts the task
* on the runqueue and wakes it.
*/
-void wake_up_new_task(struct task_struct *p, unsigned long clone_flags)
+void wake_up_new_task(struct task_struct *p)
{
unsigned long flags;
struct rq *rq;
- int cpu __maybe_unused = get_cpu();
+ raw_spin_lock_irqsave(&p->pi_lock, flags);
#ifdef CONFIG_SMP
- rq = task_rq_lock(p, &flags);
- p->state = TASK_WAKING;
-
/*
* Fork balancing, do it here and not earlier because:
* - cpus_allowed can change in the fork path
* - any previously selected cpu might disappear through hotplug
- *
- * We set TASK_WAKING so that select_task_rq() can drop rq->lock
- * without people poking at ->cpus_allowed.
*/
- cpu = select_task_rq(rq, p, SD_BALANCE_FORK, 0);
- set_task_cpu(p, cpu);
-
- p->state = TASK_RUNNING;
- task_rq_unlock(rq, &flags);
+ set_task_cpu(p, select_task_rq(p, SD_BALANCE_FORK, 0));
#endif
- rq = task_rq_lock(p, &flags);
+ rq = __task_rq_lock(p);
activate_task(rq, p, 0);
- trace_sched_wakeup_new(p, 1);
+ p->on_rq = 1;
+ trace_sched_wakeup_new(p, true);
check_preempt_curr(rq, p, WF_FORK);
#ifdef CONFIG_SMP
if (p->sched_class->task_woken)
p->sched_class->task_woken(rq, p);
#endif
- task_rq_unlock(rq, &flags);
- put_cpu();
+ task_rq_unlock(rq, p, &flags);
}
#ifdef CONFIG_PREEMPT_NOTIFIERS
@@ -2776,9 +2959,12 @@ static inline void
prepare_task_switch(struct rq *rq, struct task_struct *prev,
struct task_struct *next)
{
+ sched_info_switch(prev, next);
+ perf_event_task_sched_out(prev, next);
fire_sched_out_preempt_notifiers(prev, next);
prepare_lock_switch(rq, next);
prepare_arch_switch(next);
+ trace_sched_switch(prev, next);
}
/**
@@ -2911,7 +3097,7 @@ context_switch(struct rq *rq, struct task_struct *prev,
struct mm_struct *mm, *oldmm;
prepare_task_switch(rq, prev, next);
- trace_sched_switch(prev, next);
+
mm = next->mm;
oldmm = prev->active_mm;
/*
@@ -3404,27 +3590,22 @@ void sched_exec(void)
{
struct task_struct *p = current;
unsigned long flags;
- struct rq *rq;
int dest_cpu;
- rq = task_rq_lock(p, &flags);
- dest_cpu = p->sched_class->select_task_rq(rq, p, SD_BALANCE_EXEC, 0);
+ raw_spin_lock_irqsave(&p->pi_lock, flags);
+ dest_cpu = p->sched_class->select_task_rq(p, SD_BALANCE_EXEC, 0);
if (dest_cpu == smp_processor_id())
goto unlock;
- /*
- * select_task_rq() can race against ->cpus_allowed
- */
- if (cpumask_test_cpu(dest_cpu, &p->cpus_allowed) &&
- likely(cpu_active(dest_cpu)) && migrate_task(p, rq)) {
+ if (likely(cpu_active(dest_cpu))) {
struct migration_arg arg = { p, dest_cpu };
- task_rq_unlock(rq, &flags);
- stop_one_cpu(cpu_of(rq), migration_cpu_stop, &arg);
+ raw_spin_unlock_irqrestore(&p->pi_lock, flags);
+ stop_one_cpu(task_cpu(p), migration_cpu_stop, &arg);
return;
}
unlock:
- task_rq_unlock(rq, &flags);
+ raw_spin_unlock_irqrestore(&p->pi_lock, flags);
}
#endif
@@ -3461,7 +3642,7 @@ unsigned long long task_delta_exec(struct task_struct *p)
rq = task_rq_lock(p, &flags);
ns = do_task_delta_exec(p, rq);
- task_rq_unlock(rq, &flags);
+ task_rq_unlock(rq, p, &flags);
return ns;
}
@@ -3479,7 +3660,7 @@ unsigned long long task_sched_runtime(struct task_struct *p)
rq = task_rq_lock(p, &flags);
ns = p->se.sum_exec_runtime + do_task_delta_exec(p, rq);
- task_rq_unlock(rq, &flags);
+ task_rq_unlock(rq, p, &flags);
return ns;
}
@@ -3503,7 +3684,7 @@ unsigned long long thread_group_sched_runtime(struct task_struct *p)
rq = task_rq_lock(p, &flags);
thread_group_cputime(p, &totals);
ns = totals.sum_exec_runtime + do_task_delta_exec(p, rq);
- task_rq_unlock(rq, &flags);
+ task_rq_unlock(rq, p, &flags);
return ns;
}
@@ -3568,6 +3749,32 @@ static void account_guest_time(struct task_struct *p, cputime_t cputime,
}
/*
+ * Account system cpu time to a process and desired cpustat field
+ * @p: the process that the cpu time gets accounted to
+ * @cputime: the cpu time spent in kernel space since the last update
+ * @cputime_scaled: cputime scaled by cpu frequency
+ * @target_cputime64: pointer to cpustat field that has to be updated
+ */
+static inline
+void __account_system_time(struct task_struct *p, cputime_t cputime,
+ cputime_t cputime_scaled, cputime64_t *target_cputime64)
+{
+ cputime64_t tmp = cputime_to_cputime64(cputime);
+
+ /* Add system time to process. */
+ p->stime = cputime_add(p->stime, cputime);
+ p->stimescaled = cputime_add(p->stimescaled, cputime_scaled);
+ account_group_system_time(p, cputime);
+
+ /* Add system time to cpustat. */
+ *target_cputime64 = cputime64_add(*target_cputime64, tmp);
+ cpuacct_update_stats(p, CPUACCT_STAT_SYSTEM, cputime);
+
+ /* Account for system time used */
+ acct_update_integrals(p);
+}
+
+/*
* Account system cpu time to a process.
* @p: the process that the cpu time gets accounted to
* @hardirq_offset: the offset to subtract from hardirq_count()
@@ -3578,36 +3785,26 @@ void account_system_time(struct task_struct *p, int hardirq_offset,
cputime_t cputime, cputime_t cputime_scaled)
{
struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
- cputime64_t tmp;
+ cputime64_t *target_cputime64;
if ((p->flags & PF_VCPU) && (irq_count() - hardirq_offset == 0)) {
account_guest_time(p, cputime, cputime_scaled);
return;
}
- /* Add system time to process. */
- p->stime = cputime_add(p->stime, cputime);
- p->stimescaled = cputime_add(p->stimescaled, cputime_scaled);
- account_group_system_time(p, cputime);
-
- /* Add system time to cpustat. */
- tmp = cputime_to_cputime64(cputime);
if (hardirq_count() - hardirq_offset)
- cpustat->irq = cputime64_add(cpustat->irq, tmp);
+ target_cputime64 = &cpustat->irq;
else if (in_serving_softirq())
- cpustat->softirq = cputime64_add(cpustat->softirq, tmp);
+ target_cputime64 = &cpustat->softirq;
else
- cpustat->system = cputime64_add(cpustat->system, tmp);
+ target_cputime64 = &cpustat->system;
- cpuacct_update_stats(p, CPUACCT_STAT_SYSTEM, cputime);
-
- /* Account for system time used */
- acct_update_integrals(p);
+ __account_system_time(p, cputime, cputime_scaled, target_cputime64);
}
/*
* Account for involuntary wait time.
- * @steal: the cpu time spent in involuntary wait
+ * @cputime: the cpu time spent in involuntary wait
*/
void account_steal_time(cputime_t cputime)
{
@@ -3635,6 +3832,73 @@ void account_idle_time(cputime_t cputime)
#ifndef CONFIG_VIRT_CPU_ACCOUNTING
+#ifdef CONFIG_IRQ_TIME_ACCOUNTING
+/*
+ * Account a tick to a process and cpustat
+ * @p: the process that the cpu time gets accounted to
+ * @user_tick: is the tick from userspace
+ * @rq: the pointer to rq
+ *
+ * Tick demultiplexing follows the order
+ * - pending hardirq update
+ * - pending softirq update
+ * - user_time
+ * - idle_time
+ * - system time
+ * - check for guest_time
+ * - else account as system_time
+ *
+ * Check for hardirq is done both for system and user time as there is
+ * no timer going off while we are on hardirq and hence we may never get an
+ * opportunity to update it solely in system time.
+ * p->stime and friends are only updated on system time and not on irq
+ * softirq as those do not count in task exec_runtime any more.
+ */
+static void irqtime_account_process_tick(struct task_struct *p, int user_tick,
+ struct rq *rq)
+{
+ cputime_t one_jiffy_scaled = cputime_to_scaled(cputime_one_jiffy);
+ cputime64_t tmp = cputime_to_cputime64(cputime_one_jiffy);
+ struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
+
+ if (irqtime_account_hi_update()) {
+ cpustat->irq = cputime64_add(cpustat->irq, tmp);
+ } else if (irqtime_account_si_update()) {
+ cpustat->softirq = cputime64_add(cpustat->softirq, tmp);
+ } else if (this_cpu_ksoftirqd() == p) {
+ /*
+ * ksoftirqd time do not get accounted in cpu_softirq_time.
+ * So, we have to handle it separately here.
+ * Also, p->stime needs to be updated for ksoftirqd.
+ */
+ __account_system_time(p, cputime_one_jiffy, one_jiffy_scaled,
+ &cpustat->softirq);
+ } else if (user_tick) {
+ account_user_time(p, cputime_one_jiffy, one_jiffy_scaled);
+ } else if (p == rq->idle) {
+ account_idle_time(cputime_one_jiffy);
+ } else if (p->flags & PF_VCPU) { /* System time or guest time */
+ account_guest_time(p, cputime_one_jiffy, one_jiffy_scaled);
+ } else {
+ __account_system_time(p, cputime_one_jiffy, one_jiffy_scaled,
+ &cpustat->system);
+ }
+}
+
+static void irqtime_account_idle_ticks(int ticks)
+{
+ int i;
+ struct rq *rq = this_rq();
+
+ for (i = 0; i < ticks; i++)
+ irqtime_account_process_tick(current, 0, rq);
+}
+#else /* CONFIG_IRQ_TIME_ACCOUNTING */
+static void irqtime_account_idle_ticks(int ticks) {}
+static void irqtime_account_process_tick(struct task_struct *p, int user_tick,
+ struct rq *rq) {}
+#endif /* CONFIG_IRQ_TIME_ACCOUNTING */
+
/*
* Account a single tick of cpu time.
* @p: the process that the cpu time gets accounted to
@@ -3645,6 +3909,11 @@ void account_process_tick(struct task_struct *p, int user_tick)
cputime_t one_jiffy_scaled = cputime_to_scaled(cputime_one_jiffy);
struct rq *rq = this_rq();
+ if (sched_clock_irqtime) {
+ irqtime_account_process_tick(p, user_tick, rq);
+ return;
+ }
+
if (user_tick)
account_user_time(p, cputime_one_jiffy, one_jiffy_scaled);
else if ((p != rq->idle) || (irq_count() != HARDIRQ_OFFSET))
@@ -3670,6 +3939,12 @@ void account_steal_ticks(unsigned long ticks)
*/
void account_idle_ticks(unsigned long ticks)
{
+
+ if (sched_clock_irqtime) {
+ irqtime_account_idle_ticks(ticks);
+ return;
+ }
+
account_idle_time(jiffies_to_cputime(ticks));
}
@@ -3763,9 +4038,6 @@ void thread_group_times(struct task_struct *p, cputime_t *ut, cputime_t *st)
/*
* This function gets called by the timer code, with HZ frequency.
* We call it with interrupts disabled.
- *
- * It also gets called by the fork code, when changing the parent's
- * timeslices.
*/
void scheduler_tick(void)
{
@@ -3885,17 +4157,11 @@ static inline void schedule_debug(struct task_struct *prev)
profile_hit(SCHED_PROFILING, __builtin_return_address(0));
schedstat_inc(this_rq(), sched_count);
-#ifdef CONFIG_SCHEDSTATS
- if (unlikely(prev->lock_depth >= 0)) {
- schedstat_inc(this_rq(), rq_sched_info.bkl_count);
- schedstat_inc(prev, sched_info.bkl_count);
- }
-#endif
}
static void put_prev_task(struct rq *rq, struct task_struct *prev)
{
- if (prev->se.on_rq)
+ if (prev->on_rq || rq->skip_clock_update < 0)
update_rq_clock(rq);
prev->sched_class->put_prev_task(rq, prev);
}
@@ -3945,9 +4211,6 @@ need_resched:
rcu_note_context_switch(cpu);
prev = rq->curr;
- release_kernel_lock(prev);
-need_resched_nonpreemptible:
-
schedule_debug(prev);
if (sched_feat(HRTICK))
@@ -3960,11 +4223,13 @@ need_resched_nonpreemptible:
if (unlikely(signal_pending_state(prev->state, prev))) {
prev->state = TASK_RUNNING;
} else {
+ deactivate_task(rq, prev, DEQUEUE_SLEEP);
+ prev->on_rq = 0;
+
/*
- * If a worker is going to sleep, notify and
- * ask workqueue whether it wants to wake up a
- * task to maintain concurrency. If so, wake
- * up the task.
+ * If a worker went to sleep, notify and ask workqueue
+ * whether it wants to wake up a task to maintain
+ * concurrency.
*/
if (prev->flags & PF_WQ_WORKER) {
struct task_struct *to_wakeup;
@@ -3973,7 +4238,16 @@ need_resched_nonpreemptible:
if (to_wakeup)
try_to_wake_up_local(to_wakeup);
}
- deactivate_task(rq, prev, DEQUEUE_SLEEP);
+
+ /*
+ * If we are going to sleep and we have plugged IO
+ * queued, make sure to submit it to avoid deadlocks.
+ */
+ if (blk_needs_flush_plug(prev)) {
+ raw_spin_unlock(&rq->lock);
+ blk_schedule_flush_plug(prev);
+ raw_spin_lock(&rq->lock);
+ }
}
switch_count = &prev->nvcsw;
}
@@ -3989,9 +4263,6 @@ need_resched_nonpreemptible:
rq->skip_clock_update = 0;
if (likely(prev != next)) {
- sched_info_switch(prev, next);
- perf_event_task_sched_out(prev, next);
-
rq->nr_switches++;
rq->curr = next;
++*switch_count;
@@ -4010,9 +4281,6 @@ need_resched_nonpreemptible:
post_schedule(rq);
- if (unlikely(reacquire_kernel_lock(prev)))
- goto need_resched_nonpreemptible;
-
preempt_enable_no_resched();
if (need_resched())
goto need_resched;
@@ -4020,70 +4288,53 @@ need_resched_nonpreemptible:
EXPORT_SYMBOL(schedule);
#ifdef CONFIG_MUTEX_SPIN_ON_OWNER
-/*
- * Look out! "owner" is an entirely speculative pointer
- * access and not reliable.
- */
-int mutex_spin_on_owner(struct mutex *lock, struct thread_info *owner)
-{
- unsigned int cpu;
- struct rq *rq;
- if (!sched_feat(OWNER_SPIN))
- return 0;
+static inline bool owner_running(struct mutex *lock, struct task_struct *owner)
+{
+ bool ret = false;
-#ifdef CONFIG_DEBUG_PAGEALLOC
- /*
- * Need to access the cpu field knowing that
- * DEBUG_PAGEALLOC could have unmapped it if
- * the mutex owner just released it and exited.
- */
- if (probe_kernel_address(&owner->cpu, cpu))
- return 0;
-#else
- cpu = owner->cpu;
-#endif
+ rcu_read_lock();
+ if (lock->owner != owner)
+ goto fail;
/*
- * Even if the access succeeded (likely case),
- * the cpu field may no longer be valid.
+ * Ensure we emit the owner->on_cpu, dereference _after_ checking
+ * lock->owner still matches owner, if that fails, owner might
+ * point to free()d memory, if it still matches, the rcu_read_lock()
+ * ensures the memory stays valid.
*/
- if (cpu >= nr_cpumask_bits)
- return 0;
+ barrier();
- /*
- * We need to validate that we can do a
- * get_cpu() and that we have the percpu area.
- */
- if (!cpu_online(cpu))
- return 0;
+ ret = owner->on_cpu;
+fail:
+ rcu_read_unlock();
- rq = cpu_rq(cpu);
+ return ret;
+}
- for (;;) {
- /*
- * Owner changed, break to re-assess state.
- */
- if (lock->owner != owner) {
- /*
- * If the lock has switched to a different owner,
- * we likely have heavy contention. Return 0 to quit
- * optimistic spinning and not contend further:
- */
- if (lock->owner)
- return 0;
- break;
- }
+/*
+ * Look out! "owner" is an entirely speculative pointer
+ * access and not reliable.
+ */
+int mutex_spin_on_owner(struct mutex *lock, struct task_struct *owner)
+{
+ if (!sched_feat(OWNER_SPIN))
+ return 0;
- /*
- * Is that owner really running on that cpu?
- */
- if (task_thread_info(rq->curr) != owner || need_resched())
+ while (owner_running(lock, owner)) {
+ if (need_resched())
return 0;
arch_mutex_cpu_relax();
}
+ /*
+ * If the owner changed to another task there is likely
+ * heavy contention, stop spinning.
+ */
+ if (lock->owner)
+ return 0;
+
return 1;
}
#endif
@@ -4543,19 +4794,18 @@ EXPORT_SYMBOL(sleep_on_timeout);
*/
void rt_mutex_setprio(struct task_struct *p, int prio)
{
- unsigned long flags;
int oldprio, on_rq, running;
struct rq *rq;
const struct sched_class *prev_class;
BUG_ON(prio < 0 || prio > MAX_PRIO);
- rq = task_rq_lock(p, &flags);
+ rq = __task_rq_lock(p);
trace_sched_pi_setprio(p, prio);
oldprio = p->prio;
prev_class = p->sched_class;
- on_rq = p->se.on_rq;
+ on_rq = p->on_rq;
running = task_current(rq, p);
if (on_rq)
dequeue_task(rq, p, 0);
@@ -4571,12 +4821,11 @@ void rt_mutex_setprio(struct task_struct *p, int prio)
if (running)
p->sched_class->set_curr_task(rq);
- if (on_rq) {
+ if (on_rq)
enqueue_task(rq, p, oldprio < prio ? ENQUEUE_HEAD : 0);
- check_class_changed(rq, p, prev_class, oldprio, running);
- }
- task_rq_unlock(rq, &flags);
+ check_class_changed(rq, p, prev_class, oldprio);
+ __task_rq_unlock(rq);
}
#endif
@@ -4604,7 +4853,7 @@ void set_user_nice(struct task_struct *p, long nice)
p->static_prio = NICE_TO_PRIO(nice);
goto out_unlock;
}
- on_rq = p->se.on_rq;
+ on_rq = p->on_rq;
if (on_rq)
dequeue_task(rq, p, 0);
@@ -4624,7 +4873,7 @@ void set_user_nice(struct task_struct *p, long nice)
resched_task(rq->curr);
}
out_unlock:
- task_rq_unlock(rq, &flags);
+ task_rq_unlock(rq, p, &flags);
}
EXPORT_SYMBOL(set_user_nice);
@@ -4738,8 +4987,6 @@ static struct task_struct *find_process_by_pid(pid_t pid)
static void
__setscheduler(struct rq *rq, struct task_struct *p, int policy, int prio)
{
- BUG_ON(p->se.on_rq);
-
p->policy = policy;
p->rt_priority = prio;
p->normal_prio = normal_prio(p);
@@ -4762,8 +5009,11 @@ static bool check_same_owner(struct task_struct *p)
rcu_read_lock();
pcred = __task_cred(p);
- match = (cred->euid == pcred->euid ||
- cred->euid == pcred->uid);
+ if (cred->user->user_ns == pcred->user->user_ns)
+ match = (cred->euid == pcred->euid ||
+ cred->euid == pcred->uid);
+ else
+ match = false;
rcu_read_unlock();
return match;
}
@@ -4823,12 +5073,15 @@ recheck:
param->sched_priority > rlim_rtprio)
return -EPERM;
}
+
/*
- * Like positive nice levels, dont allow tasks to
- * move out of SCHED_IDLE either:
+ * Treat SCHED_IDLE as nice 20. Only allow a switch to
+ * SCHED_NORMAL if the RLIMIT_NICE would normally permit it.
*/
- if (p->policy == SCHED_IDLE && policy != SCHED_IDLE)
- return -EPERM;
+ if (p->policy == SCHED_IDLE && policy != SCHED_IDLE) {
+ if (!can_nice(p, TASK_NICE(p)))
+ return -EPERM;
+ }
/* can't change other user's priorities */
if (!check_same_owner(p))
@@ -4848,21 +5101,29 @@ recheck:
/*
* make sure no PI-waiters arrive (or leave) while we are
* changing the priority of the task:
- */
- raw_spin_lock_irqsave(&p->pi_lock, flags);
- /*
- * To be able to change p->policy safely, the apropriate
+ *
+ * To be able to change p->policy safely, the appropriate
* runqueue lock must be held.
*/
- rq = __task_rq_lock(p);
+ rq = task_rq_lock(p, &flags);
/*
* Changing the policy of the stop threads its a very bad idea
*/
if (p == rq->stop) {
+ task_rq_unlock(rq, p, &flags);
+ return -EINVAL;
+ }
+
+ /*
+ * If not changing anything there's no need to proceed further:
+ */
+ if (unlikely(policy == p->policy && (!rt_policy(policy) ||
+ param->sched_priority == p->rt_priority))) {
+
__task_rq_unlock(rq);
raw_spin_unlock_irqrestore(&p->pi_lock, flags);
- return -EINVAL;
+ return 0;
}
#ifdef CONFIG_RT_GROUP_SCHED
@@ -4874,8 +5135,7 @@ recheck:
if (rt_bandwidth_enabled() && rt_policy(policy) &&
task_group(p)->rt_bandwidth.rt_runtime == 0 &&
!task_group_is_autogroup(task_group(p))) {
- __task_rq_unlock(rq);
- raw_spin_unlock_irqrestore(&p->pi_lock, flags);
+ task_rq_unlock(rq, p, &flags);
return -EPERM;
}
}
@@ -4884,11 +5144,10 @@ recheck:
/* recheck policy now with rq lock held */
if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) {
policy = oldpolicy = -1;
- __task_rq_unlock(rq);
- raw_spin_unlock_irqrestore(&p->pi_lock, flags);
+ task_rq_unlock(rq, p, &flags);
goto recheck;
}
- on_rq = p->se.on_rq;
+ on_rq = p->on_rq;
running = task_current(rq, p);
if (on_rq)
deactivate_task(rq, p, 0);
@@ -4903,13 +5162,11 @@ recheck:
if (running)
p->sched_class->set_curr_task(rq);
- if (on_rq) {
+ if (on_rq)
activate_task(rq, p, 0);
- check_class_changed(rq, p, prev_class, oldprio, running);
- }
- __task_rq_unlock(rq);
- raw_spin_unlock_irqrestore(&p->pi_lock, flags);
+ check_class_changed(rq, p, prev_class, oldprio);
+ task_rq_unlock(rq, p, &flags);
rt_mutex_adjust_pi(p);
@@ -5089,7 +5346,7 @@ long sched_setaffinity(pid_t pid, const struct cpumask *in_mask)
goto out_free_cpus_allowed;
}
retval = -EPERM;
- if (!check_same_owner(p) && !capable(CAP_SYS_NICE))
+ if (!check_same_owner(p) && !task_ns_capable(p, CAP_SYS_NICE))
goto out_unlock;
retval = security_task_setscheduler(p);
@@ -5160,7 +5417,6 @@ long sched_getaffinity(pid_t pid, struct cpumask *mask)
{
struct task_struct *p;
unsigned long flags;
- struct rq *rq;
int retval;
get_online_cpus();
@@ -5175,9 +5431,9 @@ long sched_getaffinity(pid_t pid, struct cpumask *mask)
if (retval)
goto out_unlock;
- rq = task_rq_lock(p, &flags);
+ raw_spin_lock_irqsave(&p->pi_lock, flags);
cpumask_and(mask, &p->cpus_allowed, cpu_online_mask);
- task_rq_unlock(rq, &flags);
+ raw_spin_unlock_irqrestore(&p->pi_lock, flags);
out_unlock:
rcu_read_unlock();
@@ -5324,6 +5580,67 @@ void __sched yield(void)
}
EXPORT_SYMBOL(yield);
+/**
+ * yield_to - yield the current processor to another thread in
+ * your thread group, or accelerate that thread toward the
+ * processor it's on.
+ * @p: target task
+ * @preempt: whether task preemption is allowed or not
+ *
+ * It's the caller's job to ensure that the target task struct
+ * can't go away on us before we can do any checks.
+ *
+ * Returns true if we indeed boosted the target task.
+ */
+bool __sched yield_to(struct task_struct *p, bool preempt)
+{
+ struct task_struct *curr = current;
+ struct rq *rq, *p_rq;
+ unsigned long flags;
+ bool yielded = 0;
+
+ local_irq_save(flags);
+ rq = this_rq();
+
+again:
+ p_rq = task_rq(p);
+ double_rq_lock(rq, p_rq);
+ while (task_rq(p) != p_rq) {
+ double_rq_unlock(rq, p_rq);
+ goto again;
+ }
+
+ if (!curr->sched_class->yield_to_task)
+ goto out;
+
+ if (curr->sched_class != p->sched_class)
+ goto out;
+
+ if (task_running(p_rq, p) || p->state)
+ goto out;
+
+ yielded = curr->sched_class->yield_to_task(rq, p, preempt);
+ if (yielded) {
+ schedstat_inc(rq, yld_count);
+ /*
+ * Make p's CPU reschedule; pick_next_entity takes care of
+ * fairness.
+ */
+ if (preempt && rq != p_rq)
+ resched_task(p_rq->curr);
+ }
+
+out:
+ double_rq_unlock(rq, p_rq);
+ local_irq_restore(flags);
+
+ if (yielded)
+ schedule();
+
+ return yielded;
+}
+EXPORT_SYMBOL_GPL(yield_to);
+
/*
* This task is about to go to sleep on IO. Increment rq->nr_iowait so
* that process accounting knows that this is a task in IO wait state.
@@ -5334,6 +5651,7 @@ void __sched io_schedule(void)
delayacct_blkio_start();
atomic_inc(&rq->nr_iowait);
+ blk_flush_plug(current);
current->in_iowait = 1;
schedule();
current->in_iowait = 0;
@@ -5349,6 +5667,7 @@ long __sched io_schedule_timeout(long timeout)
delayacct_blkio_start();
atomic_inc(&rq->nr_iowait);
+ blk_flush_plug(current);
current->in_iowait = 1;
ret = schedule_timeout(timeout);
current->in_iowait = 0;
@@ -5439,7 +5758,7 @@ SYSCALL_DEFINE2(sched_rr_get_interval, pid_t, pid,
rq = task_rq_lock(p, &flags);
time_slice = p->sched_class->get_rr_interval(rq, p);
- task_rq_unlock(rq, &flags);
+ task_rq_unlock(rq, p, &flags);
rcu_read_unlock();
jiffies_to_timespec(time_slice, &t);
@@ -5497,7 +5816,7 @@ void show_state_filter(unsigned long state_filter)
do_each_thread(g, p) {
/*
* reset the NMI-timeout, listing all files on a slow
- * console might take alot of time:
+ * console might take a lot of time:
*/
touch_nmi_watchdog();
if (!state_filter || (p->state & state_filter))
@@ -5541,7 +5860,7 @@ void __cpuinit init_idle(struct task_struct *idle, int cpu)
idle->state = TASK_RUNNING;
idle->se.exec_start = sched_clock();
- cpumask_copy(&idle->cpus_allowed, cpumask_of(cpu));
+ do_set_cpus_allowed(idle, cpumask_of(cpu));
/*
* We're having a chicken and egg problem, even though we are
* holding rq->lock, the cpu isn't yet set to this cpu so the
@@ -5557,22 +5876,19 @@ void __cpuinit init_idle(struct task_struct *idle, int cpu)
rcu_read_unlock();
rq->curr = rq->idle = idle;
-#if defined(CONFIG_SMP) && defined(__ARCH_WANT_UNLOCKED_CTXSW)
- idle->oncpu = 1;
+#if defined(CONFIG_SMP)
+ idle->on_cpu = 1;
#endif
raw_spin_unlock_irqrestore(&rq->lock, flags);
/* Set the preempt count _outside_ the spinlocks! */
-#if defined(CONFIG_PREEMPT)
- task_thread_info(idle)->preempt_count = (idle->lock_depth >= 0);
-#else
task_thread_info(idle)->preempt_count = 0;
-#endif
+
/*
* The idle tasks have their own, simple scheduling class:
*/
idle->sched_class = &idle_sched_class;
- ftrace_graph_init_task(idle);
+ ftrace_graph_init_idle_task(idle, cpu);
}
/*
@@ -5632,6 +5948,16 @@ static inline void sched_init_granularity(void)
}
#ifdef CONFIG_SMP
+void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask)
+{
+ if (p->sched_class && p->sched_class->set_cpus_allowed)
+ p->sched_class->set_cpus_allowed(p, new_mask);
+ else {
+ cpumask_copy(&p->cpus_allowed, new_mask);
+ p->rt.nr_cpus_allowed = cpumask_weight(new_mask);
+ }
+}
+
/*
* This is how migration works:
*
@@ -5662,52 +5988,38 @@ int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask)
unsigned int dest_cpu;
int ret = 0;
- /*
- * Serialize against TASK_WAKING so that ttwu() and wunt() can
- * drop the rq->lock and still rely on ->cpus_allowed.
- */
-again:
- while (task_is_waking(p))
- cpu_relax();
rq = task_rq_lock(p, &flags);
- if (task_is_waking(p)) {
- task_rq_unlock(rq, &flags);
- goto again;
- }
+
+ if (cpumask_equal(&p->cpus_allowed, new_mask))
+ goto out;
if (!cpumask_intersects(new_mask, cpu_active_mask)) {
ret = -EINVAL;
goto out;
}
- if (unlikely((p->flags & PF_THREAD_BOUND) && p != current &&
- !cpumask_equal(&p->cpus_allowed, new_mask))) {
+ if (unlikely((p->flags & PF_THREAD_BOUND) && p != current)) {
ret = -EINVAL;
goto out;
}
- if (p->sched_class->set_cpus_allowed)
- p->sched_class->set_cpus_allowed(p, new_mask);
- else {
- cpumask_copy(&p->cpus_allowed, new_mask);
- p->rt.nr_cpus_allowed = cpumask_weight(new_mask);
- }
+ do_set_cpus_allowed(p, new_mask);
/* Can the task run on the task's current CPU? If so, we're done */
if (cpumask_test_cpu(task_cpu(p), new_mask))
goto out;
dest_cpu = cpumask_any_and(cpu_active_mask, new_mask);
- if (migrate_task(p, rq)) {
+ if (p->on_rq) {
struct migration_arg arg = { p, dest_cpu };
/* Need help from migration thread: drop lock and wait. */
- task_rq_unlock(rq, &flags);
+ task_rq_unlock(rq, p, &flags);
stop_one_cpu(cpu_of(rq), migration_cpu_stop, &arg);
tlb_migrate_finish(p->mm);
return 0;
}
out:
- task_rq_unlock(rq, &flags);
+ task_rq_unlock(rq, p, &flags);
return ret;
}
@@ -5735,6 +6047,7 @@ static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu)
rq_src = cpu_rq(src_cpu);
rq_dest = cpu_rq(dest_cpu);
+ raw_spin_lock(&p->pi_lock);
double_rq_lock(rq_src, rq_dest);
/* Already moved. */
if (task_cpu(p) != src_cpu)
@@ -5747,7 +6060,7 @@ static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu)
* If we're not on a rq, the next wake-up will ensure we're
* placed properly.
*/
- if (p->se.on_rq) {
+ if (p->on_rq) {
deactivate_task(rq_src, p, 0);
set_task_cpu(p, dest_cpu);
activate_task(rq_dest, p, 0);
@@ -5757,6 +6070,7 @@ done:
ret = 1;
fail:
double_rq_unlock(rq_src, rq_dest);
+ raw_spin_unlock(&p->pi_lock);
return ret;
}
@@ -6097,6 +6411,7 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
#ifdef CONFIG_HOTPLUG_CPU
case CPU_DYING:
+ sched_ttwu_pending();
/* Update our root-domain */
raw_spin_lock_irqsave(&rq->lock, flags);
if (rq->rd) {
@@ -6112,6 +6427,9 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
break;
#endif
}
+
+ update_max_interval();
+
return NOTIFY_OK;
}
@@ -6172,6 +6490,8 @@ early_initcall(migration_init);
#ifdef CONFIG_SMP
+static cpumask_var_t sched_domains_tmpmask; /* sched_domains_mutex */
+
#ifdef CONFIG_SCHED_DEBUG
static __read_mostly int sched_domain_debug_enabled;
@@ -6246,7 +6566,7 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
cpulist_scnprintf(str, sizeof(str), sched_group_cpus(group));
printk(KERN_CONT " %s", str);
- if (group->cpu_power != SCHED_LOAD_SCALE) {
+ if (group->cpu_power != SCHED_POWER_SCALE) {
printk(KERN_CONT " (cpu_power = %d)",
group->cpu_power);
}
@@ -6267,7 +6587,6 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
static void sched_domain_debug(struct sched_domain *sd, int cpu)
{
- cpumask_var_t groupmask;
int level = 0;
if (!sched_domain_debug_enabled)
@@ -6280,20 +6599,14 @@ static void sched_domain_debug(struct sched_domain *sd, int cpu)
printk(KERN_DEBUG "CPU%d attaching sched-domain:\n", cpu);
- if (!alloc_cpumask_var(&groupmask, GFP_KERNEL)) {
- printk(KERN_DEBUG "Cannot load-balance (out of memory)\n");
- return;
- }
-
for (;;) {
- if (sched_domain_debug_one(sd, cpu, level, groupmask))
+ if (sched_domain_debug_one(sd, cpu, level, sched_domains_tmpmask))
break;
level++;
sd = sd->parent;
if (!sd)
break;
}
- free_cpumask_var(groupmask);
}
#else /* !CONFIG_SCHED_DEBUG */
# define sched_domain_debug(sd, cpu) do { } while (0)
@@ -6350,12 +6663,11 @@ sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent)
return 1;
}
-static void free_rootdomain(struct root_domain *rd)
+static void free_rootdomain(struct rcu_head *rcu)
{
- synchronize_sched();
+ struct root_domain *rd = container_of(rcu, struct root_domain, rcu);
cpupri_cleanup(&rd->cpupri);
-
free_cpumask_var(rd->rto_mask);
free_cpumask_var(rd->online);
free_cpumask_var(rd->span);
@@ -6396,7 +6708,7 @@ static void rq_attach_root(struct rq *rq, struct root_domain *rd)
raw_spin_unlock_irqrestore(&rq->lock, flags);
if (old_rd)
- free_rootdomain(old_rd);
+ call_rcu_sched(&old_rd->rcu, free_rootdomain);
}
static int init_rootdomain(struct root_domain *rd)
@@ -6447,6 +6759,25 @@ static struct root_domain *alloc_rootdomain(void)
return rd;
}
+static void free_sched_domain(struct rcu_head *rcu)
+{
+ struct sched_domain *sd = container_of(rcu, struct sched_domain, rcu);
+ if (atomic_dec_and_test(&sd->groups->ref))
+ kfree(sd->groups);
+ kfree(sd);
+}
+
+static void destroy_sched_domain(struct sched_domain *sd, int cpu)
+{
+ call_rcu(&sd->rcu, free_sched_domain);
+}
+
+static void destroy_sched_domains(struct sched_domain *sd, int cpu)
+{
+ for (; sd; sd = sd->parent)
+ destroy_sched_domain(sd, cpu);
+}
+
/*
* Attach the domain 'sd' to 'cpu' as its base domain. Callers must
* hold the hotplug lock.
@@ -6457,9 +6788,6 @@ cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu)
struct rq *rq = cpu_rq(cpu);
struct sched_domain *tmp;
- for (tmp = sd; tmp; tmp = tmp->parent)
- tmp->span_weight = cpumask_weight(sched_domain_span(tmp));
-
/* Remove the sched domains which do not contribute to scheduling. */
for (tmp = sd; tmp; ) {
struct sched_domain *parent = tmp->parent;
@@ -6470,12 +6798,15 @@ cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu)
tmp->parent = parent->parent;
if (parent->parent)
parent->parent->child = tmp;
+ destroy_sched_domain(parent, cpu);
} else
tmp = tmp->parent;
}
if (sd && sd_degenerate(sd)) {
+ tmp = sd;
sd = sd->parent;
+ destroy_sched_domain(tmp, cpu);
if (sd)
sd->child = NULL;
}
@@ -6483,7 +6814,9 @@ cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu)
sched_domain_debug(sd, cpu);
rq_attach_root(rq, rd);
+ tmp = rq->sd;
rcu_assign_pointer(rq->sd, sd);
+ destroy_sched_domains(tmp, cpu);
}
/* cpus with isolated domains */
@@ -6499,56 +6832,6 @@ static int __init isolated_cpu_setup(char *str)
__setup("isolcpus=", isolated_cpu_setup);
-/*
- * init_sched_build_groups takes the cpumask we wish to span, and a pointer
- * to a function which identifies what group(along with sched group) a CPU
- * belongs to. The return value of group_fn must be a >= 0 and < nr_cpu_ids
- * (due to the fact that we keep track of groups covered with a struct cpumask).
- *
- * init_sched_build_groups will build a circular linked list of the groups
- * covered by the given span, and will set each group's ->cpumask correctly,
- * and ->cpu_power to 0.
- */
-static void
-init_sched_build_groups(const struct cpumask *span,
- const struct cpumask *cpu_map,
- int (*group_fn)(int cpu, const struct cpumask *cpu_map,
- struct sched_group **sg,
- struct cpumask *tmpmask),
- struct cpumask *covered, struct cpumask *tmpmask)
-{
- struct sched_group *first = NULL, *last = NULL;
- int i;
-
- cpumask_clear(covered);
-
- for_each_cpu(i, span) {
- struct sched_group *sg;
- int group = group_fn(i, cpu_map, &sg, tmpmask);
- int j;
-
- if (cpumask_test_cpu(i, covered))
- continue;
-
- cpumask_clear(sched_group_cpus(sg));
- sg->cpu_power = 0;
-
- for_each_cpu(j, span) {
- if (group_fn(j, cpu_map, NULL, tmpmask) != group)
- continue;
-
- cpumask_set_cpu(j, covered);
- cpumask_set_cpu(j, sched_group_cpus(sg));
- }
- if (!first)
- first = sg;
- if (last)
- last->next = sg;
- last = sg;
- }
- last->next = first;
-}
-
#define SD_NODES_PER_DOMAIN 16
#ifdef CONFIG_NUMA
@@ -6565,7 +6848,7 @@ init_sched_build_groups(const struct cpumask *span,
*/
static int find_next_best_node(int node, nodemask_t *used_nodes)
{
- int i, n, val, min_val, best_node = 0;
+ int i, n, val, min_val, best_node = -1;
min_val = INT_MAX;
@@ -6589,7 +6872,8 @@ static int find_next_best_node(int node, nodemask_t *used_nodes)
}
}
- node_set(best_node, *used_nodes);
+ if (best_node != -1)
+ node_set(best_node, *used_nodes);
return best_node;
}
@@ -6615,315 +6899,130 @@ static void sched_domain_node_span(int node, struct cpumask *span)
for (i = 1; i < SD_NODES_PER_DOMAIN; i++) {
int next_node = find_next_best_node(node, &used_nodes);
-
+ if (next_node < 0)
+ break;
cpumask_or(span, span, cpumask_of_node(next_node));
}
}
+
+static const struct cpumask *cpu_node_mask(int cpu)
+{
+ lockdep_assert_held(&sched_domains_mutex);
+
+ sched_domain_node_span(cpu_to_node(cpu), sched_domains_tmpmask);
+
+ return sched_domains_tmpmask;
+}
+
+static const struct cpumask *cpu_allnodes_mask(int cpu)
+{
+ return cpu_possible_mask;
+}
#endif /* CONFIG_NUMA */
-int sched_smt_power_savings = 0, sched_mc_power_savings = 0;
+static const struct cpumask *cpu_cpu_mask(int cpu)
+{
+ return cpumask_of_node(cpu_to_node(cpu));
+}
-/*
- * The cpus mask in sched_group and sched_domain hangs off the end.
- *
- * ( See the the comments in include/linux/sched.h:struct sched_group
- * and struct sched_domain. )
- */
-struct static_sched_group {
- struct sched_group sg;
- DECLARE_BITMAP(cpus, CONFIG_NR_CPUS);
-};
+int sched_smt_power_savings = 0, sched_mc_power_savings = 0;
-struct static_sched_domain {
- struct sched_domain sd;
- DECLARE_BITMAP(span, CONFIG_NR_CPUS);
+struct sd_data {
+ struct sched_domain **__percpu sd;
+ struct sched_group **__percpu sg;
};
struct s_data {
-#ifdef CONFIG_NUMA
- int sd_allnodes;
- cpumask_var_t domainspan;
- cpumask_var_t covered;
- cpumask_var_t notcovered;
-#endif
- cpumask_var_t nodemask;
- cpumask_var_t this_sibling_map;
- cpumask_var_t this_core_map;
- cpumask_var_t this_book_map;
- cpumask_var_t send_covered;
- cpumask_var_t tmpmask;
- struct sched_group **sched_group_nodes;
+ struct sched_domain ** __percpu sd;
struct root_domain *rd;
};
enum s_alloc {
- sa_sched_groups = 0,
sa_rootdomain,
- sa_tmpmask,
- sa_send_covered,
- sa_this_book_map,
- sa_this_core_map,
- sa_this_sibling_map,
- sa_nodemask,
- sa_sched_group_nodes,
-#ifdef CONFIG_NUMA
- sa_notcovered,
- sa_covered,
- sa_domainspan,
-#endif
+ sa_sd,
+ sa_sd_storage,
sa_none,
};
-/*
- * SMT sched-domains:
- */
-#ifdef CONFIG_SCHED_SMT
-static DEFINE_PER_CPU(struct static_sched_domain, cpu_domains);
-static DEFINE_PER_CPU(struct static_sched_group, sched_groups);
+struct sched_domain_topology_level;
-static int
-cpu_to_cpu_group(int cpu, const struct cpumask *cpu_map,
- struct sched_group **sg, struct cpumask *unused)
-{
- if (sg)
- *sg = &per_cpu(sched_groups, cpu).sg;
- return cpu;
-}
-#endif /* CONFIG_SCHED_SMT */
+typedef struct sched_domain *(*sched_domain_init_f)(struct sched_domain_topology_level *tl, int cpu);
+typedef const struct cpumask *(*sched_domain_mask_f)(int cpu);
-/*
- * multi-core sched-domains:
- */
-#ifdef CONFIG_SCHED_MC
-static DEFINE_PER_CPU(struct static_sched_domain, core_domains);
-static DEFINE_PER_CPU(struct static_sched_group, sched_group_core);
-
-static int
-cpu_to_core_group(int cpu, const struct cpumask *cpu_map,
- struct sched_group **sg, struct cpumask *mask)
-{
- int group;
-#ifdef CONFIG_SCHED_SMT
- cpumask_and(mask, topology_thread_cpumask(cpu), cpu_map);
- group = cpumask_first(mask);
-#else
- group = cpu;
-#endif
- if (sg)
- *sg = &per_cpu(sched_group_core, group).sg;
- return group;
-}
-#endif /* CONFIG_SCHED_MC */
+struct sched_domain_topology_level {
+ sched_domain_init_f init;
+ sched_domain_mask_f mask;
+ struct sd_data data;
+};
/*
- * book sched-domains:
+ * Assumes the sched_domain tree is fully constructed
*/
-#ifdef CONFIG_SCHED_BOOK
-static DEFINE_PER_CPU(struct static_sched_domain, book_domains);
-static DEFINE_PER_CPU(struct static_sched_group, sched_group_book);
-
-static int
-cpu_to_book_group(int cpu, const struct cpumask *cpu_map,
- struct sched_group **sg, struct cpumask *mask)
+static int get_group(int cpu, struct sd_data *sdd, struct sched_group **sg)
{
- int group = cpu;
-#ifdef CONFIG_SCHED_MC
- cpumask_and(mask, cpu_coregroup_mask(cpu), cpu_map);
- group = cpumask_first(mask);
-#elif defined(CONFIG_SCHED_SMT)
- cpumask_and(mask, topology_thread_cpumask(cpu), cpu_map);
- group = cpumask_first(mask);
-#endif
- if (sg)
- *sg = &per_cpu(sched_group_book, group).sg;
- return group;
-}
-#endif /* CONFIG_SCHED_BOOK */
+ struct sched_domain *sd = *per_cpu_ptr(sdd->sd, cpu);
+ struct sched_domain *child = sd->child;
-static DEFINE_PER_CPU(struct static_sched_domain, phys_domains);
-static DEFINE_PER_CPU(struct static_sched_group, sched_group_phys);
+ if (child)
+ cpu = cpumask_first(sched_domain_span(child));
-static int
-cpu_to_phys_group(int cpu, const struct cpumask *cpu_map,
- struct sched_group **sg, struct cpumask *mask)
-{
- int group;
-#ifdef CONFIG_SCHED_BOOK
- cpumask_and(mask, cpu_book_mask(cpu), cpu_map);
- group = cpumask_first(mask);
-#elif defined(CONFIG_SCHED_MC)
- cpumask_and(mask, cpu_coregroup_mask(cpu), cpu_map);
- group = cpumask_first(mask);
-#elif defined(CONFIG_SCHED_SMT)
- cpumask_and(mask, topology_thread_cpumask(cpu), cpu_map);
- group = cpumask_first(mask);
-#else
- group = cpu;
-#endif
if (sg)
- *sg = &per_cpu(sched_group_phys, group).sg;
- return group;
+ *sg = *per_cpu_ptr(sdd->sg, cpu);
+
+ return cpu;
}
-#ifdef CONFIG_NUMA
/*
- * The init_sched_build_groups can't handle what we want to do with node
- * groups, so roll our own. Now each node has its own list of groups which
- * gets dynamically allocated.
+ * build_sched_groups takes the cpumask we wish to span, and a pointer
+ * to a function which identifies what group(along with sched group) a CPU
+ * belongs to. The return value of group_fn must be a >= 0 and < nr_cpu_ids
+ * (due to the fact that we keep track of groups covered with a struct cpumask).
+ *
+ * build_sched_groups will build a circular linked list of the groups
+ * covered by the given span, and will set each group's ->cpumask correctly,
+ * and ->cpu_power to 0.
*/
-static DEFINE_PER_CPU(struct static_sched_domain, node_domains);
-static struct sched_group ***sched_group_nodes_bycpu;
-
-static DEFINE_PER_CPU(struct static_sched_domain, allnodes_domains);
-static DEFINE_PER_CPU(struct static_sched_group, sched_group_allnodes);
-
-static int cpu_to_allnodes_group(int cpu, const struct cpumask *cpu_map,
- struct sched_group **sg,
- struct cpumask *nodemask)
-{
- int group;
-
- cpumask_and(nodemask, cpumask_of_node(cpu_to_node(cpu)), cpu_map);
- group = cpumask_first(nodemask);
-
- if (sg)
- *sg = &per_cpu(sched_group_allnodes, group).sg;
- return group;
-}
-
-static void init_numa_sched_groups_power(struct sched_group *group_head)
-{
- struct sched_group *sg = group_head;
- int j;
-
- if (!sg)
- return;
- do {
- for_each_cpu(j, sched_group_cpus(sg)) {
- struct sched_domain *sd;
-
- sd = &per_cpu(phys_domains, j).sd;
- if (j != group_first_cpu(sd->groups)) {
- /*
- * Only add "power" once for each
- * physical package.
- */
- continue;
- }
-
- sg->cpu_power += sd->groups->cpu_power;
- }
- sg = sg->next;
- } while (sg != group_head);
-}
-
-static int build_numa_sched_groups(struct s_data *d,
- const struct cpumask *cpu_map, int num)
+static void
+build_sched_groups(struct sched_domain *sd)
{
- struct sched_domain *sd;
- struct sched_group *sg, *prev;
- int n, j;
-
- cpumask_clear(d->covered);
- cpumask_and(d->nodemask, cpumask_of_node(num), cpu_map);
- if (cpumask_empty(d->nodemask)) {
- d->sched_group_nodes[num] = NULL;
- goto out;
- }
-
- sched_domain_node_span(num, d->domainspan);
- cpumask_and(d->domainspan, d->domainspan, cpu_map);
-
- sg = kmalloc_node(sizeof(struct sched_group) + cpumask_size(),
- GFP_KERNEL, num);
- if (!sg) {
- printk(KERN_WARNING "Can not alloc domain group for node %d\n",
- num);
- return -ENOMEM;
- }
- d->sched_group_nodes[num] = sg;
-
- for_each_cpu(j, d->nodemask) {
- sd = &per_cpu(node_domains, j).sd;
- sd->groups = sg;
- }
-
- sg->cpu_power = 0;
- cpumask_copy(sched_group_cpus(sg), d->nodemask);
- sg->next = sg;
- cpumask_or(d->covered, d->covered, d->nodemask);
+ struct sched_group *first = NULL, *last = NULL;
+ struct sd_data *sdd = sd->private;
+ const struct cpumask *span = sched_domain_span(sd);
+ struct cpumask *covered;
+ int i;
- prev = sg;
- for (j = 0; j < nr_node_ids; j++) {
- n = (num + j) % nr_node_ids;
- cpumask_complement(d->notcovered, d->covered);
- cpumask_and(d->tmpmask, d->notcovered, cpu_map);
- cpumask_and(d->tmpmask, d->tmpmask, d->domainspan);
- if (cpumask_empty(d->tmpmask))
- break;
- cpumask_and(d->tmpmask, d->tmpmask, cpumask_of_node(n));
- if (cpumask_empty(d->tmpmask))
- continue;
- sg = kmalloc_node(sizeof(struct sched_group) + cpumask_size(),
- GFP_KERNEL, num);
- if (!sg) {
- printk(KERN_WARNING
- "Can not alloc domain group for node %d\n", j);
- return -ENOMEM;
- }
- sg->cpu_power = 0;
- cpumask_copy(sched_group_cpus(sg), d->tmpmask);
- sg->next = prev->next;
- cpumask_or(d->covered, d->covered, d->tmpmask);
- prev->next = sg;
- prev = sg;
- }
-out:
- return 0;
-}
-#endif /* CONFIG_NUMA */
+ lockdep_assert_held(&sched_domains_mutex);
+ covered = sched_domains_tmpmask;
-#ifdef CONFIG_NUMA
-/* Free memory allocated for various sched_group structures */
-static void free_sched_groups(const struct cpumask *cpu_map,
- struct cpumask *nodemask)
-{
- int cpu, i;
+ cpumask_clear(covered);
- for_each_cpu(cpu, cpu_map) {
- struct sched_group **sched_group_nodes
- = sched_group_nodes_bycpu[cpu];
+ for_each_cpu(i, span) {
+ struct sched_group *sg;
+ int group = get_group(i, sdd, &sg);
+ int j;
- if (!sched_group_nodes)
+ if (cpumask_test_cpu(i, covered))
continue;
- for (i = 0; i < nr_node_ids; i++) {
- struct sched_group *oldsg, *sg = sched_group_nodes[i];
+ cpumask_clear(sched_group_cpus(sg));
+ sg->cpu_power = 0;
- cpumask_and(nodemask, cpumask_of_node(i), cpu_map);
- if (cpumask_empty(nodemask))
+ for_each_cpu(j, span) {
+ if (get_group(j, sdd, NULL) != group)
continue;
- if (sg == NULL)
- continue;
- sg = sg->next;
-next_sg:
- oldsg = sg;
- sg = sg->next;
- kfree(oldsg);
- if (oldsg != sched_group_nodes[i])
- goto next_sg;
+ cpumask_set_cpu(j, covered);
+ cpumask_set_cpu(j, sched_group_cpus(sg));
}
- kfree(sched_group_nodes);
- sched_group_nodes_bycpu[cpu] = NULL;
+
+ if (!first)
+ first = sg;
+ if (last)
+ last->next = sg;
+ last = sg;
}
+ last->next = first;
}
-#else /* !CONFIG_NUMA */
-static void free_sched_groups(const struct cpumask *cpu_map,
- struct cpumask *nodemask)
-{
-}
-#endif /* CONFIG_NUMA */
/*
* Initialize sched groups cpu_power.
@@ -6937,11 +7036,6 @@ static void free_sched_groups(const struct cpumask *cpu_map,
*/
static void init_sched_groups_power(int cpu, struct sched_domain *sd)
{
- struct sched_domain *child;
- struct sched_group *group;
- long power;
- int weight;
-
WARN_ON(!sd || !sd->groups);
if (cpu != group_first_cpu(sd->groups))
@@ -6949,36 +7043,7 @@ static void init_sched_groups_power(int cpu, struct sched_domain *sd)
sd->groups->group_weight = cpumask_weight(sched_group_cpus(sd->groups));
- child = sd->child;
-
- sd->groups->cpu_power = 0;
-
- if (!child) {
- power = SCHED_LOAD_SCALE;
- weight = cpumask_weight(sched_domain_span(sd));
- /*
- * SMT siblings share the power of a single core.
- * Usually multiple threads get a better yield out of
- * that one core than a single thread would have,
- * reflect that in sd->smt_gain.
- */
- if ((sd->flags & SD_SHARE_CPUPOWER) && weight > 1) {
- power *= sd->smt_gain;
- power /= weight;
- power >>= SCHED_LOAD_SHIFT;
- }
- sd->groups->cpu_power += power;
- return;
- }
-
- /*
- * Add cpu_power of each child group to this groups cpu_power.
- */
- group = child->groups;
- do {
- sd->groups->cpu_power += group->cpu_power;
- group = group->next;
- } while (group != child->groups);
+ update_group_power(sd, cpu);
}
/*
@@ -6992,15 +7057,15 @@ static void init_sched_groups_power(int cpu, struct sched_domain *sd)
# define SD_INIT_NAME(sd, type) do { } while (0)
#endif
-#define SD_INIT(sd, type) sd_init_##type(sd)
-
-#define SD_INIT_FUNC(type) \
-static noinline void sd_init_##type(struct sched_domain *sd) \
-{ \
- memset(sd, 0, sizeof(*sd)); \
- *sd = SD_##type##_INIT; \
- sd->level = SD_LV_##type; \
- SD_INIT_NAME(sd, type); \
+#define SD_INIT_FUNC(type) \
+static noinline struct sched_domain * \
+sd_init_##type(struct sched_domain_topology_level *tl, int cpu) \
+{ \
+ struct sched_domain *sd = *per_cpu_ptr(tl->data.sd, cpu); \
+ *sd = SD_##type##_INIT; \
+ SD_INIT_NAME(sd, type); \
+ sd->private = &tl->data; \
+ return sd; \
}
SD_INIT_FUNC(CPU)
@@ -7019,13 +7084,14 @@ SD_INIT_FUNC(CPU)
#endif
static int default_relax_domain_level = -1;
+int sched_domain_level_max;
static int __init setup_relax_domain_level(char *str)
{
unsigned long val;
val = simple_strtoul(str, NULL, 0);
- if (val < SD_LV_MAX)
+ if (val < sched_domain_level_max)
default_relax_domain_level = val;
return 1;
@@ -7053,37 +7119,20 @@ static void set_domain_attribute(struct sched_domain *sd,
}
}
+static void __sdt_free(const struct cpumask *cpu_map);
+static int __sdt_alloc(const struct cpumask *cpu_map);
+
static void __free_domain_allocs(struct s_data *d, enum s_alloc what,
const struct cpumask *cpu_map)
{
switch (what) {
- case sa_sched_groups:
- free_sched_groups(cpu_map, d->tmpmask); /* fall through */
- d->sched_group_nodes = NULL;
case sa_rootdomain:
- free_rootdomain(d->rd); /* fall through */
- case sa_tmpmask:
- free_cpumask_var(d->tmpmask); /* fall through */
- case sa_send_covered:
- free_cpumask_var(d->send_covered); /* fall through */
- case sa_this_book_map:
- free_cpumask_var(d->this_book_map); /* fall through */
- case sa_this_core_map:
- free_cpumask_var(d->this_core_map); /* fall through */
- case sa_this_sibling_map:
- free_cpumask_var(d->this_sibling_map); /* fall through */
- case sa_nodemask:
- free_cpumask_var(d->nodemask); /* fall through */
- case sa_sched_group_nodes:
-#ifdef CONFIG_NUMA
- kfree(d->sched_group_nodes); /* fall through */
- case sa_notcovered:
- free_cpumask_var(d->notcovered); /* fall through */
- case sa_covered:
- free_cpumask_var(d->covered); /* fall through */
- case sa_domainspan:
- free_cpumask_var(d->domainspan); /* fall through */
-#endif
+ if (!atomic_read(&d->rd->refcount))
+ free_rootdomain(&d->rd->rcu); /* fall through */
+ case sa_sd:
+ free_percpu(d->sd); /* fall through */
+ case sa_sd_storage:
+ __sdt_free(cpu_map); /* fall through */
case sa_none:
break;
}
@@ -7092,308 +7141,212 @@ static void __free_domain_allocs(struct s_data *d, enum s_alloc what,
static enum s_alloc __visit_domain_allocation_hell(struct s_data *d,
const struct cpumask *cpu_map)
{
-#ifdef CONFIG_NUMA
- if (!alloc_cpumask_var(&d->domainspan, GFP_KERNEL))
- return sa_none;
- if (!alloc_cpumask_var(&d->covered, GFP_KERNEL))
- return sa_domainspan;
- if (!alloc_cpumask_var(&d->notcovered, GFP_KERNEL))
- return sa_covered;
- /* Allocate the per-node list of sched groups */
- d->sched_group_nodes = kcalloc(nr_node_ids,
- sizeof(struct sched_group *), GFP_KERNEL);
- if (!d->sched_group_nodes) {
- printk(KERN_WARNING "Can not alloc sched group node list\n");
- return sa_notcovered;
- }
- sched_group_nodes_bycpu[cpumask_first(cpu_map)] = d->sched_group_nodes;
-#endif
- if (!alloc_cpumask_var(&d->nodemask, GFP_KERNEL))
- return sa_sched_group_nodes;
- if (!alloc_cpumask_var(&d->this_sibling_map, GFP_KERNEL))
- return sa_nodemask;
- if (!alloc_cpumask_var(&d->this_core_map, GFP_KERNEL))
- return sa_this_sibling_map;
- if (!alloc_cpumask_var(&d->this_book_map, GFP_KERNEL))
- return sa_this_core_map;
- if (!alloc_cpumask_var(&d->send_covered, GFP_KERNEL))
- return sa_this_book_map;
- if (!alloc_cpumask_var(&d->tmpmask, GFP_KERNEL))
- return sa_send_covered;
+ memset(d, 0, sizeof(*d));
+
+ if (__sdt_alloc(cpu_map))
+ return sa_sd_storage;
+ d->sd = alloc_percpu(struct sched_domain *);
+ if (!d->sd)
+ return sa_sd_storage;
d->rd = alloc_rootdomain();
- if (!d->rd) {
- printk(KERN_WARNING "Cannot alloc root domain\n");
- return sa_tmpmask;
- }
+ if (!d->rd)
+ return sa_sd;
return sa_rootdomain;
}
-static struct sched_domain *__build_numa_sched_domains(struct s_data *d,
- const struct cpumask *cpu_map, struct sched_domain_attr *attr, int i)
+/*
+ * NULL the sd_data elements we've used to build the sched_domain and
+ * sched_group structure so that the subsequent __free_domain_allocs()
+ * will not free the data we're using.
+ */
+static void claim_allocations(int cpu, struct sched_domain *sd)
{
- struct sched_domain *sd = NULL;
-#ifdef CONFIG_NUMA
- struct sched_domain *parent;
-
- d->sd_allnodes = 0;
- if (cpumask_weight(cpu_map) >
- SD_NODES_PER_DOMAIN * cpumask_weight(d->nodemask)) {
- sd = &per_cpu(allnodes_domains, i).sd;
- SD_INIT(sd, ALLNODES);
- set_domain_attribute(sd, attr);
- cpumask_copy(sched_domain_span(sd), cpu_map);
- cpu_to_allnodes_group(i, cpu_map, &sd->groups, d->tmpmask);
- d->sd_allnodes = 1;
- }
- parent = sd;
-
- sd = &per_cpu(node_domains, i).sd;
- SD_INIT(sd, NODE);
- set_domain_attribute(sd, attr);
- sched_domain_node_span(cpu_to_node(i), sched_domain_span(sd));
- sd->parent = parent;
- if (parent)
- parent->child = sd;
- cpumask_and(sched_domain_span(sd), sched_domain_span(sd), cpu_map);
-#endif
- return sd;
-}
+ struct sd_data *sdd = sd->private;
+ struct sched_group *sg = sd->groups;
-static struct sched_domain *__build_cpu_sched_domain(struct s_data *d,
- const struct cpumask *cpu_map, struct sched_domain_attr *attr,
- struct sched_domain *parent, int i)
-{
- struct sched_domain *sd;
- sd = &per_cpu(phys_domains, i).sd;
- SD_INIT(sd, CPU);
- set_domain_attribute(sd, attr);
- cpumask_copy(sched_domain_span(sd), d->nodemask);
- sd->parent = parent;
- if (parent)
- parent->child = sd;
- cpu_to_phys_group(i, cpu_map, &sd->groups, d->tmpmask);
- return sd;
-}
+ WARN_ON_ONCE(*per_cpu_ptr(sdd->sd, cpu) != sd);
+ *per_cpu_ptr(sdd->sd, cpu) = NULL;
-static struct sched_domain *__build_book_sched_domain(struct s_data *d,
- const struct cpumask *cpu_map, struct sched_domain_attr *attr,
- struct sched_domain *parent, int i)
-{
- struct sched_domain *sd = parent;
-#ifdef CONFIG_SCHED_BOOK
- sd = &per_cpu(book_domains, i).sd;
- SD_INIT(sd, BOOK);
- set_domain_attribute(sd, attr);
- cpumask_and(sched_domain_span(sd), cpu_map, cpu_book_mask(i));
- sd->parent = parent;
- parent->child = sd;
- cpu_to_book_group(i, cpu_map, &sd->groups, d->tmpmask);
-#endif
- return sd;
+ if (cpu == cpumask_first(sched_group_cpus(sg))) {
+ WARN_ON_ONCE(*per_cpu_ptr(sdd->sg, cpu) != sg);
+ *per_cpu_ptr(sdd->sg, cpu) = NULL;
+ }
}
-static struct sched_domain *__build_mc_sched_domain(struct s_data *d,
- const struct cpumask *cpu_map, struct sched_domain_attr *attr,
- struct sched_domain *parent, int i)
+#ifdef CONFIG_SCHED_SMT
+static const struct cpumask *cpu_smt_mask(int cpu)
{
- struct sched_domain *sd = parent;
-#ifdef CONFIG_SCHED_MC
- sd = &per_cpu(core_domains, i).sd;
- SD_INIT(sd, MC);
- set_domain_attribute(sd, attr);
- cpumask_and(sched_domain_span(sd), cpu_map, cpu_coregroup_mask(i));
- sd->parent = parent;
- parent->child = sd;
- cpu_to_core_group(i, cpu_map, &sd->groups, d->tmpmask);
-#endif
- return sd;
+ return topology_thread_cpumask(cpu);
}
-
-static struct sched_domain *__build_smt_sched_domain(struct s_data *d,
- const struct cpumask *cpu_map, struct sched_domain_attr *attr,
- struct sched_domain *parent, int i)
-{
- struct sched_domain *sd = parent;
-#ifdef CONFIG_SCHED_SMT
- sd = &per_cpu(cpu_domains, i).sd;
- SD_INIT(sd, SIBLING);
- set_domain_attribute(sd, attr);
- cpumask_and(sched_domain_span(sd), cpu_map, topology_thread_cpumask(i));
- sd->parent = parent;
- parent->child = sd;
- cpu_to_cpu_group(i, cpu_map, &sd->groups, d->tmpmask);
#endif
- return sd;
-}
-static void build_sched_groups(struct s_data *d, enum sched_domain_level l,
- const struct cpumask *cpu_map, int cpu)
-{
- switch (l) {
+/*
+ * Topology list, bottom-up.
+ */
+static struct sched_domain_topology_level default_topology[] = {
#ifdef CONFIG_SCHED_SMT
- case SD_LV_SIBLING: /* set up CPU (sibling) groups */
- cpumask_and(d->this_sibling_map, cpu_map,
- topology_thread_cpumask(cpu));
- if (cpu == cpumask_first(d->this_sibling_map))
- init_sched_build_groups(d->this_sibling_map, cpu_map,
- &cpu_to_cpu_group,
- d->send_covered, d->tmpmask);
- break;
+ { sd_init_SIBLING, cpu_smt_mask, },
#endif
#ifdef CONFIG_SCHED_MC
- case SD_LV_MC: /* set up multi-core groups */
- cpumask_and(d->this_core_map, cpu_map, cpu_coregroup_mask(cpu));
- if (cpu == cpumask_first(d->this_core_map))
- init_sched_build_groups(d->this_core_map, cpu_map,
- &cpu_to_core_group,
- d->send_covered, d->tmpmask);
- break;
+ { sd_init_MC, cpu_coregroup_mask, },
#endif
#ifdef CONFIG_SCHED_BOOK
- case SD_LV_BOOK: /* set up book groups */
- cpumask_and(d->this_book_map, cpu_map, cpu_book_mask(cpu));
- if (cpu == cpumask_first(d->this_book_map))
- init_sched_build_groups(d->this_book_map, cpu_map,
- &cpu_to_book_group,
- d->send_covered, d->tmpmask);
- break;
+ { sd_init_BOOK, cpu_book_mask, },
#endif
- case SD_LV_CPU: /* set up physical groups */
- cpumask_and(d->nodemask, cpumask_of_node(cpu), cpu_map);
- if (!cpumask_empty(d->nodemask))
- init_sched_build_groups(d->nodemask, cpu_map,
- &cpu_to_phys_group,
- d->send_covered, d->tmpmask);
- break;
+ { sd_init_CPU, cpu_cpu_mask, },
#ifdef CONFIG_NUMA
- case SD_LV_ALLNODES:
- init_sched_build_groups(cpu_map, cpu_map, &cpu_to_allnodes_group,
- d->send_covered, d->tmpmask);
- break;
+ { sd_init_NODE, cpu_node_mask, },
+ { sd_init_ALLNODES, cpu_allnodes_mask, },
#endif
- default:
- break;
+ { NULL, },
+};
+
+static struct sched_domain_topology_level *sched_domain_topology = default_topology;
+
+static int __sdt_alloc(const struct cpumask *cpu_map)
+{
+ struct sched_domain_topology_level *tl;
+ int j;
+
+ for (tl = sched_domain_topology; tl->init; tl++) {
+ struct sd_data *sdd = &tl->data;
+
+ sdd->sd = alloc_percpu(struct sched_domain *);
+ if (!sdd->sd)
+ return -ENOMEM;
+
+ sdd->sg = alloc_percpu(struct sched_group *);
+ if (!sdd->sg)
+ return -ENOMEM;
+
+ for_each_cpu(j, cpu_map) {
+ struct sched_domain *sd;
+ struct sched_group *sg;
+
+ sd = kzalloc_node(sizeof(struct sched_domain) + cpumask_size(),
+ GFP_KERNEL, cpu_to_node(j));
+ if (!sd)
+ return -ENOMEM;
+
+ *per_cpu_ptr(sdd->sd, j) = sd;
+
+ sg = kzalloc_node(sizeof(struct sched_group) + cpumask_size(),
+ GFP_KERNEL, cpu_to_node(j));
+ if (!sg)
+ return -ENOMEM;
+
+ *per_cpu_ptr(sdd->sg, j) = sg;
+ }
+ }
+
+ return 0;
+}
+
+static void __sdt_free(const struct cpumask *cpu_map)
+{
+ struct sched_domain_topology_level *tl;
+ int j;
+
+ for (tl = sched_domain_topology; tl->init; tl++) {
+ struct sd_data *sdd = &tl->data;
+
+ for_each_cpu(j, cpu_map) {
+ kfree(*per_cpu_ptr(sdd->sd, j));
+ kfree(*per_cpu_ptr(sdd->sg, j));
+ }
+ free_percpu(sdd->sd);
+ free_percpu(sdd->sg);
}
}
+struct sched_domain *build_sched_domain(struct sched_domain_topology_level *tl,
+ struct s_data *d, const struct cpumask *cpu_map,
+ struct sched_domain_attr *attr, struct sched_domain *child,
+ int cpu)
+{
+ struct sched_domain *sd = tl->init(tl, cpu);
+ if (!sd)
+ return child;
+
+ set_domain_attribute(sd, attr);
+ cpumask_and(sched_domain_span(sd), cpu_map, tl->mask(cpu));
+ if (child) {
+ sd->level = child->level + 1;
+ sched_domain_level_max = max(sched_domain_level_max, sd->level);
+ child->parent = sd;
+ }
+ sd->child = child;
+
+ return sd;
+}
+
/*
* Build sched domains for a given set of cpus and attach the sched domains
* to the individual cpus
*/
-static int __build_sched_domains(const struct cpumask *cpu_map,
- struct sched_domain_attr *attr)
+static int build_sched_domains(const struct cpumask *cpu_map,
+ struct sched_domain_attr *attr)
{
enum s_alloc alloc_state = sa_none;
- struct s_data d;
struct sched_domain *sd;
- int i;
-#ifdef CONFIG_NUMA
- d.sd_allnodes = 0;
-#endif
+ struct s_data d;
+ int i, ret = -ENOMEM;
alloc_state = __visit_domain_allocation_hell(&d, cpu_map);
if (alloc_state != sa_rootdomain)
goto error;
- alloc_state = sa_sched_groups;
-
- /*
- * Set up domains for cpus specified by the cpu_map.
- */
- for_each_cpu(i, cpu_map) {
- cpumask_and(d.nodemask, cpumask_of_node(cpu_to_node(i)),
- cpu_map);
-
- sd = __build_numa_sched_domains(&d, cpu_map, attr, i);
- sd = __build_cpu_sched_domain(&d, cpu_map, attr, sd, i);
- sd = __build_book_sched_domain(&d, cpu_map, attr, sd, i);
- sd = __build_mc_sched_domain(&d, cpu_map, attr, sd, i);
- sd = __build_smt_sched_domain(&d, cpu_map, attr, sd, i);
- }
+ /* Set up domains for cpus specified by the cpu_map. */
for_each_cpu(i, cpu_map) {
- build_sched_groups(&d, SD_LV_SIBLING, cpu_map, i);
- build_sched_groups(&d, SD_LV_BOOK, cpu_map, i);
- build_sched_groups(&d, SD_LV_MC, cpu_map, i);
- }
+ struct sched_domain_topology_level *tl;
- /* Set up physical groups */
- for (i = 0; i < nr_node_ids; i++)
- build_sched_groups(&d, SD_LV_CPU, cpu_map, i);
+ sd = NULL;
+ for (tl = sched_domain_topology; tl->init; tl++)
+ sd = build_sched_domain(tl, &d, cpu_map, attr, sd, i);
-#ifdef CONFIG_NUMA
- /* Set up node groups */
- if (d.sd_allnodes)
- build_sched_groups(&d, SD_LV_ALLNODES, cpu_map, 0);
+ while (sd->child)
+ sd = sd->child;
- for (i = 0; i < nr_node_ids; i++)
- if (build_numa_sched_groups(&d, cpu_map, i))
- goto error;
-#endif
-
- /* Calculate CPU power for physical packages and nodes */
-#ifdef CONFIG_SCHED_SMT
- for_each_cpu(i, cpu_map) {
- sd = &per_cpu(cpu_domains, i).sd;
- init_sched_groups_power(i, sd);
- }
-#endif
-#ifdef CONFIG_SCHED_MC
- for_each_cpu(i, cpu_map) {
- sd = &per_cpu(core_domains, i).sd;
- init_sched_groups_power(i, sd);
- }
-#endif
-#ifdef CONFIG_SCHED_BOOK
- for_each_cpu(i, cpu_map) {
- sd = &per_cpu(book_domains, i).sd;
- init_sched_groups_power(i, sd);
+ *per_cpu_ptr(d.sd, i) = sd;
}
-#endif
+ /* Build the groups for the domains */
for_each_cpu(i, cpu_map) {
- sd = &per_cpu(phys_domains, i).sd;
- init_sched_groups_power(i, sd);
- }
+ for (sd = *per_cpu_ptr(d.sd, i); sd; sd = sd->parent) {
+ sd->span_weight = cpumask_weight(sched_domain_span(sd));
+ get_group(i, sd->private, &sd->groups);
+ atomic_inc(&sd->groups->ref);
-#ifdef CONFIG_NUMA
- for (i = 0; i < nr_node_ids; i++)
- init_numa_sched_groups_power(d.sched_group_nodes[i]);
+ if (i != cpumask_first(sched_domain_span(sd)))
+ continue;
- if (d.sd_allnodes) {
- struct sched_group *sg;
+ build_sched_groups(sd);
+ }
+ }
+
+ /* Calculate CPU power for physical packages and nodes */
+ for (i = nr_cpumask_bits-1; i >= 0; i--) {
+ if (!cpumask_test_cpu(i, cpu_map))
+ continue;
- cpu_to_allnodes_group(cpumask_first(cpu_map), cpu_map, &sg,
- d.tmpmask);
- init_numa_sched_groups_power(sg);
+ for (sd = *per_cpu_ptr(d.sd, i); sd; sd = sd->parent) {
+ claim_allocations(i, sd);
+ init_sched_groups_power(i, sd);
+ }
}
-#endif
/* Attach the domains */
+ rcu_read_lock();
for_each_cpu(i, cpu_map) {
-#ifdef CONFIG_SCHED_SMT
- sd = &per_cpu(cpu_domains, i).sd;
-#elif defined(CONFIG_SCHED_MC)
- sd = &per_cpu(core_domains, i).sd;
-#elif defined(CONFIG_SCHED_BOOK)
- sd = &per_cpu(book_domains, i).sd;
-#else
- sd = &per_cpu(phys_domains, i).sd;
-#endif
+ sd = *per_cpu_ptr(d.sd, i);
cpu_attach_domain(sd, d.rd, i);
}
+ rcu_read_unlock();
- d.sched_group_nodes = NULL; /* don't free this we still need it */
- __free_domain_allocs(&d, sa_tmpmask, cpu_map);
- return 0;
-
+ ret = 0;
error:
__free_domain_allocs(&d, alloc_state, cpu_map);
- return -ENOMEM;
-}
-
-static int build_sched_domains(const struct cpumask *cpu_map)
-{
- return __build_sched_domains(cpu_map, NULL);
+ return ret;
}
static cpumask_var_t *doms_cur; /* current sched domains */
@@ -7448,7 +7401,7 @@ void free_sched_domains(cpumask_var_t doms[], unsigned int ndoms)
* For now this just excludes isolated cpus, but could be used to
* exclude other special cases in the future.
*/
-static int arch_init_sched_domains(const struct cpumask *cpu_map)
+static int init_sched_domains(const struct cpumask *cpu_map)
{
int err;
@@ -7459,32 +7412,24 @@ static int arch_init_sched_domains(const struct cpumask *cpu_map)
doms_cur = &fallback_doms;
cpumask_andnot(doms_cur[0], cpu_map, cpu_isolated_map);
dattr_cur = NULL;
- err = build_sched_domains(doms_cur[0]);
+ err = build_sched_domains(doms_cur[0], NULL);
register_sched_domain_sysctl();
return err;
}
-static void arch_destroy_sched_domains(const struct cpumask *cpu_map,
- struct cpumask *tmpmask)
-{
- free_sched_groups(cpu_map, tmpmask);
-}
-
/*
* Detach sched domains from a group of cpus specified in cpu_map
* These cpus will now be attached to the NULL domain
*/
static void detach_destroy_domains(const struct cpumask *cpu_map)
{
- /* Save because hotplug lock held. */
- static DECLARE_BITMAP(tmpmask, CONFIG_NR_CPUS);
int i;
+ rcu_read_lock();
for_each_cpu(i, cpu_map)
cpu_attach_domain(NULL, &def_root_domain, i);
- synchronize_sched();
- arch_destroy_sched_domains(cpu_map, to_cpumask(tmpmask));
+ rcu_read_unlock();
}
/* handle null as "default" */
@@ -7573,8 +7518,7 @@ match1:
goto match2;
}
/* no match - add a new doms_new */
- __build_sched_domains(doms_new[i],
- dattr_new ? dattr_new + i : NULL);
+ build_sched_domains(doms_new[i], dattr_new ? dattr_new + i : NULL);
match2:
;
}
@@ -7593,7 +7537,7 @@ match2:
}
#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
-static void arch_reinit_sched_domains(void)
+static void reinit_sched_domains(void)
{
get_online_cpus();
@@ -7626,7 +7570,7 @@ static ssize_t sched_power_savings_store(const char *buf, size_t count, int smt)
else
sched_mc_power_savings = level;
- arch_reinit_sched_domains();
+ reinit_sched_domains();
return count;
}
@@ -7745,14 +7689,9 @@ void __init sched_init_smp(void)
alloc_cpumask_var(&non_isolated_cpus, GFP_KERNEL);
alloc_cpumask_var(&fallback_doms, GFP_KERNEL);
-#if defined(CONFIG_NUMA)
- sched_group_nodes_bycpu = kzalloc(nr_cpu_ids * sizeof(void **),
- GFP_KERNEL);
- BUG_ON(sched_group_nodes_bycpu == NULL);
-#endif
get_online_cpus();
mutex_lock(&sched_domains_mutex);
- arch_init_sched_domains(cpu_active_mask);
+ init_sched_domains(cpu_active_mask);
cpumask_andnot(non_isolated_cpus, cpu_possible_mask, cpu_isolated_map);
if (cpumask_empty(non_isolated_cpus))
cpumask_set_cpu(smp_processor_id(), non_isolated_cpus);
@@ -7797,6 +7736,10 @@ static void init_cfs_rq(struct cfs_rq *cfs_rq, struct rq *rq)
INIT_LIST_HEAD(&cfs_rq->tasks);
#ifdef CONFIG_FAIR_GROUP_SCHED
cfs_rq->rq = rq;
+ /* allow initial update_cfs_load() to truncate */
+#ifdef CONFIG_SMP
+ cfs_rq->load_stamp = 1;
+#endif
#endif
cfs_rq->min_vruntime = (u64)(-(1LL << 20));
}
@@ -7998,7 +7941,7 @@ void __init sched_init(void)
#ifdef CONFIG_SMP
rq->sd = NULL;
rq->rd = NULL;
- rq->cpu_power = SCHED_LOAD_SCALE;
+ rq->cpu_power = SCHED_POWER_SCALE;
rq->post_schedule = 0;
rq->active_balance = 0;
rq->next_balance = jiffies;
@@ -8055,6 +7998,7 @@ void __init sched_init(void)
/* Allocate the nohz_cpu_mask if CONFIG_CPUMASK_OFFSTACK */
zalloc_cpumask_var(&nohz_cpu_mask, GFP_NOWAIT);
#ifdef CONFIG_SMP
+ zalloc_cpumask_var(&sched_domains_tmpmask, GFP_NOWAIT);
#ifdef CONFIG_NO_HZ
zalloc_cpumask_var(&nohz.idle_cpus_mask, GFP_NOWAIT);
alloc_cpumask_var(&nohz.grp_idle_mask, GFP_NOWAIT);
@@ -8075,7 +8019,7 @@ static inline int preempt_count_equals(int preempt_offset)
{
int nested = (preempt_count() & ~PREEMPT_ACTIVE) + rcu_preempt_depth();
- return (nested == PREEMPT_INATOMIC_BASE + preempt_offset);
+ return (nested == preempt_offset);
}
void __might_sleep(const char *file, int line, int preempt_offset)
@@ -8110,9 +8054,11 @@ EXPORT_SYMBOL(__might_sleep);
#ifdef CONFIG_MAGIC_SYSRQ
static void normalize_task(struct rq *rq, struct task_struct *p)
{
+ const struct sched_class *prev_class = p->sched_class;
+ int old_prio = p->prio;
int on_rq;
- on_rq = p->se.on_rq;
+ on_rq = p->on_rq;
if (on_rq)
deactivate_task(rq, p, 0);
__setscheduler(rq, p, SCHED_NORMAL, 0);
@@ -8120,6 +8066,8 @@ static void normalize_task(struct rq *rq, struct task_struct *p)
activate_task(rq, p, 0);
resched_task(rq->curr);
}
+
+ check_class_changed(rq, p, prev_class, old_prio);
}
void normalize_rt_tasks(void)
@@ -8235,7 +8183,6 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
{
struct cfs_rq *cfs_rq;
struct sched_entity *se;
- struct rq *rq;
int i;
tg->cfs_rq = kzalloc(sizeof(cfs_rq) * nr_cpu_ids, GFP_KERNEL);
@@ -8248,8 +8195,6 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
tg->shares = NICE_0_LOAD;
for_each_possible_cpu(i) {
- rq = cpu_rq(i);
-
cfs_rq = kzalloc_node(sizeof(struct cfs_rq),
GFP_KERNEL, cpu_to_node(i));
if (!cfs_rq)
@@ -8326,7 +8271,6 @@ int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent)
{
struct rt_rq *rt_rq;
struct sched_rt_entity *rt_se;
- struct rq *rq;
int i;
tg->rt_rq = kzalloc(sizeof(rt_rq) * nr_cpu_ids, GFP_KERNEL);
@@ -8340,8 +8284,6 @@ int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent)
ktime_to_ns(def_rt_bandwidth.rt_period), 0);
for_each_possible_cpu(i) {
- rq = cpu_rq(i);
-
rt_rq = kzalloc_node(sizeof(struct rt_rq),
GFP_KERNEL, cpu_to_node(i));
if (!rt_rq)
@@ -8456,7 +8398,7 @@ void sched_move_task(struct task_struct *tsk)
rq = task_rq_lock(tsk, &flags);
running = task_current(rq, tsk);
- on_rq = tsk->se.on_rq;
+ on_rq = tsk->on_rq;
if (on_rq)
dequeue_task(rq, tsk, 0);
@@ -8475,7 +8417,7 @@ void sched_move_task(struct task_struct *tsk)
if (on_rq)
enqueue_task(rq, tsk, 0);
- task_rq_unlock(rq, &flags);
+ task_rq_unlock(rq, tsk, &flags);
}
#endif /* CONFIG_CGROUP_SCHED */
@@ -8511,7 +8453,7 @@ int sched_group_set_shares(struct task_group *tg, unsigned long shares)
/* Propagate contribution to hierarchy */
raw_spin_lock_irqsave(&rq->lock, flags);
for_each_sched_entity(se)
- update_cfs_shares(group_cfs_rq(se), 0);
+ update_cfs_shares(group_cfs_rq(se));
raw_spin_unlock_irqrestore(&rq->lock, flags);
}
@@ -8846,46 +8788,15 @@ cpu_cgroup_can_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
return 0;
}
-static int
-cpu_cgroup_can_attach(struct cgroup_subsys *ss, struct cgroup *cgrp,
- struct task_struct *tsk, bool threadgroup)
-{
- int retval = cpu_cgroup_can_attach_task(cgrp, tsk);
- if (retval)
- return retval;
- if (threadgroup) {
- struct task_struct *c;
- rcu_read_lock();
- list_for_each_entry_rcu(c, &tsk->thread_group, thread_group) {
- retval = cpu_cgroup_can_attach_task(cgrp, c);
- if (retval) {
- rcu_read_unlock();
- return retval;
- }
- }
- rcu_read_unlock();
- }
- return 0;
-}
-
static void
-cpu_cgroup_attach(struct cgroup_subsys *ss, struct cgroup *cgrp,
- struct cgroup *old_cont, struct task_struct *tsk,
- bool threadgroup)
+cpu_cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
{
sched_move_task(tsk);
- if (threadgroup) {
- struct task_struct *c;
- rcu_read_lock();
- list_for_each_entry_rcu(c, &tsk->thread_group, thread_group) {
- sched_move_task(c);
- }
- rcu_read_unlock();
- }
}
static void
-cpu_cgroup_exit(struct cgroup_subsys *ss, struct task_struct *task)
+cpu_cgroup_exit(struct cgroup_subsys *ss, struct cgroup *cgrp,
+ struct cgroup *old_cgrp, struct task_struct *task)
{
/*
* cgroup_exit() is called in the copy_process() failure path.
@@ -8902,14 +8813,14 @@ cpu_cgroup_exit(struct cgroup_subsys *ss, struct task_struct *task)
static int cpu_shares_write_u64(struct cgroup *cgrp, struct cftype *cftype,
u64 shareval)
{
- return sched_group_set_shares(cgroup_tg(cgrp), shareval);
+ return sched_group_set_shares(cgroup_tg(cgrp), scale_load(shareval));
}
static u64 cpu_shares_read_u64(struct cgroup *cgrp, struct cftype *cft)
{
struct task_group *tg = cgroup_tg(cgrp);
- return (u64) tg->shares;
+ return (u64) scale_load_down(tg->shares);
}
#endif /* CONFIG_FAIR_GROUP_SCHED */
@@ -8968,8 +8879,8 @@ struct cgroup_subsys cpu_cgroup_subsys = {
.name = "cpu",
.create = cpu_cgroup_create,
.destroy = cpu_cgroup_destroy,
- .can_attach = cpu_cgroup_can_attach,
- .attach = cpu_cgroup_attach,
+ .can_attach_task = cpu_cgroup_can_attach_task,
+ .attach_task = cpu_cgroup_attach_task,
.exit = cpu_cgroup_exit,
.populate = cpu_cgroup_populate,
.subsys_id = cpu_cgroup_subsys_id,
diff --git a/kernel/sched_autogroup.c b/kernel/sched_autogroup.c
index 9fb656283157..429242f3c484 100644
--- a/kernel/sched_autogroup.c
+++ b/kernel/sched_autogroup.c
@@ -12,7 +12,6 @@ static atomic_t autogroup_seq_nr;
static void __init autogroup_init(struct task_struct *init_task)
{
autogroup_default.tg = &root_task_group;
- root_task_group.autogroup = &autogroup_default;
kref_init(&autogroup_default.kref);
init_rwsem(&autogroup_default.lock);
init_task->signal->autogroup = &autogroup_default;
@@ -130,7 +129,7 @@ task_wants_autogroup(struct task_struct *p, struct task_group *tg)
static inline bool task_group_is_autogroup(struct task_group *tg)
{
- return tg != &root_task_group && tg->autogroup;
+ return !!tg->autogroup;
}
static inline struct task_group *
@@ -161,11 +160,15 @@ autogroup_move_group(struct task_struct *p, struct autogroup *ag)
p->signal->autogroup = autogroup_kref_get(ag);
+ if (!ACCESS_ONCE(sysctl_sched_autogroup_enabled))
+ goto out;
+
t = p;
do {
sched_move_task(t);
} while_each_thread(p, t);
+out:
unlock_task_sighand(p, &flags);
autogroup_kref_put(prev);
}
@@ -176,7 +179,7 @@ void sched_autogroup_create_attach(struct task_struct *p)
struct autogroup *ag = autogroup_create();
autogroup_move_group(p, ag);
- /* drop extra refrence added by autogroup_create() */
+ /* drop extra reference added by autogroup_create() */
autogroup_kref_put(ag);
}
EXPORT_SYMBOL(sched_autogroup_create_attach);
@@ -247,10 +250,14 @@ void proc_sched_autogroup_show_task(struct task_struct *p, struct seq_file *m)
{
struct autogroup *ag = autogroup_task_get(p);
+ if (!task_group_is_autogroup(ag->tg))
+ goto out;
+
down_read(&ag->lock);
seq_printf(m, "/autogroup-%ld nice %d\n", ag->id, ag->nice);
up_read(&ag->lock);
+out:
autogroup_kref_put(ag);
}
#endif /* CONFIG_PROC_FS */
@@ -258,9 +265,7 @@ void proc_sched_autogroup_show_task(struct task_struct *p, struct seq_file *m)
#ifdef CONFIG_SCHED_DEBUG
static inline int autogroup_path(struct task_group *tg, char *buf, int buflen)
{
- int enabled = ACCESS_ONCE(sysctl_sched_autogroup_enabled);
-
- if (!enabled || !tg->autogroup)
+ if (!task_group_is_autogroup(tg))
return 0;
return snprintf(buf, buflen, "%s-%ld", "/autogroup", tg->autogroup->id);
diff --git a/kernel/sched_autogroup.h b/kernel/sched_autogroup.h
index 7b859ffe5dad..05577055cfca 100644
--- a/kernel/sched_autogroup.h
+++ b/kernel/sched_autogroup.h
@@ -1,6 +1,11 @@
#ifdef CONFIG_SCHED_AUTOGROUP
struct autogroup {
+ /*
+ * reference doesn't mean how many thread attach to this
+ * autogroup now. It just stands for the number of task
+ * could use this autogroup.
+ */
struct kref kref;
struct task_group *tg;
struct rw_semaphore lock;
diff --git a/kernel/sched_debug.c b/kernel/sched_debug.c
index eb6cb8edd075..a6710a112b4f 100644
--- a/kernel/sched_debug.c
+++ b/kernel/sched_debug.c
@@ -152,7 +152,7 @@ static void print_rq(struct seq_file *m, struct rq *rq, int rq_cpu)
read_lock_irqsave(&tasklist_lock, flags);
do_each_thread(g, p) {
- if (!p->se.on_rq || task_cpu(p) != rq_cpu)
+ if (!p->on_rq || task_cpu(p) != rq_cpu)
continue;
print_task(m, rq, p);
@@ -179,7 +179,7 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
raw_spin_lock_irqsave(&rq->lock, flags);
if (cfs_rq->rb_leftmost)
- MIN_vruntime = (__pick_next_entity(cfs_rq))->vruntime;
+ MIN_vruntime = (__pick_first_entity(cfs_rq))->vruntime;
last = __pick_last_entity(cfs_rq);
if (last)
max_vruntime = last->vruntime;
@@ -296,9 +296,6 @@ static void print_cpu(struct seq_file *m, int cpu)
P(ttwu_count);
P(ttwu_local);
- SEQ_printf(m, " .%-30s: %d\n", "bkl_count",
- rq->rq_sched_info.bkl_count);
-
#undef P
#undef P64
#endif
@@ -441,7 +438,6 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m)
P(se.statistics.wait_count);
PN(se.statistics.iowait_sum);
P(se.statistics.iowait_count);
- P(sched_info.bkl_count);
P(se.nr_migrations);
P(se.statistics.nr_migrations_cold);
P(se.statistics.nr_failed_migrations_affine);
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index 0c26e2df450e..433491c2dc8f 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -22,6 +22,7 @@
#include <linux/latencytop.h>
#include <linux/sched.h>
+#include <linux/cpumask.h>
/*
* Targeted preemption latency for CPU-bound tasks:
@@ -69,14 +70,6 @@ static unsigned int sched_nr_latency = 8;
unsigned int sysctl_sched_child_runs_first __read_mostly;
/*
- * sys_sched_yield() compat mode
- *
- * This option switches the agressive yield implementation of the
- * old scheduler back on.
- */
-unsigned int __read_mostly sysctl_sched_compat_yield;
-
-/*
* SCHED_OTHER wake-up granularity.
* (default: 1 msec * (1 + ilog(ncpus)), units: nanoseconds)
*
@@ -365,6 +358,10 @@ static void update_min_vruntime(struct cfs_rq *cfs_rq)
}
cfs_rq->min_vruntime = max_vruntime(cfs_rq->min_vruntime, vruntime);
+#ifndef CONFIG_64BIT
+ smp_wmb();
+ cfs_rq->min_vruntime_copy = cfs_rq->min_vruntime;
+#endif
}
/*
@@ -419,7 +416,7 @@ static void __dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
rb_erase(&se->run_node, &cfs_rq->tasks_timeline);
}
-static struct sched_entity *__pick_next_entity(struct cfs_rq *cfs_rq)
+static struct sched_entity *__pick_first_entity(struct cfs_rq *cfs_rq)
{
struct rb_node *left = cfs_rq->rb_leftmost;
@@ -429,6 +426,17 @@ static struct sched_entity *__pick_next_entity(struct cfs_rq *cfs_rq)
return rb_entry(left, struct sched_entity, run_node);
}
+static struct sched_entity *__pick_next_entity(struct sched_entity *se)
+{
+ struct rb_node *next = rb_next(&se->run_node);
+
+ if (!next)
+ return NULL;
+
+ return rb_entry(next, struct sched_entity, run_node);
+}
+
+#ifdef CONFIG_SCHED_DEBUG
static struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq)
{
struct rb_node *last = rb_last(&cfs_rq->tasks_timeline);
@@ -443,7 +451,6 @@ static struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq)
* Scheduling class statistics methods:
*/
-#ifdef CONFIG_SCHED_DEBUG
int sched_proc_update_handler(struct ctl_table *table, int write,
void __user *buffer, size_t *lenp,
loff_t *ppos)
@@ -540,7 +547,7 @@ static u64 sched_vslice(struct cfs_rq *cfs_rq, struct sched_entity *se)
}
static void update_cfs_load(struct cfs_rq *cfs_rq, int global_update);
-static void update_cfs_shares(struct cfs_rq *cfs_rq, long weight_delta);
+static void update_cfs_shares(struct cfs_rq *cfs_rq);
/*
* Update the current task's runtime statistics. Skip current tasks that
@@ -733,6 +740,7 @@ static void update_cfs_load(struct cfs_rq *cfs_rq, int global_update)
now - cfs_rq->load_last > 4 * period) {
cfs_rq->load_period = 0;
cfs_rq->load_avg = 0;
+ delta = period - 1;
}
cfs_rq->load_stamp = now;
@@ -763,16 +771,15 @@ static void update_cfs_load(struct cfs_rq *cfs_rq, int global_update)
list_del_leaf_cfs_rq(cfs_rq);
}
-static long calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg,
- long weight_delta)
+static long calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg)
{
long load_weight, load, shares;
- load = cfs_rq->load.weight + weight_delta;
+ load = cfs_rq->load.weight;
load_weight = atomic_read(&tg->load_weight);
- load_weight -= cfs_rq->load_contribution;
load_weight += load;
+ load_weight -= cfs_rq->load_contribution;
shares = (tg->shares * load);
if (load_weight)
@@ -790,7 +797,7 @@ static void update_entity_shares_tick(struct cfs_rq *cfs_rq)
{
if (cfs_rq->load_unacc_exec_time > sysctl_sched_shares_window) {
update_cfs_load(cfs_rq, 0);
- update_cfs_shares(cfs_rq, 0);
+ update_cfs_shares(cfs_rq);
}
}
# else /* CONFIG_SMP */
@@ -798,8 +805,7 @@ static void update_cfs_load(struct cfs_rq *cfs_rq, int global_update)
{
}
-static inline long calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg,
- long weight_delta)
+static inline long calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg)
{
return tg->shares;
}
@@ -824,7 +830,7 @@ static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se,
account_entity_enqueue(cfs_rq, se);
}
-static void update_cfs_shares(struct cfs_rq *cfs_rq, long weight_delta)
+static void update_cfs_shares(struct cfs_rq *cfs_rq)
{
struct task_group *tg;
struct sched_entity *se;
@@ -838,7 +844,7 @@ static void update_cfs_shares(struct cfs_rq *cfs_rq, long weight_delta)
if (likely(se->load.weight == tg->shares))
return;
#endif
- shares = calc_cfs_shares(cfs_rq, tg, weight_delta);
+ shares = calc_cfs_shares(cfs_rq, tg);
reweight_entity(cfs_rq_of(se), se, shares);
}
@@ -847,7 +853,7 @@ static void update_cfs_load(struct cfs_rq *cfs_rq, int global_update)
{
}
-static inline void update_cfs_shares(struct cfs_rq *cfs_rq, long weight_delta)
+static inline void update_cfs_shares(struct cfs_rq *cfs_rq)
{
}
@@ -978,8 +984,8 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
*/
update_curr(cfs_rq);
update_cfs_load(cfs_rq, 0);
- update_cfs_shares(cfs_rq, se->load.weight);
account_entity_enqueue(cfs_rq, se);
+ update_cfs_shares(cfs_rq);
if (flags & ENQUEUE_WAKEUP) {
place_entity(cfs_rq, se, 0);
@@ -996,19 +1002,49 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
list_add_leaf_cfs_rq(cfs_rq);
}
-static void __clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se)
+static void __clear_buddies_last(struct sched_entity *se)
{
- if (!se || cfs_rq->last == se)
- cfs_rq->last = NULL;
+ for_each_sched_entity(se) {
+ struct cfs_rq *cfs_rq = cfs_rq_of(se);
+ if (cfs_rq->last == se)
+ cfs_rq->last = NULL;
+ else
+ break;
+ }
+}
- if (!se || cfs_rq->next == se)
- cfs_rq->next = NULL;
+static void __clear_buddies_next(struct sched_entity *se)
+{
+ for_each_sched_entity(se) {
+ struct cfs_rq *cfs_rq = cfs_rq_of(se);
+ if (cfs_rq->next == se)
+ cfs_rq->next = NULL;
+ else
+ break;
+ }
+}
+
+static void __clear_buddies_skip(struct sched_entity *se)
+{
+ for_each_sched_entity(se) {
+ struct cfs_rq *cfs_rq = cfs_rq_of(se);
+ if (cfs_rq->skip == se)
+ cfs_rq->skip = NULL;
+ else
+ break;
+ }
}
static void clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
- for_each_sched_entity(se)
- __clear_buddies(cfs_rq_of(se), se);
+ if (cfs_rq->last == se)
+ __clear_buddies_last(se);
+
+ if (cfs_rq->next == se)
+ __clear_buddies_next(se);
+
+ if (cfs_rq->skip == se)
+ __clear_buddies_skip(se);
}
static void
@@ -1040,8 +1076,6 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
se->on_rq = 0;
update_cfs_load(cfs_rq, 0);
account_entity_dequeue(cfs_rq, se);
- update_min_vruntime(cfs_rq);
- update_cfs_shares(cfs_rq, 0);
/*
* Normalize the entity after updating the min_vruntime because the
@@ -1050,6 +1084,9 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
*/
if (!(flags & DEQUEUE_SLEEP))
se->vruntime -= cfs_rq->min_vruntime;
+
+ update_min_vruntime(cfs_rq);
+ update_cfs_shares(cfs_rq);
}
/*
@@ -1084,7 +1121,7 @@ check_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr)
return;
if (cfs_rq->nr_running > 1) {
- struct sched_entity *se = __pick_next_entity(cfs_rq);
+ struct sched_entity *se = __pick_first_entity(cfs_rq);
s64 delta = curr->vruntime - se->vruntime;
if (delta < 0)
@@ -1128,13 +1165,27 @@ set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
static int
wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se);
+/*
+ * Pick the next process, keeping these things in mind, in this order:
+ * 1) keep things fair between processes/task groups
+ * 2) pick the "next" process, since someone really wants that to run
+ * 3) pick the "last" process, for cache locality
+ * 4) do not run the "skip" process, if something else is available
+ */
static struct sched_entity *pick_next_entity(struct cfs_rq *cfs_rq)
{
- struct sched_entity *se = __pick_next_entity(cfs_rq);
+ struct sched_entity *se = __pick_first_entity(cfs_rq);
struct sched_entity *left = se;
- if (cfs_rq->next && wakeup_preempt_entity(cfs_rq->next, left) < 1)
- se = cfs_rq->next;
+ /*
+ * Avoid running the skip buddy, if running something else can
+ * be done without getting too unfair.
+ */
+ if (cfs_rq->skip == se) {
+ struct sched_entity *second = __pick_next_entity(se);
+ if (second && wakeup_preempt_entity(second, left) < 1)
+ se = second;
+ }
/*
* Prefer last buddy, try to return the CPU to a preempted task.
@@ -1142,6 +1193,12 @@ static struct sched_entity *pick_next_entity(struct cfs_rq *cfs_rq)
if (cfs_rq->last && wakeup_preempt_entity(cfs_rq->last, left) < 1)
se = cfs_rq->last;
+ /*
+ * Someone really wants this to run. If it's not unfair, run it.
+ */
+ if (cfs_rq->next && wakeup_preempt_entity(cfs_rq->next, left) < 1)
+ se = cfs_rq->next;
+
clear_buddies(cfs_rq, se);
return se;
@@ -1282,12 +1339,14 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
struct cfs_rq *cfs_rq = cfs_rq_of(se);
update_cfs_load(cfs_rq, 0);
- update_cfs_shares(cfs_rq, 0);
+ update_cfs_shares(cfs_rq);
}
hrtick_update(rq);
}
+static void set_next_buddy(struct sched_entity *se);
+
/*
* The dequeue_task method is called before nr_running is
* decreased. We remove the task from the rbtree and
@@ -1297,14 +1356,22 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
{
struct cfs_rq *cfs_rq;
struct sched_entity *se = &p->se;
+ int task_sleep = flags & DEQUEUE_SLEEP;
for_each_sched_entity(se) {
cfs_rq = cfs_rq_of(se);
dequeue_entity(cfs_rq, se, flags);
/* Don't dequeue parent if it has other entities besides us */
- if (cfs_rq->load.weight)
+ if (cfs_rq->load.weight) {
+ /*
+ * Bias pick_next to pick a task from this cfs_rq, as
+ * p is sleeping when it is within its sched_slice.
+ */
+ if (task_sleep && parent_entity(se))
+ set_next_buddy(parent_entity(se));
break;
+ }
flags |= DEQUEUE_SLEEP;
}
@@ -1312,66 +1379,33 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
struct cfs_rq *cfs_rq = cfs_rq_of(se);
update_cfs_load(cfs_rq, 0);
- update_cfs_shares(cfs_rq, 0);
+ update_cfs_shares(cfs_rq);
}
hrtick_update(rq);
}
-/*
- * sched_yield() support is very simple - we dequeue and enqueue.
- *
- * If compat_yield is turned on then we requeue to the end of the tree.
- */
-static void yield_task_fair(struct rq *rq)
-{
- struct task_struct *curr = rq->curr;
- struct cfs_rq *cfs_rq = task_cfs_rq(curr);
- struct sched_entity *rightmost, *se = &curr->se;
-
- /*
- * Are we the only task in the tree?
- */
- if (unlikely(cfs_rq->nr_running == 1))
- return;
-
- clear_buddies(cfs_rq, se);
-
- if (likely(!sysctl_sched_compat_yield) && curr->policy != SCHED_BATCH) {
- update_rq_clock(rq);
- /*
- * Update run-time statistics of the 'current'.
- */
- update_curr(cfs_rq);
-
- return;
- }
- /*
- * Find the rightmost entry in the rbtree:
- */
- rightmost = __pick_last_entity(cfs_rq);
- /*
- * Already in the rightmost position?
- */
- if (unlikely(!rightmost || entity_before(rightmost, se)))
- return;
-
- /*
- * Minimally necessary key value to be last in the tree:
- * Upon rescheduling, sched_class::put_prev_task() will place
- * 'current' within the tree based on its new key value.
- */
- se->vruntime = rightmost->vruntime + 1;
-}
-
#ifdef CONFIG_SMP
-static void task_waking_fair(struct rq *rq, struct task_struct *p)
+static void task_waking_fair(struct task_struct *p)
{
struct sched_entity *se = &p->se;
struct cfs_rq *cfs_rq = cfs_rq_of(se);
+ u64 min_vruntime;
- se->vruntime -= cfs_rq->min_vruntime;
+#ifndef CONFIG_64BIT
+ u64 min_vruntime_copy;
+
+ do {
+ min_vruntime_copy = cfs_rq->min_vruntime_copy;
+ smp_rmb();
+ min_vruntime = cfs_rq->min_vruntime;
+ } while (min_vruntime != min_vruntime_copy);
+#else
+ min_vruntime = cfs_rq->min_vruntime;
+#endif
+
+ se->vruntime -= min_vruntime;
}
#ifdef CONFIG_FAIR_GROUP_SCHED
@@ -1551,7 +1585,7 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p,
}
/* Adjust by relative CPU power of the group */
- avg_load = (avg_load * SCHED_LOAD_SCALE) / group->cpu_power;
+ avg_load = (avg_load * SCHED_POWER_SCALE) / group->cpu_power;
if (local_group) {
this_load = avg_load;
@@ -1616,6 +1650,7 @@ static int select_idle_sibling(struct task_struct *p, int target)
/*
* Otherwise, iterate the domains and find an elegible idle cpu.
*/
+ rcu_read_lock();
for_each_domain(target, sd) {
if (!(sd->flags & SD_SHARE_PKG_RESOURCES))
break;
@@ -1635,6 +1670,7 @@ static int select_idle_sibling(struct task_struct *p, int target)
cpumask_test_cpu(prev_cpu, sched_domain_span(sd)))
break;
}
+ rcu_read_unlock();
return target;
}
@@ -1651,7 +1687,7 @@ static int select_idle_sibling(struct task_struct *p, int target)
* preempt must be disabled.
*/
static int
-select_task_rq_fair(struct rq *rq, struct task_struct *p, int sd_flag, int wake_flags)
+select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flags)
{
struct sched_domain *tmp, *affine_sd = NULL, *sd = NULL;
int cpu = smp_processor_id();
@@ -1667,6 +1703,7 @@ select_task_rq_fair(struct rq *rq, struct task_struct *p, int sd_flag, int wake_
new_cpu = prev_cpu;
}
+ rcu_read_lock();
for_each_domain(cpu, tmp) {
if (!(tmp->flags & SD_LOAD_BALANCE))
continue;
@@ -1686,7 +1723,7 @@ select_task_rq_fair(struct rq *rq, struct task_struct *p, int sd_flag, int wake_
nr_running += cpu_rq(i)->cfs.nr_running;
}
- capacity = DIV_ROUND_CLOSEST(power, SCHED_LOAD_SCALE);
+ capacity = DIV_ROUND_CLOSEST(power, SCHED_POWER_SCALE);
if (tmp->flags & SD_POWERSAVINGS_BALANCE)
nr_running /= 2;
@@ -1717,9 +1754,10 @@ select_task_rq_fair(struct rq *rq, struct task_struct *p, int sd_flag, int wake_
if (affine_sd) {
if (cpu == prev_cpu || wake_affine(affine_sd, p, sync))
- return select_idle_sibling(p, cpu);
- else
- return select_idle_sibling(p, prev_cpu);
+ prev_cpu = cpu;
+
+ new_cpu = select_idle_sibling(p, prev_cpu);
+ goto unlock;
}
while (sd) {
@@ -1760,6 +1798,8 @@ select_task_rq_fair(struct rq *rq, struct task_struct *p, int sd_flag, int wake_
}
/* while loop will break here if sd == NULL */
}
+unlock:
+ rcu_read_unlock();
return new_cpu;
}
@@ -1783,10 +1823,7 @@ wakeup_gran(struct sched_entity *curr, struct sched_entity *se)
* This is especially important for buddies when the leftmost
* task is higher priority than the buddy.
*/
- if (unlikely(se->load.weight != NICE_0_LOAD))
- gran = calc_delta_fair(gran, se);
-
- return gran;
+ return calc_delta_fair(gran, se);
}
/*
@@ -1820,18 +1857,26 @@ wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se)
static void set_last_buddy(struct sched_entity *se)
{
- if (likely(task_of(se)->policy != SCHED_IDLE)) {
- for_each_sched_entity(se)
- cfs_rq_of(se)->last = se;
- }
+ if (entity_is_task(se) && unlikely(task_of(se)->policy == SCHED_IDLE))
+ return;
+
+ for_each_sched_entity(se)
+ cfs_rq_of(se)->last = se;
}
static void set_next_buddy(struct sched_entity *se)
{
- if (likely(task_of(se)->policy != SCHED_IDLE)) {
- for_each_sched_entity(se)
- cfs_rq_of(se)->next = se;
- }
+ if (entity_is_task(se) && unlikely(task_of(se)->policy == SCHED_IDLE))
+ return;
+
+ for_each_sched_entity(se)
+ cfs_rq_of(se)->next = se;
+}
+
+static void set_skip_buddy(struct sched_entity *se)
+{
+ for_each_sched_entity(se)
+ cfs_rq_of(se)->skip = se;
}
/*
@@ -1843,12 +1888,15 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_
struct sched_entity *se = &curr->se, *pse = &p->se;
struct cfs_rq *cfs_rq = task_cfs_rq(curr);
int scale = cfs_rq->nr_running >= sched_nr_latency;
+ int next_buddy_marked = 0;
if (unlikely(se == pse))
return;
- if (sched_feat(NEXT_BUDDY) && scale && !(wake_flags & WF_FORK))
+ if (sched_feat(NEXT_BUDDY) && scale && !(wake_flags & WF_FORK)) {
set_next_buddy(pse);
+ next_buddy_marked = 1;
+ }
/*
* We can come here with TIF_NEED_RESCHED already set from new task
@@ -1857,16 +1905,18 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_
if (test_tsk_need_resched(curr))
return;
+ /* Idle tasks are by definition preempted by non-idle tasks. */
+ if (unlikely(curr->policy == SCHED_IDLE) &&
+ likely(p->policy != SCHED_IDLE))
+ goto preempt;
+
/*
- * Batch and idle tasks do not preempt (their preemption is driven by
- * the tick):
+ * Batch and idle tasks do not preempt non-idle tasks (their preemption
+ * is driven by the tick):
*/
if (unlikely(p->policy != SCHED_NORMAL))
return;
- /* Idle tasks are by definition preempted by everybody. */
- if (unlikely(curr->policy == SCHED_IDLE))
- goto preempt;
if (!sched_feat(WAKEUP_PREEMPT))
return;
@@ -1874,8 +1924,15 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_
update_curr(cfs_rq);
find_matching_se(&se, &pse);
BUG_ON(!pse);
- if (wakeup_preempt_entity(se, pse) == 1)
+ if (wakeup_preempt_entity(se, pse) == 1) {
+ /*
+ * Bias pick_next to pick the sched entity that is
+ * triggering this preemption.
+ */
+ if (!next_buddy_marked)
+ set_next_buddy(pse);
goto preempt;
+ }
return;
@@ -1932,6 +1989,51 @@ static void put_prev_task_fair(struct rq *rq, struct task_struct *prev)
}
}
+/*
+ * sched_yield() is very simple
+ *
+ * The magic of dealing with the ->skip buddy is in pick_next_entity.
+ */
+static void yield_task_fair(struct rq *rq)
+{
+ struct task_struct *curr = rq->curr;
+ struct cfs_rq *cfs_rq = task_cfs_rq(curr);
+ struct sched_entity *se = &curr->se;
+
+ /*
+ * Are we the only task in the tree?
+ */
+ if (unlikely(rq->nr_running == 1))
+ return;
+
+ clear_buddies(cfs_rq, se);
+
+ if (curr->policy != SCHED_BATCH) {
+ update_rq_clock(rq);
+ /*
+ * Update run-time statistics of the 'current'.
+ */
+ update_curr(cfs_rq);
+ }
+
+ set_skip_buddy(se);
+}
+
+static bool yield_to_task_fair(struct rq *rq, struct task_struct *p, bool preempt)
+{
+ struct sched_entity *se = &p->se;
+
+ if (!se->on_rq)
+ return false;
+
+ /* Tell the scheduler that we'd really like pse to run next. */
+ set_next_buddy(se);
+
+ yield_task_fair(rq);
+
+ return true;
+}
+
#ifdef CONFIG_SMP
/**************************************************
* Fair scheduling class load-balancing methods:
@@ -2041,23 +2143,22 @@ static unsigned long
balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
unsigned long max_load_move, struct sched_domain *sd,
enum cpu_idle_type idle, int *all_pinned,
- int *this_best_prio, struct cfs_rq *busiest_cfs_rq)
+ struct cfs_rq *busiest_cfs_rq)
{
- int loops = 0, pulled = 0, pinned = 0;
+ int loops = 0, pulled = 0;
long rem_load_move = max_load_move;
struct task_struct *p, *n;
if (max_load_move == 0)
goto out;
- pinned = 1;
-
list_for_each_entry_safe(p, n, &busiest_cfs_rq->tasks, se.group_node) {
if (loops++ > sysctl_sched_nr_migrate)
break;
if ((p->se.load.weight >> 1) > rem_load_move ||
- !can_migrate_task(p, busiest, this_cpu, sd, idle, &pinned))
+ !can_migrate_task(p, busiest, this_cpu, sd, idle,
+ all_pinned))
continue;
pull_task(busiest, p, this_rq, this_cpu);
@@ -2080,9 +2181,6 @@ balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
*/
if (rem_load_move <= 0)
break;
-
- if (p->prio < *this_best_prio)
- *this_best_prio = p->prio;
}
out:
/*
@@ -2092,9 +2190,6 @@ out:
*/
schedstat_add(sd, lb_gained[idle], pulled);
- if (all_pinned)
- *all_pinned = pinned;
-
return max_load_move - rem_load_move;
}
@@ -2123,7 +2218,7 @@ static int update_shares_cpu(struct task_group *tg, int cpu)
* We need to update shares after updating tg->load_weight in
* order to adjust the weight of groups with long running tasks.
*/
- update_cfs_shares(cfs_rq, 0);
+ update_cfs_shares(cfs_rq);
raw_spin_unlock_irqrestore(&rq->lock, flags);
@@ -2145,7 +2240,7 @@ static unsigned long
load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
unsigned long max_load_move,
struct sched_domain *sd, enum cpu_idle_type idle,
- int *all_pinned, int *this_best_prio)
+ int *all_pinned)
{
long rem_load_move = max_load_move;
int busiest_cpu = cpu_of(busiest);
@@ -2170,7 +2265,7 @@ load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
rem_load = div_u64(rem_load, busiest_h_load + 1);
moved_load = balance_tasks(this_rq, this_cpu, busiest,
- rem_load, sd, idle, all_pinned, this_best_prio,
+ rem_load, sd, idle, all_pinned,
busiest_cfs_rq);
if (!moved_load)
@@ -2196,11 +2291,11 @@ static unsigned long
load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
unsigned long max_load_move,
struct sched_domain *sd, enum cpu_idle_type idle,
- int *all_pinned, int *this_best_prio)
+ int *all_pinned)
{
return balance_tasks(this_rq, this_cpu, busiest,
max_load_move, sd, idle, all_pinned,
- this_best_prio, &busiest->cfs);
+ &busiest->cfs);
}
#endif
@@ -2217,12 +2312,11 @@ static int move_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
int *all_pinned)
{
unsigned long total_load_moved = 0, load_moved;
- int this_best_prio = this_rq->curr->prio;
do {
load_moved = load_balance_fair(this_rq, this_cpu, busiest,
max_load_move - total_load_moved,
- sd, idle, all_pinned, &this_best_prio);
+ sd, idle, all_pinned);
total_load_moved += load_moved;
@@ -2477,7 +2571,7 @@ static inline int check_power_save_busiest_group(struct sd_lb_stats *sds,
unsigned long default_scale_freq_power(struct sched_domain *sd, int cpu)
{
- return SCHED_LOAD_SCALE;
+ return SCHED_POWER_SCALE;
}
unsigned long __weak arch_scale_freq_power(struct sched_domain *sd, int cpu)
@@ -2514,10 +2608,10 @@ unsigned long scale_rt_power(int cpu)
available = total - rq->rt_avg;
}
- if (unlikely((s64)total < SCHED_LOAD_SCALE))
- total = SCHED_LOAD_SCALE;
+ if (unlikely((s64)total < SCHED_POWER_SCALE))
+ total = SCHED_POWER_SCALE;
- total >>= SCHED_LOAD_SHIFT;
+ total >>= SCHED_POWER_SHIFT;
return div_u64(available, total);
}
@@ -2525,7 +2619,7 @@ unsigned long scale_rt_power(int cpu)
static void update_cpu_power(struct sched_domain *sd, int cpu)
{
unsigned long weight = sd->span_weight;
- unsigned long power = SCHED_LOAD_SCALE;
+ unsigned long power = SCHED_POWER_SCALE;
struct sched_group *sdg = sd->groups;
if ((sd->flags & SD_SHARE_CPUPOWER) && weight > 1) {
@@ -2534,7 +2628,7 @@ static void update_cpu_power(struct sched_domain *sd, int cpu)
else
power *= default_scale_smt_power(sd, cpu);
- power >>= SCHED_LOAD_SHIFT;
+ power >>= SCHED_POWER_SHIFT;
}
sdg->cpu_power_orig = power;
@@ -2544,10 +2638,10 @@ static void update_cpu_power(struct sched_domain *sd, int cpu)
else
power *= default_scale_freq_power(sd, cpu);
- power >>= SCHED_LOAD_SHIFT;
+ power >>= SCHED_POWER_SHIFT;
power *= scale_rt_power(cpu);
- power >>= SCHED_LOAD_SHIFT;
+ power >>= SCHED_POWER_SHIFT;
if (!power)
power = 1;
@@ -2589,9 +2683,9 @@ static inline int
fix_small_capacity(struct sched_domain *sd, struct sched_group *group)
{
/*
- * Only siblings can have significantly less than SCHED_LOAD_SCALE
+ * Only siblings can have significantly less than SCHED_POWER_SCALE
*/
- if (sd->level != SD_LV_SIBLING)
+ if (!(sd->flags & SD_SHARE_CPUPOWER))
return 0;
/*
@@ -2610,7 +2704,6 @@ fix_small_capacity(struct sched_domain *sd, struct sched_group *group)
* @this_cpu: Cpu for which load balance is currently performed.
* @idle: Idle status of this_cpu
* @load_idx: Load index of sched_domain of this_cpu for load calc.
- * @sd_idle: Idle status of the sched_domain containing group.
* @local_group: Does group contain this_cpu.
* @cpus: Set of cpus considered for load balancing.
* @balance: Should we balance.
@@ -2618,7 +2711,7 @@ fix_small_capacity(struct sched_domain *sd, struct sched_group *group)
*/
static inline void update_sg_lb_stats(struct sched_domain *sd,
struct sched_group *group, int this_cpu,
- enum cpu_idle_type idle, int load_idx, int *sd_idle,
+ enum cpu_idle_type idle, int load_idx,
int local_group, const struct cpumask *cpus,
int *balance, struct sg_lb_stats *sgs)
{
@@ -2638,9 +2731,6 @@ static inline void update_sg_lb_stats(struct sched_domain *sd,
for_each_cpu_and(i, sched_group_cpus(group), cpus) {
struct rq *rq = cpu_rq(i);
- if (*sd_idle && rq->nr_running)
- *sd_idle = 0;
-
/* Bias balancing toward cpus of our domain */
if (local_group) {
if (idle_cpu(i) && !first_idle_cpu) {
@@ -2681,11 +2771,11 @@ static inline void update_sg_lb_stats(struct sched_domain *sd,
}
/* Adjust by relative CPU power of the group */
- sgs->avg_load = (sgs->group_load * SCHED_LOAD_SCALE) / group->cpu_power;
+ sgs->avg_load = (sgs->group_load*SCHED_POWER_SCALE) / group->cpu_power;
/*
* Consider the group unbalanced when the imbalance is larger
- * than the average weight of two tasks.
+ * than the average weight of a task.
*
* APZ: with cgroup the avg task weight can vary wildly and
* might not be a suitable number - should we keep a
@@ -2695,10 +2785,11 @@ static inline void update_sg_lb_stats(struct sched_domain *sd,
if (sgs->sum_nr_running)
avg_load_per_task = sgs->sum_weighted_load / sgs->sum_nr_running;
- if ((max_cpu_load - min_cpu_load) > 2*avg_load_per_task && max_nr_running > 1)
+ if ((max_cpu_load - min_cpu_load) >= avg_load_per_task && max_nr_running > 1)
sgs->group_imb = 1;
- sgs->group_capacity = DIV_ROUND_CLOSEST(group->cpu_power, SCHED_LOAD_SCALE);
+ sgs->group_capacity = DIV_ROUND_CLOSEST(group->cpu_power,
+ SCHED_POWER_SCALE);
if (!sgs->group_capacity)
sgs->group_capacity = fix_small_capacity(sd, group);
sgs->group_weight = group->group_weight;
@@ -2755,15 +2846,13 @@ static bool update_sd_pick_busiest(struct sched_domain *sd,
* @sd: sched_domain whose statistics are to be updated.
* @this_cpu: Cpu for which load balance is currently performed.
* @idle: Idle status of this_cpu
- * @sd_idle: Idle status of the sched_domain containing sg.
* @cpus: Set of cpus considered for load balancing.
* @balance: Should we balance.
* @sds: variable to hold the statistics for this sched_domain.
*/
static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu,
- enum cpu_idle_type idle, int *sd_idle,
- const struct cpumask *cpus, int *balance,
- struct sd_lb_stats *sds)
+ enum cpu_idle_type idle, const struct cpumask *cpus,
+ int *balance, struct sd_lb_stats *sds)
{
struct sched_domain *child = sd->child;
struct sched_group *sg = sd->groups;
@@ -2781,7 +2870,7 @@ static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu,
local_group = cpumask_test_cpu(this_cpu, sched_group_cpus(sg));
memset(&sgs, 0, sizeof(sgs));
- update_sg_lb_stats(sd, sg, this_cpu, idle, load_idx, sd_idle,
+ update_sg_lb_stats(sd, sg, this_cpu, idle, load_idx,
local_group, cpus, balance, &sgs);
if (local_group && !(*balance))
@@ -2874,7 +2963,7 @@ static int check_asym_packing(struct sched_domain *sd,
return 0;
*imbalance = DIV_ROUND_CLOSEST(sds->max_load * sds->busiest->cpu_power,
- SCHED_LOAD_SCALE);
+ SCHED_POWER_SCALE);
return 1;
}
@@ -2903,7 +2992,7 @@ static inline void fix_small_imbalance(struct sd_lb_stats *sds,
cpu_avg_load_per_task(this_cpu);
scaled_busy_load_per_task = sds->busiest_load_per_task
- * SCHED_LOAD_SCALE;
+ * SCHED_POWER_SCALE;
scaled_busy_load_per_task /= sds->busiest->cpu_power;
if (sds->max_load - sds->this_load + scaled_busy_load_per_task >=
@@ -2922,10 +3011,10 @@ static inline void fix_small_imbalance(struct sd_lb_stats *sds,
min(sds->busiest_load_per_task, sds->max_load);
pwr_now += sds->this->cpu_power *
min(sds->this_load_per_task, sds->this_load);
- pwr_now /= SCHED_LOAD_SCALE;
+ pwr_now /= SCHED_POWER_SCALE;
/* Amount of load we'd subtract */
- tmp = (sds->busiest_load_per_task * SCHED_LOAD_SCALE) /
+ tmp = (sds->busiest_load_per_task * SCHED_POWER_SCALE) /
sds->busiest->cpu_power;
if (sds->max_load > tmp)
pwr_move += sds->busiest->cpu_power *
@@ -2933,15 +3022,15 @@ static inline void fix_small_imbalance(struct sd_lb_stats *sds,
/* Amount of load we'd add */
if (sds->max_load * sds->busiest->cpu_power <
- sds->busiest_load_per_task * SCHED_LOAD_SCALE)
+ sds->busiest_load_per_task * SCHED_POWER_SCALE)
tmp = (sds->max_load * sds->busiest->cpu_power) /
sds->this->cpu_power;
else
- tmp = (sds->busiest_load_per_task * SCHED_LOAD_SCALE) /
+ tmp = (sds->busiest_load_per_task * SCHED_POWER_SCALE) /
sds->this->cpu_power;
pwr_move += sds->this->cpu_power *
min(sds->this_load_per_task, sds->this_load + tmp);
- pwr_move /= SCHED_LOAD_SCALE;
+ pwr_move /= SCHED_POWER_SCALE;
/* Move if we gain throughput */
if (pwr_move > pwr_now)
@@ -2983,7 +3072,7 @@ static inline void calculate_imbalance(struct sd_lb_stats *sds, int this_cpu,
load_above_capacity = (sds->busiest_nr_running -
sds->busiest_group_capacity);
- load_above_capacity *= (SCHED_LOAD_SCALE * SCHED_LOAD_SCALE);
+ load_above_capacity *= (SCHED_LOAD_SCALE * SCHED_POWER_SCALE);
load_above_capacity /= sds->busiest->cpu_power;
}
@@ -3003,11 +3092,11 @@ static inline void calculate_imbalance(struct sd_lb_stats *sds, int this_cpu,
/* How much load to actually move to equalise the imbalance */
*imbalance = min(max_pull * sds->busiest->cpu_power,
(sds->avg_load - sds->this_load) * sds->this->cpu_power)
- / SCHED_LOAD_SCALE;
+ / SCHED_POWER_SCALE;
/*
* if *imbalance is less than the average load per runnable task
- * there is no gaurantee that any tasks will be moved so we'll have
+ * there is no guarantee that any tasks will be moved so we'll have
* a think about bumping its value to force at least one task to be
* moved
*/
@@ -3033,7 +3122,6 @@ static inline void calculate_imbalance(struct sd_lb_stats *sds, int this_cpu,
* @imbalance: Variable which stores amount of weighted load which should
* be moved to restore balance/put a group to idle.
* @idle: The idle status of this_cpu.
- * @sd_idle: The idleness of sd
* @cpus: The set of CPUs under consideration for load-balancing.
* @balance: Pointer to a variable indicating if this_cpu
* is the appropriate cpu to perform load balancing at this_level.
@@ -3046,7 +3134,7 @@ static inline void calculate_imbalance(struct sd_lb_stats *sds, int this_cpu,
static struct sched_group *
find_busiest_group(struct sched_domain *sd, int this_cpu,
unsigned long *imbalance, enum cpu_idle_type idle,
- int *sd_idle, const struct cpumask *cpus, int *balance)
+ const struct cpumask *cpus, int *balance)
{
struct sd_lb_stats sds;
@@ -3056,22 +3144,11 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
* Compute the various statistics relavent for load balancing at
* this level.
*/
- update_sd_lb_stats(sd, this_cpu, idle, sd_idle, cpus,
- balance, &sds);
-
- /* Cases where imbalance does not exist from POV of this_cpu */
- /* 1) this_cpu is not the appropriate cpu to perform load balancing
- * at this level.
- * 2) There is no busy sibling group to pull from.
- * 3) This group is the busiest group.
- * 4) This group is more busy than the avg busieness at this
- * sched_domain.
- * 5) The imbalance is within the specified limit.
- *
- * Note: when doing newidle balance, if the local group has excess
- * capacity (i.e. nr_running < group_capacity) and the busiest group
- * does not have any capacity, we force a load balance to pull tasks
- * to the local group. In this case, we skip past checks 3, 4 and 5.
+ update_sd_lb_stats(sd, this_cpu, idle, cpus, balance, &sds);
+
+ /*
+ * this_cpu is not the appropriate cpu to perform load balancing at
+ * this level.
*/
if (!(*balance))
goto ret;
@@ -3080,41 +3157,56 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
check_asym_packing(sd, &sds, this_cpu, imbalance))
return sds.busiest;
+ /* There is no busy sibling group to pull tasks from */
if (!sds.busiest || sds.busiest_nr_running == 0)
goto out_balanced;
- /* SD_BALANCE_NEWIDLE trumps SMP nice when underutilized */
+ sds.avg_load = (SCHED_POWER_SCALE * sds.total_load) / sds.total_pwr;
+
+ /*
+ * If the busiest group is imbalanced the below checks don't
+ * work because they assumes all things are equal, which typically
+ * isn't true due to cpus_allowed constraints and the like.
+ */
+ if (sds.group_imb)
+ goto force_balance;
+
+ /* SD_BALANCE_NEWIDLE trumps SMP nice when underutilized */
if (idle == CPU_NEWLY_IDLE && sds.this_has_capacity &&
!sds.busiest_has_capacity)
goto force_balance;
+ /*
+ * If the local group is more busy than the selected busiest group
+ * don't try and pull any tasks.
+ */
if (sds.this_load >= sds.max_load)
goto out_balanced;
- sds.avg_load = (SCHED_LOAD_SCALE * sds.total_load) / sds.total_pwr;
-
+ /*
+ * Don't pull any tasks if this group is already above the domain
+ * average load.
+ */
if (sds.this_load >= sds.avg_load)
goto out_balanced;
- /*
- * In the CPU_NEWLY_IDLE, use imbalance_pct to be conservative.
- * And to check for busy balance use !idle_cpu instead of
- * CPU_NOT_IDLE. This is because HT siblings will use CPU_NOT_IDLE
- * even when they are idle.
- */
- if (idle == CPU_NEWLY_IDLE || !idle_cpu(this_cpu)) {
- if (100 * sds.max_load <= sd->imbalance_pct * sds.this_load)
- goto out_balanced;
- } else {
+ if (idle == CPU_IDLE) {
/*
* This cpu is idle. If the busiest group load doesn't
* have more tasks than the number of available cpu's and
* there is no imbalance between this and busiest group
* wrt to idle cpu's, it is balanced.
*/
- if ((sds.this_idle_cpus <= sds.busiest_idle_cpus + 1) &&
+ if ((sds.this_idle_cpus <= sds.busiest_idle_cpus + 1) &&
sds.busiest_nr_running <= sds.busiest_group_weight)
goto out_balanced;
+ } else {
+ /*
+ * In the CPU_NEWLY_IDLE, CPU_NOT_IDLE cases, use
+ * imbalance_pct to be conservative.
+ */
+ if (100 * sds.max_load <= sd->imbalance_pct * sds.this_load)
+ goto out_balanced;
}
force_balance:
@@ -3148,7 +3240,8 @@ find_busiest_queue(struct sched_domain *sd, struct sched_group *group,
for_each_cpu(i, sched_group_cpus(group)) {
unsigned long power = power_of(i);
- unsigned long capacity = DIV_ROUND_CLOSEST(power, SCHED_LOAD_SCALE);
+ unsigned long capacity = DIV_ROUND_CLOSEST(power,
+ SCHED_POWER_SCALE);
unsigned long wl;
if (!capacity)
@@ -3173,7 +3266,7 @@ find_busiest_queue(struct sched_domain *sd, struct sched_group *group,
* the load can be moved away from the cpu that is potentially
* running at a lower capacity.
*/
- wl = (wl * SCHED_LOAD_SCALE) / power;
+ wl = (wl * SCHED_POWER_SCALE) / power;
if (wl > max_load) {
max_load = wl;
@@ -3193,7 +3286,7 @@ find_busiest_queue(struct sched_domain *sd, struct sched_group *group,
/* Working cpumask for load_balance and load_balance_newidle. */
static DEFINE_PER_CPU(cpumask_var_t, load_balance_tmpmask);
-static int need_active_balance(struct sched_domain *sd, int sd_idle, int idle,
+static int need_active_balance(struct sched_domain *sd, int idle,
int busiest_cpu, int this_cpu)
{
if (idle == CPU_NEWLY_IDLE) {
@@ -3225,10 +3318,6 @@ static int need_active_balance(struct sched_domain *sd, int sd_idle, int idle,
* move_tasks() will succeed. ld_moved will be true and this
* active balance code will not be triggered.
*/
- if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
- !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
- return 0;
-
if (sched_mc_power_savings < POWERSAVINGS_BALANCE_WAKEUP)
return 0;
}
@@ -3246,7 +3335,7 @@ static int load_balance(int this_cpu, struct rq *this_rq,
struct sched_domain *sd, enum cpu_idle_type idle,
int *balance)
{
- int ld_moved, all_pinned = 0, active_balance = 0, sd_idle = 0;
+ int ld_moved, all_pinned = 0, active_balance = 0;
struct sched_group *group;
unsigned long imbalance;
struct rq *busiest;
@@ -3255,20 +3344,10 @@ static int load_balance(int this_cpu, struct rq *this_rq,
cpumask_copy(cpus, cpu_active_mask);
- /*
- * When power savings policy is enabled for the parent domain, idle
- * sibling can pick up load irrespective of busy siblings. In this case,
- * let the state of idle sibling percolate up as CPU_IDLE, instead of
- * portraying it as CPU_NOT_IDLE.
- */
- if (idle != CPU_NOT_IDLE && sd->flags & SD_SHARE_CPUPOWER &&
- !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
- sd_idle = 1;
-
schedstat_inc(sd, lb_count[idle]);
redo:
- group = find_busiest_group(sd, this_cpu, &imbalance, idle, &sd_idle,
+ group = find_busiest_group(sd, this_cpu, &imbalance, idle,
cpus, balance);
if (*balance == 0)
@@ -3297,6 +3376,7 @@ redo:
* still unbalanced. ld_moved simply stays zero, so it is
* correctly treated as an imbalance.
*/
+ all_pinned = 1;
local_irq_save(flags);
double_rq_lock(this_rq, busiest);
ld_moved = move_tasks(this_rq, this_cpu, busiest,
@@ -3330,8 +3410,7 @@ redo:
if (idle != CPU_NEWLY_IDLE)
sd->nr_balance_failed++;
- if (need_active_balance(sd, sd_idle, idle, cpu_of(busiest),
- this_cpu)) {
+ if (need_active_balance(sd, idle, cpu_of(busiest), this_cpu)) {
raw_spin_lock_irqsave(&busiest->lock, flags);
/* don't kick the active_load_balance_cpu_stop,
@@ -3386,10 +3465,6 @@ redo:
sd->balance_interval *= 2;
}
- if (!ld_moved && !sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
- !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
- ld_moved = -1;
-
goto out;
out_balanced:
@@ -3403,11 +3478,7 @@ out_one_pinned:
(sd->balance_interval < sd->max_interval))
sd->balance_interval *= 2;
- if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
- !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
- ld_moved = -1;
- else
- ld_moved = 0;
+ ld_moved = 0;
out:
return ld_moved;
}
@@ -3433,6 +3504,7 @@ static void idle_balance(int this_cpu, struct rq *this_rq)
raw_spin_unlock(&this_rq->lock);
update_shares(this_cpu);
+ rcu_read_lock();
for_each_domain(this_cpu, sd) {
unsigned long interval;
int balance = 1;
@@ -3454,6 +3526,7 @@ static void idle_balance(int this_cpu, struct rq *this_rq)
break;
}
}
+ rcu_read_unlock();
raw_spin_lock(&this_rq->lock);
@@ -3502,6 +3575,7 @@ static int active_load_balance_cpu_stop(void *data)
double_lock_balance(busiest_rq, target_rq);
/* Search for an sd spanning us and the target CPU. */
+ rcu_read_lock();
for_each_domain(target_cpu, sd) {
if ((sd->flags & SD_LOAD_BALANCE) &&
cpumask_test_cpu(busiest_cpu, sched_domain_span(sd)))
@@ -3517,6 +3591,7 @@ static int active_load_balance_cpu_stop(void *data)
else
schedstat_inc(sd, alb_failed);
}
+ rcu_read_unlock();
double_unlock_balance(busiest_rq, target_rq);
out_unlock:
busiest_rq->active_balance = 0;
@@ -3643,6 +3718,7 @@ static int find_new_ilb(int cpu)
{
struct sched_domain *sd;
struct sched_group *ilb_group;
+ int ilb = nr_cpu_ids;
/*
* Have idle load balancer selection from semi-idle packages only
@@ -3658,20 +3734,25 @@ static int find_new_ilb(int cpu)
if (cpumask_weight(nohz.idle_cpus_mask) < 2)
goto out_done;
+ rcu_read_lock();
for_each_flag_domain(cpu, sd, SD_POWERSAVINGS_BALANCE) {
ilb_group = sd->groups;
do {
- if (is_semi_idle_group(ilb_group))
- return cpumask_first(nohz.grp_idle_mask);
+ if (is_semi_idle_group(ilb_group)) {
+ ilb = cpumask_first(nohz.grp_idle_mask);
+ goto unlock;
+ }
ilb_group = ilb_group->next;
} while (ilb_group != sd->groups);
}
+unlock:
+ rcu_read_unlock();
out_done:
- return nr_cpu_ids;
+ return ilb;
}
#else /* (CONFIG_SCHED_MC || CONFIG_SCHED_SMT) */
static inline int find_new_ilb(int call_cpu)
@@ -3786,6 +3867,17 @@ void select_nohz_load_balancer(int stop_tick)
static DEFINE_SPINLOCK(balancing);
+static unsigned long __read_mostly max_load_balance_interval = HZ/10;
+
+/*
+ * Scale the max load_balance interval with the number of CPUs in the system.
+ * This trades load-balance latency on larger machines for less cross talk.
+ */
+static void update_max_interval(void)
+{
+ max_load_balance_interval = HZ*num_online_cpus()/10;
+}
+
/*
* It checks each scheduling domain to see if it is due to be balanced,
* and initiates a balancing operation if so.
@@ -3805,6 +3897,7 @@ static void rebalance_domains(int cpu, enum cpu_idle_type idle)
update_shares(cpu);
+ rcu_read_lock();
for_each_domain(cpu, sd) {
if (!(sd->flags & SD_LOAD_BALANCE))
continue;
@@ -3815,10 +3908,7 @@ static void rebalance_domains(int cpu, enum cpu_idle_type idle)
/* scale ms to jiffies */
interval = msecs_to_jiffies(interval);
- if (unlikely(!interval))
- interval = 1;
- if (interval > HZ*NR_CPUS/10)
- interval = HZ*NR_CPUS/10;
+ interval = clamp(interval, 1UL, max_load_balance_interval);
need_serialize = sd->flags & SD_SERIALIZE;
@@ -3831,8 +3921,7 @@ static void rebalance_domains(int cpu, enum cpu_idle_type idle)
if (load_balance(cpu, rq, sd, idle, &balance)) {
/*
* We've pulled tasks over so either we're no
- * longer idle, or one of our SMT siblings is
- * not idle.
+ * longer idle.
*/
idle = CPU_NOT_IDLE;
}
@@ -3854,6 +3943,7 @@ out:
if (!balance)
break;
}
+ rcu_read_unlock();
/*
* next_balance will be updated only when there is a need.
@@ -4079,33 +4169,62 @@ static void task_fork_fair(struct task_struct *p)
* Priority of the task has changed. Check to see if we preempt
* the current task.
*/
-static void prio_changed_fair(struct rq *rq, struct task_struct *p,
- int oldprio, int running)
+static void
+prio_changed_fair(struct rq *rq, struct task_struct *p, int oldprio)
{
+ if (!p->se.on_rq)
+ return;
+
/*
* Reschedule if we are currently running on this runqueue and
* our priority decreased, or if we are not currently running on
* this runqueue and our priority is higher than the current's
*/
- if (running) {
+ if (rq->curr == p) {
if (p->prio > oldprio)
resched_task(rq->curr);
} else
check_preempt_curr(rq, p, 0);
}
+static void switched_from_fair(struct rq *rq, struct task_struct *p)
+{
+ struct sched_entity *se = &p->se;
+ struct cfs_rq *cfs_rq = cfs_rq_of(se);
+
+ /*
+ * Ensure the task's vruntime is normalized, so that when its
+ * switched back to the fair class the enqueue_entity(.flags=0) will
+ * do the right thing.
+ *
+ * If it was on_rq, then the dequeue_entity(.flags=0) will already
+ * have normalized the vruntime, if it was !on_rq, then only when
+ * the task is sleeping will it still have non-normalized vruntime.
+ */
+ if (!se->on_rq && p->state != TASK_RUNNING) {
+ /*
+ * Fix up our vruntime so that the current sleep doesn't
+ * cause 'unlimited' sleep bonus.
+ */
+ place_entity(cfs_rq, se, 0);
+ se->vruntime -= cfs_rq->min_vruntime;
+ }
+}
+
/*
* We switched to the sched_fair class.
*/
-static void switched_to_fair(struct rq *rq, struct task_struct *p,
- int running)
+static void switched_to_fair(struct rq *rq, struct task_struct *p)
{
+ if (!p->se.on_rq)
+ return;
+
/*
* We were most likely switched from sched_rt, so
* kick off the schedule if running, otherwise just see
* if we can still preempt the current task.
*/
- if (running)
+ if (rq->curr == p)
resched_task(rq->curr);
else
check_preempt_curr(rq, p, 0);
@@ -4171,6 +4290,7 @@ static const struct sched_class fair_sched_class = {
.enqueue_task = enqueue_task_fair,
.dequeue_task = dequeue_task_fair,
.yield_task = yield_task_fair,
+ .yield_to_task = yield_to_task_fair,
.check_preempt_curr = check_preempt_wakeup,
@@ -4191,6 +4311,7 @@ static const struct sched_class fair_sched_class = {
.task_fork = task_fork_fair,
.prio_changed = prio_changed_fair,
+ .switched_from = switched_from_fair,
.switched_to = switched_to_fair,
.get_rr_interval = get_rr_interval_fair,
diff --git a/kernel/sched_features.h b/kernel/sched_features.h
index 68e69acc29b9..be40f7371ee1 100644
--- a/kernel/sched_features.h
+++ b/kernel/sched_features.h
@@ -64,3 +64,9 @@ SCHED_FEAT(OWNER_SPIN, 1)
* Decrement CPU power based on irq activity
*/
SCHED_FEAT(NONIRQ_POWER, 1)
+
+/*
+ * Queue remote wakeups on the target CPU and process them
+ * using the scheduler IPI. Reduces rq->lock contention/bounces.
+ */
+SCHED_FEAT(TTWU_QUEUE, 1)
diff --git a/kernel/sched_idletask.c b/kernel/sched_idletask.c
index 9fa0f402c87c..0a51882534ea 100644
--- a/kernel/sched_idletask.c
+++ b/kernel/sched_idletask.c
@@ -7,7 +7,7 @@
#ifdef CONFIG_SMP
static int
-select_task_rq_idle(struct rq *rq, struct task_struct *p, int sd_flag, int flags)
+select_task_rq_idle(struct task_struct *p, int sd_flag, int flags)
{
return task_cpu(p); /* IDLE tasks as never migrated */
}
@@ -52,31 +52,15 @@ static void set_curr_task_idle(struct rq *rq)
{
}
-static void switched_to_idle(struct rq *rq, struct task_struct *p,
- int running)
+static void switched_to_idle(struct rq *rq, struct task_struct *p)
{
- /* Can this actually happen?? */
- if (running)
- resched_task(rq->curr);
- else
- check_preempt_curr(rq, p, 0);
+ BUG();
}
-static void prio_changed_idle(struct rq *rq, struct task_struct *p,
- int oldprio, int running)
+static void
+prio_changed_idle(struct rq *rq, struct task_struct *p, int oldprio)
{
- /* This can happen for hot plug CPUS */
-
- /*
- * Reschedule if we are currently running on this runqueue and
- * our priority decreased, or if we are not currently running on
- * this runqueue and our priority is higher than the current's
- */
- if (running) {
- if (p->prio > oldprio)
- resched_task(rq->curr);
- } else
- check_preempt_curr(rq, p, 0);
+ BUG();
}
static unsigned int get_rr_interval_idle(struct rq *rq, struct task_struct *task)
@@ -110,6 +94,4 @@ static const struct sched_class idle_sched_class = {
.prio_changed = prio_changed_idle,
.switched_to = switched_to_idle,
-
- /* no .task_new for idle tasks */
};
diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c
index 01f75a5f17af..88725c939e0b 100644
--- a/kernel/sched_rt.c
+++ b/kernel/sched_rt.c
@@ -183,6 +183,14 @@ static inline u64 sched_rt_period(struct rt_rq *rt_rq)
return ktime_to_ns(rt_rq->tg->rt_bandwidth.rt_period);
}
+typedef struct task_group *rt_rq_iter_t;
+
+#define for_each_rt_rq(rt_rq, iter, rq) \
+ for (iter = list_entry_rcu(task_groups.next, typeof(*iter), list); \
+ (&iter->list != &task_groups) && \
+ (rt_rq = iter->rt_rq[cpu_of(rq)]); \
+ iter = list_entry_rcu(iter->list.next, typeof(*iter), list))
+
static inline void list_add_leaf_rt_rq(struct rt_rq *rt_rq)
{
list_add_rcu(&rt_rq->leaf_rt_rq_list,
@@ -288,6 +296,11 @@ static inline u64 sched_rt_period(struct rt_rq *rt_rq)
return ktime_to_ns(def_rt_bandwidth.rt_period);
}
+typedef struct rt_rq *rt_rq_iter_t;
+
+#define for_each_rt_rq(rt_rq, iter, rq) \
+ for ((void) iter, rt_rq = &rq->rt; rt_rq; rt_rq = NULL)
+
static inline void list_add_leaf_rt_rq(struct rt_rq *rt_rq)
{
}
@@ -402,12 +415,13 @@ next:
static void __disable_runtime(struct rq *rq)
{
struct root_domain *rd = rq->rd;
+ rt_rq_iter_t iter;
struct rt_rq *rt_rq;
if (unlikely(!scheduler_running))
return;
- for_each_leaf_rt_rq(rt_rq, rq) {
+ for_each_rt_rq(rt_rq, iter, rq) {
struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq);
s64 want;
int i;
@@ -487,6 +501,7 @@ static void disable_runtime(struct rq *rq)
static void __enable_runtime(struct rq *rq)
{
+ rt_rq_iter_t iter;
struct rt_rq *rt_rq;
if (unlikely(!scheduler_running))
@@ -495,7 +510,7 @@ static void __enable_runtime(struct rq *rq)
/*
* Reset each runqueue's bandwidth settings
*/
- for_each_leaf_rt_rq(rt_rq, rq) {
+ for_each_rt_rq(rt_rq, iter, rq) {
struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq);
raw_spin_lock(&rt_b->rt_runtime_lock);
@@ -562,6 +577,13 @@ static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun)
if (rt_rq->rt_throttled && rt_rq->rt_time < runtime) {
rt_rq->rt_throttled = 0;
enqueue = 1;
+
+ /*
+ * Force a clock update if the CPU was idle,
+ * lest wakeup -> unthrottle time accumulate.
+ */
+ if (rt_rq->rt_nr_running && rq->curr == rq->idle)
+ rq->skip_clock_update = -1;
}
if (rt_rq->rt_time || rt_rq->rt_nr_running)
idle = 0;
@@ -977,13 +999,23 @@ static void yield_task_rt(struct rq *rq)
static int find_lowest_rq(struct task_struct *task);
static int
-select_task_rq_rt(struct rq *rq, struct task_struct *p, int sd_flag, int flags)
+select_task_rq_rt(struct task_struct *p, int sd_flag, int flags)
{
+ struct task_struct *curr;
+ struct rq *rq;
+ int cpu;
+
if (sd_flag != SD_BALANCE_WAKE)
return smp_processor_id();
+ cpu = task_cpu(p);
+ rq = cpu_rq(cpu);
+
+ rcu_read_lock();
+ curr = ACCESS_ONCE(rq->curr); /* unlocked access */
+
/*
- * If the current task is an RT task, then
+ * If the current task on @p's runqueue is an RT task, then
* try to see if we can wake this RT task up on another
* runqueue. Otherwise simply start this RT task
* on its current runqueue.
@@ -997,21 +1029,25 @@ select_task_rq_rt(struct rq *rq, struct task_struct *p, int sd_flag, int flags)
* lock?
*
* For equal prio tasks, we just let the scheduler sort it out.
+ *
+ * Otherwise, just let it ride on the affined RQ and the
+ * post-schedule router will push the preempted task away
+ *
+ * This test is optimistic, if we get it wrong the load-balancer
+ * will have to sort it out.
*/
- if (unlikely(rt_task(rq->curr)) &&
- (rq->curr->rt.nr_cpus_allowed < 2 ||
- rq->curr->prio < p->prio) &&
+ if (curr && unlikely(rt_task(curr)) &&
+ (curr->rt.nr_cpus_allowed < 2 ||
+ curr->prio < p->prio) &&
(p->rt.nr_cpus_allowed > 1)) {
- int cpu = find_lowest_rq(p);
+ int target = find_lowest_rq(p);
- return (cpu == -1) ? task_cpu(p) : cpu;
+ if (target != -1)
+ cpu = target;
}
+ rcu_read_unlock();
- /*
- * Otherwise, just let it ride on the affined RQ and the
- * post-schedule router will push the preempted task away
- */
- return task_cpu(p);
+ return cpu;
}
static void check_preempt_equal_prio(struct rq *rq, struct task_struct *p)
@@ -1136,7 +1172,7 @@ static void put_prev_task_rt(struct rq *rq, struct task_struct *p)
* The previous task needs to be made eligible for pushing
* if it is still active
*/
- if (p->se.on_rq && p->rt.nr_cpus_allowed > 1)
+ if (on_rt_rq(&p->rt) && p->rt.nr_cpus_allowed > 1)
enqueue_pushable_task(rq, p);
}
@@ -1227,6 +1263,7 @@ static int find_lowest_rq(struct task_struct *task)
if (!cpumask_test_cpu(this_cpu, lowest_mask))
this_cpu = -1; /* Skip this_cpu opt if not among lowest */
+ rcu_read_lock();
for_each_domain(cpu, sd) {
if (sd->flags & SD_WAKE_AFFINE) {
int best_cpu;
@@ -1236,15 +1273,20 @@ static int find_lowest_rq(struct task_struct *task)
* remote processor.
*/
if (this_cpu != -1 &&
- cpumask_test_cpu(this_cpu, sched_domain_span(sd)))
+ cpumask_test_cpu(this_cpu, sched_domain_span(sd))) {
+ rcu_read_unlock();
return this_cpu;
+ }
best_cpu = cpumask_first_and(lowest_mask,
sched_domain_span(sd));
- if (best_cpu < nr_cpu_ids)
+ if (best_cpu < nr_cpu_ids) {
+ rcu_read_unlock();
return best_cpu;
+ }
}
}
+ rcu_read_unlock();
/*
* And finally, if there were no matches within the domains
@@ -1287,7 +1329,7 @@ static struct rq *find_lock_lowest_rq(struct task_struct *task, struct rq *rq)
!cpumask_test_cpu(lowest_rq->cpu,
&task->cpus_allowed) ||
task_running(rq, task) ||
- !task->se.on_rq)) {
+ !task->on_rq)) {
raw_spin_unlock(&lowest_rq->lock);
lowest_rq = NULL;
@@ -1321,7 +1363,7 @@ static struct task_struct *pick_next_pushable_task(struct rq *rq)
BUG_ON(task_current(rq, p));
BUG_ON(p->rt.nr_cpus_allowed <= 1);
- BUG_ON(!p->se.on_rq);
+ BUG_ON(!p->on_rq);
BUG_ON(!rt_task(p));
return p;
@@ -1378,7 +1420,7 @@ retry:
task = pick_next_pushable_task(rq);
if (task_cpu(next_task) == rq->cpu && task == next_task) {
/*
- * If we get here, the task hasnt moved at all, but
+ * If we get here, the task hasn't moved at all, but
* it has failed to push. We will not try again,
* since the other cpus will pull from us when they
* are ready.
@@ -1467,7 +1509,7 @@ static int pull_rt_task(struct rq *this_rq)
*/
if (p && (p->prio < this_rq->rt.highest_prio.curr)) {
WARN_ON(p == src_rq->curr);
- WARN_ON(!p->se.on_rq);
+ WARN_ON(!p->on_rq);
/*
* There's a chance that p is higher in priority
@@ -1488,7 +1530,7 @@ static int pull_rt_task(struct rq *this_rq)
/*
* We continue with the search, just in
* case there's an even higher prio task
- * in another runqueue. (low likelyhood
+ * in another runqueue. (low likelihood
* but possible)
*/
}
@@ -1538,7 +1580,7 @@ static void set_cpus_allowed_rt(struct task_struct *p,
* Update the migration status of the RQ if we have an RT task
* which is running AND changing its weight value.
*/
- if (p->se.on_rq && (weight != p->rt.nr_cpus_allowed)) {
+ if (p->on_rq && (weight != p->rt.nr_cpus_allowed)) {
struct rq *rq = task_rq(p);
if (!task_current(rq, p)) {
@@ -1599,8 +1641,7 @@ static void rq_offline_rt(struct rq *rq)
* When switch from the rt queue, we bring ourselves to a position
* that we might want to pull RT tasks from other runqueues.
*/
-static void switched_from_rt(struct rq *rq, struct task_struct *p,
- int running)
+static void switched_from_rt(struct rq *rq, struct task_struct *p)
{
/*
* If there are other RT tasks then we will reschedule
@@ -1609,7 +1650,7 @@ static void switched_from_rt(struct rq *rq, struct task_struct *p,
* we may need to handle the pulling of RT tasks
* now.
*/
- if (!rq->rt.rt_nr_running)
+ if (p->on_rq && !rq->rt.rt_nr_running)
pull_rt_task(rq);
}
@@ -1628,8 +1669,7 @@ static inline void init_sched_rt_class(void)
* with RT tasks. In this case we try to push them off to
* other runqueues.
*/
-static void switched_to_rt(struct rq *rq, struct task_struct *p,
- int running)
+static void switched_to_rt(struct rq *rq, struct task_struct *p)
{
int check_resched = 1;
@@ -1640,7 +1680,7 @@ static void switched_to_rt(struct rq *rq, struct task_struct *p,
* If that current running task is also an RT task
* then see if we can move to another run queue.
*/
- if (!running) {
+ if (p->on_rq && rq->curr != p) {
#ifdef CONFIG_SMP
if (rq->rt.overloaded && push_rt_task(rq) &&
/* Don't resched if we changed runqueues */
@@ -1656,10 +1696,13 @@ static void switched_to_rt(struct rq *rq, struct task_struct *p,
* Priority of the task has changed. This may cause
* us to initiate a push or pull.
*/
-static void prio_changed_rt(struct rq *rq, struct task_struct *p,
- int oldprio, int running)
+static void
+prio_changed_rt(struct rq *rq, struct task_struct *p, int oldprio)
{
- if (running) {
+ if (!p->on_rq)
+ return;
+
+ if (rq->curr == p) {
#ifdef CONFIG_SMP
/*
* If our priority decreases while running, we
@@ -1795,10 +1838,11 @@ extern void print_rt_rq(struct seq_file *m, int cpu, struct rt_rq *rt_rq);
static void print_rt_stats(struct seq_file *m, int cpu)
{
+ rt_rq_iter_t iter;
struct rt_rq *rt_rq;
rcu_read_lock();
- for_each_leaf_rt_rq(rt_rq, cpu_rq(cpu))
+ for_each_rt_rq(rt_rq, iter, cpu_rq(cpu))
print_rt_rq(m, cpu, rt_rq);
rcu_read_unlock();
}
diff --git a/kernel/sched_stats.h b/kernel/sched_stats.h
index 48ddf431db0e..331e01bcd026 100644
--- a/kernel/sched_stats.h
+++ b/kernel/sched_stats.h
@@ -37,7 +37,7 @@ static int show_schedstat(struct seq_file *seq, void *v)
#ifdef CONFIG_SMP
/* domain-specific stats */
- preempt_disable();
+ rcu_read_lock();
for_each_domain(cpu, sd) {
enum cpu_idle_type itype;
@@ -64,7 +64,7 @@ static int show_schedstat(struct seq_file *seq, void *v)
sd->ttwu_wake_remote, sd->ttwu_move_affine,
sd->ttwu_move_balance);
}
- preempt_enable();
+ rcu_read_unlock();
#endif
}
kfree(mask_str);
diff --git a/kernel/sched_stoptask.c b/kernel/sched_stoptask.c
index 2bf6b47058c1..6f437632afab 100644
--- a/kernel/sched_stoptask.c
+++ b/kernel/sched_stoptask.c
@@ -9,8 +9,7 @@
#ifdef CONFIG_SMP
static int
-select_task_rq_stop(struct rq *rq, struct task_struct *p,
- int sd_flag, int flags)
+select_task_rq_stop(struct task_struct *p, int sd_flag, int flags)
{
return task_cpu(p); /* stop tasks as never migrate */
}
@@ -26,7 +25,7 @@ static struct task_struct *pick_next_task_stop(struct rq *rq)
{
struct task_struct *stop = rq->stop;
- if (stop && stop->se.on_rq)
+ if (stop && stop->on_rq)
return stop;
return NULL;
@@ -59,14 +58,13 @@ static void set_curr_task_stop(struct rq *rq)
{
}
-static void switched_to_stop(struct rq *rq, struct task_struct *p,
- int running)
+static void switched_to_stop(struct rq *rq, struct task_struct *p)
{
BUG(); /* its impossible to change to this class */
}
-static void prio_changed_stop(struct rq *rq, struct task_struct *p,
- int oldprio, int running)
+static void
+prio_changed_stop(struct rq *rq, struct task_struct *p, int oldprio)
{
BUG(); /* how!?, what priority? */
}
@@ -103,6 +101,4 @@ static const struct sched_class stop_sched_class = {
.prio_changed = prio_changed_stop,
.switched_to = switched_to_stop,
-
- /* no .task_new for stop tasks */
};
diff --git a/kernel/signal.c b/kernel/signal.c
index 4e3cff10fdce..86c32b884f8e 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -124,7 +124,7 @@ static inline int has_pending_signals(sigset_t *signal, sigset_t *blocked)
static int recalc_sigpending_tsk(struct task_struct *t)
{
- if (t->signal->group_stop_count > 0 ||
+ if ((t->group_stop & GROUP_STOP_PENDING) ||
PENDING(&t->pending, &t->blocked) ||
PENDING(&t->signal->shared_pending, &t->blocked)) {
set_tsk_thread_flag(t, TIF_SIGPENDING);
@@ -223,10 +223,87 @@ static inline void print_dropped_signal(int sig)
current->comm, current->pid, sig);
}
+/**
+ * task_clear_group_stop_trapping - clear group stop trapping bit
+ * @task: target task
+ *
+ * If GROUP_STOP_TRAPPING is set, a ptracer is waiting for us. Clear it
+ * and wake up the ptracer. Note that we don't need any further locking.
+ * @task->siglock guarantees that @task->parent points to the ptracer.
+ *
+ * CONTEXT:
+ * Must be called with @task->sighand->siglock held.
+ */
+static void task_clear_group_stop_trapping(struct task_struct *task)
+{
+ if (unlikely(task->group_stop & GROUP_STOP_TRAPPING)) {
+ task->group_stop &= ~GROUP_STOP_TRAPPING;
+ __wake_up_sync_key(&task->parent->signal->wait_chldexit,
+ TASK_UNINTERRUPTIBLE, 1, task);
+ }
+}
+
+/**
+ * task_clear_group_stop_pending - clear pending group stop
+ * @task: target task
+ *
+ * Clear group stop states for @task.
+ *
+ * CONTEXT:
+ * Must be called with @task->sighand->siglock held.
+ */
+void task_clear_group_stop_pending(struct task_struct *task)
+{
+ task->group_stop &= ~(GROUP_STOP_PENDING | GROUP_STOP_CONSUME |
+ GROUP_STOP_DEQUEUED);
+}
+
+/**
+ * task_participate_group_stop - participate in a group stop
+ * @task: task participating in a group stop
+ *
+ * @task has GROUP_STOP_PENDING set and is participating in a group stop.
+ * Group stop states are cleared and the group stop count is consumed if
+ * %GROUP_STOP_CONSUME was set. If the consumption completes the group
+ * stop, the appropriate %SIGNAL_* flags are set.
+ *
+ * CONTEXT:
+ * Must be called with @task->sighand->siglock held.
+ *
+ * RETURNS:
+ * %true if group stop completion should be notified to the parent, %false
+ * otherwise.
+ */
+static bool task_participate_group_stop(struct task_struct *task)
+{
+ struct signal_struct *sig = task->signal;
+ bool consume = task->group_stop & GROUP_STOP_CONSUME;
+
+ WARN_ON_ONCE(!(task->group_stop & GROUP_STOP_PENDING));
+
+ task_clear_group_stop_pending(task);
+
+ if (!consume)
+ return false;
+
+ if (!WARN_ON_ONCE(sig->group_stop_count == 0))
+ sig->group_stop_count--;
+
+ /*
+ * Tell the caller to notify completion iff we are entering into a
+ * fresh group stop. Read comment in do_signal_stop() for details.
+ */
+ if (!sig->group_stop_count && !(sig->flags & SIGNAL_STOP_STOPPED)) {
+ sig->flags = SIGNAL_STOP_STOPPED;
+ return true;
+ }
+ return false;
+}
+
/*
* allocate a new signal queue record
* - this may be called without locks if and only if t == current, otherwise an
- * appopriate lock must be held to stop the target task from exiting
+ * appropriate lock must be held to stop the target task from exiting
*/
static struct sigqueue *
__sigqueue_alloc(int sig, struct task_struct *t, gfp_t flags, int override_rlimit)
@@ -375,15 +452,15 @@ int unhandled_signal(struct task_struct *tsk, int sig)
return !tracehook_consider_fatal_signal(tsk, sig);
}
-
-/* Notify the system that a driver wants to block all signals for this
+/*
+ * Notify the system that a driver wants to block all signals for this
* process, and wants to be notified if any signals at all were to be
* sent/acted upon. If the notifier routine returns non-zero, then the
* signal will be acted upon after all. If the notifier routine returns 0,
* then then signal will be blocked. Only one block per process is
* allowed. priv is a pointer to private data that the notifier routine
- * can use to determine if the signal should be blocked or not. */
-
+ * can use to determine if the signal should be blocked or not.
+ */
void
block_all_signals(int (*notifier)(void *priv), void *priv, sigset_t *mask)
{
@@ -434,9 +511,10 @@ still_pending:
copy_siginfo(info, &first->info);
__sigqueue_free(first);
} else {
- /* Ok, it wasn't in the queue. This must be
- a fast-pathed signal or we must have been
- out of queue space. So zero out the info.
+ /*
+ * Ok, it wasn't in the queue. This must be
+ * a fast-pathed signal or we must have been
+ * out of queue space. So zero out the info.
*/
info->si_signo = sig;
info->si_errno = 0;
@@ -468,7 +546,7 @@ static int __dequeue_signal(struct sigpending *pending, sigset_t *mask,
}
/*
- * Dequeue a signal and return the element to the caller, which is
+ * Dequeue a signal and return the element to the caller, which is
* expected to free it.
*
* All callers have to hold the siglock.
@@ -490,7 +568,7 @@ int dequeue_signal(struct task_struct *tsk, sigset_t *mask, siginfo_t *info)
* itimers are process shared and we restart periodic
* itimers in the signal delivery path to prevent DoS
* attacks in the high resolution timer case. This is
- * compliant with the old way of self restarting
+ * compliant with the old way of self-restarting
* itimers, as the SIGALRM is a legacy signal and only
* queued once. Changing the restart behaviour to
* restart the timer in the signal dequeue path is
@@ -526,7 +604,7 @@ int dequeue_signal(struct task_struct *tsk, sigset_t *mask, siginfo_t *info)
* is to alert stop-signal processing code when another
* processor has come along and cleared the flag.
*/
- tsk->signal->flags |= SIGNAL_STOP_DEQUEUED;
+ current->group_stop |= GROUP_STOP_DEQUEUED;
}
if ((info->si_code & __SI_MASK) == __SI_TIMER && info->si_sys_private) {
/*
@@ -591,7 +669,7 @@ static int rm_from_queue_full(sigset_t *mask, struct sigpending *s)
if (sigisemptyset(&m))
return 0;
- signandsets(&s->signal, &s->signal, mask);
+ sigandnsets(&s->signal, &s->signal, mask);
list_for_each_entry_safe(q, n, &s->list, list) {
if (sigismember(mask, q->info.si_signo)) {
list_del_init(&q->list);
@@ -636,13 +714,33 @@ static inline bool si_fromuser(const struct siginfo *info)
}
/*
+ * called with RCU read lock from check_kill_permission()
+ */
+static int kill_ok_by_cred(struct task_struct *t)
+{
+ const struct cred *cred = current_cred();
+ const struct cred *tcred = __task_cred(t);
+
+ if (cred->user->user_ns == tcred->user->user_ns &&
+ (cred->euid == tcred->suid ||
+ cred->euid == tcred->uid ||
+ cred->uid == tcred->suid ||
+ cred->uid == tcred->uid))
+ return 1;
+
+ if (ns_capable(tcred->user->user_ns, CAP_KILL))
+ return 1;
+
+ return 0;
+}
+
+/*
* Bad permissions for sending the signal
* - the caller must hold the RCU read lock
*/
static int check_kill_permission(int sig, struct siginfo *info,
struct task_struct *t)
{
- const struct cred *cred, *tcred;
struct pid *sid;
int error;
@@ -656,14 +754,8 @@ static int check_kill_permission(int sig, struct siginfo *info,
if (error)
return error;
- cred = current_cred();
- tcred = __task_cred(t);
if (!same_thread_group(current, t) &&
- (cred->euid ^ tcred->suid) &&
- (cred->euid ^ tcred->uid) &&
- (cred->uid ^ tcred->suid) &&
- (cred->uid ^ tcred->uid) &&
- !capable(CAP_KILL)) {
+ !kill_ok_by_cred(t)) {
switch (sig) {
case SIGCONT:
sid = task_session(t);
@@ -712,34 +804,14 @@ static int prepare_signal(int sig, struct task_struct *p, int from_ancestor_ns)
} else if (sig == SIGCONT) {
unsigned int why;
/*
- * Remove all stop signals from all queues,
- * and wake all threads.
+ * Remove all stop signals from all queues, wake all threads.
*/
rm_from_queue(SIG_KERNEL_STOP_MASK, &signal->shared_pending);
t = p;
do {
- unsigned int state;
+ task_clear_group_stop_pending(t);
rm_from_queue(SIG_KERNEL_STOP_MASK, &t->pending);
- /*
- * If there is a handler for SIGCONT, we must make
- * sure that no thread returns to user mode before
- * we post the signal, in case it was the only
- * thread eligible to run the signal handler--then
- * it must not do anything between resuming and
- * running the handler. With the TIF_SIGPENDING
- * flag set, the thread will pause and acquire the
- * siglock that we hold now and until we've queued
- * the pending signal.
- *
- * Wake up the stopped thread _after_ setting
- * TIF_SIGPENDING
- */
- state = __TASK_STOPPED;
- if (sig_user_defined(t, SIGCONT) && !sigismember(&t->blocked, SIGCONT)) {
- set_tsk_thread_flag(t, TIF_SIGPENDING);
- state |= TASK_INTERRUPTIBLE;
- }
- wake_up_state(t, state);
+ wake_up_state(t, __TASK_STOPPED);
} while_each_thread(p, t);
/*
@@ -765,13 +837,6 @@ static int prepare_signal(int sig, struct task_struct *p, int from_ancestor_ns)
signal->flags = why | SIGNAL_STOP_CONTINUED;
signal->group_stop_count = 0;
signal->group_exit_code = 0;
- } else {
- /*
- * We are not stopped, but there could be a stop
- * signal in the middle of being processed after
- * being removed from the queue. Clear that too.
- */
- signal->flags &= ~SIGNAL_STOP_DEQUEUED;
}
}
@@ -860,6 +925,7 @@ static void complete_signal(int sig, struct task_struct *p, int group)
signal->group_stop_count = 0;
t = p;
do {
+ task_clear_group_stop_pending(t);
sigaddset(&t->pending.signal, SIGKILL);
signal_wake_up(t, 1);
} while_each_thread(p, t);
@@ -909,14 +975,15 @@ static int __send_signal(int sig, struct siginfo *info, struct task_struct *t,
if (info == SEND_SIG_FORCED)
goto out_set;
- /* Real-time signals must be queued if sent by sigqueue, or
- some other real-time mechanism. It is implementation
- defined whether kill() does so. We attempt to do so, on
- the principle of least surprise, but since kill is not
- allowed to fail with EAGAIN when low on memory we just
- make sure at least one signal gets delivered and don't
- pass on the info struct. */
-
+ /*
+ * Real-time signals must be queued if sent by sigqueue, or
+ * some other real-time mechanism. It is implementation
+ * defined whether kill() does so. We attempt to do so, on
+ * the principle of least surprise, but since kill is not
+ * allowed to fail with EAGAIN when low on memory we just
+ * make sure at least one signal gets delivered and don't
+ * pass on the info struct.
+ */
if (sig < SIGRTMIN)
override_rlimit = (is_si_special(info) || info->si_code >= 0);
else
@@ -1093,6 +1160,7 @@ int zap_other_threads(struct task_struct *p)
p->signal->group_stop_count = 0;
while_each_thread(p, t) {
+ task_clear_group_stop_pending(t);
count++;
/* Don't bother with already dead threads */
@@ -1187,8 +1255,7 @@ retry:
return error;
}
-int
-kill_proc_info(int sig, struct siginfo *info, pid_t pid)
+int kill_proc_info(int sig, struct siginfo *info, pid_t pid)
{
int error;
rcu_read_lock();
@@ -1285,8 +1352,7 @@ static int kill_something_info(int sig, struct siginfo *info, pid_t pid)
* These are for backward compatibility with the rest of the kernel source.
*/
-int
-send_sig_info(int sig, struct siginfo *info, struct task_struct *p)
+int send_sig_info(int sig, struct siginfo *info, struct task_struct *p)
{
/*
* Make sure legacy kernel users don't send in bad values
@@ -1354,7 +1420,7 @@ EXPORT_SYMBOL(kill_pid);
* These functions support sending signals using preallocated sigqueue
* structures. This is needed "because realtime applications cannot
* afford to lose notifications of asynchronous events, like timer
- * expirations or I/O completions". In the case of Posix Timers
+ * expirations or I/O completions". In the case of POSIX Timers
* we allocate the sigqueue structure from the timer_create. If this
* allocation fails we are able to report the failure to the application
* with an EAGAIN error.
@@ -1522,16 +1588,30 @@ int do_notify_parent(struct task_struct *tsk, int sig)
return ret;
}
-static void do_notify_parent_cldstop(struct task_struct *tsk, int why)
+/**
+ * do_notify_parent_cldstop - notify parent of stopped/continued state change
+ * @tsk: task reporting the state change
+ * @for_ptracer: the notification is for ptracer
+ * @why: CLD_{CONTINUED|STOPPED|TRAPPED} to report
+ *
+ * Notify @tsk's parent that the stopped/continued state has changed. If
+ * @for_ptracer is %false, @tsk's group leader notifies to its real parent.
+ * If %true, @tsk reports to @tsk->parent which should be the ptracer.
+ *
+ * CONTEXT:
+ * Must be called with tasklist_lock at least read locked.
+ */
+static void do_notify_parent_cldstop(struct task_struct *tsk,
+ bool for_ptracer, int why)
{
struct siginfo info;
unsigned long flags;
struct task_struct *parent;
struct sighand_struct *sighand;
- if (task_ptrace(tsk))
+ if (for_ptracer) {
parent = tsk->parent;
- else {
+ } else {
tsk = tsk->group_leader;
parent = tsk->real_parent;
}
@@ -1539,7 +1619,7 @@ static void do_notify_parent_cldstop(struct task_struct *tsk, int why)
info.si_signo = SIGCHLD;
info.si_errno = 0;
/*
- * see comment in do_notify_parent() abot the following 3 lines
+ * see comment in do_notify_parent() about the following 4 lines
*/
rcu_read_lock();
info.si_pid = task_pid_nr_ns(tsk, parent->nsproxy->pid_ns);
@@ -1597,7 +1677,7 @@ static inline int may_ptrace_stop(void)
}
/*
- * Return nonzero if there is a SIGKILL that should be waking us up.
+ * Return non-zero if there is a SIGKILL that should be waking us up.
* Called with the siglock held.
*/
static int sigkill_pending(struct task_struct *tsk)
@@ -1607,6 +1687,15 @@ static int sigkill_pending(struct task_struct *tsk)
}
/*
+ * Test whether the target task of the usual cldstop notification - the
+ * real_parent of @child - is in the same group as the ptracer.
+ */
+static bool real_parent_is_ptracer(struct task_struct *child)
+{
+ return same_thread_group(child->parent, child->real_parent);
+}
+
+/*
* This must be called with current->sighand->siglock held.
*
* This should be the path for all ptrace stops.
@@ -1617,10 +1706,12 @@ static int sigkill_pending(struct task_struct *tsk)
* If we actually decide not to stop at all because the tracer
* is gone, we keep current->exit_code unless clear_code.
*/
-static void ptrace_stop(int exit_code, int clear_code, siginfo_t *info)
+static void ptrace_stop(int exit_code, int why, int clear_code, siginfo_t *info)
__releases(&current->sighand->siglock)
__acquires(&current->sighand->siglock)
{
+ bool gstop_done = false;
+
if (arch_ptrace_stop_needed(exit_code, info)) {
/*
* The arch code has something special to do before a
@@ -1641,21 +1732,49 @@ static void ptrace_stop(int exit_code, int clear_code, siginfo_t *info)
}
/*
- * If there is a group stop in progress,
- * we must participate in the bookkeeping.
+ * If @why is CLD_STOPPED, we're trapping to participate in a group
+ * stop. Do the bookkeeping. Note that if SIGCONT was delievered
+ * while siglock was released for the arch hook, PENDING could be
+ * clear now. We act as if SIGCONT is received after TASK_TRACED
+ * is entered - ignore it.
*/
- if (current->signal->group_stop_count > 0)
- --current->signal->group_stop_count;
+ if (why == CLD_STOPPED && (current->group_stop & GROUP_STOP_PENDING))
+ gstop_done = task_participate_group_stop(current);
current->last_siginfo = info;
current->exit_code = exit_code;
- /* Let the debugger run. */
- __set_current_state(TASK_TRACED);
+ /*
+ * TRACED should be visible before TRAPPING is cleared; otherwise,
+ * the tracer might fail do_wait().
+ */
+ set_current_state(TASK_TRACED);
+
+ /*
+ * We're committing to trapping. Clearing GROUP_STOP_TRAPPING and
+ * transition to TASK_TRACED should be atomic with respect to
+ * siglock. This hsould be done after the arch hook as siglock is
+ * released and regrabbed across it.
+ */
+ task_clear_group_stop_trapping(current);
+
spin_unlock_irq(&current->sighand->siglock);
read_lock(&tasklist_lock);
if (may_ptrace_stop()) {
- do_notify_parent_cldstop(current, CLD_TRAPPED);
+ /*
+ * Notify parents of the stop.
+ *
+ * While ptraced, there are two parents - the ptracer and
+ * the real_parent of the group_leader. The ptracer should
+ * know about every stop while the real parent is only
+ * interested in the completion of group stop. The states
+ * for the two don't interact with each other. Notify
+ * separately unless they're gonna be duplicates.
+ */
+ do_notify_parent_cldstop(current, true, why);
+ if (gstop_done && !real_parent_is_ptracer(current))
+ do_notify_parent_cldstop(current, false, why);
+
/*
* Don't want to allow preemption here, because
* sys_ptrace() needs this task to be inactive.
@@ -1670,7 +1789,16 @@ static void ptrace_stop(int exit_code, int clear_code, siginfo_t *info)
/*
* By the time we got the lock, our tracer went away.
* Don't drop the lock yet, another tracer may come.
+ *
+ * If @gstop_done, the ptracer went away between group stop
+ * completion and here. During detach, it would have set
+ * GROUP_STOP_PENDING on us and we'll re-enter TASK_STOPPED
+ * in do_signal_stop() on return, so notifying the real
+ * parent of the group stop completion is enough.
*/
+ if (gstop_done)
+ do_notify_parent_cldstop(current, false, why);
+
__set_current_state(TASK_RUNNING);
if (clear_code)
current->exit_code = 0;
@@ -1714,79 +1842,128 @@ void ptrace_notify(int exit_code)
/* Let the debugger run. */
spin_lock_irq(&current->sighand->siglock);
- ptrace_stop(exit_code, 1, &info);
+ ptrace_stop(exit_code, CLD_TRAPPED, 1, &info);
spin_unlock_irq(&current->sighand->siglock);
}
/*
* This performs the stopping for SIGSTOP and other stop signals.
* We have to stop all threads in the thread group.
- * Returns nonzero if we've actually stopped and released the siglock.
+ * Returns non-zero if we've actually stopped and released the siglock.
* Returns zero if we didn't stop and still hold the siglock.
*/
static int do_signal_stop(int signr)
{
struct signal_struct *sig = current->signal;
- int notify;
- if (!sig->group_stop_count) {
+ if (!(current->group_stop & GROUP_STOP_PENDING)) {
+ unsigned int gstop = GROUP_STOP_PENDING | GROUP_STOP_CONSUME;
struct task_struct *t;
- if (!likely(sig->flags & SIGNAL_STOP_DEQUEUED) ||
+ /* signr will be recorded in task->group_stop for retries */
+ WARN_ON_ONCE(signr & ~GROUP_STOP_SIGMASK);
+
+ if (!likely(current->group_stop & GROUP_STOP_DEQUEUED) ||
unlikely(signal_group_exit(sig)))
return 0;
/*
- * There is no group stop already in progress.
- * We must initiate one now.
+ * There is no group stop already in progress. We must
+ * initiate one now.
+ *
+ * While ptraced, a task may be resumed while group stop is
+ * still in effect and then receive a stop signal and
+ * initiate another group stop. This deviates from the
+ * usual behavior as two consecutive stop signals can't
+ * cause two group stops when !ptraced. That is why we
+ * also check !task_is_stopped(t) below.
+ *
+ * The condition can be distinguished by testing whether
+ * SIGNAL_STOP_STOPPED is already set. Don't generate
+ * group_exit_code in such case.
+ *
+ * This is not necessary for SIGNAL_STOP_CONTINUED because
+ * an intervening stop signal is required to cause two
+ * continued events regardless of ptrace.
*/
- sig->group_exit_code = signr;
+ if (!(sig->flags & SIGNAL_STOP_STOPPED))
+ sig->group_exit_code = signr;
+ else
+ WARN_ON_ONCE(!task_ptrace(current));
+ current->group_stop &= ~GROUP_STOP_SIGMASK;
+ current->group_stop |= signr | gstop;
sig->group_stop_count = 1;
- for (t = next_thread(current); t != current; t = next_thread(t))
+ for (t = next_thread(current); t != current;
+ t = next_thread(t)) {
+ t->group_stop &= ~GROUP_STOP_SIGMASK;
/*
* Setting state to TASK_STOPPED for a group
* stop is always done with the siglock held,
* so this check has no races.
*/
- if (!(t->flags & PF_EXITING) &&
- !task_is_stopped_or_traced(t)) {
+ if (!(t->flags & PF_EXITING) && !task_is_stopped(t)) {
+ t->group_stop |= signr | gstop;
sig->group_stop_count++;
signal_wake_up(t, 0);
}
+ }
}
- /*
- * If there are no other threads in the group, or if there is
- * a group stop in progress and we are the last to stop, report
- * to the parent. When ptraced, every thread reports itself.
- */
- notify = sig->group_stop_count == 1 ? CLD_STOPPED : 0;
- notify = tracehook_notify_jctl(notify, CLD_STOPPED);
- /*
- * tracehook_notify_jctl() can drop and reacquire siglock, so
- * we keep ->group_stop_count != 0 before the call. If SIGCONT
- * or SIGKILL comes in between ->group_stop_count == 0.
- */
- if (sig->group_stop_count) {
- if (!--sig->group_stop_count)
- sig->flags = SIGNAL_STOP_STOPPED;
- current->exit_code = sig->group_exit_code;
+retry:
+ if (likely(!task_ptrace(current))) {
+ int notify = 0;
+
+ /*
+ * If there are no other threads in the group, or if there
+ * is a group stop in progress and we are the last to stop,
+ * report to the parent.
+ */
+ if (task_participate_group_stop(current))
+ notify = CLD_STOPPED;
+
__set_current_state(TASK_STOPPED);
+ spin_unlock_irq(&current->sighand->siglock);
+
+ /*
+ * Notify the parent of the group stop completion. Because
+ * we're not holding either the siglock or tasklist_lock
+ * here, ptracer may attach inbetween; however, this is for
+ * group stop and should always be delivered to the real
+ * parent of the group leader. The new ptracer will get
+ * its notification when this task transitions into
+ * TASK_TRACED.
+ */
+ if (notify) {
+ read_lock(&tasklist_lock);
+ do_notify_parent_cldstop(current, false, notify);
+ read_unlock(&tasklist_lock);
+ }
+
+ /* Now we don't run again until woken by SIGCONT or SIGKILL */
+ schedule();
+
+ spin_lock_irq(&current->sighand->siglock);
+ } else {
+ ptrace_stop(current->group_stop & GROUP_STOP_SIGMASK,
+ CLD_STOPPED, 0, NULL);
+ current->exit_code = 0;
}
- spin_unlock_irq(&current->sighand->siglock);
- if (notify) {
- read_lock(&tasklist_lock);
- do_notify_parent_cldstop(current, notify);
- read_unlock(&tasklist_lock);
+ /*
+ * GROUP_STOP_PENDING could be set if another group stop has
+ * started since being woken up or ptrace wants us to transit
+ * between TASK_STOPPED and TRACED. Retry group stop.
+ */
+ if (current->group_stop & GROUP_STOP_PENDING) {
+ WARN_ON_ONCE(!(current->group_stop & GROUP_STOP_SIGMASK));
+ goto retry;
}
- /* Now we don't run again until woken by SIGCONT or SIGKILL */
- do {
- schedule();
- } while (try_to_freeze());
+ /* PTRACE_ATTACH might have raced with task killing, clear trapping */
+ task_clear_group_stop_trapping(current);
+
+ spin_unlock_irq(&current->sighand->siglock);
tracehook_finish_jctl();
- current->exit_code = 0;
return 1;
}
@@ -1800,7 +1977,7 @@ static int ptrace_signal(int signr, siginfo_t *info,
ptrace_signal_deliver(regs, cookie);
/* Let the debugger run. */
- ptrace_stop(signr, 0, info);
+ ptrace_stop(signr, CLD_TRAPPED, 0, info);
/* We're back. Did the debugger cancel the sig? */
signr = current->exit_code;
@@ -1809,10 +1986,12 @@ static int ptrace_signal(int signr, siginfo_t *info,
current->exit_code = 0;
- /* Update the siginfo structure if the signal has
- changed. If the debugger wanted something
- specific in the siginfo structure then it should
- have updated *info via PTRACE_SETSIGINFO. */
+ /*
+ * Update the siginfo structure if the signal has
+ * changed. If the debugger wanted something
+ * specific in the siginfo structure then it should
+ * have updated *info via PTRACE_SETSIGINFO.
+ */
if (signr != info->si_signo) {
info->si_signo = signr;
info->si_errno = 0;
@@ -1853,25 +2032,43 @@ relock:
* the CLD_ si_code into SIGNAL_CLD_MASK bits.
*/
if (unlikely(signal->flags & SIGNAL_CLD_MASK)) {
- int why = (signal->flags & SIGNAL_STOP_CONTINUED)
- ? CLD_CONTINUED : CLD_STOPPED;
+ struct task_struct *leader;
+ int why;
+
+ if (signal->flags & SIGNAL_CLD_CONTINUED)
+ why = CLD_CONTINUED;
+ else
+ why = CLD_STOPPED;
+
signal->flags &= ~SIGNAL_CLD_MASK;
- why = tracehook_notify_jctl(why, CLD_CONTINUED);
spin_unlock_irq(&sighand->siglock);
- if (why) {
- read_lock(&tasklist_lock);
- do_notify_parent_cldstop(current->group_leader, why);
- read_unlock(&tasklist_lock);
- }
+ /*
+ * Notify the parent that we're continuing. This event is
+ * always per-process and doesn't make whole lot of sense
+ * for ptracers, who shouldn't consume the state via
+ * wait(2) either, but, for backward compatibility, notify
+ * the ptracer of the group leader too unless it's gonna be
+ * a duplicate.
+ */
+ read_lock(&tasklist_lock);
+
+ do_notify_parent_cldstop(current, false, why);
+
+ leader = current->group_leader;
+ if (task_ptrace(leader) && !real_parent_is_ptracer(leader))
+ do_notify_parent_cldstop(leader, true, why);
+
+ read_unlock(&tasklist_lock);
+
goto relock;
}
for (;;) {
struct k_sigaction *ka;
/*
- * Tracing can induce an artifical signal and choose sigaction.
+ * Tracing can induce an artificial signal and choose sigaction.
* The return value in @signr determines the default action,
* but @info->si_signo is the signal number we will report.
*/
@@ -1881,8 +2078,8 @@ relock:
if (unlikely(signr != 0))
ka = return_ka;
else {
- if (unlikely(signal->group_stop_count > 0) &&
- do_signal_stop(0))
+ if (unlikely(current->group_stop &
+ GROUP_STOP_PENDING) && do_signal_stop(0))
goto relock;
signr = dequeue_signal(current, &current->blocked,
@@ -2001,10 +2198,42 @@ relock:
return signr;
}
+/*
+ * It could be that complete_signal() picked us to notify about the
+ * group-wide signal. Other threads should be notified now to take
+ * the shared signals in @which since we will not.
+ */
+static void retarget_shared_pending(struct task_struct *tsk, sigset_t *which)
+{
+ sigset_t retarget;
+ struct task_struct *t;
+
+ sigandsets(&retarget, &tsk->signal->shared_pending.signal, which);
+ if (sigisemptyset(&retarget))
+ return;
+
+ t = tsk;
+ while_each_thread(tsk, t) {
+ if (t->flags & PF_EXITING)
+ continue;
+
+ if (!has_pending_signals(&retarget, &t->blocked))
+ continue;
+ /* Remove the signals this thread can handle. */
+ sigandsets(&retarget, &retarget, &t->blocked);
+
+ if (!signal_pending(t))
+ signal_wake_up(t, 0);
+
+ if (sigisemptyset(&retarget))
+ break;
+ }
+}
+
void exit_signals(struct task_struct *tsk)
{
int group_stop = 0;
- struct task_struct *t;
+ sigset_t unblocked;
if (thread_group_empty(tsk) || signal_group_exit(tsk->signal)) {
tsk->flags |= PF_EXITING;
@@ -2020,25 +2249,23 @@ void exit_signals(struct task_struct *tsk)
if (!signal_pending(tsk))
goto out;
- /* It could be that __group_complete_signal() choose us to
- * notify about group-wide signal. Another thread should be
- * woken now to take the signal since we will not.
- */
- for (t = tsk; (t = next_thread(t)) != tsk; )
- if (!signal_pending(t) && !(t->flags & PF_EXITING))
- recalc_sigpending_and_wake(t);
+ unblocked = tsk->blocked;
+ signotset(&unblocked);
+ retarget_shared_pending(tsk, &unblocked);
- if (unlikely(tsk->signal->group_stop_count) &&
- !--tsk->signal->group_stop_count) {
- tsk->signal->flags = SIGNAL_STOP_STOPPED;
- group_stop = tracehook_notify_jctl(CLD_STOPPED, CLD_STOPPED);
- }
+ if (unlikely(tsk->group_stop & GROUP_STOP_PENDING) &&
+ task_participate_group_stop(tsk))
+ group_stop = CLD_STOPPED;
out:
spin_unlock_irq(&tsk->sighand->siglock);
+ /*
+ * If group stop has completed, deliver the notification. This
+ * should always go to the real parent of the group leader.
+ */
if (unlikely(group_stop)) {
read_lock(&tasklist_lock);
- do_notify_parent_cldstop(tsk, group_stop);
+ do_notify_parent_cldstop(tsk, false, group_stop);
read_unlock(&tasklist_lock);
}
}
@@ -2058,6 +2285,9 @@ EXPORT_SYMBOL(unblock_all_signals);
* System call entry points.
*/
+/**
+ * sys_restart_syscall - restart a system call
+ */
SYSCALL_DEFINE0(restart_syscall)
{
struct restart_block *restart = &current_thread_info()->restart_block;
@@ -2069,11 +2299,33 @@ long do_no_restart_syscall(struct restart_block *param)
return -EINTR;
}
-/*
- * We don't need to get the kernel lock - this is all local to this
- * particular thread.. (and that's good, because this is _heavily_
- * used by various programs)
+static void __set_task_blocked(struct task_struct *tsk, const sigset_t *newset)
+{
+ if (signal_pending(tsk) && !thread_group_empty(tsk)) {
+ sigset_t newblocked;
+ /* A set of now blocked but previously unblocked signals. */
+ sigandnsets(&newblocked, newset, &current->blocked);
+ retarget_shared_pending(tsk, &newblocked);
+ }
+ tsk->blocked = *newset;
+ recalc_sigpending();
+}
+
+/**
+ * set_current_blocked - change current->blocked mask
+ * @newset: new mask
+ *
+ * It is wrong to change ->blocked directly, this helper should be used
+ * to ensure the process can't miss a shared signal we are going to block.
*/
+void set_current_blocked(const sigset_t *newset)
+{
+ struct task_struct *tsk = current;
+
+ spin_lock_irq(&tsk->sighand->siglock);
+ __set_task_blocked(tsk, newset);
+ spin_unlock_irq(&tsk->sighand->siglock);
+}
/*
* This is also useful for kernel threads that want to temporarily
@@ -2085,66 +2337,66 @@ long do_no_restart_syscall(struct restart_block *param)
*/
int sigprocmask(int how, sigset_t *set, sigset_t *oldset)
{
- int error;
+ struct task_struct *tsk = current;
+ sigset_t newset;
- spin_lock_irq(&current->sighand->siglock);
+ /* Lockless, only current can change ->blocked, never from irq */
if (oldset)
- *oldset = current->blocked;
+ *oldset = tsk->blocked;
- error = 0;
switch (how) {
case SIG_BLOCK:
- sigorsets(&current->blocked, &current->blocked, set);
+ sigorsets(&newset, &tsk->blocked, set);
break;
case SIG_UNBLOCK:
- signandsets(&current->blocked, &current->blocked, set);
+ sigandnsets(&newset, &tsk->blocked, set);
break;
case SIG_SETMASK:
- current->blocked = *set;
+ newset = *set;
break;
default:
- error = -EINVAL;
+ return -EINVAL;
}
- recalc_sigpending();
- spin_unlock_irq(&current->sighand->siglock);
- return error;
+ set_current_blocked(&newset);
+ return 0;
}
-SYSCALL_DEFINE4(rt_sigprocmask, int, how, sigset_t __user *, set,
+/**
+ * sys_rt_sigprocmask - change the list of currently blocked signals
+ * @how: whether to add, remove, or set signals
+ * @set: stores pending signals
+ * @oset: previous value of signal mask if non-null
+ * @sigsetsize: size of sigset_t type
+ */
+SYSCALL_DEFINE4(rt_sigprocmask, int, how, sigset_t __user *, nset,
sigset_t __user *, oset, size_t, sigsetsize)
{
- int error = -EINVAL;
sigset_t old_set, new_set;
+ int error;
/* XXX: Don't preclude handling different sized sigset_t's. */
if (sigsetsize != sizeof(sigset_t))
- goto out;
+ return -EINVAL;
- if (set) {
- error = -EFAULT;
- if (copy_from_user(&new_set, set, sizeof(*set)))
- goto out;
+ old_set = current->blocked;
+
+ if (nset) {
+ if (copy_from_user(&new_set, nset, sizeof(sigset_t)))
+ return -EFAULT;
sigdelsetmask(&new_set, sigmask(SIGKILL)|sigmask(SIGSTOP));
- error = sigprocmask(how, &new_set, &old_set);
+ error = sigprocmask(how, &new_set, NULL);
if (error)
- goto out;
- if (oset)
- goto set_old;
- } else if (oset) {
- spin_lock_irq(&current->sighand->siglock);
- old_set = current->blocked;
- spin_unlock_irq(&current->sighand->siglock);
+ return error;
+ }
- set_old:
- error = -EFAULT;
- if (copy_to_user(oset, &old_set, sizeof(*oset)))
- goto out;
+ if (oset) {
+ if (copy_to_user(oset, &old_set, sizeof(sigset_t)))
+ return -EFAULT;
}
- error = 0;
-out:
- return error;
+
+ return 0;
}
long do_sigpending(void __user *set, unsigned long sigsetsize)
@@ -2169,8 +2421,14 @@ long do_sigpending(void __user *set, unsigned long sigsetsize)
out:
return error;
-}
+}
+/**
+ * sys_rt_sigpending - examine a pending signal that has been raised
+ * while blocked
+ * @set: stores pending signals
+ * @sigsetsize: size of sigset_t type or larger
+ */
SYSCALL_DEFINE2(rt_sigpending, sigset_t __user *, set, size_t, sigsetsize)
{
return do_sigpending(set, sigsetsize);
@@ -2219,9 +2477,9 @@ int copy_siginfo_to_user(siginfo_t __user *to, siginfo_t *from)
err |= __put_user(from->si_trapno, &to->si_trapno);
#endif
#ifdef BUS_MCEERR_AO
- /*
+ /*
* Other callers might not initialize the si_lsb field,
- * so check explicitely for the right codes here.
+ * so check explicitly for the right codes here.
*/
if (from->si_code == BUS_MCEERR_AR || from->si_code == BUS_MCEERR_AO)
err |= __put_user(from->si_addr_lsb, &to->si_addr_lsb);
@@ -2250,15 +2508,82 @@ int copy_siginfo_to_user(siginfo_t __user *to, siginfo_t *from)
#endif
+/**
+ * do_sigtimedwait - wait for queued signals specified in @which
+ * @which: queued signals to wait for
+ * @info: if non-null, the signal's siginfo is returned here
+ * @ts: upper bound on process time suspension
+ */
+int do_sigtimedwait(const sigset_t *which, siginfo_t *info,
+ const struct timespec *ts)
+{
+ struct task_struct *tsk = current;
+ long timeout = MAX_SCHEDULE_TIMEOUT;
+ sigset_t mask = *which;
+ int sig;
+
+ if (ts) {
+ if (!timespec_valid(ts))
+ return -EINVAL;
+ timeout = timespec_to_jiffies(ts);
+ /*
+ * We can be close to the next tick, add another one
+ * to ensure we will wait at least the time asked for.
+ */
+ if (ts->tv_sec || ts->tv_nsec)
+ timeout++;
+ }
+
+ /*
+ * Invert the set of allowed signals to get those we want to block.
+ */
+ sigdelsetmask(&mask, sigmask(SIGKILL) | sigmask(SIGSTOP));
+ signotset(&mask);
+
+ spin_lock_irq(&tsk->sighand->siglock);
+ sig = dequeue_signal(tsk, &mask, info);
+ if (!sig && timeout) {
+ /*
+ * None ready, temporarily unblock those we're interested
+ * while we are sleeping in so that we'll be awakened when
+ * they arrive. Unblocking is always fine, we can avoid
+ * set_current_blocked().
+ */
+ tsk->real_blocked = tsk->blocked;
+ sigandsets(&tsk->blocked, &tsk->blocked, &mask);
+ recalc_sigpending();
+ spin_unlock_irq(&tsk->sighand->siglock);
+
+ timeout = schedule_timeout_interruptible(timeout);
+
+ spin_lock_irq(&tsk->sighand->siglock);
+ __set_task_blocked(tsk, &tsk->real_blocked);
+ siginitset(&tsk->real_blocked, 0);
+ sig = dequeue_signal(tsk, &mask, info);
+ }
+ spin_unlock_irq(&tsk->sighand->siglock);
+
+ if (sig)
+ return sig;
+ return timeout ? -EINTR : -EAGAIN;
+}
+
+/**
+ * sys_rt_sigtimedwait - synchronously wait for queued signals specified
+ * in @uthese
+ * @uthese: queued signals to wait for
+ * @uinfo: if non-null, the signal's siginfo is returned here
+ * @uts: upper bound on process time suspension
+ * @sigsetsize: size of sigset_t type
+ */
SYSCALL_DEFINE4(rt_sigtimedwait, const sigset_t __user *, uthese,
siginfo_t __user *, uinfo, const struct timespec __user *, uts,
size_t, sigsetsize)
{
- int ret, sig;
sigset_t these;
struct timespec ts;
siginfo_t info;
- long timeout = 0;
+ int ret;
/* XXX: Don't preclude handling different sized sigset_t's. */
if (sigsetsize != sizeof(sigset_t))
@@ -2266,65 +2591,27 @@ SYSCALL_DEFINE4(rt_sigtimedwait, const sigset_t __user *, uthese,
if (copy_from_user(&these, uthese, sizeof(these)))
return -EFAULT;
-
- /*
- * Invert the set of allowed signals to get those we
- * want to block.
- */
- sigdelsetmask(&these, sigmask(SIGKILL)|sigmask(SIGSTOP));
- signotset(&these);
if (uts) {
if (copy_from_user(&ts, uts, sizeof(ts)))
return -EFAULT;
- if (ts.tv_nsec >= 1000000000L || ts.tv_nsec < 0
- || ts.tv_sec < 0)
- return -EINVAL;
}
- spin_lock_irq(&current->sighand->siglock);
- sig = dequeue_signal(current, &these, &info);
- if (!sig) {
- timeout = MAX_SCHEDULE_TIMEOUT;
- if (uts)
- timeout = (timespec_to_jiffies(&ts)
- + (ts.tv_sec || ts.tv_nsec));
-
- if (timeout) {
- /* None ready -- temporarily unblock those we're
- * interested while we are sleeping in so that we'll
- * be awakened when they arrive. */
- current->real_blocked = current->blocked;
- sigandsets(&current->blocked, &current->blocked, &these);
- recalc_sigpending();
- spin_unlock_irq(&current->sighand->siglock);
-
- timeout = schedule_timeout_interruptible(timeout);
-
- spin_lock_irq(&current->sighand->siglock);
- sig = dequeue_signal(current, &these, &info);
- current->blocked = current->real_blocked;
- siginitset(&current->real_blocked, 0);
- recalc_sigpending();
- }
- }
- spin_unlock_irq(&current->sighand->siglock);
+ ret = do_sigtimedwait(&these, &info, uts ? &ts : NULL);
- if (sig) {
- ret = sig;
- if (uinfo) {
- if (copy_siginfo_to_user(uinfo, &info))
- ret = -EFAULT;
- }
- } else {
- ret = -EAGAIN;
- if (timeout)
- ret = -EINTR;
+ if (ret > 0 && uinfo) {
+ if (copy_siginfo_to_user(uinfo, &info))
+ ret = -EFAULT;
}
return ret;
}
+/**
+ * sys_kill - send a signal to a process
+ * @pid: the PID of the process
+ * @sig: signal to be sent
+ */
SYSCALL_DEFINE2(kill, pid_t, pid, int, sig)
{
struct siginfo info;
@@ -2400,7 +2687,11 @@ SYSCALL_DEFINE3(tgkill, pid_t, tgid, pid_t, pid, int, sig)
return do_tkill(tgid, pid, sig);
}
-/*
+/**
+ * sys_tkill - send signal to one specific task
+ * @pid: the PID of the task
+ * @sig: signal to be sent
+ *
* Send a signal to only one task, even if it's a CLONE_THREAD task.
*/
SYSCALL_DEFINE2(tkill, pid_t, pid, int, sig)
@@ -2412,6 +2703,12 @@ SYSCALL_DEFINE2(tkill, pid_t, pid, int, sig)
return do_tkill(0, pid, sig);
}
+/**
+ * sys_rt_sigqueueinfo - send signal information to a signal
+ * @pid: the PID of the thread
+ * @sig: signal to be sent
+ * @uinfo: signal info to be sent
+ */
SYSCALL_DEFINE3(rt_sigqueueinfo, pid_t, pid, int, sig,
siginfo_t __user *, uinfo)
{
@@ -2421,9 +2718,13 @@ SYSCALL_DEFINE3(rt_sigqueueinfo, pid_t, pid, int, sig,
return -EFAULT;
/* Not even root can pretend to send signals from the kernel.
- Nor can they impersonate a kill(), which adds source info. */
- if (info.si_code >= 0)
+ * Nor can they impersonate a kill()/tgkill(), which adds source info.
+ */
+ if (info.si_code >= 0 || info.si_code == SI_TKILL) {
+ /* We used to allow any < 0 si_code */
+ WARN_ON_ONCE(info.si_code < 0);
return -EPERM;
+ }
info.si_signo = sig;
/* POSIX.1b doesn't mention process groups. */
@@ -2437,9 +2738,13 @@ long do_rt_tgsigqueueinfo(pid_t tgid, pid_t pid, int sig, siginfo_t *info)
return -EINVAL;
/* Not even root can pretend to send signals from the kernel.
- Nor can they impersonate a kill(), which adds source info. */
- if (info->si_code >= 0)
+ * Nor can they impersonate a kill()/tgkill(), which adds source info.
+ */
+ if (info->si_code >= 0 || info->si_code == SI_TKILL) {
+ /* We used to allow any < 0 si_code */
+ WARN_ON_ONCE(info->si_code < 0);
return -EPERM;
+ }
info->si_signo = sig;
return do_send_specific(tgid, pid, sig, info);
@@ -2531,12 +2836,11 @@ do_sigaltstack (const stack_t __user *uss, stack_t __user *uoss, unsigned long s
error = -EINVAL;
/*
- *
- * Note - this code used to test ss_flags incorrectly
+ * Note - this code used to test ss_flags incorrectly:
* old code may have been written using ss_flags==0
* to mean ss_flags==SS_ONSTACK (as this was the only
* way that worked) - this fix preserves that older
- * mechanism
+ * mechanism.
*/
if (ss_flags != SS_DISABLE && ss_flags != SS_ONSTACK && ss_flags != 0)
goto out;
@@ -2570,6 +2874,10 @@ out:
#ifdef __ARCH_WANT_SYS_SIGPENDING
+/**
+ * sys_sigpending - examine pending signals
+ * @set: where mask of pending signal is returned
+ */
SYSCALL_DEFINE1(sigpending, old_sigset_t __user *, set)
{
return do_sigpending(set, sizeof(*set));
@@ -2578,60 +2886,65 @@ SYSCALL_DEFINE1(sigpending, old_sigset_t __user *, set)
#endif
#ifdef __ARCH_WANT_SYS_SIGPROCMASK
-/* Some platforms have their own version with special arguments others
- support only sys_rt_sigprocmask. */
+/**
+ * sys_sigprocmask - examine and change blocked signals
+ * @how: whether to add, remove, or set signals
+ * @nset: signals to add or remove (if non-null)
+ * @oset: previous value of signal mask if non-null
+ *
+ * Some platforms have their own version with special arguments;
+ * others support only sys_rt_sigprocmask.
+ */
-SYSCALL_DEFINE3(sigprocmask, int, how, old_sigset_t __user *, set,
+SYSCALL_DEFINE3(sigprocmask, int, how, old_sigset_t __user *, nset,
old_sigset_t __user *, oset)
{
- int error;
old_sigset_t old_set, new_set;
+ sigset_t new_blocked;
- if (set) {
- error = -EFAULT;
- if (copy_from_user(&new_set, set, sizeof(*set)))
- goto out;
+ old_set = current->blocked.sig[0];
+
+ if (nset) {
+ if (copy_from_user(&new_set, nset, sizeof(*nset)))
+ return -EFAULT;
new_set &= ~(sigmask(SIGKILL) | sigmask(SIGSTOP));
- spin_lock_irq(&current->sighand->siglock);
- old_set = current->blocked.sig[0];
+ new_blocked = current->blocked;
- error = 0;
switch (how) {
- default:
- error = -EINVAL;
- break;
case SIG_BLOCK:
- sigaddsetmask(&current->blocked, new_set);
+ sigaddsetmask(&new_blocked, new_set);
break;
case SIG_UNBLOCK:
- sigdelsetmask(&current->blocked, new_set);
+ sigdelsetmask(&new_blocked, new_set);
break;
case SIG_SETMASK:
- current->blocked.sig[0] = new_set;
+ new_blocked.sig[0] = new_set;
break;
+ default:
+ return -EINVAL;
}
- recalc_sigpending();
- spin_unlock_irq(&current->sighand->siglock);
- if (error)
- goto out;
- if (oset)
- goto set_old;
- } else if (oset) {
- old_set = current->blocked.sig[0];
- set_old:
- error = -EFAULT;
+ set_current_blocked(&new_blocked);
+ }
+
+ if (oset) {
if (copy_to_user(oset, &old_set, sizeof(*oset)))
- goto out;
+ return -EFAULT;
}
- error = 0;
-out:
- return error;
+
+ return 0;
}
#endif /* __ARCH_WANT_SYS_SIGPROCMASK */
#ifdef __ARCH_WANT_SYS_RT_SIGACTION
+/**
+ * sys_rt_sigaction - alter an action taken by a process
+ * @sig: signal to be sent
+ * @act: new sigaction
+ * @oact: used to save the previous sigaction
+ * @sigsetsize: size of sigset_t type
+ */
SYSCALL_DEFINE4(rt_sigaction, int, sig,
const struct sigaction __user *, act,
struct sigaction __user *, oact,
@@ -2710,14 +3023,22 @@ SYSCALL_DEFINE2(signal, int, sig, __sighandler_t, handler)
SYSCALL_DEFINE0(pause)
{
- current->state = TASK_INTERRUPTIBLE;
- schedule();
+ while (!signal_pending(current)) {
+ current->state = TASK_INTERRUPTIBLE;
+ schedule();
+ }
return -ERESTARTNOHAND;
}
#endif
#ifdef __ARCH_WANT_SYS_RT_SIGSUSPEND
+/**
+ * sys_rt_sigsuspend - replace the signal mask for a value with the
+ * @unewset value until a signal is received
+ * @unewset: new signal mask value
+ * @sigsetsize: size of sigset_t type
+ */
SYSCALL_DEFINE2(rt_sigsuspend, sigset_t __user *, unewset, size_t, sigsetsize)
{
sigset_t newset;
diff --git a/kernel/smp.c b/kernel/smp.c
index 9910744f0856..73a195193558 100644
--- a/kernel/smp.c
+++ b/kernel/smp.c
@@ -194,7 +194,7 @@ void generic_smp_call_function_interrupt(void)
*/
list_for_each_entry_rcu(data, &call_function.queue, csd.list) {
int refs;
- void (*func) (void *info);
+ smp_call_func_t func;
/*
* Since we walk the list without any locks, we might
@@ -214,17 +214,17 @@ void generic_smp_call_function_interrupt(void)
if (atomic_read(&data->refs) == 0)
continue;
- func = data->csd.func; /* for later warn */
- data->csd.func(data->csd.info);
+ func = data->csd.func; /* save for later warn */
+ func(data->csd.info);
/*
- * If the cpu mask is not still set then it enabled interrupts,
- * we took another smp interrupt, and executed the function
- * twice on this cpu. In theory that copy decremented refs.
+ * If the cpu mask is not still set then func enabled
+ * interrupts (BUG), and this cpu took another smp call
+ * function interrupt and executed func(info) twice
+ * on this cpu. That nested execution decremented refs.
*/
if (!cpumask_test_and_clear_cpu(cpu, data->cpumask)) {
- WARN(1, "%pS enabled interrupts and double executed\n",
- func);
+ WARN(1, "%pf enabled interrupts and double executed\n", func);
continue;
}
@@ -450,7 +450,7 @@ void smp_call_function_many(const struct cpumask *mask,
{
struct call_function_data *data;
unsigned long flags;
- int cpu, next_cpu, this_cpu = smp_processor_id();
+ int refs, cpu, next_cpu, this_cpu = smp_processor_id();
/*
* Can deadlock when called with interrupts disabled.
@@ -461,7 +461,7 @@ void smp_call_function_many(const struct cpumask *mask,
WARN_ON_ONCE(cpu_online(this_cpu) && irqs_disabled()
&& !oops_in_progress && !early_boot_irqs_disabled);
- /* So, what's a CPU they want? Ignoring this one. */
+ /* Try to fastpath. So, what's a CPU they want? Ignoring this one. */
cpu = cpumask_first_and(mask, cpu_online_mask);
if (cpu == this_cpu)
cpu = cpumask_next_and(cpu, mask, cpu_online_mask);
@@ -483,22 +483,49 @@ void smp_call_function_many(const struct cpumask *mask,
data = &__get_cpu_var(cfd_data);
csd_lock(&data->csd);
+
+ /* This BUG_ON verifies our reuse assertions and can be removed */
BUG_ON(atomic_read(&data->refs) || !cpumask_empty(data->cpumask));
+ /*
+ * The global call function queue list add and delete are protected
+ * by a lock, but the list is traversed without any lock, relying
+ * on the rcu list add and delete to allow safe concurrent traversal.
+ * We reuse the call function data without waiting for any grace
+ * period after some other cpu removes it from the global queue.
+ * This means a cpu might find our data block as it is being
+ * filled out.
+ *
+ * We hold off the interrupt handler on the other cpu by
+ * ordering our writes to the cpu mask vs our setting of the
+ * refs counter. We assert only the cpu owning the data block
+ * will set a bit in cpumask, and each bit will only be cleared
+ * by the subject cpu. Each cpu must first find its bit is
+ * set and then check that refs is set indicating the element is
+ * ready to be processed, otherwise it must skip the entry.
+ *
+ * On the previous iteration refs was set to 0 by another cpu.
+ * To avoid the use of transitivity, set the counter to 0 here
+ * so the wmb will pair with the rmb in the interrupt handler.
+ */
+ atomic_set(&data->refs, 0); /* convert 3rd to 1st party write */
+
data->csd.func = func;
data->csd.info = info;
- cpumask_and(data->cpumask, mask, cpu_online_mask);
- cpumask_clear_cpu(this_cpu, data->cpumask);
- /*
- * To ensure the interrupt handler gets an complete view
- * we order the cpumask and refs writes and order the read
- * of them in the interrupt handler. In addition we may
- * only clear our own cpu bit from the mask.
- */
+ /* Ensure 0 refs is visible before mask. Also orders func and info */
smp_wmb();
- atomic_set(&data->refs, cpumask_weight(data->cpumask));
+ /* We rely on the "and" being processed before the store */
+ cpumask_and(data->cpumask, mask, cpu_online_mask);
+ cpumask_clear_cpu(this_cpu, data->cpumask);
+ refs = cpumask_weight(data->cpumask);
+
+ /* Some callers race with other cpus changing the passed mask */
+ if (unlikely(!refs)) {
+ csd_unlock(&data->csd);
+ return;
+ }
raw_spin_lock_irqsave(&call_function.lock, flags);
/*
@@ -507,6 +534,12 @@ void smp_call_function_many(const struct cpumask *mask,
* will not miss any other list entries:
*/
list_add_rcu(&data->csd.list, &call_function.queue);
+ /*
+ * We rely on the wmb() in list_add_rcu to complete our writes
+ * to the cpumask before this write to refs, which indicates
+ * data is on the list and is ready to be processed.
+ */
+ atomic_set(&data->refs, refs);
raw_spin_unlock_irqrestore(&call_function.lock, flags);
/*
@@ -571,6 +604,87 @@ void ipi_call_unlock_irq(void)
}
#endif /* USE_GENERIC_SMP_HELPERS */
+/* Setup configured maximum number of CPUs to activate */
+unsigned int setup_max_cpus = NR_CPUS;
+EXPORT_SYMBOL(setup_max_cpus);
+
+
+/*
+ * Setup routine for controlling SMP activation
+ *
+ * Command-line option of "nosmp" or "maxcpus=0" will disable SMP
+ * activation entirely (the MPS table probe still happens, though).
+ *
+ * Command-line option of "maxcpus=<NUM>", where <NUM> is an integer
+ * greater than 0, limits the maximum number of CPUs activated in
+ * SMP mode to <NUM>.
+ */
+
+void __weak arch_disable_smp_support(void) { }
+
+static int __init nosmp(char *str)
+{
+ setup_max_cpus = 0;
+ arch_disable_smp_support();
+
+ return 0;
+}
+
+early_param("nosmp", nosmp);
+
+/* this is hard limit */
+static int __init nrcpus(char *str)
+{
+ int nr_cpus;
+
+ get_option(&str, &nr_cpus);
+ if (nr_cpus > 0 && nr_cpus < nr_cpu_ids)
+ nr_cpu_ids = nr_cpus;
+
+ return 0;
+}
+
+early_param("nr_cpus", nrcpus);
+
+static int __init maxcpus(char *str)
+{
+ get_option(&str, &setup_max_cpus);
+ if (setup_max_cpus == 0)
+ arch_disable_smp_support();
+
+ return 0;
+}
+
+early_param("maxcpus", maxcpus);
+
+/* Setup number of possible processor ids */
+int nr_cpu_ids __read_mostly = NR_CPUS;
+EXPORT_SYMBOL(nr_cpu_ids);
+
+/* An arch may set nr_cpu_ids earlier if needed, so this would be redundant */
+void __init setup_nr_cpu_ids(void)
+{
+ nr_cpu_ids = find_last_bit(cpumask_bits(cpu_possible_mask),NR_CPUS) + 1;
+}
+
+/* Called by boot processor to activate the rest. */
+void __init smp_init(void)
+{
+ unsigned int cpu;
+
+ /* FIXME: This should be done in userspace --RR */
+ for_each_present_cpu(cpu) {
+ if (num_online_cpus() >= setup_max_cpus)
+ break;
+ if (!cpu_online(cpu))
+ cpu_up(cpu);
+ }
+
+ /* Any cleanup work */
+ printk(KERN_INFO "Brought up %ld CPUs\n", (long)num_online_cpus());
+ smp_cpus_done(setup_max_cpus);
+}
+
/*
* Call a function on all processors. May be used during early boot while
* early_boot_irqs_disabled is set. Use local_irq_save/restore() instead
diff --git a/kernel/softirq.c b/kernel/softirq.c
index 68eb5efec388..13960170cad4 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -54,11 +54,11 @@ EXPORT_SYMBOL(irq_stat);
static struct softirq_action softirq_vec[NR_SOFTIRQS] __cacheline_aligned_in_smp;
-static DEFINE_PER_CPU(struct task_struct *, ksoftirqd);
+DEFINE_PER_CPU(struct task_struct *, ksoftirqd);
char *softirq_to_name[NR_SOFTIRQS] = {
"HI", "TIMER", "NET_TX", "NET_RX", "BLOCK", "BLOCK_IOPOLL",
- "TASKLET", "SCHED", "HRTIMER", "RCU"
+ "TASKLET", "SCHED", "HRTIMER"
};
/*
@@ -311,9 +311,21 @@ void irq_enter(void)
}
#ifdef __ARCH_IRQ_EXIT_IRQS_DISABLED
-# define invoke_softirq() __do_softirq()
+static inline void invoke_softirq(void)
+{
+ if (!force_irqthreads)
+ __do_softirq();
+ else
+ wakeup_softirqd();
+}
#else
-# define invoke_softirq() do_softirq()
+static inline void invoke_softirq(void)
+{
+ if (!force_irqthreads)
+ do_softirq();
+ else
+ wakeup_softirqd();
+}
#endif
/*
@@ -555,7 +567,7 @@ static void __tasklet_hrtimer_trampoline(unsigned long data)
/**
* tasklet_hrtimer_init - Init a tasklet/hrtimer combo for softirq callbacks
* @ttimer: tasklet_hrtimer which is initialized
- * @function: hrtimer callback funtion which gets called from softirq context
+ * @function: hrtimer callback function which gets called from softirq context
* @which_clock: clock id (CLOCK_MONOTONIC/CLOCK_REALTIME)
* @mode: hrtimer mode (HRTIMER_MODE_ABS/HRTIMER_MODE_REL)
*/
@@ -721,7 +733,6 @@ static int run_ksoftirqd(void * __bind_cpu)
{
set_current_state(TASK_INTERRUPTIBLE);
- current->flags |= PF_KSOFTIRQD;
while (!kthread_should_stop()) {
preempt_disable();
if (!local_softirq_pending()) {
@@ -738,7 +749,10 @@ static int run_ksoftirqd(void * __bind_cpu)
don't process */
if (cpu_is_offline((long)__bind_cpu))
goto wait_to_die;
- do_softirq();
+ local_irq_disable();
+ if (local_softirq_pending())
+ __do_softirq();
+ local_irq_enable();
preempt_enable_no_resched();
cond_resched();
preempt_disable();
@@ -831,7 +845,10 @@ static int __cpuinit cpu_callback(struct notifier_block *nfb,
switch (action) {
case CPU_UP_PREPARE:
case CPU_UP_PREPARE_FROZEN:
- p = kthread_create(run_ksoftirqd, hcpu, "ksoftirqd/%d", hotcpu);
+ p = kthread_create_on_node(run_ksoftirqd,
+ hcpu,
+ cpu_to_node(hotcpu),
+ "ksoftirqd/%d", hotcpu);
if (IS_ERR(p)) {
printk("ksoftirqd for %i failed\n", hotcpu);
return notifier_from_errno(PTR_ERR(p));
diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c
index 2df820b03beb..e3516b29076c 100644
--- a/kernel/stop_machine.c
+++ b/kernel/stop_machine.c
@@ -301,8 +301,10 @@ static int __cpuinit cpu_stop_cpu_callback(struct notifier_block *nfb,
case CPU_UP_PREPARE:
BUG_ON(stopper->thread || stopper->enabled ||
!list_empty(&stopper->works));
- p = kthread_create(cpu_stopper_thread, stopper, "migration/%d",
- cpu);
+ p = kthread_create_on_node(cpu_stopper_thread,
+ stopper,
+ cpu_to_node(cpu),
+ "migration/%d", cpu);
if (IS_ERR(p))
return notifier_from_errno(PTR_ERR(p));
get_task_struct(p);
diff --git a/kernel/sys.c b/kernel/sys.c
index 18da702ec813..e4128b278f23 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -37,6 +37,7 @@
#include <linux/ptrace.h>
#include <linux/fs_struct.h>
#include <linux/gfp.h>
+#include <linux/syscore_ops.h>
#include <linux/compat.h>
#include <linux/syscalls.h>
@@ -119,16 +120,33 @@ EXPORT_SYMBOL(cad_pid);
void (*pm_power_off_prepare)(void);
/*
+ * Returns true if current's euid is same as p's uid or euid,
+ * or has CAP_SYS_NICE to p's user_ns.
+ *
+ * Called with rcu_read_lock, creds are safe
+ */
+static bool set_one_prio_perm(struct task_struct *p)
+{
+ const struct cred *cred = current_cred(), *pcred = __task_cred(p);
+
+ if (pcred->user->user_ns == cred->user->user_ns &&
+ (pcred->uid == cred->euid ||
+ pcred->euid == cred->euid))
+ return true;
+ if (ns_capable(pcred->user->user_ns, CAP_SYS_NICE))
+ return true;
+ return false;
+}
+
+/*
* set the priority of a task
* - the caller must hold the RCU read lock
*/
static int set_one_prio(struct task_struct *p, int niceval, int error)
{
- const struct cred *cred = current_cred(), *pcred = __task_cred(p);
int no_nice;
- if (pcred->uid != cred->euid &&
- pcred->euid != cred->euid && !capable(CAP_SYS_NICE)) {
+ if (!set_one_prio_perm(p)) {
error = -EPERM;
goto out;
}
@@ -296,8 +314,9 @@ void kernel_restart_prepare(char *cmd)
{
blocking_notifier_call_chain(&reboot_notifier_list, SYS_RESTART, cmd);
system_state = SYSTEM_RESTART;
+ usermodehelper_disable();
device_shutdown();
- sysdev_shutdown();
+ syscore_shutdown();
}
/**
@@ -325,6 +344,7 @@ static void kernel_shutdown_prepare(enum system_states state)
blocking_notifier_call_chain(&reboot_notifier_list,
(state == SYSTEM_HALT)?SYS_HALT:SYS_POWER_OFF, NULL);
system_state = state;
+ usermodehelper_disable();
device_shutdown();
}
/**
@@ -335,7 +355,7 @@ static void kernel_shutdown_prepare(enum system_states state)
void kernel_halt(void)
{
kernel_shutdown_prepare(SYSTEM_HALT);
- sysdev_shutdown();
+ syscore_shutdown();
printk(KERN_EMERG "System halted.\n");
kmsg_dump(KMSG_DUMP_HALT);
machine_halt();
@@ -354,7 +374,7 @@ void kernel_power_off(void)
if (pm_power_off_prepare)
pm_power_off_prepare();
disable_nonboot_cpus();
- sysdev_shutdown();
+ syscore_shutdown();
printk(KERN_EMERG "Power down.\n");
kmsg_dump(KMSG_DUMP_POWEROFF);
machine_power_off();
@@ -502,7 +522,7 @@ SYSCALL_DEFINE2(setregid, gid_t, rgid, gid_t, egid)
if (rgid != (gid_t) -1) {
if (old->gid == rgid ||
old->egid == rgid ||
- capable(CAP_SETGID))
+ nsown_capable(CAP_SETGID))
new->gid = rgid;
else
goto error;
@@ -511,7 +531,7 @@ SYSCALL_DEFINE2(setregid, gid_t, rgid, gid_t, egid)
if (old->gid == egid ||
old->egid == egid ||
old->sgid == egid ||
- capable(CAP_SETGID))
+ nsown_capable(CAP_SETGID))
new->egid = egid;
else
goto error;
@@ -546,7 +566,7 @@ SYSCALL_DEFINE1(setgid, gid_t, gid)
old = current_cred();
retval = -EPERM;
- if (capable(CAP_SETGID))
+ if (nsown_capable(CAP_SETGID))
new->gid = new->egid = new->sgid = new->fsgid = gid;
else if (gid == old->gid || gid == old->sgid)
new->egid = new->fsgid = gid;
@@ -613,7 +633,7 @@ SYSCALL_DEFINE2(setreuid, uid_t, ruid, uid_t, euid)
new->uid = ruid;
if (old->uid != ruid &&
old->euid != ruid &&
- !capable(CAP_SETUID))
+ !nsown_capable(CAP_SETUID))
goto error;
}
@@ -622,7 +642,7 @@ SYSCALL_DEFINE2(setreuid, uid_t, ruid, uid_t, euid)
if (old->uid != euid &&
old->euid != euid &&
old->suid != euid &&
- !capable(CAP_SETUID))
+ !nsown_capable(CAP_SETUID))
goto error;
}
@@ -670,7 +690,7 @@ SYSCALL_DEFINE1(setuid, uid_t, uid)
old = current_cred();
retval = -EPERM;
- if (capable(CAP_SETUID)) {
+ if (nsown_capable(CAP_SETUID)) {
new->suid = new->uid = uid;
if (uid != old->uid) {
retval = set_user(new);
@@ -712,7 +732,7 @@ SYSCALL_DEFINE3(setresuid, uid_t, ruid, uid_t, euid, uid_t, suid)
old = current_cred();
retval = -EPERM;
- if (!capable(CAP_SETUID)) {
+ if (!nsown_capable(CAP_SETUID)) {
if (ruid != (uid_t) -1 && ruid != old->uid &&
ruid != old->euid && ruid != old->suid)
goto error;
@@ -776,7 +796,7 @@ SYSCALL_DEFINE3(setresgid, gid_t, rgid, gid_t, egid, gid_t, sgid)
old = current_cred();
retval = -EPERM;
- if (!capable(CAP_SETGID)) {
+ if (!nsown_capable(CAP_SETGID)) {
if (rgid != (gid_t) -1 && rgid != old->gid &&
rgid != old->egid && rgid != old->sgid)
goto error;
@@ -836,7 +856,7 @@ SYSCALL_DEFINE1(setfsuid, uid_t, uid)
if (uid == old->uid || uid == old->euid ||
uid == old->suid || uid == old->fsuid ||
- capable(CAP_SETUID)) {
+ nsown_capable(CAP_SETUID)) {
if (uid != old_fsuid) {
new->fsuid = uid;
if (security_task_fix_setuid(new, old, LSM_SETID_FS) == 0)
@@ -869,7 +889,7 @@ SYSCALL_DEFINE1(setfsgid, gid_t, gid)
if (gid == old->gid || gid == old->egid ||
gid == old->sgid || gid == old->fsgid ||
- capable(CAP_SETGID)) {
+ nsown_capable(CAP_SETGID)) {
if (gid != old_fsgid) {
new->fsgid = gid;
goto change_okay;
@@ -1177,8 +1197,9 @@ SYSCALL_DEFINE2(sethostname, char __user *, name, int, len)
int errno;
char tmp[__NEW_UTS_LEN];
- if (!capable(CAP_SYS_ADMIN))
+ if (!ns_capable(current->nsproxy->uts_ns->user_ns, CAP_SYS_ADMIN))
return -EPERM;
+
if (len < 0 || len > __NEW_UTS_LEN)
return -EINVAL;
down_write(&uts_sem);
@@ -1226,7 +1247,7 @@ SYSCALL_DEFINE2(setdomainname, char __user *, name, int, len)
int errno;
char tmp[__NEW_UTS_LEN];
- if (!capable(CAP_SYS_ADMIN))
+ if (!ns_capable(current->nsproxy->uts_ns->user_ns, CAP_SYS_ADMIN))
return -EPERM;
if (len < 0 || len > __NEW_UTS_LEN)
return -EINVAL;
@@ -1341,6 +1362,8 @@ int do_prlimit(struct task_struct *tsk, unsigned int resource,
rlim = tsk->signal->rlim + resource;
task_lock(tsk->group_leader);
if (new_rlim) {
+ /* Keep the capable check against init_user_ns until
+ cgroups can contain all limits */
if (new_rlim->rlim_max > rlim->rlim_max &&
!capable(CAP_SYS_RESOURCE))
retval = -EPERM;
@@ -1384,19 +1407,22 @@ static int check_prlimit_permission(struct task_struct *task)
{
const struct cred *cred = current_cred(), *tcred;
- tcred = __task_cred(task);
- if (current != task &&
- (cred->uid != tcred->euid ||
- cred->uid != tcred->suid ||
- cred->uid != tcred->uid ||
- cred->gid != tcred->egid ||
- cred->gid != tcred->sgid ||
- cred->gid != tcred->gid) &&
- !capable(CAP_SYS_RESOURCE)) {
- return -EPERM;
- }
+ if (current == task)
+ return 0;
- return 0;
+ tcred = __task_cred(task);
+ if (cred->user->user_ns == tcred->user->user_ns &&
+ (cred->uid == tcred->euid &&
+ cred->uid == tcred->suid &&
+ cred->uid == tcred->uid &&
+ cred->gid == tcred->egid &&
+ cred->gid == tcred->sgid &&
+ cred->gid == tcred->gid))
+ return 0;
+ if (ns_capable(tcred->user->user_ns, CAP_SYS_RESOURCE))
+ return 0;
+
+ return -EPERM;
}
SYSCALL_DEFINE4(prlimit64, pid_t, pid, unsigned int, resource,
diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c
index c782fe9924c7..62cbc8877fef 100644
--- a/kernel/sys_ni.c
+++ b/kernel/sys_ni.c
@@ -46,7 +46,9 @@ cond_syscall(sys_getsockopt);
cond_syscall(compat_sys_getsockopt);
cond_syscall(sys_shutdown);
cond_syscall(sys_sendmsg);
+cond_syscall(sys_sendmmsg);
cond_syscall(compat_sys_sendmsg);
+cond_syscall(compat_sys_sendmmsg);
cond_syscall(sys_recvmsg);
cond_syscall(sys_recvmmsg);
cond_syscall(compat_sys_recvmsg);
@@ -69,15 +71,22 @@ cond_syscall(compat_sys_epoll_pwait);
cond_syscall(sys_semget);
cond_syscall(sys_semop);
cond_syscall(sys_semtimedop);
+cond_syscall(compat_sys_semtimedop);
cond_syscall(sys_semctl);
+cond_syscall(compat_sys_semctl);
cond_syscall(sys_msgget);
cond_syscall(sys_msgsnd);
+cond_syscall(compat_sys_msgsnd);
cond_syscall(sys_msgrcv);
+cond_syscall(compat_sys_msgrcv);
cond_syscall(sys_msgctl);
+cond_syscall(compat_sys_msgctl);
cond_syscall(sys_shmget);
cond_syscall(sys_shmat);
+cond_syscall(compat_sys_shmat);
cond_syscall(sys_shmdt);
cond_syscall(sys_shmctl);
+cond_syscall(compat_sys_shmctl);
cond_syscall(sys_mq_open);
cond_syscall(sys_mq_unlink);
cond_syscall(sys_mq_timedsend);
@@ -186,3 +195,8 @@ cond_syscall(sys_perf_event_open);
/* fanotify! */
cond_syscall(sys_fanotify_init);
cond_syscall(sys_fanotify_mark);
+
+/* open by handle */
+cond_syscall(sys_name_to_handle_at);
+cond_syscall(sys_open_by_handle_at);
+cond_syscall(compat_sys_open_by_handle_at);
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 4eed0af5d144..4fc92445a29c 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -56,6 +56,7 @@
#include <linux/kprobes.h>
#include <linux/pipe_fs_i.h>
#include <linux/oom.h>
+#include <linux/kmod.h>
#include <asm/uaccess.h>
#include <asm/processor.h>
@@ -117,6 +118,7 @@ static int neg_one = -1;
static int zero;
static int __maybe_unused one = 1;
static int __maybe_unused two = 2;
+static int __maybe_unused three = 3;
static unsigned long one_ul = 1;
static int one_hundred = 100;
#ifdef CONFIG_PRINTK
@@ -169,6 +171,11 @@ static int proc_taint(struct ctl_table *table, int write,
void __user *buffer, size_t *lenp, loff_t *ppos);
#endif
+#ifdef CONFIG_PRINTK
+static int proc_dmesg_restrict(struct ctl_table *table, int write,
+ void __user *buffer, size_t *lenp, loff_t *ppos);
+#endif
+
#ifdef CONFIG_MAGIC_SYSRQ
/* Note: sysrq code uses it's own private copy */
static int __sysrq_enabled = SYSRQ_DEFAULT_ENABLE;
@@ -361,20 +368,13 @@ static struct ctl_table kern_table[] = {
.mode = 0644,
.proc_handler = sched_rt_handler,
},
- {
- .procname = "sched_compat_yield",
- .data = &sysctl_sched_compat_yield,
- .maxlen = sizeof(unsigned int),
- .mode = 0644,
- .proc_handler = proc_dointvec,
- },
#ifdef CONFIG_SCHED_AUTOGROUP
{
.procname = "sched_autogroup_enabled",
.data = &sysctl_sched_autogroup_enabled,
.maxlen = sizeof(unsigned int),
.mode = 0644,
- .proc_handler = proc_dointvec,
+ .proc_handler = proc_dointvec_minmax,
.extra1 = &zero,
.extra2 = &one,
},
@@ -617,6 +617,11 @@ static struct ctl_table kern_table[] = {
.child = random_table,
},
{
+ .procname = "usermodehelper",
+ .mode = 0555,
+ .child = usermodehelper_table,
+ },
+ {
.procname = "overflowuid",
.data = &overflowuid,
.maxlen = sizeof(int),
@@ -713,7 +718,7 @@ static struct ctl_table kern_table[] = {
.data = &kptr_restrict,
.maxlen = sizeof(int),
.mode = 0644,
- .proc_handler = proc_dointvec_minmax,
+ .proc_handler = proc_dmesg_restrict,
.extra1 = &zero,
.extra2 = &two,
},
@@ -731,14 +736,16 @@ static struct ctl_table kern_table[] = {
.data = &watchdog_enabled,
.maxlen = sizeof (int),
.mode = 0644,
- .proc_handler = proc_dowatchdog_enabled,
+ .proc_handler = proc_dowatchdog,
+ .extra1 = &zero,
+ .extra2 = &one,
},
{
.procname = "watchdog_thresh",
- .data = &softlockup_thresh,
+ .data = &watchdog_thresh,
.maxlen = sizeof(int),
.mode = 0644,
- .proc_handler = proc_dowatchdog_thresh,
+ .proc_handler = proc_dowatchdog,
.extra1 = &neg_one,
.extra2 = &sixty,
},
@@ -756,7 +763,9 @@ static struct ctl_table kern_table[] = {
.data = &watchdog_enabled,
.maxlen = sizeof (int),
.mode = 0644,
- .proc_handler = proc_dowatchdog_enabled,
+ .proc_handler = proc_dowatchdog,
+ .extra1 = &zero,
+ .extra2 = &one,
},
#endif
#if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86)
@@ -948,7 +957,7 @@ static struct ctl_table kern_table[] = {
.data = &sysctl_perf_event_sample_rate,
.maxlen = sizeof(sysctl_perf_event_sample_rate),
.mode = 0644,
- .proc_handler = proc_dointvec,
+ .proc_handler = perf_proc_update_handler,
},
#endif
#ifdef CONFIG_KMEMCHECK
@@ -978,14 +987,18 @@ static struct ctl_table vm_table[] = {
.data = &sysctl_overcommit_memory,
.maxlen = sizeof(sysctl_overcommit_memory),
.mode = 0644,
- .proc_handler = proc_dointvec,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = &zero,
+ .extra2 = &two,
},
{
.procname = "panic_on_oom",
.data = &sysctl_panic_on_oom,
.maxlen = sizeof(sysctl_panic_on_oom),
.mode = 0644,
- .proc_handler = proc_dointvec,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = &zero,
+ .extra2 = &two,
},
{
.procname = "oom_kill_allocating_task",
@@ -1013,7 +1026,8 @@ static struct ctl_table vm_table[] = {
.data = &page_cluster,
.maxlen = sizeof(int),
.mode = 0644,
- .proc_handler = proc_dointvec,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = &zero,
},
{
.procname = "dirty_background_ratio",
@@ -1061,7 +1075,8 @@ static struct ctl_table vm_table[] = {
.data = &dirty_expire_interval,
.maxlen = sizeof(dirty_expire_interval),
.mode = 0644,
- .proc_handler = proc_dointvec,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = &zero,
},
{
.procname = "nr_pdflush_threads",
@@ -1137,6 +1152,8 @@ static struct ctl_table vm_table[] = {
.maxlen = sizeof(int),
.mode = 0644,
.proc_handler = drop_caches_sysctl_handler,
+ .extra1 = &one,
+ .extra2 = &three,
},
#ifdef CONFIG_COMPACTION
{
@@ -1489,7 +1506,7 @@ static struct ctl_table fs_table[] = {
static struct ctl_table debug_table[] = {
#if defined(CONFIG_X86) || defined(CONFIG_PPC) || defined(CONFIG_SPARC) || \
- defined(CONFIG_S390)
+ defined(CONFIG_S390) || defined(CONFIG_TILE)
{
.procname = "exception-trace",
.data = &show_unhandled_signals,
@@ -1690,13 +1707,8 @@ static int test_perm(int mode, int op)
int sysctl_perm(struct ctl_table_root *root, struct ctl_table *table, int op)
{
- int error;
int mode;
- error = security_sysctl(table, op & (MAY_READ | MAY_WRITE | MAY_EXEC));
- if (error)
- return error;
-
if (root->permissions)
mode = root->permissions(root, current->nsproxy, table);
else
@@ -2397,6 +2409,17 @@ static int proc_taint(struct ctl_table *table, int write,
return err;
}
+#ifdef CONFIG_PRINTK
+static int proc_dmesg_restrict(struct ctl_table *table, int write,
+ void __user *buffer, size_t *lenp, loff_t *ppos)
+{
+ if (write && !capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ return proc_dointvec_minmax(table, write, buffer, lenp, ppos);
+}
+#endif
+
struct do_proc_dointvec_minmax_conv_param {
int *min;
int *max;
diff --git a/kernel/sysctl_binary.c b/kernel/sysctl_binary.c
index b875bedf7c9a..3b8e028b9601 100644
--- a/kernel/sysctl_binary.c
+++ b/kernel/sysctl_binary.c
@@ -1321,13 +1321,11 @@ static ssize_t binary_sysctl(const int *name, int nlen,
void __user *oldval, size_t oldlen, void __user *newval, size_t newlen)
{
const struct bin_table *table = NULL;
- struct nameidata nd;
struct vfsmount *mnt;
struct file *file;
ssize_t result;
char *pathname;
int flags;
- int acc_mode;
pathname = sysctl_getname(name, nlen, &table);
result = PTR_ERR(pathname);
@@ -1337,28 +1335,17 @@ static ssize_t binary_sysctl(const int *name, int nlen,
/* How should the sysctl be accessed? */
if (oldval && oldlen && newval && newlen) {
flags = O_RDWR;
- acc_mode = MAY_READ | MAY_WRITE;
} else if (newval && newlen) {
flags = O_WRONLY;
- acc_mode = MAY_WRITE;
} else if (oldval && oldlen) {
flags = O_RDONLY;
- acc_mode = MAY_READ;
} else {
result = 0;
goto out_putname;
}
mnt = current->nsproxy->pid_ns->proc_mnt;
- result = vfs_path_lookup(mnt->mnt_root, mnt, pathname, 0, &nd);
- if (result)
- goto out_putname;
-
- result = may_open(&nd.path, acc_mode, flags);
- if (result)
- goto out_putpath;
-
- file = dentry_open(nd.path.dentry, nd.path.mnt, flags, current_cred());
+ file = file_open_root(mnt->mnt_root, mnt, pathname, flags);
result = PTR_ERR(file);
if (IS_ERR(file))
goto out_putname;
@@ -1370,10 +1357,6 @@ out_putname:
putname(pathname);
out:
return result;
-
-out_putpath:
- path_put(&nd.path);
- goto out_putname;
}
diff --git a/kernel/sysctl_check.c b/kernel/sysctl_check.c
index 10b90d8a03c4..4e4932a7b360 100644
--- a/kernel/sysctl_check.c
+++ b/kernel/sysctl_check.c
@@ -111,11 +111,9 @@ int sysctl_check_table(struct nsproxy *namespaces, struct ctl_table *table)
const char *fail = NULL;
if (table->parent) {
- if (table->procname && !table->parent->procname)
+ if (!table->parent->procname)
set_fail(&fail, table, "Parent without procname");
}
- if (!table->procname)
- set_fail(&fail, table, "No procname");
if (table->child) {
if (table->data)
set_fail(&fail, table, "Directory with data?");
@@ -144,13 +142,9 @@ int sysctl_check_table(struct nsproxy *namespaces, struct ctl_table *table)
set_fail(&fail, table, "No maxlen");
}
#ifdef CONFIG_PROC_SYSCTL
- if (table->procname && !table->proc_handler)
+ if (!table->proc_handler)
set_fail(&fail, table, "No proc_handler");
#endif
-#if 0
- if (!table->procname && table->proc_handler)
- set_fail(&fail, table, "proc_handler without procname");
-#endif
sysctl_check_leaf(namespaces, table, &fail);
}
if (table->mode > 0777)
diff --git a/kernel/taskstats.c b/kernel/taskstats.c
index 3971c6b9d58d..9ffea360a778 100644
--- a/kernel/taskstats.c
+++ b/kernel/taskstats.c
@@ -685,7 +685,7 @@ static int __init taskstats_init(void)
goto err_cgroup_ops;
family_registered = 1;
- printk("registered taskstats version %d\n", TASKSTATS_GENL_VERSION);
+ pr_info("registered taskstats version %d\n", TASKSTATS_GENL_VERSION);
return 0;
err_cgroup_ops:
genl_unregister_ops(&family, &taskstats_ops);
diff --git a/kernel/time.c b/kernel/time.c
index 32174359576f..8e8dc6d705c9 100644
--- a/kernel/time.c
+++ b/kernel/time.c
@@ -150,7 +150,7 @@ static inline void warp_clock(void)
* various programs will get confused when the clock gets warped.
*/
-int do_sys_settimeofday(struct timespec *tv, struct timezone *tz)
+int do_sys_settimeofday(const struct timespec *tv, const struct timezone *tz)
{
static int firsttime = 1;
int error = 0;
@@ -645,7 +645,7 @@ u64 nsec_to_clock_t(u64 x)
}
/**
- * nsecs_to_jiffies - Convert nsecs in u64 to jiffies
+ * nsecs_to_jiffies64 - Convert nsecs in u64 to jiffies64
*
* @n: nsecs in u64
*
@@ -657,7 +657,7 @@ u64 nsec_to_clock_t(u64 x)
* NSEC_PER_SEC = 10^9 = (5^9 * 2^9) = (1953125 * 512)
* ULLONG_MAX ns = 18446744073.709551615 secs = about 584 years
*/
-unsigned long nsecs_to_jiffies(u64 n)
+u64 nsecs_to_jiffies64(u64 n)
{
#if (NSEC_PER_SEC % HZ) == 0
/* Common case, HZ = 100, 128, 200, 250, 256, 500, 512, 1000 etc. */
@@ -674,22 +674,23 @@ unsigned long nsecs_to_jiffies(u64 n)
#endif
}
-#if (BITS_PER_LONG < 64)
-u64 get_jiffies_64(void)
+/**
+ * nsecs_to_jiffies - Convert nsecs in u64 to jiffies
+ *
+ * @n: nsecs in u64
+ *
+ * Unlike {m,u}secs_to_jiffies, type of input is not unsigned int but u64.
+ * And this doesn't return MAX_JIFFY_OFFSET since this function is designed
+ * for scheduler, not for use in device drivers to calculate timeout value.
+ *
+ * note:
+ * NSEC_PER_SEC = 10^9 = (5^9 * 2^9) = (1953125 * 512)
+ * ULLONG_MAX ns = 18446744073.709551615 secs = about 584 years
+ */
+unsigned long nsecs_to_jiffies(u64 n)
{
- unsigned long seq;
- u64 ret;
-
- do {
- seq = read_seqbegin(&xtime_lock);
- ret = jiffies_64;
- } while (read_seqretry(&xtime_lock, seq));
- return ret;
+ return (unsigned long)nsecs_to_jiffies64(n);
}
-EXPORT_SYMBOL(get_jiffies_64);
-#endif
-
-EXPORT_SYMBOL(jiffies);
/*
* Add two timespec values and do a safety check for overflow.
diff --git a/kernel/time/Makefile b/kernel/time/Makefile
index ee266620b06c..e2fd74b8e8c2 100644
--- a/kernel/time/Makefile
+++ b/kernel/time/Makefile
@@ -1,4 +1,5 @@
-obj-y += timekeeping.o ntp.o clocksource.o jiffies.o timer_list.o timecompare.o timeconv.o
+obj-y += timekeeping.o ntp.o clocksource.o jiffies.o timer_list.o timecompare.o
+obj-y += timeconv.o posix-clock.o alarmtimer.o
obj-$(CONFIG_GENERIC_CLOCKEVENTS_BUILD) += clockevents.o
obj-$(CONFIG_GENERIC_CLOCKEVENTS) += tick-common.o
diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c
new file mode 100644
index 000000000000..2d966244ea60
--- /dev/null
+++ b/kernel/time/alarmtimer.c
@@ -0,0 +1,702 @@
+/*
+ * Alarmtimer interface
+ *
+ * This interface provides a timer which is similarto hrtimers,
+ * but triggers a RTC alarm if the box is suspend.
+ *
+ * This interface is influenced by the Android RTC Alarm timer
+ * interface.
+ *
+ * Copyright (C) 2010 IBM Corperation
+ *
+ * Author: John Stultz <john.stultz@linaro.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+#include <linux/time.h>
+#include <linux/hrtimer.h>
+#include <linux/timerqueue.h>
+#include <linux/rtc.h>
+#include <linux/alarmtimer.h>
+#include <linux/mutex.h>
+#include <linux/platform_device.h>
+#include <linux/posix-timers.h>
+#include <linux/workqueue.h>
+#include <linux/freezer.h>
+
+/**
+ * struct alarm_base - Alarm timer bases
+ * @lock: Lock for syncrhonized access to the base
+ * @timerqueue: Timerqueue head managing the list of events
+ * @timer: hrtimer used to schedule events while running
+ * @gettime: Function to read the time correlating to the base
+ * @base_clockid: clockid for the base
+ */
+static struct alarm_base {
+ spinlock_t lock;
+ struct timerqueue_head timerqueue;
+ struct hrtimer timer;
+ ktime_t (*gettime)(void);
+ clockid_t base_clockid;
+} alarm_bases[ALARM_NUMTYPE];
+
+#ifdef CONFIG_RTC_CLASS
+/* rtc timer and device for setting alarm wakeups at suspend */
+static struct rtc_timer rtctimer;
+static struct rtc_device *rtcdev;
+#endif
+
+/* freezer delta & lock used to handle clock_nanosleep triggered wakeups */
+static ktime_t freezer_delta;
+static DEFINE_SPINLOCK(freezer_delta_lock);
+
+
+/**
+ * alarmtimer_enqueue - Adds an alarm timer to an alarm_base timerqueue
+ * @base: pointer to the base where the timer is being run
+ * @alarm: pointer to alarm being enqueued.
+ *
+ * Adds alarm to a alarm_base timerqueue and if necessary sets
+ * an hrtimer to run.
+ *
+ * Must hold base->lock when calling.
+ */
+static void alarmtimer_enqueue(struct alarm_base *base, struct alarm *alarm)
+{
+ timerqueue_add(&base->timerqueue, &alarm->node);
+ if (&alarm->node == timerqueue_getnext(&base->timerqueue)) {
+ hrtimer_try_to_cancel(&base->timer);
+ hrtimer_start(&base->timer, alarm->node.expires,
+ HRTIMER_MODE_ABS);
+ }
+}
+
+/**
+ * alarmtimer_remove - Removes an alarm timer from an alarm_base timerqueue
+ * @base: pointer to the base where the timer is running
+ * @alarm: pointer to alarm being removed
+ *
+ * Removes alarm to a alarm_base timerqueue and if necessary sets
+ * a new timer to run.
+ *
+ * Must hold base->lock when calling.
+ */
+static void alarmtimer_remove(struct alarm_base *base, struct alarm *alarm)
+{
+ struct timerqueue_node *next = timerqueue_getnext(&base->timerqueue);
+
+ timerqueue_del(&base->timerqueue, &alarm->node);
+ if (next == &alarm->node) {
+ hrtimer_try_to_cancel(&base->timer);
+ next = timerqueue_getnext(&base->timerqueue);
+ if (!next)
+ return;
+ hrtimer_start(&base->timer, next->expires, HRTIMER_MODE_ABS);
+ }
+}
+
+
+/**
+ * alarmtimer_fired - Handles alarm hrtimer being fired.
+ * @timer: pointer to hrtimer being run
+ *
+ * When a alarm timer fires, this runs through the timerqueue to
+ * see which alarms expired, and runs those. If there are more alarm
+ * timers queued for the future, we set the hrtimer to fire when
+ * when the next future alarm timer expires.
+ */
+static enum hrtimer_restart alarmtimer_fired(struct hrtimer *timer)
+{
+ struct alarm_base *base = container_of(timer, struct alarm_base, timer);
+ struct timerqueue_node *next;
+ unsigned long flags;
+ ktime_t now;
+ int ret = HRTIMER_NORESTART;
+
+ spin_lock_irqsave(&base->lock, flags);
+ now = base->gettime();
+ while ((next = timerqueue_getnext(&base->timerqueue))) {
+ struct alarm *alarm;
+ ktime_t expired = next->expires;
+
+ if (expired.tv64 >= now.tv64)
+ break;
+
+ alarm = container_of(next, struct alarm, node);
+
+ timerqueue_del(&base->timerqueue, &alarm->node);
+ alarm->enabled = 0;
+ /* Re-add periodic timers */
+ if (alarm->period.tv64) {
+ alarm->node.expires = ktime_add(expired, alarm->period);
+ timerqueue_add(&base->timerqueue, &alarm->node);
+ alarm->enabled = 1;
+ }
+ spin_unlock_irqrestore(&base->lock, flags);
+ if (alarm->function)
+ alarm->function(alarm);
+ spin_lock_irqsave(&base->lock, flags);
+ }
+
+ if (next) {
+ hrtimer_set_expires(&base->timer, next->expires);
+ ret = HRTIMER_RESTART;
+ }
+ spin_unlock_irqrestore(&base->lock, flags);
+
+ return ret;
+
+}
+
+#ifdef CONFIG_RTC_CLASS
+/**
+ * alarmtimer_suspend - Suspend time callback
+ * @dev: unused
+ * @state: unused
+ *
+ * When we are going into suspend, we look through the bases
+ * to see which is the soonest timer to expire. We then
+ * set an rtc timer to fire that far into the future, which
+ * will wake us from suspend.
+ */
+static int alarmtimer_suspend(struct device *dev)
+{
+ struct rtc_time tm;
+ ktime_t min, now;
+ unsigned long flags;
+ int i;
+
+ spin_lock_irqsave(&freezer_delta_lock, flags);
+ min = freezer_delta;
+ freezer_delta = ktime_set(0, 0);
+ spin_unlock_irqrestore(&freezer_delta_lock, flags);
+
+ /* If we have no rtcdev, just return */
+ if (!rtcdev)
+ return 0;
+
+ /* Find the soonest timer to expire*/
+ for (i = 0; i < ALARM_NUMTYPE; i++) {
+ struct alarm_base *base = &alarm_bases[i];
+ struct timerqueue_node *next;
+ ktime_t delta;
+
+ spin_lock_irqsave(&base->lock, flags);
+ next = timerqueue_getnext(&base->timerqueue);
+ spin_unlock_irqrestore(&base->lock, flags);
+ if (!next)
+ continue;
+ delta = ktime_sub(next->expires, base->gettime());
+ if (!min.tv64 || (delta.tv64 < min.tv64))
+ min = delta;
+ }
+ if (min.tv64 == 0)
+ return 0;
+
+ /* XXX - Should we enforce a minimum sleep time? */
+ WARN_ON(min.tv64 < NSEC_PER_SEC);
+
+ /* Setup an rtc timer to fire that far in the future */
+ rtc_timer_cancel(rtcdev, &rtctimer);
+ rtc_read_time(rtcdev, &tm);
+ now = rtc_tm_to_ktime(tm);
+ now = ktime_add(now, min);
+
+ rtc_timer_start(rtcdev, &rtctimer, now, ktime_set(0, 0));
+
+ return 0;
+}
+#else
+static int alarmtimer_suspend(struct device *dev)
+{
+ return 0;
+}
+#endif
+
+static void alarmtimer_freezerset(ktime_t absexp, enum alarmtimer_type type)
+{
+ ktime_t delta;
+ unsigned long flags;
+ struct alarm_base *base = &alarm_bases[type];
+
+ delta = ktime_sub(absexp, base->gettime());
+
+ spin_lock_irqsave(&freezer_delta_lock, flags);
+ if (!freezer_delta.tv64 || (delta.tv64 < freezer_delta.tv64))
+ freezer_delta = delta;
+ spin_unlock_irqrestore(&freezer_delta_lock, flags);
+}
+
+
+/**
+ * alarm_init - Initialize an alarm structure
+ * @alarm: ptr to alarm to be initialized
+ * @type: the type of the alarm
+ * @function: callback that is run when the alarm fires
+ */
+void alarm_init(struct alarm *alarm, enum alarmtimer_type type,
+ void (*function)(struct alarm *))
+{
+ timerqueue_init(&alarm->node);
+ alarm->period = ktime_set(0, 0);
+ alarm->function = function;
+ alarm->type = type;
+ alarm->enabled = 0;
+}
+
+/**
+ * alarm_start - Sets an alarm to fire
+ * @alarm: ptr to alarm to set
+ * @start: time to run the alarm
+ * @period: period at which the alarm will recur
+ */
+void alarm_start(struct alarm *alarm, ktime_t start, ktime_t period)
+{
+ struct alarm_base *base = &alarm_bases[alarm->type];
+ unsigned long flags;
+
+ spin_lock_irqsave(&base->lock, flags);
+ if (alarm->enabled)
+ alarmtimer_remove(base, alarm);
+ alarm->node.expires = start;
+ alarm->period = period;
+ alarmtimer_enqueue(base, alarm);
+ alarm->enabled = 1;
+ spin_unlock_irqrestore(&base->lock, flags);
+}
+
+/**
+ * alarm_cancel - Tries to cancel an alarm timer
+ * @alarm: ptr to alarm to be canceled
+ */
+void alarm_cancel(struct alarm *alarm)
+{
+ struct alarm_base *base = &alarm_bases[alarm->type];
+ unsigned long flags;
+
+ spin_lock_irqsave(&base->lock, flags);
+ if (alarm->enabled)
+ alarmtimer_remove(base, alarm);
+ alarm->enabled = 0;
+ spin_unlock_irqrestore(&base->lock, flags);
+}
+
+
+/**
+ * clock2alarm - helper that converts from clockid to alarmtypes
+ * @clockid: clockid.
+ */
+static enum alarmtimer_type clock2alarm(clockid_t clockid)
+{
+ if (clockid == CLOCK_REALTIME_ALARM)
+ return ALARM_REALTIME;
+ if (clockid == CLOCK_BOOTTIME_ALARM)
+ return ALARM_BOOTTIME;
+ return -1;
+}
+
+/**
+ * alarm_handle_timer - Callback for posix timers
+ * @alarm: alarm that fired
+ *
+ * Posix timer callback for expired alarm timers.
+ */
+static void alarm_handle_timer(struct alarm *alarm)
+{
+ struct k_itimer *ptr = container_of(alarm, struct k_itimer,
+ it.alarmtimer);
+ if (posix_timer_event(ptr, 0) != 0)
+ ptr->it_overrun++;
+}
+
+/**
+ * alarm_clock_getres - posix getres interface
+ * @which_clock: clockid
+ * @tp: timespec to fill
+ *
+ * Returns the granularity of underlying alarm base clock
+ */
+static int alarm_clock_getres(const clockid_t which_clock, struct timespec *tp)
+{
+ clockid_t baseid = alarm_bases[clock2alarm(which_clock)].base_clockid;
+
+ return hrtimer_get_res(baseid, tp);
+}
+
+/**
+ * alarm_clock_get - posix clock_get interface
+ * @which_clock: clockid
+ * @tp: timespec to fill.
+ *
+ * Provides the underlying alarm base time.
+ */
+static int alarm_clock_get(clockid_t which_clock, struct timespec *tp)
+{
+ struct alarm_base *base = &alarm_bases[clock2alarm(which_clock)];
+
+ *tp = ktime_to_timespec(base->gettime());
+ return 0;
+}
+
+/**
+ * alarm_timer_create - posix timer_create interface
+ * @new_timer: k_itimer pointer to manage
+ *
+ * Initializes the k_itimer structure.
+ */
+static int alarm_timer_create(struct k_itimer *new_timer)
+{
+ enum alarmtimer_type type;
+ struct alarm_base *base;
+
+ if (!capable(CAP_WAKE_ALARM))
+ return -EPERM;
+
+ type = clock2alarm(new_timer->it_clock);
+ base = &alarm_bases[type];
+ alarm_init(&new_timer->it.alarmtimer, type, alarm_handle_timer);
+ return 0;
+}
+
+/**
+ * alarm_timer_get - posix timer_get interface
+ * @new_timer: k_itimer pointer
+ * @cur_setting: itimerspec data to fill
+ *
+ * Copies the itimerspec data out from the k_itimer
+ */
+static void alarm_timer_get(struct k_itimer *timr,
+ struct itimerspec *cur_setting)
+{
+ cur_setting->it_interval =
+ ktime_to_timespec(timr->it.alarmtimer.period);
+ cur_setting->it_value =
+ ktime_to_timespec(timr->it.alarmtimer.node.expires);
+ return;
+}
+
+/**
+ * alarm_timer_del - posix timer_del interface
+ * @timr: k_itimer pointer to be deleted
+ *
+ * Cancels any programmed alarms for the given timer.
+ */
+static int alarm_timer_del(struct k_itimer *timr)
+{
+ alarm_cancel(&timr->it.alarmtimer);
+ return 0;
+}
+
+/**
+ * alarm_timer_set - posix timer_set interface
+ * @timr: k_itimer pointer to be deleted
+ * @flags: timer flags
+ * @new_setting: itimerspec to be used
+ * @old_setting: itimerspec being replaced
+ *
+ * Sets the timer to new_setting, and starts the timer.
+ */
+static int alarm_timer_set(struct k_itimer *timr, int flags,
+ struct itimerspec *new_setting,
+ struct itimerspec *old_setting)
+{
+ /* Save old values */
+ old_setting->it_interval =
+ ktime_to_timespec(timr->it.alarmtimer.period);
+ old_setting->it_value =
+ ktime_to_timespec(timr->it.alarmtimer.node.expires);
+
+ /* If the timer was already set, cancel it */
+ alarm_cancel(&timr->it.alarmtimer);
+
+ /* start the timer */
+ alarm_start(&timr->it.alarmtimer,
+ timespec_to_ktime(new_setting->it_value),
+ timespec_to_ktime(new_setting->it_interval));
+ return 0;
+}
+
+/**
+ * alarmtimer_nsleep_wakeup - Wakeup function for alarm_timer_nsleep
+ * @alarm: ptr to alarm that fired
+ *
+ * Wakes up the task that set the alarmtimer
+ */
+static void alarmtimer_nsleep_wakeup(struct alarm *alarm)
+{
+ struct task_struct *task = (struct task_struct *)alarm->data;
+
+ alarm->data = NULL;
+ if (task)
+ wake_up_process(task);
+}
+
+/**
+ * alarmtimer_do_nsleep - Internal alarmtimer nsleep implementation
+ * @alarm: ptr to alarmtimer
+ * @absexp: absolute expiration time
+ *
+ * Sets the alarm timer and sleeps until it is fired or interrupted.
+ */
+static int alarmtimer_do_nsleep(struct alarm *alarm, ktime_t absexp)
+{
+ alarm->data = (void *)current;
+ do {
+ set_current_state(TASK_INTERRUPTIBLE);
+ alarm_start(alarm, absexp, ktime_set(0, 0));
+ if (likely(alarm->data))
+ schedule();
+
+ alarm_cancel(alarm);
+ } while (alarm->data && !signal_pending(current));
+
+ __set_current_state(TASK_RUNNING);
+
+ return (alarm->data == NULL);
+}
+
+
+/**
+ * update_rmtp - Update remaining timespec value
+ * @exp: expiration time
+ * @type: timer type
+ * @rmtp: user pointer to remaining timepsec value
+ *
+ * Helper function that fills in rmtp value with time between
+ * now and the exp value
+ */
+static int update_rmtp(ktime_t exp, enum alarmtimer_type type,
+ struct timespec __user *rmtp)
+{
+ struct timespec rmt;
+ ktime_t rem;
+
+ rem = ktime_sub(exp, alarm_bases[type].gettime());
+
+ if (rem.tv64 <= 0)
+ return 0;
+ rmt = ktime_to_timespec(rem);
+
+ if (copy_to_user(rmtp, &rmt, sizeof(*rmtp)))
+ return -EFAULT;
+
+ return 1;
+
+}
+
+/**
+ * alarm_timer_nsleep_restart - restartblock alarmtimer nsleep
+ * @restart: ptr to restart block
+ *
+ * Handles restarted clock_nanosleep calls
+ */
+static long __sched alarm_timer_nsleep_restart(struct restart_block *restart)
+{
+ enum alarmtimer_type type = restart->nanosleep.clockid;
+ ktime_t exp;
+ struct timespec __user *rmtp;
+ struct alarm alarm;
+ int ret = 0;
+
+ exp.tv64 = restart->nanosleep.expires;
+ alarm_init(&alarm, type, alarmtimer_nsleep_wakeup);
+
+ if (alarmtimer_do_nsleep(&alarm, exp))
+ goto out;
+
+ if (freezing(current))
+ alarmtimer_freezerset(exp, type);
+
+ rmtp = restart->nanosleep.rmtp;
+ if (rmtp) {
+ ret = update_rmtp(exp, type, rmtp);
+ if (ret <= 0)
+ goto out;
+ }
+
+
+ /* The other values in restart are already filled in */
+ ret = -ERESTART_RESTARTBLOCK;
+out:
+ return ret;
+}
+
+/**
+ * alarm_timer_nsleep - alarmtimer nanosleep
+ * @which_clock: clockid
+ * @flags: determins abstime or relative
+ * @tsreq: requested sleep time (abs or rel)
+ * @rmtp: remaining sleep time saved
+ *
+ * Handles clock_nanosleep calls against _ALARM clockids
+ */
+static int alarm_timer_nsleep(const clockid_t which_clock, int flags,
+ struct timespec *tsreq, struct timespec __user *rmtp)
+{
+ enum alarmtimer_type type = clock2alarm(which_clock);
+ struct alarm alarm;
+ ktime_t exp;
+ int ret = 0;
+ struct restart_block *restart;
+
+ if (!capable(CAP_WAKE_ALARM))
+ return -EPERM;
+
+ alarm_init(&alarm, type, alarmtimer_nsleep_wakeup);
+
+ exp = timespec_to_ktime(*tsreq);
+ /* Convert (if necessary) to absolute time */
+ if (flags != TIMER_ABSTIME) {
+ ktime_t now = alarm_bases[type].gettime();
+ exp = ktime_add(now, exp);
+ }
+
+ if (alarmtimer_do_nsleep(&alarm, exp))
+ goto out;
+
+ if (freezing(current))
+ alarmtimer_freezerset(exp, type);
+
+ /* abs timers don't set remaining time or restart */
+ if (flags == TIMER_ABSTIME) {
+ ret = -ERESTARTNOHAND;
+ goto out;
+ }
+
+ if (rmtp) {
+ ret = update_rmtp(exp, type, rmtp);
+ if (ret <= 0)
+ goto out;
+ }
+
+ restart = &current_thread_info()->restart_block;
+ restart->fn = alarm_timer_nsleep_restart;
+ restart->nanosleep.clockid = type;
+ restart->nanosleep.expires = exp.tv64;
+ restart->nanosleep.rmtp = rmtp;
+ ret = -ERESTART_RESTARTBLOCK;
+
+out:
+ return ret;
+}
+
+
+/* Suspend hook structures */
+static const struct dev_pm_ops alarmtimer_pm_ops = {
+ .suspend = alarmtimer_suspend,
+};
+
+static struct platform_driver alarmtimer_driver = {
+ .driver = {
+ .name = "alarmtimer",
+ .pm = &alarmtimer_pm_ops,
+ }
+};
+
+/**
+ * alarmtimer_init - Initialize alarm timer code
+ *
+ * This function initializes the alarm bases and registers
+ * the posix clock ids.
+ */
+static int __init alarmtimer_init(void)
+{
+ int error = 0;
+ int i;
+ struct k_clock alarm_clock = {
+ .clock_getres = alarm_clock_getres,
+ .clock_get = alarm_clock_get,
+ .timer_create = alarm_timer_create,
+ .timer_set = alarm_timer_set,
+ .timer_del = alarm_timer_del,
+ .timer_get = alarm_timer_get,
+ .nsleep = alarm_timer_nsleep,
+ };
+
+ posix_timers_register_clock(CLOCK_REALTIME_ALARM, &alarm_clock);
+ posix_timers_register_clock(CLOCK_BOOTTIME_ALARM, &alarm_clock);
+
+ /* Initialize alarm bases */
+ alarm_bases[ALARM_REALTIME].base_clockid = CLOCK_REALTIME;
+ alarm_bases[ALARM_REALTIME].gettime = &ktime_get_real;
+ alarm_bases[ALARM_BOOTTIME].base_clockid = CLOCK_BOOTTIME;
+ alarm_bases[ALARM_BOOTTIME].gettime = &ktime_get_boottime;
+ for (i = 0; i < ALARM_NUMTYPE; i++) {
+ timerqueue_init_head(&alarm_bases[i].timerqueue);
+ spin_lock_init(&alarm_bases[i].lock);
+ hrtimer_init(&alarm_bases[i].timer,
+ alarm_bases[i].base_clockid,
+ HRTIMER_MODE_ABS);
+ alarm_bases[i].timer.function = alarmtimer_fired;
+ }
+ error = platform_driver_register(&alarmtimer_driver);
+ platform_device_register_simple("alarmtimer", -1, NULL, 0);
+
+ return error;
+}
+device_initcall(alarmtimer_init);
+
+#ifdef CONFIG_RTC_CLASS
+/**
+ * has_wakealarm - check rtc device has wakealarm ability
+ * @dev: current device
+ * @name_ptr: name to be returned
+ *
+ * This helper function checks to see if the rtc device can wake
+ * from suspend.
+ */
+static int __init has_wakealarm(struct device *dev, void *name_ptr)
+{
+ struct rtc_device *candidate = to_rtc_device(dev);
+
+ if (!candidate->ops->set_alarm)
+ return 0;
+ if (!device_may_wakeup(candidate->dev.parent))
+ return 0;
+
+ *(const char **)name_ptr = dev_name(dev);
+ return 1;
+}
+
+/**
+ * alarmtimer_init_late - Late initializing of alarmtimer code
+ *
+ * This function locates a rtc device to use for wakealarms.
+ * Run as late_initcall to make sure rtc devices have been
+ * registered.
+ */
+static int __init alarmtimer_init_late(void)
+{
+ struct device *dev;
+ char *str;
+
+ /* Find an rtc device and init the rtc_timer */
+ dev = class_find_device(rtc_class, NULL, &str, has_wakealarm);
+ /* If we have a device then str is valid. See has_wakealarm() */
+ if (dev) {
+ rtcdev = rtc_class_open(str);
+ /*
+ * Drop the reference we got in class_find_device,
+ * rtc_open takes its own.
+ */
+ put_device(dev);
+ }
+ if (!rtcdev) {
+ printk(KERN_WARNING "No RTC device found, ALARM timers will"
+ " not wake from suspend");
+ }
+ rtc_timer_init(&rtctimer, NULL, NULL);
+
+ return 0;
+}
+#else
+static int __init alarmtimer_init_late(void)
+{
+ printk(KERN_WARNING "Kernel not built with RTC support, ALARM timers"
+ " will not wake from suspend");
+ return 0;
+}
+#endif
+late_initcall(alarmtimer_init_late);
diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c
index d7395fdfb9f3..c027d4f602f1 100644
--- a/kernel/time/clockevents.c
+++ b/kernel/time/clockevents.c
@@ -18,7 +18,6 @@
#include <linux/notifier.h>
#include <linux/smp.h>
#include <linux/sysdev.h>
-#include <linux/tick.h>
#include "tick-internal.h"
@@ -195,6 +194,70 @@ void clockevents_register_device(struct clock_event_device *dev)
}
EXPORT_SYMBOL_GPL(clockevents_register_device);
+static void clockevents_config(struct clock_event_device *dev,
+ u32 freq)
+{
+ u64 sec;
+
+ if (!(dev->features & CLOCK_EVT_FEAT_ONESHOT))
+ return;
+
+ /*
+ * Calculate the maximum number of seconds we can sleep. Limit
+ * to 10 minutes for hardware which can program more than
+ * 32bit ticks so we still get reasonable conversion values.
+ */
+ sec = dev->max_delta_ticks;
+ do_div(sec, freq);
+ if (!sec)
+ sec = 1;
+ else if (sec > 600 && dev->max_delta_ticks > UINT_MAX)
+ sec = 600;
+
+ clockevents_calc_mult_shift(dev, freq, sec);
+ dev->min_delta_ns = clockevent_delta2ns(dev->min_delta_ticks, dev);
+ dev->max_delta_ns = clockevent_delta2ns(dev->max_delta_ticks, dev);
+}
+
+/**
+ * clockevents_config_and_register - Configure and register a clock event device
+ * @dev: device to register
+ * @freq: The clock frequency
+ * @min_delta: The minimum clock ticks to program in oneshot mode
+ * @max_delta: The maximum clock ticks to program in oneshot mode
+ *
+ * min/max_delta can be 0 for devices which do not support oneshot mode.
+ */
+void clockevents_config_and_register(struct clock_event_device *dev,
+ u32 freq, unsigned long min_delta,
+ unsigned long max_delta)
+{
+ dev->min_delta_ticks = min_delta;
+ dev->max_delta_ticks = max_delta;
+ clockevents_config(dev, freq);
+ clockevents_register_device(dev);
+}
+
+/**
+ * clockevents_update_freq - Update frequency and reprogram a clock event device.
+ * @dev: device to modify
+ * @freq: new device frequency
+ *
+ * Reconfigure and reprogram a clock event device in oneshot
+ * mode. Must be called on the cpu for which the device delivers per
+ * cpu timer events with interrupts disabled! Returns 0 on success,
+ * -ETIME when the event is in the past.
+ */
+int clockevents_update_freq(struct clock_event_device *dev, u32 freq)
+{
+ clockevents_config(dev, freq);
+
+ if (dev->mode != CLOCK_EVT_MODE_ONESHOT)
+ return 0;
+
+ return clockevents_program_event(dev, dev->next_event, ktime_get());
+}
+
/*
* Noop handler when we shut down an event device
*/
diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c
index 6519cf62d9cd..1c95fd677328 100644
--- a/kernel/time/clocksource.c
+++ b/kernel/time/clocksource.c
@@ -626,19 +626,6 @@ static void clocksource_enqueue(struct clocksource *cs)
list_add(&cs->list, entry);
}
-
-/*
- * Maximum time we expect to go between ticks. This includes idle
- * tickless time. It provides the trade off between selecting a
- * mult/shift pair that is very precise but can only handle a short
- * period of time, vs. a mult/shift pair that can handle long periods
- * of time but isn't as precise.
- *
- * This is a subsystem constant, and actual hardware limitations
- * may override it (ie: clocksources that wrap every 3 seconds).
- */
-#define MAX_UPDATE_LENGTH 5 /* Seconds */
-
/**
* __clocksource_updatefreq_scale - Used update clocksource with new freq
* @t: clocksource to be registered
@@ -652,15 +639,28 @@ static void clocksource_enqueue(struct clocksource *cs)
*/
void __clocksource_updatefreq_scale(struct clocksource *cs, u32 scale, u32 freq)
{
+ u64 sec;
+
/*
- * Ideally we want to use some of the limits used in
- * clocksource_max_deferment, to provide a more informed
- * MAX_UPDATE_LENGTH. But for now this just gets the
- * register interface working properly.
+ * Calc the maximum number of seconds which we can run before
+ * wrapping around. For clocksources which have a mask > 32bit
+ * we need to limit the max sleep time to have a good
+ * conversion precision. 10 minutes is still a reasonable
+ * amount. That results in a shift value of 24 for a
+ * clocksource with mask >= 40bit and f >= 4GHz. That maps to
+ * ~ 0.06ppm granularity for NTP. We apply the same 12.5%
+ * margin as we do in clocksource_max_deferment()
*/
+ sec = (cs->mask - (cs->mask >> 5));
+ do_div(sec, freq);
+ do_div(sec, scale);
+ if (!sec)
+ sec = 1;
+ else if (sec > 600 && cs->mask > UINT_MAX)
+ sec = 600;
+
clocks_calc_mult_shift(&cs->mult, &cs->shift, freq,
- NSEC_PER_SEC/scale,
- MAX_UPDATE_LENGTH*scale);
+ NSEC_PER_SEC / scale, sec * scale);
cs->max_idle_ns = clocksource_max_deferment(cs);
}
EXPORT_SYMBOL_GPL(__clocksource_updatefreq_scale);
@@ -685,8 +685,8 @@ int __clocksource_register_scale(struct clocksource *cs, u32 scale, u32 freq)
/* Add clocksource to the clcoksource list */
mutex_lock(&clocksource_mutex);
clocksource_enqueue(cs);
- clocksource_select();
clocksource_enqueue_watchdog(cs);
+ clocksource_select();
mutex_unlock(&clocksource_mutex);
return 0;
}
@@ -706,8 +706,8 @@ int clocksource_register(struct clocksource *cs)
mutex_lock(&clocksource_mutex);
clocksource_enqueue(cs);
- clocksource_select();
clocksource_enqueue_watchdog(cs);
+ clocksource_select();
mutex_unlock(&clocksource_mutex);
return 0;
}
diff --git a/kernel/time/jiffies.c b/kernel/time/jiffies.c
index 5404a8456909..a470154e0408 100644
--- a/kernel/time/jiffies.c
+++ b/kernel/time/jiffies.c
@@ -22,8 +22,11 @@
************************************************************************/
#include <linux/clocksource.h>
#include <linux/jiffies.h>
+#include <linux/module.h>
#include <linux/init.h>
+#include "tick-internal.h"
+
/* The Jiffies based clocksource is the lowest common
* denominator clock source which should function on
* all systems. It has the same coarse resolution as
@@ -31,7 +34,7 @@
* inaccuracies caused by missed or lost timer
* interrupts and the inability for the timer
* interrupt hardware to accuratly tick at the
- * requested HZ value. It is also not reccomended
+ * requested HZ value. It is also not recommended
* for "tick-less" systems.
*/
#define NSEC_PER_JIFFY ((u32)((((u64)NSEC_PER_SEC)<<8)/ACTHZ))
@@ -64,6 +67,23 @@ struct clocksource clocksource_jiffies = {
.shift = JIFFIES_SHIFT,
};
+#if (BITS_PER_LONG < 64)
+u64 get_jiffies_64(void)
+{
+ unsigned long seq;
+ u64 ret;
+
+ do {
+ seq = read_seqbegin(&xtime_lock);
+ ret = jiffies_64;
+ } while (read_seqretry(&xtime_lock, seq));
+ return ret;
+}
+EXPORT_SYMBOL(get_jiffies_64);
+#endif
+
+EXPORT_SYMBOL(jiffies);
+
static int __init init_jiffies_clocksource(void)
{
return clocksource_register(&clocksource_jiffies);
diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c
index 5c00242fa921..f6117a4c7cb8 100644
--- a/kernel/time/ntp.c
+++ b/kernel/time/ntp.c
@@ -16,6 +16,8 @@
#include <linux/mm.h>
#include <linux/module.h>
+#include "tick-internal.h"
+
/*
* NTP timekeeping variables:
*/
@@ -646,6 +648,19 @@ int do_adjtimex(struct timex *txc)
hrtimer_cancel(&leap_timer);
}
+ if (txc->modes & ADJ_SETOFFSET) {
+ struct timespec delta;
+ delta.tv_sec = txc->time.tv_sec;
+ delta.tv_nsec = txc->time.tv_usec;
+ if (!capable(CAP_SYS_TIME))
+ return -EPERM;
+ if (!(txc->modes & ADJ_NANO))
+ delta.tv_nsec *= 1000;
+ result = timekeeping_inject_offset(&delta);
+ if (result)
+ return result;
+ }
+
getnstimeofday(&ts);
write_seqlock_irq(&xtime_lock);
diff --git a/kernel/time/posix-clock.c b/kernel/time/posix-clock.c
new file mode 100644
index 000000000000..c340ca658f37
--- /dev/null
+++ b/kernel/time/posix-clock.c
@@ -0,0 +1,445 @@
+/*
+ * posix-clock.c - support for dynamic clock devices
+ *
+ * Copyright (C) 2010 OMICRON electronics GmbH
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+#include <linux/device.h>
+#include <linux/file.h>
+#include <linux/posix-clock.h>
+#include <linux/slab.h>
+#include <linux/syscalls.h>
+#include <linux/uaccess.h>
+
+static void delete_clock(struct kref *kref);
+
+/*
+ * Returns NULL if the posix_clock instance attached to 'fp' is old and stale.
+ */
+static struct posix_clock *get_posix_clock(struct file *fp)
+{
+ struct posix_clock *clk = fp->private_data;
+
+ down_read(&clk->rwsem);
+
+ if (!clk->zombie)
+ return clk;
+
+ up_read(&clk->rwsem);
+
+ return NULL;
+}
+
+static void put_posix_clock(struct posix_clock *clk)
+{
+ up_read(&clk->rwsem);
+}
+
+static ssize_t posix_clock_read(struct file *fp, char __user *buf,
+ size_t count, loff_t *ppos)
+{
+ struct posix_clock *clk = get_posix_clock(fp);
+ int err = -EINVAL;
+
+ if (!clk)
+ return -ENODEV;
+
+ if (clk->ops.read)
+ err = clk->ops.read(clk, fp->f_flags, buf, count);
+
+ put_posix_clock(clk);
+
+ return err;
+}
+
+static unsigned int posix_clock_poll(struct file *fp, poll_table *wait)
+{
+ struct posix_clock *clk = get_posix_clock(fp);
+ int result = 0;
+
+ if (!clk)
+ return -ENODEV;
+
+ if (clk->ops.poll)
+ result = clk->ops.poll(clk, fp, wait);
+
+ put_posix_clock(clk);
+
+ return result;
+}
+
+static int posix_clock_fasync(int fd, struct file *fp, int on)
+{
+ struct posix_clock *clk = get_posix_clock(fp);
+ int err = 0;
+
+ if (!clk)
+ return -ENODEV;
+
+ if (clk->ops.fasync)
+ err = clk->ops.fasync(clk, fd, fp, on);
+
+ put_posix_clock(clk);
+
+ return err;
+}
+
+static int posix_clock_mmap(struct file *fp, struct vm_area_struct *vma)
+{
+ struct posix_clock *clk = get_posix_clock(fp);
+ int err = -ENODEV;
+
+ if (!clk)
+ return -ENODEV;
+
+ if (clk->ops.mmap)
+ err = clk->ops.mmap(clk, vma);
+
+ put_posix_clock(clk);
+
+ return err;
+}
+
+static long posix_clock_ioctl(struct file *fp,
+ unsigned int cmd, unsigned long arg)
+{
+ struct posix_clock *clk = get_posix_clock(fp);
+ int err = -ENOTTY;
+
+ if (!clk)
+ return -ENODEV;
+
+ if (clk->ops.ioctl)
+ err = clk->ops.ioctl(clk, cmd, arg);
+
+ put_posix_clock(clk);
+
+ return err;
+}
+
+#ifdef CONFIG_COMPAT
+static long posix_clock_compat_ioctl(struct file *fp,
+ unsigned int cmd, unsigned long arg)
+{
+ struct posix_clock *clk = get_posix_clock(fp);
+ int err = -ENOTTY;
+
+ if (!clk)
+ return -ENODEV;
+
+ if (clk->ops.ioctl)
+ err = clk->ops.ioctl(clk, cmd, arg);
+
+ put_posix_clock(clk);
+
+ return err;
+}
+#endif
+
+static int posix_clock_open(struct inode *inode, struct file *fp)
+{
+ int err;
+ struct posix_clock *clk =
+ container_of(inode->i_cdev, struct posix_clock, cdev);
+
+ down_read(&clk->rwsem);
+
+ if (clk->zombie) {
+ err = -ENODEV;
+ goto out;
+ }
+ if (clk->ops.open)
+ err = clk->ops.open(clk, fp->f_mode);
+ else
+ err = 0;
+
+ if (!err) {
+ kref_get(&clk->kref);
+ fp->private_data = clk;
+ }
+out:
+ up_read(&clk->rwsem);
+ return err;
+}
+
+static int posix_clock_release(struct inode *inode, struct file *fp)
+{
+ struct posix_clock *clk = fp->private_data;
+ int err = 0;
+
+ if (clk->ops.release)
+ err = clk->ops.release(clk);
+
+ kref_put(&clk->kref, delete_clock);
+
+ fp->private_data = NULL;
+
+ return err;
+}
+
+static const struct file_operations posix_clock_file_operations = {
+ .owner = THIS_MODULE,
+ .llseek = no_llseek,
+ .read = posix_clock_read,
+ .poll = posix_clock_poll,
+ .unlocked_ioctl = posix_clock_ioctl,
+ .open = posix_clock_open,
+ .release = posix_clock_release,
+ .fasync = posix_clock_fasync,
+ .mmap = posix_clock_mmap,
+#ifdef CONFIG_COMPAT
+ .compat_ioctl = posix_clock_compat_ioctl,
+#endif
+};
+
+int posix_clock_register(struct posix_clock *clk, dev_t devid)
+{
+ int err;
+
+ kref_init(&clk->kref);
+ init_rwsem(&clk->rwsem);
+
+ cdev_init(&clk->cdev, &posix_clock_file_operations);
+ clk->cdev.owner = clk->ops.owner;
+ err = cdev_add(&clk->cdev, devid, 1);
+
+ return err;
+}
+EXPORT_SYMBOL_GPL(posix_clock_register);
+
+static void delete_clock(struct kref *kref)
+{
+ struct posix_clock *clk = container_of(kref, struct posix_clock, kref);
+
+ if (clk->release)
+ clk->release(clk);
+}
+
+void posix_clock_unregister(struct posix_clock *clk)
+{
+ cdev_del(&clk->cdev);
+
+ down_write(&clk->rwsem);
+ clk->zombie = true;
+ up_write(&clk->rwsem);
+
+ kref_put(&clk->kref, delete_clock);
+}
+EXPORT_SYMBOL_GPL(posix_clock_unregister);
+
+struct posix_clock_desc {
+ struct file *fp;
+ struct posix_clock *clk;
+};
+
+static int get_clock_desc(const clockid_t id, struct posix_clock_desc *cd)
+{
+ struct file *fp = fget(CLOCKID_TO_FD(id));
+ int err = -EINVAL;
+
+ if (!fp)
+ return err;
+
+ if (fp->f_op->open != posix_clock_open || !fp->private_data)
+ goto out;
+
+ cd->fp = fp;
+ cd->clk = get_posix_clock(fp);
+
+ err = cd->clk ? 0 : -ENODEV;
+out:
+ if (err)
+ fput(fp);
+ return err;
+}
+
+static void put_clock_desc(struct posix_clock_desc *cd)
+{
+ put_posix_clock(cd->clk);
+ fput(cd->fp);
+}
+
+static int pc_clock_adjtime(clockid_t id, struct timex *tx)
+{
+ struct posix_clock_desc cd;
+ int err;
+
+ err = get_clock_desc(id, &cd);
+ if (err)
+ return err;
+
+ if ((cd.fp->f_mode & FMODE_WRITE) == 0) {
+ err = -EACCES;
+ goto out;
+ }
+
+ if (cd.clk->ops.clock_adjtime)
+ err = cd.clk->ops.clock_adjtime(cd.clk, tx);
+ else
+ err = -EOPNOTSUPP;
+out:
+ put_clock_desc(&cd);
+
+ return err;
+}
+
+static int pc_clock_gettime(clockid_t id, struct timespec *ts)
+{
+ struct posix_clock_desc cd;
+ int err;
+
+ err = get_clock_desc(id, &cd);
+ if (err)
+ return err;
+
+ if (cd.clk->ops.clock_gettime)
+ err = cd.clk->ops.clock_gettime(cd.clk, ts);
+ else
+ err = -EOPNOTSUPP;
+
+ put_clock_desc(&cd);
+
+ return err;
+}
+
+static int pc_clock_getres(clockid_t id, struct timespec *ts)
+{
+ struct posix_clock_desc cd;
+ int err;
+
+ err = get_clock_desc(id, &cd);
+ if (err)
+ return err;
+
+ if (cd.clk->ops.clock_getres)
+ err = cd.clk->ops.clock_getres(cd.clk, ts);
+ else
+ err = -EOPNOTSUPP;
+
+ put_clock_desc(&cd);
+
+ return err;
+}
+
+static int pc_clock_settime(clockid_t id, const struct timespec *ts)
+{
+ struct posix_clock_desc cd;
+ int err;
+
+ err = get_clock_desc(id, &cd);
+ if (err)
+ return err;
+
+ if ((cd.fp->f_mode & FMODE_WRITE) == 0) {
+ err = -EACCES;
+ goto out;
+ }
+
+ if (cd.clk->ops.clock_settime)
+ err = cd.clk->ops.clock_settime(cd.clk, ts);
+ else
+ err = -EOPNOTSUPP;
+out:
+ put_clock_desc(&cd);
+
+ return err;
+}
+
+static int pc_timer_create(struct k_itimer *kit)
+{
+ clockid_t id = kit->it_clock;
+ struct posix_clock_desc cd;
+ int err;
+
+ err = get_clock_desc(id, &cd);
+ if (err)
+ return err;
+
+ if (cd.clk->ops.timer_create)
+ err = cd.clk->ops.timer_create(cd.clk, kit);
+ else
+ err = -EOPNOTSUPP;
+
+ put_clock_desc(&cd);
+
+ return err;
+}
+
+static int pc_timer_delete(struct k_itimer *kit)
+{
+ clockid_t id = kit->it_clock;
+ struct posix_clock_desc cd;
+ int err;
+
+ err = get_clock_desc(id, &cd);
+ if (err)
+ return err;
+
+ if (cd.clk->ops.timer_delete)
+ err = cd.clk->ops.timer_delete(cd.clk, kit);
+ else
+ err = -EOPNOTSUPP;
+
+ put_clock_desc(&cd);
+
+ return err;
+}
+
+static void pc_timer_gettime(struct k_itimer *kit, struct itimerspec *ts)
+{
+ clockid_t id = kit->it_clock;
+ struct posix_clock_desc cd;
+
+ if (get_clock_desc(id, &cd))
+ return;
+
+ if (cd.clk->ops.timer_gettime)
+ cd.clk->ops.timer_gettime(cd.clk, kit, ts);
+
+ put_clock_desc(&cd);
+}
+
+static int pc_timer_settime(struct k_itimer *kit, int flags,
+ struct itimerspec *ts, struct itimerspec *old)
+{
+ clockid_t id = kit->it_clock;
+ struct posix_clock_desc cd;
+ int err;
+
+ err = get_clock_desc(id, &cd);
+ if (err)
+ return err;
+
+ if (cd.clk->ops.timer_settime)
+ err = cd.clk->ops.timer_settime(cd.clk, kit, flags, ts, old);
+ else
+ err = -EOPNOTSUPP;
+
+ put_clock_desc(&cd);
+
+ return err;
+}
+
+struct k_clock clock_posix_dynamic = {
+ .clock_getres = pc_clock_getres,
+ .clock_set = pc_clock_settime,
+ .clock_get = pc_clock_gettime,
+ .clock_adj = pc_clock_adjtime,
+ .timer_create = pc_timer_create,
+ .timer_set = pc_timer_settime,
+ .timer_del = pc_timer_delete,
+ .timer_get = pc_timer_gettime,
+};
diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c
index a3b5aff62606..c7218d132738 100644
--- a/kernel/time/tick-broadcast.c
+++ b/kernel/time/tick-broadcast.c
@@ -18,7 +18,6 @@
#include <linux/percpu.h>
#include <linux/profile.h>
#include <linux/sched.h>
-#include <linux/tick.h>
#include "tick-internal.h"
@@ -457,23 +456,27 @@ void tick_broadcast_oneshot_control(unsigned long reason)
unsigned long flags;
int cpu;
- raw_spin_lock_irqsave(&tick_broadcast_lock, flags);
-
/*
* Periodic mode does not care about the enter/exit of power
* states
*/
if (tick_broadcast_device.mode == TICKDEV_MODE_PERIODIC)
- goto out;
+ return;
- bc = tick_broadcast_device.evtdev;
+ /*
+ * We are called with preemtion disabled from the depth of the
+ * idle code, so we can't be moved away.
+ */
cpu = smp_processor_id();
td = &per_cpu(tick_cpu_device, cpu);
dev = td->evtdev;
if (!(dev->features & CLOCK_EVT_FEAT_C3STOP))
- goto out;
+ return;
+ bc = tick_broadcast_device.evtdev;
+
+ raw_spin_lock_irqsave(&tick_broadcast_lock, flags);
if (reason == CLOCK_EVT_NOTIFY_BROADCAST_ENTER) {
if (!cpumask_test_cpu(cpu, tick_get_broadcast_oneshot_mask())) {
cpumask_set_cpu(cpu, tick_get_broadcast_oneshot_mask());
@@ -490,8 +493,6 @@ void tick_broadcast_oneshot_control(unsigned long reason)
tick_program_event(dev->next_event, 1);
}
}
-
-out:
raw_spin_unlock_irqrestore(&tick_broadcast_lock, flags);
}
@@ -523,10 +524,11 @@ static void tick_broadcast_init_next_event(struct cpumask *mask,
*/
void tick_broadcast_setup_oneshot(struct clock_event_device *bc)
{
+ int cpu = smp_processor_id();
+
/* Set it up only once ! */
if (bc->event_handler != tick_handle_oneshot_broadcast) {
int was_periodic = bc->mode == CLOCK_EVT_MODE_PERIODIC;
- int cpu = smp_processor_id();
bc->event_handler = tick_handle_oneshot_broadcast;
clockevents_set_mode(bc, CLOCK_EVT_MODE_ONESHOT);
@@ -552,6 +554,15 @@ void tick_broadcast_setup_oneshot(struct clock_event_device *bc)
tick_broadcast_set_event(tick_next_period, 1);
} else
bc->next_event.tv64 = KTIME_MAX;
+ } else {
+ /*
+ * The first cpu which switches to oneshot mode sets
+ * the bit for all other cpus which are in the general
+ * (periodic) broadcast mask. So the bit is set and
+ * would prevent the first broadcast enter after this
+ * to program the bc device.
+ */
+ tick_broadcast_clear_oneshot(cpu);
}
}
diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c
index ed228ef6f6b8..119528de8235 100644
--- a/kernel/time/tick-common.c
+++ b/kernel/time/tick-common.c
@@ -18,7 +18,6 @@
#include <linux/percpu.h>
#include <linux/profile.h>
#include <linux/sched.h>
-#include <linux/tick.h>
#include <asm/irq_regs.h>
diff --git a/kernel/time/tick-internal.h b/kernel/time/tick-internal.h
index f65d3a723a64..1009b06d6f89 100644
--- a/kernel/time/tick-internal.h
+++ b/kernel/time/tick-internal.h
@@ -1,6 +1,10 @@
/*
* tick internal variable and functions used by low/high res code
*/
+#include <linux/hrtimer.h>
+#include <linux/tick.h>
+
+#ifdef CONFIG_GENERIC_CLOCKEVENTS_BUILD
#define TICK_DO_TIMER_NONE -1
#define TICK_DO_TIMER_BOOT -2
@@ -135,3 +139,8 @@ static inline int tick_device_is_functional(struct clock_event_device *dev)
{
return !(dev->features & CLOCK_EVT_FEAT_DUMMY);
}
+
+#endif
+
+extern void do_timer(unsigned long ticks);
+extern seqlock_t xtime_lock;
diff --git a/kernel/time/tick-oneshot.c b/kernel/time/tick-oneshot.c
index 5cbc101f908b..2d04411a5f05 100644
--- a/kernel/time/tick-oneshot.c
+++ b/kernel/time/tick-oneshot.c
@@ -18,7 +18,6 @@
#include <linux/percpu.h>
#include <linux/profile.h>
#include <linux/sched.h>
-#include <linux/tick.h>
#include "tick-internal.h"
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index c55ea2433471..d5097c44b407 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -19,7 +19,6 @@
#include <linux/percpu.h>
#include <linux/profile.h>
#include <linux/sched.h>
-#include <linux/tick.h>
#include <linux/module.h>
#include <asm/irq_regs.h>
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index d27c7562902c..342408cf68dd 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -14,7 +14,7 @@
#include <linux/init.h>
#include <linux/mm.h>
#include <linux/sched.h>
-#include <linux/sysdev.h>
+#include <linux/syscore_ops.h>
#include <linux/clocksource.h>
#include <linux/jiffies.h>
#include <linux/time.h>
@@ -353,7 +353,7 @@ EXPORT_SYMBOL(do_gettimeofday);
*
* Sets the time of day to the new time and update NTP and notify hrtimers
*/
-int do_settimeofday(struct timespec *tv)
+int do_settimeofday(const struct timespec *tv)
{
struct timespec ts_delta;
unsigned long flags;
@@ -387,6 +387,42 @@ int do_settimeofday(struct timespec *tv)
EXPORT_SYMBOL(do_settimeofday);
+
+/**
+ * timekeeping_inject_offset - Adds or subtracts from the current time.
+ * @tv: pointer to the timespec variable containing the offset
+ *
+ * Adds or subtracts an offset value from the current time.
+ */
+int timekeeping_inject_offset(struct timespec *ts)
+{
+ unsigned long flags;
+
+ if ((unsigned long)ts->tv_nsec >= NSEC_PER_SEC)
+ return -EINVAL;
+
+ write_seqlock_irqsave(&xtime_lock, flags);
+
+ timekeeping_forward_now();
+
+ xtime = timespec_add(xtime, *ts);
+ wall_to_monotonic = timespec_sub(wall_to_monotonic, *ts);
+
+ timekeeper.ntp_error = 0;
+ ntp_clear();
+
+ update_vsyscall(&xtime, &wall_to_monotonic, timekeeper.clock,
+ timekeeper.mult);
+
+ write_sequnlock_irqrestore(&xtime_lock, flags);
+
+ /* signal hrtimers about time change */
+ clock_was_set();
+
+ return 0;
+}
+EXPORT_SYMBOL(timekeeping_inject_offset);
+
/**
* change_clocksource - Swaps clocksources if a new one is available
*
@@ -560,14 +596,65 @@ void __init timekeeping_init(void)
static struct timespec timekeeping_suspend_time;
/**
+ * __timekeeping_inject_sleeptime - Internal function to add sleep interval
+ * @delta: pointer to a timespec delta value
+ *
+ * Takes a timespec offset measuring a suspend interval and properly
+ * adds the sleep offset to the timekeeping variables.
+ */
+static void __timekeeping_inject_sleeptime(struct timespec *delta)
+{
+ xtime = timespec_add(xtime, *delta);
+ wall_to_monotonic = timespec_sub(wall_to_monotonic, *delta);
+ total_sleep_time = timespec_add(total_sleep_time, *delta);
+}
+
+
+/**
+ * timekeeping_inject_sleeptime - Adds suspend interval to timeekeeping values
+ * @delta: pointer to a timespec delta value
+ *
+ * This hook is for architectures that cannot support read_persistent_clock
+ * because their RTC/persistent clock is only accessible when irqs are enabled.
+ *
+ * This function should only be called by rtc_resume(), and allows
+ * a suspend offset to be injected into the timekeeping values.
+ */
+void timekeeping_inject_sleeptime(struct timespec *delta)
+{
+ unsigned long flags;
+ struct timespec ts;
+
+ /* Make sure we don't set the clock twice */
+ read_persistent_clock(&ts);
+ if (!(ts.tv_sec == 0 && ts.tv_nsec == 0))
+ return;
+
+ write_seqlock_irqsave(&xtime_lock, flags);
+ timekeeping_forward_now();
+
+ __timekeeping_inject_sleeptime(delta);
+
+ timekeeper.ntp_error = 0;
+ ntp_clear();
+ update_vsyscall(&xtime, &wall_to_monotonic, timekeeper.clock,
+ timekeeper.mult);
+
+ write_sequnlock_irqrestore(&xtime_lock, flags);
+
+ /* signal hrtimers about time change */
+ clock_was_set();
+}
+
+
+/**
* timekeeping_resume - Resumes the generic timekeeping subsystem.
- * @dev: unused
*
* This is for the generic clocksource timekeeping.
* xtime/wall_to_monotonic/jiffies/etc are
* still managed by arch specific suspend/resume code.
*/
-static int timekeeping_resume(struct sys_device *dev)
+static void timekeeping_resume(void)
{
unsigned long flags;
struct timespec ts;
@@ -580,9 +667,7 @@ static int timekeeping_resume(struct sys_device *dev)
if (timespec_compare(&ts, &timekeeping_suspend_time) > 0) {
ts = timespec_sub(ts, timekeeping_suspend_time);
- xtime = timespec_add(xtime, ts);
- wall_to_monotonic = timespec_sub(wall_to_monotonic, ts);
- total_sleep_time = timespec_add(total_sleep_time, ts);
+ __timekeeping_inject_sleeptime(&ts);
}
/* re-base the last cycle value */
timekeeper.clock->cycle_last = timekeeper.clock->read(timekeeper.clock);
@@ -595,12 +680,10 @@ static int timekeeping_resume(struct sys_device *dev)
clockevents_notify(CLOCK_EVT_NOTIFY_RESUME, NULL);
/* Resume hrtimers */
- hres_timers_resume();
-
- return 0;
+ hrtimers_resume();
}
-static int timekeeping_suspend(struct sys_device *dev, pm_message_t state)
+static int timekeeping_suspend(void)
{
unsigned long flags;
@@ -618,26 +701,18 @@ static int timekeeping_suspend(struct sys_device *dev, pm_message_t state)
}
/* sysfs resume/suspend bits for timekeeping */
-static struct sysdev_class timekeeping_sysclass = {
- .name = "timekeeping",
+static struct syscore_ops timekeeping_syscore_ops = {
.resume = timekeeping_resume,
.suspend = timekeeping_suspend,
};
-static struct sys_device device_timer = {
- .id = 0,
- .cls = &timekeeping_sysclass,
-};
-
-static int __init timekeeping_init_device(void)
+static int __init timekeeping_init_ops(void)
{
- int error = sysdev_class_register(&timekeeping_sysclass);
- if (!error)
- error = sysdev_register(&device_timer);
- return error;
+ register_syscore_ops(&timekeeping_syscore_ops);
+ return 0;
}
-device_initcall(timekeeping_init_device);
+device_initcall(timekeeping_init_ops);
/*
* If the error is already larger, we look ahead even further
@@ -779,7 +854,7 @@ static cycle_t logarithmic_accumulation(cycle_t offset, int shift)
*
* Called from the timer interrupt, must hold a write on xtime_lock.
*/
-void update_wall_time(void)
+static void update_wall_time(void)
{
struct clocksource *clock;
cycle_t offset;
@@ -871,7 +946,7 @@ void update_wall_time(void)
* getboottime - Return the real time of system boot.
* @ts: pointer to the timespec to be set
*
- * Returns the time of day in a timespec.
+ * Returns the wall-time of boot in a timespec.
*
* This is based on the wall_to_monotonic offset and the total suspend
* time. Calls to settimeofday will affect the value returned (which
@@ -889,6 +964,55 @@ void getboottime(struct timespec *ts)
}
EXPORT_SYMBOL_GPL(getboottime);
+
+/**
+ * get_monotonic_boottime - Returns monotonic time since boot
+ * @ts: pointer to the timespec to be set
+ *
+ * Returns the monotonic time since boot in a timespec.
+ *
+ * This is similar to CLOCK_MONTONIC/ktime_get_ts, but also
+ * includes the time spent in suspend.
+ */
+void get_monotonic_boottime(struct timespec *ts)
+{
+ struct timespec tomono, sleep;
+ unsigned int seq;
+ s64 nsecs;
+
+ WARN_ON(timekeeping_suspended);
+
+ do {
+ seq = read_seqbegin(&xtime_lock);
+ *ts = xtime;
+ tomono = wall_to_monotonic;
+ sleep = total_sleep_time;
+ nsecs = timekeeping_get_ns();
+
+ } while (read_seqretry(&xtime_lock, seq));
+
+ set_normalized_timespec(ts, ts->tv_sec + tomono.tv_sec + sleep.tv_sec,
+ ts->tv_nsec + tomono.tv_nsec + sleep.tv_nsec + nsecs);
+}
+EXPORT_SYMBOL_GPL(get_monotonic_boottime);
+
+/**
+ * ktime_get_boottime - Returns monotonic time since boot in a ktime
+ *
+ * Returns the monotonic time since boot in a ktime
+ *
+ * This is similar to CLOCK_MONTONIC/ktime_get, but also
+ * includes the time spent in suspend.
+ */
+ktime_t ktime_get_boottime(void)
+{
+ struct timespec ts;
+
+ get_monotonic_boottime(&ts);
+ return timespec_to_ktime(ts);
+}
+EXPORT_SYMBOL_GPL(ktime_get_boottime);
+
/**
* monotonic_to_bootbased - Convert the monotonic time to boot based.
* @ts: pointer to the timespec to be converted
@@ -910,11 +1034,6 @@ struct timespec __current_kernel_time(void)
return xtime;
}
-struct timespec __get_wall_to_monotonic(void)
-{
- return wall_to_monotonic;
-}
-
struct timespec current_kernel_time(void)
{
struct timespec now;
@@ -946,3 +1065,63 @@ struct timespec get_monotonic_coarse(void)
now.tv_nsec + mono.tv_nsec);
return now;
}
+
+/*
+ * The 64-bit jiffies value is not atomic - you MUST NOT read it
+ * without sampling the sequence number in xtime_lock.
+ * jiffies is defined in the linker script...
+ */
+void do_timer(unsigned long ticks)
+{
+ jiffies_64 += ticks;
+ update_wall_time();
+ calc_global_load(ticks);
+}
+
+/**
+ * get_xtime_and_monotonic_and_sleep_offset() - get xtime, wall_to_monotonic,
+ * and sleep offsets.
+ * @xtim: pointer to timespec to be set with xtime
+ * @wtom: pointer to timespec to be set with wall_to_monotonic
+ * @sleep: pointer to timespec to be set with time in suspend
+ */
+void get_xtime_and_monotonic_and_sleep_offset(struct timespec *xtim,
+ struct timespec *wtom, struct timespec *sleep)
+{
+ unsigned long seq;
+
+ do {
+ seq = read_seqbegin(&xtime_lock);
+ *xtim = xtime;
+ *wtom = wall_to_monotonic;
+ *sleep = total_sleep_time;
+ } while (read_seqretry(&xtime_lock, seq));
+}
+
+/**
+ * ktime_get_monotonic_offset() - get wall_to_monotonic in ktime_t format
+ */
+ktime_t ktime_get_monotonic_offset(void)
+{
+ unsigned long seq;
+ struct timespec wtom;
+
+ do {
+ seq = read_seqbegin(&xtime_lock);
+ wtom = wall_to_monotonic;
+ } while (read_seqretry(&xtime_lock, seq));
+ return timespec_to_ktime(wtom);
+}
+
+/**
+ * xtime_update() - advances the timekeeping infrastructure
+ * @ticks: number of ticks, that have elapsed since the last call.
+ *
+ * Must be called with interrupts disabled.
+ */
+void xtime_update(unsigned long ticks)
+{
+ write_seqlock(&xtime_lock);
+ do_timer(ticks);
+ write_sequnlock(&xtime_lock);
+}
diff --git a/kernel/time/timer_stats.c b/kernel/time/timer_stats.c
index 2f3b585b8d7d..a5d0a3a85dd8 100644
--- a/kernel/time/timer_stats.c
+++ b/kernel/time/timer_stats.c
@@ -236,7 +236,7 @@ void timer_stats_update_stats(void *timer, pid_t pid, void *startf,
unsigned int timer_flag)
{
/*
- * It doesnt matter which lock we take:
+ * It doesn't matter which lock we take:
*/
raw_spinlock_t *lock;
struct entry *entry, input;
diff --git a/kernel/timer.c b/kernel/timer.c
index d6459923d245..fd6198692b57 100644
--- a/kernel/timer.c
+++ b/kernel/timer.c
@@ -404,6 +404,11 @@ static void timer_stats_account_timer(struct timer_list *timer) {}
static struct debug_obj_descr timer_debug_descr;
+static void *timer_debug_hint(void *addr)
+{
+ return ((struct timer_list *) addr)->function;
+}
+
/*
* fixup_init is called when:
* - an active object is initialized
@@ -477,6 +482,7 @@ static int timer_fixup_free(void *addr, enum debug_obj_state state)
static struct debug_obj_descr timer_debug_descr = {
.name = "timer_list",
+ .debug_hint = timer_debug_hint,
.fixup_init = timer_fixup_init,
.fixup_activate = timer_fixup_activate,
.fixup_free = timer_fixup_free,
@@ -964,6 +970,25 @@ EXPORT_SYMBOL(try_to_del_timer_sync);
* add_timer_on(). Upon exit the timer is not queued and the handler is
* not running on any CPU.
*
+ * Note: You must not hold locks that are held in interrupt context
+ * while calling this function. Even if the lock has nothing to do
+ * with the timer in question. Here's why:
+ *
+ * CPU0 CPU1
+ * ---- ----
+ * <SOFTIRQ>
+ * call_timer_fn();
+ * base->running_timer = mytimer;
+ * spin_lock_irq(somelock);
+ * <IRQ>
+ * spin_lock(somelock);
+ * del_timer_sync(mytimer);
+ * while (base->running_timer == mytimer);
+ *
+ * Now del_timer_sync() will never return and never release somelock.
+ * The interrupt on the other CPU is waiting to grab somelock but
+ * it has interrupted the softirq that CPU0 is waiting to finish.
+ *
* The function returns whether it has deactivated a pending timer or not.
*/
int del_timer_sync(struct timer_list *timer)
@@ -971,6 +996,10 @@ int del_timer_sync(struct timer_list *timer)
#ifdef CONFIG_LOCKDEP
unsigned long flags;
+ /*
+ * If lockdep gives a backtrace here, please reference
+ * the synchronization rules above.
+ */
local_irq_save(flags);
lock_map_acquire(&timer->lockdep_map);
lock_map_release(&timer->lockdep_map);
@@ -1295,19 +1324,6 @@ void run_local_timers(void)
raise_softirq(TIMER_SOFTIRQ);
}
-/*
- * The 64-bit jiffies value is not atomic - you MUST NOT read it
- * without sampling the sequence number in xtime_lock.
- * jiffies is defined in the linker script...
- */
-
-void do_timer(unsigned long ticks)
-{
- jiffies_64 += ticks;
- update_wall_time();
- calc_global_load(ticks);
-}
-
#ifdef __ARCH_WANT_SYS_ALARM
/*
diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index 14674dce77a6..2ad39e556cb4 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -141,7 +141,7 @@ if FTRACE
config FUNCTION_TRACER
bool "Kernel Function Tracer"
depends on HAVE_FUNCTION_TRACER
- select FRAME_POINTER if !ARM_UNWIND && !S390
+ select FRAME_POINTER if !ARM_UNWIND && !S390 && !MICROBLAZE
select KALLSYMS
select GENERIC_TRACER
select CONTEXT_SWITCH_TRACER
@@ -275,7 +275,7 @@ config PROFILE_ANNOTATED_BRANCHES
This tracer profiles all the the likely and unlikely macros
in the kernel. It will display the results in:
- /sys/kernel/debug/tracing/profile_annotated_branch
+ /sys/kernel/debug/tracing/trace_stat/branch_annotated
Note: this will add a significant overhead; only turn this
on if you need to profile the system's use of these macros.
@@ -288,7 +288,7 @@ config PROFILE_ALL_BRANCHES
taken in the kernel is recorded whether it hit or miss.
The results will be displayed in:
- /sys/kernel/debug/tracing/profile_branch
+ /sys/kernel/debug/tracing/trace_stat/branch_all
This option also enables the likely/unlikely profiler.
diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c
index cbafed7d4f38..6957aa298dfa 100644
--- a/kernel/trace/blktrace.c
+++ b/kernel/trace/blktrace.c
@@ -703,28 +703,21 @@ void blk_trace_shutdown(struct request_queue *q)
*
**/
static void blk_add_trace_rq(struct request_queue *q, struct request *rq,
- u32 what)
+ u32 what)
{
struct blk_trace *bt = q->blk_trace;
- int rw = rq->cmd_flags & 0x03;
if (likely(!bt))
return;
- if (rq->cmd_flags & REQ_DISCARD)
- rw |= REQ_DISCARD;
-
- if (rq->cmd_flags & REQ_SECURE)
- rw |= REQ_SECURE;
-
if (rq->cmd_type == REQ_TYPE_BLOCK_PC) {
what |= BLK_TC_ACT(BLK_TC_PC);
- __blk_add_trace(bt, 0, blk_rq_bytes(rq), rw,
+ __blk_add_trace(bt, 0, blk_rq_bytes(rq), rq->cmd_flags,
what, rq->errors, rq->cmd_len, rq->cmd);
} else {
what |= BLK_TC_ACT(BLK_TC_FS);
- __blk_add_trace(bt, blk_rq_pos(rq), blk_rq_bytes(rq), rw,
- what, rq->errors, 0, NULL);
+ __blk_add_trace(bt, blk_rq_pos(rq), blk_rq_bytes(rq),
+ rq->cmd_flags, what, rq->errors, 0, NULL);
}
}
@@ -857,29 +850,21 @@ static void blk_add_trace_plug(void *ignore, struct request_queue *q)
__blk_add_trace(bt, 0, 0, 0, BLK_TA_PLUG, 0, 0, NULL);
}
-static void blk_add_trace_unplug_io(void *ignore, struct request_queue *q)
+static void blk_add_trace_unplug(void *ignore, struct request_queue *q,
+ unsigned int depth, bool explicit)
{
struct blk_trace *bt = q->blk_trace;
if (bt) {
- unsigned int pdu = q->rq.count[READ] + q->rq.count[WRITE];
- __be64 rpdu = cpu_to_be64(pdu);
-
- __blk_add_trace(bt, 0, 0, 0, BLK_TA_UNPLUG_IO, 0,
- sizeof(rpdu), &rpdu);
- }
-}
+ __be64 rpdu = cpu_to_be64(depth);
+ u32 what;
-static void blk_add_trace_unplug_timer(void *ignore, struct request_queue *q)
-{
- struct blk_trace *bt = q->blk_trace;
-
- if (bt) {
- unsigned int pdu = q->rq.count[READ] + q->rq.count[WRITE];
- __be64 rpdu = cpu_to_be64(pdu);
+ if (explicit)
+ what = BLK_TA_UNPLUG_IO;
+ else
+ what = BLK_TA_UNPLUG_TIMER;
- __blk_add_trace(bt, 0, 0, 0, BLK_TA_UNPLUG_TIMER, 0,
- sizeof(rpdu), &rpdu);
+ __blk_add_trace(bt, 0, 0, 0, what, 0, sizeof(rpdu), &rpdu);
}
}
@@ -1022,9 +1007,7 @@ static void blk_register_tracepoints(void)
WARN_ON(ret);
ret = register_trace_block_plug(blk_add_trace_plug, NULL);
WARN_ON(ret);
- ret = register_trace_block_unplug_timer(blk_add_trace_unplug_timer, NULL);
- WARN_ON(ret);
- ret = register_trace_block_unplug_io(blk_add_trace_unplug_io, NULL);
+ ret = register_trace_block_unplug(blk_add_trace_unplug, NULL);
WARN_ON(ret);
ret = register_trace_block_split(blk_add_trace_split, NULL);
WARN_ON(ret);
@@ -1039,8 +1022,7 @@ static void blk_unregister_tracepoints(void)
unregister_trace_block_rq_remap(blk_add_trace_rq_remap, NULL);
unregister_trace_block_bio_remap(blk_add_trace_bio_remap, NULL);
unregister_trace_block_split(blk_add_trace_split, NULL);
- unregister_trace_block_unplug_io(blk_add_trace_unplug_io, NULL);
- unregister_trace_block_unplug_timer(blk_add_trace_unplug_timer, NULL);
+ unregister_trace_block_unplug(blk_add_trace_unplug, NULL);
unregister_trace_block_plug(blk_add_trace_plug, NULL);
unregister_trace_block_sleeprq(blk_add_trace_sleeprq, NULL);
unregister_trace_block_getrq(blk_add_trace_getrq, NULL);
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index f3dadae83883..1ee417fcbfa5 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -39,20 +39,26 @@
#include "trace_stat.h"
#define FTRACE_WARN_ON(cond) \
- do { \
- if (WARN_ON(cond)) \
+ ({ \
+ int ___r = cond; \
+ if (WARN_ON(___r)) \
ftrace_kill(); \
- } while (0)
+ ___r; \
+ })
#define FTRACE_WARN_ON_ONCE(cond) \
- do { \
- if (WARN_ON_ONCE(cond)) \
+ ({ \
+ int ___r = cond; \
+ if (WARN_ON_ONCE(___r)) \
ftrace_kill(); \
- } while (0)
+ ___r; \
+ })
/* hash bits for specific function selection */
#define FTRACE_HASH_BITS 7
#define FTRACE_FUNC_HASHSIZE (1 << FTRACE_HASH_BITS)
+#define FTRACE_HASH_DEFAULT_BITS 10
+#define FTRACE_HASH_MAX_BITS 12
/* ftrace_enabled is a method to turn ftrace on or off */
int ftrace_enabled __read_mostly;
@@ -81,28 +87,40 @@ static struct ftrace_ops ftrace_list_end __read_mostly =
.func = ftrace_stub,
};
-static struct ftrace_ops *ftrace_list __read_mostly = &ftrace_list_end;
+static struct ftrace_ops *ftrace_global_list __read_mostly = &ftrace_list_end;
+static struct ftrace_ops *ftrace_ops_list __read_mostly = &ftrace_list_end;
ftrace_func_t ftrace_trace_function __read_mostly = ftrace_stub;
ftrace_func_t __ftrace_trace_function __read_mostly = ftrace_stub;
ftrace_func_t ftrace_pid_function __read_mostly = ftrace_stub;
+static struct ftrace_ops global_ops;
+
+static void
+ftrace_ops_list_func(unsigned long ip, unsigned long parent_ip);
/*
- * Traverse the ftrace_list, invoking all entries. The reason that we
+ * Traverse the ftrace_global_list, invoking all entries. The reason that we
* can use rcu_dereference_raw() is that elements removed from this list
* are simply leaked, so there is no need to interact with a grace-period
* mechanism. The rcu_dereference_raw() calls are needed to handle
- * concurrent insertions into the ftrace_list.
+ * concurrent insertions into the ftrace_global_list.
*
* Silly Alpha and silly pointer-speculation compiler optimizations!
*/
-static void ftrace_list_func(unsigned long ip, unsigned long parent_ip)
+static void ftrace_global_list_func(unsigned long ip,
+ unsigned long parent_ip)
{
- struct ftrace_ops *op = rcu_dereference_raw(ftrace_list); /*see above*/
+ struct ftrace_ops *op;
+ if (unlikely(trace_recursion_test(TRACE_GLOBAL_BIT)))
+ return;
+
+ trace_recursion_set(TRACE_GLOBAL_BIT);
+ op = rcu_dereference_raw(ftrace_global_list); /*see above*/
while (op != &ftrace_list_end) {
op->func(ip, parent_ip);
op = rcu_dereference_raw(op->next); /*see above*/
};
+ trace_recursion_clear(TRACE_GLOBAL_BIT);
}
static void ftrace_pid_func(unsigned long ip, unsigned long parent_ip)
@@ -147,46 +165,69 @@ static void ftrace_test_stop_func(unsigned long ip, unsigned long parent_ip)
}
#endif
-static int __register_ftrace_function(struct ftrace_ops *ops)
+static void update_global_ops(void)
{
- ops->next = ftrace_list;
+ ftrace_func_t func;
+
/*
- * We are entering ops into the ftrace_list but another
- * CPU might be walking that list. We need to make sure
- * the ops->next pointer is valid before another CPU sees
- * the ops pointer included into the ftrace_list.
+ * If there's only one function registered, then call that
+ * function directly. Otherwise, we need to iterate over the
+ * registered callers.
*/
- rcu_assign_pointer(ftrace_list, ops);
+ if (ftrace_global_list == &ftrace_list_end ||
+ ftrace_global_list->next == &ftrace_list_end)
+ func = ftrace_global_list->func;
+ else
+ func = ftrace_global_list_func;
- if (ftrace_enabled) {
- ftrace_func_t func;
+ /* If we filter on pids, update to use the pid function */
+ if (!list_empty(&ftrace_pids)) {
+ set_ftrace_pid_function(func);
+ func = ftrace_pid_func;
+ }
- if (ops->next == &ftrace_list_end)
- func = ops->func;
- else
- func = ftrace_list_func;
+ global_ops.func = func;
+}
- if (!list_empty(&ftrace_pids)) {
- set_ftrace_pid_function(func);
- func = ftrace_pid_func;
- }
+static void update_ftrace_function(void)
+{
+ ftrace_func_t func;
+
+ update_global_ops();
+
+ /*
+ * If we are at the end of the list and this ops is
+ * not dynamic, then have the mcount trampoline call
+ * the function directly
+ */
+ if (ftrace_ops_list == &ftrace_list_end ||
+ (ftrace_ops_list->next == &ftrace_list_end &&
+ !(ftrace_ops_list->flags & FTRACE_OPS_FL_DYNAMIC)))
+ func = ftrace_ops_list->func;
+ else
+ func = ftrace_ops_list_func;
- /*
- * For one func, simply call it directly.
- * For more than one func, call the chain.
- */
#ifdef CONFIG_HAVE_FUNCTION_TRACE_MCOUNT_TEST
- ftrace_trace_function = func;
+ ftrace_trace_function = func;
#else
- __ftrace_trace_function = func;
- ftrace_trace_function = ftrace_test_stop_func;
+ __ftrace_trace_function = func;
+ ftrace_trace_function = ftrace_test_stop_func;
#endif
- }
+}
- return 0;
+static void add_ftrace_ops(struct ftrace_ops **list, struct ftrace_ops *ops)
+{
+ ops->next = *list;
+ /*
+ * We are entering ops into the list but another
+ * CPU might be walking that list. We need to make sure
+ * the ops->next pointer is valid before another CPU sees
+ * the ops pointer included into the list.
+ */
+ rcu_assign_pointer(*list, ops);
}
-static int __unregister_ftrace_function(struct ftrace_ops *ops)
+static int remove_ftrace_ops(struct ftrace_ops **list, struct ftrace_ops *ops)
{
struct ftrace_ops **p;
@@ -194,13 +235,12 @@ static int __unregister_ftrace_function(struct ftrace_ops *ops)
* If we are removing the last function, then simply point
* to the ftrace_stub.
*/
- if (ftrace_list == ops && ops->next == &ftrace_list_end) {
- ftrace_trace_function = ftrace_stub;
- ftrace_list = &ftrace_list_end;
+ if (*list == ops && ops->next == &ftrace_list_end) {
+ *list = &ftrace_list_end;
return 0;
}
- for (p = &ftrace_list; *p != &ftrace_list_end; p = &(*p)->next)
+ for (p = list; *p != &ftrace_list_end; p = &(*p)->next)
if (*p == ops)
break;
@@ -208,53 +248,83 @@ static int __unregister_ftrace_function(struct ftrace_ops *ops)
return -1;
*p = (*p)->next;
+ return 0;
+}
- if (ftrace_enabled) {
- /* If we only have one func left, then call that directly */
- if (ftrace_list->next == &ftrace_list_end) {
- ftrace_func_t func = ftrace_list->func;
+static int __register_ftrace_function(struct ftrace_ops *ops)
+{
+ if (ftrace_disabled)
+ return -ENODEV;
- if (!list_empty(&ftrace_pids)) {
- set_ftrace_pid_function(func);
- func = ftrace_pid_func;
- }
-#ifdef CONFIG_HAVE_FUNCTION_TRACE_MCOUNT_TEST
- ftrace_trace_function = func;
-#else
- __ftrace_trace_function = func;
-#endif
- }
- }
+ if (FTRACE_WARN_ON(ops == &global_ops))
+ return -EINVAL;
+
+ if (WARN_ON(ops->flags & FTRACE_OPS_FL_ENABLED))
+ return -EBUSY;
+
+ if (!core_kernel_data((unsigned long)ops))
+ ops->flags |= FTRACE_OPS_FL_DYNAMIC;
+
+ if (ops->flags & FTRACE_OPS_FL_GLOBAL) {
+ int first = ftrace_global_list == &ftrace_list_end;
+ add_ftrace_ops(&ftrace_global_list, ops);
+ ops->flags |= FTRACE_OPS_FL_ENABLED;
+ if (first)
+ add_ftrace_ops(&ftrace_ops_list, &global_ops);
+ } else
+ add_ftrace_ops(&ftrace_ops_list, ops);
+
+ if (ftrace_enabled)
+ update_ftrace_function();
return 0;
}
-static void ftrace_update_pid_func(void)
+static int __unregister_ftrace_function(struct ftrace_ops *ops)
{
- ftrace_func_t func;
+ int ret;
- if (ftrace_trace_function == ftrace_stub)
- return;
+ if (ftrace_disabled)
+ return -ENODEV;
-#ifdef CONFIG_HAVE_FUNCTION_TRACE_MCOUNT_TEST
- func = ftrace_trace_function;
-#else
- func = __ftrace_trace_function;
-#endif
+ if (WARN_ON(!(ops->flags & FTRACE_OPS_FL_ENABLED)))
+ return -EBUSY;
- if (!list_empty(&ftrace_pids)) {
- set_ftrace_pid_function(func);
- func = ftrace_pid_func;
- } else {
- if (func == ftrace_pid_func)
- func = ftrace_pid_function;
- }
+ if (FTRACE_WARN_ON(ops == &global_ops))
+ return -EINVAL;
-#ifdef CONFIG_HAVE_FUNCTION_TRACE_MCOUNT_TEST
- ftrace_trace_function = func;
-#else
- __ftrace_trace_function = func;
-#endif
+ if (ops->flags & FTRACE_OPS_FL_GLOBAL) {
+ ret = remove_ftrace_ops(&ftrace_global_list, ops);
+ if (!ret && ftrace_global_list == &ftrace_list_end)
+ ret = remove_ftrace_ops(&ftrace_ops_list, &global_ops);
+ if (!ret)
+ ops->flags &= ~FTRACE_OPS_FL_ENABLED;
+ } else
+ ret = remove_ftrace_ops(&ftrace_ops_list, ops);
+
+ if (ret < 0)
+ return ret;
+
+ if (ftrace_enabled)
+ update_ftrace_function();
+
+ /*
+ * Dynamic ops may be freed, we must make sure that all
+ * callers are done before leaving this function.
+ */
+ if (ops->flags & FTRACE_OPS_FL_DYNAMIC)
+ synchronize_sched();
+
+ return 0;
+}
+
+static void ftrace_update_pid_func(void)
+{
+ /* Only do something if we are tracing something */
+ if (ftrace_trace_function == ftrace_stub)
+ return;
+
+ update_ftrace_function();
}
#ifdef CONFIG_FUNCTION_PROFILER
@@ -888,8 +958,35 @@ enum {
FTRACE_START_FUNC_RET = (1 << 3),
FTRACE_STOP_FUNC_RET = (1 << 4),
};
+struct ftrace_func_entry {
+ struct hlist_node hlist;
+ unsigned long ip;
+};
+
+struct ftrace_hash {
+ unsigned long size_bits;
+ struct hlist_head *buckets;
+ unsigned long count;
+ struct rcu_head rcu;
+};
-static int ftrace_filtered;
+/*
+ * We make these constant because no one should touch them,
+ * but they are used as the default "empty hash", to avoid allocating
+ * it all the time. These are in a read only section such that if
+ * anyone does try to modify it, it will cause an exception.
+ */
+static const struct hlist_head empty_buckets[1];
+static const struct ftrace_hash empty_hash = {
+ .buckets = (struct hlist_head *)empty_buckets,
+};
+#define EMPTY_HASH ((struct ftrace_hash *)&empty_hash)
+
+static struct ftrace_ops global_ops = {
+ .func = ftrace_stub,
+ .notrace_hash = EMPTY_HASH,
+ .filter_hash = EMPTY_HASH,
+};
static struct dyn_ftrace *ftrace_new_addrs;
@@ -912,6 +1009,269 @@ static struct ftrace_page *ftrace_pages;
static struct dyn_ftrace *ftrace_free_records;
+static struct ftrace_func_entry *
+ftrace_lookup_ip(struct ftrace_hash *hash, unsigned long ip)
+{
+ unsigned long key;
+ struct ftrace_func_entry *entry;
+ struct hlist_head *hhd;
+ struct hlist_node *n;
+
+ if (!hash->count)
+ return NULL;
+
+ if (hash->size_bits > 0)
+ key = hash_long(ip, hash->size_bits);
+ else
+ key = 0;
+
+ hhd = &hash->buckets[key];
+
+ hlist_for_each_entry_rcu(entry, n, hhd, hlist) {
+ if (entry->ip == ip)
+ return entry;
+ }
+ return NULL;
+}
+
+static void __add_hash_entry(struct ftrace_hash *hash,
+ struct ftrace_func_entry *entry)
+{
+ struct hlist_head *hhd;
+ unsigned long key;
+
+ if (hash->size_bits)
+ key = hash_long(entry->ip, hash->size_bits);
+ else
+ key = 0;
+
+ hhd = &hash->buckets[key];
+ hlist_add_head(&entry->hlist, hhd);
+ hash->count++;
+}
+
+static int add_hash_entry(struct ftrace_hash *hash, unsigned long ip)
+{
+ struct ftrace_func_entry *entry;
+
+ entry = kmalloc(sizeof(*entry), GFP_KERNEL);
+ if (!entry)
+ return -ENOMEM;
+
+ entry->ip = ip;
+ __add_hash_entry(hash, entry);
+
+ return 0;
+}
+
+static void
+free_hash_entry(struct ftrace_hash *hash,
+ struct ftrace_func_entry *entry)
+{
+ hlist_del(&entry->hlist);
+ kfree(entry);
+ hash->count--;
+}
+
+static void
+remove_hash_entry(struct ftrace_hash *hash,
+ struct ftrace_func_entry *entry)
+{
+ hlist_del(&entry->hlist);
+ hash->count--;
+}
+
+static void ftrace_hash_clear(struct ftrace_hash *hash)
+{
+ struct hlist_head *hhd;
+ struct hlist_node *tp, *tn;
+ struct ftrace_func_entry *entry;
+ int size = 1 << hash->size_bits;
+ int i;
+
+ if (!hash->count)
+ return;
+
+ for (i = 0; i < size; i++) {
+ hhd = &hash->buckets[i];
+ hlist_for_each_entry_safe(entry, tp, tn, hhd, hlist)
+ free_hash_entry(hash, entry);
+ }
+ FTRACE_WARN_ON(hash->count);
+}
+
+static void free_ftrace_hash(struct ftrace_hash *hash)
+{
+ if (!hash || hash == EMPTY_HASH)
+ return;
+ ftrace_hash_clear(hash);
+ kfree(hash->buckets);
+ kfree(hash);
+}
+
+static void __free_ftrace_hash_rcu(struct rcu_head *rcu)
+{
+ struct ftrace_hash *hash;
+
+ hash = container_of(rcu, struct ftrace_hash, rcu);
+ free_ftrace_hash(hash);
+}
+
+static void free_ftrace_hash_rcu(struct ftrace_hash *hash)
+{
+ if (!hash || hash == EMPTY_HASH)
+ return;
+ call_rcu_sched(&hash->rcu, __free_ftrace_hash_rcu);
+}
+
+static struct ftrace_hash *alloc_ftrace_hash(int size_bits)
+{
+ struct ftrace_hash *hash;
+ int size;
+
+ hash = kzalloc(sizeof(*hash), GFP_KERNEL);
+ if (!hash)
+ return NULL;
+
+ size = 1 << size_bits;
+ hash->buckets = kzalloc(sizeof(*hash->buckets) * size, GFP_KERNEL);
+
+ if (!hash->buckets) {
+ kfree(hash);
+ return NULL;
+ }
+
+ hash->size_bits = size_bits;
+
+ return hash;
+}
+
+static struct ftrace_hash *
+alloc_and_copy_ftrace_hash(int size_bits, struct ftrace_hash *hash)
+{
+ struct ftrace_func_entry *entry;
+ struct ftrace_hash *new_hash;
+ struct hlist_node *tp;
+ int size;
+ int ret;
+ int i;
+
+ new_hash = alloc_ftrace_hash(size_bits);
+ if (!new_hash)
+ return NULL;
+
+ /* Empty hash? */
+ if (!hash || !hash->count)
+ return new_hash;
+
+ size = 1 << hash->size_bits;
+ for (i = 0; i < size; i++) {
+ hlist_for_each_entry(entry, tp, &hash->buckets[i], hlist) {
+ ret = add_hash_entry(new_hash, entry->ip);
+ if (ret < 0)
+ goto free_hash;
+ }
+ }
+
+ FTRACE_WARN_ON(new_hash->count != hash->count);
+
+ return new_hash;
+
+ free_hash:
+ free_ftrace_hash(new_hash);
+ return NULL;
+}
+
+static int
+ftrace_hash_move(struct ftrace_hash **dst, struct ftrace_hash *src)
+{
+ struct ftrace_func_entry *entry;
+ struct hlist_node *tp, *tn;
+ struct hlist_head *hhd;
+ struct ftrace_hash *old_hash;
+ struct ftrace_hash *new_hash;
+ unsigned long key;
+ int size = src->count;
+ int bits = 0;
+ int i;
+
+ /*
+ * If the new source is empty, just free dst and assign it
+ * the empty_hash.
+ */
+ if (!src->count) {
+ free_ftrace_hash_rcu(*dst);
+ rcu_assign_pointer(*dst, EMPTY_HASH);
+ return 0;
+ }
+
+ /*
+ * Make the hash size about 1/2 the # found
+ */
+ for (size /= 2; size; size >>= 1)
+ bits++;
+
+ /* Don't allocate too much */
+ if (bits > FTRACE_HASH_MAX_BITS)
+ bits = FTRACE_HASH_MAX_BITS;
+
+ new_hash = alloc_ftrace_hash(bits);
+ if (!new_hash)
+ return -ENOMEM;
+
+ size = 1 << src->size_bits;
+ for (i = 0; i < size; i++) {
+ hhd = &src->buckets[i];
+ hlist_for_each_entry_safe(entry, tp, tn, hhd, hlist) {
+ if (bits > 0)
+ key = hash_long(entry->ip, bits);
+ else
+ key = 0;
+ remove_hash_entry(src, entry);
+ __add_hash_entry(new_hash, entry);
+ }
+ }
+
+ old_hash = *dst;
+ rcu_assign_pointer(*dst, new_hash);
+ free_ftrace_hash_rcu(old_hash);
+
+ return 0;
+}
+
+/*
+ * Test the hashes for this ops to see if we want to call
+ * the ops->func or not.
+ *
+ * It's a match if the ip is in the ops->filter_hash or
+ * the filter_hash does not exist or is empty,
+ * AND
+ * the ip is not in the ops->notrace_hash.
+ *
+ * This needs to be called with preemption disabled as
+ * the hashes are freed with call_rcu_sched().
+ */
+static int
+ftrace_ops_test(struct ftrace_ops *ops, unsigned long ip)
+{
+ struct ftrace_hash *filter_hash;
+ struct ftrace_hash *notrace_hash;
+ int ret;
+
+ filter_hash = rcu_dereference_raw(ops->filter_hash);
+ notrace_hash = rcu_dereference_raw(ops->notrace_hash);
+
+ if ((!filter_hash || !filter_hash->count ||
+ ftrace_lookup_ip(filter_hash, ip)) &&
+ (!notrace_hash || !notrace_hash->count ||
+ !ftrace_lookup_ip(notrace_hash, ip)))
+ ret = 1;
+ else
+ ret = 0;
+
+ return ret;
+}
+
/*
* This is a double for. Do not use 'break' to break out of the loop,
* you must use a goto.
@@ -926,6 +1286,105 @@ static struct dyn_ftrace *ftrace_free_records;
} \
}
+static void __ftrace_hash_rec_update(struct ftrace_ops *ops,
+ int filter_hash,
+ bool inc)
+{
+ struct ftrace_hash *hash;
+ struct ftrace_hash *other_hash;
+ struct ftrace_page *pg;
+ struct dyn_ftrace *rec;
+ int count = 0;
+ int all = 0;
+
+ /* Only update if the ops has been registered */
+ if (!(ops->flags & FTRACE_OPS_FL_ENABLED))
+ return;
+
+ /*
+ * In the filter_hash case:
+ * If the count is zero, we update all records.
+ * Otherwise we just update the items in the hash.
+ *
+ * In the notrace_hash case:
+ * We enable the update in the hash.
+ * As disabling notrace means enabling the tracing,
+ * and enabling notrace means disabling, the inc variable
+ * gets inversed.
+ */
+ if (filter_hash) {
+ hash = ops->filter_hash;
+ other_hash = ops->notrace_hash;
+ if (!hash || !hash->count)
+ all = 1;
+ } else {
+ inc = !inc;
+ hash = ops->notrace_hash;
+ other_hash = ops->filter_hash;
+ /*
+ * If the notrace hash has no items,
+ * then there's nothing to do.
+ */
+ if (hash && !hash->count)
+ return;
+ }
+
+ do_for_each_ftrace_rec(pg, rec) {
+ int in_other_hash = 0;
+ int in_hash = 0;
+ int match = 0;
+
+ if (all) {
+ /*
+ * Only the filter_hash affects all records.
+ * Update if the record is not in the notrace hash.
+ */
+ if (!other_hash || !ftrace_lookup_ip(other_hash, rec->ip))
+ match = 1;
+ } else {
+ in_hash = hash && !!ftrace_lookup_ip(hash, rec->ip);
+ in_other_hash = other_hash && !!ftrace_lookup_ip(other_hash, rec->ip);
+
+ /*
+ *
+ */
+ if (filter_hash && in_hash && !in_other_hash)
+ match = 1;
+ else if (!filter_hash && in_hash &&
+ (in_other_hash || !other_hash->count))
+ match = 1;
+ }
+ if (!match)
+ continue;
+
+ if (inc) {
+ rec->flags++;
+ if (FTRACE_WARN_ON((rec->flags & ~FTRACE_FL_MASK) == FTRACE_REF_MAX))
+ return;
+ } else {
+ if (FTRACE_WARN_ON((rec->flags & ~FTRACE_FL_MASK) == 0))
+ return;
+ rec->flags--;
+ }
+ count++;
+ /* Shortcut, if we handled all records, we are done. */
+ if (!all && count == hash->count)
+ return;
+ } while_for_each_ftrace_rec();
+}
+
+static void ftrace_hash_rec_disable(struct ftrace_ops *ops,
+ int filter_hash)
+{
+ __ftrace_hash_rec_update(ops, filter_hash, 0);
+}
+
+static void ftrace_hash_rec_enable(struct ftrace_ops *ops,
+ int filter_hash)
+{
+ __ftrace_hash_rec_update(ops, filter_hash, 1);
+}
+
static void ftrace_free_rec(struct dyn_ftrace *rec)
{
rec->freelist = ftrace_free_records;
@@ -1047,18 +1506,18 @@ __ftrace_replace_code(struct dyn_ftrace *rec, int enable)
ftrace_addr = (unsigned long)FTRACE_ADDR;
/*
- * If this record is not to be traced or we want to disable it,
- * then disable it.
+ * If we are enabling tracing:
*
- * If we want to enable it and filtering is off, then enable it.
+ * If the record has a ref count, then we need to enable it
+ * because someone is using it.
*
- * If we want to enable it and filtering is on, enable it only if
- * it's filtered
+ * Otherwise we make sure its disabled.
+ *
+ * If we are disabling tracing, then disable all records that
+ * are enabled.
*/
- if (enable && !(rec->flags & FTRACE_FL_NOTRACE)) {
- if (!ftrace_filtered || (rec->flags & FTRACE_FL_FILTER))
- flag = FTRACE_FL_ENABLED;
- }
+ if (enable && (rec->flags & ~FTRACE_FL_MASK))
+ flag = FTRACE_FL_ENABLED;
/* If the state of this record hasn't changed, then do nothing */
if ((rec->flags & FTRACE_FL_ENABLED) == flag)
@@ -1079,19 +1538,16 @@ static void ftrace_replace_code(int enable)
struct ftrace_page *pg;
int failed;
+ if (unlikely(ftrace_disabled))
+ return;
+
do_for_each_ftrace_rec(pg, rec) {
- /*
- * Skip over free records, records that have
- * failed and not converted.
- */
- if (rec->flags & FTRACE_FL_FREE ||
- rec->flags & FTRACE_FL_FAILED ||
- !(rec->flags & FTRACE_FL_CONVERTED))
+ /* Skip over free records */
+ if (rec->flags & FTRACE_FL_FREE)
continue;
failed = __ftrace_replace_code(rec, enable);
if (failed) {
- rec->flags |= FTRACE_FL_FAILED;
ftrace_bug(failed, rec->ip);
/* Stop processing */
return;
@@ -1107,10 +1563,12 @@ ftrace_code_disable(struct module *mod, struct dyn_ftrace *rec)
ip = rec->ip;
+ if (unlikely(ftrace_disabled))
+ return 0;
+
ret = ftrace_make_nop(mod, rec, MCOUNT_ADDR);
if (ret) {
ftrace_bug(ret, ip);
- rec->flags |= FTRACE_FL_FAILED;
return 0;
}
return 1;
@@ -1171,6 +1629,7 @@ static void ftrace_run_update_code(int command)
static ftrace_func_t saved_ftrace_func;
static int ftrace_start_up;
+static int global_start_up;
static void ftrace_startup_enable(int command)
{
@@ -1185,19 +1644,38 @@ static void ftrace_startup_enable(int command)
ftrace_run_update_code(command);
}
-static void ftrace_startup(int command)
+static int ftrace_startup(struct ftrace_ops *ops, int command)
{
+ bool hash_enable = true;
+
if (unlikely(ftrace_disabled))
- return;
+ return -ENODEV;
ftrace_start_up++;
command |= FTRACE_ENABLE_CALLS;
+ /* ops marked global share the filter hashes */
+ if (ops->flags & FTRACE_OPS_FL_GLOBAL) {
+ ops = &global_ops;
+ /* Don't update hash if global is already set */
+ if (global_start_up)
+ hash_enable = false;
+ global_start_up++;
+ }
+
+ ops->flags |= FTRACE_OPS_FL_ENABLED;
+ if (hash_enable)
+ ftrace_hash_rec_enable(ops, 1);
+
ftrace_startup_enable(command);
+
+ return 0;
}
-static void ftrace_shutdown(int command)
+static void ftrace_shutdown(struct ftrace_ops *ops, int command)
{
+ bool hash_disable = true;
+
if (unlikely(ftrace_disabled))
return;
@@ -1209,6 +1687,23 @@ static void ftrace_shutdown(int command)
*/
WARN_ON_ONCE(ftrace_start_up < 0);
+ if (ops->flags & FTRACE_OPS_FL_GLOBAL) {
+ ops = &global_ops;
+ global_start_up--;
+ WARN_ON_ONCE(global_start_up < 0);
+ /* Don't update hash if global still has users */
+ if (global_start_up) {
+ WARN_ON_ONCE(!ftrace_start_up);
+ hash_disable = false;
+ }
+ }
+
+ if (hash_disable)
+ ftrace_hash_rec_disable(ops, 1);
+
+ if (ops != &global_ops || !global_start_up)
+ ops->flags &= ~FTRACE_OPS_FL_ENABLED;
+
if (!ftrace_start_up)
command |= FTRACE_DISABLE_CALLS;
@@ -1268,15 +1763,15 @@ static int ftrace_update_code(struct module *mod)
p->flags = 0L;
/*
- * Do the initial record convertion from mcount jump
+ * Do the initial record conversion from mcount jump
* to the NOP instructions.
*/
if (!ftrace_code_disable(mod, p)) {
ftrace_free_rec(p);
- continue;
+ /* Game over */
+ break;
}
- p->flags |= FTRACE_FL_CONVERTED;
ftrace_update_cnt++;
/*
@@ -1351,9 +1846,9 @@ static int __init ftrace_dyn_table_alloc(unsigned long num_to_init)
enum {
FTRACE_ITER_FILTER = (1 << 0),
FTRACE_ITER_NOTRACE = (1 << 1),
- FTRACE_ITER_FAILURES = (1 << 2),
- FTRACE_ITER_PRINTALL = (1 << 3),
- FTRACE_ITER_HASH = (1 << 4),
+ FTRACE_ITER_PRINTALL = (1 << 2),
+ FTRACE_ITER_HASH = (1 << 3),
+ FTRACE_ITER_ENABLED = (1 << 4),
};
#define FTRACE_BUFF_MAX (KSYM_SYMBOL_LEN+4) /* room for wildcards */
@@ -1365,6 +1860,8 @@ struct ftrace_iterator {
struct dyn_ftrace *func;
struct ftrace_func_probe *probe;
struct trace_parser parser;
+ struct ftrace_hash *hash;
+ struct ftrace_ops *ops;
int hidx;
int idx;
unsigned flags;
@@ -1461,13 +1958,17 @@ static void *
t_next(struct seq_file *m, void *v, loff_t *pos)
{
struct ftrace_iterator *iter = m->private;
+ struct ftrace_ops *ops = &global_ops;
struct dyn_ftrace *rec = NULL;
+ if (unlikely(ftrace_disabled))
+ return NULL;
+
if (iter->flags & FTRACE_ITER_HASH)
return t_hash_next(m, pos);
(*pos)++;
- iter->pos = *pos;
+ iter->pos = iter->func_pos = *pos;
if (iter->flags & FTRACE_ITER_PRINTALL)
return t_hash_start(m, pos);
@@ -1483,17 +1984,15 @@ t_next(struct seq_file *m, void *v, loff_t *pos)
rec = &iter->pg->records[iter->idx++];
if ((rec->flags & FTRACE_FL_FREE) ||
- (!(iter->flags & FTRACE_ITER_FAILURES) &&
- (rec->flags & FTRACE_FL_FAILED)) ||
-
- ((iter->flags & FTRACE_ITER_FAILURES) &&
- !(rec->flags & FTRACE_FL_FAILED)) ||
-
((iter->flags & FTRACE_ITER_FILTER) &&
- !(rec->flags & FTRACE_FL_FILTER)) ||
+ !(ftrace_lookup_ip(ops->filter_hash, rec->ip))) ||
((iter->flags & FTRACE_ITER_NOTRACE) &&
- !(rec->flags & FTRACE_FL_NOTRACE))) {
+ !ftrace_lookup_ip(ops->notrace_hash, rec->ip)) ||
+
+ ((iter->flags & FTRACE_ITER_ENABLED) &&
+ !(rec->flags & ~FTRACE_FL_MASK))) {
+
rec = NULL;
goto retry;
}
@@ -1502,7 +2001,6 @@ t_next(struct seq_file *m, void *v, loff_t *pos)
if (!rec)
return t_hash_start(m, pos);
- iter->func_pos = *pos;
iter->func = rec;
return iter;
@@ -1518,10 +2016,15 @@ static void reset_iter_read(struct ftrace_iterator *iter)
static void *t_start(struct seq_file *m, loff_t *pos)
{
struct ftrace_iterator *iter = m->private;
+ struct ftrace_ops *ops = &global_ops;
void *p = NULL;
loff_t l;
mutex_lock(&ftrace_lock);
+
+ if (unlikely(ftrace_disabled))
+ return NULL;
+
/*
* If an lseek was done, then reset and start from beginning.
*/
@@ -1533,7 +2036,7 @@ static void *t_start(struct seq_file *m, loff_t *pos)
* off, we can short cut and just print out that all
* functions are enabled.
*/
- if (iter->flags & FTRACE_ITER_FILTER && !ftrace_filtered) {
+ if (iter->flags & FTRACE_ITER_FILTER && !ops->filter_hash->count) {
if (*pos > 0)
return t_hash_start(m, pos);
iter->flags |= FTRACE_ITER_PRINTALL;
@@ -1591,7 +2094,11 @@ static int t_show(struct seq_file *m, void *v)
if (!rec)
return 0;
- seq_printf(m, "%ps\n", (void *)rec->ip);
+ seq_printf(m, "%ps", (void *)rec->ip);
+ if (iter->flags & FTRACE_ITER_ENABLED)
+ seq_printf(m, " (%ld)",
+ rec->flags & ~FTRACE_FL_MASK);
+ seq_printf(m, "\n");
return 0;
}
@@ -1631,44 +2138,46 @@ ftrace_avail_open(struct inode *inode, struct file *file)
}
static int
-ftrace_failures_open(struct inode *inode, struct file *file)
+ftrace_enabled_open(struct inode *inode, struct file *file)
{
- int ret;
- struct seq_file *m;
struct ftrace_iterator *iter;
+ int ret;
+
+ if (unlikely(ftrace_disabled))
+ return -ENODEV;
+
+ iter = kzalloc(sizeof(*iter), GFP_KERNEL);
+ if (!iter)
+ return -ENOMEM;
+
+ iter->pg = ftrace_pages_start;
+ iter->flags = FTRACE_ITER_ENABLED;
- ret = ftrace_avail_open(inode, file);
+ ret = seq_open(file, &show_ftrace_seq_ops);
if (!ret) {
- m = file->private_data;
- iter = m->private;
- iter->flags = FTRACE_ITER_FAILURES;
+ struct seq_file *m = file->private_data;
+
+ m->private = iter;
+ } else {
+ kfree(iter);
}
return ret;
}
-
-static void ftrace_filter_reset(int enable)
+static void ftrace_filter_reset(struct ftrace_hash *hash)
{
- struct ftrace_page *pg;
- struct dyn_ftrace *rec;
- unsigned long type = enable ? FTRACE_FL_FILTER : FTRACE_FL_NOTRACE;
-
mutex_lock(&ftrace_lock);
- if (enable)
- ftrace_filtered = 0;
- do_for_each_ftrace_rec(pg, rec) {
- if (rec->flags & FTRACE_FL_FAILED)
- continue;
- rec->flags &= ~type;
- } while_for_each_ftrace_rec();
+ ftrace_hash_clear(hash);
mutex_unlock(&ftrace_lock);
}
static int
-ftrace_regex_open(struct inode *inode, struct file *file, int enable)
+ftrace_regex_open(struct ftrace_ops *ops, int flag,
+ struct inode *inode, struct file *file)
{
struct ftrace_iterator *iter;
+ struct ftrace_hash *hash;
int ret = 0;
if (unlikely(ftrace_disabled))
@@ -1683,21 +2192,42 @@ ftrace_regex_open(struct inode *inode, struct file *file, int enable)
return -ENOMEM;
}
+ if (flag & FTRACE_ITER_NOTRACE)
+ hash = ops->notrace_hash;
+ else
+ hash = ops->filter_hash;
+
+ iter->ops = ops;
+ iter->flags = flag;
+
+ if (file->f_mode & FMODE_WRITE) {
+ mutex_lock(&ftrace_lock);
+ iter->hash = alloc_and_copy_ftrace_hash(FTRACE_HASH_DEFAULT_BITS, hash);
+ mutex_unlock(&ftrace_lock);
+
+ if (!iter->hash) {
+ trace_parser_put(&iter->parser);
+ kfree(iter);
+ return -ENOMEM;
+ }
+ }
+
mutex_lock(&ftrace_regex_lock);
+
if ((file->f_mode & FMODE_WRITE) &&
(file->f_flags & O_TRUNC))
- ftrace_filter_reset(enable);
+ ftrace_filter_reset(iter->hash);
if (file->f_mode & FMODE_READ) {
iter->pg = ftrace_pages_start;
- iter->flags = enable ? FTRACE_ITER_FILTER :
- FTRACE_ITER_NOTRACE;
ret = seq_open(file, &show_ftrace_seq_ops);
if (!ret) {
struct seq_file *m = file->private_data;
m->private = iter;
} else {
+ /* Failed */
+ free_ftrace_hash(iter->hash);
trace_parser_put(&iter->parser);
kfree(iter);
}
@@ -1711,13 +2241,15 @@ ftrace_regex_open(struct inode *inode, struct file *file, int enable)
static int
ftrace_filter_open(struct inode *inode, struct file *file)
{
- return ftrace_regex_open(inode, file, 1);
+ return ftrace_regex_open(&global_ops, FTRACE_ITER_FILTER,
+ inode, file);
}
static int
ftrace_notrace_open(struct inode *inode, struct file *file)
{
- return ftrace_regex_open(inode, file, 0);
+ return ftrace_regex_open(&global_ops, FTRACE_ITER_NOTRACE,
+ inode, file);
}
static loff_t
@@ -1762,86 +2294,99 @@ static int ftrace_match(char *str, char *regex, int len, int type)
}
static int
-ftrace_match_record(struct dyn_ftrace *rec, char *regex, int len, int type)
+enter_record(struct ftrace_hash *hash, struct dyn_ftrace *rec, int not)
+{
+ struct ftrace_func_entry *entry;
+ int ret = 0;
+
+ entry = ftrace_lookup_ip(hash, rec->ip);
+ if (not) {
+ /* Do nothing if it doesn't exist */
+ if (!entry)
+ return 0;
+
+ free_hash_entry(hash, entry);
+ } else {
+ /* Do nothing if it exists */
+ if (entry)
+ return 0;
+
+ ret = add_hash_entry(hash, rec->ip);
+ }
+ return ret;
+}
+
+static int
+ftrace_match_record(struct dyn_ftrace *rec, char *mod,
+ char *regex, int len, int type)
{
char str[KSYM_SYMBOL_LEN];
+ char *modname;
+
+ kallsyms_lookup(rec->ip, NULL, NULL, &modname, str);
+
+ if (mod) {
+ /* module lookup requires matching the module */
+ if (!modname || strcmp(modname, mod))
+ return 0;
+
+ /* blank search means to match all funcs in the mod */
+ if (!len)
+ return 1;
+ }
- kallsyms_lookup(rec->ip, NULL, NULL, NULL, str);
return ftrace_match(str, regex, len, type);
}
-static int ftrace_match_records(char *buff, int len, int enable)
+static int
+match_records(struct ftrace_hash *hash, char *buff,
+ int len, char *mod, int not)
{
- unsigned int search_len;
+ unsigned search_len = 0;
struct ftrace_page *pg;
struct dyn_ftrace *rec;
- unsigned long flag;
- char *search;
- int type;
- int not;
+ int type = MATCH_FULL;
+ char *search = buff;
int found = 0;
+ int ret;
- flag = enable ? FTRACE_FL_FILTER : FTRACE_FL_NOTRACE;
- type = filter_parse_regex(buff, len, &search, &not);
-
- search_len = strlen(search);
+ if (len) {
+ type = filter_parse_regex(buff, len, &search, &not);
+ search_len = strlen(search);
+ }
mutex_lock(&ftrace_lock);
- do_for_each_ftrace_rec(pg, rec) {
- if (rec->flags & FTRACE_FL_FAILED)
- continue;
+ if (unlikely(ftrace_disabled))
+ goto out_unlock;
- if (ftrace_match_record(rec, search, search_len, type)) {
- if (not)
- rec->flags &= ~flag;
- else
- rec->flags |= flag;
+ do_for_each_ftrace_rec(pg, rec) {
+
+ if (ftrace_match_record(rec, mod, search, search_len, type)) {
+ ret = enter_record(hash, rec, not);
+ if (ret < 0) {
+ found = ret;
+ goto out_unlock;
+ }
found = 1;
}
- /*
- * Only enable filtering if we have a function that
- * is filtered on.
- */
- if (enable && (rec->flags & FTRACE_FL_FILTER))
- ftrace_filtered = 1;
} while_for_each_ftrace_rec();
+ out_unlock:
mutex_unlock(&ftrace_lock);
return found;
}
static int
-ftrace_match_module_record(struct dyn_ftrace *rec, char *mod,
- char *regex, int len, int type)
+ftrace_match_records(struct ftrace_hash *hash, char *buff, int len)
{
- char str[KSYM_SYMBOL_LEN];
- char *modname;
-
- kallsyms_lookup(rec->ip, NULL, NULL, &modname, str);
-
- if (!modname || strcmp(modname, mod))
- return 0;
-
- /* blank search means to match all funcs in the mod */
- if (len)
- return ftrace_match(str, regex, len, type);
- else
- return 1;
+ return match_records(hash, buff, len, NULL, 0);
}
-static int ftrace_match_module_records(char *buff, char *mod, int enable)
+static int
+ftrace_match_module_records(struct ftrace_hash *hash, char *buff, char *mod)
{
- unsigned search_len = 0;
- struct ftrace_page *pg;
- struct dyn_ftrace *rec;
- int type = MATCH_FULL;
- char *search = buff;
- unsigned long flag;
int not = 0;
- int found = 0;
-
- flag = enable ? FTRACE_FL_FILTER : FTRACE_FL_NOTRACE;
/* blank or '*' mean the same */
if (strcmp(buff, "*") == 0)
@@ -1853,32 +2398,7 @@ static int ftrace_match_module_records(char *buff, char *mod, int enable)
not = 1;
}
- if (strlen(buff)) {
- type = filter_parse_regex(buff, strlen(buff), &search, &not);
- search_len = strlen(search);
- }
-
- mutex_lock(&ftrace_lock);
- do_for_each_ftrace_rec(pg, rec) {
-
- if (rec->flags & FTRACE_FL_FAILED)
- continue;
-
- if (ftrace_match_module_record(rec, mod,
- search, search_len, type)) {
- if (not)
- rec->flags &= ~flag;
- else
- rec->flags |= flag;
- found = 1;
- }
- if (enable && (rec->flags & FTRACE_FL_FILTER))
- ftrace_filtered = 1;
-
- } while_for_each_ftrace_rec();
- mutex_unlock(&ftrace_lock);
-
- return found;
+ return match_records(hash, buff, strlen(buff), mod, not);
}
/*
@@ -1889,7 +2409,10 @@ static int ftrace_match_module_records(char *buff, char *mod, int enable)
static int
ftrace_mod_callback(char *func, char *cmd, char *param, int enable)
{
+ struct ftrace_ops *ops = &global_ops;
+ struct ftrace_hash *hash;
char *mod;
+ int ret = -EINVAL;
/*
* cmd == 'mod' because we only registered this func
@@ -1901,15 +2424,24 @@ ftrace_mod_callback(char *func, char *cmd, char *param, int enable)
/* we must have a module name */
if (!param)
- return -EINVAL;
+ return ret;
mod = strsep(&param, ":");
if (!strlen(mod))
- return -EINVAL;
+ return ret;
- if (ftrace_match_module_records(func, mod, enable))
- return 0;
- return -EINVAL;
+ if (enable)
+ hash = ops->filter_hash;
+ else
+ hash = ops->notrace_hash;
+
+ ret = ftrace_match_module_records(hash, func, mod);
+ if (!ret)
+ ret = -EINVAL;
+ if (ret < 0)
+ return ret;
+
+ return 0;
}
static struct ftrace_func_command ftrace_mod_cmd = {
@@ -1960,6 +2492,7 @@ static int ftrace_probe_registered;
static void __enable_ftrace_function_probe(void)
{
+ int ret;
int i;
if (ftrace_probe_registered)
@@ -1974,13 +2507,16 @@ static void __enable_ftrace_function_probe(void)
if (i == FTRACE_FUNC_HASHSIZE)
return;
- __register_ftrace_function(&trace_probe_ops);
- ftrace_startup(0);
+ ret = __register_ftrace_function(&trace_probe_ops);
+ if (!ret)
+ ret = ftrace_startup(&trace_probe_ops, 0);
+
ftrace_probe_registered = 1;
}
static void __disable_ftrace_function_probe(void)
{
+ int ret;
int i;
if (!ftrace_probe_registered)
@@ -1993,8 +2529,10 @@ static void __disable_ftrace_function_probe(void)
}
/* no more funcs left */
- __unregister_ftrace_function(&trace_probe_ops);
- ftrace_shutdown(0);
+ ret = __unregister_ftrace_function(&trace_probe_ops);
+ if (!ret)
+ ftrace_shutdown(&trace_probe_ops, 0);
+
ftrace_probe_registered = 0;
}
@@ -2030,12 +2568,13 @@ register_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops,
return -EINVAL;
mutex_lock(&ftrace_lock);
- do_for_each_ftrace_rec(pg, rec) {
- if (rec->flags & FTRACE_FL_FAILED)
- continue;
+ if (unlikely(ftrace_disabled))
+ goto out_unlock;
+
+ do_for_each_ftrace_rec(pg, rec) {
- if (!ftrace_match_record(rec, search, len, type))
+ if (!ftrace_match_record(rec, NULL, search, len, type))
continue;
entry = kmalloc(sizeof(*entry), GFP_KERNEL);
@@ -2196,18 +2735,22 @@ int unregister_ftrace_command(struct ftrace_func_command *cmd)
return ret;
}
-static int ftrace_process_regex(char *buff, int len, int enable)
+static int ftrace_process_regex(struct ftrace_hash *hash,
+ char *buff, int len, int enable)
{
char *func, *command, *next = buff;
struct ftrace_func_command *p;
- int ret = -EINVAL;
+ int ret;
func = strsep(&next, ":");
if (!next) {
- if (ftrace_match_records(func, len, enable))
- return 0;
- return ret;
+ ret = ftrace_match_records(hash, func, len);
+ if (!ret)
+ ret = -EINVAL;
+ if (ret < 0)
+ return ret;
+ return 0;
}
/* command found */
@@ -2240,6 +2783,10 @@ ftrace_regex_write(struct file *file, const char __user *ubuf,
mutex_lock(&ftrace_regex_lock);
+ ret = -ENODEV;
+ if (unlikely(ftrace_disabled))
+ goto out_unlock;
+
if (file->f_mode & FMODE_READ) {
struct seq_file *m = file->private_data;
iter = m->private;
@@ -2251,7 +2798,7 @@ ftrace_regex_write(struct file *file, const char __user *ubuf,
if (read >= 0 && trace_parser_loaded(parser) &&
!trace_parser_cont(parser)) {
- ret = ftrace_process_regex(parser->buffer,
+ ret = ftrace_process_regex(iter->hash, parser->buffer,
parser->idx, enable);
trace_parser_clear(parser);
if (ret)
@@ -2279,22 +2826,49 @@ ftrace_notrace_write(struct file *file, const char __user *ubuf,
return ftrace_regex_write(file, ubuf, cnt, ppos, 0);
}
-static void
-ftrace_set_regex(unsigned char *buf, int len, int reset, int enable)
+static int
+ftrace_set_regex(struct ftrace_ops *ops, unsigned char *buf, int len,
+ int reset, int enable)
{
+ struct ftrace_hash **orig_hash;
+ struct ftrace_hash *hash;
+ int ret;
+
+ /* All global ops uses the global ops filters */
+ if (ops->flags & FTRACE_OPS_FL_GLOBAL)
+ ops = &global_ops;
+
if (unlikely(ftrace_disabled))
- return;
+ return -ENODEV;
+
+ if (enable)
+ orig_hash = &ops->filter_hash;
+ else
+ orig_hash = &ops->notrace_hash;
+
+ hash = alloc_and_copy_ftrace_hash(FTRACE_HASH_DEFAULT_BITS, *orig_hash);
+ if (!hash)
+ return -ENOMEM;
mutex_lock(&ftrace_regex_lock);
if (reset)
- ftrace_filter_reset(enable);
+ ftrace_filter_reset(hash);
if (buf)
- ftrace_match_records(buf, len, enable);
+ ftrace_match_records(hash, buf, len);
+
+ mutex_lock(&ftrace_lock);
+ ret = ftrace_hash_move(orig_hash, hash);
+ mutex_unlock(&ftrace_lock);
+
mutex_unlock(&ftrace_regex_lock);
+
+ free_ftrace_hash(hash);
+ return ret;
}
/**
* ftrace_set_filter - set a function to filter on in ftrace
+ * @ops - the ops to set the filter with
* @buf - the string that holds the function filter text.
* @len - the length of the string.
* @reset - non zero to reset all filters before applying this filter.
@@ -2302,13 +2876,16 @@ ftrace_set_regex(unsigned char *buf, int len, int reset, int enable)
* Filters denote which functions should be enabled when tracing is enabled.
* If @buf is NULL and reset is set, all functions will be enabled for tracing.
*/
-void ftrace_set_filter(unsigned char *buf, int len, int reset)
+void ftrace_set_filter(struct ftrace_ops *ops, unsigned char *buf,
+ int len, int reset)
{
- ftrace_set_regex(buf, len, reset, 1);
+ ftrace_set_regex(ops, buf, len, reset, 1);
}
+EXPORT_SYMBOL_GPL(ftrace_set_filter);
/**
* ftrace_set_notrace - set a function to not trace in ftrace
+ * @ops - the ops to set the notrace filter with
* @buf - the string that holds the function notrace text.
* @len - the length of the string.
* @reset - non zero to reset all filters before applying this filter.
@@ -2317,10 +2894,44 @@ void ftrace_set_filter(unsigned char *buf, int len, int reset)
* is enabled. If @buf is NULL and reset is set, all functions will be enabled
* for tracing.
*/
-void ftrace_set_notrace(unsigned char *buf, int len, int reset)
+void ftrace_set_notrace(struct ftrace_ops *ops, unsigned char *buf,
+ int len, int reset)
{
- ftrace_set_regex(buf, len, reset, 0);
+ ftrace_set_regex(ops, buf, len, reset, 0);
}
+EXPORT_SYMBOL_GPL(ftrace_set_notrace);
+/**
+ * ftrace_set_filter - set a function to filter on in ftrace
+ * @ops - the ops to set the filter with
+ * @buf - the string that holds the function filter text.
+ * @len - the length of the string.
+ * @reset - non zero to reset all filters before applying this filter.
+ *
+ * Filters denote which functions should be enabled when tracing is enabled.
+ * If @buf is NULL and reset is set, all functions will be enabled for tracing.
+ */
+void ftrace_set_global_filter(unsigned char *buf, int len, int reset)
+{
+ ftrace_set_regex(&global_ops, buf, len, reset, 1);
+}
+EXPORT_SYMBOL_GPL(ftrace_set_global_filter);
+
+/**
+ * ftrace_set_notrace - set a function to not trace in ftrace
+ * @ops - the ops to set the notrace filter with
+ * @buf - the string that holds the function notrace text.
+ * @len - the length of the string.
+ * @reset - non zero to reset all filters before applying this filter.
+ *
+ * Notrace Filters denote which functions should not be enabled when tracing
+ * is enabled. If @buf is NULL and reset is set, all functions will be enabled
+ * for tracing.
+ */
+void ftrace_set_global_notrace(unsigned char *buf, int len, int reset)
+{
+ ftrace_set_regex(&global_ops, buf, len, reset, 0);
+}
+EXPORT_SYMBOL_GPL(ftrace_set_global_notrace);
/*
* command line interface to allow users to set filters on boot up.
@@ -2371,22 +2982,23 @@ static void __init set_ftrace_early_graph(char *buf)
}
#endif /* CONFIG_FUNCTION_GRAPH_TRACER */
-static void __init set_ftrace_early_filter(char *buf, int enable)
+static void __init
+set_ftrace_early_filter(struct ftrace_ops *ops, char *buf, int enable)
{
char *func;
while (buf) {
func = strsep(&buf, ",");
- ftrace_set_regex(func, strlen(func), 0, enable);
+ ftrace_set_regex(ops, func, strlen(func), 0, enable);
}
}
static void __init set_ftrace_early_filters(void)
{
if (ftrace_filter_buf[0])
- set_ftrace_early_filter(ftrace_filter_buf, 1);
+ set_ftrace_early_filter(&global_ops, ftrace_filter_buf, 1);
if (ftrace_notrace_buf[0])
- set_ftrace_early_filter(ftrace_notrace_buf, 0);
+ set_ftrace_early_filter(&global_ops, ftrace_notrace_buf, 0);
#ifdef CONFIG_FUNCTION_GRAPH_TRACER
if (ftrace_graph_buf[0])
set_ftrace_early_graph(ftrace_graph_buf);
@@ -2394,11 +3006,14 @@ static void __init set_ftrace_early_filters(void)
}
static int
-ftrace_regex_release(struct inode *inode, struct file *file, int enable)
+ftrace_regex_release(struct inode *inode, struct file *file)
{
struct seq_file *m = (struct seq_file *)file->private_data;
struct ftrace_iterator *iter;
+ struct ftrace_hash **orig_hash;
struct trace_parser *parser;
+ int filter_hash;
+ int ret;
mutex_lock(&ftrace_regex_lock);
if (file->f_mode & FMODE_READ) {
@@ -2411,33 +3026,41 @@ ftrace_regex_release(struct inode *inode, struct file *file, int enable)
parser = &iter->parser;
if (trace_parser_loaded(parser)) {
parser->buffer[parser->idx] = 0;
- ftrace_match_records(parser->buffer, parser->idx, enable);
+ ftrace_match_records(iter->hash, parser->buffer, parser->idx);
}
- mutex_lock(&ftrace_lock);
- if (ftrace_start_up && ftrace_enabled)
- ftrace_run_update_code(FTRACE_ENABLE_CALLS);
- mutex_unlock(&ftrace_lock);
-
trace_parser_put(parser);
+
+ if (file->f_mode & FMODE_WRITE) {
+ filter_hash = !!(iter->flags & FTRACE_ITER_FILTER);
+
+ if (filter_hash)
+ orig_hash = &iter->ops->filter_hash;
+ else
+ orig_hash = &iter->ops->notrace_hash;
+
+ mutex_lock(&ftrace_lock);
+ /*
+ * Remove the current set, update the hash and add
+ * them back.
+ */
+ ftrace_hash_rec_disable(iter->ops, filter_hash);
+ ret = ftrace_hash_move(orig_hash, iter->hash);
+ if (!ret) {
+ ftrace_hash_rec_enable(iter->ops, filter_hash);
+ if (iter->ops->flags & FTRACE_OPS_FL_ENABLED
+ && ftrace_enabled)
+ ftrace_run_update_code(FTRACE_ENABLE_CALLS);
+ }
+ mutex_unlock(&ftrace_lock);
+ }
+ free_ftrace_hash(iter->hash);
kfree(iter);
mutex_unlock(&ftrace_regex_lock);
return 0;
}
-static int
-ftrace_filter_release(struct inode *inode, struct file *file)
-{
- return ftrace_regex_release(inode, file, 1);
-}
-
-static int
-ftrace_notrace_release(struct inode *inode, struct file *file)
-{
- return ftrace_regex_release(inode, file, 0);
-}
-
static const struct file_operations ftrace_avail_fops = {
.open = ftrace_avail_open,
.read = seq_read,
@@ -2445,8 +3068,8 @@ static const struct file_operations ftrace_avail_fops = {
.release = seq_release_private,
};
-static const struct file_operations ftrace_failures_fops = {
- .open = ftrace_failures_open,
+static const struct file_operations ftrace_enabled_fops = {
+ .open = ftrace_enabled_open,
.read = seq_read,
.llseek = seq_lseek,
.release = seq_release_private,
@@ -2457,7 +3080,7 @@ static const struct file_operations ftrace_filter_fops = {
.read = seq_read,
.write = ftrace_filter_write,
.llseek = ftrace_regex_lseek,
- .release = ftrace_filter_release,
+ .release = ftrace_regex_release,
};
static const struct file_operations ftrace_notrace_fops = {
@@ -2465,7 +3088,7 @@ static const struct file_operations ftrace_notrace_fops = {
.read = seq_read,
.write = ftrace_notrace_write,
.llseek = ftrace_regex_lseek,
- .release = ftrace_notrace_release,
+ .release = ftrace_regex_release,
};
#ifdef CONFIG_FUNCTION_GRAPH_TRACER
@@ -2574,9 +3197,6 @@ ftrace_set_func(unsigned long *array, int *idx, char *buffer)
bool exists;
int i;
- if (ftrace_disabled)
- return -ENODEV;
-
/* decode regex */
type = filter_parse_regex(buffer, strlen(buffer), &search, &not);
if (!not && *idx >= FTRACE_GRAPH_MAX_FUNCS)
@@ -2585,12 +3205,18 @@ ftrace_set_func(unsigned long *array, int *idx, char *buffer)
search_len = strlen(search);
mutex_lock(&ftrace_lock);
+
+ if (unlikely(ftrace_disabled)) {
+ mutex_unlock(&ftrace_lock);
+ return -ENODEV;
+ }
+
do_for_each_ftrace_rec(pg, rec) {
- if (rec->flags & (FTRACE_FL_FAILED | FTRACE_FL_FREE))
+ if (rec->flags & FTRACE_FL_FREE)
continue;
- if (ftrace_match_record(rec, search, search_len, type)) {
+ if (ftrace_match_record(rec, NULL, search, search_len, type)) {
/* if it is in the array */
exists = false;
for (i = 0; i < *idx; i++) {
@@ -2680,8 +3306,8 @@ static __init int ftrace_init_dyn_debugfs(struct dentry *d_tracer)
trace_create_file("available_filter_functions", 0444,
d_tracer, NULL, &ftrace_avail_fops);
- trace_create_file("failures", 0444,
- d_tracer, NULL, &ftrace_failures_fops);
+ trace_create_file("enabled_functions", 0444,
+ d_tracer, NULL, &ftrace_enabled_fops);
trace_create_file("set_ftrace_filter", 0644, d_tracer,
NULL, &ftrace_filter_fops);
@@ -2704,7 +3330,6 @@ static int ftrace_process_locs(struct module *mod,
{
unsigned long *p;
unsigned long addr;
- unsigned long flags;
mutex_lock(&ftrace_lock);
p = start;
@@ -2721,10 +3346,7 @@ static int ftrace_process_locs(struct module *mod,
ftrace_record_ip(addr);
}
- /* disable interrupts to prevent kstop machine */
- local_irq_save(flags);
ftrace_update_code(mod);
- local_irq_restore(flags);
mutex_unlock(&ftrace_lock);
return 0;
@@ -2736,10 +3358,11 @@ void ftrace_release_mod(struct module *mod)
struct dyn_ftrace *rec;
struct ftrace_page *pg;
+ mutex_lock(&ftrace_lock);
+
if (ftrace_disabled)
- return;
+ goto out_unlock;
- mutex_lock(&ftrace_lock);
do_for_each_ftrace_rec(pg, rec) {
if (within_module_core(rec->ip, mod)) {
/*
@@ -2750,6 +3373,7 @@ void ftrace_release_mod(struct module *mod)
ftrace_free_rec(rec);
}
} while_for_each_ftrace_rec();
+ out_unlock:
mutex_unlock(&ftrace_lock);
}
@@ -2836,6 +3460,10 @@ void __init ftrace_init(void)
#else
+static struct ftrace_ops global_ops = {
+ .func = ftrace_stub,
+};
+
static int __init ftrace_nodyn_init(void)
{
ftrace_enabled = 1;
@@ -2846,12 +3474,47 @@ device_initcall(ftrace_nodyn_init);
static inline int ftrace_init_dyn_debugfs(struct dentry *d_tracer) { return 0; }
static inline void ftrace_startup_enable(int command) { }
/* Keep as macros so we do not need to define the commands */
-# define ftrace_startup(command) do { } while (0)
-# define ftrace_shutdown(command) do { } while (0)
+# define ftrace_startup(ops, command) \
+ ({ \
+ (ops)->flags |= FTRACE_OPS_FL_ENABLED; \
+ 0; \
+ })
+# define ftrace_shutdown(ops, command) do { } while (0)
# define ftrace_startup_sysctl() do { } while (0)
# define ftrace_shutdown_sysctl() do { } while (0)
+
+static inline int
+ftrace_ops_test(struct ftrace_ops *ops, unsigned long ip)
+{
+ return 1;
+}
+
#endif /* CONFIG_DYNAMIC_FTRACE */
+static void
+ftrace_ops_list_func(unsigned long ip, unsigned long parent_ip)
+{
+ struct ftrace_ops *op;
+
+ if (unlikely(trace_recursion_test(TRACE_INTERNAL_BIT)))
+ return;
+
+ trace_recursion_set(TRACE_INTERNAL_BIT);
+ /*
+ * Some of the ops may be dynamically allocated,
+ * they must be freed after a synchronize_sched().
+ */
+ preempt_disable_notrace();
+ op = rcu_dereference_raw(ftrace_ops_list);
+ while (op != &ftrace_list_end) {
+ if (ftrace_ops_test(op, ip))
+ op->func(ip, parent_ip);
+ op = rcu_dereference_raw(op->next);
+ };
+ preempt_enable_notrace();
+ trace_recursion_clear(TRACE_INTERNAL_BIT);
+}
+
static void clear_ftrace_swapper(void)
{
struct task_struct *p;
@@ -3144,19 +3807,23 @@ void ftrace_kill(void)
*/
int register_ftrace_function(struct ftrace_ops *ops)
{
- int ret;
-
- if (unlikely(ftrace_disabled))
- return -1;
+ int ret = -1;
mutex_lock(&ftrace_lock);
+ if (unlikely(ftrace_disabled))
+ goto out_unlock;
+
ret = __register_ftrace_function(ops);
- ftrace_startup(0);
+ if (!ret)
+ ret = ftrace_startup(ops, 0);
+
+ out_unlock:
mutex_unlock(&ftrace_lock);
return ret;
}
+EXPORT_SYMBOL_GPL(register_ftrace_function);
/**
* unregister_ftrace_function - unregister a function for profiling.
@@ -3170,25 +3837,27 @@ int unregister_ftrace_function(struct ftrace_ops *ops)
mutex_lock(&ftrace_lock);
ret = __unregister_ftrace_function(ops);
- ftrace_shutdown(0);
+ if (!ret)
+ ftrace_shutdown(ops, 0);
mutex_unlock(&ftrace_lock);
return ret;
}
+EXPORT_SYMBOL_GPL(unregister_ftrace_function);
int
ftrace_enable_sysctl(struct ctl_table *table, int write,
void __user *buffer, size_t *lenp,
loff_t *ppos)
{
- int ret;
-
- if (unlikely(ftrace_disabled))
- return -ENODEV;
+ int ret = -ENODEV;
mutex_lock(&ftrace_lock);
- ret = proc_dointvec(table, write, buffer, lenp, ppos);
+ if (unlikely(ftrace_disabled))
+ goto out;
+
+ ret = proc_dointvec(table, write, buffer, lenp, ppos);
if (ret || !write || (last_ftrace_enabled == !!ftrace_enabled))
goto out;
@@ -3200,11 +3869,11 @@ ftrace_enable_sysctl(struct ctl_table *table, int write,
ftrace_startup_sysctl();
/* we are starting ftrace again */
- if (ftrace_list != &ftrace_list_end) {
- if (ftrace_list->next == &ftrace_list_end)
- ftrace_trace_function = ftrace_list->func;
+ if (ftrace_ops_list != &ftrace_list_end) {
+ if (ftrace_ops_list->next == &ftrace_list_end)
+ ftrace_trace_function = ftrace_ops_list->func;
else
- ftrace_trace_function = ftrace_list_func;
+ ftrace_trace_function = ftrace_ops_list_func;
}
} else {
@@ -3328,7 +3997,7 @@ static int start_graph_tracing(void)
/* The cpu_boot init_task->ret_stack will never be freed */
for_each_online_cpu(cpu) {
if (!idle_task(cpu)->ret_stack)
- ftrace_graph_init_task(idle_task(cpu));
+ ftrace_graph_init_idle_task(idle_task(cpu), cpu);
}
do {
@@ -3393,7 +4062,7 @@ int register_ftrace_graph(trace_func_graph_ret_t retfunc,
ftrace_graph_return = retfunc;
ftrace_graph_entry = entryfunc;
- ftrace_startup(FTRACE_START_FUNC_RET);
+ ret = ftrace_startup(&global_ops, FTRACE_START_FUNC_RET);
out:
mutex_unlock(&ftrace_lock);
@@ -3410,7 +4079,7 @@ void unregister_ftrace_graph(void)
ftrace_graph_active--;
ftrace_graph_return = (trace_func_graph_ret_t)ftrace_stub;
ftrace_graph_entry = ftrace_graph_entry_stub;
- ftrace_shutdown(FTRACE_STOP_FUNC_RET);
+ ftrace_shutdown(&global_ops, FTRACE_STOP_FUNC_RET);
unregister_pm_notifier(&ftrace_suspend_notifier);
unregister_trace_sched_switch(ftrace_graph_probe_sched_switch, NULL);
@@ -3418,6 +4087,49 @@ void unregister_ftrace_graph(void)
mutex_unlock(&ftrace_lock);
}
+static DEFINE_PER_CPU(struct ftrace_ret_stack *, idle_ret_stack);
+
+static void
+graph_init_task(struct task_struct *t, struct ftrace_ret_stack *ret_stack)
+{
+ atomic_set(&t->tracing_graph_pause, 0);
+ atomic_set(&t->trace_overrun, 0);
+ t->ftrace_timestamp = 0;
+ /* make curr_ret_stack visible before we add the ret_stack */
+ smp_wmb();
+ t->ret_stack = ret_stack;
+}
+
+/*
+ * Allocate a return stack for the idle task. May be the first
+ * time through, or it may be done by CPU hotplug online.
+ */
+void ftrace_graph_init_idle_task(struct task_struct *t, int cpu)
+{
+ t->curr_ret_stack = -1;
+ /*
+ * The idle task has no parent, it either has its own
+ * stack or no stack at all.
+ */
+ if (t->ret_stack)
+ WARN_ON(t->ret_stack != per_cpu(idle_ret_stack, cpu));
+
+ if (ftrace_graph_active) {
+ struct ftrace_ret_stack *ret_stack;
+
+ ret_stack = per_cpu(idle_ret_stack, cpu);
+ if (!ret_stack) {
+ ret_stack = kmalloc(FTRACE_RETFUNC_DEPTH
+ * sizeof(struct ftrace_ret_stack),
+ GFP_KERNEL);
+ if (!ret_stack)
+ return;
+ per_cpu(idle_ret_stack, cpu) = ret_stack;
+ }
+ graph_init_task(t, ret_stack);
+ }
+}
+
/* Allocate a return stack for newly created task */
void ftrace_graph_init_task(struct task_struct *t)
{
@@ -3433,12 +4145,7 @@ void ftrace_graph_init_task(struct task_struct *t)
GFP_KERNEL);
if (!ret_stack)
return;
- atomic_set(&t->tracing_graph_pause, 0);
- atomic_set(&t->trace_overrun, 0);
- t->ftrace_timestamp = 0;
- /* make curr_ret_stack visable before we add the ret_stack */
- smp_wmb();
- t->ret_stack = ret_stack;
+ graph_init_task(t, ret_stack);
}
}
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index bd1c35a4fbcc..b0c7aa407943 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -5,7 +5,6 @@
*/
#include <linux/ring_buffer.h>
#include <linux/trace_clock.h>
-#include <linux/ftrace_irq.h>
#include <linux/spinlock.h>
#include <linux/debugfs.h>
#include <linux/uaccess.h>
@@ -669,7 +668,7 @@ static struct list_head *rb_list_head(struct list_head *list)
* the reader page). But if the next page is a header page,
* its flags will be non zero.
*/
-static int inline
+static inline int
rb_is_head_page(struct ring_buffer_per_cpu *cpu_buffer,
struct buffer_page *page, struct list_head *list)
{
@@ -1429,6 +1428,17 @@ int ring_buffer_resize(struct ring_buffer *buffer, unsigned long size)
}
EXPORT_SYMBOL_GPL(ring_buffer_resize);
+void ring_buffer_change_overwrite(struct ring_buffer *buffer, int val)
+{
+ mutex_lock(&buffer->mutex);
+ if (val)
+ buffer->flags |= RB_FL_OVERWRITE;
+ else
+ buffer->flags &= ~RB_FL_OVERWRITE;
+ mutex_unlock(&buffer->mutex);
+}
+EXPORT_SYMBOL_GPL(ring_buffer_change_overwrite);
+
static inline void *
__rb_data_page_index(struct buffer_data_page *bpage, unsigned index)
{
@@ -1468,7 +1478,7 @@ static inline unsigned long rb_page_entries(struct buffer_page *bpage)
return local_read(&bpage->entries) & RB_WRITE_MASK;
}
-/* Size is determined by what has been commited */
+/* Size is determined by what has been committed */
static inline unsigned rb_page_size(struct buffer_page *bpage)
{
return rb_page_commit(bpage);
@@ -2162,11 +2172,19 @@ rb_reserve_next_event(struct ring_buffer *buffer,
if (likely(ts >= cpu_buffer->write_stamp)) {
delta = diff;
if (unlikely(test_time_stamp(delta))) {
+ int local_clock_stable = 1;
+#ifdef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK
+ local_clock_stable = sched_clock_stable;
+#endif
WARN_ONCE(delta > (1ULL << 59),
- KERN_WARNING "Delta way too big! %llu ts=%llu write stamp = %llu\n",
+ KERN_WARNING "Delta way too big! %llu ts=%llu write stamp = %llu\n%s",
(unsigned long long)delta,
(unsigned long long)ts,
- (unsigned long long)cpu_buffer->write_stamp);
+ (unsigned long long)cpu_buffer->write_stamp,
+ local_clock_stable ? "" :
+ "If you just came from a suspend/resume,\n"
+ "please switch to the trace global clock:\n"
+ " echo global > /sys/kernel/debug/tracing/trace_clock\n");
add_timestamp = 1;
}
}
@@ -2198,7 +2216,7 @@ static noinline void trace_recursive_fail(void)
printk_once(KERN_WARNING "Tracing recursion: depth[%ld]:"
"HC[%lu]:SC[%lu]:NMI[%lu]\n",
- current->trace_recursion,
+ trace_recursion_buffer(),
hardirq_count() >> HARDIRQ_SHIFT,
softirq_count() >> SOFTIRQ_SHIFT,
in_nmi());
@@ -2208,9 +2226,9 @@ static noinline void trace_recursive_fail(void)
static inline int trace_recursive_lock(void)
{
- current->trace_recursion++;
+ trace_recursion_inc();
- if (likely(current->trace_recursion < TRACE_RECURSIVE_DEPTH))
+ if (likely(trace_recursion_buffer() < TRACE_RECURSIVE_DEPTH))
return 0;
trace_recursive_fail();
@@ -2220,9 +2238,9 @@ static inline int trace_recursive_lock(void)
static inline void trace_recursive_unlock(void)
{
- WARN_ON_ONCE(!current->trace_recursion);
+ WARN_ON_ONCE(!trace_recursion_buffer());
- current->trace_recursion--;
+ trace_recursion_dec();
}
#else
@@ -2914,7 +2932,7 @@ rb_get_reader_page(struct ring_buffer_per_cpu *cpu_buffer)
/*
* cpu_buffer->pages just needs to point to the buffer, it
* has no specific buffer page to point to. Lets move it out
- * of our way so we don't accidently swap it.
+ * of our way so we don't accidentally swap it.
*/
cpu_buffer->pages = reader->list.prev;
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index dc53ecb80589..ee9c921d7f21 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -41,8 +41,6 @@
#include "trace.h"
#include "trace_output.h"
-#define TRACE_BUFFER_FLAGS (RB_FL_OVERWRITE)
-
/*
* On boot up, the ring buffer is set to the minimum size, so that
* we do not waste memory on systems that are not using tracing.
@@ -340,7 +338,7 @@ static DECLARE_WAIT_QUEUE_HEAD(trace_wait);
/* trace_flags holds trace_options default values */
unsigned long trace_flags = TRACE_ITER_PRINT_PARENT | TRACE_ITER_PRINTK |
TRACE_ITER_ANNOTATE | TRACE_ITER_CONTEXT_INFO | TRACE_ITER_SLEEP_TIME |
- TRACE_ITER_GRAPH_TIME | TRACE_ITER_RECORD_CMD;
+ TRACE_ITER_GRAPH_TIME | TRACE_ITER_RECORD_CMD | TRACE_ITER_OVERWRITE;
static int trace_stop_count;
static DEFINE_SPINLOCK(tracing_start_lock);
@@ -425,6 +423,7 @@ static const char *trace_options[] = {
"sleep-time",
"graph-time",
"record-cmd",
+ "overwrite",
NULL
};
@@ -780,6 +779,11 @@ __acquires(kernel_lock)
tracing_reset_online_cpus(tr);
current_trace = type;
+
+ /* If we expanded the buffers, make sure the max is expanded too */
+ if (ring_buffer_expanded && type->use_max_tr)
+ ring_buffer_resize(max_tr.buffer, trace_buf_size);
+
/* the test is responsible for initializing and enabling */
pr_info("Testing tracer %s: ", type->name);
ret = type->selftest(type, tr);
@@ -792,6 +796,10 @@ __acquires(kernel_lock)
/* Only reset on passing, to avoid touching corrupted buffers */
tracing_reset_online_cpus(tr);
+ /* Shrink the max buffer again */
+ if (ring_buffer_expanded && type->use_max_tr)
+ ring_buffer_resize(max_tr.buffer, 1);
+
printk(KERN_CONT "PASSED\n");
}
#endif
@@ -1102,7 +1110,7 @@ tracing_generic_entry_update(struct trace_entry *entry, unsigned long flags,
entry->preempt_count = pc & 0xff;
entry->pid = (tsk) ? tsk->pid : 0;
- entry->lock_depth = (tsk) ? tsk->lock_depth : 0;
+ entry->padding = 0;
entry->flags =
#ifdef CONFIG_TRACE_IRQFLAGS_SUPPORT
(irqs_disabled_flags(flags) ? TRACE_FLAG_IRQS_OFF : 0) |
@@ -1749,10 +1757,9 @@ static void print_lat_help_header(struct seq_file *m)
seq_puts(m, "# | / _----=> need-resched \n");
seq_puts(m, "# || / _---=> hardirq/softirq \n");
seq_puts(m, "# ||| / _--=> preempt-depth \n");
- seq_puts(m, "# |||| /_--=> lock-depth \n");
- seq_puts(m, "# |||||/ delay \n");
- seq_puts(m, "# cmd pid |||||| time | caller \n");
- seq_puts(m, "# \\ / |||||| \\ | / \n");
+ seq_puts(m, "# |||| / delay \n");
+ seq_puts(m, "# cmd pid ||||| time | caller \n");
+ seq_puts(m, "# \\ / ||||| \\ | / \n");
}
static void print_func_help_header(struct seq_file *m)
@@ -2007,9 +2014,10 @@ enum print_line_t print_trace_line(struct trace_iterator *iter)
{
enum print_line_t ret;
- if (iter->lost_events)
- trace_seq_printf(&iter->seq, "CPU:%d [LOST %lu EVENTS]\n",
- iter->cpu, iter->lost_events);
+ if (iter->lost_events &&
+ !trace_seq_printf(&iter->seq, "CPU:%d [LOST %lu EVENTS]\n",
+ iter->cpu, iter->lost_events))
+ return TRACE_TYPE_PARTIAL_LINE;
if (iter->trace && iter->trace->print_line) {
ret = iter->trace->print_line(iter);
@@ -2529,6 +2537,9 @@ static void set_tracer_flags(unsigned int mask, int enabled)
if (mask == TRACE_ITER_RECORD_CMD)
trace_event_enable_cmd_record(enabled);
+
+ if (mask == TRACE_ITER_OVERWRITE)
+ ring_buffer_change_overwrite(global_trace.buffer, enabled);
}
static ssize_t
@@ -2710,6 +2721,10 @@ tracing_ctrl_write(struct file *filp, const char __user *ubuf,
mutex_lock(&trace_types_lock);
if (tracer_enabled ^ val) {
+
+ /* Only need to warn if this is used to change the state */
+ WARN_ONCE(1, "tracing_enabled is deprecated. Use tracing_on");
+
if (val) {
tracer_enabled = 1;
if (current_trace->start)
@@ -3216,6 +3231,14 @@ waitagain:
if (iter->seq.len >= cnt)
break;
+
+ /*
+ * Setting the full flag means we reached the trace_seq buffer
+ * size and we should leave by partial output condition above.
+ * One of the trace_seq_* functions is not used properly.
+ */
+ WARN_ONCE(iter->seq.full, "full flag set for trace type %d",
+ iter->ent->type);
}
trace_access_unlock(iter->cpu_file);
trace_event_read_unlock();
@@ -3226,7 +3249,7 @@ waitagain:
trace_seq_init(&iter->seq);
/*
- * If there was nothing to send to user, inspite of consuming trace
+ * If there was nothing to send to user, in spite of consuming trace
* entries, go back to wait for more entries.
*/
if (sret == -EBUSY)
@@ -4551,9 +4574,11 @@ void ftrace_dump(enum ftrace_dump_mode oops_dump_mode)
__init static int tracer_alloc_buffers(void)
{
int ring_buf_size;
+ enum ring_buffer_flags rb_flags;
int i;
int ret = -ENOMEM;
+
if (!alloc_cpumask_var(&tracing_buffer_mask, GFP_KERNEL))
goto out;
@@ -4566,12 +4591,13 @@ __init static int tracer_alloc_buffers(void)
else
ring_buf_size = 1;
+ rb_flags = trace_flags & TRACE_ITER_OVERWRITE ? RB_FL_OVERWRITE : 0;
+
cpumask_copy(tracing_buffer_mask, cpu_possible_mask);
cpumask_copy(tracing_cpumask, cpu_all_mask);
/* TODO: make the number of buffers hot pluggable with CPUS */
- global_trace.buffer = ring_buffer_alloc(ring_buf_size,
- TRACE_BUFFER_FLAGS);
+ global_trace.buffer = ring_buffer_alloc(ring_buf_size, rb_flags);
if (!global_trace.buffer) {
printk(KERN_ERR "tracer: failed to allocate ring buffer!\n");
WARN_ON(1);
@@ -4581,7 +4607,7 @@ __init static int tracer_alloc_buffers(void)
#ifdef CONFIG_TRACER_MAX_TRACE
- max_tr.buffer = ring_buffer_alloc(1, TRACE_BUFFER_FLAGS);
+ max_tr.buffer = ring_buffer_alloc(1, rb_flags);
if (!max_tr.buffer) {
printk(KERN_ERR "tracer: failed to allocate max ring buffer!\n");
WARN_ON(1);
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 9021f8c0c0c3..229f8591f61d 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -272,8 +272,8 @@ struct tracer {
/* If you handled the flag setting, return 0 */
int (*set_flag)(u32 old_flags, u32 bit, int set);
struct tracer *next;
- int print_max;
struct tracer_flags *flags;
+ int print_max;
int use_max_tr;
};
@@ -419,6 +419,8 @@ extern void trace_find_cmdline(int pid, char comm[]);
extern unsigned long ftrace_update_tot_cnt;
#define DYN_FTRACE_TEST_NAME trace_selftest_dynamic_test_func
extern int DYN_FTRACE_TEST_NAME(void);
+#define DYN_FTRACE_TEST_NAME2 trace_selftest_dynamic_test_func2
+extern int DYN_FTRACE_TEST_NAME2(void);
#endif
extern int ring_buffer_expanded;
@@ -606,6 +608,7 @@ enum trace_iterator_flags {
TRACE_ITER_SLEEP_TIME = 0x40000,
TRACE_ITER_GRAPH_TIME = 0x80000,
TRACE_ITER_RECORD_CMD = 0x100000,
+ TRACE_ITER_OVERWRITE = 0x200000,
};
/*
@@ -661,8 +664,10 @@ struct ftrace_event_field {
};
struct event_filter {
- int n_preds;
- struct filter_pred **preds;
+ int n_preds; /* Number assigned */
+ int a_preds; /* allocated */
+ struct filter_pred *preds;
+ struct filter_pred *root;
char *filter_string;
};
@@ -674,11 +679,23 @@ struct event_subsystem {
int nr_events;
};
+#define FILTER_PRED_INVALID ((unsigned short)-1)
+#define FILTER_PRED_IS_RIGHT (1 << 15)
+#define FILTER_PRED_FOLD (1 << 15)
+
+/*
+ * The max preds is the size of unsigned short with
+ * two flags at the MSBs. One bit is used for both the IS_RIGHT
+ * and FOLD flags. The other is reserved.
+ *
+ * 2^14 preds is way more than enough.
+ */
+#define MAX_FILTER_PRED 16384
+
struct filter_pred;
struct regex;
-typedef int (*filter_pred_fn_t) (struct filter_pred *pred, void *event,
- int val1, int val2);
+typedef int (*filter_pred_fn_t) (struct filter_pred *pred, void *event);
typedef int (*regex_match_func)(char *str, struct regex *r, int len);
@@ -700,11 +717,23 @@ struct filter_pred {
filter_pred_fn_t fn;
u64 val;
struct regex regex;
- char *field_name;
+ /*
+ * Leaf nodes use field_name, ops is used by AND and OR
+ * nodes. The field_name is always freed when freeing a pred.
+ * We can overload field_name for ops and have it freed
+ * as well.
+ */
+ union {
+ char *field_name;
+ unsigned short *ops;
+ };
int offset;
int not;
int op;
- int pop_n;
+ unsigned short index;
+ unsigned short parent;
+ unsigned short left;
+ unsigned short right;
};
extern struct list_head ftrace_common_fields;
@@ -755,4 +784,19 @@ extern const char *__stop___trace_bprintk_fmt[];
FTRACE_ENTRY(call, struct_name, id, PARAMS(tstruct), PARAMS(print))
#include "trace_entries.h"
+/* Only current can touch trace_recursion */
+#define trace_recursion_inc() do { (current)->trace_recursion++; } while (0)
+#define trace_recursion_dec() do { (current)->trace_recursion--; } while (0)
+
+/* Ring buffer has the 10 LSB bits to count */
+#define trace_recursion_buffer() ((current)->trace_recursion & 0x3ff)
+
+/* for function tracing recursion */
+#define TRACE_INTERNAL_BIT (1<<11)
+#define TRACE_GLOBAL_BIT (1<<12)
+
+#define trace_recursion_set(bit) do { (current)->trace_recursion |= (bit); } while (0)
+#define trace_recursion_clear(bit) do { (current)->trace_recursion &= ~(bit); } while (0)
+#define trace_recursion_test(bit) ((current)->trace_recursion & (bit))
+
#endif /* _LINUX_KERNEL_TRACE_H */
diff --git a/kernel/trace/trace_clock.c b/kernel/trace/trace_clock.c
index 685a67d55db0..6302747a1398 100644
--- a/kernel/trace/trace_clock.c
+++ b/kernel/trace/trace_clock.c
@@ -46,7 +46,7 @@ u64 notrace trace_clock_local(void)
}
/*
- * trace_clock(): 'inbetween' trace clock. Not completely serialized,
+ * trace_clock(): 'between' trace clock. Not completely serialized,
* but not completely incorrect when crossing CPUs either.
*
* This is based on cpu_clock(), which will allow at most ~1 jiffy of
diff --git a/kernel/trace/trace_entries.h b/kernel/trace/trace_entries.h
index 6cf223764be8..e32744c84d94 100644
--- a/kernel/trace/trace_entries.h
+++ b/kernel/trace/trace_entries.h
@@ -27,7 +27,7 @@
* in the structure.
*
* * for structures within structures, the format of the internal
- * structure is layed out. This allows the internal structure
+ * structure is laid out. This allows the internal structure
* to be deciphered for the format file. Although these macros
* may become out of sync with the internal structure, they
* will create a compile error if it happens. Since the
@@ -109,12 +109,12 @@ FTRACE_ENTRY(funcgraph_exit, ftrace_graph_ret_entry,
*/
#define FTRACE_CTX_FIELDS \
__field( unsigned int, prev_pid ) \
+ __field( unsigned int, next_pid ) \
+ __field( unsigned int, next_cpu ) \
__field( unsigned char, prev_prio ) \
__field( unsigned char, prev_state ) \
- __field( unsigned int, next_pid ) \
__field( unsigned char, next_prio ) \
- __field( unsigned char, next_state ) \
- __field( unsigned int, next_cpu )
+ __field( unsigned char, next_state )
FTRACE_ENTRY(context_switch, ctx_switch_entry,
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index 5f499e0438a4..686ec399f2a8 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -116,7 +116,7 @@ static int trace_define_common_fields(void)
__common_field(unsigned char, flags);
__common_field(unsigned char, preempt_count);
__common_field(int, pid);
- __common_field(int, lock_depth);
+ __common_field(int, padding);
return ret;
}
@@ -326,6 +326,7 @@ int trace_set_clr_event(const char *system, const char *event, int set)
{
return __ftrace_set_clr_event(NULL, system, event, set);
}
+EXPORT_SYMBOL_GPL(trace_set_clr_event);
/* 128 should be much more than enough */
#define EVENT_BUF_SIZE 127
@@ -1656,7 +1657,12 @@ static struct ftrace_ops trace_ops __initdata =
static __init void event_trace_self_test_with_function(void)
{
- register_ftrace_function(&trace_ops);
+ int ret;
+ ret = register_ftrace_function(&trace_ops);
+ if (WARN_ON(ret < 0)) {
+ pr_info("Failed to enable function tracer for event tests\n");
+ return;
+ }
pr_info("Running tests again, along with the function tracer\n");
event_trace_self_tests();
unregister_ftrace_function(&trace_ops);
diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c
index 36d40104b17f..8008ddcfbf20 100644
--- a/kernel/trace/trace_events_filter.c
+++ b/kernel/trace/trace_events_filter.c
@@ -123,9 +123,13 @@ struct filter_parse_state {
} operand;
};
+struct pred_stack {
+ struct filter_pred **preds;
+ int index;
+};
+
#define DEFINE_COMPARISON_PRED(type) \
-static int filter_pred_##type(struct filter_pred *pred, void *event, \
- int val1, int val2) \
+static int filter_pred_##type(struct filter_pred *pred, void *event) \
{ \
type *addr = (type *)(event + pred->offset); \
type val = (type)pred->val; \
@@ -152,8 +156,7 @@ static int filter_pred_##type(struct filter_pred *pred, void *event, \
}
#define DEFINE_EQUALITY_PRED(size) \
-static int filter_pred_##size(struct filter_pred *pred, void *event, \
- int val1, int val2) \
+static int filter_pred_##size(struct filter_pred *pred, void *event) \
{ \
u##size *addr = (u##size *)(event + pred->offset); \
u##size val = (u##size)pred->val; \
@@ -178,23 +181,8 @@ DEFINE_EQUALITY_PRED(32);
DEFINE_EQUALITY_PRED(16);
DEFINE_EQUALITY_PRED(8);
-static int filter_pred_and(struct filter_pred *pred __attribute((unused)),
- void *event __attribute((unused)),
- int val1, int val2)
-{
- return val1 && val2;
-}
-
-static int filter_pred_or(struct filter_pred *pred __attribute((unused)),
- void *event __attribute((unused)),
- int val1, int val2)
-{
- return val1 || val2;
-}
-
/* Filter predicate for fixed sized arrays of characters */
-static int filter_pred_string(struct filter_pred *pred, void *event,
- int val1, int val2)
+static int filter_pred_string(struct filter_pred *pred, void *event)
{
char *addr = (char *)(event + pred->offset);
int cmp, match;
@@ -207,8 +195,7 @@ static int filter_pred_string(struct filter_pred *pred, void *event,
}
/* Filter predicate for char * pointers */
-static int filter_pred_pchar(struct filter_pred *pred, void *event,
- int val1, int val2)
+static int filter_pred_pchar(struct filter_pred *pred, void *event)
{
char **addr = (char **)(event + pred->offset);
int cmp, match;
@@ -231,8 +218,7 @@ static int filter_pred_pchar(struct filter_pred *pred, void *event,
* and add it to the address of the entry, and at last we have
* the address of the string.
*/
-static int filter_pred_strloc(struct filter_pred *pred, void *event,
- int val1, int val2)
+static int filter_pred_strloc(struct filter_pred *pred, void *event)
{
u32 str_item = *(u32 *)(event + pred->offset);
int str_loc = str_item & 0xffff;
@@ -247,8 +233,7 @@ static int filter_pred_strloc(struct filter_pred *pred, void *event,
return match;
}
-static int filter_pred_none(struct filter_pred *pred, void *event,
- int val1, int val2)
+static int filter_pred_none(struct filter_pred *pred, void *event)
{
return 0;
}
@@ -377,32 +362,147 @@ static void filter_build_regex(struct filter_pred *pred)
pred->not ^= not;
}
+enum move_type {
+ MOVE_DOWN,
+ MOVE_UP_FROM_LEFT,
+ MOVE_UP_FROM_RIGHT
+};
+
+static struct filter_pred *
+get_pred_parent(struct filter_pred *pred, struct filter_pred *preds,
+ int index, enum move_type *move)
+{
+ if (pred->parent & FILTER_PRED_IS_RIGHT)
+ *move = MOVE_UP_FROM_RIGHT;
+ else
+ *move = MOVE_UP_FROM_LEFT;
+ pred = &preds[pred->parent & ~FILTER_PRED_IS_RIGHT];
+
+ return pred;
+}
+
+/*
+ * A series of AND or ORs where found together. Instead of
+ * climbing up and down the tree branches, an array of the
+ * ops were made in order of checks. We can just move across
+ * the array and short circuit if needed.
+ */
+static int process_ops(struct filter_pred *preds,
+ struct filter_pred *op, void *rec)
+{
+ struct filter_pred *pred;
+ int match = 0;
+ int type;
+ int i;
+
+ /*
+ * Micro-optimization: We set type to true if op
+ * is an OR and false otherwise (AND). Then we
+ * just need to test if the match is equal to
+ * the type, and if it is, we can short circuit the
+ * rest of the checks:
+ *
+ * if ((match && op->op == OP_OR) ||
+ * (!match && op->op == OP_AND))
+ * return match;
+ */
+ type = op->op == OP_OR;
+
+ for (i = 0; i < op->val; i++) {
+ pred = &preds[op->ops[i]];
+ match = pred->fn(pred, rec);
+ if (!!match == type)
+ return match;
+ }
+ return match;
+}
+
/* return 1 if event matches, 0 otherwise (discard) */
int filter_match_preds(struct event_filter *filter, void *rec)
{
- int match, top = 0, val1 = 0, val2 = 0;
- int stack[MAX_FILTER_PRED];
+ int match = -1;
+ enum move_type move = MOVE_DOWN;
+ struct filter_pred *preds;
struct filter_pred *pred;
- int i;
+ struct filter_pred *root;
+ int n_preds;
+ int done = 0;
+
+ /* no filter is considered a match */
+ if (!filter)
+ return 1;
+
+ n_preds = filter->n_preds;
+
+ if (!n_preds)
+ return 1;
+
+ /*
+ * n_preds, root and filter->preds are protect with preemption disabled.
+ */
+ preds = rcu_dereference_sched(filter->preds);
+ root = rcu_dereference_sched(filter->root);
+ if (!root)
+ return 1;
+
+ pred = root;
- for (i = 0; i < filter->n_preds; i++) {
- pred = filter->preds[i];
- if (!pred->pop_n) {
- match = pred->fn(pred, rec, val1, val2);
- stack[top++] = match;
+ /* match is currently meaningless */
+ match = -1;
+
+ do {
+ switch (move) {
+ case MOVE_DOWN:
+ /* only AND and OR have children */
+ if (pred->left != FILTER_PRED_INVALID) {
+ /* If ops is set, then it was folded. */
+ if (!pred->ops) {
+ /* keep going to down the left side */
+ pred = &preds[pred->left];
+ continue;
+ }
+ /* We can treat folded ops as a leaf node */
+ match = process_ops(preds, pred, rec);
+ } else
+ match = pred->fn(pred, rec);
+ /* If this pred is the only pred */
+ if (pred == root)
+ break;
+ pred = get_pred_parent(pred, preds,
+ pred->parent, &move);
+ continue;
+ case MOVE_UP_FROM_LEFT:
+ /*
+ * Check for short circuits.
+ *
+ * Optimization: !!match == (pred->op == OP_OR)
+ * is the same as:
+ * if ((match && pred->op == OP_OR) ||
+ * (!match && pred->op == OP_AND))
+ */
+ if (!!match == (pred->op == OP_OR)) {
+ if (pred == root)
+ break;
+ pred = get_pred_parent(pred, preds,
+ pred->parent, &move);
+ continue;
+ }
+ /* now go down the right side of the tree. */
+ pred = &preds[pred->right];
+ move = MOVE_DOWN;
+ continue;
+ case MOVE_UP_FROM_RIGHT:
+ /* We finished this equation. */
+ if (pred == root)
+ break;
+ pred = get_pred_parent(pred, preds,
+ pred->parent, &move);
continue;
}
- if (pred->pop_n > top) {
- WARN_ON_ONCE(1);
- return 0;
- }
- val1 = stack[--top];
- val2 = stack[--top];
- match = pred->fn(pred, rec, val1, val2);
- stack[top++] = match;
- }
+ done = 1;
+ } while (!done);
- return stack[--top];
+ return match;
}
EXPORT_SYMBOL_GPL(filter_match_preds);
@@ -414,6 +514,9 @@ static void parse_error(struct filter_parse_state *ps, int err, int pos)
static void remove_filter_string(struct event_filter *filter)
{
+ if (!filter)
+ return;
+
kfree(filter->filter_string);
filter->filter_string = NULL;
}
@@ -473,9 +576,10 @@ static void append_filter_err(struct filter_parse_state *ps,
void print_event_filter(struct ftrace_event_call *call, struct trace_seq *s)
{
- struct event_filter *filter = call->filter;
+ struct event_filter *filter;
mutex_lock(&event_mutex);
+ filter = call->filter;
if (filter && filter->filter_string)
trace_seq_printf(s, "%s\n", filter->filter_string);
else
@@ -486,9 +590,10 @@ void print_event_filter(struct ftrace_event_call *call, struct trace_seq *s)
void print_subsystem_event_filter(struct event_subsystem *system,
struct trace_seq *s)
{
- struct event_filter *filter = system->filter;
+ struct event_filter *filter;
mutex_lock(&event_mutex);
+ filter = system->filter;
if (filter && filter->filter_string)
trace_seq_printf(s, "%s\n", filter->filter_string);
else
@@ -539,10 +644,58 @@ static void filter_clear_pred(struct filter_pred *pred)
pred->regex.len = 0;
}
-static int filter_set_pred(struct filter_pred *dest,
+static int __alloc_pred_stack(struct pred_stack *stack, int n_preds)
+{
+ stack->preds = kzalloc(sizeof(*stack->preds)*(n_preds + 1), GFP_KERNEL);
+ if (!stack->preds)
+ return -ENOMEM;
+ stack->index = n_preds;
+ return 0;
+}
+
+static void __free_pred_stack(struct pred_stack *stack)
+{
+ kfree(stack->preds);
+ stack->index = 0;
+}
+
+static int __push_pred_stack(struct pred_stack *stack,
+ struct filter_pred *pred)
+{
+ int index = stack->index;
+
+ if (WARN_ON(index == 0))
+ return -ENOSPC;
+
+ stack->preds[--index] = pred;
+ stack->index = index;
+ return 0;
+}
+
+static struct filter_pred *
+__pop_pred_stack(struct pred_stack *stack)
+{
+ struct filter_pred *pred;
+ int index = stack->index;
+
+ pred = stack->preds[index++];
+ if (!pred)
+ return NULL;
+
+ stack->index = index;
+ return pred;
+}
+
+static int filter_set_pred(struct event_filter *filter,
+ int idx,
+ struct pred_stack *stack,
struct filter_pred *src,
filter_pred_fn_t fn)
{
+ struct filter_pred *dest = &filter->preds[idx];
+ struct filter_pred *left;
+ struct filter_pred *right;
+
*dest = *src;
if (src->field_name) {
dest->field_name = kstrdup(src->field_name, GFP_KERNEL);
@@ -550,116 +703,140 @@ static int filter_set_pred(struct filter_pred *dest,
return -ENOMEM;
}
dest->fn = fn;
+ dest->index = idx;
- return 0;
+ if (dest->op == OP_OR || dest->op == OP_AND) {
+ right = __pop_pred_stack(stack);
+ left = __pop_pred_stack(stack);
+ if (!left || !right)
+ return -EINVAL;
+ /*
+ * If both children can be folded
+ * and they are the same op as this op or a leaf,
+ * then this op can be folded.
+ */
+ if (left->index & FILTER_PRED_FOLD &&
+ (left->op == dest->op ||
+ left->left == FILTER_PRED_INVALID) &&
+ right->index & FILTER_PRED_FOLD &&
+ (right->op == dest->op ||
+ right->left == FILTER_PRED_INVALID))
+ dest->index |= FILTER_PRED_FOLD;
+
+ dest->left = left->index & ~FILTER_PRED_FOLD;
+ dest->right = right->index & ~FILTER_PRED_FOLD;
+ left->parent = dest->index & ~FILTER_PRED_FOLD;
+ right->parent = dest->index | FILTER_PRED_IS_RIGHT;
+ } else {
+ /*
+ * Make dest->left invalid to be used as a quick
+ * way to know this is a leaf node.
+ */
+ dest->left = FILTER_PRED_INVALID;
+
+ /* All leafs allow folding the parent ops. */
+ dest->index |= FILTER_PRED_FOLD;
+ }
+
+ return __push_pred_stack(stack, dest);
}
-static void filter_disable_preds(struct ftrace_event_call *call)
+static void __free_preds(struct event_filter *filter)
{
- struct event_filter *filter = call->filter;
int i;
- call->flags &= ~TRACE_EVENT_FL_FILTERED;
+ if (filter->preds) {
+ for (i = 0; i < filter->a_preds; i++)
+ kfree(filter->preds[i].field_name);
+ kfree(filter->preds);
+ filter->preds = NULL;
+ }
+ filter->a_preds = 0;
filter->n_preds = 0;
-
- for (i = 0; i < MAX_FILTER_PRED; i++)
- filter->preds[i]->fn = filter_pred_none;
}
-static void __free_preds(struct event_filter *filter)
+static void filter_disable(struct ftrace_event_call *call)
{
- int i;
+ call->flags &= ~TRACE_EVENT_FL_FILTERED;
+}
+static void __free_filter(struct event_filter *filter)
+{
if (!filter)
return;
- for (i = 0; i < MAX_FILTER_PRED; i++) {
- if (filter->preds[i])
- filter_free_pred(filter->preds[i]);
- }
- kfree(filter->preds);
+ __free_preds(filter);
kfree(filter->filter_string);
kfree(filter);
}
+/*
+ * Called when destroying the ftrace_event_call.
+ * The call is being freed, so we do not need to worry about
+ * the call being currently used. This is for module code removing
+ * the tracepoints from within it.
+ */
void destroy_preds(struct ftrace_event_call *call)
{
- __free_preds(call->filter);
+ __free_filter(call->filter);
call->filter = NULL;
- call->flags &= ~TRACE_EVENT_FL_FILTERED;
}
-static struct event_filter *__alloc_preds(void)
+static struct event_filter *__alloc_filter(void)
{
struct event_filter *filter;
+
+ filter = kzalloc(sizeof(*filter), GFP_KERNEL);
+ return filter;
+}
+
+static int __alloc_preds(struct event_filter *filter, int n_preds)
+{
struct filter_pred *pred;
int i;
- filter = kzalloc(sizeof(*filter), GFP_KERNEL);
- if (!filter)
- return ERR_PTR(-ENOMEM);
+ if (filter->preds)
+ __free_preds(filter);
- filter->n_preds = 0;
+ filter->preds =
+ kzalloc(sizeof(*filter->preds) * n_preds, GFP_KERNEL);
- filter->preds = kzalloc(MAX_FILTER_PRED * sizeof(pred), GFP_KERNEL);
if (!filter->preds)
- goto oom;
+ return -ENOMEM;
- for (i = 0; i < MAX_FILTER_PRED; i++) {
- pred = kzalloc(sizeof(*pred), GFP_KERNEL);
- if (!pred)
- goto oom;
+ filter->a_preds = n_preds;
+ filter->n_preds = 0;
+
+ for (i = 0; i < n_preds; i++) {
+ pred = &filter->preds[i];
pred->fn = filter_pred_none;
- filter->preds[i] = pred;
}
- return filter;
-
-oom:
- __free_preds(filter);
- return ERR_PTR(-ENOMEM);
-}
-
-static int init_preds(struct ftrace_event_call *call)
-{
- if (call->filter)
- return 0;
-
- call->flags &= ~TRACE_EVENT_FL_FILTERED;
- call->filter = __alloc_preds();
- if (IS_ERR(call->filter))
- return PTR_ERR(call->filter);
-
return 0;
}
-static int init_subsystem_preds(struct event_subsystem *system)
+static void filter_free_subsystem_preds(struct event_subsystem *system)
{
struct ftrace_event_call *call;
- int err;
list_for_each_entry(call, &ftrace_events, list) {
if (strcmp(call->class->system, system->name) != 0)
continue;
- err = init_preds(call);
- if (err)
- return err;
+ filter_disable(call);
+ remove_filter_string(call->filter);
}
-
- return 0;
}
-static void filter_free_subsystem_preds(struct event_subsystem *system)
+static void filter_free_subsystem_filters(struct event_subsystem *system)
{
struct ftrace_event_call *call;
list_for_each_entry(call, &ftrace_events, list) {
if (strcmp(call->class->system, system->name) != 0)
continue;
-
- filter_disable_preds(call);
- remove_filter_string(call->filter);
+ __free_filter(call->filter);
+ call->filter = NULL;
}
}
@@ -667,18 +844,19 @@ static int filter_add_pred_fn(struct filter_parse_state *ps,
struct ftrace_event_call *call,
struct event_filter *filter,
struct filter_pred *pred,
+ struct pred_stack *stack,
filter_pred_fn_t fn)
{
int idx, err;
- if (filter->n_preds == MAX_FILTER_PRED) {
+ if (WARN_ON(filter->n_preds == filter->a_preds)) {
parse_error(ps, FILT_ERR_TOO_MANY_PREDS, 0);
return -ENOSPC;
}
idx = filter->n_preds;
- filter_clear_pred(filter->preds[idx]);
- err = filter_set_pred(filter->preds[idx], pred, fn);
+ filter_clear_pred(&filter->preds[idx]);
+ err = filter_set_pred(filter, idx, stack, pred, fn);
if (err)
return err;
@@ -763,6 +941,7 @@ static int filter_add_pred(struct filter_parse_state *ps,
struct ftrace_event_call *call,
struct event_filter *filter,
struct filter_pred *pred,
+ struct pred_stack *stack,
bool dry_run)
{
struct ftrace_event_field *field;
@@ -770,17 +949,12 @@ static int filter_add_pred(struct filter_parse_state *ps,
unsigned long long val;
int ret;
- pred->fn = filter_pred_none;
+ fn = pred->fn = filter_pred_none;
- if (pred->op == OP_AND) {
- pred->pop_n = 2;
- fn = filter_pred_and;
+ if (pred->op == OP_AND)
goto add_pred_fn;
- } else if (pred->op == OP_OR) {
- pred->pop_n = 2;
- fn = filter_pred_or;
+ else if (pred->op == OP_OR)
goto add_pred_fn;
- }
field = find_event_field(call, pred->field_name);
if (!field) {
@@ -829,7 +1003,7 @@ static int filter_add_pred(struct filter_parse_state *ps,
add_pred_fn:
if (!dry_run)
- return filter_add_pred_fn(ps, call, filter, pred, fn);
+ return filter_add_pred_fn(ps, call, filter, pred, stack, fn);
return 0;
}
@@ -1187,6 +1361,234 @@ static int check_preds(struct filter_parse_state *ps)
return 0;
}
+static int count_preds(struct filter_parse_state *ps)
+{
+ struct postfix_elt *elt;
+ int n_preds = 0;
+
+ list_for_each_entry(elt, &ps->postfix, list) {
+ if (elt->op == OP_NONE)
+ continue;
+ n_preds++;
+ }
+
+ return n_preds;
+}
+
+/*
+ * The tree is walked at filtering of an event. If the tree is not correctly
+ * built, it may cause an infinite loop. Check here that the tree does
+ * indeed terminate.
+ */
+static int check_pred_tree(struct event_filter *filter,
+ struct filter_pred *root)
+{
+ struct filter_pred *preds;
+ struct filter_pred *pred;
+ enum move_type move = MOVE_DOWN;
+ int count = 0;
+ int done = 0;
+ int max;
+
+ /*
+ * The max that we can hit a node is three times.
+ * Once going down, once coming up from left, and
+ * once coming up from right. This is more than enough
+ * since leafs are only hit a single time.
+ */
+ max = 3 * filter->n_preds;
+
+ preds = filter->preds;
+ if (!preds)
+ return -EINVAL;
+ pred = root;
+
+ do {
+ if (WARN_ON(count++ > max))
+ return -EINVAL;
+
+ switch (move) {
+ case MOVE_DOWN:
+ if (pred->left != FILTER_PRED_INVALID) {
+ pred = &preds[pred->left];
+ continue;
+ }
+ /* A leaf at the root is just a leaf in the tree */
+ if (pred == root)
+ break;
+ pred = get_pred_parent(pred, preds,
+ pred->parent, &move);
+ continue;
+ case MOVE_UP_FROM_LEFT:
+ pred = &preds[pred->right];
+ move = MOVE_DOWN;
+ continue;
+ case MOVE_UP_FROM_RIGHT:
+ if (pred == root)
+ break;
+ pred = get_pred_parent(pred, preds,
+ pred->parent, &move);
+ continue;
+ }
+ done = 1;
+ } while (!done);
+
+ /* We are fine. */
+ return 0;
+}
+
+static int count_leafs(struct filter_pred *preds, struct filter_pred *root)
+{
+ struct filter_pred *pred;
+ enum move_type move = MOVE_DOWN;
+ int count = 0;
+ int done = 0;
+
+ pred = root;
+
+ do {
+ switch (move) {
+ case MOVE_DOWN:
+ if (pred->left != FILTER_PRED_INVALID) {
+ pred = &preds[pred->left];
+ continue;
+ }
+ /* A leaf at the root is just a leaf in the tree */
+ if (pred == root)
+ return 1;
+ count++;
+ pred = get_pred_parent(pred, preds,
+ pred->parent, &move);
+ continue;
+ case MOVE_UP_FROM_LEFT:
+ pred = &preds[pred->right];
+ move = MOVE_DOWN;
+ continue;
+ case MOVE_UP_FROM_RIGHT:
+ if (pred == root)
+ break;
+ pred = get_pred_parent(pred, preds,
+ pred->parent, &move);
+ continue;
+ }
+ done = 1;
+ } while (!done);
+
+ return count;
+}
+
+static int fold_pred(struct filter_pred *preds, struct filter_pred *root)
+{
+ struct filter_pred *pred;
+ enum move_type move = MOVE_DOWN;
+ int count = 0;
+ int children;
+ int done = 0;
+
+ /* No need to keep the fold flag */
+ root->index &= ~FILTER_PRED_FOLD;
+
+ /* If the root is a leaf then do nothing */
+ if (root->left == FILTER_PRED_INVALID)
+ return 0;
+
+ /* count the children */
+ children = count_leafs(preds, &preds[root->left]);
+ children += count_leafs(preds, &preds[root->right]);
+
+ root->ops = kzalloc(sizeof(*root->ops) * children, GFP_KERNEL);
+ if (!root->ops)
+ return -ENOMEM;
+
+ root->val = children;
+
+ pred = root;
+ do {
+ switch (move) {
+ case MOVE_DOWN:
+ if (pred->left != FILTER_PRED_INVALID) {
+ pred = &preds[pred->left];
+ continue;
+ }
+ if (WARN_ON(count == children))
+ return -EINVAL;
+ pred->index &= ~FILTER_PRED_FOLD;
+ root->ops[count++] = pred->index;
+ pred = get_pred_parent(pred, preds,
+ pred->parent, &move);
+ continue;
+ case MOVE_UP_FROM_LEFT:
+ pred = &preds[pred->right];
+ move = MOVE_DOWN;
+ continue;
+ case MOVE_UP_FROM_RIGHT:
+ if (pred == root)
+ break;
+ pred = get_pred_parent(pred, preds,
+ pred->parent, &move);
+ continue;
+ }
+ done = 1;
+ } while (!done);
+
+ return 0;
+}
+
+/*
+ * To optimize the processing of the ops, if we have several "ors" or
+ * "ands" together, we can put them in an array and process them all
+ * together speeding up the filter logic.
+ */
+static int fold_pred_tree(struct event_filter *filter,
+ struct filter_pred *root)
+{
+ struct filter_pred *preds;
+ struct filter_pred *pred;
+ enum move_type move = MOVE_DOWN;
+ int done = 0;
+ int err;
+
+ preds = filter->preds;
+ if (!preds)
+ return -EINVAL;
+ pred = root;
+
+ do {
+ switch (move) {
+ case MOVE_DOWN:
+ if (pred->index & FILTER_PRED_FOLD) {
+ err = fold_pred(preds, pred);
+ if (err)
+ return err;
+ /* Folded nodes are like leafs */
+ } else if (pred->left != FILTER_PRED_INVALID) {
+ pred = &preds[pred->left];
+ continue;
+ }
+
+ /* A leaf at the root is just a leaf in the tree */
+ if (pred == root)
+ break;
+ pred = get_pred_parent(pred, preds,
+ pred->parent, &move);
+ continue;
+ case MOVE_UP_FROM_LEFT:
+ pred = &preds[pred->right];
+ move = MOVE_DOWN;
+ continue;
+ case MOVE_UP_FROM_RIGHT:
+ if (pred == root)
+ break;
+ pred = get_pred_parent(pred, preds,
+ pred->parent, &move);
+ continue;
+ }
+ done = 1;
+ } while (!done);
+
+ return 0;
+}
+
static int replace_preds(struct ftrace_event_call *call,
struct event_filter *filter,
struct filter_parse_state *ps,
@@ -1195,14 +1597,32 @@ static int replace_preds(struct ftrace_event_call *call,
{
char *operand1 = NULL, *operand2 = NULL;
struct filter_pred *pred;
+ struct filter_pred *root;
struct postfix_elt *elt;
+ struct pred_stack stack = { }; /* init to NULL */
int err;
int n_preds = 0;
+ n_preds = count_preds(ps);
+ if (n_preds >= MAX_FILTER_PRED) {
+ parse_error(ps, FILT_ERR_TOO_MANY_PREDS, 0);
+ return -ENOSPC;
+ }
+
err = check_preds(ps);
if (err)
return err;
+ if (!dry_run) {
+ err = __alloc_pred_stack(&stack, n_preds);
+ if (err)
+ return err;
+ err = __alloc_preds(filter, n_preds);
+ if (err)
+ goto fail;
+ }
+
+ n_preds = 0;
list_for_each_entry(elt, &ps->postfix, list) {
if (elt->op == OP_NONE) {
if (!operand1)
@@ -1211,14 +1631,16 @@ static int replace_preds(struct ftrace_event_call *call,
operand2 = elt->operand;
else {
parse_error(ps, FILT_ERR_TOO_MANY_OPERANDS, 0);
- return -EINVAL;
+ err = -EINVAL;
+ goto fail;
}
continue;
}
- if (n_preds++ == MAX_FILTER_PRED) {
+ if (WARN_ON(n_preds++ == MAX_FILTER_PRED)) {
parse_error(ps, FILT_ERR_TOO_MANY_PREDS, 0);
- return -ENOSPC;
+ err = -ENOSPC;
+ goto fail;
}
if (elt->op == OP_AND || elt->op == OP_OR) {
@@ -1228,76 +1650,181 @@ static int replace_preds(struct ftrace_event_call *call,
if (!operand1 || !operand2) {
parse_error(ps, FILT_ERR_MISSING_FIELD, 0);
- return -EINVAL;
+ err = -EINVAL;
+ goto fail;
}
pred = create_pred(elt->op, operand1, operand2);
add_pred:
- if (!pred)
- return -ENOMEM;
- err = filter_add_pred(ps, call, filter, pred, dry_run);
+ if (!pred) {
+ err = -ENOMEM;
+ goto fail;
+ }
+ err = filter_add_pred(ps, call, filter, pred, &stack, dry_run);
filter_free_pred(pred);
if (err)
- return err;
+ goto fail;
operand1 = operand2 = NULL;
}
- return 0;
+ if (!dry_run) {
+ /* We should have one item left on the stack */
+ pred = __pop_pred_stack(&stack);
+ if (!pred)
+ return -EINVAL;
+ /* This item is where we start from in matching */
+ root = pred;
+ /* Make sure the stack is empty */
+ pred = __pop_pred_stack(&stack);
+ if (WARN_ON(pred)) {
+ err = -EINVAL;
+ filter->root = NULL;
+ goto fail;
+ }
+ err = check_pred_tree(filter, root);
+ if (err)
+ goto fail;
+
+ /* Optimize the tree */
+ err = fold_pred_tree(filter, root);
+ if (err)
+ goto fail;
+
+ /* We don't set root until we know it works */
+ barrier();
+ filter->root = root;
+ }
+
+ err = 0;
+fail:
+ __free_pred_stack(&stack);
+ return err;
}
+struct filter_list {
+ struct list_head list;
+ struct event_filter *filter;
+};
+
static int replace_system_preds(struct event_subsystem *system,
struct filter_parse_state *ps,
char *filter_string)
{
struct ftrace_event_call *call;
+ struct filter_list *filter_item;
+ struct filter_list *tmp;
+ LIST_HEAD(filter_list);
bool fail = true;
int err;
list_for_each_entry(call, &ftrace_events, list) {
- struct event_filter *filter = call->filter;
if (strcmp(call->class->system, system->name) != 0)
continue;
- /* try to see if the filter can be applied */
- err = replace_preds(call, filter, ps, filter_string, true);
+ /*
+ * Try to see if the filter can be applied
+ * (filter arg is ignored on dry_run)
+ */
+ err = replace_preds(call, NULL, ps, filter_string, true);
if (err)
+ goto fail;
+ }
+
+ list_for_each_entry(call, &ftrace_events, list) {
+ struct event_filter *filter;
+
+ if (strcmp(call->class->system, system->name) != 0)
continue;
- /* really apply the filter */
- filter_disable_preds(call);
- err = replace_preds(call, filter, ps, filter_string, false);
+ filter_item = kzalloc(sizeof(*filter_item), GFP_KERNEL);
+ if (!filter_item)
+ goto fail_mem;
+
+ list_add_tail(&filter_item->list, &filter_list);
+
+ filter_item->filter = __alloc_filter();
+ if (!filter_item->filter)
+ goto fail_mem;
+ filter = filter_item->filter;
+
+ /* Can only fail on no memory */
+ err = replace_filter_string(filter, filter_string);
if (err)
- filter_disable_preds(call);
- else {
+ goto fail_mem;
+
+ err = replace_preds(call, filter, ps, filter_string, false);
+ if (err) {
+ filter_disable(call);
+ parse_error(ps, FILT_ERR_BAD_SUBSYS_FILTER, 0);
+ append_filter_err(ps, filter);
+ } else
call->flags |= TRACE_EVENT_FL_FILTERED;
- replace_filter_string(filter, filter_string);
- }
+ /*
+ * Regardless of if this returned an error, we still
+ * replace the filter for the call.
+ */
+ filter = call->filter;
+ call->filter = filter_item->filter;
+ filter_item->filter = filter;
+
fail = false;
}
- if (fail) {
- parse_error(ps, FILT_ERR_BAD_SUBSYS_FILTER, 0);
- return -EINVAL;
+ if (fail)
+ goto fail;
+
+ /*
+ * The calls can still be using the old filters.
+ * Do a synchronize_sched() to ensure all calls are
+ * done with them before we free them.
+ */
+ synchronize_sched();
+ list_for_each_entry_safe(filter_item, tmp, &filter_list, list) {
+ __free_filter(filter_item->filter);
+ list_del(&filter_item->list);
+ kfree(filter_item);
}
return 0;
+ fail:
+ /* No call succeeded */
+ list_for_each_entry_safe(filter_item, tmp, &filter_list, list) {
+ list_del(&filter_item->list);
+ kfree(filter_item);
+ }
+ parse_error(ps, FILT_ERR_BAD_SUBSYS_FILTER, 0);
+ return -EINVAL;
+ fail_mem:
+ /* If any call succeeded, we still need to sync */
+ if (!fail)
+ synchronize_sched();
+ list_for_each_entry_safe(filter_item, tmp, &filter_list, list) {
+ __free_filter(filter_item->filter);
+ list_del(&filter_item->list);
+ kfree(filter_item);
+ }
+ return -ENOMEM;
}
int apply_event_filter(struct ftrace_event_call *call, char *filter_string)
{
- int err;
struct filter_parse_state *ps;
+ struct event_filter *filter;
+ struct event_filter *tmp;
+ int err = 0;
mutex_lock(&event_mutex);
- err = init_preds(call);
- if (err)
- goto out_unlock;
-
if (!strcmp(strstrip(filter_string), "0")) {
- filter_disable_preds(call);
- remove_filter_string(call->filter);
+ filter_disable(call);
+ filter = call->filter;
+ if (!filter)
+ goto out_unlock;
+ call->filter = NULL;
+ /* Make sure the filter is not being used */
+ synchronize_sched();
+ __free_filter(filter);
goto out_unlock;
}
@@ -1306,22 +1833,41 @@ int apply_event_filter(struct ftrace_event_call *call, char *filter_string)
if (!ps)
goto out_unlock;
- filter_disable_preds(call);
- replace_filter_string(call->filter, filter_string);
+ filter = __alloc_filter();
+ if (!filter) {
+ kfree(ps);
+ goto out_unlock;
+ }
+
+ replace_filter_string(filter, filter_string);
parse_init(ps, filter_ops, filter_string);
err = filter_parse(ps);
if (err) {
- append_filter_err(ps, call->filter);
+ append_filter_err(ps, filter);
goto out;
}
- err = replace_preds(call, call->filter, ps, filter_string, false);
- if (err)
- append_filter_err(ps, call->filter);
- else
+ err = replace_preds(call, filter, ps, filter_string, false);
+ if (err) {
+ filter_disable(call);
+ append_filter_err(ps, filter);
+ } else
call->flags |= TRACE_EVENT_FL_FILTERED;
out:
+ /*
+ * Always swap the call filter with the new filter
+ * even if there was an error. If there was an error
+ * in the filter, we disable the filter and show the error
+ * string
+ */
+ tmp = call->filter;
+ call->filter = filter;
+ if (tmp) {
+ /* Make sure the call is done with the filter */
+ synchronize_sched();
+ __free_filter(tmp);
+ }
filter_opstack_clear(ps);
postfix_clear(ps);
kfree(ps);
@@ -1334,18 +1880,21 @@ out_unlock:
int apply_subsystem_event_filter(struct event_subsystem *system,
char *filter_string)
{
- int err;
struct filter_parse_state *ps;
+ struct event_filter *filter;
+ int err = 0;
mutex_lock(&event_mutex);
- err = init_subsystem_preds(system);
- if (err)
- goto out_unlock;
-
if (!strcmp(strstrip(filter_string), "0")) {
filter_free_subsystem_preds(system);
remove_filter_string(system->filter);
+ filter = system->filter;
+ system->filter = NULL;
+ /* Ensure all filters are no longer used */
+ synchronize_sched();
+ filter_free_subsystem_filters(system);
+ __free_filter(filter);
goto out_unlock;
}
@@ -1354,7 +1903,17 @@ int apply_subsystem_event_filter(struct event_subsystem *system,
if (!ps)
goto out_unlock;
- replace_filter_string(system->filter, filter_string);
+ filter = __alloc_filter();
+ if (!filter)
+ goto out;
+
+ replace_filter_string(filter, filter_string);
+ /*
+ * No event actually uses the system filter
+ * we can free it without synchronize_sched().
+ */
+ __free_filter(system->filter);
+ system->filter = filter;
parse_init(ps, filter_ops, filter_string);
err = filter_parse(ps);
@@ -1384,7 +1943,7 @@ void ftrace_profile_free_filter(struct perf_event *event)
struct event_filter *filter = event->filter;
event->filter = NULL;
- __free_preds(filter);
+ __free_filter(filter);
}
int ftrace_profile_set_filter(struct perf_event *event, int event_id,
@@ -1410,8 +1969,8 @@ int ftrace_profile_set_filter(struct perf_event *event, int event_id,
if (event->filter)
goto out_unlock;
- filter = __alloc_preds();
- if (IS_ERR(filter)) {
+ filter = __alloc_filter();
+ if (!filter) {
err = PTR_ERR(filter);
goto out_unlock;
}
@@ -1419,7 +1978,7 @@ int ftrace_profile_set_filter(struct perf_event *event, int event_id,
err = -ENOMEM;
ps = kzalloc(sizeof(*ps), GFP_KERNEL);
if (!ps)
- goto free_preds;
+ goto free_filter;
parse_init(ps, filter_ops, filter_str);
err = filter_parse(ps);
@@ -1435,9 +1994,9 @@ free_ps:
postfix_clear(ps);
kfree(ps);
-free_preds:
+free_filter:
if (err)
- __free_preds(filter);
+ __free_filter(filter);
out_unlock:
mutex_unlock(&event_mutex);
diff --git a/kernel/trace/trace_functions.c b/kernel/trace/trace_functions.c
index 16aee4d44e8f..8d0e1cc4e974 100644
--- a/kernel/trace/trace_functions.c
+++ b/kernel/trace/trace_functions.c
@@ -149,11 +149,13 @@ function_stack_trace_call(unsigned long ip, unsigned long parent_ip)
static struct ftrace_ops trace_ops __read_mostly =
{
.func = function_trace_call,
+ .flags = FTRACE_OPS_FL_GLOBAL,
};
static struct ftrace_ops trace_stack_ops __read_mostly =
{
.func = function_stack_trace_call,
+ .flags = FTRACE_OPS_FL_GLOBAL,
};
/* Our two options */
diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c
index 76b05980225c..962cdb24ed81 100644
--- a/kernel/trace/trace_functions_graph.c
+++ b/kernel/trace/trace_functions_graph.c
@@ -905,7 +905,7 @@ print_graph_prologue(struct trace_iterator *iter, struct trace_seq *s,
*
* returns 1 if
* - we are inside irq code
- * - we just extered irq code
+ * - we just entered irq code
*
* retunns 0 if
* - funcgraph-interrupts option is set
diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c
index 92b6e1e12d98..c77424be284d 100644
--- a/kernel/trace/trace_irqsoff.c
+++ b/kernel/trace/trace_irqsoff.c
@@ -80,7 +80,7 @@ static struct tracer_flags tracer_flags = {
* skip the latency if the sequence has changed - some other section
* did a maximum and could disturb our measurement with serial console
* printouts, etc. Truly coinciding maximum latencies should be rare
- * and what happens together happens separately as well, so this doesnt
+ * and what happens together happens separately as well, so this doesn't
* decrease the validity of the maximum found:
*/
static __cacheline_aligned_in_smp unsigned long max_sequence;
@@ -153,6 +153,7 @@ irqsoff_tracer_call(unsigned long ip, unsigned long parent_ip)
static struct ftrace_ops trace_ops __read_mostly =
{
.func = irqsoff_tracer_call,
+ .flags = FTRACE_OPS_FL_GLOBAL,
};
#endif /* CONFIG_FUNCTION_TRACER */
diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c
index 2dec9bcde8b4..f925c45f0afa 100644
--- a/kernel/trace/trace_kprobe.c
+++ b/kernel/trace/trace_kprobe.c
@@ -53,7 +53,6 @@ const char *reserved_field_names[] = {
"common_preempt_count",
"common_pid",
"common_tgid",
- "common_lock_depth",
FIELD_STRING_IP,
FIELD_STRING_RETIP,
FIELD_STRING_FUNC,
@@ -353,6 +352,43 @@ static __kprobes void free_deref_fetch_param(struct deref_fetch_param *data)
kfree(data);
}
+/* Bitfield fetch function */
+struct bitfield_fetch_param {
+ struct fetch_param orig;
+ unsigned char hi_shift;
+ unsigned char low_shift;
+};
+
+#define DEFINE_FETCH_bitfield(type) \
+static __kprobes void FETCH_FUNC_NAME(bitfield, type)(struct pt_regs *regs,\
+ void *data, void *dest) \
+{ \
+ struct bitfield_fetch_param *bprm = data; \
+ type buf = 0; \
+ call_fetch(&bprm->orig, regs, &buf); \
+ if (buf) { \
+ buf <<= bprm->hi_shift; \
+ buf >>= bprm->low_shift; \
+ } \
+ *(type *)dest = buf; \
+}
+DEFINE_BASIC_FETCH_FUNCS(bitfield)
+#define fetch_bitfield_string NULL
+#define fetch_bitfield_string_size NULL
+
+static __kprobes void
+free_bitfield_fetch_param(struct bitfield_fetch_param *data)
+{
+ /*
+ * Don't check the bitfield itself, because this must be the
+ * last fetch function.
+ */
+ if (CHECK_FETCH_FUNCS(deref, data->orig.fn))
+ free_deref_fetch_param(data->orig.data);
+ else if (CHECK_FETCH_FUNCS(symbol, data->orig.fn))
+ free_symbol_cache(data->orig.data);
+ kfree(data);
+}
/* Default (unsigned long) fetch type */
#define __DEFAULT_FETCH_TYPE(t) u##t
#define _DEFAULT_FETCH_TYPE(t) __DEFAULT_FETCH_TYPE(t)
@@ -367,6 +403,7 @@ enum {
FETCH_MTD_memory,
FETCH_MTD_symbol,
FETCH_MTD_deref,
+ FETCH_MTD_bitfield,
FETCH_MTD_END,
};
@@ -387,6 +424,7 @@ ASSIGN_FETCH_FUNC(retval, ftype), \
ASSIGN_FETCH_FUNC(memory, ftype), \
ASSIGN_FETCH_FUNC(symbol, ftype), \
ASSIGN_FETCH_FUNC(deref, ftype), \
+ASSIGN_FETCH_FUNC(bitfield, ftype), \
} \
}
@@ -430,9 +468,33 @@ static const struct fetch_type *find_fetch_type(const char *type)
if (!type)
type = DEFAULT_FETCH_TYPE_STR;
+ /* Special case: bitfield */
+ if (*type == 'b') {
+ unsigned long bs;
+ type = strchr(type, '/');
+ if (!type)
+ goto fail;
+ type++;
+ if (strict_strtoul(type, 0, &bs))
+ goto fail;
+ switch (bs) {
+ case 8:
+ return find_fetch_type("u8");
+ case 16:
+ return find_fetch_type("u16");
+ case 32:
+ return find_fetch_type("u32");
+ case 64:
+ return find_fetch_type("u64");
+ default:
+ goto fail;
+ }
+ }
+
for (i = 0; i < ARRAY_SIZE(fetch_type_table); i++)
if (strcmp(type, fetch_type_table[i].name) == 0)
return &fetch_type_table[i];
+fail:
return NULL;
}
@@ -586,7 +648,9 @@ error:
static void free_probe_arg(struct probe_arg *arg)
{
- if (CHECK_FETCH_FUNCS(deref, arg->fetch.fn))
+ if (CHECK_FETCH_FUNCS(bitfield, arg->fetch.fn))
+ free_bitfield_fetch_param(arg->fetch.data);
+ else if (CHECK_FETCH_FUNCS(deref, arg->fetch.fn))
free_deref_fetch_param(arg->fetch.data);
else if (CHECK_FETCH_FUNCS(symbol, arg->fetch.fn))
free_symbol_cache(arg->fetch.data);
@@ -767,16 +831,15 @@ static int __parse_probe_arg(char *arg, const struct fetch_type *t,
}
break;
case '+': /* deref memory */
+ arg++; /* Skip '+', because strict_strtol() rejects it. */
case '-':
tmp = strchr(arg, '(');
if (!tmp)
break;
*tmp = '\0';
- ret = strict_strtol(arg + 1, 0, &offset);
+ ret = strict_strtol(arg, 0, &offset);
if (ret)
break;
- if (arg[0] == '-')
- offset = -offset;
arg = tmp + 1;
tmp = strrchr(arg, ')');
if (tmp) {
@@ -807,6 +870,41 @@ static int __parse_probe_arg(char *arg, const struct fetch_type *t,
return ret;
}
+#define BYTES_TO_BITS(nb) ((BITS_PER_LONG * (nb)) / sizeof(long))
+
+/* Bitfield type needs to be parsed into a fetch function */
+static int __parse_bitfield_probe_arg(const char *bf,
+ const struct fetch_type *t,
+ struct fetch_param *f)
+{
+ struct bitfield_fetch_param *bprm;
+ unsigned long bw, bo;
+ char *tail;
+
+ if (*bf != 'b')
+ return 0;
+
+ bprm = kzalloc(sizeof(*bprm), GFP_KERNEL);
+ if (!bprm)
+ return -ENOMEM;
+ bprm->orig = *f;
+ f->fn = t->fetch[FETCH_MTD_bitfield];
+ f->data = (void *)bprm;
+
+ bw = simple_strtoul(bf + 1, &tail, 0); /* Use simple one */
+ if (bw == 0 || *tail != '@')
+ return -EINVAL;
+
+ bf = tail + 1;
+ bo = simple_strtoul(bf, &tail, 0);
+ if (tail == bf || *tail != '/')
+ return -EINVAL;
+
+ bprm->hi_shift = BYTES_TO_BITS(t->size) - (bw + bo);
+ bprm->low_shift = bprm->hi_shift + bo;
+ return (BYTES_TO_BITS(t->size) < (bw + bo)) ? -EINVAL : 0;
+}
+
/* String length checking wrapper */
static int parse_probe_arg(char *arg, struct trace_probe *tp,
struct probe_arg *parg, int is_return)
@@ -836,6 +934,8 @@ static int parse_probe_arg(char *arg, struct trace_probe *tp,
parg->offset = tp->size;
tp->size += parg->type->size;
ret = __parse_probe_arg(arg, parg->type, &parg->fetch, is_return);
+ if (ret >= 0 && t != NULL)
+ ret = __parse_bitfield_probe_arg(t, parg->type, &parg->fetch);
if (ret >= 0) {
parg->fetch_size.fn = get_fetch_size_function(parg->type,
parg->fetch.fn);
@@ -1130,7 +1230,7 @@ static int command_trace_probe(const char *buf)
return ret;
}
-#define WRITE_BUFSIZE 128
+#define WRITE_BUFSIZE 4096
static ssize_t probes_write(struct file *file, const char __user *buffer,
size_t count, loff_t *ppos)
@@ -1738,7 +1838,7 @@ static void unregister_probe_event(struct trace_probe *tp)
kfree(tp->call.print_fmt);
}
-/* Make a debugfs interface for controling probe points */
+/* Make a debugfs interface for controlling probe points */
static __init int init_kprobe_trace(void)
{
struct dentry *d_tracer;
diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c
index 02272baa2206..e37de492a9e1 100644
--- a/kernel/trace/trace_output.c
+++ b/kernel/trace/trace_output.c
@@ -353,6 +353,33 @@ ftrace_print_symbols_seq(struct trace_seq *p, unsigned long val,
}
EXPORT_SYMBOL(ftrace_print_symbols_seq);
+#if BITS_PER_LONG == 32
+const char *
+ftrace_print_symbols_seq_u64(struct trace_seq *p, unsigned long long val,
+ const struct trace_print_flags_u64 *symbol_array)
+{
+ int i;
+ const char *ret = p->buffer + p->len;
+
+ for (i = 0; symbol_array[i].name; i++) {
+
+ if (val != symbol_array[i].mask)
+ continue;
+
+ trace_seq_puts(p, symbol_array[i].name);
+ break;
+ }
+
+ if (!p->len)
+ trace_seq_printf(p, "0x%llx", val);
+
+ trace_seq_putc(p, 0);
+
+ return ret;
+}
+EXPORT_SYMBOL(ftrace_print_symbols_seq_u64);
+#endif
+
const char *
ftrace_print_hex_seq(struct trace_seq *p, const unsigned char *buf, int buf_len)
{
@@ -529,24 +556,34 @@ seq_print_ip_sym(struct trace_seq *s, unsigned long ip, unsigned long sym_flags)
* @entry: The trace entry field from the ring buffer
*
* Prints the generic fields of irqs off, in hard or softirq, preempt
- * count and lock depth.
+ * count.
*/
int trace_print_lat_fmt(struct trace_seq *s, struct trace_entry *entry)
{
- int hardirq, softirq;
+ char hardsoft_irq;
+ char need_resched;
+ char irqs_off;
+ int hardirq;
+ int softirq;
int ret;
hardirq = entry->flags & TRACE_FLAG_HARDIRQ;
softirq = entry->flags & TRACE_FLAG_SOFTIRQ;
+ irqs_off =
+ (entry->flags & TRACE_FLAG_IRQS_OFF) ? 'd' :
+ (entry->flags & TRACE_FLAG_IRQS_NOSUPPORT) ? 'X' :
+ '.';
+ need_resched =
+ (entry->flags & TRACE_FLAG_NEED_RESCHED) ? 'N' : '.';
+ hardsoft_irq =
+ (hardirq && softirq) ? 'H' :
+ hardirq ? 'h' :
+ softirq ? 's' :
+ '.';
+
if (!trace_seq_printf(s, "%c%c%c",
- (entry->flags & TRACE_FLAG_IRQS_OFF) ? 'd' :
- (entry->flags & TRACE_FLAG_IRQS_NOSUPPORT) ?
- 'X' : '.',
- (entry->flags & TRACE_FLAG_NEED_RESCHED) ?
- 'N' : '.',
- (hardirq && softirq) ? 'H' :
- hardirq ? 'h' : softirq ? 's' : '.'))
+ irqs_off, need_resched, hardsoft_irq))
return 0;
if (entry->preempt_count)
@@ -554,13 +591,7 @@ int trace_print_lat_fmt(struct trace_seq *s, struct trace_entry *entry)
else
ret = trace_seq_putc(s, '.');
- if (!ret)
- return 0;
-
- if (entry->lock_depth < 0)
- return trace_seq_putc(s, '.');
-
- return trace_seq_printf(s, "%d", entry->lock_depth);
+ return ret;
}
static int
@@ -826,6 +857,9 @@ EXPORT_SYMBOL_GPL(unregister_ftrace_event);
enum print_line_t trace_nop_print(struct trace_iterator *iter, int flags,
struct trace_event *event)
{
+ if (!trace_seq_printf(&iter->seq, "type: %d\n", iter->ent->type))
+ return TRACE_TYPE_PARTIAL_LINE;
+
return TRACE_TYPE_HANDLED;
}
diff --git a/kernel/trace/trace_printk.c b/kernel/trace/trace_printk.c
index 2547d8813cf0..dff763b7baf1 100644
--- a/kernel/trace/trace_printk.c
+++ b/kernel/trace/trace_printk.c
@@ -32,7 +32,7 @@ static DEFINE_MUTEX(btrace_mutex);
struct trace_bprintk_fmt {
struct list_head list;
- char fmt[0];
+ const char *fmt;
};
static inline struct trace_bprintk_fmt *lookup_format(const char *fmt)
@@ -49,6 +49,7 @@ static
void hold_module_trace_bprintk_format(const char **start, const char **end)
{
const char **iter;
+ char *fmt;
mutex_lock(&btrace_mutex);
for (iter = start; iter < end; iter++) {
@@ -58,14 +59,18 @@ void hold_module_trace_bprintk_format(const char **start, const char **end)
continue;
}
- tb_fmt = kmalloc(offsetof(struct trace_bprintk_fmt, fmt)
- + strlen(*iter) + 1, GFP_KERNEL);
- if (tb_fmt) {
+ tb_fmt = kmalloc(sizeof(*tb_fmt), GFP_KERNEL);
+ if (tb_fmt)
+ fmt = kmalloc(strlen(*iter) + 1, GFP_KERNEL);
+ if (tb_fmt && fmt) {
list_add_tail(&tb_fmt->list, &trace_bprintk_fmt_list);
- strcpy(tb_fmt->fmt, *iter);
+ strcpy(fmt, *iter);
+ tb_fmt->fmt = fmt;
*iter = tb_fmt->fmt;
- } else
+ } else {
+ kfree(tb_fmt);
*iter = NULL;
+ }
}
mutex_unlock(&btrace_mutex);
}
@@ -84,6 +89,76 @@ static int module_trace_bprintk_format_notify(struct notifier_block *self,
return 0;
}
+/*
+ * The debugfs/tracing/printk_formats file maps the addresses with
+ * the ASCII formats that are used in the bprintk events in the
+ * buffer. For userspace tools to be able to decode the events from
+ * the buffer, they need to be able to map the address with the format.
+ *
+ * The addresses of the bprintk formats are in their own section
+ * __trace_printk_fmt. But for modules we copy them into a link list.
+ * The code to print the formats and their addresses passes around the
+ * address of the fmt string. If the fmt address passed into the seq
+ * functions is within the kernel core __trace_printk_fmt section, then
+ * it simply uses the next pointer in the list.
+ *
+ * When the fmt pointer is outside the kernel core __trace_printk_fmt
+ * section, then we need to read the link list pointers. The trick is
+ * we pass the address of the string to the seq function just like
+ * we do for the kernel core formats. To get back the structure that
+ * holds the format, we simply use containerof() and then go to the
+ * next format in the list.
+ */
+static const char **
+find_next_mod_format(int start_index, void *v, const char **fmt, loff_t *pos)
+{
+ struct trace_bprintk_fmt *mod_fmt;
+
+ if (list_empty(&trace_bprintk_fmt_list))
+ return NULL;
+
+ /*
+ * v will point to the address of the fmt record from t_next
+ * v will be NULL from t_start.
+ * If this is the first pointer or called from start
+ * then we need to walk the list.
+ */
+ if (!v || start_index == *pos) {
+ struct trace_bprintk_fmt *p;
+
+ /* search the module list */
+ list_for_each_entry(p, &trace_bprintk_fmt_list, list) {
+ if (start_index == *pos)
+ return &p->fmt;
+ start_index++;
+ }
+ /* pos > index */
+ return NULL;
+ }
+
+ /*
+ * v points to the address of the fmt field in the mod list
+ * structure that holds the module print format.
+ */
+ mod_fmt = container_of(v, typeof(*mod_fmt), fmt);
+ if (mod_fmt->list.next == &trace_bprintk_fmt_list)
+ return NULL;
+
+ mod_fmt = container_of(mod_fmt->list.next, typeof(*mod_fmt), list);
+
+ return &mod_fmt->fmt;
+}
+
+static void format_mod_start(void)
+{
+ mutex_lock(&btrace_mutex);
+}
+
+static void format_mod_stop(void)
+{
+ mutex_unlock(&btrace_mutex);
+}
+
#else /* !CONFIG_MODULES */
__init static int
module_trace_bprintk_format_notify(struct notifier_block *self,
@@ -91,6 +166,13 @@ module_trace_bprintk_format_notify(struct notifier_block *self,
{
return 0;
}
+static inline const char **
+find_next_mod_format(int start_index, void *v, const char **fmt, loff_t *pos)
+{
+ return NULL;
+}
+static inline void format_mod_start(void) { }
+static inline void format_mod_stop(void) { }
#endif /* CONFIG_MODULES */
@@ -153,20 +235,33 @@ int __ftrace_vprintk(unsigned long ip, const char *fmt, va_list ap)
}
EXPORT_SYMBOL_GPL(__ftrace_vprintk);
+static const char **find_next(void *v, loff_t *pos)
+{
+ const char **fmt = v;
+ int start_index;
+
+ if (!fmt)
+ fmt = __start___trace_bprintk_fmt + *pos;
+
+ start_index = __stop___trace_bprintk_fmt - __start___trace_bprintk_fmt;
+
+ if (*pos < start_index)
+ return fmt;
+
+ return find_next_mod_format(start_index, v, fmt, pos);
+}
+
static void *
t_start(struct seq_file *m, loff_t *pos)
{
- const char **fmt = __start___trace_bprintk_fmt + *pos;
-
- if ((unsigned long)fmt >= (unsigned long)__stop___trace_bprintk_fmt)
- return NULL;
- return fmt;
+ format_mod_start();
+ return find_next(NULL, pos);
}
static void *t_next(struct seq_file *m, void * v, loff_t *pos)
{
(*pos)++;
- return t_start(m, pos);
+ return find_next(v, pos);
}
static int t_show(struct seq_file *m, void *v)
@@ -205,6 +300,7 @@ static int t_show(struct seq_file *m, void *v)
static void t_stop(struct seq_file *m, void *p)
{
+ format_mod_stop();
}
static const struct seq_operations show_format_seq_ops = {
diff --git a/kernel/trace/trace_sched_switch.c b/kernel/trace/trace_sched_switch.c
index 8f758d070c43..7e62c0a18456 100644
--- a/kernel/trace/trace_sched_switch.c
+++ b/kernel/trace/trace_sched_switch.c
@@ -247,51 +247,3 @@ void tracing_sched_switch_assign_trace(struct trace_array *tr)
ctx_trace = tr;
}
-static void stop_sched_trace(struct trace_array *tr)
-{
- tracing_stop_sched_switch_record();
-}
-
-static int sched_switch_trace_init(struct trace_array *tr)
-{
- ctx_trace = tr;
- tracing_reset_online_cpus(tr);
- tracing_start_sched_switch_record();
- return 0;
-}
-
-static void sched_switch_trace_reset(struct trace_array *tr)
-{
- if (sched_ref)
- stop_sched_trace(tr);
-}
-
-static void sched_switch_trace_start(struct trace_array *tr)
-{
- sched_stopped = 0;
-}
-
-static void sched_switch_trace_stop(struct trace_array *tr)
-{
- sched_stopped = 1;
-}
-
-static struct tracer sched_switch_trace __read_mostly =
-{
- .name = "sched_switch",
- .init = sched_switch_trace_init,
- .reset = sched_switch_trace_reset,
- .start = sched_switch_trace_start,
- .stop = sched_switch_trace_stop,
- .wait_pipe = poll_wait_pipe,
-#ifdef CONFIG_FTRACE_SELFTEST
- .selftest = trace_selftest_startup_sched_switch,
-#endif
-};
-
-__init static int init_sched_switch_trace(void)
-{
- return register_tracer(&sched_switch_trace);
-}
-device_initcall(init_sched_switch_trace);
-
diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c
index 7319559ed59f..f029dd4fd2ca 100644
--- a/kernel/trace/trace_sched_wakeup.c
+++ b/kernel/trace/trace_sched_wakeup.c
@@ -129,6 +129,7 @@ wakeup_tracer_call(unsigned long ip, unsigned long parent_ip)
static struct ftrace_ops trace_ops __read_mostly =
{
.func = wakeup_tracer_call,
+ .flags = FTRACE_OPS_FL_GLOBAL,
};
#endif /* CONFIG_FUNCTION_TRACER */
diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c
index 659732eba07c..288541f977fb 100644
--- a/kernel/trace/trace_selftest.c
+++ b/kernel/trace/trace_selftest.c
@@ -101,6 +101,206 @@ static inline void warn_failed_init_tracer(struct tracer *trace, int init_ret)
#ifdef CONFIG_DYNAMIC_FTRACE
+static int trace_selftest_test_probe1_cnt;
+static void trace_selftest_test_probe1_func(unsigned long ip,
+ unsigned long pip)
+{
+ trace_selftest_test_probe1_cnt++;
+}
+
+static int trace_selftest_test_probe2_cnt;
+static void trace_selftest_test_probe2_func(unsigned long ip,
+ unsigned long pip)
+{
+ trace_selftest_test_probe2_cnt++;
+}
+
+static int trace_selftest_test_probe3_cnt;
+static void trace_selftest_test_probe3_func(unsigned long ip,
+ unsigned long pip)
+{
+ trace_selftest_test_probe3_cnt++;
+}
+
+static int trace_selftest_test_global_cnt;
+static void trace_selftest_test_global_func(unsigned long ip,
+ unsigned long pip)
+{
+ trace_selftest_test_global_cnt++;
+}
+
+static int trace_selftest_test_dyn_cnt;
+static void trace_selftest_test_dyn_func(unsigned long ip,
+ unsigned long pip)
+{
+ trace_selftest_test_dyn_cnt++;
+}
+
+static struct ftrace_ops test_probe1 = {
+ .func = trace_selftest_test_probe1_func,
+};
+
+static struct ftrace_ops test_probe2 = {
+ .func = trace_selftest_test_probe2_func,
+};
+
+static struct ftrace_ops test_probe3 = {
+ .func = trace_selftest_test_probe3_func,
+};
+
+static struct ftrace_ops test_global = {
+ .func = trace_selftest_test_global_func,
+ .flags = FTRACE_OPS_FL_GLOBAL,
+};
+
+static void print_counts(void)
+{
+ printk("(%d %d %d %d %d) ",
+ trace_selftest_test_probe1_cnt,
+ trace_selftest_test_probe2_cnt,
+ trace_selftest_test_probe3_cnt,
+ trace_selftest_test_global_cnt,
+ trace_selftest_test_dyn_cnt);
+}
+
+static void reset_counts(void)
+{
+ trace_selftest_test_probe1_cnt = 0;
+ trace_selftest_test_probe2_cnt = 0;
+ trace_selftest_test_probe3_cnt = 0;
+ trace_selftest_test_global_cnt = 0;
+ trace_selftest_test_dyn_cnt = 0;
+}
+
+static int trace_selftest_ops(int cnt)
+{
+ int save_ftrace_enabled = ftrace_enabled;
+ struct ftrace_ops *dyn_ops;
+ char *func1_name;
+ char *func2_name;
+ int len1;
+ int len2;
+ int ret = -1;
+
+ printk(KERN_CONT "PASSED\n");
+ pr_info("Testing dynamic ftrace ops #%d: ", cnt);
+
+ ftrace_enabled = 1;
+ reset_counts();
+
+ /* Handle PPC64 '.' name */
+ func1_name = "*" __stringify(DYN_FTRACE_TEST_NAME);
+ func2_name = "*" __stringify(DYN_FTRACE_TEST_NAME2);
+ len1 = strlen(func1_name);
+ len2 = strlen(func2_name);
+
+ /*
+ * Probe 1 will trace function 1.
+ * Probe 2 will trace function 2.
+ * Probe 3 will trace functions 1 and 2.
+ */
+ ftrace_set_filter(&test_probe1, func1_name, len1, 1);
+ ftrace_set_filter(&test_probe2, func2_name, len2, 1);
+ ftrace_set_filter(&test_probe3, func1_name, len1, 1);
+ ftrace_set_filter(&test_probe3, func2_name, len2, 0);
+
+ register_ftrace_function(&test_probe1);
+ register_ftrace_function(&test_probe2);
+ register_ftrace_function(&test_probe3);
+ register_ftrace_function(&test_global);
+
+ DYN_FTRACE_TEST_NAME();
+
+ print_counts();
+
+ if (trace_selftest_test_probe1_cnt != 1)
+ goto out;
+ if (trace_selftest_test_probe2_cnt != 0)
+ goto out;
+ if (trace_selftest_test_probe3_cnt != 1)
+ goto out;
+ if (trace_selftest_test_global_cnt == 0)
+ goto out;
+
+ DYN_FTRACE_TEST_NAME2();
+
+ print_counts();
+
+ if (trace_selftest_test_probe1_cnt != 1)
+ goto out;
+ if (trace_selftest_test_probe2_cnt != 1)
+ goto out;
+ if (trace_selftest_test_probe3_cnt != 2)
+ goto out;
+
+ /* Add a dynamic probe */
+ dyn_ops = kzalloc(sizeof(*dyn_ops), GFP_KERNEL);
+ if (!dyn_ops) {
+ printk("MEMORY ERROR ");
+ goto out;
+ }
+
+ dyn_ops->func = trace_selftest_test_dyn_func;
+
+ register_ftrace_function(dyn_ops);
+
+ trace_selftest_test_global_cnt = 0;
+
+ DYN_FTRACE_TEST_NAME();
+
+ print_counts();
+
+ if (trace_selftest_test_probe1_cnt != 2)
+ goto out_free;
+ if (trace_selftest_test_probe2_cnt != 1)
+ goto out_free;
+ if (trace_selftest_test_probe3_cnt != 3)
+ goto out_free;
+ if (trace_selftest_test_global_cnt == 0)
+ goto out;
+ if (trace_selftest_test_dyn_cnt == 0)
+ goto out_free;
+
+ DYN_FTRACE_TEST_NAME2();
+
+ print_counts();
+
+ if (trace_selftest_test_probe1_cnt != 2)
+ goto out_free;
+ if (trace_selftest_test_probe2_cnt != 2)
+ goto out_free;
+ if (trace_selftest_test_probe3_cnt != 4)
+ goto out_free;
+
+ ret = 0;
+ out_free:
+ unregister_ftrace_function(dyn_ops);
+ kfree(dyn_ops);
+
+ out:
+ /* Purposely unregister in the same order */
+ unregister_ftrace_function(&test_probe1);
+ unregister_ftrace_function(&test_probe2);
+ unregister_ftrace_function(&test_probe3);
+ unregister_ftrace_function(&test_global);
+
+ /* Make sure everything is off */
+ reset_counts();
+ DYN_FTRACE_TEST_NAME();
+ DYN_FTRACE_TEST_NAME();
+
+ if (trace_selftest_test_probe1_cnt ||
+ trace_selftest_test_probe2_cnt ||
+ trace_selftest_test_probe3_cnt ||
+ trace_selftest_test_global_cnt ||
+ trace_selftest_test_dyn_cnt)
+ ret = -1;
+
+ ftrace_enabled = save_ftrace_enabled;
+
+ return ret;
+}
+
/* Test dynamic code modification and ftrace filters */
int trace_selftest_startup_dynamic_tracing(struct tracer *trace,
struct trace_array *tr,
@@ -131,7 +331,7 @@ int trace_selftest_startup_dynamic_tracing(struct tracer *trace,
func_name = "*" __stringify(DYN_FTRACE_TEST_NAME);
/* filter only on our function */
- ftrace_set_filter(func_name, strlen(func_name), 1);
+ ftrace_set_global_filter(func_name, strlen(func_name), 1);
/* enable tracing */
ret = tracer_init(trace, tr);
@@ -166,22 +366,30 @@ int trace_selftest_startup_dynamic_tracing(struct tracer *trace,
/* check the trace buffer */
ret = trace_test_buffer(tr, &count);
- trace->reset(tr);
tracing_start();
/* we should only have one item */
if (!ret && count != 1) {
+ trace->reset(tr);
printk(KERN_CONT ".. filter failed count=%ld ..", count);
ret = -1;
goto out;
}
+ /* Test the ops with global tracing running */
+ ret = trace_selftest_ops(1);
+ trace->reset(tr);
+
out:
ftrace_enabled = save_ftrace_enabled;
tracer_enabled = save_tracer_enabled;
/* Enable tracing on all functions again */
- ftrace_set_filter(NULL, 0, 1);
+ ftrace_set_global_filter(NULL, 0, 1);
+
+ /* Test the ops with global tracing off */
+ if (!ret)
+ ret = trace_selftest_ops(2);
return ret;
}
diff --git a/kernel/trace/trace_selftest_dynamic.c b/kernel/trace/trace_selftest_dynamic.c
index 54dd77cce5bf..b4c475a0a48b 100644
--- a/kernel/trace/trace_selftest_dynamic.c
+++ b/kernel/trace/trace_selftest_dynamic.c
@@ -5,3 +5,9 @@ int DYN_FTRACE_TEST_NAME(void)
/* used to call mcount */
return 0;
}
+
+int DYN_FTRACE_TEST_NAME2(void)
+{
+ /* used to call mcount */
+ return 0;
+}
diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c
index 4c5dead0c239..b0b53b8e4c25 100644
--- a/kernel/trace/trace_stack.c
+++ b/kernel/trace/trace_stack.c
@@ -133,6 +133,7 @@ stack_trace_call(unsigned long ip, unsigned long parent_ip)
static struct ftrace_ops trace_ops __read_mostly =
{
.func = stack_trace_call,
+ .flags = FTRACE_OPS_FL_GLOBAL,
};
static ssize_t
diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c
index 5c9fe08d2093..ee7b5a0bb9f8 100644
--- a/kernel/trace/trace_syscalls.c
+++ b/kernel/trace/trace_syscalls.c
@@ -60,6 +60,19 @@ extern struct syscall_metadata *__stop_syscalls_metadata[];
static struct syscall_metadata **syscalls_metadata;
+#ifndef ARCH_HAS_SYSCALL_MATCH_SYM_NAME
+static inline bool arch_syscall_match_sym_name(const char *sym, const char *name)
+{
+ /*
+ * Only compare after the "sys" prefix. Archs that use
+ * syscall wrappers may have syscalls symbols aliases prefixed
+ * with "SyS" instead of "sys", leading to an unwanted
+ * mismatch.
+ */
+ return !strcmp(sym + 3, name + 3);
+}
+#endif
+
static __init struct syscall_metadata *
find_syscall_meta(unsigned long syscall)
{
@@ -72,14 +85,11 @@ find_syscall_meta(unsigned long syscall)
stop = __stop_syscalls_metadata;
kallsyms_lookup(syscall, NULL, NULL, NULL, str);
+ if (arch_syscall_match_sym_name(str, "sys_ni_syscall"))
+ return NULL;
+
for ( ; start < stop; start++) {
- /*
- * Only compare after the "sys" prefix. Archs that use
- * syscall wrappers may have syscalls symbols aliases prefixed
- * with "SyS" instead of "sys", leading to an unwanted
- * mismatch.
- */
- if ((*start)->name && !strcmp((*start)->name + 3, str + 3))
+ if ((*start)->name && arch_syscall_match_sym_name(str, (*start)->name))
return *start;
}
return NULL;
@@ -359,7 +369,7 @@ int reg_event_syscall_enter(struct ftrace_event_call *call)
int num;
num = ((struct syscall_metadata *)call->data)->syscall_nr;
- if (num < 0 || num >= NR_syscalls)
+ if (WARN_ON_ONCE(num < 0 || num >= NR_syscalls))
return -ENOSYS;
mutex_lock(&syscall_trace_lock);
if (!sys_refcount_enter)
@@ -377,7 +387,7 @@ void unreg_event_syscall_enter(struct ftrace_event_call *call)
int num;
num = ((struct syscall_metadata *)call->data)->syscall_nr;
- if (num < 0 || num >= NR_syscalls)
+ if (WARN_ON_ONCE(num < 0 || num >= NR_syscalls))
return;
mutex_lock(&syscall_trace_lock);
sys_refcount_enter--;
@@ -393,7 +403,7 @@ int reg_event_syscall_exit(struct ftrace_event_call *call)
int num;
num = ((struct syscall_metadata *)call->data)->syscall_nr;
- if (num < 0 || num >= NR_syscalls)
+ if (WARN_ON_ONCE(num < 0 || num >= NR_syscalls))
return -ENOSYS;
mutex_lock(&syscall_trace_lock);
if (!sys_refcount_exit)
@@ -411,7 +421,7 @@ void unreg_event_syscall_exit(struct ftrace_event_call *call)
int num;
num = ((struct syscall_metadata *)call->data)->syscall_nr;
- if (num < 0 || num >= NR_syscalls)
+ if (WARN_ON_ONCE(num < 0 || num >= NR_syscalls))
return;
mutex_lock(&syscall_trace_lock);
sys_refcount_exit--;
@@ -424,6 +434,14 @@ void unreg_event_syscall_exit(struct ftrace_event_call *call)
int init_syscall_trace(struct ftrace_event_call *call)
{
int id;
+ int num;
+
+ num = ((struct syscall_metadata *)call->data)->syscall_nr;
+ if (num < 0 || num >= NR_syscalls) {
+ pr_debug("syscall %s metadata not mapped, disabling ftrace event\n",
+ ((struct syscall_metadata *)call->data)->name);
+ return -ENOSYS;
+ }
if (set_syscall_print_fmt(call) < 0)
return -ENOMEM;
@@ -438,7 +456,7 @@ int init_syscall_trace(struct ftrace_event_call *call)
return id;
}
-unsigned long __init arch_syscall_addr(int nr)
+unsigned long __init __weak arch_syscall_addr(int nr)
{
return (unsigned long)sys_call_table[nr];
}
diff --git a/kernel/tracepoint.c b/kernel/tracepoint.c
index 68187af4889e..b219f1449c54 100644
--- a/kernel/tracepoint.c
+++ b/kernel/tracepoint.c
@@ -251,9 +251,9 @@ static void set_tracepoint(struct tracepoint_entry **entry,
{
WARN_ON(strcmp((*entry)->name, elem->name) != 0);
- if (elem->regfunc && !elem->state && active)
+ if (elem->regfunc && !jump_label_enabled(&elem->key) && active)
elem->regfunc();
- else if (elem->unregfunc && elem->state && !active)
+ else if (elem->unregfunc && jump_label_enabled(&elem->key) && !active)
elem->unregfunc();
/*
@@ -264,13 +264,10 @@ static void set_tracepoint(struct tracepoint_entry **entry,
* is used.
*/
rcu_assign_pointer(elem->funcs, (*entry)->funcs);
- if (!elem->state && active) {
- jump_label_enable(&elem->state);
- elem->state = active;
- } else if (elem->state && !active) {
- jump_label_disable(&elem->state);
- elem->state = active;
- }
+ if (active && !jump_label_enabled(&elem->key))
+ jump_label_inc(&elem->key);
+ else if (!active && jump_label_enabled(&elem->key))
+ jump_label_dec(&elem->key);
}
/*
@@ -281,13 +278,11 @@ static void set_tracepoint(struct tracepoint_entry **entry,
*/
static void disable_tracepoint(struct tracepoint *elem)
{
- if (elem->unregfunc && elem->state)
+ if (elem->unregfunc && jump_label_enabled(&elem->key))
elem->unregfunc();
- if (elem->state) {
- jump_label_disable(&elem->state);
- elem->state = 0;
- }
+ if (jump_label_enabled(&elem->key))
+ jump_label_dec(&elem->key);
rcu_assign_pointer(elem->funcs, NULL);
}
diff --git a/kernel/uid16.c b/kernel/uid16.c
index 419209893d87..51c6e89e8619 100644
--- a/kernel/uid16.c
+++ b/kernel/uid16.c
@@ -189,7 +189,7 @@ SYSCALL_DEFINE2(setgroups16, int, gidsetsize, old_gid_t __user *, grouplist)
struct group_info *group_info;
int retval;
- if (!capable(CAP_SETGID))
+ if (!nsown_capable(CAP_SETGID))
return -EPERM;
if ((unsigned)gidsetsize > NGROUPS_MAX)
return -EINVAL;
diff --git a/kernel/user-return-notifier.c b/kernel/user-return-notifier.c
index eb27fd3430a2..92cb706c7fc8 100644
--- a/kernel/user-return-notifier.c
+++ b/kernel/user-return-notifier.c
@@ -20,7 +20,7 @@ EXPORT_SYMBOL_GPL(user_return_notifier_register);
/*
* Removes a registered user return notifier. Must be called from atomic
- * context, and from the same cpu registration occured in.
+ * context, and from the same cpu registration occurred in.
*/
void user_return_notifier_unregister(struct user_return_notifier *urn)
{
diff --git a/kernel/user.c b/kernel/user.c
index 5c598ca781df..9e03e9c1df8d 100644
--- a/kernel/user.c
+++ b/kernel/user.c
@@ -17,9 +17,13 @@
#include <linux/module.h>
#include <linux/user_namespace.h>
+/*
+ * userns count is 1 for root user, 1 for init_uts_ns,
+ * and 1 for... ?
+ */
struct user_namespace init_user_ns = {
.kref = {
- .refcount = ATOMIC_INIT(2),
+ .refcount = ATOMIC_INIT(3),
},
.creator = &root_user,
};
@@ -47,7 +51,7 @@ static struct kmem_cache *uid_cachep;
*/
static DEFINE_SPINLOCK(uidhash_lock);
-/* root_user.__count is 2, 1 for init task cred, 1 for init_user_ns->creator */
+/* root_user.__count is 2, 1 for init task cred, 1 for init_user_ns->user_ns */
struct user_struct root_user = {
.__count = ATOMIC_INIT(2),
.processes = ATOMIC_INIT(1),
diff --git a/kernel/utsname.c b/kernel/utsname.c
index 8a82b4b8ea52..bff131b9510a 100644
--- a/kernel/utsname.c
+++ b/kernel/utsname.c
@@ -14,6 +14,8 @@
#include <linux/utsname.h>
#include <linux/err.h>
#include <linux/slab.h>
+#include <linux/user_namespace.h>
+#include <linux/proc_fs.h>
static struct uts_namespace *create_uts_ns(void)
{
@@ -30,7 +32,8 @@ static struct uts_namespace *create_uts_ns(void)
* @old_ns: namespace to clone
* Return NULL on error (failure to kmalloc), new ns otherwise
*/
-static struct uts_namespace *clone_uts_ns(struct uts_namespace *old_ns)
+static struct uts_namespace *clone_uts_ns(struct task_struct *tsk,
+ struct uts_namespace *old_ns)
{
struct uts_namespace *ns;
@@ -40,6 +43,7 @@ static struct uts_namespace *clone_uts_ns(struct uts_namespace *old_ns)
down_read(&uts_sem);
memcpy(&ns->name, &old_ns->name, sizeof(ns->name));
+ ns->user_ns = get_user_ns(task_cred_xxx(tsk, user)->user_ns);
up_read(&uts_sem);
return ns;
}
@@ -50,8 +54,10 @@ static struct uts_namespace *clone_uts_ns(struct uts_namespace *old_ns)
* utsname of this process won't be seen by parent, and vice
* versa.
*/
-struct uts_namespace *copy_utsname(unsigned long flags, struct uts_namespace *old_ns)
+struct uts_namespace *copy_utsname(unsigned long flags,
+ struct task_struct *tsk)
{
+ struct uts_namespace *old_ns = tsk->nsproxy->uts_ns;
struct uts_namespace *new_ns;
BUG_ON(!old_ns);
@@ -60,7 +66,7 @@ struct uts_namespace *copy_utsname(unsigned long flags, struct uts_namespace *ol
if (!(flags & CLONE_NEWUTS))
return old_ns;
- new_ns = clone_uts_ns(old_ns);
+ new_ns = clone_uts_ns(tsk, old_ns);
put_uts_ns(old_ns);
return new_ns;
@@ -71,5 +77,44 @@ void free_uts_ns(struct kref *kref)
struct uts_namespace *ns;
ns = container_of(kref, struct uts_namespace, kref);
+ put_user_ns(ns->user_ns);
kfree(ns);
}
+
+static void *utsns_get(struct task_struct *task)
+{
+ struct uts_namespace *ns = NULL;
+ struct nsproxy *nsproxy;
+
+ rcu_read_lock();
+ nsproxy = task_nsproxy(task);
+ if (nsproxy) {
+ ns = nsproxy->uts_ns;
+ get_uts_ns(ns);
+ }
+ rcu_read_unlock();
+
+ return ns;
+}
+
+static void utsns_put(void *ns)
+{
+ put_uts_ns(ns);
+}
+
+static int utsns_install(struct nsproxy *nsproxy, void *ns)
+{
+ get_uts_ns(ns);
+ put_uts_ns(nsproxy->uts_ns);
+ nsproxy->uts_ns = ns;
+ return 0;
+}
+
+const struct proc_ns_operations utsns_operations = {
+ .name = "uts",
+ .type = CLONE_NEWUTS,
+ .get = utsns_get,
+ .put = utsns_put,
+ .install = utsns_install,
+};
+
diff --git a/kernel/wait.c b/kernel/wait.c
index b0310eb6cc1e..f45ea8d2a1ce 100644
--- a/kernel/wait.c
+++ b/kernel/wait.c
@@ -142,7 +142,7 @@ EXPORT_SYMBOL(finish_wait);
* woken up through the queue.
*
* This prevents waiter starvation where an exclusive waiter
- * aborts and is woken up concurrently and noone wakes up
+ * aborts and is woken up concurrently and no one wakes up
* the next waiter.
*/
void abort_exclusive_wait(wait_queue_head_t *q, wait_queue_t *wait,
diff --git a/kernel/watchdog.c b/kernel/watchdog.c
index 18bb15776c57..3d0c56ad4792 100644
--- a/kernel/watchdog.c
+++ b/kernel/watchdog.c
@@ -28,7 +28,7 @@
#include <linux/perf_event.h>
int watchdog_enabled = 1;
-int __read_mostly softlockup_thresh = 60;
+int __read_mostly watchdog_thresh = 10;
static DEFINE_PER_CPU(unsigned long, watchdog_touch_ts);
static DEFINE_PER_CPU(struct task_struct *, softlockup_watchdog);
@@ -48,12 +48,15 @@ static DEFINE_PER_CPU(struct perf_event *, watchdog_ev);
* Should we panic when a soft-lockup or hard-lockup occurs:
*/
#ifdef CONFIG_HARDLOCKUP_DETECTOR
-static int hardlockup_panic;
+static int hardlockup_panic =
+ CONFIG_BOOTPARAM_HARDLOCKUP_PANIC_VALUE;
static int __init hardlockup_panic_setup(char *str)
{
if (!strncmp(str, "panic", 5))
hardlockup_panic = 1;
+ else if (!strncmp(str, "nopanic", 7))
+ hardlockup_panic = 0;
else if (!strncmp(str, "0", 1))
watchdog_enabled = 0;
return 1;
@@ -88,6 +91,17 @@ static int __init nosoftlockup_setup(char *str)
__setup("nosoftlockup", nosoftlockup_setup);
/* */
+/*
+ * Hard-lockup warnings should be triggered after just a few seconds. Soft-
+ * lockups can have false positives under extreme conditions. So we generally
+ * want a higher threshold for soft lockups than for hard lockups. So we couple
+ * the thresholds with a factor: we make the soft threshold twice the amount of
+ * time the hard threshold is.
+ */
+static int get_softlockup_thresh(void)
+{
+ return watchdog_thresh * 2;
+}
/*
* Returns seconds, approximately. We don't need nanosecond
@@ -102,12 +116,12 @@ static unsigned long get_timestamp(int this_cpu)
static unsigned long get_sample_period(void)
{
/*
- * convert softlockup_thresh from seconds to ns
+ * convert watchdog_thresh from seconds to ns
* the divide by 5 is to give hrtimer 5 chances to
* increment before the hardlockup detector generates
* a warning
*/
- return softlockup_thresh / 5 * NSEC_PER_SEC;
+ return get_softlockup_thresh() * (NSEC_PER_SEC / 5);
}
/* Commands for resetting the watchdog */
@@ -179,7 +193,7 @@ static int is_softlockup(unsigned long touch_ts)
unsigned long now = get_timestamp(smp_processor_id());
/* Warn about unreasonable delays: */
- if (time_after(now, touch_ts + softlockup_thresh))
+ if (time_after(now, touch_ts + get_softlockup_thresh()))
return now - touch_ts;
return 0;
@@ -356,7 +370,7 @@ static int watchdog_nmi_enable(int cpu)
/* Try to register using hardware perf events */
wd_attr = &wd_hw_attr;
- wd_attr->sample_period = hw_nmi_get_sample_period();
+ wd_attr->sample_period = hw_nmi_get_sample_period(watchdog_thresh);
event = perf_event_create_kernel_counter(wd_attr, cpu, NULL, watchdog_overflow_callback);
if (!IS_ERR(event)) {
printk(KERN_INFO "NMI watchdog enabled, takes one hw-pmu counter.\n");
@@ -401,33 +415,37 @@ static void watchdog_nmi_disable(int cpu) { return; }
#endif /* CONFIG_HARDLOCKUP_DETECTOR */
/* prepare/enable/disable routines */
-static int watchdog_prepare_cpu(int cpu)
+static void watchdog_prepare_cpu(int cpu)
{
struct hrtimer *hrtimer = &per_cpu(watchdog_hrtimer, cpu);
WARN_ON(per_cpu(softlockup_watchdog, cpu));
hrtimer_init(hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
hrtimer->function = watchdog_timer_fn;
-
- return 0;
}
static int watchdog_enable(int cpu)
{
struct task_struct *p = per_cpu(softlockup_watchdog, cpu);
- int err;
+ int err = 0;
/* enable the perf event */
err = watchdog_nmi_enable(cpu);
- if (err)
- return err;
+
+ /* Regardless of err above, fall through and start softlockup */
/* create the watchdog thread */
if (!p) {
p = kthread_create(watchdog, (void *)(unsigned long)cpu, "watchdog/%d", cpu);
if (IS_ERR(p)) {
printk(KERN_ERR "softlockup watchdog for %i failed\n", cpu);
- return PTR_ERR(p);
+ if (!err) {
+ /* if hardlockup hasn't already set this */
+ err = PTR_ERR(p);
+ /* and disable the perf event */
+ watchdog_nmi_disable(cpu);
+ }
+ goto out;
}
kthread_bind(p, cpu);
per_cpu(watchdog_touch_ts, cpu) = 0;
@@ -435,7 +453,8 @@ static int watchdog_enable(int cpu)
wake_up_process(p);
}
- return 0;
+out:
+ return err;
}
static void watchdog_disable(int cpu)
@@ -491,28 +510,25 @@ static void watchdog_disable_all_cpus(void)
/* sysctl functions */
#ifdef CONFIG_SYSCTL
/*
- * proc handler for /proc/sys/kernel/nmi_watchdog
+ * proc handler for /proc/sys/kernel/nmi_watchdog,watchdog_thresh
*/
-int proc_dowatchdog_enabled(struct ctl_table *table, int write,
- void __user *buffer, size_t *length, loff_t *ppos)
+int proc_dowatchdog(struct ctl_table *table, int write,
+ void __user *buffer, size_t *lenp, loff_t *ppos)
{
- proc_dointvec(table, write, buffer, length, ppos);
+ int ret;
- if (write) {
- if (watchdog_enabled)
- watchdog_enable_all_cpus();
- else
- watchdog_disable_all_cpus();
- }
- return 0;
-}
+ ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
+ if (ret || !write)
+ goto out;
-int proc_dowatchdog_thresh(struct ctl_table *table, int write,
- void __user *buffer,
- size_t *lenp, loff_t *ppos)
-{
- return proc_dointvec_minmax(table, write, buffer, lenp, ppos);
+ if (watchdog_enabled && watchdog_thresh)
+ watchdog_enable_all_cpus();
+ else
+ watchdog_disable_all_cpus();
+
+out:
+ return ret;
}
#endif /* CONFIG_SYSCTL */
@@ -524,17 +540,16 @@ static int __cpuinit
cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu)
{
int hotcpu = (unsigned long)hcpu;
- int err = 0;
switch (action) {
case CPU_UP_PREPARE:
case CPU_UP_PREPARE_FROZEN:
- err = watchdog_prepare_cpu(hotcpu);
+ watchdog_prepare_cpu(hotcpu);
break;
case CPU_ONLINE:
case CPU_ONLINE_FROZEN:
if (watchdog_enabled)
- err = watchdog_enable(hotcpu);
+ watchdog_enable(hotcpu);
break;
#ifdef CONFIG_HOTPLUG_CPU
case CPU_UP_CANCELED:
@@ -547,7 +562,13 @@ cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu)
break;
#endif /* CONFIG_HOTPLUG_CPU */
}
- return notifier_from_errno(err);
+
+ /*
+ * hardlockup and softlockup are not important enough
+ * to block cpu bring up. Just always succeed and
+ * rely on printk output to flag problems.
+ */
+ return NOTIFY_OK;
}
static struct notifier_block __cpuinitdata cpu_nfb = {
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index ee6578b578ad..0400553f0d04 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -251,10 +251,12 @@ struct workqueue_struct *system_wq __read_mostly;
struct workqueue_struct *system_long_wq __read_mostly;
struct workqueue_struct *system_nrt_wq __read_mostly;
struct workqueue_struct *system_unbound_wq __read_mostly;
+struct workqueue_struct *system_freezable_wq __read_mostly;
EXPORT_SYMBOL_GPL(system_wq);
EXPORT_SYMBOL_GPL(system_long_wq);
EXPORT_SYMBOL_GPL(system_nrt_wq);
EXPORT_SYMBOL_GPL(system_unbound_wq);
+EXPORT_SYMBOL_GPL(system_freezable_wq);
#define CREATE_TRACE_POINTS
#include <trace/events/workqueue.h>
@@ -316,6 +318,11 @@ static inline int __next_wq_cpu(int cpu, const struct cpumask *mask,
static struct debug_obj_descr work_debug_descr;
+static void *work_debug_hint(void *addr)
+{
+ return ((struct work_struct *) addr)->func;
+}
+
/*
* fixup_init is called when:
* - an active object is initialized
@@ -387,6 +394,7 @@ static int work_fixup_free(void *addr, enum debug_obj_state state)
static struct debug_obj_descr work_debug_descr = {
.name = "work_struct",
+ .debug_hint = work_debug_hint,
.fixup_init = work_fixup_init,
.fixup_activate = work_fixup_activate,
.fixup_free = work_fixup_free,
@@ -1283,8 +1291,14 @@ __acquires(&gcwq->lock)
return true;
spin_unlock_irq(&gcwq->lock);
- /* CPU has come up inbetween, retry migration */
+ /*
+ * We've raced with CPU hot[un]plug. Give it a breather
+ * and retry migration. cond_resched() is required here;
+ * otherwise, we might deadlock against cpu_stop trying to
+ * bring down the CPU on non-preemptive kernel.
+ */
cpu_relax();
+ cond_resched();
}
}
@@ -1358,8 +1372,10 @@ static struct worker *create_worker(struct global_cwq *gcwq, bool bind)
worker->id = id;
if (!on_unbound_cpu)
- worker->task = kthread_create(worker_thread, worker,
- "kworker/%u:%d", gcwq->cpu, id);
+ worker->task = kthread_create_on_node(worker_thread,
+ worker,
+ cpu_to_node(gcwq->cpu),
+ "kworker/%u:%d", gcwq->cpu, id);
else
worker->task = kthread_create(worker_thread, worker,
"kworker/u:%d", id);
@@ -2850,9 +2866,7 @@ static int alloc_cwqs(struct workqueue_struct *wq)
}
}
- /* just in case, make sure it's actually aligned
- * - this is affected by PERCPU() alignment in vmlinux.lds.S
- */
+ /* just in case, make sure it's actually aligned */
BUG_ON(!IS_ALIGNED(wq->cpu_wq.v, align));
return wq->cpu_wq.v ? 0 : -ENOMEM;
}
@@ -3775,8 +3789,10 @@ static int __init init_workqueues(void)
system_nrt_wq = alloc_workqueue("events_nrt", WQ_NON_REENTRANT, 0);
system_unbound_wq = alloc_workqueue("events_unbound", WQ_UNBOUND,
WQ_UNBOUND_MAX_ACTIVE);
+ system_freezable_wq = alloc_workqueue("events_freezable",
+ WQ_FREEZABLE, 0);
BUG_ON(!system_wq || !system_long_wq || !system_nrt_wq ||
- !system_unbound_wq);
+ !system_unbound_wq || !system_freezable_wq);
return 0;
}
early_initcall(init_workqueues);