summaryrefslogtreecommitdiff
path: root/kernel
diff options
context:
space:
mode:
Diffstat (limited to 'kernel')
-rw-r--r--kernel/.gitignore1
-rw-r--r--kernel/async.c40
-rw-r--r--kernel/audit.c12
-rw-r--r--kernel/audit.h3
-rw-r--r--kernel/audit_tree.c2
-rw-r--r--kernel/auditfilter.c4
-rw-r--r--kernel/auditsc.c14
-rw-r--r--kernel/capability.c24
-rw-r--r--kernel/cgroup.c53
-rw-r--r--kernel/cpuset.c28
-rw-r--r--kernel/events/core.c16
-rw-r--r--kernel/events/internal.h2
-rw-r--r--kernel/events/ring_buffer.c22
-rw-r--r--kernel/exit.c2
-rw-r--r--kernel/fork.c5
-rw-r--r--kernel/futex.c46
-rw-r--r--kernel/hrtimer.c3
-rw-r--r--kernel/kexec.c131
-rw-r--r--kernel/kprobes.c19
-rw-r--r--kernel/kthread.c92
-rw-r--r--kernel/lockdep.c46
-rw-r--r--kernel/mutex.c151
-rw-r--r--kernel/pid_namespace.c3
-rw-r--r--kernel/printk.c80
-rw-r--r--kernel/resource.c198
-rw-r--r--kernel/rtmutex-tester.c5
-rw-r--r--kernel/sched/clock.c26
-rw-r--r--kernel/sched/core.c62
-rw-r--r--kernel/sched/cputime.c2
-rw-r--r--kernel/sched/features.h7
-rw-r--r--kernel/signal.c7
-rw-r--r--kernel/smpboot.c18
-rw-r--r--kernel/softirq.c21
-rw-r--r--kernel/stop_machine.c2
-rw-r--r--kernel/sys.c60
-rw-r--r--kernel/sysctl.c15
-rw-r--r--kernel/time/tick-broadcast.c3
-rw-r--r--kernel/trace/Kconfig73
-rw-r--r--kernel/trace/blktrace.c30
-rw-r--r--kernel/trace/ftrace.c154
-rw-r--r--kernel/trace/ring_buffer.c500
-rw-r--r--kernel/trace/trace.c2261
-rw-r--r--kernel/trace/trace.h146
-rw-r--r--kernel/trace/trace_branch.c8
-rw-r--r--kernel/trace/trace_clock.c10
-rw-r--r--kernel/trace/trace_entries.h23
-rw-r--r--kernel/trace/trace_events.c1397
-rw-r--r--kernel/trace/trace_events_filter.c34
-rw-r--r--kernel/trace/trace_export.c4
-rw-r--r--kernel/trace/trace_functions.c207
-rw-r--r--kernel/trace/trace_functions_graph.c12
-rw-r--r--kernel/trace/trace_irqsoff.c90
-rw-r--r--kernel/trace/trace_kdb.c12
-rw-r--r--kernel/trace/trace_mmiotrace.c12
-rw-r--r--kernel/trace/trace_output.c119
-rw-r--r--kernel/trace/trace_output.h4
-rw-r--r--kernel/trace/trace_sched_switch.c8
-rw-r--r--kernel/trace/trace_sched_wakeup.c93
-rw-r--r--kernel/trace/trace_selftest.c51
-rw-r--r--kernel/trace/trace_stack.c78
-rw-r--r--kernel/trace/trace_stat.c2
-rw-r--r--kernel/trace/trace_syscalls.c90
-rw-r--r--kernel/tracepoint.c21
-rw-r--r--kernel/user.c2
-rw-r--r--kernel/user_namespace.c37
-rw-r--r--kernel/workqueue.c2867
-rw-r--r--kernel/workqueue_internal.h9
67 files changed, 6969 insertions, 2610 deletions
diff --git a/kernel/.gitignore b/kernel/.gitignore
index ab4f1090f437..b3097bde4e9c 100644
--- a/kernel/.gitignore
+++ b/kernel/.gitignore
@@ -4,3 +4,4 @@
config_data.h
config_data.gz
timeconst.h
+hz.bc
diff --git a/kernel/async.c b/kernel/async.c
index 8ddee2c3e5b0..61f023ce0228 100644
--- a/kernel/async.c
+++ b/kernel/async.c
@@ -73,7 +73,7 @@ struct async_entry {
struct list_head global_list;
struct work_struct work;
async_cookie_t cookie;
- async_func_ptr *func;
+ async_func_t func;
void *data;
struct async_domain *domain;
};
@@ -84,24 +84,20 @@ static atomic_t entry_count;
static async_cookie_t lowest_in_progress(struct async_domain *domain)
{
- struct async_entry *first = NULL;
+ struct list_head *pending;
async_cookie_t ret = ASYNC_COOKIE_MAX;
unsigned long flags;
spin_lock_irqsave(&async_lock, flags);
- if (domain) {
- if (!list_empty(&domain->pending))
- first = list_first_entry(&domain->pending,
- struct async_entry, domain_list);
- } else {
- if (!list_empty(&async_global_pending))
- first = list_first_entry(&async_global_pending,
- struct async_entry, global_list);
- }
+ if (domain)
+ pending = &domain->pending;
+ else
+ pending = &async_global_pending;
- if (first)
- ret = first->cookie;
+ if (!list_empty(pending))
+ ret = list_first_entry(pending, struct async_entry,
+ domain_list)->cookie;
spin_unlock_irqrestore(&async_lock, flags);
return ret;
@@ -149,7 +145,7 @@ static void async_run_entry_fn(struct work_struct *work)
wake_up(&async_done);
}
-static async_cookie_t __async_schedule(async_func_ptr *ptr, void *data, struct async_domain *domain)
+static async_cookie_t __async_schedule(async_func_t func, void *data, struct async_domain *domain)
{
struct async_entry *entry;
unsigned long flags;
@@ -169,13 +165,13 @@ static async_cookie_t __async_schedule(async_func_ptr *ptr, void *data, struct a
spin_unlock_irqrestore(&async_lock, flags);
/* low on memory.. run synchronously */
- ptr(data, newcookie);
+ func(data, newcookie);
return newcookie;
}
INIT_LIST_HEAD(&entry->domain_list);
INIT_LIST_HEAD(&entry->global_list);
INIT_WORK(&entry->work, async_run_entry_fn);
- entry->func = ptr;
+ entry->func = func;
entry->data = data;
entry->domain = domain;
@@ -202,21 +198,21 @@ static async_cookie_t __async_schedule(async_func_ptr *ptr, void *data, struct a
/**
* async_schedule - schedule a function for asynchronous execution
- * @ptr: function to execute asynchronously
+ * @func: function to execute asynchronously
* @data: data pointer to pass to the function
*
* Returns an async_cookie_t that may be used for checkpointing later.
* Note: This function may be called from atomic or non-atomic contexts.
*/
-async_cookie_t async_schedule(async_func_ptr *ptr, void *data)
+async_cookie_t async_schedule(async_func_t func, void *data)
{
- return __async_schedule(ptr, data, &async_dfl_domain);
+ return __async_schedule(func, data, &async_dfl_domain);
}
EXPORT_SYMBOL_GPL(async_schedule);
/**
* async_schedule_domain - schedule a function for asynchronous execution within a certain domain
- * @ptr: function to execute asynchronously
+ * @func: function to execute asynchronously
* @data: data pointer to pass to the function
* @domain: the domain
*
@@ -226,10 +222,10 @@ EXPORT_SYMBOL_GPL(async_schedule);
* synchronization domain is specified via @domain. Note: This function
* may be called from atomic or non-atomic contexts.
*/
-async_cookie_t async_schedule_domain(async_func_ptr *ptr, void *data,
+async_cookie_t async_schedule_domain(async_func_t func, void *data,
struct async_domain *domain)
{
- return __async_schedule(ptr, data, domain);
+ return __async_schedule(func, data, domain);
}
EXPORT_SYMBOL_GPL(async_schedule_domain);
diff --git a/kernel/audit.c b/kernel/audit.c
index d596e5355f15..9816a1b96cfc 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -660,14 +660,14 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
/* As soon as there's any sign of userspace auditd,
* start kauditd to talk to it */
- if (!kauditd_task)
+ if (!kauditd_task) {
kauditd_task = kthread_run(kauditd_thread, NULL, "kauditd");
- if (IS_ERR(kauditd_task)) {
- err = PTR_ERR(kauditd_task);
- kauditd_task = NULL;
- return err;
+ if (IS_ERR(kauditd_task)) {
+ err = PTR_ERR(kauditd_task);
+ kauditd_task = NULL;
+ return err;
+ }
}
-
loginuid = audit_get_loginuid(current);
sessionid = audit_get_sessionid(current);
security_task_getsecid(current, &sid);
diff --git a/kernel/audit.h b/kernel/audit.h
index d51cba868e1b..11468d99dad0 100644
--- a/kernel/audit.h
+++ b/kernel/audit.h
@@ -59,10 +59,7 @@ struct audit_entry {
struct audit_krule rule;
};
-#ifdef CONFIG_AUDIT
-extern int audit_enabled;
extern int audit_ever_enabled;
-#endif
extern int audit_pid;
diff --git a/kernel/audit_tree.c b/kernel/audit_tree.c
index 642a89c4f3d6..a291aa23fb3f 100644
--- a/kernel/audit_tree.c
+++ b/kernel/audit_tree.c
@@ -617,9 +617,9 @@ void audit_trim_trees(void)
}
spin_unlock(&hash_lock);
trim_marked(tree);
- put_tree(tree);
drop_collected_mounts(root_mnt);
skip_it:
+ put_tree(tree);
mutex_lock(&audit_filter_mutex);
}
list_del(&cursor);
diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c
index f9fc54bbe06f..267436826c3b 100644
--- a/kernel/auditfilter.c
+++ b/kernel/auditfilter.c
@@ -594,6 +594,10 @@ exit_nofree:
return entry;
exit_free:
+ if (entry->rule.watch)
+ audit_put_watch(entry->rule.watch); /* matches initial get */
+ if (entry->rule.tree)
+ audit_put_tree(entry->rule.tree); /* that's the temporary one */
audit_free_rule(entry);
return ERR_PTR(err);
}
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index a371f857a0a9..c68229411a7c 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -1034,21 +1034,15 @@ static inline void audit_free_aux(struct audit_context *context)
}
}
-static inline void audit_zero_context(struct audit_context *context,
- enum audit_state state)
-{
- memset(context, 0, sizeof(*context));
- context->state = state;
- context->prio = state == AUDIT_RECORD_CONTEXT ? ~0ULL : 0;
-}
-
static inline struct audit_context *audit_alloc_context(enum audit_state state)
{
struct audit_context *context;
- if (!(context = kmalloc(sizeof(*context), GFP_KERNEL)))
+ context = kzalloc(sizeof(*context), GFP_KERNEL);
+ if (!context)
return NULL;
- audit_zero_context(context, state);
+ context->state = state;
+ context->prio = state == AUDIT_RECORD_CONTEXT ? ~0ULL : 0;
INIT_LIST_HEAD(&context->killed_trees);
INIT_LIST_HEAD(&context->names_list);
return context;
diff --git a/kernel/capability.c b/kernel/capability.c
index 493d97259484..f6c2ce5701e1 100644
--- a/kernel/capability.c
+++ b/kernel/capability.c
@@ -393,6 +393,30 @@ bool ns_capable(struct user_namespace *ns, int cap)
EXPORT_SYMBOL(ns_capable);
/**
+ * file_ns_capable - Determine if the file's opener had a capability in effect
+ * @file: The file we want to check
+ * @ns: The usernamespace we want the capability in
+ * @cap: The capability to be tested for
+ *
+ * Return true if task that opened the file had a capability in effect
+ * when the file was opened.
+ *
+ * This does not set PF_SUPERPRIV because the caller may not
+ * actually be privileged.
+ */
+bool file_ns_capable(const struct file *file, struct user_namespace *ns, int cap)
+{
+ if (WARN_ON_ONCE(!cap_valid(cap)))
+ return false;
+
+ if (security_capable(file->f_cred, ns, cap) == 0)
+ return true;
+
+ return false;
+}
+EXPORT_SYMBOL(file_ns_capable);
+
+/**
* capable - Determine if the current task has a superior capability in effect
* @cap: The capability to be tested for
*
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 3f14a1310d23..eeb7e49946b2 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -2137,11 +2137,11 @@ retry_find_task:
tsk = tsk->group_leader;
/*
- * Workqueue threads may acquire PF_THREAD_BOUND and become
+ * Workqueue threads may acquire PF_NO_SETAFFINITY and become
* trapped in a cpuset, or RT worker may be born in a cgroup
* with no rt_runtime allocated. Just say no.
*/
- if (tsk == kthreadd_task || (tsk->flags & PF_THREAD_BOUND)) {
+ if (tsk == kthreadd_task || (tsk->flags & PF_NO_SETAFFINITY)) {
ret = -EINVAL;
rcu_read_unlock();
goto out_unlock_cgroup;
@@ -5290,55 +5290,6 @@ struct cgroup_subsys_state *css_lookup(struct cgroup_subsys *ss, int id)
}
EXPORT_SYMBOL_GPL(css_lookup);
-/**
- * css_get_next - lookup next cgroup under specified hierarchy.
- * @ss: pointer to subsystem
- * @id: current position of iteration.
- * @root: pointer to css. search tree under this.
- * @foundid: position of found object.
- *
- * Search next css under the specified hierarchy of rootid. Calling under
- * rcu_read_lock() is necessary. Returns NULL if it reaches the end.
- */
-struct cgroup_subsys_state *
-css_get_next(struct cgroup_subsys *ss, int id,
- struct cgroup_subsys_state *root, int *foundid)
-{
- struct cgroup_subsys_state *ret = NULL;
- struct css_id *tmp;
- int tmpid;
- int rootid = css_id(root);
- int depth = css_depth(root);
-
- if (!rootid)
- return NULL;
-
- BUG_ON(!ss->use_id);
- WARN_ON_ONCE(!rcu_read_lock_held());
-
- /* fill start point for scan */
- tmpid = id;
- while (1) {
- /*
- * scan next entry from bitmap(tree), tmpid is updated after
- * idr_get_next().
- */
- tmp = idr_get_next(&ss->idr, &tmpid);
- if (!tmp)
- break;
- if (tmp->depth >= depth && tmp->stack[depth] == rootid) {
- ret = rcu_dereference(tmp->css);
- if (ret) {
- *foundid = tmpid;
- break;
- }
- }
- /* continue to scan from next id */
- tmpid = tmpid + 1;
- }
- return ret;
-}
-
/*
* get corresponding css from file open on cgroupfs directory
*/
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 9bd30b9c430c..12331120767c 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -1378,16 +1378,16 @@ static int cpuset_can_attach(struct cgroup *cgrp, struct cgroup_taskset *tset)
cgroup_taskset_for_each(task, cgrp, tset) {
/*
- * Kthreads bound to specific cpus cannot be moved to a new
- * cpuset; we cannot change their cpu affinity and
- * isolating such threads by their set of allowed nodes is
- * unnecessary. Thus, cpusets are not applicable for such
- * threads. This prevents checking for success of
- * set_cpus_allowed_ptr() on all attached tasks before
- * cpus_allowed may be changed.
+ * Kthreads which disallow setaffinity shouldn't be moved
+ * to a new cpuset; we don't want to change their cpu
+ * affinity and isolating such threads by their set of
+ * allowed nodes is unnecessary. Thus, cpusets are not
+ * applicable for such threads. This prevents checking for
+ * success of set_cpus_allowed_ptr() on all attached tasks
+ * before cpus_allowed may be changed.
*/
ret = -EINVAL;
- if (task->flags & PF_THREAD_BOUND)
+ if (task->flags & PF_NO_SETAFFINITY)
goto out_unlock;
ret = security_task_setscheduler(task);
if (ret)
@@ -2193,7 +2193,6 @@ void cpuset_update_active_cpus(bool cpu_online)
schedule_work(&cpuset_hotplug_work);
}
-#ifdef CONFIG_MEMORY_HOTPLUG
/*
* Keep top_cpuset.mems_allowed tracking node_states[N_MEMORY].
* Call this routine anytime after node_states[N_MEMORY] changes.
@@ -2205,20 +2204,23 @@ static int cpuset_track_online_nodes(struct notifier_block *self,
schedule_work(&cpuset_hotplug_work);
return NOTIFY_OK;
}
-#endif
+
+static struct notifier_block cpuset_track_online_nodes_nb = {
+ .notifier_call = cpuset_track_online_nodes,
+ .priority = 10, /* ??! */
+};
/**
* cpuset_init_smp - initialize cpus_allowed
*
* Description: Finish top cpuset after cpu, node maps are initialized
- **/
-
+ */
void __init cpuset_init_smp(void)
{
cpumask_copy(top_cpuset.cpus_allowed, cpu_active_mask);
top_cpuset.mems_allowed = node_states[N_MEMORY];
- hotplug_memory_notifier(cpuset_track_online_nodes, 10);
+ register_hotmemory_notifier(&cpuset_track_online_nodes_nb);
cpuset_propagate_hotplug_wq =
alloc_ordered_workqueue("cpuset_hotplug", 0);
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 310ec19d968a..dce6e13cf9d7 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -4449,12 +4449,15 @@ static void perf_event_task_event(struct perf_task_event *task_event)
if (ctxn < 0)
goto next;
ctx = rcu_dereference(current->perf_event_ctxp[ctxn]);
+ if (ctx)
+ perf_event_task_ctx(ctx, task_event);
}
- if (ctx)
- perf_event_task_ctx(ctx, task_event);
next:
put_cpu_ptr(pmu->pmu_cpu_context);
}
+ if (task_event->task_ctx)
+ perf_event_task_ctx(task_event->task_ctx, task_event);
+
rcu_read_unlock();
}
@@ -4608,6 +4611,7 @@ void perf_event_comm(struct task_struct *task)
struct perf_event_context *ctx;
int ctxn;
+ rcu_read_lock();
for_each_task_context_nr(ctxn) {
ctx = task->perf_event_ctxp[ctxn];
if (!ctx)
@@ -4615,6 +4619,7 @@ void perf_event_comm(struct task_struct *task)
perf_event_enable_on_exec(ctx);
}
+ rcu_read_unlock();
if (!atomic_read(&nr_comm_events))
return;
@@ -4749,7 +4754,8 @@ static void perf_event_mmap_event(struct perf_mmap_event *mmap_event)
} else {
if (arch_vma_name(mmap_event->vma)) {
name = strncpy(tmp, arch_vma_name(mmap_event->vma),
- sizeof(tmp));
+ sizeof(tmp) - 1);
+ tmp[sizeof(tmp) - 1] = '\0';
goto got_name;
}
@@ -5342,7 +5348,7 @@ static void sw_perf_event_destroy(struct perf_event *event)
static int perf_swevent_init(struct perf_event *event)
{
- int event_id = event->attr.config;
+ u64 event_id = event->attr.config;
if (event->attr.type != PERF_TYPE_SOFTWARE)
return -ENOENT;
@@ -5662,6 +5668,7 @@ static void perf_swevent_init_hrtimer(struct perf_event *event)
event->attr.sample_period = NSEC_PER_SEC / freq;
hwc->sample_period = event->attr.sample_period;
local64_set(&hwc->period_left, hwc->sample_period);
+ hwc->last_period = hwc->sample_period;
event->attr.freq = 0;
}
}
@@ -5997,6 +6004,7 @@ skip_type:
if (pmu->pmu_cpu_context)
goto got_cpu_context;
+ ret = -ENOMEM;
pmu->pmu_cpu_context = alloc_percpu(struct perf_cpu_context);
if (!pmu->pmu_cpu_context)
goto free_dev;
diff --git a/kernel/events/internal.h b/kernel/events/internal.h
index d56a64c99a8b..eb675c4d59df 100644
--- a/kernel/events/internal.h
+++ b/kernel/events/internal.h
@@ -16,7 +16,7 @@ struct ring_buffer {
int page_order; /* allocation order */
#endif
int nr_pages; /* nr of data pages */
- int writable; /* are we writable */
+ int overwrite; /* can overwrite itself */
atomic_t poll; /* POLL_ for wakeups */
diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c
index 23cb34ff3973..97fddb09762b 100644
--- a/kernel/events/ring_buffer.c
+++ b/kernel/events/ring_buffer.c
@@ -18,12 +18,24 @@
static bool perf_output_space(struct ring_buffer *rb, unsigned long tail,
unsigned long offset, unsigned long head)
{
- unsigned long mask;
+ unsigned long sz = perf_data_size(rb);
+ unsigned long mask = sz - 1;
- if (!rb->writable)
+ /*
+ * check if user-writable
+ * overwrite : over-write its own tail
+ * !overwrite: buffer possibly drops events.
+ */
+ if (rb->overwrite)
return true;
- mask = perf_data_size(rb) - 1;
+ /*
+ * verify that payload is not bigger than buffer
+ * otherwise masking logic may fail to detect
+ * the "not enough space" condition
+ */
+ if ((head - offset) > sz)
+ return false;
offset = (offset - tail) & mask;
head = (head - tail) & mask;
@@ -212,7 +224,9 @@ ring_buffer_init(struct ring_buffer *rb, long watermark, int flags)
rb->watermark = max_size / 2;
if (flags & RING_BUFFER_WRITABLE)
- rb->writable = 1;
+ rb->overwrite = 0;
+ else
+ rb->overwrite = 1;
atomic_set(&rb->refcount, 1);
diff --git a/kernel/exit.c b/kernel/exit.c
index 51e485ca9935..60bc027c61c3 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -835,7 +835,7 @@ void do_exit(long code)
/*
* Make sure we are holding no locks:
*/
- debug_check_no_locks_held();
+ debug_check_no_locks_held(tsk);
/*
* We can do this unlocked here. The futex code uses this flag
* just to verify whether the pi state cleanup has been done
diff --git a/kernel/fork.c b/kernel/fork.c
index 8d932b1c9056..1766d324d5e3 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1141,6 +1141,9 @@ static struct task_struct *copy_process(unsigned long clone_flags,
if ((clone_flags & (CLONE_NEWNS|CLONE_FS)) == (CLONE_NEWNS|CLONE_FS))
return ERR_PTR(-EINVAL);
+ if ((clone_flags & (CLONE_NEWUSER|CLONE_FS)) == (CLONE_NEWUSER|CLONE_FS))
+ return ERR_PTR(-EINVAL);
+
/*
* Thread groups must share signals as well, and detached threads
* can only be started up within the thread group.
@@ -1807,7 +1810,7 @@ SYSCALL_DEFINE1(unshare, unsigned long, unshare_flags)
* If unsharing a user namespace must also unshare the thread.
*/
if (unshare_flags & CLONE_NEWUSER)
- unshare_flags |= CLONE_THREAD;
+ unshare_flags |= CLONE_THREAD | CLONE_FS;
/*
* If unsharing a pid namespace must also unshare the thread.
*/
diff --git a/kernel/futex.c b/kernel/futex.c
index f0090a993dab..b26dcfc02c94 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -223,7 +223,8 @@ static void drop_futex_key_refs(union futex_key *key)
* @rw: mapping needs to be read/write (values: VERIFY_READ,
* VERIFY_WRITE)
*
- * Returns a negative error code or 0
+ * Return: a negative error code or 0
+ *
* The key words are stored in *key on success.
*
* For shared mappings, it's (page->index, file_inode(vma->vm_file),
@@ -705,9 +706,9 @@ lookup_pi_state(u32 uval, struct futex_hash_bucket *hb,
* be "current" except in the case of requeue pi.
* @set_waiters: force setting the FUTEX_WAITERS bit (1) or not (0)
*
- * Returns:
- * 0 - ready to wait
- * 1 - acquired the lock
+ * Return:
+ * 0 - ready to wait;
+ * 1 - acquired the lock;
* <0 - error
*
* The hb->lock and futex_key refs shall be held by the caller.
@@ -1191,9 +1192,9 @@ void requeue_pi_wake_futex(struct futex_q *q, union futex_key *key,
* then direct futex_lock_pi_atomic() to force setting the FUTEX_WAITERS bit.
* hb1 and hb2 must be held by the caller.
*
- * Returns:
- * 0 - failed to acquire the lock atomicly
- * 1 - acquired the lock
+ * Return:
+ * 0 - failed to acquire the lock atomically;
+ * 1 - acquired the lock;
* <0 - error
*/
static int futex_proxy_trylock_atomic(u32 __user *pifutex,
@@ -1254,8 +1255,8 @@ static int futex_proxy_trylock_atomic(u32 __user *pifutex,
* Requeue waiters on uaddr1 to uaddr2. In the requeue_pi case, try to acquire
* uaddr2 atomically on behalf of the top waiter.
*
- * Returns:
- * >=0 - on success, the number of tasks requeued or woken
+ * Return:
+ * >=0 - on success, the number of tasks requeued or woken;
* <0 - on error
*/
static int futex_requeue(u32 __user *uaddr1, unsigned int flags,
@@ -1536,8 +1537,8 @@ static inline void queue_me(struct futex_q *q, struct futex_hash_bucket *hb)
* The q->lock_ptr must not be held by the caller. A call to unqueue_me() must
* be paired with exactly one earlier call to queue_me().
*
- * Returns:
- * 1 - if the futex_q was still queued (and we removed unqueued it)
+ * Return:
+ * 1 - if the futex_q was still queued (and we removed unqueued it);
* 0 - if the futex_q was already removed by the waking thread
*/
static int unqueue_me(struct futex_q *q)
@@ -1707,9 +1708,9 @@ static long futex_wait_restart(struct restart_block *restart);
* the pi_state owner as well as handle race conditions that may allow us to
* acquire the lock. Must be called with the hb lock held.
*
- * Returns:
- * 1 - success, lock taken
- * 0 - success, lock not taken
+ * Return:
+ * 1 - success, lock taken;
+ * 0 - success, lock not taken;
* <0 - on error (-EFAULT)
*/
static int fixup_owner(u32 __user *uaddr, struct futex_q *q, int locked)
@@ -1824,8 +1825,8 @@ static void futex_wait_queue_me(struct futex_hash_bucket *hb, struct futex_q *q,
* Return with the hb lock held and a q.key reference on success, and unlocked
* with no q.key reference on failure.
*
- * Returns:
- * 0 - uaddr contains val and hb has been locked
+ * Return:
+ * 0 - uaddr contains val and hb has been locked;
* <1 - -EFAULT or -EWOULDBLOCK (uaddr does not contain val) and hb is unlocked
*/
static int futex_wait_setup(u32 __user *uaddr, u32 val, unsigned int flags,
@@ -2203,9 +2204,9 @@ pi_faulted:
* the wakeup and return the appropriate error code to the caller. Must be
* called with the hb lock held.
*
- * Returns
- * 0 - no early wakeup detected
- * <0 - -ETIMEDOUT or -ERESTARTNOINTR
+ * Return:
+ * 0 = no early wakeup detected;
+ * <0 = -ETIMEDOUT or -ERESTARTNOINTR
*/
static inline
int handle_early_requeue_pi_wakeup(struct futex_hash_bucket *hb,
@@ -2247,7 +2248,6 @@ int handle_early_requeue_pi_wakeup(struct futex_hash_bucket *hb,
* @val: the expected value of uaddr
* @abs_time: absolute timeout
* @bitset: 32 bit wakeup bitset set by userspace, defaults to all
- * @clockrt: whether to use CLOCK_REALTIME (1) or CLOCK_MONOTONIC (0)
* @uaddr2: the pi futex we will take prior to returning to user-space
*
* The caller will wait on uaddr and will be requeued by futex_requeue() to
@@ -2258,7 +2258,7 @@ int handle_early_requeue_pi_wakeup(struct futex_hash_bucket *hb,
* there was a need to.
*
* We call schedule in futex_wait_queue_me() when we enqueue and return there
- * via the following:
+ * via the following--
* 1) wakeup on uaddr2 after an atomic lock acquisition by futex_requeue()
* 2) wakeup on uaddr2 after a requeue
* 3) signal
@@ -2276,8 +2276,8 @@ int handle_early_requeue_pi_wakeup(struct futex_hash_bucket *hb,
*
* If 4 or 7, we cleanup and return with -ETIMEDOUT.
*
- * Returns:
- * 0 - On success
+ * Return:
+ * 0 - On success;
* <0 - On error
*/
static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags,
diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c
index cc47812d3feb..14be27feda49 100644
--- a/kernel/hrtimer.c
+++ b/kernel/hrtimer.c
@@ -63,6 +63,7 @@
DEFINE_PER_CPU(struct hrtimer_cpu_base, hrtimer_bases) =
{
+ .lock = __RAW_SPIN_LOCK_UNLOCKED(hrtimer_bases.lock),
.clock_base =
{
{
@@ -1642,8 +1643,6 @@ static void __cpuinit init_hrtimers_cpu(int cpu)
struct hrtimer_cpu_base *cpu_base = &per_cpu(hrtimer_bases, cpu);
int i;
- raw_spin_lock_init(&cpu_base->lock);
-
for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) {
cpu_base->clock_base[i].cpu_base = cpu_base;
timerqueue_init_head(&cpu_base->clock_base[i].active);
diff --git a/kernel/kexec.c b/kernel/kexec.c
index bddd3d7a74b6..b574920cbd4b 100644
--- a/kernel/kexec.c
+++ b/kernel/kexec.c
@@ -55,7 +55,7 @@ struct resource crashk_res = {
.flags = IORESOURCE_BUSY | IORESOURCE_MEM
};
struct resource crashk_low_res = {
- .name = "Crash kernel low",
+ .name = "Crash kernel",
.start = 0,
.end = 0,
.flags = IORESOURCE_BUSY | IORESOURCE_MEM
@@ -1118,12 +1118,8 @@ void __weak crash_free_reserved_phys_range(unsigned long begin,
{
unsigned long addr;
- for (addr = begin; addr < end; addr += PAGE_SIZE) {
- ClearPageReserved(pfn_to_page(addr >> PAGE_SHIFT));
- init_page_count(pfn_to_page(addr >> PAGE_SHIFT));
- free_page((unsigned long)__va(addr));
- totalram_pages++;
- }
+ for (addr = begin; addr < end; addr += PAGE_SIZE)
+ free_reserved_page(pfn_to_page(addr >> PAGE_SHIFT));
}
int crash_shrink_memory(unsigned long new_size)
@@ -1368,35 +1364,114 @@ static int __init parse_crashkernel_simple(char *cmdline,
return 0;
}
+#define SUFFIX_HIGH 0
+#define SUFFIX_LOW 1
+#define SUFFIX_NULL 2
+static __initdata char *suffix_tbl[] = {
+ [SUFFIX_HIGH] = ",high",
+ [SUFFIX_LOW] = ",low",
+ [SUFFIX_NULL] = NULL,
+};
+
/*
- * That function is the entry point for command line parsing and should be
- * called from the arch-specific code.
+ * That function parses "suffix" crashkernel command lines like
+ *
+ * crashkernel=size,[high|low]
+ *
+ * It returns 0 on success and -EINVAL on failure.
*/
+static int __init parse_crashkernel_suffix(char *cmdline,
+ unsigned long long *crash_size,
+ unsigned long long *crash_base,
+ const char *suffix)
+{
+ char *cur = cmdline;
+
+ *crash_size = memparse(cmdline, &cur);
+ if (cmdline == cur) {
+ pr_warn("crashkernel: memory value expected\n");
+ return -EINVAL;
+ }
+
+ /* check with suffix */
+ if (strncmp(cur, suffix, strlen(suffix))) {
+ pr_warn("crashkernel: unrecognized char\n");
+ return -EINVAL;
+ }
+ cur += strlen(suffix);
+ if (*cur != ' ' && *cur != '\0') {
+ pr_warn("crashkernel: unrecognized char\n");
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+static __init char *get_last_crashkernel(char *cmdline,
+ const char *name,
+ const char *suffix)
+{
+ char *p = cmdline, *ck_cmdline = NULL;
+
+ /* find crashkernel and use the last one if there are more */
+ p = strstr(p, name);
+ while (p) {
+ char *end_p = strchr(p, ' ');
+ char *q;
+
+ if (!end_p)
+ end_p = p + strlen(p);
+
+ if (!suffix) {
+ int i;
+
+ /* skip the one with any known suffix */
+ for (i = 0; suffix_tbl[i]; i++) {
+ q = end_p - strlen(suffix_tbl[i]);
+ if (!strncmp(q, suffix_tbl[i],
+ strlen(suffix_tbl[i])))
+ goto next;
+ }
+ ck_cmdline = p;
+ } else {
+ q = end_p - strlen(suffix);
+ if (!strncmp(q, suffix, strlen(suffix)))
+ ck_cmdline = p;
+ }
+next:
+ p = strstr(p+1, name);
+ }
+
+ if (!ck_cmdline)
+ return NULL;
+
+ return ck_cmdline;
+}
+
static int __init __parse_crashkernel(char *cmdline,
unsigned long long system_ram,
unsigned long long *crash_size,
unsigned long long *crash_base,
- const char *name)
+ const char *name,
+ const char *suffix)
{
- char *p = cmdline, *ck_cmdline = NULL;
char *first_colon, *first_space;
+ char *ck_cmdline;
BUG_ON(!crash_size || !crash_base);
*crash_size = 0;
*crash_base = 0;
- /* find crashkernel and use the last one if there are more */
- p = strstr(p, name);
- while (p) {
- ck_cmdline = p;
- p = strstr(p+1, name);
- }
+ ck_cmdline = get_last_crashkernel(cmdline, name, suffix);
if (!ck_cmdline)
return -EINVAL;
ck_cmdline += strlen(name);
+ if (suffix)
+ return parse_crashkernel_suffix(ck_cmdline, crash_size,
+ crash_base, suffix);
/*
* if the commandline contains a ':', then that's the extended
* syntax -- if not, it must be the classic syntax
@@ -1413,13 +1488,26 @@ static int __init __parse_crashkernel(char *cmdline,
return 0;
}
+/*
+ * That function is the entry point for command line parsing and should be
+ * called from the arch-specific code.
+ */
int __init parse_crashkernel(char *cmdline,
unsigned long long system_ram,
unsigned long long *crash_size,
unsigned long long *crash_base)
{
return __parse_crashkernel(cmdline, system_ram, crash_size, crash_base,
- "crashkernel=");
+ "crashkernel=", NULL);
+}
+
+int __init parse_crashkernel_high(char *cmdline,
+ unsigned long long system_ram,
+ unsigned long long *crash_size,
+ unsigned long long *crash_base)
+{
+ return __parse_crashkernel(cmdline, system_ram, crash_size, crash_base,
+ "crashkernel=", suffix_tbl[SUFFIX_HIGH]);
}
int __init parse_crashkernel_low(char *cmdline,
@@ -1428,7 +1516,7 @@ int __init parse_crashkernel_low(char *cmdline,
unsigned long long *crash_base)
{
return __parse_crashkernel(cmdline, system_ram, crash_size, crash_base,
- "crashkernel_low=");
+ "crashkernel=", suffix_tbl[SUFFIX_LOW]);
}
static void update_vmcoreinfo_note(void)
@@ -1489,7 +1577,7 @@ static int __init crash_save_vmcoreinfo_init(void)
VMCOREINFO_SYMBOL(swapper_pg_dir);
#endif
VMCOREINFO_SYMBOL(_stext);
- VMCOREINFO_SYMBOL(vmlist);
+ VMCOREINFO_SYMBOL(vmap_area_list);
#ifndef CONFIG_NEED_MULTIPLE_NODES
VMCOREINFO_SYMBOL(mem_map);
@@ -1527,7 +1615,8 @@ static int __init crash_save_vmcoreinfo_init(void)
VMCOREINFO_OFFSET(free_area, free_list);
VMCOREINFO_OFFSET(list_head, next);
VMCOREINFO_OFFSET(list_head, prev);
- VMCOREINFO_OFFSET(vm_struct, addr);
+ VMCOREINFO_OFFSET(vmap_area, va_start);
+ VMCOREINFO_OFFSET(vmap_area, list);
VMCOREINFO_LENGTH(zone.free_area, MAX_ORDER);
log_buf_kexec_setup();
VMCOREINFO_LENGTH(free_area.free_list, MIGRATE_TYPES);
diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index e35be53f6613..3fed7f0cbcdf 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -794,16 +794,16 @@ out:
}
#ifdef CONFIG_SYSCTL
-/* This should be called with kprobe_mutex locked */
static void __kprobes optimize_all_kprobes(void)
{
struct hlist_head *head;
struct kprobe *p;
unsigned int i;
+ mutex_lock(&kprobe_mutex);
/* If optimization is already allowed, just return */
if (kprobes_allow_optimization)
- return;
+ goto out;
kprobes_allow_optimization = true;
for (i = 0; i < KPROBE_TABLE_SIZE; i++) {
@@ -813,18 +813,22 @@ static void __kprobes optimize_all_kprobes(void)
optimize_kprobe(p);
}
printk(KERN_INFO "Kprobes globally optimized\n");
+out:
+ mutex_unlock(&kprobe_mutex);
}
-/* This should be called with kprobe_mutex locked */
static void __kprobes unoptimize_all_kprobes(void)
{
struct hlist_head *head;
struct kprobe *p;
unsigned int i;
+ mutex_lock(&kprobe_mutex);
/* If optimization is already prohibited, just return */
- if (!kprobes_allow_optimization)
+ if (!kprobes_allow_optimization) {
+ mutex_unlock(&kprobe_mutex);
return;
+ }
kprobes_allow_optimization = false;
for (i = 0; i < KPROBE_TABLE_SIZE; i++) {
@@ -834,11 +838,14 @@ static void __kprobes unoptimize_all_kprobes(void)
unoptimize_kprobe(p, false);
}
}
+ mutex_unlock(&kprobe_mutex);
+
/* Wait for unoptimizing completion */
wait_for_kprobe_optimizer();
printk(KERN_INFO "Kprobes globally unoptimized\n");
}
+static DEFINE_MUTEX(kprobe_sysctl_mutex);
int sysctl_kprobes_optimization;
int proc_kprobes_optimization_handler(struct ctl_table *table, int write,
void __user *buffer, size_t *length,
@@ -846,7 +853,7 @@ int proc_kprobes_optimization_handler(struct ctl_table *table, int write,
{
int ret;
- mutex_lock(&kprobe_mutex);
+ mutex_lock(&kprobe_sysctl_mutex);
sysctl_kprobes_optimization = kprobes_allow_optimization ? 1 : 0;
ret = proc_dointvec_minmax(table, write, buffer, length, ppos);
@@ -854,7 +861,7 @@ int proc_kprobes_optimization_handler(struct ctl_table *table, int write,
optimize_all_kprobes();
else
unoptimize_all_kprobes();
- mutex_unlock(&kprobe_mutex);
+ mutex_unlock(&kprobe_sysctl_mutex);
return ret;
}
diff --git a/kernel/kthread.c b/kernel/kthread.c
index 691dc2ef9baf..16d8ddd268b1 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -52,8 +52,21 @@ enum KTHREAD_BITS {
KTHREAD_IS_PARKED,
};
-#define to_kthread(tsk) \
- container_of((tsk)->vfork_done, struct kthread, exited)
+#define __to_kthread(vfork) \
+ container_of(vfork, struct kthread, exited)
+
+static inline struct kthread *to_kthread(struct task_struct *k)
+{
+ return __to_kthread(k->vfork_done);
+}
+
+static struct kthread *to_live_kthread(struct task_struct *k)
+{
+ struct completion *vfork = ACCESS_ONCE(k->vfork_done);
+ if (likely(vfork))
+ return __to_kthread(vfork);
+ return NULL;
+}
/**
* kthread_should_stop - should this kthread return now?
@@ -124,12 +137,12 @@ void *kthread_data(struct task_struct *task)
static void __kthread_parkme(struct kthread *self)
{
- __set_current_state(TASK_INTERRUPTIBLE);
+ __set_current_state(TASK_PARKED);
while (test_bit(KTHREAD_SHOULD_PARK, &self->flags)) {
if (!test_and_set_bit(KTHREAD_IS_PARKED, &self->flags))
complete(&self->parked);
schedule();
- __set_current_state(TASK_INTERRUPTIBLE);
+ __set_current_state(TASK_PARKED);
}
clear_bit(KTHREAD_IS_PARKED, &self->flags);
__set_current_state(TASK_RUNNING);
@@ -256,11 +269,16 @@ struct task_struct *kthread_create_on_node(int (*threadfn)(void *data),
}
EXPORT_SYMBOL(kthread_create_on_node);
-static void __kthread_bind(struct task_struct *p, unsigned int cpu)
+static void __kthread_bind(struct task_struct *p, unsigned int cpu, long state)
{
+ /* Must have done schedule() in kthread() before we set_task_cpu */
+ if (!wait_task_inactive(p, state)) {
+ WARN_ON(1);
+ return;
+ }
/* It's safe because the task is inactive. */
do_set_cpus_allowed(p, cpumask_of(cpu));
- p->flags |= PF_THREAD_BOUND;
+ p->flags |= PF_NO_SETAFFINITY;
}
/**
@@ -274,12 +292,7 @@ static void __kthread_bind(struct task_struct *p, unsigned int cpu)
*/
void kthread_bind(struct task_struct *p, unsigned int cpu)
{
- /* Must have done schedule() in kthread() before we set_task_cpu */
- if (!wait_task_inactive(p, TASK_UNINTERRUPTIBLE)) {
- WARN_ON(1);
- return;
- }
- __kthread_bind(p, cpu);
+ __kthread_bind(p, cpu, TASK_UNINTERRUPTIBLE);
}
EXPORT_SYMBOL(kthread_bind);
@@ -311,17 +324,20 @@ struct task_struct *kthread_create_on_cpu(int (*threadfn)(void *data),
return p;
}
-static struct kthread *task_get_live_kthread(struct task_struct *k)
+static void __kthread_unpark(struct task_struct *k, struct kthread *kthread)
{
- struct kthread *kthread;
-
- get_task_struct(k);
- kthread = to_kthread(k);
- /* It might have exited */
- barrier();
- if (k->vfork_done != NULL)
- return kthread;
- return NULL;
+ clear_bit(KTHREAD_SHOULD_PARK, &kthread->flags);
+ /*
+ * We clear the IS_PARKED bit here as we don't wait
+ * until the task has left the park code. So if we'd
+ * park before that happens we'd see the IS_PARKED bit
+ * which might be about to be cleared.
+ */
+ if (test_and_clear_bit(KTHREAD_IS_PARKED, &kthread->flags)) {
+ if (test_bit(KTHREAD_IS_PER_CPU, &kthread->flags))
+ __kthread_bind(k, kthread->cpu, TASK_PARKED);
+ wake_up_state(k, TASK_PARKED);
+ }
}
/**
@@ -334,23 +350,10 @@ static struct kthread *task_get_live_kthread(struct task_struct *k)
*/
void kthread_unpark(struct task_struct *k)
{
- struct kthread *kthread = task_get_live_kthread(k);
+ struct kthread *kthread = to_live_kthread(k);
- if (kthread) {
- clear_bit(KTHREAD_SHOULD_PARK, &kthread->flags);
- /*
- * We clear the IS_PARKED bit here as we don't wait
- * until the task has left the park code. So if we'd
- * park before that happens we'd see the IS_PARKED bit
- * which might be about to be cleared.
- */
- if (test_and_clear_bit(KTHREAD_IS_PARKED, &kthread->flags)) {
- if (test_bit(KTHREAD_IS_PER_CPU, &kthread->flags))
- __kthread_bind(k, kthread->cpu);
- wake_up_process(k);
- }
- }
- put_task_struct(k);
+ if (kthread)
+ __kthread_unpark(k, kthread);
}
/**
@@ -367,7 +370,7 @@ void kthread_unpark(struct task_struct *k)
*/
int kthread_park(struct task_struct *k)
{
- struct kthread *kthread = task_get_live_kthread(k);
+ struct kthread *kthread = to_live_kthread(k);
int ret = -ENOSYS;
if (kthread) {
@@ -380,7 +383,6 @@ int kthread_park(struct task_struct *k)
}
ret = 0;
}
- put_task_struct(k);
return ret;
}
@@ -401,21 +403,23 @@ int kthread_park(struct task_struct *k)
*/
int kthread_stop(struct task_struct *k)
{
- struct kthread *kthread = task_get_live_kthread(k);
+ struct kthread *kthread;
int ret;
trace_sched_kthread_stop(k);
+
+ get_task_struct(k);
+ kthread = to_live_kthread(k);
if (kthread) {
set_bit(KTHREAD_SHOULD_STOP, &kthread->flags);
- clear_bit(KTHREAD_SHOULD_PARK, &kthread->flags);
+ __kthread_unpark(k, kthread);
wake_up_process(k);
wait_for_completion(&kthread->exited);
}
ret = k->exit_code;
-
put_task_struct(k);
- trace_sched_kthread_stop_ret(ret);
+ trace_sched_kthread_stop_ret(ret);
return ret;
}
EXPORT_SYMBOL(kthread_stop);
diff --git a/kernel/lockdep.c b/kernel/lockdep.c
index 259db207b5d9..6a3bccba7e7d 100644
--- a/kernel/lockdep.c
+++ b/kernel/lockdep.c
@@ -380,6 +380,13 @@ static int verbose(struct lock_class *class)
unsigned long nr_stack_trace_entries;
static unsigned long stack_trace[MAX_STACK_TRACE_ENTRIES];
+static void print_lockdep_off(const char *bug_msg)
+{
+ printk(KERN_DEBUG "%s\n", bug_msg);
+ printk(KERN_DEBUG "turning off the locking correctness validator.\n");
+ printk(KERN_DEBUG "Please attach the output of /proc/lock_stat to the bug report\n");
+}
+
static int save_trace(struct stack_trace *trace)
{
trace->nr_entries = 0;
@@ -409,8 +416,7 @@ static int save_trace(struct stack_trace *trace)
if (!debug_locks_off_graph_unlock())
return 0;
- printk("BUG: MAX_STACK_TRACE_ENTRIES too low!\n");
- printk("turning off the locking correctness validator.\n");
+ print_lockdep_off("BUG: MAX_STACK_TRACE_ENTRIES too low!");
dump_stack();
return 0;
@@ -763,8 +769,7 @@ register_lock_class(struct lockdep_map *lock, unsigned int subclass, int force)
}
raw_local_irq_restore(flags);
- printk("BUG: MAX_LOCKDEP_KEYS too low!\n");
- printk("turning off the locking correctness validator.\n");
+ print_lockdep_off("BUG: MAX_LOCKDEP_KEYS too low!");
dump_stack();
return NULL;
}
@@ -834,8 +839,7 @@ static struct lock_list *alloc_list_entry(void)
if (!debug_locks_off_graph_unlock())
return NULL;
- printk("BUG: MAX_LOCKDEP_ENTRIES too low!\n");
- printk("turning off the locking correctness validator.\n");
+ print_lockdep_off("BUG: MAX_LOCKDEP_ENTRIES too low!");
dump_stack();
return NULL;
}
@@ -2000,7 +2004,7 @@ static inline int lookup_chain_cache(struct task_struct *curr,
struct lock_class *class = hlock_class(hlock);
struct list_head *hash_head = chainhashentry(chain_key);
struct lock_chain *chain;
- struct held_lock *hlock_curr, *hlock_next;
+ struct held_lock *hlock_curr;
int i, j;
/*
@@ -2048,8 +2052,7 @@ cache_hit:
if (!debug_locks_off_graph_unlock())
return 0;
- printk("BUG: MAX_LOCKDEP_CHAINS too low!\n");
- printk("turning off the locking correctness validator.\n");
+ print_lockdep_off("BUG: MAX_LOCKDEP_CHAINS too low!");
dump_stack();
return 0;
}
@@ -2057,12 +2060,10 @@ cache_hit:
chain->chain_key = chain_key;
chain->irq_context = hlock->irq_context;
/* Find the first held_lock of current chain */
- hlock_next = hlock;
for (i = curr->lockdep_depth - 1; i >= 0; i--) {
hlock_curr = curr->held_locks + i;
- if (hlock_curr->irq_context != hlock_next->irq_context)
+ if (hlock_curr->irq_context != hlock->irq_context)
break;
- hlock_next = hlock;
}
i++;
chain->depth = curr->lockdep_depth + 1 - i;
@@ -3190,9 +3191,9 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass,
#endif
if (unlikely(curr->lockdep_depth >= MAX_LOCK_DEPTH)) {
debug_locks_off();
- printk("BUG: MAX_LOCK_DEPTH too low, depth: %i max: %lu!\n",
+ print_lockdep_off("BUG: MAX_LOCK_DEPTH too low!");
+ printk(KERN_DEBUG "depth: %i max: %lu!\n",
curr->lockdep_depth, MAX_LOCK_DEPTH);
- printk("turning off the locking correctness validator.\n");
lockdep_print_held_locks(current);
debug_show_all_locks();
@@ -4088,7 +4089,7 @@ void debug_check_no_locks_freed(const void *mem_from, unsigned long mem_len)
}
EXPORT_SYMBOL_GPL(debug_check_no_locks_freed);
-static void print_held_locks_bug(void)
+static void print_held_locks_bug(struct task_struct *curr)
{
if (!debug_locks_off())
return;
@@ -4097,21 +4098,22 @@ static void print_held_locks_bug(void)
printk("\n");
printk("=====================================\n");
- printk("[ BUG: %s/%d still has locks held! ]\n",
- current->comm, task_pid_nr(current));
+ printk("[ BUG: lock held at task exit time! ]\n");
print_kernel_ident();
printk("-------------------------------------\n");
- lockdep_print_held_locks(current);
+ printk("%s/%d is exiting with locks still held!\n",
+ curr->comm, task_pid_nr(curr));
+ lockdep_print_held_locks(curr);
+
printk("\nstack backtrace:\n");
dump_stack();
}
-void debug_check_no_locks_held(void)
+void debug_check_no_locks_held(struct task_struct *task)
{
- if (unlikely(current->lockdep_depth > 0))
- print_held_locks_bug();
+ if (unlikely(task->lockdep_depth > 0))
+ print_held_locks_bug(task);
}
-EXPORT_SYMBOL_GPL(debug_check_no_locks_held);
void debug_show_all_locks(void)
{
diff --git a/kernel/mutex.c b/kernel/mutex.c
index 52f23011b6e0..ad53a664f113 100644
--- a/kernel/mutex.c
+++ b/kernel/mutex.c
@@ -37,6 +37,12 @@
# include <asm/mutex.h>
#endif
+/*
+ * A negative mutex count indicates that waiters are sleeping waiting for the
+ * mutex.
+ */
+#define MUTEX_SHOW_NO_WAITER(mutex) (atomic_read(&(mutex)->count) >= 0)
+
void
__mutex_init(struct mutex *lock, const char *name, struct lock_class_key *key)
{
@@ -44,6 +50,9 @@ __mutex_init(struct mutex *lock, const char *name, struct lock_class_key *key)
spin_lock_init(&lock->wait_lock);
INIT_LIST_HEAD(&lock->wait_list);
mutex_clear_owner(lock);
+#ifdef CONFIG_MUTEX_SPIN_ON_OWNER
+ lock->spin_mlock = NULL;
+#endif
debug_mutex_init(lock, name, key);
}
@@ -95,6 +104,124 @@ void __sched mutex_lock(struct mutex *lock)
EXPORT_SYMBOL(mutex_lock);
#endif
+#ifdef CONFIG_MUTEX_SPIN_ON_OWNER
+/*
+ * In order to avoid a stampede of mutex spinners from acquiring the mutex
+ * more or less simultaneously, the spinners need to acquire a MCS lock
+ * first before spinning on the owner field.
+ *
+ * We don't inline mspin_lock() so that perf can correctly account for the
+ * time spent in this lock function.
+ */
+struct mspin_node {
+ struct mspin_node *next ;
+ int locked; /* 1 if lock acquired */
+};
+#define MLOCK(mutex) ((struct mspin_node **)&((mutex)->spin_mlock))
+
+static noinline
+void mspin_lock(struct mspin_node **lock, struct mspin_node *node)
+{
+ struct mspin_node *prev;
+
+ /* Init node */
+ node->locked = 0;
+ node->next = NULL;
+
+ prev = xchg(lock, node);
+ if (likely(prev == NULL)) {
+ /* Lock acquired */
+ node->locked = 1;
+ return;
+ }
+ ACCESS_ONCE(prev->next) = node;
+ smp_wmb();
+ /* Wait until the lock holder passes the lock down */
+ while (!ACCESS_ONCE(node->locked))
+ arch_mutex_cpu_relax();
+}
+
+static void mspin_unlock(struct mspin_node **lock, struct mspin_node *node)
+{
+ struct mspin_node *next = ACCESS_ONCE(node->next);
+
+ if (likely(!next)) {
+ /*
+ * Release the lock by setting it to NULL
+ */
+ if (cmpxchg(lock, node, NULL) == node)
+ return;
+ /* Wait until the next pointer is set */
+ while (!(next = ACCESS_ONCE(node->next)))
+ arch_mutex_cpu_relax();
+ }
+ ACCESS_ONCE(next->locked) = 1;
+ smp_wmb();
+}
+
+/*
+ * Mutex spinning code migrated from kernel/sched/core.c
+ */
+
+static inline bool owner_running(struct mutex *lock, struct task_struct *owner)
+{
+ if (lock->owner != owner)
+ return false;
+
+ /*
+ * Ensure we emit the owner->on_cpu, dereference _after_ checking
+ * lock->owner still matches owner, if that fails, owner might
+ * point to free()d memory, if it still matches, the rcu_read_lock()
+ * ensures the memory stays valid.
+ */
+ barrier();
+
+ return owner->on_cpu;
+}
+
+/*
+ * Look out! "owner" is an entirely speculative pointer
+ * access and not reliable.
+ */
+static noinline
+int mutex_spin_on_owner(struct mutex *lock, struct task_struct *owner)
+{
+ rcu_read_lock();
+ while (owner_running(lock, owner)) {
+ if (need_resched())
+ break;
+
+ arch_mutex_cpu_relax();
+ }
+ rcu_read_unlock();
+
+ /*
+ * We break out the loop above on need_resched() and when the
+ * owner changed, which is a sign for heavy contention. Return
+ * success only when lock->owner is NULL.
+ */
+ return lock->owner == NULL;
+}
+
+/*
+ * Initial check for entering the mutex spinning loop
+ */
+static inline int mutex_can_spin_on_owner(struct mutex *lock)
+{
+ int retval = 1;
+
+ rcu_read_lock();
+ if (lock->owner)
+ retval = lock->owner->on_cpu;
+ rcu_read_unlock();
+ /*
+ * if lock->owner is not set, the mutex owner may have just acquired
+ * it and not set the owner yet or the mutex has been released.
+ */
+ return retval;
+}
+#endif
+
static __used noinline void __sched __mutex_unlock_slowpath(atomic_t *lock_count);
/**
@@ -158,25 +285,39 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass,
*
* We can't do this for DEBUG_MUTEXES because that relies on wait_lock
* to serialize everything.
+ *
+ * The mutex spinners are queued up using MCS lock so that only one
+ * spinner can compete for the mutex. However, if mutex spinning isn't
+ * going to happen, there is no point in going through the lock/unlock
+ * overhead.
*/
+ if (!mutex_can_spin_on_owner(lock))
+ goto slowpath;
for (;;) {
struct task_struct *owner;
+ struct mspin_node node;
/*
* If there's an owner, wait for it to either
* release the lock or go to sleep.
*/
+ mspin_lock(MLOCK(lock), &node);
owner = ACCESS_ONCE(lock->owner);
- if (owner && !mutex_spin_on_owner(lock, owner))
+ if (owner && !mutex_spin_on_owner(lock, owner)) {
+ mspin_unlock(MLOCK(lock), &node);
break;
+ }
- if (atomic_cmpxchg(&lock->count, 1, 0) == 1) {
+ if ((atomic_read(&lock->count) == 1) &&
+ (atomic_cmpxchg(&lock->count, 1, 0) == 1)) {
lock_acquired(&lock->dep_map, ip);
mutex_set_owner(lock);
+ mspin_unlock(MLOCK(lock), &node);
preempt_enable();
return 0;
}
+ mspin_unlock(MLOCK(lock), &node);
/*
* When there's no owner, we might have preempted between the
@@ -195,6 +336,7 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass,
*/
arch_mutex_cpu_relax();
}
+slowpath:
#endif
spin_lock_mutex(&lock->wait_lock, flags);
@@ -205,7 +347,7 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass,
list_add_tail(&waiter.list, &lock->wait_list);
waiter.task = task;
- if (atomic_xchg(&lock->count, -1) == 1)
+ if (MUTEX_SHOW_NO_WAITER(lock) && (atomic_xchg(&lock->count, -1) == 1))
goto done;
lock_contended(&lock->dep_map, ip);
@@ -220,7 +362,8 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass,
* that when we release the lock, we properly wake up the
* other waiters:
*/
- if (atomic_xchg(&lock->count, -1) == 1)
+ if (MUTEX_SHOW_NO_WAITER(lock) &&
+ (atomic_xchg(&lock->count, -1) == 1))
break;
/*
diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c
index c1c3dc1c6023..bea15bdf82b0 100644
--- a/kernel/pid_namespace.c
+++ b/kernel/pid_namespace.c
@@ -181,6 +181,7 @@ void zap_pid_ns_processes(struct pid_namespace *pid_ns)
int nr;
int rc;
struct task_struct *task, *me = current;
+ int init_pids = thread_group_leader(me) ? 1 : 2;
/* Don't allow any more processes into the pid namespace */
disable_pid_allocation(pid_ns);
@@ -230,7 +231,7 @@ void zap_pid_ns_processes(struct pid_namespace *pid_ns)
*/
for (;;) {
set_current_state(TASK_UNINTERRUPTIBLE);
- if (pid_ns->nr_hashed == 1)
+ if (pid_ns->nr_hashed == init_pids)
break;
schedule();
}
diff --git a/kernel/printk.c b/kernel/printk.c
index 0b31715f335a..abbdd9e2ac82 100644
--- a/kernel/printk.c
+++ b/kernel/printk.c
@@ -63,8 +63,6 @@ void asmlinkage __attribute__((weak)) early_printk(const char *fmt, ...)
#define MINIMUM_CONSOLE_LOGLEVEL 1 /* Minimum loglevel we let people use */
#define DEFAULT_CONSOLE_LOGLEVEL 7 /* anything MORE serious than KERN_DEBUG */
-DECLARE_WAIT_QUEUE_HEAD(log_wait);
-
int console_printk[4] = {
DEFAULT_CONSOLE_LOGLEVEL, /* console_loglevel */
DEFAULT_MESSAGE_LOGLEVEL, /* default_message_loglevel */
@@ -224,6 +222,7 @@ struct log {
static DEFINE_RAW_SPINLOCK(logbuf_lock);
#ifdef CONFIG_PRINTK
+DECLARE_WAIT_QUEUE_HEAD(log_wait);
/* the next printk record to read by syslog(READ) or /proc/kmsg */
static u64 syslog_seq;
static u32 syslog_idx;
@@ -1957,45 +1956,6 @@ int is_console_locked(void)
return console_locked;
}
-/*
- * Delayed printk version, for scheduler-internal messages:
- */
-#define PRINTK_BUF_SIZE 512
-
-#define PRINTK_PENDING_WAKEUP 0x01
-#define PRINTK_PENDING_SCHED 0x02
-
-static DEFINE_PER_CPU(int, printk_pending);
-static DEFINE_PER_CPU(char [PRINTK_BUF_SIZE], printk_sched_buf);
-
-static void wake_up_klogd_work_func(struct irq_work *irq_work)
-{
- int pending = __this_cpu_xchg(printk_pending, 0);
-
- if (pending & PRINTK_PENDING_SCHED) {
- char *buf = __get_cpu_var(printk_sched_buf);
- printk(KERN_WARNING "[sched_delayed] %s", buf);
- }
-
- if (pending & PRINTK_PENDING_WAKEUP)
- wake_up_interruptible(&log_wait);
-}
-
-static DEFINE_PER_CPU(struct irq_work, wake_up_klogd_work) = {
- .func = wake_up_klogd_work_func,
- .flags = IRQ_WORK_LAZY,
-};
-
-void wake_up_klogd(void)
-{
- preempt_disable();
- if (waitqueue_active(&log_wait)) {
- this_cpu_or(printk_pending, PRINTK_PENDING_WAKEUP);
- irq_work_queue(&__get_cpu_var(wake_up_klogd_work));
- }
- preempt_enable();
-}
-
static void console_cont_flush(char *text, size_t size)
{
unsigned long flags;
@@ -2458,6 +2418,44 @@ static int __init printk_late_init(void)
late_initcall(printk_late_init);
#if defined CONFIG_PRINTK
+/*
+ * Delayed printk version, for scheduler-internal messages:
+ */
+#define PRINTK_BUF_SIZE 512
+
+#define PRINTK_PENDING_WAKEUP 0x01
+#define PRINTK_PENDING_SCHED 0x02
+
+static DEFINE_PER_CPU(int, printk_pending);
+static DEFINE_PER_CPU(char [PRINTK_BUF_SIZE], printk_sched_buf);
+
+static void wake_up_klogd_work_func(struct irq_work *irq_work)
+{
+ int pending = __this_cpu_xchg(printk_pending, 0);
+
+ if (pending & PRINTK_PENDING_SCHED) {
+ char *buf = __get_cpu_var(printk_sched_buf);
+ printk(KERN_WARNING "[sched_delayed] %s", buf);
+ }
+
+ if (pending & PRINTK_PENDING_WAKEUP)
+ wake_up_interruptible(&log_wait);
+}
+
+static DEFINE_PER_CPU(struct irq_work, wake_up_klogd_work) = {
+ .func = wake_up_klogd_work_func,
+ .flags = IRQ_WORK_LAZY,
+};
+
+void wake_up_klogd(void)
+{
+ preempt_disable();
+ if (waitqueue_active(&log_wait)) {
+ this_cpu_or(printk_pending, PRINTK_PENDING_WAKEUP);
+ irq_work_queue(&__get_cpu_var(wake_up_klogd_work));
+ }
+ preempt_enable();
+}
int printk_sched(const char *fmt, ...)
{
diff --git a/kernel/resource.c b/kernel/resource.c
index 73f35d4b30b9..d7386986e10e 100644
--- a/kernel/resource.c
+++ b/kernel/resource.c
@@ -21,6 +21,7 @@
#include <linux/seq_file.h>
#include <linux/device.h>
#include <linux/pfn.h>
+#include <linux/mm.h>
#include <asm/io.h>
@@ -50,6 +51,14 @@ struct resource_constraint {
static DEFINE_RWLOCK(resource_lock);
+/*
+ * For memory hotplug, there is no way to free resource entries allocated
+ * by boot mem after the system is up. So for reusing the resource entry
+ * we need to remember the resource.
+ */
+static struct resource *bootmem_resource_free;
+static DEFINE_SPINLOCK(bootmem_resource_lock);
+
static void *r_next(struct seq_file *m, void *v, loff_t *pos)
{
struct resource *p = v;
@@ -151,6 +160,40 @@ __initcall(ioresources_init);
#endif /* CONFIG_PROC_FS */
+static void free_resource(struct resource *res)
+{
+ if (!res)
+ return;
+
+ if (!PageSlab(virt_to_head_page(res))) {
+ spin_lock(&bootmem_resource_lock);
+ res->sibling = bootmem_resource_free;
+ bootmem_resource_free = res;
+ spin_unlock(&bootmem_resource_lock);
+ } else {
+ kfree(res);
+ }
+}
+
+static struct resource *alloc_resource(gfp_t flags)
+{
+ struct resource *res = NULL;
+
+ spin_lock(&bootmem_resource_lock);
+ if (bootmem_resource_free) {
+ res = bootmem_resource_free;
+ bootmem_resource_free = res->sibling;
+ }
+ spin_unlock(&bootmem_resource_lock);
+
+ if (res)
+ memset(res, 0, sizeof(struct resource));
+ else
+ res = kzalloc(sizeof(struct resource), flags);
+
+ return res;
+}
+
/* Return the conflict entry if you can't request it */
static struct resource * __request_resource(struct resource *root, struct resource *new)
{
@@ -706,24 +749,13 @@ void insert_resource_expand_to_fit(struct resource *root, struct resource *new)
write_unlock(&resource_lock);
}
-/**
- * adjust_resource - modify a resource's start and size
- * @res: resource to modify
- * @start: new start value
- * @size: new size
- *
- * Given an existing resource, change its start and size to match the
- * arguments. Returns 0 on success, -EBUSY if it can't fit.
- * Existing children of the resource are assumed to be immutable.
- */
-int adjust_resource(struct resource *res, resource_size_t start, resource_size_t size)
+static int __adjust_resource(struct resource *res, resource_size_t start,
+ resource_size_t size)
{
struct resource *tmp, *parent = res->parent;
resource_size_t end = start + size - 1;
int result = -EBUSY;
- write_lock(&resource_lock);
-
if (!parent)
goto skip;
@@ -751,6 +783,26 @@ skip:
result = 0;
out:
+ return result;
+}
+
+/**
+ * adjust_resource - modify a resource's start and size
+ * @res: resource to modify
+ * @start: new start value
+ * @size: new size
+ *
+ * Given an existing resource, change its start and size to match the
+ * arguments. Returns 0 on success, -EBUSY if it can't fit.
+ * Existing children of the resource are assumed to be immutable.
+ */
+int adjust_resource(struct resource *res, resource_size_t start,
+ resource_size_t size)
+{
+ int result;
+
+ write_lock(&resource_lock);
+ result = __adjust_resource(res, start, size);
write_unlock(&resource_lock);
return result;
}
@@ -762,7 +814,7 @@ static void __init __reserve_region_with_split(struct resource *root,
{
struct resource *parent = root;
struct resource *conflict;
- struct resource *res = kzalloc(sizeof(*res), GFP_ATOMIC);
+ struct resource *res = alloc_resource(GFP_ATOMIC);
struct resource *next_res = NULL;
if (!res)
@@ -787,7 +839,7 @@ static void __init __reserve_region_with_split(struct resource *root,
/* conflict covered whole area */
if (conflict->start <= res->start &&
conflict->end >= res->end) {
- kfree(res);
+ free_resource(res);
WARN_ON(next_res);
break;
}
@@ -797,10 +849,9 @@ static void __init __reserve_region_with_split(struct resource *root,
end = res->end;
res->end = conflict->start - 1;
if (conflict->end < end) {
- next_res = kzalloc(sizeof(*next_res),
- GFP_ATOMIC);
+ next_res = alloc_resource(GFP_ATOMIC);
if (!next_res) {
- kfree(res);
+ free_resource(res);
break;
}
next_res->name = name;
@@ -890,7 +941,7 @@ struct resource * __request_region(struct resource *parent,
const char *name, int flags)
{
DECLARE_WAITQUEUE(wait, current);
- struct resource *res = kzalloc(sizeof(*res), GFP_KERNEL);
+ struct resource *res = alloc_resource(GFP_KERNEL);
if (!res)
return NULL;
@@ -924,7 +975,7 @@ struct resource * __request_region(struct resource *parent,
continue;
}
/* Uhhuh, that didn't work out.. */
- kfree(res);
+ free_resource(res);
res = NULL;
break;
}
@@ -958,7 +1009,7 @@ int __check_region(struct resource *parent, resource_size_t start,
return -EBUSY;
release_resource(res);
- kfree(res);
+ free_resource(res);
return 0;
}
EXPORT_SYMBOL(__check_region);
@@ -998,7 +1049,7 @@ void __release_region(struct resource *parent, resource_size_t start,
write_unlock(&resource_lock);
if (res->flags & IORESOURCE_MUXED)
wake_up(&muxed_resource_wait);
- kfree(res);
+ free_resource(res);
return;
}
p = &res->sibling;
@@ -1012,6 +1063,109 @@ void __release_region(struct resource *parent, resource_size_t start,
}
EXPORT_SYMBOL(__release_region);
+#ifdef CONFIG_MEMORY_HOTREMOVE
+/**
+ * release_mem_region_adjustable - release a previously reserved memory region
+ * @parent: parent resource descriptor
+ * @start: resource start address
+ * @size: resource region size
+ *
+ * This interface is intended for memory hot-delete. The requested region
+ * is released from a currently busy memory resource. The requested region
+ * must either match exactly or fit into a single busy resource entry. In
+ * the latter case, the remaining resource is adjusted accordingly.
+ * Existing children of the busy memory resource must be immutable in the
+ * request.
+ *
+ * Note:
+ * - Additional release conditions, such as overlapping region, can be
+ * supported after they are confirmed as valid cases.
+ * - When a busy memory resource gets split into two entries, the code
+ * assumes that all children remain in the lower address entry for
+ * simplicity. Enhance this logic when necessary.
+ */
+int release_mem_region_adjustable(struct resource *parent,
+ resource_size_t start, resource_size_t size)
+{
+ struct resource **p;
+ struct resource *res;
+ struct resource *new_res;
+ resource_size_t end;
+ int ret = -EINVAL;
+
+ end = start + size - 1;
+ if ((start < parent->start) || (end > parent->end))
+ return ret;
+
+ /* The alloc_resource() result gets checked later */
+ new_res = alloc_resource(GFP_KERNEL);
+
+ p = &parent->child;
+ write_lock(&resource_lock);
+
+ while ((res = *p)) {
+ if (res->start >= end)
+ break;
+
+ /* look for the next resource if it does not fit into */
+ if (res->start > start || res->end < end) {
+ p = &res->sibling;
+ continue;
+ }
+
+ if (!(res->flags & IORESOURCE_MEM))
+ break;
+
+ if (!(res->flags & IORESOURCE_BUSY)) {
+ p = &res->child;
+ continue;
+ }
+
+ /* found the target resource; let's adjust accordingly */
+ if (res->start == start && res->end == end) {
+ /* free the whole entry */
+ *p = res->sibling;
+ free_resource(res);
+ ret = 0;
+ } else if (res->start == start && res->end != end) {
+ /* adjust the start */
+ ret = __adjust_resource(res, end + 1,
+ res->end - end);
+ } else if (res->start != start && res->end == end) {
+ /* adjust the end */
+ ret = __adjust_resource(res, res->start,
+ start - res->start);
+ } else {
+ /* split into two entries */
+ if (!new_res) {
+ ret = -ENOMEM;
+ break;
+ }
+ new_res->name = res->name;
+ new_res->start = end + 1;
+ new_res->end = res->end;
+ new_res->flags = res->flags;
+ new_res->parent = res->parent;
+ new_res->sibling = res->sibling;
+ new_res->child = NULL;
+
+ ret = __adjust_resource(res, res->start,
+ start - res->start);
+ if (ret)
+ break;
+ res->sibling = new_res;
+ new_res = NULL;
+ }
+
+ break;
+ }
+
+ write_unlock(&resource_lock);
+ free_resource(new_res);
+ return ret;
+}
+#endif /* CONFIG_MEMORY_HOTREMOVE */
+
/*
* Managed region resource
*/
diff --git a/kernel/rtmutex-tester.c b/kernel/rtmutex-tester.c
index 7890b10084a7..1d96dd0d93c1 100644
--- a/kernel/rtmutex-tester.c
+++ b/kernel/rtmutex-tester.c
@@ -14,6 +14,7 @@
#include <linux/spinlock.h>
#include <linux/timer.h>
#include <linux/freezer.h>
+#include <linux/stat.h>
#include "rtmutex.h"
@@ -366,8 +367,8 @@ static ssize_t sysfs_test_status(struct device *dev, struct device_attribute *at
return curr - buf;
}
-static DEVICE_ATTR(status, 0600, sysfs_test_status, NULL);
-static DEVICE_ATTR(command, 0600, NULL, sysfs_test_command);
+static DEVICE_ATTR(status, S_IRUSR, sysfs_test_status, NULL);
+static DEVICE_ATTR(command, S_IWUSR, NULL, sysfs_test_command);
static struct bus_type rttest_subsys = {
.name = "rttest",
diff --git a/kernel/sched/clock.c b/kernel/sched/clock.c
index c685e31492df..c3ae1446461c 100644
--- a/kernel/sched/clock.c
+++ b/kernel/sched/clock.c
@@ -176,10 +176,36 @@ static u64 sched_clock_remote(struct sched_clock_data *scd)
u64 this_clock, remote_clock;
u64 *ptr, old_val, val;
+#if BITS_PER_LONG != 64
+again:
+ /*
+ * Careful here: The local and the remote clock values need to
+ * be read out atomic as we need to compare the values and
+ * then update either the local or the remote side. So the
+ * cmpxchg64 below only protects one readout.
+ *
+ * We must reread via sched_clock_local() in the retry case on
+ * 32bit as an NMI could use sched_clock_local() via the
+ * tracer and hit between the readout of
+ * the low32bit and the high 32bit portion.
+ */
+ this_clock = sched_clock_local(my_scd);
+ /*
+ * We must enforce atomic readout on 32bit, otherwise the
+ * update on the remote cpu can hit inbetween the readout of
+ * the low32bit and the high 32bit portion.
+ */
+ remote_clock = cmpxchg64(&scd->clock, 0, 0);
+#else
+ /*
+ * On 64bit the read of [my]scd->clock is atomic versus the
+ * update, so we can avoid the above 32bit dance.
+ */
sched_clock_local(my_scd);
again:
this_clock = my_scd->clock;
remote_clock = scd->clock;
+#endif
/*
* Use the opportunity that we have both locks
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 7f12624a393c..d8285eb0cde6 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -1498,8 +1498,10 @@ static void try_to_wake_up_local(struct task_struct *p)
{
struct rq *rq = task_rq(p);
- BUG_ON(rq != this_rq());
- BUG_ON(p == current);
+ if (WARN_ON_ONCE(rq != this_rq()) ||
+ WARN_ON_ONCE(p == current))
+ return;
+
lockdep_assert_held(&rq->lock);
if (!raw_spin_trylock(&p->pi_lock)) {
@@ -2997,51 +2999,6 @@ void __sched schedule_preempt_disabled(void)
preempt_disable();
}
-#ifdef CONFIG_MUTEX_SPIN_ON_OWNER
-
-static inline bool owner_running(struct mutex *lock, struct task_struct *owner)
-{
- if (lock->owner != owner)
- return false;
-
- /*
- * Ensure we emit the owner->on_cpu, dereference _after_ checking
- * lock->owner still matches owner, if that fails, owner might
- * point to free()d memory, if it still matches, the rcu_read_lock()
- * ensures the memory stays valid.
- */
- barrier();
-
- return owner->on_cpu;
-}
-
-/*
- * Look out! "owner" is an entirely speculative pointer
- * access and not reliable.
- */
-int mutex_spin_on_owner(struct mutex *lock, struct task_struct *owner)
-{
- if (!sched_feat(OWNER_SPIN))
- return 0;
-
- rcu_read_lock();
- while (owner_running(lock, owner)) {
- if (need_resched())
- break;
-
- arch_mutex_cpu_relax();
- }
- rcu_read_unlock();
-
- /*
- * We break out the loop above on need_resched() and when the
- * owner changed, which is a sign for heavy contention. Return
- * success only when lock->owner is NULL.
- */
- return lock->owner == NULL;
-}
-#endif
-
#ifdef CONFIG_PREEMPT
/*
* this is the entry point to schedule() from in-kernel preemption
@@ -4126,6 +4083,10 @@ long sched_setaffinity(pid_t pid, const struct cpumask *in_mask)
get_task_struct(p);
rcu_read_unlock();
+ if (p->flags & PF_NO_SETAFFINITY) {
+ retval = -EINVAL;
+ goto out_put_task;
+ }
if (!alloc_cpumask_var(&cpus_allowed, GFP_KERNEL)) {
retval = -ENOMEM;
goto out_put_task;
@@ -4773,11 +4734,6 @@ int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask)
goto out;
}
- if (unlikely((p->flags & PF_THREAD_BOUND) && p != current)) {
- ret = -EINVAL;
- goto out;
- }
-
do_set_cpus_allowed(p, new_mask);
/* Can the task run on the task's current CPU? If so, we're done */
@@ -4999,7 +4955,7 @@ static void sd_free_ctl_entry(struct ctl_table **tablep)
}
static int min_load_idx = 0;
-static int max_load_idx = CPU_LOAD_IDX_MAX;
+static int max_load_idx = CPU_LOAD_IDX_MAX-1;
static void
set_table_entry(struct ctl_table *entry,
diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c
index ed12cbb135f4..e93cca92f38b 100644
--- a/kernel/sched/cputime.c
+++ b/kernel/sched/cputime.c
@@ -310,7 +310,7 @@ void thread_group_cputime(struct task_struct *tsk, struct task_cputime *times)
t = tsk;
do {
- task_cputime(tsk, &utime, &stime);
+ task_cputime(t, &utime, &stime);
times->utime += utime;
times->stime += stime;
times->sum_exec_runtime += task_sched_runtime(t);
diff --git a/kernel/sched/features.h b/kernel/sched/features.h
index 1ad1d2b5395f..99399f8e4799 100644
--- a/kernel/sched/features.h
+++ b/kernel/sched/features.h
@@ -46,13 +46,6 @@ SCHED_FEAT(DOUBLE_TICK, false)
SCHED_FEAT(LB_BIAS, true)
/*
- * Spin-wait on mutex acquisition when the mutex owner is running on
- * another cpu -- assumes that when the owner is running, it will soon
- * release the lock. Decreases scheduling overhead.
- */
-SCHED_FEAT(OWNER_SPIN, true)
-
-/*
* Decrement CPU power based on time not spent running tasks
*/
SCHED_FEAT(NONTASK_POWER, true)
diff --git a/kernel/signal.c b/kernel/signal.c
index 2ec870a4c3c4..598dc06be421 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -485,6 +485,9 @@ flush_signal_handlers(struct task_struct *t, int force_default)
if (force_default || ka->sa.sa_handler != SIG_IGN)
ka->sa.sa_handler = SIG_DFL;
ka->sa.sa_flags = 0;
+#ifdef __ARCH_HAS_SA_RESTORER
+ ka->sa.sa_restorer = NULL;
+#endif
sigemptyset(&ka->sa.sa_mask);
ka++;
}
@@ -2682,7 +2685,7 @@ static int do_sigpending(void *set, unsigned long sigsetsize)
/**
* sys_rt_sigpending - examine a pending signal that has been raised
* while blocked
- * @set: stores pending signals
+ * @uset: stores pending signals
* @sigsetsize: size of sigset_t type or larger
*/
SYSCALL_DEFINE2(rt_sigpending, sigset_t __user *, uset, size_t, sigsetsize)
@@ -2945,7 +2948,7 @@ do_send_specific(pid_t tgid, pid_t pid, int sig, struct siginfo *info)
static int do_tkill(pid_t tgid, pid_t pid, int sig)
{
- struct siginfo info;
+ struct siginfo info = {};
info.si_signo = sig;
info.si_errno = 0;
diff --git a/kernel/smpboot.c b/kernel/smpboot.c
index b9bde5727829..02fc5c933673 100644
--- a/kernel/smpboot.c
+++ b/kernel/smpboot.c
@@ -131,7 +131,7 @@ static int smpboot_thread_fn(void *data)
continue;
}
- //BUG_ON(td->cpu != smp_processor_id());
+ BUG_ON(td->cpu != smp_processor_id());
/* Check for state change setup */
switch (td->status) {
@@ -185,8 +185,18 @@ __smpboot_create_thread(struct smp_hotplug_thread *ht, unsigned int cpu)
}
get_task_struct(tsk);
*per_cpu_ptr(ht->store, cpu) = tsk;
- if (ht->create)
- ht->create(cpu);
+ if (ht->create) {
+ /*
+ * Make sure that the task has actually scheduled out
+ * into park position, before calling the create
+ * callback. At least the migration thread callback
+ * requires that the task is off the runqueue.
+ */
+ if (!wait_task_inactive(tsk, TASK_PARKED))
+ WARN_ON(1);
+ else
+ ht->create(cpu);
+ }
return 0;
}
@@ -209,6 +219,8 @@ static void smpboot_unpark_thread(struct smp_hotplug_thread *ht, unsigned int cp
{
struct task_struct *tsk = *per_cpu_ptr(ht->store, cpu);
+ if (ht->pre_unpark)
+ ht->pre_unpark(cpu);
kthread_unpark(tsk);
}
diff --git a/kernel/softirq.c b/kernel/softirq.c
index b4d252fd195b..14d7758074aa 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -323,18 +323,10 @@ void irq_enter(void)
static inline void invoke_softirq(void)
{
- if (!force_irqthreads) {
-#ifdef __ARCH_IRQ_EXIT_IRQS_DISABLED
+ if (!force_irqthreads)
__do_softirq();
-#else
- do_softirq();
-#endif
- } else {
- __local_bh_disable((unsigned long)__builtin_return_address(0),
- SOFTIRQ_OFFSET);
+ else
wakeup_softirqd();
- __local_bh_enable(SOFTIRQ_OFFSET);
- }
}
/*
@@ -342,9 +334,15 @@ static inline void invoke_softirq(void)
*/
void irq_exit(void)
{
+#ifndef __ARCH_IRQ_EXIT_IRQS_DISABLED
+ local_irq_disable();
+#else
+ WARN_ON_ONCE(!irqs_disabled());
+#endif
+
account_irq_exit_time(current);
trace_hardirq_exit();
- sub_preempt_count(IRQ_EXIT_OFFSET);
+ sub_preempt_count(HARDIRQ_OFFSET);
if (!in_interrupt() && local_softirq_pending())
invoke_softirq();
@@ -354,7 +352,6 @@ void irq_exit(void)
tick_nohz_irq_exit();
#endif
rcu_irq_exit();
- sched_preempt_enable_no_resched();
}
/*
diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c
index 95d178c62d5a..c09f2955ae30 100644
--- a/kernel/stop_machine.c
+++ b/kernel/stop_machine.c
@@ -336,7 +336,7 @@ static struct smp_hotplug_thread cpu_stop_threads = {
.create = cpu_stop_create,
.setup = cpu_stop_unpark,
.park = cpu_stop_park,
- .unpark = cpu_stop_unpark,
+ .pre_unpark = cpu_stop_unpark,
.selfparking = true,
};
diff --git a/kernel/sys.c b/kernel/sys.c
index 81f56445fba9..0da73cf73e60 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -324,7 +324,6 @@ void kernel_restart_prepare(char *cmd)
system_state = SYSTEM_RESTART;
usermodehelper_disable();
device_shutdown();
- syscore_shutdown();
}
/**
@@ -370,6 +369,7 @@ void kernel_restart(char *cmd)
{
kernel_restart_prepare(cmd);
disable_nonboot_cpus();
+ syscore_shutdown();
if (!cmd)
printk(KERN_EMERG "Restarting system.\n");
else
@@ -395,6 +395,7 @@ static void kernel_shutdown_prepare(enum system_states state)
void kernel_halt(void)
{
kernel_shutdown_prepare(SYSTEM_HALT);
+ disable_nonboot_cpus();
syscore_shutdown();
printk(KERN_EMERG "System halted.\n");
kmsg_dump(KMSG_DUMP_HALT);
@@ -2185,9 +2186,8 @@ SYSCALL_DEFINE3(getcpu, unsigned __user *, cpup, unsigned __user *, nodep,
char poweroff_cmd[POWEROFF_CMD_PATH_LEN] = "/sbin/poweroff";
-static int __orderly_poweroff(void)
+static int __orderly_poweroff(bool force)
{
- int argc;
char **argv;
static char *envp[] = {
"HOME=/",
@@ -2196,20 +2196,40 @@ static int __orderly_poweroff(void)
};
int ret;
- argv = argv_split(GFP_ATOMIC, poweroff_cmd, &argc);
- if (argv == NULL) {
+ argv = argv_split(GFP_KERNEL, poweroff_cmd, NULL);
+ if (argv) {
+ ret = call_usermodehelper(argv[0], argv, envp, UMH_WAIT_EXEC);
+ argv_free(argv);
+ } else {
printk(KERN_WARNING "%s failed to allocate memory for \"%s\"\n",
- __func__, poweroff_cmd);
- return -ENOMEM;
+ __func__, poweroff_cmd);
+ ret = -ENOMEM;
}
- ret = call_usermodehelper_fns(argv[0], argv, envp, UMH_WAIT_EXEC,
- NULL, NULL, NULL);
- argv_free(argv);
+ if (ret && force) {
+ printk(KERN_WARNING "Failed to start orderly shutdown: "
+ "forcing the issue\n");
+ /*
+ * I guess this should try to kick off some daemon to sync and
+ * poweroff asap. Or not even bother syncing if we're doing an
+ * emergency shutdown?
+ */
+ emergency_sync();
+ kernel_power_off();
+ }
return ret;
}
+static bool poweroff_force;
+
+static void poweroff_work_func(struct work_struct *work)
+{
+ __orderly_poweroff(poweroff_force);
+}
+
+static DECLARE_WORK(poweroff_work, poweroff_work_func);
+
/**
* orderly_poweroff - Trigger an orderly system poweroff
* @force: force poweroff if command execution fails
@@ -2219,21 +2239,9 @@ static int __orderly_poweroff(void)
*/
int orderly_poweroff(bool force)
{
- int ret = __orderly_poweroff();
-
- if (ret && force) {
- printk(KERN_WARNING "Failed to start orderly shutdown: "
- "forcing the issue\n");
-
- /*
- * I guess this should try to kick off some daemon to sync and
- * poweroff asap. Or not even bother syncing if we're doing an
- * emergency shutdown?
- */
- emergency_sync();
- kernel_power_off();
- }
-
- return ret;
+ if (force) /* do not override the pending "true" */
+ poweroff_force = true;
+ schedule_work(&poweroff_work);
+ return 0;
}
EXPORT_SYMBOL_GPL(orderly_poweroff);
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index afc1dc60f3f8..9edcf456e0fc 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -106,7 +106,6 @@ extern unsigned int core_pipe_limit;
#endif
extern int pid_max;
extern int pid_max_min, pid_max_max;
-extern int sysctl_drop_caches;
extern int percpu_pagelist_fraction;
extern int compat_log;
extern int latencytop_enabled;
@@ -1430,6 +1429,20 @@ static struct ctl_table vm_table[] = {
.extra2 = &one,
},
#endif
+ {
+ .procname = "user_reserve_kbytes",
+ .data = &sysctl_user_reserve_kbytes,
+ .maxlen = sizeof(sysctl_user_reserve_kbytes),
+ .mode = 0644,
+ .proc_handler = proc_doulongvec_minmax,
+ },
+ {
+ .procname = "admin_reserve_kbytes",
+ .data = &sysctl_admin_reserve_kbytes,
+ .maxlen = sizeof(sysctl_admin_reserve_kbytes),
+ .mode = 0644,
+ .proc_handler = proc_doulongvec_minmax,
+ },
{ }
};
diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c
index 2fb8cb88df8d..7f32fe0e52cd 100644
--- a/kernel/time/tick-broadcast.c
+++ b/kernel/time/tick-broadcast.c
@@ -67,7 +67,8 @@ static void tick_broadcast_start_periodic(struct clock_event_device *bc)
*/
int tick_check_broadcast_device(struct clock_event_device *dev)
{
- if ((tick_broadcast_device.evtdev &&
+ if ((dev->features & CLOCK_EVT_FEAT_DUMMY) ||
+ (tick_broadcast_device.evtdev &&
tick_broadcast_device.evtdev->rating >= dev->rating) ||
(dev->features & CLOCK_EVT_FEAT_C3STOP))
return 0;
diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index 192473b22799..5e9efd4b83a4 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -176,6 +176,8 @@ config IRQSOFF_TRACER
select GENERIC_TRACER
select TRACER_MAX_TRACE
select RING_BUFFER_ALLOW_SWAP
+ select TRACER_SNAPSHOT
+ select TRACER_SNAPSHOT_PER_CPU_SWAP
help
This option measures the time spent in irqs-off critical
sections, with microsecond accuracy.
@@ -198,6 +200,8 @@ config PREEMPT_TRACER
select GENERIC_TRACER
select TRACER_MAX_TRACE
select RING_BUFFER_ALLOW_SWAP
+ select TRACER_SNAPSHOT
+ select TRACER_SNAPSHOT_PER_CPU_SWAP
help
This option measures the time spent in preemption-off critical
sections, with microsecond accuracy.
@@ -217,6 +221,7 @@ config SCHED_TRACER
select GENERIC_TRACER
select CONTEXT_SWITCH_TRACER
select TRACER_MAX_TRACE
+ select TRACER_SNAPSHOT
help
This tracer tracks the latency of the highest priority task
to be scheduled in, starting from the point it has woken up.
@@ -248,6 +253,27 @@ config TRACER_SNAPSHOT
echo 1 > /sys/kernel/debug/tracing/snapshot
cat snapshot
+config TRACER_SNAPSHOT_PER_CPU_SWAP
+ bool "Allow snapshot to swap per CPU"
+ depends on TRACER_SNAPSHOT
+ select RING_BUFFER_ALLOW_SWAP
+ help
+ Allow doing a snapshot of a single CPU buffer instead of a
+ full swap (all buffers). If this is set, then the following is
+ allowed:
+
+ echo 1 > /sys/kernel/debug/tracing/per_cpu/cpu2/snapshot
+
+ After which, only the tracing buffer for CPU 2 was swapped with
+ the main tracing buffer, and the other CPU buffers remain the same.
+
+ When this is enabled, this adds a little more overhead to the
+ trace recording, as it needs to add some checks to synchronize
+ recording with swaps. But this does not affect the performance
+ of the overall system. This is enabled by default when the preempt
+ or irq latency tracers are enabled, as those need to swap as well
+ and already adds the overhead (plus a lot more).
+
config TRACE_BRANCH_PROFILING
bool
select GENERIC_TRACER
@@ -414,24 +440,28 @@ config PROBE_EVENTS
def_bool n
config DYNAMIC_FTRACE
- bool "enable/disable ftrace tracepoints dynamically"
+ bool "enable/disable function tracing dynamically"
depends on FUNCTION_TRACER
depends on HAVE_DYNAMIC_FTRACE
default y
help
- This option will modify all the calls to ftrace dynamically
- (will patch them out of the binary image and replace them
- with a No-Op instruction) as they are called. A table is
- created to dynamically enable them again.
+ This option will modify all the calls to function tracing
+ dynamically (will patch them out of the binary image and
+ replace them with a No-Op instruction) on boot up. During
+ compile time, a table is made of all the locations that ftrace
+ can function trace, and this table is linked into the kernel
+ image. When this is enabled, functions can be individually
+ enabled, and the functions not enabled will not affect
+ performance of the system.
+
+ See the files in /sys/kernel/debug/tracing:
+ available_filter_functions
+ set_ftrace_filter
+ set_ftrace_notrace
This way a CONFIG_FUNCTION_TRACER kernel is slightly larger, but
otherwise has native performance as long as no tracing is active.
- The changes to the code are done by a kernel thread that
- wakes up once a second and checks to see if any ftrace calls
- were made. If so, it runs stop_machine (stops all CPUS)
- and modifies the code to jump over the call to ftrace.
-
config DYNAMIC_FTRACE_WITH_REGS
def_bool y
depends on DYNAMIC_FTRACE
@@ -520,6 +550,29 @@ config RING_BUFFER_BENCHMARK
If unsure, say N.
+config RING_BUFFER_STARTUP_TEST
+ bool "Ring buffer startup self test"
+ depends on RING_BUFFER
+ help
+ Run a simple self test on the ring buffer on boot up. Late in the
+ kernel boot sequence, the test will start that kicks off
+ a thread per cpu. Each thread will write various size events
+ into the ring buffer. Another thread is created to send IPIs
+ to each of the threads, where the IPI handler will also write
+ to the ring buffer, to test/stress the nesting ability.
+ If any anomalies are discovered, a warning will be displayed
+ and all ring buffers will be disabled.
+
+ The test runs for 10 seconds. This will slow your boot time
+ by at least 10 more seconds.
+
+ At the end of the test, statics and more checks are done.
+ It will output the stats of each per cpu buffer. What
+ was written, the sizes, what was read, what was lost, and
+ other similar details.
+
+ If unsure, say N
+
endif # FTRACE
endif # TRACING_SUPPORT
diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c
index 9e5b8c272eec..ed58a3216a6d 100644
--- a/kernel/trace/blktrace.c
+++ b/kernel/trace/blktrace.c
@@ -72,7 +72,7 @@ static void trace_note(struct blk_trace *bt, pid_t pid, int action,
bool blk_tracer = blk_tracer_enabled;
if (blk_tracer) {
- buffer = blk_tr->buffer;
+ buffer = blk_tr->trace_buffer.buffer;
pc = preempt_count();
event = trace_buffer_lock_reserve(buffer, TRACE_BLK,
sizeof(*t) + len,
@@ -218,7 +218,7 @@ static void __blk_add_trace(struct blk_trace *bt, sector_t sector, int bytes,
if (blk_tracer) {
tracing_record_cmdline(current);
- buffer = blk_tr->buffer;
+ buffer = blk_tr->trace_buffer.buffer;
pc = preempt_count();
event = trace_buffer_lock_reserve(buffer, TRACE_BLK,
sizeof(*t) + pdu_len,
@@ -739,12 +739,6 @@ static void blk_add_trace_rq_complete(void *ignore,
struct request_queue *q,
struct request *rq)
{
- struct blk_trace *bt = q->blk_trace;
-
- /* if control ever passes through here, it's a request based driver */
- if (unlikely(bt && !bt->rq_based))
- bt->rq_based = true;
-
blk_add_trace_rq(q, rq, BLK_TA_COMPLETE);
}
@@ -780,24 +774,10 @@ static void blk_add_trace_bio_bounce(void *ignore,
blk_add_trace_bio(q, bio, BLK_TA_BOUNCE, 0);
}
-static void blk_add_trace_bio_complete(void *ignore, struct bio *bio, int error)
+static void blk_add_trace_bio_complete(void *ignore,
+ struct request_queue *q, struct bio *bio,
+ int error)
{
- struct request_queue *q;
- struct blk_trace *bt;
-
- if (!bio->bi_bdev)
- return;
-
- q = bdev_get_queue(bio->bi_bdev);
- bt = q->blk_trace;
-
- /*
- * Request based drivers will generate both rq and bio completions.
- * Ignore bio ones.
- */
- if (likely(!bt) || bt->rq_based)
- return;
-
blk_add_trace_bio(q, bio, BLK_TA_COMPLETE, error);
}
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index ab25b88aae56..8a5c017bb50c 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -66,7 +66,7 @@
static struct ftrace_ops ftrace_list_end __read_mostly = {
.func = ftrace_stub,
- .flags = FTRACE_OPS_FL_RECURSION_SAFE,
+ .flags = FTRACE_OPS_FL_RECURSION_SAFE | FTRACE_OPS_FL_STUB,
};
/* ftrace_enabled is a method to turn ftrace on or off */
@@ -486,7 +486,6 @@ struct ftrace_profile_stat {
#define PROFILES_PER_PAGE \
(PROFILE_RECORDS_SIZE / sizeof(struct ftrace_profile))
-static int ftrace_profile_bits __read_mostly;
static int ftrace_profile_enabled __read_mostly;
/* ftrace_profile_lock - synchronize the enable and disable of the profiler */
@@ -494,7 +493,8 @@ static DEFINE_MUTEX(ftrace_profile_lock);
static DEFINE_PER_CPU(struct ftrace_profile_stat, ftrace_profile_stats);
-#define FTRACE_PROFILE_HASH_SIZE 1024 /* must be power of 2 */
+#define FTRACE_PROFILE_HASH_BITS 10
+#define FTRACE_PROFILE_HASH_SIZE (1 << FTRACE_PROFILE_HASH_BITS)
static void *
function_stat_next(void *v, int idx)
@@ -676,7 +676,7 @@ int ftrace_profile_pages_init(struct ftrace_profile_stat *stat)
pages = DIV_ROUND_UP(functions, PROFILES_PER_PAGE);
- for (i = 0; i < pages; i++) {
+ for (i = 1; i < pages; i++) {
pg->next = (void *)get_zeroed_page(GFP_KERNEL);
if (!pg->next)
goto out_free;
@@ -694,7 +694,6 @@ int ftrace_profile_pages_init(struct ftrace_profile_stat *stat)
free_page(tmp);
}
- free_page((unsigned long)stat->pages);
stat->pages = NULL;
stat->start = NULL;
@@ -725,13 +724,6 @@ static int ftrace_profile_init_cpu(int cpu)
if (!stat->hash)
return -ENOMEM;
- if (!ftrace_profile_bits) {
- size--;
-
- for (; size; size >>= 1)
- ftrace_profile_bits++;
- }
-
/* Preallocate the function profiling pages */
if (ftrace_profile_pages_init(stat) < 0) {
kfree(stat->hash);
@@ -764,7 +756,7 @@ ftrace_find_profiled_func(struct ftrace_profile_stat *stat, unsigned long ip)
struct hlist_head *hhd;
unsigned long key;
- key = hash_long(ip, ftrace_profile_bits);
+ key = hash_long(ip, FTRACE_PROFILE_HASH_BITS);
hhd = &stat->hash[key];
if (hlist_empty(hhd))
@@ -783,7 +775,7 @@ static void ftrace_add_profile(struct ftrace_profile_stat *stat,
{
unsigned long key;
- key = hash_long(rec->ip, ftrace_profile_bits);
+ key = hash_long(rec->ip, FTRACE_PROFILE_HASH_BITS);
hlist_add_head_rcu(&rec->node, &stat->hash[key]);
}
@@ -1053,6 +1045,19 @@ static __init void ftrace_profile_debugfs(struct dentry *d_tracer)
static struct pid * const ftrace_swapper_pid = &init_struct_pid;
+loff_t
+ftrace_filter_lseek(struct file *file, loff_t offset, int whence)
+{
+ loff_t ret;
+
+ if (file->f_mode & FMODE_READ)
+ ret = seq_lseek(file, offset, whence);
+ else
+ file->f_pos = ret = 1;
+
+ return ret;
+}
+
#ifdef CONFIG_DYNAMIC_FTRACE
#ifndef CONFIG_FTRACE_MCOUNT_RECORD
@@ -1067,7 +1072,7 @@ struct ftrace_func_probe {
unsigned long flags;
unsigned long ip;
void *data;
- struct rcu_head rcu;
+ struct list_head free_list;
};
struct ftrace_func_entry {
@@ -1317,7 +1322,6 @@ ftrace_hash_move(struct ftrace_ops *ops, int enable,
struct hlist_head *hhd;
struct ftrace_hash *old_hash;
struct ftrace_hash *new_hash;
- unsigned long key;
int size = src->count;
int bits = 0;
int ret;
@@ -1360,10 +1364,6 @@ ftrace_hash_move(struct ftrace_ops *ops, int enable,
for (i = 0; i < size; i++) {
hhd = &src->buckets[i];
hlist_for_each_entry_safe(entry, tn, hhd, hlist) {
- if (bits > 0)
- key = hash_long(entry->ip, bits);
- else
- key = 0;
remove_hash_entry(src, entry);
__add_hash_entry(new_hash, entry);
}
@@ -2613,7 +2613,7 @@ static void ftrace_filter_reset(struct ftrace_hash *hash)
* routine, you can use ftrace_filter_write() for the write
* routine if @flag has FTRACE_ITER_FILTER set, or
* ftrace_notrace_write() if @flag has FTRACE_ITER_NOTRACE set.
- * ftrace_regex_lseek() should be used as the lseek routine, and
+ * ftrace_filter_lseek() should be used as the lseek routine, and
* release must call ftrace_regex_release().
*/
int
@@ -2697,19 +2697,6 @@ ftrace_notrace_open(struct inode *inode, struct file *file)
inode, file);
}
-loff_t
-ftrace_regex_lseek(struct file *file, loff_t offset, int whence)
-{
- loff_t ret;
-
- if (file->f_mode & FMODE_READ)
- ret = seq_lseek(file, offset, whence);
- else
- file->f_pos = ret = 1;
-
- return ret;
-}
-
static int ftrace_match(char *str, char *regex, int len, int type)
{
int matched = 0;
@@ -2974,28 +2961,27 @@ static void __disable_ftrace_function_probe(void)
}
-static void ftrace_free_entry_rcu(struct rcu_head *rhp)
+static void ftrace_free_entry(struct ftrace_func_probe *entry)
{
- struct ftrace_func_probe *entry =
- container_of(rhp, struct ftrace_func_probe, rcu);
-
if (entry->ops->free)
- entry->ops->free(&entry->data);
+ entry->ops->free(entry->ops, entry->ip, &entry->data);
kfree(entry);
}
-
int
register_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops,
void *data)
{
struct ftrace_func_probe *entry;
+ struct ftrace_hash **orig_hash = &trace_probe_ops.filter_hash;
+ struct ftrace_hash *hash;
struct ftrace_page *pg;
struct dyn_ftrace *rec;
int type, len, not;
unsigned long key;
int count = 0;
char *search;
+ int ret;
type = filter_parse_regex(glob, strlen(glob), &search, &not);
len = strlen(search);
@@ -3006,8 +2992,16 @@ register_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops,
mutex_lock(&ftrace_lock);
- if (unlikely(ftrace_disabled))
+ hash = alloc_and_copy_ftrace_hash(FTRACE_HASH_DEFAULT_BITS, *orig_hash);
+ if (!hash) {
+ count = -ENOMEM;
+ goto out_unlock;
+ }
+
+ if (unlikely(ftrace_disabled)) {
+ count = -ENODEV;
goto out_unlock;
+ }
do_for_each_ftrace_rec(pg, rec) {
@@ -3031,14 +3025,21 @@ register_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops,
* for each function we find. We call the callback
* to give the caller an opportunity to do so.
*/
- if (ops->callback) {
- if (ops->callback(rec->ip, &entry->data) < 0) {
+ if (ops->init) {
+ if (ops->init(ops, rec->ip, &entry->data) < 0) {
/* caller does not like this func */
kfree(entry);
continue;
}
}
+ ret = enter_record(hash, rec, 0);
+ if (ret < 0) {
+ kfree(entry);
+ count = ret;
+ goto out_unlock;
+ }
+
entry->ops = ops;
entry->ip = rec->ip;
@@ -3046,10 +3047,16 @@ register_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops,
hlist_add_head_rcu(&entry->node, &ftrace_func_hash[key]);
} while_for_each_ftrace_rec();
+
+ ret = ftrace_hash_move(&trace_probe_ops, 1, orig_hash, hash);
+ if (ret < 0)
+ count = ret;
+
__enable_ftrace_function_probe();
out_unlock:
mutex_unlock(&ftrace_lock);
+ free_ftrace_hash(hash);
return count;
}
@@ -3063,7 +3070,12 @@ static void
__unregister_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops,
void *data, int flags)
{
+ struct ftrace_func_entry *rec_entry;
struct ftrace_func_probe *entry;
+ struct ftrace_func_probe *p;
+ struct ftrace_hash **orig_hash = &trace_probe_ops.filter_hash;
+ struct list_head free_list;
+ struct ftrace_hash *hash;
struct hlist_node *tmp;
char str[KSYM_SYMBOL_LEN];
int type = MATCH_FULL;
@@ -3084,6 +3096,14 @@ __unregister_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops,
}
mutex_lock(&ftrace_lock);
+
+ hash = alloc_and_copy_ftrace_hash(FTRACE_HASH_DEFAULT_BITS, *orig_hash);
+ if (!hash)
+ /* Hmm, should report this somehow */
+ goto out_unlock;
+
+ INIT_LIST_HEAD(&free_list);
+
for (i = 0; i < FTRACE_FUNC_HASHSIZE; i++) {
struct hlist_head *hhd = &ftrace_func_hash[i];
@@ -3104,12 +3124,30 @@ __unregister_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops,
continue;
}
- hlist_del(&entry->node);
- call_rcu(&entry->rcu, ftrace_free_entry_rcu);
+ rec_entry = ftrace_lookup_ip(hash, entry->ip);
+ /* It is possible more than one entry had this ip */
+ if (rec_entry)
+ free_hash_entry(hash, rec_entry);
+
+ hlist_del_rcu(&entry->node);
+ list_add(&entry->free_list, &free_list);
}
}
__disable_ftrace_function_probe();
+ /*
+ * Remove after the disable is called. Otherwise, if the last
+ * probe is removed, a null hash means *all enabled*.
+ */
+ ftrace_hash_move(&trace_probe_ops, 1, orig_hash, hash);
+ synchronize_sched();
+ list_for_each_entry_safe(entry, p, &free_list, free_list) {
+ list_del(&entry->free_list);
+ ftrace_free_entry(entry);
+ }
+
+ out_unlock:
mutex_unlock(&ftrace_lock);
+ free_ftrace_hash(hash);
}
void
@@ -3441,14 +3479,14 @@ static char ftrace_filter_buf[FTRACE_FILTER_SIZE] __initdata;
static int __init set_ftrace_notrace(char *str)
{
- strncpy(ftrace_notrace_buf, str, FTRACE_FILTER_SIZE);
+ strlcpy(ftrace_notrace_buf, str, FTRACE_FILTER_SIZE);
return 1;
}
__setup("ftrace_notrace=", set_ftrace_notrace);
static int __init set_ftrace_filter(char *str)
{
- strncpy(ftrace_filter_buf, str, FTRACE_FILTER_SIZE);
+ strlcpy(ftrace_filter_buf, str, FTRACE_FILTER_SIZE);
return 1;
}
__setup("ftrace_filter=", set_ftrace_filter);
@@ -3571,7 +3609,7 @@ static const struct file_operations ftrace_filter_fops = {
.open = ftrace_filter_open,
.read = seq_read,
.write = ftrace_filter_write,
- .llseek = ftrace_regex_lseek,
+ .llseek = ftrace_filter_lseek,
.release = ftrace_regex_release,
};
@@ -3579,7 +3617,7 @@ static const struct file_operations ftrace_notrace_fops = {
.open = ftrace_notrace_open,
.read = seq_read,
.write = ftrace_notrace_write,
- .llseek = ftrace_regex_lseek,
+ .llseek = ftrace_filter_lseek,
.release = ftrace_regex_release,
};
@@ -3737,7 +3775,8 @@ out:
if (fail)
return -EINVAL;
- ftrace_graph_filter_enabled = 1;
+ ftrace_graph_filter_enabled = !!(*idx);
+
return 0;
}
@@ -3784,8 +3823,8 @@ static const struct file_operations ftrace_graph_fops = {
.open = ftrace_graph_open,
.read = seq_read,
.write = ftrace_graph_write,
+ .llseek = ftrace_filter_lseek,
.release = ftrace_graph_release,
- .llseek = seq_lseek,
};
#endif /* CONFIG_FUNCTION_GRAPH_TRACER */
@@ -4131,7 +4170,8 @@ ftrace_ops_control_func(unsigned long ip, unsigned long parent_ip,
preempt_disable_notrace();
trace_recursion_set(TRACE_CONTROL_BIT);
do_for_each_ftrace_op(op, ftrace_control_list) {
- if (!ftrace_function_local_disabled(op) &&
+ if (!(op->flags & FTRACE_OPS_FL_STUB) &&
+ !ftrace_function_local_disabled(op) &&
ftrace_ops_test(op, ip))
op->func(ip, parent_ip, op, regs);
} while_for_each_ftrace_op(op);
@@ -4439,7 +4479,7 @@ static const struct file_operations ftrace_pid_fops = {
.open = ftrace_pid_open,
.write = ftrace_pid_write,
.read = seq_read,
- .llseek = seq_lseek,
+ .llseek = ftrace_filter_lseek,
.release = ftrace_pid_release,
};
@@ -4555,12 +4595,8 @@ ftrace_enable_sysctl(struct ctl_table *table, int write,
ftrace_startup_sysctl();
/* we are starting ftrace again */
- if (ftrace_ops_list != &ftrace_list_end) {
- if (ftrace_ops_list->next == &ftrace_list_end)
- ftrace_trace_function = ftrace_ops_list->func;
- else
- ftrace_trace_function = ftrace_ops_list_func;
- }
+ if (ftrace_ops_list != &ftrace_list_end)
+ update_ftrace_function();
} else {
/* stopping ftrace calls (just send to ftrace_stub) */
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 6989df2ba194..b59aea2c48c2 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -8,13 +8,16 @@
#include <linux/trace_clock.h>
#include <linux/trace_seq.h>
#include <linux/spinlock.h>
+#include <linux/irq_work.h>
#include <linux/debugfs.h>
#include <linux/uaccess.h>
#include <linux/hardirq.h>
+#include <linux/kthread.h> /* for self test */
#include <linux/kmemcheck.h>
#include <linux/module.h>
#include <linux/percpu.h>
#include <linux/mutex.h>
+#include <linux/delay.h>
#include <linux/slab.h>
#include <linux/init.h>
#include <linux/hash.h>
@@ -444,6 +447,12 @@ int ring_buffer_print_page_header(struct trace_seq *s)
return ret;
}
+struct rb_irq_work {
+ struct irq_work work;
+ wait_queue_head_t waiters;
+ bool waiters_pending;
+};
+
/*
* head_page == tail_page && head == tail then buffer is empty.
*/
@@ -478,6 +487,8 @@ struct ring_buffer_per_cpu {
struct list_head new_pages; /* new pages to add */
struct work_struct update_pages_work;
struct completion update_done;
+
+ struct rb_irq_work irq_work;
};
struct ring_buffer {
@@ -497,6 +508,8 @@ struct ring_buffer {
struct notifier_block cpu_notify;
#endif
u64 (*clock)(void);
+
+ struct rb_irq_work irq_work;
};
struct ring_buffer_iter {
@@ -508,6 +521,118 @@ struct ring_buffer_iter {
u64 read_stamp;
};
+/*
+ * rb_wake_up_waiters - wake up tasks waiting for ring buffer input
+ *
+ * Schedules a delayed work to wake up any task that is blocked on the
+ * ring buffer waiters queue.
+ */
+static void rb_wake_up_waiters(struct irq_work *work)
+{
+ struct rb_irq_work *rbwork = container_of(work, struct rb_irq_work, work);
+
+ wake_up_all(&rbwork->waiters);
+}
+
+/**
+ * ring_buffer_wait - wait for input to the ring buffer
+ * @buffer: buffer to wait on
+ * @cpu: the cpu buffer to wait on
+ *
+ * If @cpu == RING_BUFFER_ALL_CPUS then the task will wake up as soon
+ * as data is added to any of the @buffer's cpu buffers. Otherwise
+ * it will wait for data to be added to a specific cpu buffer.
+ */
+void ring_buffer_wait(struct ring_buffer *buffer, int cpu)
+{
+ struct ring_buffer_per_cpu *cpu_buffer;
+ DEFINE_WAIT(wait);
+ struct rb_irq_work *work;
+
+ /*
+ * Depending on what the caller is waiting for, either any
+ * data in any cpu buffer, or a specific buffer, put the
+ * caller on the appropriate wait queue.
+ */
+ if (cpu == RING_BUFFER_ALL_CPUS)
+ work = &buffer->irq_work;
+ else {
+ cpu_buffer = buffer->buffers[cpu];
+ work = &cpu_buffer->irq_work;
+ }
+
+
+ prepare_to_wait(&work->waiters, &wait, TASK_INTERRUPTIBLE);
+
+ /*
+ * The events can happen in critical sections where
+ * checking a work queue can cause deadlocks.
+ * After adding a task to the queue, this flag is set
+ * only to notify events to try to wake up the queue
+ * using irq_work.
+ *
+ * We don't clear it even if the buffer is no longer
+ * empty. The flag only causes the next event to run
+ * irq_work to do the work queue wake up. The worse
+ * that can happen if we race with !trace_empty() is that
+ * an event will cause an irq_work to try to wake up
+ * an empty queue.
+ *
+ * There's no reason to protect this flag either, as
+ * the work queue and irq_work logic will do the necessary
+ * synchronization for the wake ups. The only thing
+ * that is necessary is that the wake up happens after
+ * a task has been queued. It's OK for spurious wake ups.
+ */
+ work->waiters_pending = true;
+
+ if ((cpu == RING_BUFFER_ALL_CPUS && ring_buffer_empty(buffer)) ||
+ (cpu != RING_BUFFER_ALL_CPUS && ring_buffer_empty_cpu(buffer, cpu)))
+ schedule();
+
+ finish_wait(&work->waiters, &wait);
+}
+
+/**
+ * ring_buffer_poll_wait - poll on buffer input
+ * @buffer: buffer to wait on
+ * @cpu: the cpu buffer to wait on
+ * @filp: the file descriptor
+ * @poll_table: The poll descriptor
+ *
+ * If @cpu == RING_BUFFER_ALL_CPUS then the task will wake up as soon
+ * as data is added to any of the @buffer's cpu buffers. Otherwise
+ * it will wait for data to be added to a specific cpu buffer.
+ *
+ * Returns POLLIN | POLLRDNORM if data exists in the buffers,
+ * zero otherwise.
+ */
+int ring_buffer_poll_wait(struct ring_buffer *buffer, int cpu,
+ struct file *filp, poll_table *poll_table)
+{
+ struct ring_buffer_per_cpu *cpu_buffer;
+ struct rb_irq_work *work;
+
+ if ((cpu == RING_BUFFER_ALL_CPUS && !ring_buffer_empty(buffer)) ||
+ (cpu != RING_BUFFER_ALL_CPUS && !ring_buffer_empty_cpu(buffer, cpu)))
+ return POLLIN | POLLRDNORM;
+
+ if (cpu == RING_BUFFER_ALL_CPUS)
+ work = &buffer->irq_work;
+ else {
+ cpu_buffer = buffer->buffers[cpu];
+ work = &cpu_buffer->irq_work;
+ }
+
+ work->waiters_pending = true;
+ poll_wait(filp, &work->waiters, poll_table);
+
+ if ((cpu == RING_BUFFER_ALL_CPUS && !ring_buffer_empty(buffer)) ||
+ (cpu != RING_BUFFER_ALL_CPUS && !ring_buffer_empty_cpu(buffer, cpu)))
+ return POLLIN | POLLRDNORM;
+ return 0;
+}
+
/* buffer may be either ring_buffer or ring_buffer_per_cpu */
#define RB_WARN_ON(b, cond) \
({ \
@@ -1063,6 +1188,8 @@ rb_allocate_cpu_buffer(struct ring_buffer *buffer, int nr_pages, int cpu)
cpu_buffer->lock = (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED;
INIT_WORK(&cpu_buffer->update_pages_work, update_pages_handler);
init_completion(&cpu_buffer->update_done);
+ init_irq_work(&cpu_buffer->irq_work.work, rb_wake_up_waiters);
+ init_waitqueue_head(&cpu_buffer->irq_work.waiters);
bpage = kzalloc_node(ALIGN(sizeof(*bpage), cache_line_size()),
GFP_KERNEL, cpu_to_node(cpu));
@@ -1158,6 +1285,9 @@ struct ring_buffer *__ring_buffer_alloc(unsigned long size, unsigned flags,
buffer->clock = trace_clock_local;
buffer->reader_lock_key = key;
+ init_irq_work(&buffer->irq_work.work, rb_wake_up_waiters);
+ init_waitqueue_head(&buffer->irq_work.waiters);
+
/* need at least two pages */
if (nr_pages < 2)
nr_pages = 2;
@@ -1553,11 +1683,22 @@ int ring_buffer_resize(struct ring_buffer *buffer, unsigned long size,
if (!cpu_buffer->nr_pages_to_update)
continue;
- if (cpu_online(cpu))
+ /* The update must run on the CPU that is being updated. */
+ preempt_disable();
+ if (cpu == smp_processor_id() || !cpu_online(cpu)) {
+ rb_update_pages(cpu_buffer);
+ cpu_buffer->nr_pages_to_update = 0;
+ } else {
+ /*
+ * Can not disable preemption for schedule_work_on()
+ * on PREEMPT_RT.
+ */
+ preempt_enable();
schedule_work_on(cpu,
&cpu_buffer->update_pages_work);
- else
- rb_update_pages(cpu_buffer);
+ preempt_disable();
+ }
+ preempt_enable();
}
/* wait for all the updates to complete */
@@ -1595,12 +1736,22 @@ int ring_buffer_resize(struct ring_buffer *buffer, unsigned long size,
get_online_cpus();
- if (cpu_online(cpu_id)) {
+ preempt_disable();
+ /* The update must run on the CPU that is being updated. */
+ if (cpu_id == smp_processor_id() || !cpu_online(cpu_id))
+ rb_update_pages(cpu_buffer);
+ else {
+ /*
+ * Can not disable preemption for schedule_work_on()
+ * on PREEMPT_RT.
+ */
+ preempt_enable();
schedule_work_on(cpu_id,
&cpu_buffer->update_pages_work);
wait_for_completion(&cpu_buffer->update_done);
- } else
- rb_update_pages(cpu_buffer);
+ preempt_disable();
+ }
+ preempt_enable();
cpu_buffer->nr_pages_to_update = 0;
put_online_cpus();
@@ -2612,6 +2763,22 @@ static void rb_commit(struct ring_buffer_per_cpu *cpu_buffer,
rb_end_commit(cpu_buffer);
}
+static __always_inline void
+rb_wakeups(struct ring_buffer *buffer, struct ring_buffer_per_cpu *cpu_buffer)
+{
+ if (buffer->irq_work.waiters_pending) {
+ buffer->irq_work.waiters_pending = false;
+ /* irq_work_queue() supplies it's own memory barriers */
+ irq_work_queue(&buffer->irq_work.work);
+ }
+
+ if (cpu_buffer->irq_work.waiters_pending) {
+ cpu_buffer->irq_work.waiters_pending = false;
+ /* irq_work_queue() supplies it's own memory barriers */
+ irq_work_queue(&cpu_buffer->irq_work.work);
+ }
+}
+
/**
* ring_buffer_unlock_commit - commit a reserved
* @buffer: The buffer to commit to
@@ -2631,6 +2798,8 @@ int ring_buffer_unlock_commit(struct ring_buffer *buffer,
rb_commit(cpu_buffer, event);
+ rb_wakeups(buffer, cpu_buffer);
+
trace_recursive_unlock();
preempt_enable_notrace();
@@ -2803,6 +2972,8 @@ int ring_buffer_write(struct ring_buffer *buffer,
rb_commit(cpu_buffer, event);
+ rb_wakeups(buffer, cpu_buffer);
+
ret = 0;
out:
preempt_enable_notrace();
@@ -4467,3 +4638,320 @@ static int rb_cpu_notify(struct notifier_block *self,
return NOTIFY_OK;
}
#endif
+
+#ifdef CONFIG_RING_BUFFER_STARTUP_TEST
+/*
+ * This is a basic integrity check of the ring buffer.
+ * Late in the boot cycle this test will run when configured in.
+ * It will kick off a thread per CPU that will go into a loop
+ * writing to the per cpu ring buffer various sizes of data.
+ * Some of the data will be large items, some small.
+ *
+ * Another thread is created that goes into a spin, sending out
+ * IPIs to the other CPUs to also write into the ring buffer.
+ * this is to test the nesting ability of the buffer.
+ *
+ * Basic stats are recorded and reported. If something in the
+ * ring buffer should happen that's not expected, a big warning
+ * is displayed and all ring buffers are disabled.
+ */
+static struct task_struct *rb_threads[NR_CPUS] __initdata;
+
+struct rb_test_data {
+ struct ring_buffer *buffer;
+ unsigned long events;
+ unsigned long bytes_written;
+ unsigned long bytes_alloc;
+ unsigned long bytes_dropped;
+ unsigned long events_nested;
+ unsigned long bytes_written_nested;
+ unsigned long bytes_alloc_nested;
+ unsigned long bytes_dropped_nested;
+ int min_size_nested;
+ int max_size_nested;
+ int max_size;
+ int min_size;
+ int cpu;
+ int cnt;
+};
+
+static struct rb_test_data rb_data[NR_CPUS] __initdata;
+
+/* 1 meg per cpu */
+#define RB_TEST_BUFFER_SIZE 1048576
+
+static char rb_string[] __initdata =
+ "abcdefghijklmnopqrstuvwxyz1234567890!@#$%^&*()?+\\"
+ "?+|:';\",.<>/?abcdefghijklmnopqrstuvwxyz1234567890"
+ "!@#$%^&*()?+\\?+|:';\",.<>/?abcdefghijklmnopqrstuv";
+
+static bool rb_test_started __initdata;
+
+struct rb_item {
+ int size;
+ char str[];
+};
+
+static __init int rb_write_something(struct rb_test_data *data, bool nested)
+{
+ struct ring_buffer_event *event;
+ struct rb_item *item;
+ bool started;
+ int event_len;
+ int size;
+ int len;
+ int cnt;
+
+ /* Have nested writes different that what is written */
+ cnt = data->cnt + (nested ? 27 : 0);
+
+ /* Multiply cnt by ~e, to make some unique increment */
+ size = (data->cnt * 68 / 25) % (sizeof(rb_string) - 1);
+
+ len = size + sizeof(struct rb_item);
+
+ started = rb_test_started;
+ /* read rb_test_started before checking buffer enabled */
+ smp_rmb();
+
+ event = ring_buffer_lock_reserve(data->buffer, len);
+ if (!event) {
+ /* Ignore dropped events before test starts. */
+ if (started) {
+ if (nested)
+ data->bytes_dropped += len;
+ else
+ data->bytes_dropped_nested += len;
+ }
+ return len;
+ }
+
+ event_len = ring_buffer_event_length(event);
+
+ if (RB_WARN_ON(data->buffer, event_len < len))
+ goto out;
+
+ item = ring_buffer_event_data(event);
+ item->size = size;
+ memcpy(item->str, rb_string, size);
+
+ if (nested) {
+ data->bytes_alloc_nested += event_len;
+ data->bytes_written_nested += len;
+ data->events_nested++;
+ if (!data->min_size_nested || len < data->min_size_nested)
+ data->min_size_nested = len;
+ if (len > data->max_size_nested)
+ data->max_size_nested = len;
+ } else {
+ data->bytes_alloc += event_len;
+ data->bytes_written += len;
+ data->events++;
+ if (!data->min_size || len < data->min_size)
+ data->max_size = len;
+ if (len > data->max_size)
+ data->max_size = len;
+ }
+
+ out:
+ ring_buffer_unlock_commit(data->buffer, event);
+
+ return 0;
+}
+
+static __init int rb_test(void *arg)
+{
+ struct rb_test_data *data = arg;
+
+ while (!kthread_should_stop()) {
+ rb_write_something(data, false);
+ data->cnt++;
+
+ set_current_state(TASK_INTERRUPTIBLE);
+ /* Now sleep between a min of 100-300us and a max of 1ms */
+ usleep_range(((data->cnt % 3) + 1) * 100, 1000);
+ }
+
+ return 0;
+}
+
+static __init void rb_ipi(void *ignore)
+{
+ struct rb_test_data *data;
+ int cpu = smp_processor_id();
+
+ data = &rb_data[cpu];
+ rb_write_something(data, true);
+}
+
+static __init int rb_hammer_test(void *arg)
+{
+ while (!kthread_should_stop()) {
+
+ /* Send an IPI to all cpus to write data! */
+ smp_call_function(rb_ipi, NULL, 1);
+ /* No sleep, but for non preempt, let others run */
+ schedule();
+ }
+
+ return 0;
+}
+
+static __init int test_ringbuffer(void)
+{
+ struct task_struct *rb_hammer;
+ struct ring_buffer *buffer;
+ int cpu;
+ int ret = 0;
+
+ pr_info("Running ring buffer tests...\n");
+
+ buffer = ring_buffer_alloc(RB_TEST_BUFFER_SIZE, RB_FL_OVERWRITE);
+ if (WARN_ON(!buffer))
+ return 0;
+
+ /* Disable buffer so that threads can't write to it yet */
+ ring_buffer_record_off(buffer);
+
+ for_each_online_cpu(cpu) {
+ rb_data[cpu].buffer = buffer;
+ rb_data[cpu].cpu = cpu;
+ rb_data[cpu].cnt = cpu;
+ rb_threads[cpu] = kthread_create(rb_test, &rb_data[cpu],
+ "rbtester/%d", cpu);
+ if (WARN_ON(!rb_threads[cpu])) {
+ pr_cont("FAILED\n");
+ ret = -1;
+ goto out_free;
+ }
+
+ kthread_bind(rb_threads[cpu], cpu);
+ wake_up_process(rb_threads[cpu]);
+ }
+
+ /* Now create the rb hammer! */
+ rb_hammer = kthread_run(rb_hammer_test, NULL, "rbhammer");
+ if (WARN_ON(!rb_hammer)) {
+ pr_cont("FAILED\n");
+ ret = -1;
+ goto out_free;
+ }
+
+ ring_buffer_record_on(buffer);
+ /*
+ * Show buffer is enabled before setting rb_test_started.
+ * Yes there's a small race window where events could be
+ * dropped and the thread wont catch it. But when a ring
+ * buffer gets enabled, there will always be some kind of
+ * delay before other CPUs see it. Thus, we don't care about
+ * those dropped events. We care about events dropped after
+ * the threads see that the buffer is active.
+ */
+ smp_wmb();
+ rb_test_started = true;
+
+ set_current_state(TASK_INTERRUPTIBLE);
+ /* Just run for 10 seconds */;
+ schedule_timeout(10 * HZ);
+
+ kthread_stop(rb_hammer);
+
+ out_free:
+ for_each_online_cpu(cpu) {
+ if (!rb_threads[cpu])
+ break;
+ kthread_stop(rb_threads[cpu]);
+ }
+ if (ret) {
+ ring_buffer_free(buffer);
+ return ret;
+ }
+
+ /* Report! */
+ pr_info("finished\n");
+ for_each_online_cpu(cpu) {
+ struct ring_buffer_event *event;
+ struct rb_test_data *data = &rb_data[cpu];
+ struct rb_item *item;
+ unsigned long total_events;
+ unsigned long total_dropped;
+ unsigned long total_written;
+ unsigned long total_alloc;
+ unsigned long total_read = 0;
+ unsigned long total_size = 0;
+ unsigned long total_len = 0;
+ unsigned long total_lost = 0;
+ unsigned long lost;
+ int big_event_size;
+ int small_event_size;
+
+ ret = -1;
+
+ total_events = data->events + data->events_nested;
+ total_written = data->bytes_written + data->bytes_written_nested;
+ total_alloc = data->bytes_alloc + data->bytes_alloc_nested;
+ total_dropped = data->bytes_dropped + data->bytes_dropped_nested;
+
+ big_event_size = data->max_size + data->max_size_nested;
+ small_event_size = data->min_size + data->min_size_nested;
+
+ pr_info("CPU %d:\n", cpu);
+ pr_info(" events: %ld\n", total_events);
+ pr_info(" dropped bytes: %ld\n", total_dropped);
+ pr_info(" alloced bytes: %ld\n", total_alloc);
+ pr_info(" written bytes: %ld\n", total_written);
+ pr_info(" biggest event: %d\n", big_event_size);
+ pr_info(" smallest event: %d\n", small_event_size);
+
+ if (RB_WARN_ON(buffer, total_dropped))
+ break;
+
+ ret = 0;
+
+ while ((event = ring_buffer_consume(buffer, cpu, NULL, &lost))) {
+ total_lost += lost;
+ item = ring_buffer_event_data(event);
+ total_len += ring_buffer_event_length(event);
+ total_size += item->size + sizeof(struct rb_item);
+ if (memcmp(&item->str[0], rb_string, item->size) != 0) {
+ pr_info("FAILED!\n");
+ pr_info("buffer had: %.*s\n", item->size, item->str);
+ pr_info("expected: %.*s\n", item->size, rb_string);
+ RB_WARN_ON(buffer, 1);
+ ret = -1;
+ break;
+ }
+ total_read++;
+ }
+ if (ret)
+ break;
+
+ ret = -1;
+
+ pr_info(" read events: %ld\n", total_read);
+ pr_info(" lost events: %ld\n", total_lost);
+ pr_info(" total events: %ld\n", total_lost + total_read);
+ pr_info(" recorded len bytes: %ld\n", total_len);
+ pr_info(" recorded size bytes: %ld\n", total_size);
+ if (total_lost)
+ pr_info(" With dropped events, record len and size may not match\n"
+ " alloced and written from above\n");
+ if (!total_lost) {
+ if (RB_WARN_ON(buffer, total_len != total_alloc ||
+ total_size != total_written))
+ break;
+ }
+ if (RB_WARN_ON(buffer, total_lost + total_read != total_events))
+ break;
+
+ ret = 0;
+ }
+ if (!ret)
+ pr_info("Ring buffer PASSED!\n");
+
+ ring_buffer_free(buffer);
+ return 0;
+}
+
+late_initcall(test_ringbuffer);
+#endif /* CONFIG_RING_BUFFER_STARTUP_TEST */
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index c2e2c2310374..581630a6387d 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -1,7 +1,7 @@
/*
* ring buffer based function tracer
*
- * Copyright (C) 2007-2008 Steven Rostedt <srostedt@redhat.com>
+ * Copyright (C) 2007-2012 Steven Rostedt <srostedt@redhat.com>
* Copyright (C) 2008 Ingo Molnar <mingo@redhat.com>
*
* Originally taken from the RT patch by:
@@ -19,7 +19,6 @@
#include <linux/seq_file.h>
#include <linux/notifier.h>
#include <linux/irqflags.h>
-#include <linux/irq_work.h>
#include <linux/debugfs.h>
#include <linux/pagemap.h>
#include <linux/hardirq.h>
@@ -48,7 +47,7 @@
* On boot up, the ring buffer is set to the minimum size, so that
* we do not waste memory on systems that are not using tracing.
*/
-int ring_buffer_expanded;
+bool ring_buffer_expanded;
/*
* We need to change this state when a selftest is running.
@@ -87,14 +86,6 @@ static int dummy_set_flag(u32 old_flags, u32 bit, int set)
static DEFINE_PER_CPU(bool, trace_cmdline_save);
/*
- * When a reader is waiting for data, then this variable is
- * set to true.
- */
-static bool trace_wakeup_needed;
-
-static struct irq_work trace_work_wakeup;
-
-/*
* Kill all tracing for good (never come back).
* It is initialized to 1 but will turn to zero if the initialization
* of the tracer is successful. But that is the only place that sets
@@ -130,12 +121,14 @@ static int tracing_set_tracer(const char *buf);
static char bootup_tracer_buf[MAX_TRACER_SIZE] __initdata;
static char *default_bootup_tracer;
+static bool allocate_snapshot;
+
static int __init set_cmdline_ftrace(char *str)
{
- strncpy(bootup_tracer_buf, str, MAX_TRACER_SIZE);
+ strlcpy(bootup_tracer_buf, str, MAX_TRACER_SIZE);
default_bootup_tracer = bootup_tracer_buf;
/* We are using ftrace early, expand it */
- ring_buffer_expanded = 1;
+ ring_buffer_expanded = true;
return 1;
}
__setup("ftrace=", set_cmdline_ftrace);
@@ -156,13 +149,22 @@ static int __init set_ftrace_dump_on_oops(char *str)
}
__setup("ftrace_dump_on_oops", set_ftrace_dump_on_oops);
+static int __init boot_alloc_snapshot(char *str)
+{
+ allocate_snapshot = true;
+ /* We also need the main ring buffer expanded */
+ ring_buffer_expanded = true;
+ return 1;
+}
+__setup("alloc_snapshot", boot_alloc_snapshot);
+
static char trace_boot_options_buf[MAX_TRACER_SIZE] __initdata;
static char *trace_boot_options __initdata;
static int __init set_trace_boot_options(char *str)
{
- strncpy(trace_boot_options_buf, str, MAX_TRACER_SIZE);
+ strlcpy(trace_boot_options_buf, str, MAX_TRACER_SIZE);
trace_boot_options = trace_boot_options_buf;
return 0;
}
@@ -189,7 +191,7 @@ unsigned long long ns2usecs(cycle_t nsec)
*/
static struct trace_array global_trace;
-static DEFINE_PER_CPU(struct trace_array_cpu, global_trace_cpu);
+LIST_HEAD(ftrace_trace_arrays);
int filter_current_check_discard(struct ring_buffer *buffer,
struct ftrace_event_call *call, void *rec,
@@ -204,29 +206,15 @@ cycle_t ftrace_now(int cpu)
u64 ts;
/* Early boot up does not have a buffer yet */
- if (!global_trace.buffer)
+ if (!global_trace.trace_buffer.buffer)
return trace_clock_local();
- ts = ring_buffer_time_stamp(global_trace.buffer, cpu);
- ring_buffer_normalize_time_stamp(global_trace.buffer, cpu, &ts);
+ ts = ring_buffer_time_stamp(global_trace.trace_buffer.buffer, cpu);
+ ring_buffer_normalize_time_stamp(global_trace.trace_buffer.buffer, cpu, &ts);
return ts;
}
-/*
- * The max_tr is used to snapshot the global_trace when a maximum
- * latency is reached. Some tracers will use this to store a maximum
- * trace while it continues examining live traces.
- *
- * The buffers for the max_tr are set up the same as the global_trace.
- * When a snapshot is taken, the link list of the max_tr is swapped
- * with the link list of the global_trace and the buffers are reset for
- * the global_trace so the tracing can continue.
- */
-static struct trace_array max_tr;
-
-static DEFINE_PER_CPU(struct trace_array_cpu, max_tr_data);
-
int tracing_is_enabled(void)
{
return tracing_is_on();
@@ -249,9 +237,6 @@ static unsigned long trace_buf_size = TRACE_BUF_SIZE_DEFAULT;
/* trace_types holds a link list of available tracers. */
static struct tracer *trace_types __read_mostly;
-/* current_trace points to the tracer that is currently active */
-static struct tracer *current_trace __read_mostly = &nop_trace;
-
/*
* trace_types_lock is used to protect the trace_types list.
*/
@@ -285,13 +270,13 @@ static DEFINE_PER_CPU(struct mutex, cpu_access_lock);
static inline void trace_access_lock(int cpu)
{
- if (cpu == TRACE_PIPE_ALL_CPU) {
+ if (cpu == RING_BUFFER_ALL_CPUS) {
/* gain it for accessing the whole ring buffer. */
down_write(&all_cpu_access_lock);
} else {
/* gain it for accessing a cpu ring buffer. */
- /* Firstly block other trace_access_lock(TRACE_PIPE_ALL_CPU). */
+ /* Firstly block other trace_access_lock(RING_BUFFER_ALL_CPUS). */
down_read(&all_cpu_access_lock);
/* Secondly block other access to this @cpu ring buffer. */
@@ -301,7 +286,7 @@ static inline void trace_access_lock(int cpu)
static inline void trace_access_unlock(int cpu)
{
- if (cpu == TRACE_PIPE_ALL_CPU) {
+ if (cpu == RING_BUFFER_ALL_CPUS) {
up_write(&all_cpu_access_lock);
} else {
mutex_unlock(&per_cpu(cpu_access_lock, cpu));
@@ -339,30 +324,11 @@ static inline void trace_access_lock_init(void)
#endif
-/* trace_wait is a waitqueue for tasks blocked on trace_poll */
-static DECLARE_WAIT_QUEUE_HEAD(trace_wait);
-
/* trace_flags holds trace_options default values */
unsigned long trace_flags = TRACE_ITER_PRINT_PARENT | TRACE_ITER_PRINTK |
TRACE_ITER_ANNOTATE | TRACE_ITER_CONTEXT_INFO | TRACE_ITER_SLEEP_TIME |
TRACE_ITER_GRAPH_TIME | TRACE_ITER_RECORD_CMD | TRACE_ITER_OVERWRITE |
- TRACE_ITER_IRQ_INFO | TRACE_ITER_MARKERS;
-
-static int trace_stop_count;
-static DEFINE_RAW_SPINLOCK(tracing_start_lock);
-
-/**
- * trace_wake_up - wake up tasks waiting for trace input
- *
- * Schedules a delayed work to wake up any task that is blocked on the
- * trace_wait queue. These is used with trace_poll for tasks polling the
- * trace.
- */
-static void trace_wake_up(struct irq_work *work)
-{
- wake_up_all(&trace_wait);
-
-}
+ TRACE_ITER_IRQ_INFO | TRACE_ITER_MARKERS | TRACE_ITER_FUNCTION;
/**
* tracing_on - enable tracing buffers
@@ -372,8 +338,8 @@ static void trace_wake_up(struct irq_work *work)
*/
void tracing_on(void)
{
- if (global_trace.buffer)
- ring_buffer_record_on(global_trace.buffer);
+ if (global_trace.trace_buffer.buffer)
+ ring_buffer_record_on(global_trace.trace_buffer.buffer);
/*
* This flag is only looked at when buffers haven't been
* allocated yet. We don't really care about the race
@@ -385,6 +351,196 @@ void tracing_on(void)
EXPORT_SYMBOL_GPL(tracing_on);
/**
+ * __trace_puts - write a constant string into the trace buffer.
+ * @ip: The address of the caller
+ * @str: The constant string to write
+ * @size: The size of the string.
+ */
+int __trace_puts(unsigned long ip, const char *str, int size)
+{
+ struct ring_buffer_event *event;
+ struct ring_buffer *buffer;
+ struct print_entry *entry;
+ unsigned long irq_flags;
+ int alloc;
+
+ alloc = sizeof(*entry) + size + 2; /* possible \n added */
+
+ local_save_flags(irq_flags);
+ buffer = global_trace.trace_buffer.buffer;
+ event = trace_buffer_lock_reserve(buffer, TRACE_PRINT, alloc,
+ irq_flags, preempt_count());
+ if (!event)
+ return 0;
+
+ entry = ring_buffer_event_data(event);
+ entry->ip = ip;
+
+ memcpy(&entry->buf, str, size);
+
+ /* Add a newline if necessary */
+ if (entry->buf[size - 1] != '\n') {
+ entry->buf[size] = '\n';
+ entry->buf[size + 1] = '\0';
+ } else
+ entry->buf[size] = '\0';
+
+ __buffer_unlock_commit(buffer, event);
+
+ return size;
+}
+EXPORT_SYMBOL_GPL(__trace_puts);
+
+/**
+ * __trace_bputs - write the pointer to a constant string into trace buffer
+ * @ip: The address of the caller
+ * @str: The constant string to write to the buffer to
+ */
+int __trace_bputs(unsigned long ip, const char *str)
+{
+ struct ring_buffer_event *event;
+ struct ring_buffer *buffer;
+ struct bputs_entry *entry;
+ unsigned long irq_flags;
+ int size = sizeof(struct bputs_entry);
+
+ local_save_flags(irq_flags);
+ buffer = global_trace.trace_buffer.buffer;
+ event = trace_buffer_lock_reserve(buffer, TRACE_BPUTS, size,
+ irq_flags, preempt_count());
+ if (!event)
+ return 0;
+
+ entry = ring_buffer_event_data(event);
+ entry->ip = ip;
+ entry->str = str;
+
+ __buffer_unlock_commit(buffer, event);
+
+ return 1;
+}
+EXPORT_SYMBOL_GPL(__trace_bputs);
+
+#ifdef CONFIG_TRACER_SNAPSHOT
+/**
+ * trace_snapshot - take a snapshot of the current buffer.
+ *
+ * This causes a swap between the snapshot buffer and the current live
+ * tracing buffer. You can use this to take snapshots of the live
+ * trace when some condition is triggered, but continue to trace.
+ *
+ * Note, make sure to allocate the snapshot with either
+ * a tracing_snapshot_alloc(), or by doing it manually
+ * with: echo 1 > /sys/kernel/debug/tracing/snapshot
+ *
+ * If the snapshot buffer is not allocated, it will stop tracing.
+ * Basically making a permanent snapshot.
+ */
+void tracing_snapshot(void)
+{
+ struct trace_array *tr = &global_trace;
+ struct tracer *tracer = tr->current_trace;
+ unsigned long flags;
+
+ if (in_nmi()) {
+ internal_trace_puts("*** SNAPSHOT CALLED FROM NMI CONTEXT ***\n");
+ internal_trace_puts("*** snapshot is being ignored ***\n");
+ return;
+ }
+
+ if (!tr->allocated_snapshot) {
+ internal_trace_puts("*** SNAPSHOT NOT ALLOCATED ***\n");
+ internal_trace_puts("*** stopping trace here! ***\n");
+ tracing_off();
+ return;
+ }
+
+ /* Note, snapshot can not be used when the tracer uses it */
+ if (tracer->use_max_tr) {
+ internal_trace_puts("*** LATENCY TRACER ACTIVE ***\n");
+ internal_trace_puts("*** Can not use snapshot (sorry) ***\n");
+ return;
+ }
+
+ local_irq_save(flags);
+ update_max_tr(tr, current, smp_processor_id());
+ local_irq_restore(flags);
+}
+EXPORT_SYMBOL_GPL(tracing_snapshot);
+
+static int resize_buffer_duplicate_size(struct trace_buffer *trace_buf,
+ struct trace_buffer *size_buf, int cpu_id);
+static void set_buffer_entries(struct trace_buffer *buf, unsigned long val);
+
+static int alloc_snapshot(struct trace_array *tr)
+{
+ int ret;
+
+ if (!tr->allocated_snapshot) {
+
+ /* allocate spare buffer */
+ ret = resize_buffer_duplicate_size(&tr->max_buffer,
+ &tr->trace_buffer, RING_BUFFER_ALL_CPUS);
+ if (ret < 0)
+ return ret;
+
+ tr->allocated_snapshot = true;
+ }
+
+ return 0;
+}
+
+void free_snapshot(struct trace_array *tr)
+{
+ /*
+ * We don't free the ring buffer. instead, resize it because
+ * The max_tr ring buffer has some state (e.g. ring->clock) and
+ * we want preserve it.
+ */
+ ring_buffer_resize(tr->max_buffer.buffer, 1, RING_BUFFER_ALL_CPUS);
+ set_buffer_entries(&tr->max_buffer, 1);
+ tracing_reset_online_cpus(&tr->max_buffer);
+ tr->allocated_snapshot = false;
+}
+
+/**
+ * trace_snapshot_alloc - allocate and take a snapshot of the current buffer.
+ *
+ * This is similar to trace_snapshot(), but it will allocate the
+ * snapshot buffer if it isn't already allocated. Use this only
+ * where it is safe to sleep, as the allocation may sleep.
+ *
+ * This causes a swap between the snapshot buffer and the current live
+ * tracing buffer. You can use this to take snapshots of the live
+ * trace when some condition is triggered, but continue to trace.
+ */
+void tracing_snapshot_alloc(void)
+{
+ struct trace_array *tr = &global_trace;
+ int ret;
+
+ ret = alloc_snapshot(tr);
+ if (WARN_ON(ret < 0))
+ return;
+
+ tracing_snapshot();
+}
+EXPORT_SYMBOL_GPL(tracing_snapshot_alloc);
+#else
+void tracing_snapshot(void)
+{
+ WARN_ONCE(1, "Snapshot feature not enabled, but internal snapshot used");
+}
+EXPORT_SYMBOL_GPL(tracing_snapshot);
+void tracing_snapshot_alloc(void)
+{
+ /* Give warning */
+ tracing_snapshot();
+}
+EXPORT_SYMBOL_GPL(tracing_snapshot_alloc);
+#endif /* CONFIG_TRACER_SNAPSHOT */
+
+/**
* tracing_off - turn off tracing buffers
*
* This function stops the tracing buffers from recording data.
@@ -394,8 +550,8 @@ EXPORT_SYMBOL_GPL(tracing_on);
*/
void tracing_off(void)
{
- if (global_trace.buffer)
- ring_buffer_record_off(global_trace.buffer);
+ if (global_trace.trace_buffer.buffer)
+ ring_buffer_record_off(global_trace.trace_buffer.buffer);
/*
* This flag is only looked at when buffers haven't been
* allocated yet. We don't really care about the race
@@ -411,8 +567,8 @@ EXPORT_SYMBOL_GPL(tracing_off);
*/
int tracing_is_on(void)
{
- if (global_trace.buffer)
- return ring_buffer_record_is_on(global_trace.buffer);
+ if (global_trace.trace_buffer.buffer)
+ return ring_buffer_record_is_on(global_trace.trace_buffer.buffer);
return !global_trace.buffer_disabled;
}
EXPORT_SYMBOL_GPL(tracing_is_on);
@@ -479,6 +635,7 @@ static const char *trace_options[] = {
"disable_on_free",
"irq-info",
"markers",
+ "function-trace",
NULL
};
@@ -490,6 +647,8 @@ static struct {
{ trace_clock_local, "local", 1 },
{ trace_clock_global, "global", 1 },
{ trace_clock_counter, "counter", 0 },
+ { trace_clock_jiffies, "uptime", 1 },
+ { trace_clock, "perf", 1 },
ARCH_TRACE_CLOCKS
};
@@ -670,13 +829,14 @@ unsigned long __read_mostly tracing_max_latency;
static void
__update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu)
{
- struct trace_array_cpu *data = tr->data[cpu];
- struct trace_array_cpu *max_data;
+ struct trace_buffer *trace_buf = &tr->trace_buffer;
+ struct trace_buffer *max_buf = &tr->max_buffer;
+ struct trace_array_cpu *data = per_cpu_ptr(trace_buf->data, cpu);
+ struct trace_array_cpu *max_data = per_cpu_ptr(max_buf->data, cpu);
- max_tr.cpu = cpu;
- max_tr.time_start = data->preempt_timestamp;
+ max_buf->cpu = cpu;
+ max_buf->time_start = data->preempt_timestamp;
- max_data = max_tr.data[cpu];
max_data->saved_latency = tracing_max_latency;
max_data->critical_start = data->critical_start;
max_data->critical_end = data->critical_end;
@@ -704,23 +864,24 @@ __update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu)
void
update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu)
{
- struct ring_buffer *buf = tr->buffer;
+ struct ring_buffer *buf;
- if (trace_stop_count)
+ if (tr->stop_count)
return;
WARN_ON_ONCE(!irqs_disabled());
- if (!current_trace->allocated_snapshot) {
+ if (!tr->allocated_snapshot) {
/* Only the nop tracer should hit this when disabling */
- WARN_ON_ONCE(current_trace != &nop_trace);
+ WARN_ON_ONCE(tr->current_trace != &nop_trace);
return;
}
arch_spin_lock(&ftrace_max_lock);
- tr->buffer = max_tr.buffer;
- max_tr.buffer = buf;
+ buf = tr->trace_buffer.buffer;
+ tr->trace_buffer.buffer = tr->max_buffer.buffer;
+ tr->max_buffer.buffer = buf;
__update_max_tr(tr, tsk, cpu);
arch_spin_unlock(&ftrace_max_lock);
@@ -739,16 +900,19 @@ update_max_tr_single(struct trace_array *tr, struct task_struct *tsk, int cpu)
{
int ret;
- if (trace_stop_count)
+ if (tr->stop_count)
return;
WARN_ON_ONCE(!irqs_disabled());
- if (WARN_ON_ONCE(!current_trace->allocated_snapshot))
+ if (tr->allocated_snapshot) {
+ /* Only the nop tracer should hit this when disabling */
+ WARN_ON_ONCE(tr->current_trace != &nop_trace);
return;
+ }
arch_spin_lock(&ftrace_max_lock);
- ret = ring_buffer_swap_cpu(max_tr.buffer, tr->buffer, cpu);
+ ret = ring_buffer_swap_cpu(tr->max_buffer.buffer, tr->trace_buffer.buffer, cpu);
if (ret == -EBUSY) {
/*
@@ -757,7 +921,7 @@ update_max_tr_single(struct trace_array *tr, struct task_struct *tsk, int cpu)
* the max trace buffer (no one writes directly to it)
* and flag that it failed.
*/
- trace_array_printk(&max_tr, _THIS_IP_,
+ trace_array_printk_buf(tr->max_buffer.buffer, _THIS_IP_,
"Failed to swap buffers due to commit in progress\n");
}
@@ -770,37 +934,78 @@ update_max_tr_single(struct trace_array *tr, struct task_struct *tsk, int cpu)
static void default_wait_pipe(struct trace_iterator *iter)
{
- DEFINE_WAIT(wait);
+ /* Iterators are static, they should be filled or empty */
+ if (trace_buffer_iter(iter, iter->cpu_file))
+ return;
- prepare_to_wait(&trace_wait, &wait, TASK_INTERRUPTIBLE);
+ ring_buffer_wait(iter->trace_buffer->buffer, iter->cpu_file);
+}
+
+#ifdef CONFIG_FTRACE_STARTUP_TEST
+static int run_tracer_selftest(struct tracer *type)
+{
+ struct trace_array *tr = &global_trace;
+ struct tracer *saved_tracer = tr->current_trace;
+ int ret;
+
+ if (!type->selftest || tracing_selftest_disabled)
+ return 0;
/*
- * The events can happen in critical sections where
- * checking a work queue can cause deadlocks.
- * After adding a task to the queue, this flag is set
- * only to notify events to try to wake up the queue
- * using irq_work.
- *
- * We don't clear it even if the buffer is no longer
- * empty. The flag only causes the next event to run
- * irq_work to do the work queue wake up. The worse
- * that can happen if we race with !trace_empty() is that
- * an event will cause an irq_work to try to wake up
- * an empty queue.
- *
- * There's no reason to protect this flag either, as
- * the work queue and irq_work logic will do the necessary
- * synchronization for the wake ups. The only thing
- * that is necessary is that the wake up happens after
- * a task has been queued. It's OK for spurious wake ups.
+ * Run a selftest on this tracer.
+ * Here we reset the trace buffer, and set the current
+ * tracer to be this tracer. The tracer can then run some
+ * internal tracing to verify that everything is in order.
+ * If we fail, we do not register this tracer.
*/
- trace_wakeup_needed = true;
+ tracing_reset_online_cpus(&tr->trace_buffer);
- if (trace_empty(iter))
- schedule();
+ tr->current_trace = type;
+
+#ifdef CONFIG_TRACER_MAX_TRACE
+ if (type->use_max_tr) {
+ /* If we expanded the buffers, make sure the max is expanded too */
+ if (ring_buffer_expanded)
+ ring_buffer_resize(tr->max_buffer.buffer, trace_buf_size,
+ RING_BUFFER_ALL_CPUS);
+ tr->allocated_snapshot = true;
+ }
+#endif
- finish_wait(&trace_wait, &wait);
+ /* the test is responsible for initializing and enabling */
+ pr_info("Testing tracer %s: ", type->name);
+ ret = type->selftest(type, tr);
+ /* the test is responsible for resetting too */
+ tr->current_trace = saved_tracer;
+ if (ret) {
+ printk(KERN_CONT "FAILED!\n");
+ /* Add the warning after printing 'FAILED' */
+ WARN_ON(1);
+ return -1;
+ }
+ /* Only reset on passing, to avoid touching corrupted buffers */
+ tracing_reset_online_cpus(&tr->trace_buffer);
+
+#ifdef CONFIG_TRACER_MAX_TRACE
+ if (type->use_max_tr) {
+ tr->allocated_snapshot = false;
+
+ /* Shrink the max buffer again */
+ if (ring_buffer_expanded)
+ ring_buffer_resize(tr->max_buffer.buffer, 1,
+ RING_BUFFER_ALL_CPUS);
+ }
+#endif
+
+ printk(KERN_CONT "PASSED\n");
+ return 0;
}
+#else
+static inline int run_tracer_selftest(struct tracer *type)
+{
+ return 0;
+}
+#endif /* CONFIG_FTRACE_STARTUP_TEST */
/**
* register_tracer - register a tracer with the ftrace system.
@@ -847,57 +1052,9 @@ int register_tracer(struct tracer *type)
if (!type->wait_pipe)
type->wait_pipe = default_wait_pipe;
-
-#ifdef CONFIG_FTRACE_STARTUP_TEST
- if (type->selftest && !tracing_selftest_disabled) {
- struct tracer *saved_tracer = current_trace;
- struct trace_array *tr = &global_trace;
-
- /*
- * Run a selftest on this tracer.
- * Here we reset the trace buffer, and set the current
- * tracer to be this tracer. The tracer can then run some
- * internal tracing to verify that everything is in order.
- * If we fail, we do not register this tracer.
- */
- tracing_reset_online_cpus(tr);
-
- current_trace = type;
-
- if (type->use_max_tr) {
- /* If we expanded the buffers, make sure the max is expanded too */
- if (ring_buffer_expanded)
- ring_buffer_resize(max_tr.buffer, trace_buf_size,
- RING_BUFFER_ALL_CPUS);
- type->allocated_snapshot = true;
- }
-
- /* the test is responsible for initializing and enabling */
- pr_info("Testing tracer %s: ", type->name);
- ret = type->selftest(type, tr);
- /* the test is responsible for resetting too */
- current_trace = saved_tracer;
- if (ret) {
- printk(KERN_CONT "FAILED!\n");
- /* Add the warning after printing 'FAILED' */
- WARN_ON(1);
- goto out;
- }
- /* Only reset on passing, to avoid touching corrupted buffers */
- tracing_reset_online_cpus(tr);
-
- if (type->use_max_tr) {
- type->allocated_snapshot = false;
-
- /* Shrink the max buffer again */
- if (ring_buffer_expanded)
- ring_buffer_resize(max_tr.buffer, 1,
- RING_BUFFER_ALL_CPUS);
- }
-
- printk(KERN_CONT "PASSED\n");
- }
-#endif
+ ret = run_tracer_selftest(type);
+ if (ret < 0)
+ goto out;
type->next = trace_types;
trace_types = type;
@@ -917,7 +1074,7 @@ int register_tracer(struct tracer *type)
tracing_set_tracer(type->name);
default_bootup_tracer = NULL;
/* disable other selftests, since this will break it. */
- tracing_selftest_disabled = 1;
+ tracing_selftest_disabled = true;
#ifdef CONFIG_FTRACE_STARTUP_TEST
printk(KERN_INFO "Disabling FTRACE selftests due to running tracer '%s'\n",
type->name);
@@ -927,9 +1084,9 @@ int register_tracer(struct tracer *type)
return ret;
}
-void tracing_reset(struct trace_array *tr, int cpu)
+void tracing_reset(struct trace_buffer *buf, int cpu)
{
- struct ring_buffer *buffer = tr->buffer;
+ struct ring_buffer *buffer = buf->buffer;
if (!buffer)
return;
@@ -943,9 +1100,9 @@ void tracing_reset(struct trace_array *tr, int cpu)
ring_buffer_record_enable(buffer);
}
-void tracing_reset_online_cpus(struct trace_array *tr)
+void tracing_reset_online_cpus(struct trace_buffer *buf)
{
- struct ring_buffer *buffer = tr->buffer;
+ struct ring_buffer *buffer = buf->buffer;
int cpu;
if (!buffer)
@@ -956,7 +1113,7 @@ void tracing_reset_online_cpus(struct trace_array *tr)
/* Make sure all commits have finished */
synchronize_sched();
- tr->time_start = ftrace_now(tr->cpu);
+ buf->time_start = ftrace_now(buf->cpu);
for_each_online_cpu(cpu)
ring_buffer_reset_cpu(buffer, cpu);
@@ -966,12 +1123,21 @@ void tracing_reset_online_cpus(struct trace_array *tr)
void tracing_reset_current(int cpu)
{
- tracing_reset(&global_trace, cpu);
+ tracing_reset(&global_trace.trace_buffer, cpu);
}
-void tracing_reset_current_online_cpus(void)
+void tracing_reset_all_online_cpus(void)
{
- tracing_reset_online_cpus(&global_trace);
+ struct trace_array *tr;
+
+ mutex_lock(&trace_types_lock);
+ list_for_each_entry(tr, &ftrace_trace_arrays, list) {
+ tracing_reset_online_cpus(&tr->trace_buffer);
+#ifdef CONFIG_TRACER_MAX_TRACE
+ tracing_reset_online_cpus(&tr->max_buffer);
+#endif
+ }
+ mutex_unlock(&trace_types_lock);
}
#define SAVED_CMDLINES 128
@@ -994,7 +1160,7 @@ static void trace_init_cmdlines(void)
int is_tracing_stopped(void)
{
- return trace_stop_count;
+ return global_trace.stop_count;
}
/**
@@ -1026,12 +1192,12 @@ void tracing_start(void)
if (tracing_disabled)
return;
- raw_spin_lock_irqsave(&tracing_start_lock, flags);
- if (--trace_stop_count) {
- if (trace_stop_count < 0) {
+ raw_spin_lock_irqsave(&global_trace.start_lock, flags);
+ if (--global_trace.stop_count) {
+ if (global_trace.stop_count < 0) {
/* Someone screwed up their debugging */
WARN_ON_ONCE(1);
- trace_stop_count = 0;
+ global_trace.stop_count = 0;
}
goto out;
}
@@ -1039,19 +1205,52 @@ void tracing_start(void)
/* Prevent the buffers from switching */
arch_spin_lock(&ftrace_max_lock);
- buffer = global_trace.buffer;
+ buffer = global_trace.trace_buffer.buffer;
if (buffer)
ring_buffer_record_enable(buffer);
- buffer = max_tr.buffer;
+#ifdef CONFIG_TRACER_MAX_TRACE
+ buffer = global_trace.max_buffer.buffer;
if (buffer)
ring_buffer_record_enable(buffer);
+#endif
arch_spin_unlock(&ftrace_max_lock);
ftrace_start();
out:
- raw_spin_unlock_irqrestore(&tracing_start_lock, flags);
+ raw_spin_unlock_irqrestore(&global_trace.start_lock, flags);
+}
+
+static void tracing_start_tr(struct trace_array *tr)
+{
+ struct ring_buffer *buffer;
+ unsigned long flags;
+
+ if (tracing_disabled)
+ return;
+
+ /* If global, we need to also start the max tracer */
+ if (tr->flags & TRACE_ARRAY_FL_GLOBAL)
+ return tracing_start();
+
+ raw_spin_lock_irqsave(&tr->start_lock, flags);
+
+ if (--tr->stop_count) {
+ if (tr->stop_count < 0) {
+ /* Someone screwed up their debugging */
+ WARN_ON_ONCE(1);
+ tr->stop_count = 0;
+ }
+ goto out;
+ }
+
+ buffer = tr->trace_buffer.buffer;
+ if (buffer)
+ ring_buffer_record_enable(buffer);
+
+ out:
+ raw_spin_unlock_irqrestore(&tr->start_lock, flags);
}
/**
@@ -1066,25 +1265,48 @@ void tracing_stop(void)
unsigned long flags;
ftrace_stop();
- raw_spin_lock_irqsave(&tracing_start_lock, flags);
- if (trace_stop_count++)
+ raw_spin_lock_irqsave(&global_trace.start_lock, flags);
+ if (global_trace.stop_count++)
goto out;
/* Prevent the buffers from switching */
arch_spin_lock(&ftrace_max_lock);
- buffer = global_trace.buffer;
+ buffer = global_trace.trace_buffer.buffer;
if (buffer)
ring_buffer_record_disable(buffer);
- buffer = max_tr.buffer;
+#ifdef CONFIG_TRACER_MAX_TRACE
+ buffer = global_trace.max_buffer.buffer;
if (buffer)
ring_buffer_record_disable(buffer);
+#endif
arch_spin_unlock(&ftrace_max_lock);
out:
- raw_spin_unlock_irqrestore(&tracing_start_lock, flags);
+ raw_spin_unlock_irqrestore(&global_trace.start_lock, flags);
+}
+
+static void tracing_stop_tr(struct trace_array *tr)
+{
+ struct ring_buffer *buffer;
+ unsigned long flags;
+
+ /* If global, we need to also stop the max tracer */
+ if (tr->flags & TRACE_ARRAY_FL_GLOBAL)
+ return tracing_stop();
+
+ raw_spin_lock_irqsave(&tr->start_lock, flags);
+ if (tr->stop_count++)
+ goto out;
+
+ buffer = tr->trace_buffer.buffer;
+ if (buffer)
+ ring_buffer_record_disable(buffer);
+
+ out:
+ raw_spin_unlock_irqrestore(&tr->start_lock, flags);
}
void trace_stop_cmdline_recording(void);
@@ -1217,11 +1439,6 @@ void
__buffer_unlock_commit(struct ring_buffer *buffer, struct ring_buffer_event *event)
{
__this_cpu_write(trace_cmdline_save, true);
- if (trace_wakeup_needed) {
- trace_wakeup_needed = false;
- /* irq_work_queue() supplies it's own memory barriers */
- irq_work_queue(&trace_work_wakeup);
- }
ring_buffer_unlock_commit(buffer, event);
}
@@ -1245,11 +1462,23 @@ void trace_buffer_unlock_commit(struct ring_buffer *buffer,
EXPORT_SYMBOL_GPL(trace_buffer_unlock_commit);
struct ring_buffer_event *
+trace_event_buffer_lock_reserve(struct ring_buffer **current_rb,
+ struct ftrace_event_file *ftrace_file,
+ int type, unsigned long len,
+ unsigned long flags, int pc)
+{
+ *current_rb = ftrace_file->tr->trace_buffer.buffer;
+ return trace_buffer_lock_reserve(*current_rb,
+ type, len, flags, pc);
+}
+EXPORT_SYMBOL_GPL(trace_event_buffer_lock_reserve);
+
+struct ring_buffer_event *
trace_current_buffer_lock_reserve(struct ring_buffer **current_rb,
int type, unsigned long len,
unsigned long flags, int pc)
{
- *current_rb = global_trace.buffer;
+ *current_rb = global_trace.trace_buffer.buffer;
return trace_buffer_lock_reserve(*current_rb,
type, len, flags, pc);
}
@@ -1288,7 +1517,7 @@ trace_function(struct trace_array *tr,
int pc)
{
struct ftrace_event_call *call = &event_function;
- struct ring_buffer *buffer = tr->buffer;
+ struct ring_buffer *buffer = tr->trace_buffer.buffer;
struct ring_buffer_event *event;
struct ftrace_entry *entry;
@@ -1429,13 +1658,14 @@ void ftrace_trace_stack(struct ring_buffer *buffer, unsigned long flags,
void __trace_stack(struct trace_array *tr, unsigned long flags, int skip,
int pc)
{
- __ftrace_trace_stack(tr->buffer, flags, skip, pc, NULL);
+ __ftrace_trace_stack(tr->trace_buffer.buffer, flags, skip, pc, NULL);
}
/**
* trace_dump_stack - record a stack back trace in the trace buffer
+ * @skip: Number of functions to skip (helper handlers)
*/
-void trace_dump_stack(void)
+void trace_dump_stack(int skip)
{
unsigned long flags;
@@ -1444,8 +1674,13 @@ void trace_dump_stack(void)
local_save_flags(flags);
- /* skipping 3 traces, seems to get us at the caller of this function */
- __ftrace_trace_stack(global_trace.buffer, flags, 3, preempt_count(), NULL);
+ /*
+ * Skip 3 more, seems to get us at the caller of
+ * this function.
+ */
+ skip += 3;
+ __ftrace_trace_stack(global_trace.trace_buffer.buffer,
+ flags, skip, preempt_count(), NULL);
}
static DEFINE_PER_CPU(int, user_stack_count);
@@ -1615,7 +1850,7 @@ void trace_printk_init_buffers(void)
* directly here. If the global_trace.buffer is already
* allocated here, then this was called by module code.
*/
- if (global_trace.buffer)
+ if (global_trace.trace_buffer.buffer)
tracing_start_cmdline_record();
}
@@ -1675,7 +1910,7 @@ int trace_vbprintk(unsigned long ip, const char *fmt, va_list args)
local_save_flags(flags);
size = sizeof(*entry) + sizeof(u32) * len;
- buffer = tr->buffer;
+ buffer = tr->trace_buffer.buffer;
event = trace_buffer_lock_reserve(buffer, TRACE_BPRINT, size,
flags, pc);
if (!event)
@@ -1698,27 +1933,12 @@ out:
}
EXPORT_SYMBOL_GPL(trace_vbprintk);
-int trace_array_printk(struct trace_array *tr,
- unsigned long ip, const char *fmt, ...)
-{
- int ret;
- va_list ap;
-
- if (!(trace_flags & TRACE_ITER_PRINTK))
- return 0;
-
- va_start(ap, fmt);
- ret = trace_array_vprintk(tr, ip, fmt, ap);
- va_end(ap);
- return ret;
-}
-
-int trace_array_vprintk(struct trace_array *tr,
- unsigned long ip, const char *fmt, va_list args)
+static int
+__trace_array_vprintk(struct ring_buffer *buffer,
+ unsigned long ip, const char *fmt, va_list args)
{
struct ftrace_event_call *call = &event_print;
struct ring_buffer_event *event;
- struct ring_buffer *buffer;
int len = 0, size, pc;
struct print_entry *entry;
unsigned long flags;
@@ -1746,7 +1966,6 @@ int trace_array_vprintk(struct trace_array *tr,
local_save_flags(flags);
size = sizeof(*entry) + len + 1;
- buffer = tr->buffer;
event = trace_buffer_lock_reserve(buffer, TRACE_PRINT, size,
flags, pc);
if (!event)
@@ -1767,6 +1986,42 @@ int trace_array_vprintk(struct trace_array *tr,
return len;
}
+int trace_array_vprintk(struct trace_array *tr,
+ unsigned long ip, const char *fmt, va_list args)
+{
+ return __trace_array_vprintk(tr->trace_buffer.buffer, ip, fmt, args);
+}
+
+int trace_array_printk(struct trace_array *tr,
+ unsigned long ip, const char *fmt, ...)
+{
+ int ret;
+ va_list ap;
+
+ if (!(trace_flags & TRACE_ITER_PRINTK))
+ return 0;
+
+ va_start(ap, fmt);
+ ret = trace_array_vprintk(tr, ip, fmt, ap);
+ va_end(ap);
+ return ret;
+}
+
+int trace_array_printk_buf(struct ring_buffer *buffer,
+ unsigned long ip, const char *fmt, ...)
+{
+ int ret;
+ va_list ap;
+
+ if (!(trace_flags & TRACE_ITER_PRINTK))
+ return 0;
+
+ va_start(ap, fmt);
+ ret = __trace_array_vprintk(buffer, ip, fmt, ap);
+ va_end(ap);
+ return ret;
+}
+
int trace_vprintk(unsigned long ip, const char *fmt, va_list args)
{
return trace_array_vprintk(&global_trace, ip, fmt, args);
@@ -1792,7 +2047,7 @@ peek_next_entry(struct trace_iterator *iter, int cpu, u64 *ts,
if (buf_iter)
event = ring_buffer_iter_peek(buf_iter, ts);
else
- event = ring_buffer_peek(iter->tr->buffer, cpu, ts,
+ event = ring_buffer_peek(iter->trace_buffer->buffer, cpu, ts,
lost_events);
if (event) {
@@ -1807,7 +2062,7 @@ static struct trace_entry *
__find_next_entry(struct trace_iterator *iter, int *ent_cpu,
unsigned long *missing_events, u64 *ent_ts)
{
- struct ring_buffer *buffer = iter->tr->buffer;
+ struct ring_buffer *buffer = iter->trace_buffer->buffer;
struct trace_entry *ent, *next = NULL;
unsigned long lost_events = 0, next_lost = 0;
int cpu_file = iter->cpu_file;
@@ -1820,7 +2075,7 @@ __find_next_entry(struct trace_iterator *iter, int *ent_cpu,
* If we are in a per_cpu trace file, don't bother by iterating over
* all cpu and peek directly.
*/
- if (cpu_file > TRACE_PIPE_ALL_CPU) {
+ if (cpu_file > RING_BUFFER_ALL_CPUS) {
if (ring_buffer_empty_cpu(buffer, cpu_file))
return NULL;
ent = peek_next_entry(iter, cpu_file, ent_ts, missing_events);
@@ -1884,7 +2139,7 @@ void *trace_find_next_entry_inc(struct trace_iterator *iter)
static void trace_consume(struct trace_iterator *iter)
{
- ring_buffer_consume(iter->tr->buffer, iter->cpu, &iter->ts,
+ ring_buffer_consume(iter->trace_buffer->buffer, iter->cpu, &iter->ts,
&iter->lost_events);
}
@@ -1917,13 +2172,12 @@ static void *s_next(struct seq_file *m, void *v, loff_t *pos)
void tracing_iter_reset(struct trace_iterator *iter, int cpu)
{
- struct trace_array *tr = iter->tr;
struct ring_buffer_event *event;
struct ring_buffer_iter *buf_iter;
unsigned long entries = 0;
u64 ts;
- tr->data[cpu]->skipped_entries = 0;
+ per_cpu_ptr(iter->trace_buffer->data, cpu)->skipped_entries = 0;
buf_iter = trace_buffer_iter(iter, cpu);
if (!buf_iter)
@@ -1937,13 +2191,13 @@ void tracing_iter_reset(struct trace_iterator *iter, int cpu)
* by the timestamp being before the start of the buffer.
*/
while ((event = ring_buffer_iter_peek(buf_iter, &ts))) {
- if (ts >= iter->tr->time_start)
+ if (ts >= iter->trace_buffer->time_start)
break;
entries++;
ring_buffer_read(buf_iter, NULL);
}
- tr->data[cpu]->skipped_entries = entries;
+ per_cpu_ptr(iter->trace_buffer->data, cpu)->skipped_entries = entries;
}
/*
@@ -1953,6 +2207,7 @@ void tracing_iter_reset(struct trace_iterator *iter, int cpu)
static void *s_start(struct seq_file *m, loff_t *pos)
{
struct trace_iterator *iter = m->private;
+ struct trace_array *tr = iter->tr;
int cpu_file = iter->cpu_file;
void *p = NULL;
loff_t l = 0;
@@ -1965,12 +2220,14 @@ static void *s_start(struct seq_file *m, loff_t *pos)
* will point to the same string as current_trace->name.
*/
mutex_lock(&trace_types_lock);
- if (unlikely(current_trace && iter->trace->name != current_trace->name))
- *iter->trace = *current_trace;
+ if (unlikely(tr->current_trace && iter->trace->name != tr->current_trace->name))
+ *iter->trace = *tr->current_trace;
mutex_unlock(&trace_types_lock);
+#ifdef CONFIG_TRACER_MAX_TRACE
if (iter->snapshot && iter->trace->use_max_tr)
return ERR_PTR(-EBUSY);
+#endif
if (!iter->snapshot)
atomic_inc(&trace_record_cmdline_disabled);
@@ -1980,7 +2237,7 @@ static void *s_start(struct seq_file *m, loff_t *pos)
iter->cpu = 0;
iter->idx = -1;
- if (cpu_file == TRACE_PIPE_ALL_CPU) {
+ if (cpu_file == RING_BUFFER_ALL_CPUS) {
for_each_tracing_cpu(cpu)
tracing_iter_reset(iter, cpu);
} else
@@ -2012,17 +2269,21 @@ static void s_stop(struct seq_file *m, void *p)
{
struct trace_iterator *iter = m->private;
+#ifdef CONFIG_TRACER_MAX_TRACE
if (iter->snapshot && iter->trace->use_max_tr)
return;
+#endif
if (!iter->snapshot)
atomic_dec(&trace_record_cmdline_disabled);
+
trace_access_unlock(iter->cpu_file);
trace_event_read_unlock();
}
static void
-get_total_entries(struct trace_array *tr, unsigned long *total, unsigned long *entries)
+get_total_entries(struct trace_buffer *buf,
+ unsigned long *total, unsigned long *entries)
{
unsigned long count;
int cpu;
@@ -2031,19 +2292,19 @@ get_total_entries(struct trace_array *tr, unsigned long *total, unsigned long *e
*entries = 0;
for_each_tracing_cpu(cpu) {
- count = ring_buffer_entries_cpu(tr->buffer, cpu);
+ count = ring_buffer_entries_cpu(buf->buffer, cpu);
/*
* If this buffer has skipped entries, then we hold all
* entries for the trace and we need to ignore the
* ones before the time stamp.
*/
- if (tr->data[cpu]->skipped_entries) {
- count -= tr->data[cpu]->skipped_entries;
+ if (per_cpu_ptr(buf->data, cpu)->skipped_entries) {
+ count -= per_cpu_ptr(buf->data, cpu)->skipped_entries;
/* total is the same as the entries */
*total += count;
} else
*total += count +
- ring_buffer_overrun_cpu(tr->buffer, cpu);
+ ring_buffer_overrun_cpu(buf->buffer, cpu);
*entries += count;
}
}
@@ -2060,27 +2321,27 @@ static void print_lat_help_header(struct seq_file *m)
seq_puts(m, "# \\ / ||||| \\ | / \n");
}
-static void print_event_info(struct trace_array *tr, struct seq_file *m)
+static void print_event_info(struct trace_buffer *buf, struct seq_file *m)
{
unsigned long total;
unsigned long entries;
- get_total_entries(tr, &total, &entries);
+ get_total_entries(buf, &total, &entries);
seq_printf(m, "# entries-in-buffer/entries-written: %lu/%lu #P:%d\n",
entries, total, num_online_cpus());
seq_puts(m, "#\n");
}
-static void print_func_help_header(struct trace_array *tr, struct seq_file *m)
+static void print_func_help_header(struct trace_buffer *buf, struct seq_file *m)
{
- print_event_info(tr, m);
+ print_event_info(buf, m);
seq_puts(m, "# TASK-PID CPU# TIMESTAMP FUNCTION\n");
seq_puts(m, "# | | | | |\n");
}
-static void print_func_help_header_irq(struct trace_array *tr, struct seq_file *m)
+static void print_func_help_header_irq(struct trace_buffer *buf, struct seq_file *m)
{
- print_event_info(tr, m);
+ print_event_info(buf, m);
seq_puts(m, "# _-----=> irqs-off\n");
seq_puts(m, "# / _----=> need-resched\n");
seq_puts(m, "# | / _---=> hardirq/softirq\n");
@@ -2094,16 +2355,16 @@ void
print_trace_header(struct seq_file *m, struct trace_iterator *iter)
{
unsigned long sym_flags = (trace_flags & TRACE_ITER_SYM_MASK);
- struct trace_array *tr = iter->tr;
- struct trace_array_cpu *data = tr->data[tr->cpu];
- struct tracer *type = current_trace;
+ struct trace_buffer *buf = iter->trace_buffer;
+ struct trace_array_cpu *data = per_cpu_ptr(buf->data, buf->cpu);
+ struct tracer *type = iter->trace;
unsigned long entries;
unsigned long total;
const char *name = "preemption";
name = type->name;
- get_total_entries(tr, &total, &entries);
+ get_total_entries(buf, &total, &entries);
seq_printf(m, "# %s latency trace v1.1.5 on %s\n",
name, UTS_RELEASE);
@@ -2114,7 +2375,7 @@ print_trace_header(struct seq_file *m, struct trace_iterator *iter)
nsecs_to_usecs(data->saved_latency),
entries,
total,
- tr->cpu,
+ buf->cpu,
#if defined(CONFIG_PREEMPT_NONE)
"server",
#elif defined(CONFIG_PREEMPT_VOLUNTARY)
@@ -2165,7 +2426,7 @@ static void test_cpu_buff_start(struct trace_iterator *iter)
if (cpumask_test_cpu(iter->cpu, iter->started))
return;
- if (iter->tr->data[iter->cpu]->skipped_entries)
+ if (per_cpu_ptr(iter->trace_buffer->data, iter->cpu)->skipped_entries)
return;
cpumask_set_cpu(iter->cpu, iter->started);
@@ -2288,14 +2549,14 @@ int trace_empty(struct trace_iterator *iter)
int cpu;
/* If we are looking at one CPU buffer, only check that one */
- if (iter->cpu_file != TRACE_PIPE_ALL_CPU) {
+ if (iter->cpu_file != RING_BUFFER_ALL_CPUS) {
cpu = iter->cpu_file;
buf_iter = trace_buffer_iter(iter, cpu);
if (buf_iter) {
if (!ring_buffer_iter_empty(buf_iter))
return 0;
} else {
- if (!ring_buffer_empty_cpu(iter->tr->buffer, cpu))
+ if (!ring_buffer_empty_cpu(iter->trace_buffer->buffer, cpu))
return 0;
}
return 1;
@@ -2307,7 +2568,7 @@ int trace_empty(struct trace_iterator *iter)
if (!ring_buffer_iter_empty(buf_iter))
return 0;
} else {
- if (!ring_buffer_empty_cpu(iter->tr->buffer, cpu))
+ if (!ring_buffer_empty_cpu(iter->trace_buffer->buffer, cpu))
return 0;
}
}
@@ -2331,6 +2592,11 @@ enum print_line_t print_trace_line(struct trace_iterator *iter)
return ret;
}
+ if (iter->ent->type == TRACE_BPUTS &&
+ trace_flags & TRACE_ITER_PRINTK &&
+ trace_flags & TRACE_ITER_PRINTK_MSGONLY)
+ return trace_print_bputs_msg_only(iter);
+
if (iter->ent->type == TRACE_BPRINT &&
trace_flags & TRACE_ITER_PRINTK &&
trace_flags & TRACE_ITER_PRINTK_MSGONLY)
@@ -2385,9 +2651,9 @@ void trace_default_header(struct seq_file *m)
} else {
if (!(trace_flags & TRACE_ITER_VERBOSE)) {
if (trace_flags & TRACE_ITER_IRQ_INFO)
- print_func_help_header_irq(iter->tr, m);
+ print_func_help_header_irq(iter->trace_buffer, m);
else
- print_func_help_header(iter->tr, m);
+ print_func_help_header(iter->trace_buffer, m);
}
}
}
@@ -2400,6 +2666,50 @@ static void test_ftrace_alive(struct seq_file *m)
seq_printf(m, "# MAY BE MISSING FUNCTION EVENTS\n");
}
+#ifdef CONFIG_TRACER_MAX_TRACE
+static void show_snapshot_main_help(struct seq_file *m)
+{
+ seq_printf(m, "# echo 0 > snapshot : Clears and frees snapshot buffer\n");
+ seq_printf(m, "# echo 1 > snapshot : Allocates snapshot buffer, if not already allocated.\n");
+ seq_printf(m, "# Takes a snapshot of the main buffer.\n");
+ seq_printf(m, "# echo 2 > snapshot : Clears snapshot buffer (but does not allocate)\n");
+ seq_printf(m, "# (Doesn't have to be '2' works with any number that\n");
+ seq_printf(m, "# is not a '0' or '1')\n");
+}
+
+static void show_snapshot_percpu_help(struct seq_file *m)
+{
+ seq_printf(m, "# echo 0 > snapshot : Invalid for per_cpu snapshot file.\n");
+#ifdef CONFIG_RING_BUFFER_ALLOW_SWAP
+ seq_printf(m, "# echo 1 > snapshot : Allocates snapshot buffer, if not already allocated.\n");
+ seq_printf(m, "# Takes a snapshot of the main buffer for this cpu.\n");
+#else
+ seq_printf(m, "# echo 1 > snapshot : Not supported with this kernel.\n");
+ seq_printf(m, "# Must use main snapshot file to allocate.\n");
+#endif
+ seq_printf(m, "# echo 2 > snapshot : Clears this cpu's snapshot buffer (but does not allocate)\n");
+ seq_printf(m, "# (Doesn't have to be '2' works with any number that\n");
+ seq_printf(m, "# is not a '0' or '1')\n");
+}
+
+static void print_snapshot_help(struct seq_file *m, struct trace_iterator *iter)
+{
+ if (iter->tr->allocated_snapshot)
+ seq_printf(m, "#\n# * Snapshot is allocated *\n#\n");
+ else
+ seq_printf(m, "#\n# * Snapshot is freed *\n#\n");
+
+ seq_printf(m, "# Snapshot commands:\n");
+ if (iter->cpu_file == RING_BUFFER_ALL_CPUS)
+ show_snapshot_main_help(m);
+ else
+ show_snapshot_percpu_help(m);
+}
+#else
+/* Should never be called */
+static inline void print_snapshot_help(struct seq_file *m, struct trace_iterator *iter) { }
+#endif
+
static int s_show(struct seq_file *m, void *v)
{
struct trace_iterator *iter = v;
@@ -2411,7 +2721,9 @@ static int s_show(struct seq_file *m, void *v)
seq_puts(m, "#\n");
test_ftrace_alive(m);
}
- if (iter->trace && iter->trace->print_header)
+ if (iter->snapshot && trace_empty(iter))
+ print_snapshot_help(m, iter);
+ else if (iter->trace && iter->trace->print_header)
iter->trace->print_header(m);
else
trace_default_header(m);
@@ -2452,7 +2764,8 @@ static const struct seq_operations tracer_seq_ops = {
static struct trace_iterator *
__tracing_open(struct inode *inode, struct file *file, bool snapshot)
{
- long cpu_file = (long) inode->i_private;
+ struct trace_cpu *tc = inode->i_private;
+ struct trace_array *tr = tc->tr;
struct trace_iterator *iter;
int cpu;
@@ -2477,26 +2790,31 @@ __tracing_open(struct inode *inode, struct file *file, bool snapshot)
if (!iter->trace)
goto fail;
- *iter->trace = *current_trace;
+ *iter->trace = *tr->current_trace;
if (!zalloc_cpumask_var(&iter->started, GFP_KERNEL))
goto fail;
- if (current_trace->print_max || snapshot)
- iter->tr = &max_tr;
+ iter->tr = tr;
+
+#ifdef CONFIG_TRACER_MAX_TRACE
+ /* Currently only the top directory has a snapshot */
+ if (tr->current_trace->print_max || snapshot)
+ iter->trace_buffer = &tr->max_buffer;
else
- iter->tr = &global_trace;
+#endif
+ iter->trace_buffer = &tr->trace_buffer;
iter->snapshot = snapshot;
iter->pos = -1;
mutex_init(&iter->mutex);
- iter->cpu_file = cpu_file;
+ iter->cpu_file = tc->cpu;
/* Notify the tracer early; before we stop tracing. */
if (iter->trace && iter->trace->open)
iter->trace->open(iter);
/* Annotate start of buffers if we had overruns */
- if (ring_buffer_overruns(iter->tr->buffer))
+ if (ring_buffer_overruns(iter->trace_buffer->buffer))
iter->iter_flags |= TRACE_FILE_ANNOTATE;
/* Output in nanoseconds only if we are using a clock in nanoseconds. */
@@ -2505,12 +2823,12 @@ __tracing_open(struct inode *inode, struct file *file, bool snapshot)
/* stop the trace while dumping if we are not opening "snapshot" */
if (!iter->snapshot)
- tracing_stop();
+ tracing_stop_tr(tr);
- if (iter->cpu_file == TRACE_PIPE_ALL_CPU) {
+ if (iter->cpu_file == RING_BUFFER_ALL_CPUS) {
for_each_tracing_cpu(cpu) {
iter->buffer_iter[cpu] =
- ring_buffer_read_prepare(iter->tr->buffer, cpu);
+ ring_buffer_read_prepare(iter->trace_buffer->buffer, cpu);
}
ring_buffer_read_prepare_sync();
for_each_tracing_cpu(cpu) {
@@ -2520,12 +2838,14 @@ __tracing_open(struct inode *inode, struct file *file, bool snapshot)
} else {
cpu = iter->cpu_file;
iter->buffer_iter[cpu] =
- ring_buffer_read_prepare(iter->tr->buffer, cpu);
+ ring_buffer_read_prepare(iter->trace_buffer->buffer, cpu);
ring_buffer_read_prepare_sync();
ring_buffer_read_start(iter->buffer_iter[cpu]);
tracing_iter_reset(iter, cpu);
}
+ tr->ref++;
+
mutex_unlock(&trace_types_lock);
return iter;
@@ -2552,14 +2872,20 @@ static int tracing_release(struct inode *inode, struct file *file)
{
struct seq_file *m = file->private_data;
struct trace_iterator *iter;
+ struct trace_array *tr;
int cpu;
if (!(file->f_mode & FMODE_READ))
return 0;
iter = m->private;
+ tr = iter->tr;
mutex_lock(&trace_types_lock);
+
+ WARN_ON(!tr->ref);
+ tr->ref--;
+
for_each_tracing_cpu(cpu) {
if (iter->buffer_iter[cpu])
ring_buffer_read_finish(iter->buffer_iter[cpu]);
@@ -2570,7 +2896,7 @@ static int tracing_release(struct inode *inode, struct file *file)
if (!iter->snapshot)
/* reenable tracing if it was previously enabled */
- tracing_start();
+ tracing_start_tr(tr);
mutex_unlock(&trace_types_lock);
mutex_destroy(&iter->mutex);
@@ -2589,12 +2915,13 @@ static int tracing_open(struct inode *inode, struct file *file)
/* If this file was open for write, then erase contents */
if ((file->f_mode & FMODE_WRITE) &&
(file->f_flags & O_TRUNC)) {
- long cpu = (long) inode->i_private;
+ struct trace_cpu *tc = inode->i_private;
+ struct trace_array *tr = tc->tr;
- if (cpu == TRACE_PIPE_ALL_CPU)
- tracing_reset_online_cpus(&global_trace);
+ if (tc->cpu == RING_BUFFER_ALL_CPUS)
+ tracing_reset_online_cpus(&tr->trace_buffer);
else
- tracing_reset(&global_trace, cpu);
+ tracing_reset(&tr->trace_buffer, tc->cpu);
}
if (file->f_mode & FMODE_READ) {
@@ -2741,8 +3068,9 @@ static ssize_t
tracing_cpumask_write(struct file *filp, const char __user *ubuf,
size_t count, loff_t *ppos)
{
- int err, cpu;
+ struct trace_array *tr = filp->private_data;
cpumask_var_t tracing_cpumask_new;
+ int err, cpu;
if (!alloc_cpumask_var(&tracing_cpumask_new, GFP_KERNEL))
return -ENOMEM;
@@ -2762,13 +3090,13 @@ tracing_cpumask_write(struct file *filp, const char __user *ubuf,
*/
if (cpumask_test_cpu(cpu, tracing_cpumask) &&
!cpumask_test_cpu(cpu, tracing_cpumask_new)) {
- atomic_inc(&global_trace.data[cpu]->disabled);
- ring_buffer_record_disable_cpu(global_trace.buffer, cpu);
+ atomic_inc(&per_cpu_ptr(tr->trace_buffer.data, cpu)->disabled);
+ ring_buffer_record_disable_cpu(tr->trace_buffer.buffer, cpu);
}
if (!cpumask_test_cpu(cpu, tracing_cpumask) &&
cpumask_test_cpu(cpu, tracing_cpumask_new)) {
- atomic_dec(&global_trace.data[cpu]->disabled);
- ring_buffer_record_enable_cpu(global_trace.buffer, cpu);
+ atomic_dec(&per_cpu_ptr(tr->trace_buffer.data, cpu)->disabled);
+ ring_buffer_record_enable_cpu(tr->trace_buffer.buffer, cpu);
}
}
arch_spin_unlock(&ftrace_max_lock);
@@ -2797,12 +3125,13 @@ static const struct file_operations tracing_cpumask_fops = {
static int tracing_trace_options_show(struct seq_file *m, void *v)
{
struct tracer_opt *trace_opts;
+ struct trace_array *tr = m->private;
u32 tracer_flags;
int i;
mutex_lock(&trace_types_lock);
- tracer_flags = current_trace->flags->val;
- trace_opts = current_trace->flags->opts;
+ tracer_flags = tr->current_trace->flags->val;
+ trace_opts = tr->current_trace->flags->opts;
for (i = 0; trace_options[i]; i++) {
if (trace_flags & (1 << i))
@@ -2857,11 +3186,25 @@ static int set_tracer_option(struct tracer *trace, char *cmp, int neg)
return -EINVAL;
}
-static void set_tracer_flags(unsigned int mask, int enabled)
+/* Some tracers require overwrite to stay enabled */
+int trace_keep_overwrite(struct tracer *tracer, u32 mask, int set)
+{
+ if (tracer->enabled && (mask & TRACE_ITER_OVERWRITE) && !set)
+ return -1;
+
+ return 0;
+}
+
+int set_tracer_flag(struct trace_array *tr, unsigned int mask, int enabled)
{
/* do nothing if flag is already set */
if (!!(trace_flags & mask) == !!enabled)
- return;
+ return 0;
+
+ /* Give the tracer a chance to approve the change */
+ if (tr->current_trace->flag_changed)
+ if (tr->current_trace->flag_changed(tr->current_trace, mask, !!enabled))
+ return -EINVAL;
if (enabled)
trace_flags |= mask;
@@ -2871,18 +3214,24 @@ static void set_tracer_flags(unsigned int mask, int enabled)
if (mask == TRACE_ITER_RECORD_CMD)
trace_event_enable_cmd_record(enabled);
- if (mask == TRACE_ITER_OVERWRITE)
- ring_buffer_change_overwrite(global_trace.buffer, enabled);
+ if (mask == TRACE_ITER_OVERWRITE) {
+ ring_buffer_change_overwrite(tr->trace_buffer.buffer, enabled);
+#ifdef CONFIG_TRACER_MAX_TRACE
+ ring_buffer_change_overwrite(tr->max_buffer.buffer, enabled);
+#endif
+ }
if (mask == TRACE_ITER_PRINTK)
trace_printk_start_stop_comm(enabled);
+
+ return 0;
}
-static int trace_set_options(char *option)
+static int trace_set_options(struct trace_array *tr, char *option)
{
char *cmp;
int neg = 0;
- int ret = 0;
+ int ret = -ENODEV;
int i;
cmp = strstrip(option);
@@ -2892,19 +3241,20 @@ static int trace_set_options(char *option)
cmp += 2;
}
+ mutex_lock(&trace_types_lock);
+
for (i = 0; trace_options[i]; i++) {
if (strcmp(cmp, trace_options[i]) == 0) {
- set_tracer_flags(1 << i, !neg);
+ ret = set_tracer_flag(tr, 1 << i, !neg);
break;
}
}
/* If no option could be set, test the specific tracer options */
- if (!trace_options[i]) {
- mutex_lock(&trace_types_lock);
- ret = set_tracer_option(current_trace, cmp, neg);
- mutex_unlock(&trace_types_lock);
- }
+ if (!trace_options[i])
+ ret = set_tracer_option(tr->current_trace, cmp, neg);
+
+ mutex_unlock(&trace_types_lock);
return ret;
}
@@ -2913,7 +3263,10 @@ static ssize_t
tracing_trace_options_write(struct file *filp, const char __user *ubuf,
size_t cnt, loff_t *ppos)
{
+ struct seq_file *m = filp->private_data;
+ struct trace_array *tr = m->private;
char buf[64];
+ int ret;
if (cnt >= sizeof(buf))
return -EINVAL;
@@ -2923,7 +3276,9 @@ tracing_trace_options_write(struct file *filp, const char __user *ubuf,
buf[cnt] = 0;
- trace_set_options(buf);
+ ret = trace_set_options(tr, buf);
+ if (ret < 0)
+ return ret;
*ppos += cnt;
@@ -2934,7 +3289,8 @@ static int tracing_trace_options_open(struct inode *inode, struct file *file)
{
if (tracing_disabled)
return -ENODEV;
- return single_open(file, tracing_trace_options_show, NULL);
+
+ return single_open(file, tracing_trace_options_show, inode->i_private);
}
static const struct file_operations tracing_iter_fops = {
@@ -2947,20 +3303,84 @@ static const struct file_operations tracing_iter_fops = {
static const char readme_msg[] =
"tracing mini-HOWTO:\n\n"
- "# mount -t debugfs nodev /sys/kernel/debug\n\n"
- "# cat /sys/kernel/debug/tracing/available_tracers\n"
- "wakeup wakeup_rt preemptirqsoff preemptoff irqsoff function nop\n\n"
- "# cat /sys/kernel/debug/tracing/current_tracer\n"
- "nop\n"
- "# echo wakeup > /sys/kernel/debug/tracing/current_tracer\n"
- "# cat /sys/kernel/debug/tracing/current_tracer\n"
- "wakeup\n"
- "# cat /sys/kernel/debug/tracing/trace_options\n"
- "noprint-parent nosym-offset nosym-addr noverbose\n"
- "# echo print-parent > /sys/kernel/debug/tracing/trace_options\n"
- "# echo 1 > /sys/kernel/debug/tracing/tracing_on\n"
- "# cat /sys/kernel/debug/tracing/trace > /tmp/trace.txt\n"
- "# echo 0 > /sys/kernel/debug/tracing/tracing_on\n"
+ "# echo 0 > tracing_on : quick way to disable tracing\n"
+ "# echo 1 > tracing_on : quick way to re-enable tracing\n\n"
+ " Important files:\n"
+ " trace\t\t\t- The static contents of the buffer\n"
+ "\t\t\t To clear the buffer write into this file: echo > trace\n"
+ " trace_pipe\t\t- A consuming read to see the contents of the buffer\n"
+ " current_tracer\t- function and latency tracers\n"
+ " available_tracers\t- list of configured tracers for current_tracer\n"
+ " buffer_size_kb\t- view and modify size of per cpu buffer\n"
+ " buffer_total_size_kb - view total size of all cpu buffers\n\n"
+ " trace_clock\t\t-change the clock used to order events\n"
+ " local: Per cpu clock but may not be synced across CPUs\n"
+ " global: Synced across CPUs but slows tracing down.\n"
+ " counter: Not a clock, but just an increment\n"
+ " uptime: Jiffy counter from time of boot\n"
+ " perf: Same clock that perf events use\n"
+#ifdef CONFIG_X86_64
+ " x86-tsc: TSC cycle counter\n"
+#endif
+ "\n trace_marker\t\t- Writes into this file writes into the kernel buffer\n"
+ " tracing_cpumask\t- Limit which CPUs to trace\n"
+ " instances\t\t- Make sub-buffers with: mkdir instances/foo\n"
+ "\t\t\t Remove sub-buffer with rmdir\n"
+ " trace_options\t\t- Set format or modify how tracing happens\n"
+ "\t\t\t Disable an option by adding a suffix 'no' to the option name\n"
+#ifdef CONFIG_DYNAMIC_FTRACE
+ "\n available_filter_functions - list of functions that can be filtered on\n"
+ " set_ftrace_filter\t- echo function name in here to only trace these functions\n"
+ " accepts: func_full_name, *func_end, func_begin*, *func_middle*\n"
+ " modules: Can select a group via module\n"
+ " Format: :mod:<module-name>\n"
+ " example: echo :mod:ext3 > set_ftrace_filter\n"
+ " triggers: a command to perform when function is hit\n"
+ " Format: <function>:<trigger>[:count]\n"
+ " trigger: traceon, traceoff\n"
+ " enable_event:<system>:<event>\n"
+ " disable_event:<system>:<event>\n"
+#ifdef CONFIG_STACKTRACE
+ " stacktrace\n"
+#endif
+#ifdef CONFIG_TRACER_SNAPSHOT
+ " snapshot\n"
+#endif
+ " example: echo do_fault:traceoff > set_ftrace_filter\n"
+ " echo do_trap:traceoff:3 > set_ftrace_filter\n"
+ " The first one will disable tracing every time do_fault is hit\n"
+ " The second will disable tracing at most 3 times when do_trap is hit\n"
+ " The first time do trap is hit and it disables tracing, the counter\n"
+ " will decrement to 2. If tracing is already disabled, the counter\n"
+ " will not decrement. It only decrements when the trigger did work\n"
+ " To remove trigger without count:\n"
+ " echo '!<function>:<trigger> > set_ftrace_filter\n"
+ " To remove trigger with a count:\n"
+ " echo '!<function>:<trigger>:0 > set_ftrace_filter\n"
+ " set_ftrace_notrace\t- echo function name in here to never trace.\n"
+ " accepts: func_full_name, *func_end, func_begin*, *func_middle*\n"
+ " modules: Can select a group via module command :mod:\n"
+ " Does not accept triggers\n"
+#endif /* CONFIG_DYNAMIC_FTRACE */
+#ifdef CONFIG_FUNCTION_TRACER
+ " set_ftrace_pid\t- Write pid(s) to only function trace those pids (function)\n"
+#endif
+#ifdef CONFIG_FUNCTION_GRAPH_TRACER
+ " set_graph_function\t- Trace the nested calls of a function (function_graph)\n"
+ " max_graph_depth\t- Trace a limited depth of nested calls (0 is unlimited)\n"
+#endif
+#ifdef CONFIG_TRACER_SNAPSHOT
+ "\n snapshot\t\t- Like 'trace' but shows the content of the static snapshot buffer\n"
+ "\t\t\t Read the contents for more information\n"
+#endif
+#ifdef CONFIG_STACKTRACE
+ " stack_trace\t\t- Shows the max stack trace when active\n"
+ " stack_max_size\t- Shows current max stack size that was traced\n"
+ "\t\t\t Write into this file to reset the max size (trigger a new trace)\n"
+#ifdef CONFIG_DYNAMIC_FTRACE
+ " stack_trace_filter\t- Like set_ftrace_filter but limits what stack_trace traces\n"
+#endif
+#endif /* CONFIG_STACKTRACE */
;
static ssize_t
@@ -3032,11 +3452,12 @@ static ssize_t
tracing_set_trace_read(struct file *filp, char __user *ubuf,
size_t cnt, loff_t *ppos)
{
+ struct trace_array *tr = filp->private_data;
char buf[MAX_TRACER_SIZE+2];
int r;
mutex_lock(&trace_types_lock);
- r = sprintf(buf, "%s\n", current_trace->name);
+ r = sprintf(buf, "%s\n", tr->current_trace->name);
mutex_unlock(&trace_types_lock);
return simple_read_from_buffer(ubuf, cnt, ppos, buf, r);
@@ -3044,43 +3465,48 @@ tracing_set_trace_read(struct file *filp, char __user *ubuf,
int tracer_init(struct tracer *t, struct trace_array *tr)
{
- tracing_reset_online_cpus(tr);
+ tracing_reset_online_cpus(&tr->trace_buffer);
return t->init(tr);
}
-static void set_buffer_entries(struct trace_array *tr, unsigned long val)
+static void set_buffer_entries(struct trace_buffer *buf, unsigned long val)
{
int cpu;
+
for_each_tracing_cpu(cpu)
- tr->data[cpu]->entries = val;
+ per_cpu_ptr(buf->data, cpu)->entries = val;
}
+#ifdef CONFIG_TRACER_MAX_TRACE
/* resize @tr's buffer to the size of @size_tr's entries */
-static int resize_buffer_duplicate_size(struct trace_array *tr,
- struct trace_array *size_tr, int cpu_id)
+static int resize_buffer_duplicate_size(struct trace_buffer *trace_buf,
+ struct trace_buffer *size_buf, int cpu_id)
{
int cpu, ret = 0;
if (cpu_id == RING_BUFFER_ALL_CPUS) {
for_each_tracing_cpu(cpu) {
- ret = ring_buffer_resize(tr->buffer,
- size_tr->data[cpu]->entries, cpu);
+ ret = ring_buffer_resize(trace_buf->buffer,
+ per_cpu_ptr(size_buf->data, cpu)->entries, cpu);
if (ret < 0)
break;
- tr->data[cpu]->entries = size_tr->data[cpu]->entries;
+ per_cpu_ptr(trace_buf->data, cpu)->entries =
+ per_cpu_ptr(size_buf->data, cpu)->entries;
}
} else {
- ret = ring_buffer_resize(tr->buffer,
- size_tr->data[cpu_id]->entries, cpu_id);
+ ret = ring_buffer_resize(trace_buf->buffer,
+ per_cpu_ptr(size_buf->data, cpu_id)->entries, cpu_id);
if (ret == 0)
- tr->data[cpu_id]->entries =
- size_tr->data[cpu_id]->entries;
+ per_cpu_ptr(trace_buf->data, cpu_id)->entries =
+ per_cpu_ptr(size_buf->data, cpu_id)->entries;
}
return ret;
}
+#endif /* CONFIG_TRACER_MAX_TRACE */
-static int __tracing_resize_ring_buffer(unsigned long size, int cpu)
+static int __tracing_resize_ring_buffer(struct trace_array *tr,
+ unsigned long size, int cpu)
{
int ret;
@@ -3089,23 +3515,25 @@ static int __tracing_resize_ring_buffer(unsigned long size, int cpu)
* we use the size that was given, and we can forget about
* expanding it later.
*/
- ring_buffer_expanded = 1;
+ ring_buffer_expanded = true;
/* May be called before buffers are initialized */
- if (!global_trace.buffer)
+ if (!tr->trace_buffer.buffer)
return 0;
- ret = ring_buffer_resize(global_trace.buffer, size, cpu);
+ ret = ring_buffer_resize(tr->trace_buffer.buffer, size, cpu);
if (ret < 0)
return ret;
- if (!current_trace->use_max_tr)
+#ifdef CONFIG_TRACER_MAX_TRACE
+ if (!(tr->flags & TRACE_ARRAY_FL_GLOBAL) ||
+ !tr->current_trace->use_max_tr)
goto out;
- ret = ring_buffer_resize(max_tr.buffer, size, cpu);
+ ret = ring_buffer_resize(tr->max_buffer.buffer, size, cpu);
if (ret < 0) {
- int r = resize_buffer_duplicate_size(&global_trace,
- &global_trace, cpu);
+ int r = resize_buffer_duplicate_size(&tr->trace_buffer,
+ &tr->trace_buffer, cpu);
if (r < 0) {
/*
* AARGH! We are left with different
@@ -3128,20 +3556,23 @@ static int __tracing_resize_ring_buffer(unsigned long size, int cpu)
}
if (cpu == RING_BUFFER_ALL_CPUS)
- set_buffer_entries(&max_tr, size);
+ set_buffer_entries(&tr->max_buffer, size);
else
- max_tr.data[cpu]->entries = size;
+ per_cpu_ptr(tr->max_buffer.data, cpu)->entries = size;
out:
+#endif /* CONFIG_TRACER_MAX_TRACE */
+
if (cpu == RING_BUFFER_ALL_CPUS)
- set_buffer_entries(&global_trace, size);
+ set_buffer_entries(&tr->trace_buffer, size);
else
- global_trace.data[cpu]->entries = size;
+ per_cpu_ptr(tr->trace_buffer.data, cpu)->entries = size;
return ret;
}
-static ssize_t tracing_resize_ring_buffer(unsigned long size, int cpu_id)
+static ssize_t tracing_resize_ring_buffer(struct trace_array *tr,
+ unsigned long size, int cpu_id)
{
int ret = size;
@@ -3155,7 +3586,7 @@ static ssize_t tracing_resize_ring_buffer(unsigned long size, int cpu_id)
}
}
- ret = __tracing_resize_ring_buffer(size, cpu_id);
+ ret = __tracing_resize_ring_buffer(tr, size, cpu_id);
if (ret < 0)
ret = -ENOMEM;
@@ -3182,7 +3613,7 @@ int tracing_update_buffers(void)
mutex_lock(&trace_types_lock);
if (!ring_buffer_expanded)
- ret = __tracing_resize_ring_buffer(trace_buf_size,
+ ret = __tracing_resize_ring_buffer(&global_trace, trace_buf_size,
RING_BUFFER_ALL_CPUS);
mutex_unlock(&trace_types_lock);
@@ -3192,7 +3623,7 @@ int tracing_update_buffers(void)
struct trace_option_dentry;
static struct trace_option_dentry *
-create_trace_option_files(struct tracer *tracer);
+create_trace_option_files(struct trace_array *tr, struct tracer *tracer);
static void
destroy_trace_option_files(struct trace_option_dentry *topts);
@@ -3202,13 +3633,15 @@ static int tracing_set_tracer(const char *buf)
static struct trace_option_dentry *topts;
struct trace_array *tr = &global_trace;
struct tracer *t;
+#ifdef CONFIG_TRACER_MAX_TRACE
bool had_max_tr;
+#endif
int ret = 0;
mutex_lock(&trace_types_lock);
if (!ring_buffer_expanded) {
- ret = __tracing_resize_ring_buffer(trace_buf_size,
+ ret = __tracing_resize_ring_buffer(tr, trace_buf_size,
RING_BUFFER_ALL_CPUS);
if (ret < 0)
goto out;
@@ -3223,15 +3656,21 @@ static int tracing_set_tracer(const char *buf)
ret = -EINVAL;
goto out;
}
- if (t == current_trace)
+ if (t == tr->current_trace)
goto out;
trace_branch_disable();
- if (current_trace->reset)
- current_trace->reset(tr);
- had_max_tr = current_trace->allocated_snapshot;
- current_trace = &nop_trace;
+ tr->current_trace->enabled = false;
+
+ if (tr->current_trace->reset)
+ tr->current_trace->reset(tr);
+
+ /* Current trace needs to be nop_trace before synchronize_sched */
+ tr->current_trace = &nop_trace;
+
+#ifdef CONFIG_TRACER_MAX_TRACE
+ had_max_tr = tr->allocated_snapshot;
if (had_max_tr && !t->use_max_tr) {
/*
@@ -3242,27 +3681,20 @@ static int tracing_set_tracer(const char *buf)
* so a synchronized_sched() is sufficient.
*/
synchronize_sched();
- /*
- * We don't free the ring buffer. instead, resize it because
- * The max_tr ring buffer has some state (e.g. ring->clock) and
- * we want preserve it.
- */
- ring_buffer_resize(max_tr.buffer, 1, RING_BUFFER_ALL_CPUS);
- set_buffer_entries(&max_tr, 1);
- tracing_reset_online_cpus(&max_tr);
- current_trace->allocated_snapshot = false;
+ free_snapshot(tr);
}
+#endif
destroy_trace_option_files(topts);
- topts = create_trace_option_files(t);
+ topts = create_trace_option_files(tr, t);
+
+#ifdef CONFIG_TRACER_MAX_TRACE
if (t->use_max_tr && !had_max_tr) {
- /* we need to make per cpu buffer sizes equivalent */
- ret = resize_buffer_duplicate_size(&max_tr, &global_trace,
- RING_BUFFER_ALL_CPUS);
+ ret = alloc_snapshot(tr);
if (ret < 0)
goto out;
- t->allocated_snapshot = true;
}
+#endif
if (t->init) {
ret = tracer_init(t, tr);
@@ -3270,7 +3702,8 @@ static int tracing_set_tracer(const char *buf)
goto out;
}
- current_trace = t;
+ tr->current_trace = t;
+ tr->current_trace->enabled = true;
trace_branch_enable(tr);
out:
mutex_unlock(&trace_types_lock);
@@ -3344,7 +3777,8 @@ tracing_max_lat_write(struct file *filp, const char __user *ubuf,
static int tracing_open_pipe(struct inode *inode, struct file *filp)
{
- long cpu_file = (long) inode->i_private;
+ struct trace_cpu *tc = inode->i_private;
+ struct trace_array *tr = tc->tr;
struct trace_iterator *iter;
int ret = 0;
@@ -3369,7 +3803,7 @@ static int tracing_open_pipe(struct inode *inode, struct file *filp)
ret = -ENOMEM;
goto fail;
}
- *iter->trace = *current_trace;
+ *iter->trace = *tr->current_trace;
if (!alloc_cpumask_var(&iter->started, GFP_KERNEL)) {
ret = -ENOMEM;
@@ -3386,8 +3820,9 @@ static int tracing_open_pipe(struct inode *inode, struct file *filp)
if (trace_clocks[trace_clock_id].in_ns)
iter->iter_flags |= TRACE_FILE_TIME_IN_NS;
- iter->cpu_file = cpu_file;
- iter->tr = &global_trace;
+ iter->cpu_file = tc->cpu;
+ iter->tr = tc->tr;
+ iter->trace_buffer = &tc->tr->trace_buffer;
mutex_init(&iter->mutex);
filp->private_data = iter;
@@ -3426,24 +3861,28 @@ static int tracing_release_pipe(struct inode *inode, struct file *file)
}
static unsigned int
-tracing_poll_pipe(struct file *filp, poll_table *poll_table)
+trace_poll(struct trace_iterator *iter, struct file *filp, poll_table *poll_table)
{
- struct trace_iterator *iter = filp->private_data;
+ /* Iterators are static, they should be filled or empty */
+ if (trace_buffer_iter(iter, iter->cpu_file))
+ return POLLIN | POLLRDNORM;
- if (trace_flags & TRACE_ITER_BLOCK) {
+ if (trace_flags & TRACE_ITER_BLOCK)
/*
* Always select as readable when in blocking mode
*/
return POLLIN | POLLRDNORM;
- } else {
- if (!trace_empty(iter))
- return POLLIN | POLLRDNORM;
- poll_wait(filp, &trace_wait, poll_table);
- if (!trace_empty(iter))
- return POLLIN | POLLRDNORM;
+ else
+ return ring_buffer_poll_wait(iter->trace_buffer->buffer, iter->cpu_file,
+ filp, poll_table);
+}
- return 0;
- }
+static unsigned int
+tracing_poll_pipe(struct file *filp, poll_table *poll_table)
+{
+ struct trace_iterator *iter = filp->private_data;
+
+ return trace_poll(iter, filp, poll_table);
}
/*
@@ -3509,6 +3948,7 @@ tracing_read_pipe(struct file *filp, char __user *ubuf,
size_t cnt, loff_t *ppos)
{
struct trace_iterator *iter = filp->private_data;
+ struct trace_array *tr = iter->tr;
ssize_t sret;
/* return any leftover data */
@@ -3520,8 +3960,8 @@ tracing_read_pipe(struct file *filp, char __user *ubuf,
/* copy the tracer to avoid using a global lock all around */
mutex_lock(&trace_types_lock);
- if (unlikely(iter->trace->name != current_trace->name))
- *iter->trace = *current_trace;
+ if (unlikely(iter->trace->name != tr->current_trace->name))
+ *iter->trace = *tr->current_trace;
mutex_unlock(&trace_types_lock);
/*
@@ -3677,6 +4117,7 @@ static ssize_t tracing_splice_read_pipe(struct file *filp,
.ops = &tracing_pipe_buf_ops,
.spd_release = tracing_spd_release_pipe,
};
+ struct trace_array *tr = iter->tr;
ssize_t ret;
size_t rem;
unsigned int i;
@@ -3686,8 +4127,8 @@ static ssize_t tracing_splice_read_pipe(struct file *filp,
/* copy the tracer to avoid using a global lock all around */
mutex_lock(&trace_types_lock);
- if (unlikely(iter->trace->name != current_trace->name))
- *iter->trace = *current_trace;
+ if (unlikely(iter->trace->name != tr->current_trace->name))
+ *iter->trace = *tr->current_trace;
mutex_unlock(&trace_types_lock);
mutex_lock(&iter->mutex);
@@ -3749,43 +4190,19 @@ out_err:
goto out;
}
-struct ftrace_entries_info {
- struct trace_array *tr;
- int cpu;
-};
-
-static int tracing_entries_open(struct inode *inode, struct file *filp)
-{
- struct ftrace_entries_info *info;
-
- if (tracing_disabled)
- return -ENODEV;
-
- info = kzalloc(sizeof(*info), GFP_KERNEL);
- if (!info)
- return -ENOMEM;
-
- info->tr = &global_trace;
- info->cpu = (unsigned long)inode->i_private;
-
- filp->private_data = info;
-
- return 0;
-}
-
static ssize_t
tracing_entries_read(struct file *filp, char __user *ubuf,
size_t cnt, loff_t *ppos)
{
- struct ftrace_entries_info *info = filp->private_data;
- struct trace_array *tr = info->tr;
+ struct trace_cpu *tc = filp->private_data;
+ struct trace_array *tr = tc->tr;
char buf[64];
int r = 0;
ssize_t ret;
mutex_lock(&trace_types_lock);
- if (info->cpu == RING_BUFFER_ALL_CPUS) {
+ if (tc->cpu == RING_BUFFER_ALL_CPUS) {
int cpu, buf_size_same;
unsigned long size;
@@ -3795,8 +4212,8 @@ tracing_entries_read(struct file *filp, char __user *ubuf,
for_each_tracing_cpu(cpu) {
/* fill in the size from first enabled cpu */
if (size == 0)
- size = tr->data[cpu]->entries;
- if (size != tr->data[cpu]->entries) {
+ size = per_cpu_ptr(tr->trace_buffer.data, cpu)->entries;
+ if (size != per_cpu_ptr(tr->trace_buffer.data, cpu)->entries) {
buf_size_same = 0;
break;
}
@@ -3812,7 +4229,7 @@ tracing_entries_read(struct file *filp, char __user *ubuf,
} else
r = sprintf(buf, "X\n");
} else
- r = sprintf(buf, "%lu\n", tr->data[info->cpu]->entries >> 10);
+ r = sprintf(buf, "%lu\n", per_cpu_ptr(tr->trace_buffer.data, tc->cpu)->entries >> 10);
mutex_unlock(&trace_types_lock);
@@ -3824,7 +4241,7 @@ static ssize_t
tracing_entries_write(struct file *filp, const char __user *ubuf,
size_t cnt, loff_t *ppos)
{
- struct ftrace_entries_info *info = filp->private_data;
+ struct trace_cpu *tc = filp->private_data;
unsigned long val;
int ret;
@@ -3839,7 +4256,7 @@ tracing_entries_write(struct file *filp, const char __user *ubuf,
/* value is in KB */
val <<= 10;
- ret = tracing_resize_ring_buffer(val, info->cpu);
+ ret = tracing_resize_ring_buffer(tc->tr, val, tc->cpu);
if (ret < 0)
return ret;
@@ -3848,16 +4265,6 @@ tracing_entries_write(struct file *filp, const char __user *ubuf,
return cnt;
}
-static int
-tracing_entries_release(struct inode *inode, struct file *filp)
-{
- struct ftrace_entries_info *info = filp->private_data;
-
- kfree(info);
-
- return 0;
-}
-
static ssize_t
tracing_total_entries_read(struct file *filp, char __user *ubuf,
size_t cnt, loff_t *ppos)
@@ -3869,7 +4276,7 @@ tracing_total_entries_read(struct file *filp, char __user *ubuf,
mutex_lock(&trace_types_lock);
for_each_tracing_cpu(cpu) {
- size += tr->data[cpu]->entries >> 10;
+ size += per_cpu_ptr(tr->trace_buffer.data, cpu)->entries >> 10;
if (!ring_buffer_expanded)
expanded_size += trace_buf_size >> 10;
}
@@ -3899,11 +4306,13 @@ tracing_free_buffer_write(struct file *filp, const char __user *ubuf,
static int
tracing_free_buffer_release(struct inode *inode, struct file *filp)
{
+ struct trace_array *tr = inode->i_private;
+
/* disable tracing ? */
if (trace_flags & TRACE_ITER_STOP_ON_FREE)
tracing_off();
/* resize the ring buffer to 0 */
- tracing_resize_ring_buffer(0, RING_BUFFER_ALL_CPUS);
+ tracing_resize_ring_buffer(tr, 0, RING_BUFFER_ALL_CPUS);
return 0;
}
@@ -3972,7 +4381,7 @@ tracing_mark_write(struct file *filp, const char __user *ubuf,
local_save_flags(irq_flags);
size = sizeof(*entry) + cnt + 2; /* possible \n added */
- buffer = global_trace.buffer;
+ buffer = global_trace.trace_buffer.buffer;
event = trace_buffer_lock_reserve(buffer, TRACE_PRINT, size,
irq_flags, preempt_count());
if (!event) {
@@ -4014,13 +4423,14 @@ tracing_mark_write(struct file *filp, const char __user *ubuf,
static int tracing_clock_show(struct seq_file *m, void *v)
{
+ struct trace_array *tr = m->private;
int i;
for (i = 0; i < ARRAY_SIZE(trace_clocks); i++)
seq_printf(m,
"%s%s%s%s", i ? " " : "",
- i == trace_clock_id ? "[" : "", trace_clocks[i].name,
- i == trace_clock_id ? "]" : "");
+ i == tr->clock_id ? "[" : "", trace_clocks[i].name,
+ i == tr->clock_id ? "]" : "");
seq_putc(m, '\n');
return 0;
@@ -4029,6 +4439,8 @@ static int tracing_clock_show(struct seq_file *m, void *v)
static ssize_t tracing_clock_write(struct file *filp, const char __user *ubuf,
size_t cnt, loff_t *fpos)
{
+ struct seq_file *m = filp->private_data;
+ struct trace_array *tr = m->private;
char buf[64];
const char *clockstr;
int i;
@@ -4050,20 +4462,23 @@ static ssize_t tracing_clock_write(struct file *filp, const char __user *ubuf,
if (i == ARRAY_SIZE(trace_clocks))
return -EINVAL;
- trace_clock_id = i;
-
mutex_lock(&trace_types_lock);
- ring_buffer_set_clock(global_trace.buffer, trace_clocks[i].func);
- if (max_tr.buffer)
- ring_buffer_set_clock(max_tr.buffer, trace_clocks[i].func);
+ tr->clock_id = i;
+
+ ring_buffer_set_clock(tr->trace_buffer.buffer, trace_clocks[i].func);
/*
* New clock may not be consistent with the previous clock.
* Reset the buffer so that it doesn't have incomparable timestamps.
*/
- tracing_reset_online_cpus(&global_trace);
- tracing_reset_online_cpus(&max_tr);
+ tracing_reset_online_cpus(&global_trace.trace_buffer);
+
+#ifdef CONFIG_TRACER_MAX_TRACE
+ if (tr->flags & TRACE_ARRAY_FL_GLOBAL && tr->max_buffer.buffer)
+ ring_buffer_set_clock(tr->max_buffer.buffer, trace_clocks[i].func);
+ tracing_reset_online_cpus(&global_trace.max_buffer);
+#endif
mutex_unlock(&trace_types_lock);
@@ -4076,20 +4491,45 @@ static int tracing_clock_open(struct inode *inode, struct file *file)
{
if (tracing_disabled)
return -ENODEV;
- return single_open(file, tracing_clock_show, NULL);
+
+ return single_open(file, tracing_clock_show, inode->i_private);
}
+struct ftrace_buffer_info {
+ struct trace_iterator iter;
+ void *spare;
+ unsigned int read;
+};
+
#ifdef CONFIG_TRACER_SNAPSHOT
static int tracing_snapshot_open(struct inode *inode, struct file *file)
{
+ struct trace_cpu *tc = inode->i_private;
struct trace_iterator *iter;
+ struct seq_file *m;
int ret = 0;
if (file->f_mode & FMODE_READ) {
iter = __tracing_open(inode, file, true);
if (IS_ERR(iter))
ret = PTR_ERR(iter);
+ } else {
+ /* Writes still need the seq_file to hold the private data */
+ m = kzalloc(sizeof(*m), GFP_KERNEL);
+ if (!m)
+ return -ENOMEM;
+ iter = kzalloc(sizeof(*iter), GFP_KERNEL);
+ if (!iter) {
+ kfree(m);
+ return -ENOMEM;
+ }
+ iter->tr = tc->tr;
+ iter->trace_buffer = &tc->tr->max_buffer;
+ iter->cpu_file = tc->cpu;
+ m->private = iter;
+ file->private_data = m;
}
+
return ret;
}
@@ -4097,6 +4537,9 @@ static ssize_t
tracing_snapshot_write(struct file *filp, const char __user *ubuf, size_t cnt,
loff_t *ppos)
{
+ struct seq_file *m = filp->private_data;
+ struct trace_iterator *iter = m->private;
+ struct trace_array *tr = iter->tr;
unsigned long val;
int ret;
@@ -4110,42 +4553,48 @@ tracing_snapshot_write(struct file *filp, const char __user *ubuf, size_t cnt,
mutex_lock(&trace_types_lock);
- if (current_trace->use_max_tr) {
+ if (tr->current_trace->use_max_tr) {
ret = -EBUSY;
goto out;
}
switch (val) {
case 0:
- if (current_trace->allocated_snapshot) {
- /* free spare buffer */
- ring_buffer_resize(max_tr.buffer, 1,
- RING_BUFFER_ALL_CPUS);
- set_buffer_entries(&max_tr, 1);
- tracing_reset_online_cpus(&max_tr);
- current_trace->allocated_snapshot = false;
+ if (iter->cpu_file != RING_BUFFER_ALL_CPUS) {
+ ret = -EINVAL;
+ break;
}
+ if (tr->allocated_snapshot)
+ free_snapshot(tr);
break;
case 1:
- if (!current_trace->allocated_snapshot) {
- /* allocate spare buffer */
- ret = resize_buffer_duplicate_size(&max_tr,
- &global_trace, RING_BUFFER_ALL_CPUS);
+/* Only allow per-cpu swap if the ring buffer supports it */
+#ifndef CONFIG_RING_BUFFER_ALLOW_SWAP
+ if (iter->cpu_file != RING_BUFFER_ALL_CPUS) {
+ ret = -EINVAL;
+ break;
+ }
+#endif
+ if (!tr->allocated_snapshot) {
+ ret = alloc_snapshot(tr);
if (ret < 0)
break;
- current_trace->allocated_snapshot = true;
}
-
local_irq_disable();
/* Now, we're going to swap */
- update_max_tr(&global_trace, current, smp_processor_id());
+ if (iter->cpu_file == RING_BUFFER_ALL_CPUS)
+ update_max_tr(tr, current, smp_processor_id());
+ else
+ update_max_tr_single(tr, current, iter->cpu_file);
local_irq_enable();
break;
default:
- if (current_trace->allocated_snapshot)
- tracing_reset_online_cpus(&max_tr);
- else
- ret = -EINVAL;
+ if (tr->allocated_snapshot) {
+ if (iter->cpu_file == RING_BUFFER_ALL_CPUS)
+ tracing_reset_online_cpus(&tr->max_buffer);
+ else
+ tracing_reset(&tr->max_buffer, iter->cpu_file);
+ }
break;
}
@@ -4157,6 +4606,51 @@ out:
mutex_unlock(&trace_types_lock);
return ret;
}
+
+static int tracing_snapshot_release(struct inode *inode, struct file *file)
+{
+ struct seq_file *m = file->private_data;
+
+ if (file->f_mode & FMODE_READ)
+ return tracing_release(inode, file);
+
+ /* If write only, the seq_file is just a stub */
+ if (m)
+ kfree(m->private);
+ kfree(m);
+
+ return 0;
+}
+
+static int tracing_buffers_open(struct inode *inode, struct file *filp);
+static ssize_t tracing_buffers_read(struct file *filp, char __user *ubuf,
+ size_t count, loff_t *ppos);
+static int tracing_buffers_release(struct inode *inode, struct file *file);
+static ssize_t tracing_buffers_splice_read(struct file *file, loff_t *ppos,
+ struct pipe_inode_info *pipe, size_t len, unsigned int flags);
+
+static int snapshot_raw_open(struct inode *inode, struct file *filp)
+{
+ struct ftrace_buffer_info *info;
+ int ret;
+
+ ret = tracing_buffers_open(inode, filp);
+ if (ret < 0)
+ return ret;
+
+ info = filp->private_data;
+
+ if (info->iter.trace->use_max_tr) {
+ tracing_buffers_release(inode, filp);
+ return -EBUSY;
+ }
+
+ info->iter.snapshot = true;
+ info->iter.trace_buffer = &info->iter.tr->max_buffer;
+
+ return ret;
+}
+
#endif /* CONFIG_TRACER_SNAPSHOT */
@@ -4184,10 +4678,9 @@ static const struct file_operations tracing_pipe_fops = {
};
static const struct file_operations tracing_entries_fops = {
- .open = tracing_entries_open,
+ .open = tracing_open_generic,
.read = tracing_entries_read,
.write = tracing_entries_write,
- .release = tracing_entries_release,
.llseek = generic_file_llseek,
};
@@ -4222,20 +4715,23 @@ static const struct file_operations snapshot_fops = {
.read = seq_read,
.write = tracing_snapshot_write,
.llseek = tracing_seek,
- .release = tracing_release,
+ .release = tracing_snapshot_release,
};
-#endif /* CONFIG_TRACER_SNAPSHOT */
-struct ftrace_buffer_info {
- struct trace_array *tr;
- void *spare;
- int cpu;
- unsigned int read;
+static const struct file_operations snapshot_raw_fops = {
+ .open = snapshot_raw_open,
+ .read = tracing_buffers_read,
+ .release = tracing_buffers_release,
+ .splice_read = tracing_buffers_splice_read,
+ .llseek = no_llseek,
};
+#endif /* CONFIG_TRACER_SNAPSHOT */
+
static int tracing_buffers_open(struct inode *inode, struct file *filp)
{
- int cpu = (int)(long)inode->i_private;
+ struct trace_cpu *tc = inode->i_private;
+ struct trace_array *tr = tc->tr;
struct ftrace_buffer_info *info;
if (tracing_disabled)
@@ -4245,72 +4741,131 @@ static int tracing_buffers_open(struct inode *inode, struct file *filp)
if (!info)
return -ENOMEM;
- info->tr = &global_trace;
- info->cpu = cpu;
- info->spare = NULL;
+ mutex_lock(&trace_types_lock);
+
+ tr->ref++;
+
+ info->iter.tr = tr;
+ info->iter.cpu_file = tc->cpu;
+ info->iter.trace = tr->current_trace;
+ info->iter.trace_buffer = &tr->trace_buffer;
+ info->spare = NULL;
/* Force reading ring buffer for first read */
- info->read = (unsigned int)-1;
+ info->read = (unsigned int)-1;
filp->private_data = info;
+ mutex_unlock(&trace_types_lock);
+
return nonseekable_open(inode, filp);
}
+static unsigned int
+tracing_buffers_poll(struct file *filp, poll_table *poll_table)
+{
+ struct ftrace_buffer_info *info = filp->private_data;
+ struct trace_iterator *iter = &info->iter;
+
+ return trace_poll(iter, filp, poll_table);
+}
+
static ssize_t
tracing_buffers_read(struct file *filp, char __user *ubuf,
size_t count, loff_t *ppos)
{
struct ftrace_buffer_info *info = filp->private_data;
+ struct trace_iterator *iter = &info->iter;
ssize_t ret;
- size_t size;
+ ssize_t size;
if (!count)
return 0;
+ mutex_lock(&trace_types_lock);
+
+#ifdef CONFIG_TRACER_MAX_TRACE
+ if (iter->snapshot && iter->tr->current_trace->use_max_tr) {
+ size = -EBUSY;
+ goto out_unlock;
+ }
+#endif
+
if (!info->spare)
- info->spare = ring_buffer_alloc_read_page(info->tr->buffer, info->cpu);
+ info->spare = ring_buffer_alloc_read_page(iter->trace_buffer->buffer,
+ iter->cpu_file);
+ size = -ENOMEM;
if (!info->spare)
- return -ENOMEM;
+ goto out_unlock;
/* Do we have previous read data to read? */
if (info->read < PAGE_SIZE)
goto read;
- trace_access_lock(info->cpu);
- ret = ring_buffer_read_page(info->tr->buffer,
+ again:
+ trace_access_lock(iter->cpu_file);
+ ret = ring_buffer_read_page(iter->trace_buffer->buffer,
&info->spare,
count,
- info->cpu, 0);
- trace_access_unlock(info->cpu);
- if (ret < 0)
- return 0;
+ iter->cpu_file, 0);
+ trace_access_unlock(iter->cpu_file);
- info->read = 0;
+ if (ret < 0) {
+ if (trace_empty(iter)) {
+ if ((filp->f_flags & O_NONBLOCK)) {
+ size = -EAGAIN;
+ goto out_unlock;
+ }
+ mutex_unlock(&trace_types_lock);
+ iter->trace->wait_pipe(iter);
+ mutex_lock(&trace_types_lock);
+ if (signal_pending(current)) {
+ size = -EINTR;
+ goto out_unlock;
+ }
+ goto again;
+ }
+ size = 0;
+ goto out_unlock;
+ }
-read:
+ info->read = 0;
+ read:
size = PAGE_SIZE - info->read;
if (size > count)
size = count;
ret = copy_to_user(ubuf, info->spare + info->read, size);
- if (ret == size)
- return -EFAULT;
+ if (ret == size) {
+ size = -EFAULT;
+ goto out_unlock;
+ }
size -= ret;
*ppos += size;
info->read += size;
+ out_unlock:
+ mutex_unlock(&trace_types_lock);
+
return size;
}
static int tracing_buffers_release(struct inode *inode, struct file *file)
{
struct ftrace_buffer_info *info = file->private_data;
+ struct trace_iterator *iter = &info->iter;
+
+ mutex_lock(&trace_types_lock);
+
+ WARN_ON(!iter->tr->ref);
+ iter->tr->ref--;
if (info->spare)
- ring_buffer_free_read_page(info->tr->buffer, info->spare);
+ ring_buffer_free_read_page(iter->trace_buffer->buffer, info->spare);
kfree(info);
+ mutex_unlock(&trace_types_lock);
+
return 0;
}
@@ -4375,6 +4930,7 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos,
unsigned int flags)
{
struct ftrace_buffer_info *info = file->private_data;
+ struct trace_iterator *iter = &info->iter;
struct partial_page partial_def[PIPE_DEF_BUFFERS];
struct page *pages_def[PIPE_DEF_BUFFERS];
struct splice_pipe_desc spd = {
@@ -4387,10 +4943,21 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos,
};
struct buffer_ref *ref;
int entries, size, i;
- size_t ret;
+ ssize_t ret;
- if (splice_grow_spd(pipe, &spd))
- return -ENOMEM;
+ mutex_lock(&trace_types_lock);
+
+#ifdef CONFIG_TRACER_MAX_TRACE
+ if (iter->snapshot && iter->tr->current_trace->use_max_tr) {
+ ret = -EBUSY;
+ goto out;
+ }
+#endif
+
+ if (splice_grow_spd(pipe, &spd)) {
+ ret = -ENOMEM;
+ goto out;
+ }
if (*ppos & (PAGE_SIZE - 1)) {
ret = -EINVAL;
@@ -4405,8 +4972,9 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos,
len &= PAGE_MASK;
}
- trace_access_lock(info->cpu);
- entries = ring_buffer_entries_cpu(info->tr->buffer, info->cpu);
+ again:
+ trace_access_lock(iter->cpu_file);
+ entries = ring_buffer_entries_cpu(iter->trace_buffer->buffer, iter->cpu_file);
for (i = 0; i < pipe->buffers && len && entries; i++, len -= PAGE_SIZE) {
struct page *page;
@@ -4417,15 +4985,15 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos,
break;
ref->ref = 1;
- ref->buffer = info->tr->buffer;
- ref->page = ring_buffer_alloc_read_page(ref->buffer, info->cpu);
+ ref->buffer = iter->trace_buffer->buffer;
+ ref->page = ring_buffer_alloc_read_page(ref->buffer, iter->cpu_file);
if (!ref->page) {
kfree(ref);
break;
}
r = ring_buffer_read_page(ref->buffer, &ref->page,
- len, info->cpu, 1);
+ len, iter->cpu_file, 1);
if (r < 0) {
ring_buffer_free_read_page(ref->buffer, ref->page);
kfree(ref);
@@ -4449,31 +5017,40 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos,
spd.nr_pages++;
*ppos += PAGE_SIZE;
- entries = ring_buffer_entries_cpu(info->tr->buffer, info->cpu);
+ entries = ring_buffer_entries_cpu(iter->trace_buffer->buffer, iter->cpu_file);
}
- trace_access_unlock(info->cpu);
+ trace_access_unlock(iter->cpu_file);
spd.nr_pages = i;
/* did we read anything? */
if (!spd.nr_pages) {
- if (flags & SPLICE_F_NONBLOCK)
+ if ((file->f_flags & O_NONBLOCK) || (flags & SPLICE_F_NONBLOCK)) {
ret = -EAGAIN;
- else
- ret = 0;
- /* TODO: block */
- goto out;
+ goto out;
+ }
+ mutex_unlock(&trace_types_lock);
+ iter->trace->wait_pipe(iter);
+ mutex_lock(&trace_types_lock);
+ if (signal_pending(current)) {
+ ret = -EINTR;
+ goto out;
+ }
+ goto again;
}
ret = splice_to_pipe(pipe, &spd);
splice_shrink_spd(&spd);
out:
+ mutex_unlock(&trace_types_lock);
+
return ret;
}
static const struct file_operations tracing_buffers_fops = {
.open = tracing_buffers_open,
.read = tracing_buffers_read,
+ .poll = tracing_buffers_poll,
.release = tracing_buffers_release,
.splice_read = tracing_buffers_splice_read,
.llseek = no_llseek,
@@ -4483,12 +5060,14 @@ static ssize_t
tracing_stats_read(struct file *filp, char __user *ubuf,
size_t count, loff_t *ppos)
{
- unsigned long cpu = (unsigned long)filp->private_data;
- struct trace_array *tr = &global_trace;
+ struct trace_cpu *tc = filp->private_data;
+ struct trace_array *tr = tc->tr;
+ struct trace_buffer *trace_buf = &tr->trace_buffer;
struct trace_seq *s;
unsigned long cnt;
unsigned long long t;
unsigned long usec_rem;
+ int cpu = tc->cpu;
s = kmalloc(sizeof(*s), GFP_KERNEL);
if (!s)
@@ -4496,41 +5075,41 @@ tracing_stats_read(struct file *filp, char __user *ubuf,
trace_seq_init(s);
- cnt = ring_buffer_entries_cpu(tr->buffer, cpu);
+ cnt = ring_buffer_entries_cpu(trace_buf->buffer, cpu);
trace_seq_printf(s, "entries: %ld\n", cnt);
- cnt = ring_buffer_overrun_cpu(tr->buffer, cpu);
+ cnt = ring_buffer_overrun_cpu(trace_buf->buffer, cpu);
trace_seq_printf(s, "overrun: %ld\n", cnt);
- cnt = ring_buffer_commit_overrun_cpu(tr->buffer, cpu);
+ cnt = ring_buffer_commit_overrun_cpu(trace_buf->buffer, cpu);
trace_seq_printf(s, "commit overrun: %ld\n", cnt);
- cnt = ring_buffer_bytes_cpu(tr->buffer, cpu);
+ cnt = ring_buffer_bytes_cpu(trace_buf->buffer, cpu);
trace_seq_printf(s, "bytes: %ld\n", cnt);
if (trace_clocks[trace_clock_id].in_ns) {
/* local or global for trace_clock */
- t = ns2usecs(ring_buffer_oldest_event_ts(tr->buffer, cpu));
+ t = ns2usecs(ring_buffer_oldest_event_ts(trace_buf->buffer, cpu));
usec_rem = do_div(t, USEC_PER_SEC);
trace_seq_printf(s, "oldest event ts: %5llu.%06lu\n",
t, usec_rem);
- t = ns2usecs(ring_buffer_time_stamp(tr->buffer, cpu));
+ t = ns2usecs(ring_buffer_time_stamp(trace_buf->buffer, cpu));
usec_rem = do_div(t, USEC_PER_SEC);
trace_seq_printf(s, "now ts: %5llu.%06lu\n", t, usec_rem);
} else {
/* counter or tsc mode for trace_clock */
trace_seq_printf(s, "oldest event ts: %llu\n",
- ring_buffer_oldest_event_ts(tr->buffer, cpu));
+ ring_buffer_oldest_event_ts(trace_buf->buffer, cpu));
trace_seq_printf(s, "now ts: %llu\n",
- ring_buffer_time_stamp(tr->buffer, cpu));
+ ring_buffer_time_stamp(trace_buf->buffer, cpu));
}
- cnt = ring_buffer_dropped_events_cpu(tr->buffer, cpu);
+ cnt = ring_buffer_dropped_events_cpu(trace_buf->buffer, cpu);
trace_seq_printf(s, "dropped events: %ld\n", cnt);
- cnt = ring_buffer_read_events_cpu(tr->buffer, cpu);
+ cnt = ring_buffer_read_events_cpu(trace_buf->buffer, cpu);
trace_seq_printf(s, "read events: %ld\n", cnt);
count = simple_read_from_buffer(ubuf, count, ppos, s->buffer, s->len);
@@ -4582,60 +5161,161 @@ static const struct file_operations tracing_dyn_info_fops = {
.read = tracing_read_dyn_info,
.llseek = generic_file_llseek,
};
-#endif
+#endif /* CONFIG_DYNAMIC_FTRACE */
-static struct dentry *d_tracer;
+#if defined(CONFIG_TRACER_SNAPSHOT) && defined(CONFIG_DYNAMIC_FTRACE)
+static void
+ftrace_snapshot(unsigned long ip, unsigned long parent_ip, void **data)
+{
+ tracing_snapshot();
+}
-struct dentry *tracing_init_dentry(void)
+static void
+ftrace_count_snapshot(unsigned long ip, unsigned long parent_ip, void **data)
+{
+ unsigned long *count = (long *)data;
+
+ if (!*count)
+ return;
+
+ if (*count != -1)
+ (*count)--;
+
+ tracing_snapshot();
+}
+
+static int
+ftrace_snapshot_print(struct seq_file *m, unsigned long ip,
+ struct ftrace_probe_ops *ops, void *data)
+{
+ long count = (long)data;
+
+ seq_printf(m, "%ps:", (void *)ip);
+
+ seq_printf(m, "snapshot");
+
+ if (count == -1)
+ seq_printf(m, ":unlimited\n");
+ else
+ seq_printf(m, ":count=%ld\n", count);
+
+ return 0;
+}
+
+static struct ftrace_probe_ops snapshot_probe_ops = {
+ .func = ftrace_snapshot,
+ .print = ftrace_snapshot_print,
+};
+
+static struct ftrace_probe_ops snapshot_count_probe_ops = {
+ .func = ftrace_count_snapshot,
+ .print = ftrace_snapshot_print,
+};
+
+static int
+ftrace_trace_snapshot_callback(struct ftrace_hash *hash,
+ char *glob, char *cmd, char *param, int enable)
+{
+ struct ftrace_probe_ops *ops;
+ void *count = (void *)-1;
+ char *number;
+ int ret;
+
+ /* hash funcs only work with set_ftrace_filter */
+ if (!enable)
+ return -EINVAL;
+
+ ops = param ? &snapshot_count_probe_ops : &snapshot_probe_ops;
+
+ if (glob[0] == '!') {
+ unregister_ftrace_function_probe_func(glob+1, ops);
+ return 0;
+ }
+
+ if (!param)
+ goto out_reg;
+
+ number = strsep(&param, ":");
+
+ if (!strlen(number))
+ goto out_reg;
+
+ /*
+ * We use the callback data field (which is a pointer)
+ * as our counter.
+ */
+ ret = kstrtoul(number, 0, (unsigned long *)&count);
+ if (ret)
+ return ret;
+
+ out_reg:
+ ret = register_ftrace_function_probe(glob, ops, count);
+
+ if (ret >= 0)
+ alloc_snapshot(&global_trace);
+
+ return ret < 0 ? ret : 0;
+}
+
+static struct ftrace_func_command ftrace_snapshot_cmd = {
+ .name = "snapshot",
+ .func = ftrace_trace_snapshot_callback,
+};
+
+static int register_snapshot_cmd(void)
{
- static int once;
+ return register_ftrace_command(&ftrace_snapshot_cmd);
+}
+#else
+static inline int register_snapshot_cmd(void) { return 0; }
+#endif /* defined(CONFIG_TRACER_SNAPSHOT) && defined(CONFIG_DYNAMIC_FTRACE) */
- if (d_tracer)
- return d_tracer;
+struct dentry *tracing_init_dentry_tr(struct trace_array *tr)
+{
+ if (tr->dir)
+ return tr->dir;
if (!debugfs_initialized())
return NULL;
- d_tracer = debugfs_create_dir("tracing", NULL);
+ if (tr->flags & TRACE_ARRAY_FL_GLOBAL)
+ tr->dir = debugfs_create_dir("tracing", NULL);
- if (!d_tracer && !once) {
- once = 1;
- pr_warning("Could not create debugfs directory 'tracing'\n");
- return NULL;
- }
+ if (!tr->dir)
+ pr_warn_once("Could not create debugfs directory 'tracing'\n");
- return d_tracer;
+ return tr->dir;
}
-static struct dentry *d_percpu;
+struct dentry *tracing_init_dentry(void)
+{
+ return tracing_init_dentry_tr(&global_trace);
+}
-static struct dentry *tracing_dentry_percpu(void)
+static struct dentry *tracing_dentry_percpu(struct trace_array *tr, int cpu)
{
- static int once;
struct dentry *d_tracer;
- if (d_percpu)
- return d_percpu;
-
- d_tracer = tracing_init_dentry();
+ if (tr->percpu_dir)
+ return tr->percpu_dir;
+ d_tracer = tracing_init_dentry_tr(tr);
if (!d_tracer)
return NULL;
- d_percpu = debugfs_create_dir("per_cpu", d_tracer);
+ tr->percpu_dir = debugfs_create_dir("per_cpu", d_tracer);
- if (!d_percpu && !once) {
- once = 1;
- pr_warning("Could not create debugfs directory 'per_cpu'\n");
- return NULL;
- }
+ WARN_ONCE(!tr->percpu_dir,
+ "Could not create debugfs directory 'per_cpu/%d'\n", cpu);
- return d_percpu;
+ return tr->percpu_dir;
}
-static void tracing_init_debugfs_percpu(long cpu)
+static void
+tracing_init_debugfs_percpu(struct trace_array *tr, long cpu)
{
- struct dentry *d_percpu = tracing_dentry_percpu();
+ struct trace_array_cpu *data = per_cpu_ptr(tr->trace_buffer.data, cpu);
+ struct dentry *d_percpu = tracing_dentry_percpu(tr, cpu);
struct dentry *d_cpu;
char cpu_dir[30]; /* 30 characters should be more than enough */
@@ -4651,20 +5331,28 @@ static void tracing_init_debugfs_percpu(long cpu)
/* per cpu trace_pipe */
trace_create_file("trace_pipe", 0444, d_cpu,
- (void *) cpu, &tracing_pipe_fops);
+ (void *)&data->trace_cpu, &tracing_pipe_fops);
/* per cpu trace */
trace_create_file("trace", 0644, d_cpu,
- (void *) cpu, &tracing_fops);
+ (void *)&data->trace_cpu, &tracing_fops);
trace_create_file("trace_pipe_raw", 0444, d_cpu,
- (void *) cpu, &tracing_buffers_fops);
+ (void *)&data->trace_cpu, &tracing_buffers_fops);
trace_create_file("stats", 0444, d_cpu,
- (void *) cpu, &tracing_stats_fops);
+ (void *)&data->trace_cpu, &tracing_stats_fops);
trace_create_file("buffer_size_kb", 0444, d_cpu,
- (void *) cpu, &tracing_entries_fops);
+ (void *)&data->trace_cpu, &tracing_entries_fops);
+
+#ifdef CONFIG_TRACER_SNAPSHOT
+ trace_create_file("snapshot", 0644, d_cpu,
+ (void *)&data->trace_cpu, &snapshot_fops);
+
+ trace_create_file("snapshot_raw", 0444, d_cpu,
+ (void *)&data->trace_cpu, &snapshot_raw_fops);
+#endif
}
#ifdef CONFIG_FTRACE_SELFTEST
@@ -4675,6 +5363,7 @@ static void tracing_init_debugfs_percpu(long cpu)
struct trace_option_dentry {
struct tracer_opt *opt;
struct tracer_flags *flags;
+ struct trace_array *tr;
struct dentry *entry;
};
@@ -4710,7 +5399,7 @@ trace_options_write(struct file *filp, const char __user *ubuf, size_t cnt,
if (!!(topt->flags->val & topt->opt->bit) != val) {
mutex_lock(&trace_types_lock);
- ret = __set_tracer_option(current_trace, topt->flags,
+ ret = __set_tracer_option(topt->tr->current_trace, topt->flags,
topt->opt, !val);
mutex_unlock(&trace_types_lock);
if (ret)
@@ -4749,6 +5438,7 @@ static ssize_t
trace_options_core_write(struct file *filp, const char __user *ubuf, size_t cnt,
loff_t *ppos)
{
+ struct trace_array *tr = &global_trace;
long index = (long)filp->private_data;
unsigned long val;
int ret;
@@ -4759,7 +5449,13 @@ trace_options_core_write(struct file *filp, const char __user *ubuf, size_t cnt,
if (val != 0 && val != 1)
return -EINVAL;
- set_tracer_flags(1 << index, val);
+
+ mutex_lock(&trace_types_lock);
+ ret = set_tracer_flag(tr, 1 << index, val);
+ mutex_unlock(&trace_types_lock);
+
+ if (ret < 0)
+ return ret;
*ppos += cnt;
@@ -4789,40 +5485,41 @@ struct dentry *trace_create_file(const char *name,
}
-static struct dentry *trace_options_init_dentry(void)
+static struct dentry *trace_options_init_dentry(struct trace_array *tr)
{
struct dentry *d_tracer;
- static struct dentry *t_options;
- if (t_options)
- return t_options;
+ if (tr->options)
+ return tr->options;
- d_tracer = tracing_init_dentry();
+ d_tracer = tracing_init_dentry_tr(tr);
if (!d_tracer)
return NULL;
- t_options = debugfs_create_dir("options", d_tracer);
- if (!t_options) {
+ tr->options = debugfs_create_dir("options", d_tracer);
+ if (!tr->options) {
pr_warning("Could not create debugfs directory 'options'\n");
return NULL;
}
- return t_options;
+ return tr->options;
}
static void
-create_trace_option_file(struct trace_option_dentry *topt,
+create_trace_option_file(struct trace_array *tr,
+ struct trace_option_dentry *topt,
struct tracer_flags *flags,
struct tracer_opt *opt)
{
struct dentry *t_options;
- t_options = trace_options_init_dentry();
+ t_options = trace_options_init_dentry(tr);
if (!t_options)
return;
topt->flags = flags;
topt->opt = opt;
+ topt->tr = tr;
topt->entry = trace_create_file(opt->name, 0644, t_options, topt,
&trace_options_fops);
@@ -4830,7 +5527,7 @@ create_trace_option_file(struct trace_option_dentry *topt,
}
static struct trace_option_dentry *
-create_trace_option_files(struct tracer *tracer)
+create_trace_option_files(struct trace_array *tr, struct tracer *tracer)
{
struct trace_option_dentry *topts;
struct tracer_flags *flags;
@@ -4855,7 +5552,7 @@ create_trace_option_files(struct tracer *tracer)
return NULL;
for (cnt = 0; opts[cnt].name; cnt++)
- create_trace_option_file(&topts[cnt], flags,
+ create_trace_option_file(tr, &topts[cnt], flags,
&opts[cnt]);
return topts;
@@ -4878,11 +5575,12 @@ destroy_trace_option_files(struct trace_option_dentry *topts)
}
static struct dentry *
-create_trace_option_core_file(const char *option, long index)
+create_trace_option_core_file(struct trace_array *tr,
+ const char *option, long index)
{
struct dentry *t_options;
- t_options = trace_options_init_dentry();
+ t_options = trace_options_init_dentry(tr);
if (!t_options)
return NULL;
@@ -4890,17 +5588,17 @@ create_trace_option_core_file(const char *option, long index)
&trace_options_core_fops);
}
-static __init void create_trace_options_dir(void)
+static __init void create_trace_options_dir(struct trace_array *tr)
{
struct dentry *t_options;
int i;
- t_options = trace_options_init_dentry();
+ t_options = trace_options_init_dentry(tr);
if (!t_options)
return;
for (i = 0; trace_options[i]; i++)
- create_trace_option_core_file(trace_options[i], i);
+ create_trace_option_core_file(tr, trace_options[i], i);
}
static ssize_t
@@ -4908,7 +5606,7 @@ rb_simple_read(struct file *filp, char __user *ubuf,
size_t cnt, loff_t *ppos)
{
struct trace_array *tr = filp->private_data;
- struct ring_buffer *buffer = tr->buffer;
+ struct ring_buffer *buffer = tr->trace_buffer.buffer;
char buf[64];
int r;
@@ -4927,7 +5625,7 @@ rb_simple_write(struct file *filp, const char __user *ubuf,
size_t cnt, loff_t *ppos)
{
struct trace_array *tr = filp->private_data;
- struct ring_buffer *buffer = tr->buffer;
+ struct ring_buffer *buffer = tr->trace_buffer.buffer;
unsigned long val;
int ret;
@@ -4939,12 +5637,12 @@ rb_simple_write(struct file *filp, const char __user *ubuf,
mutex_lock(&trace_types_lock);
if (val) {
ring_buffer_record_on(buffer);
- if (current_trace->start)
- current_trace->start(tr);
+ if (tr->current_trace->start)
+ tr->current_trace->start(tr);
} else {
ring_buffer_record_off(buffer);
- if (current_trace->stop)
- current_trace->stop(tr);
+ if (tr->current_trace->stop)
+ tr->current_trace->stop(tr);
}
mutex_unlock(&trace_types_lock);
}
@@ -4961,23 +5659,310 @@ static const struct file_operations rb_simple_fops = {
.llseek = default_llseek,
};
+struct dentry *trace_instance_dir;
+
+static void
+init_tracer_debugfs(struct trace_array *tr, struct dentry *d_tracer);
+
+static void init_trace_buffers(struct trace_array *tr, struct trace_buffer *buf)
+{
+ int cpu;
+
+ for_each_tracing_cpu(cpu) {
+ memset(per_cpu_ptr(buf->data, cpu), 0, sizeof(struct trace_array_cpu));
+ per_cpu_ptr(buf->data, cpu)->trace_cpu.cpu = cpu;
+ per_cpu_ptr(buf->data, cpu)->trace_cpu.tr = tr;
+ }
+}
+
+static int
+allocate_trace_buffer(struct trace_array *tr, struct trace_buffer *buf, int size)
+{
+ enum ring_buffer_flags rb_flags;
+
+ rb_flags = trace_flags & TRACE_ITER_OVERWRITE ? RB_FL_OVERWRITE : 0;
+
+ buf->buffer = ring_buffer_alloc(size, rb_flags);
+ if (!buf->buffer)
+ return -ENOMEM;
+
+ buf->data = alloc_percpu(struct trace_array_cpu);
+ if (!buf->data) {
+ ring_buffer_free(buf->buffer);
+ return -ENOMEM;
+ }
+
+ init_trace_buffers(tr, buf);
+
+ /* Allocate the first page for all buffers */
+ set_buffer_entries(&tr->trace_buffer,
+ ring_buffer_size(tr->trace_buffer.buffer, 0));
+
+ return 0;
+}
+
+static int allocate_trace_buffers(struct trace_array *tr, int size)
+{
+ int ret;
+
+ ret = allocate_trace_buffer(tr, &tr->trace_buffer, size);
+ if (ret)
+ return ret;
+
+#ifdef CONFIG_TRACER_MAX_TRACE
+ ret = allocate_trace_buffer(tr, &tr->max_buffer,
+ allocate_snapshot ? size : 1);
+ if (WARN_ON(ret)) {
+ ring_buffer_free(tr->trace_buffer.buffer);
+ free_percpu(tr->trace_buffer.data);
+ return -ENOMEM;
+ }
+ tr->allocated_snapshot = allocate_snapshot;
+
+ /*
+ * Only the top level trace array gets its snapshot allocated
+ * from the kernel command line.
+ */
+ allocate_snapshot = false;
+#endif
+ return 0;
+}
+
+static int new_instance_create(const char *name)
+{
+ struct trace_array *tr;
+ int ret;
+
+ mutex_lock(&trace_types_lock);
+
+ ret = -EEXIST;
+ list_for_each_entry(tr, &ftrace_trace_arrays, list) {
+ if (tr->name && strcmp(tr->name, name) == 0)
+ goto out_unlock;
+ }
+
+ ret = -ENOMEM;
+ tr = kzalloc(sizeof(*tr), GFP_KERNEL);
+ if (!tr)
+ goto out_unlock;
+
+ tr->name = kstrdup(name, GFP_KERNEL);
+ if (!tr->name)
+ goto out_free_tr;
+
+ raw_spin_lock_init(&tr->start_lock);
+
+ tr->current_trace = &nop_trace;
+
+ INIT_LIST_HEAD(&tr->systems);
+ INIT_LIST_HEAD(&tr->events);
+
+ if (allocate_trace_buffers(tr, trace_buf_size) < 0)
+ goto out_free_tr;
+
+ /* Holder for file callbacks */
+ tr->trace_cpu.cpu = RING_BUFFER_ALL_CPUS;
+ tr->trace_cpu.tr = tr;
+
+ tr->dir = debugfs_create_dir(name, trace_instance_dir);
+ if (!tr->dir)
+ goto out_free_tr;
+
+ ret = event_trace_add_tracer(tr->dir, tr);
+ if (ret)
+ goto out_free_tr;
+
+ init_tracer_debugfs(tr, tr->dir);
+
+ list_add(&tr->list, &ftrace_trace_arrays);
+
+ mutex_unlock(&trace_types_lock);
+
+ return 0;
+
+ out_free_tr:
+ if (tr->trace_buffer.buffer)
+ ring_buffer_free(tr->trace_buffer.buffer);
+ kfree(tr->name);
+ kfree(tr);
+
+ out_unlock:
+ mutex_unlock(&trace_types_lock);
+
+ return ret;
+
+}
+
+static int instance_delete(const char *name)
+{
+ struct trace_array *tr;
+ int found = 0;
+ int ret;
+
+ mutex_lock(&trace_types_lock);
+
+ ret = -ENODEV;
+ list_for_each_entry(tr, &ftrace_trace_arrays, list) {
+ if (tr->name && strcmp(tr->name, name) == 0) {
+ found = 1;
+ break;
+ }
+ }
+ if (!found)
+ goto out_unlock;
+
+ ret = -EBUSY;
+ if (tr->ref)
+ goto out_unlock;
+
+ list_del(&tr->list);
+
+ event_trace_del_tracer(tr);
+ debugfs_remove_recursive(tr->dir);
+ free_percpu(tr->trace_buffer.data);
+ ring_buffer_free(tr->trace_buffer.buffer);
+
+ kfree(tr->name);
+ kfree(tr);
+
+ ret = 0;
+
+ out_unlock:
+ mutex_unlock(&trace_types_lock);
+
+ return ret;
+}
+
+static int instance_mkdir (struct inode *inode, struct dentry *dentry, umode_t mode)
+{
+ struct dentry *parent;
+ int ret;
+
+ /* Paranoid: Make sure the parent is the "instances" directory */
+ parent = hlist_entry(inode->i_dentry.first, struct dentry, d_alias);
+ if (WARN_ON_ONCE(parent != trace_instance_dir))
+ return -ENOENT;
+
+ /*
+ * The inode mutex is locked, but debugfs_create_dir() will also
+ * take the mutex. As the instances directory can not be destroyed
+ * or changed in any other way, it is safe to unlock it, and
+ * let the dentry try. If two users try to make the same dir at
+ * the same time, then the new_instance_create() will determine the
+ * winner.
+ */
+ mutex_unlock(&inode->i_mutex);
+
+ ret = new_instance_create(dentry->d_iname);
+
+ mutex_lock(&inode->i_mutex);
+
+ return ret;
+}
+
+static int instance_rmdir(struct inode *inode, struct dentry *dentry)
+{
+ struct dentry *parent;
+ int ret;
+
+ /* Paranoid: Make sure the parent is the "instances" directory */
+ parent = hlist_entry(inode->i_dentry.first, struct dentry, d_alias);
+ if (WARN_ON_ONCE(parent != trace_instance_dir))
+ return -ENOENT;
+
+ /* The caller did a dget() on dentry */
+ mutex_unlock(&dentry->d_inode->i_mutex);
+
+ /*
+ * The inode mutex is locked, but debugfs_create_dir() will also
+ * take the mutex. As the instances directory can not be destroyed
+ * or changed in any other way, it is safe to unlock it, and
+ * let the dentry try. If two users try to make the same dir at
+ * the same time, then the instance_delete() will determine the
+ * winner.
+ */
+ mutex_unlock(&inode->i_mutex);
+
+ ret = instance_delete(dentry->d_iname);
+
+ mutex_lock_nested(&inode->i_mutex, I_MUTEX_PARENT);
+ mutex_lock(&dentry->d_inode->i_mutex);
+
+ return ret;
+}
+
+static const struct inode_operations instance_dir_inode_operations = {
+ .lookup = simple_lookup,
+ .mkdir = instance_mkdir,
+ .rmdir = instance_rmdir,
+};
+
+static __init void create_trace_instances(struct dentry *d_tracer)
+{
+ trace_instance_dir = debugfs_create_dir("instances", d_tracer);
+ if (WARN_ON(!trace_instance_dir))
+ return;
+
+ /* Hijack the dir inode operations, to allow mkdir */
+ trace_instance_dir->d_inode->i_op = &instance_dir_inode_operations;
+}
+
+static void
+init_tracer_debugfs(struct trace_array *tr, struct dentry *d_tracer)
+{
+ int cpu;
+
+ trace_create_file("trace_options", 0644, d_tracer,
+ tr, &tracing_iter_fops);
+
+ trace_create_file("trace", 0644, d_tracer,
+ (void *)&tr->trace_cpu, &tracing_fops);
+
+ trace_create_file("trace_pipe", 0444, d_tracer,
+ (void *)&tr->trace_cpu, &tracing_pipe_fops);
+
+ trace_create_file("buffer_size_kb", 0644, d_tracer,
+ (void *)&tr->trace_cpu, &tracing_entries_fops);
+
+ trace_create_file("buffer_total_size_kb", 0444, d_tracer,
+ tr, &tracing_total_entries_fops);
+
+ trace_create_file("free_buffer", 0644, d_tracer,
+ tr, &tracing_free_buffer_fops);
+
+ trace_create_file("trace_marker", 0220, d_tracer,
+ tr, &tracing_mark_fops);
+
+ trace_create_file("trace_clock", 0644, d_tracer, tr,
+ &trace_clock_fops);
+
+ trace_create_file("tracing_on", 0644, d_tracer,
+ tr, &rb_simple_fops);
+
+#ifdef CONFIG_TRACER_SNAPSHOT
+ trace_create_file("snapshot", 0644, d_tracer,
+ (void *)&tr->trace_cpu, &snapshot_fops);
+#endif
+
+ for_each_tracing_cpu(cpu)
+ tracing_init_debugfs_percpu(tr, cpu);
+
+}
+
static __init int tracer_init_debugfs(void)
{
struct dentry *d_tracer;
- int cpu;
trace_access_lock_init();
d_tracer = tracing_init_dentry();
+ if (!d_tracer)
+ return 0;
- trace_create_file("trace_options", 0644, d_tracer,
- NULL, &tracing_iter_fops);
+ init_tracer_debugfs(&global_trace, d_tracer);
trace_create_file("tracing_cpumask", 0644, d_tracer,
- NULL, &tracing_cpumask_fops);
-
- trace_create_file("trace", 0644, d_tracer,
- (void *) TRACE_PIPE_ALL_CPU, &tracing_fops);
+ &global_trace, &tracing_cpumask_fops);
trace_create_file("available_tracers", 0444, d_tracer,
&global_trace, &show_traces_fops);
@@ -4996,44 +5981,17 @@ static __init int tracer_init_debugfs(void)
trace_create_file("README", 0444, d_tracer,
NULL, &tracing_readme_fops);
- trace_create_file("trace_pipe", 0444, d_tracer,
- (void *) TRACE_PIPE_ALL_CPU, &tracing_pipe_fops);
-
- trace_create_file("buffer_size_kb", 0644, d_tracer,
- (void *) RING_BUFFER_ALL_CPUS, &tracing_entries_fops);
-
- trace_create_file("buffer_total_size_kb", 0444, d_tracer,
- &global_trace, &tracing_total_entries_fops);
-
- trace_create_file("free_buffer", 0644, d_tracer,
- &global_trace, &tracing_free_buffer_fops);
-
- trace_create_file("trace_marker", 0220, d_tracer,
- NULL, &tracing_mark_fops);
-
trace_create_file("saved_cmdlines", 0444, d_tracer,
NULL, &tracing_saved_cmdlines_fops);
- trace_create_file("trace_clock", 0644, d_tracer, NULL,
- &trace_clock_fops);
-
- trace_create_file("tracing_on", 0644, d_tracer,
- &global_trace, &rb_simple_fops);
-
#ifdef CONFIG_DYNAMIC_FTRACE
trace_create_file("dyn_ftrace_total_info", 0444, d_tracer,
&ftrace_update_tot_cnt, &tracing_dyn_info_fops);
#endif
-#ifdef CONFIG_TRACER_SNAPSHOT
- trace_create_file("snapshot", 0644, d_tracer,
- (void *) TRACE_PIPE_ALL_CPU, &snapshot_fops);
-#endif
-
- create_trace_options_dir();
+ create_trace_instances(d_tracer);
- for_each_tracing_cpu(cpu)
- tracing_init_debugfs_percpu(cpu);
+ create_trace_options_dir(&global_trace);
return 0;
}
@@ -5089,8 +6047,8 @@ void
trace_printk_seq(struct trace_seq *s)
{
/* Probably should print a warning here. */
- if (s->len >= 1000)
- s->len = 1000;
+ if (s->len >= TRACE_MAX_PRINT)
+ s->len = TRACE_MAX_PRINT;
/* should be zero ended, but we are paranoid. */
s->buffer[s->len] = 0;
@@ -5103,46 +6061,43 @@ trace_printk_seq(struct trace_seq *s)
void trace_init_global_iter(struct trace_iterator *iter)
{
iter->tr = &global_trace;
- iter->trace = current_trace;
- iter->cpu_file = TRACE_PIPE_ALL_CPU;
+ iter->trace = iter->tr->current_trace;
+ iter->cpu_file = RING_BUFFER_ALL_CPUS;
+ iter->trace_buffer = &global_trace.trace_buffer;
}
-static void
-__ftrace_dump(bool disable_tracing, enum ftrace_dump_mode oops_dump_mode)
+void ftrace_dump(enum ftrace_dump_mode oops_dump_mode)
{
- static arch_spinlock_t ftrace_dump_lock =
- (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED;
/* use static because iter can be a bit big for the stack */
static struct trace_iterator iter;
+ static atomic_t dump_running;
unsigned int old_userobj;
- static int dump_ran;
unsigned long flags;
int cnt = 0, cpu;
- /* only one dump */
- local_irq_save(flags);
- arch_spin_lock(&ftrace_dump_lock);
- if (dump_ran)
- goto out;
-
- dump_ran = 1;
+ /* Only allow one dump user at a time. */
+ if (atomic_inc_return(&dump_running) != 1) {
+ atomic_dec(&dump_running);
+ return;
+ }
+ /*
+ * Always turn off tracing when we dump.
+ * We don't need to show trace output of what happens
+ * between multiple crashes.
+ *
+ * If the user does a sysrq-z, then they can re-enable
+ * tracing with echo 1 > tracing_on.
+ */
tracing_off();
- /* Did function tracer already get disabled? */
- if (ftrace_is_dead()) {
- printk("# WARNING: FUNCTION TRACING IS CORRUPTED\n");
- printk("# MAY BE MISSING FUNCTION EVENTS\n");
- }
-
- if (disable_tracing)
- ftrace_kill();
+ local_irq_save(flags);
/* Simulate the iterator */
trace_init_global_iter(&iter);
for_each_tracing_cpu(cpu) {
- atomic_inc(&iter.tr->data[cpu]->disabled);
+ atomic_inc(&per_cpu_ptr(iter.tr->trace_buffer.data, cpu)->disabled);
}
old_userobj = trace_flags & TRACE_ITER_SYM_USEROBJ;
@@ -5152,7 +6107,7 @@ __ftrace_dump(bool disable_tracing, enum ftrace_dump_mode oops_dump_mode)
switch (oops_dump_mode) {
case DUMP_ALL:
- iter.cpu_file = TRACE_PIPE_ALL_CPU;
+ iter.cpu_file = RING_BUFFER_ALL_CPUS;
break;
case DUMP_ORIG:
iter.cpu_file = raw_smp_processor_id();
@@ -5161,11 +6116,17 @@ __ftrace_dump(bool disable_tracing, enum ftrace_dump_mode oops_dump_mode)
goto out_enable;
default:
printk(KERN_TRACE "Bad dumping mode, switching to all CPUs dump\n");
- iter.cpu_file = TRACE_PIPE_ALL_CPU;
+ iter.cpu_file = RING_BUFFER_ALL_CPUS;
}
printk(KERN_TRACE "Dumping ftrace buffer:\n");
+ /* Did function tracer already get disabled? */
+ if (ftrace_is_dead()) {
+ printk("# WARNING: FUNCTION TRACING IS CORRUPTED\n");
+ printk("# MAY BE MISSING FUNCTION EVENTS\n");
+ }
+
/*
* We need to stop all tracing on all CPUS to read the
* the next buffer. This is a bit expensive, but is
@@ -5205,33 +6166,19 @@ __ftrace_dump(bool disable_tracing, enum ftrace_dump_mode oops_dump_mode)
printk(KERN_TRACE "---------------------------------\n");
out_enable:
- /* Re-enable tracing if requested */
- if (!disable_tracing) {
- trace_flags |= old_userobj;
+ trace_flags |= old_userobj;
- for_each_tracing_cpu(cpu) {
- atomic_dec(&iter.tr->data[cpu]->disabled);
- }
- tracing_on();
+ for_each_tracing_cpu(cpu) {
+ atomic_dec(&per_cpu_ptr(iter.trace_buffer->data, cpu)->disabled);
}
-
- out:
- arch_spin_unlock(&ftrace_dump_lock);
+ atomic_dec(&dump_running);
local_irq_restore(flags);
}
-
-/* By default: disable tracing after the dump */
-void ftrace_dump(enum ftrace_dump_mode oops_dump_mode)
-{
- __ftrace_dump(true, oops_dump_mode);
-}
EXPORT_SYMBOL_GPL(ftrace_dump);
__init static int tracer_alloc_buffers(void)
{
int ring_buf_size;
- enum ring_buffer_flags rb_flags;
- int i;
int ret = -ENOMEM;
@@ -5252,49 +6199,27 @@ __init static int tracer_alloc_buffers(void)
else
ring_buf_size = 1;
- rb_flags = trace_flags & TRACE_ITER_OVERWRITE ? RB_FL_OVERWRITE : 0;
-
cpumask_copy(tracing_buffer_mask, cpu_possible_mask);
cpumask_copy(tracing_cpumask, cpu_all_mask);
+ raw_spin_lock_init(&global_trace.start_lock);
+
/* TODO: make the number of buffers hot pluggable with CPUS */
- global_trace.buffer = ring_buffer_alloc(ring_buf_size, rb_flags);
- if (!global_trace.buffer) {
+ if (allocate_trace_buffers(&global_trace, ring_buf_size) < 0) {
printk(KERN_ERR "tracer: failed to allocate ring buffer!\n");
WARN_ON(1);
goto out_free_cpumask;
}
+
if (global_trace.buffer_disabled)
tracing_off();
-
-#ifdef CONFIG_TRACER_MAX_TRACE
- max_tr.buffer = ring_buffer_alloc(1, rb_flags);
- if (!max_tr.buffer) {
- printk(KERN_ERR "tracer: failed to allocate max ring buffer!\n");
- WARN_ON(1);
- ring_buffer_free(global_trace.buffer);
- goto out_free_cpumask;
- }
-#endif
-
- /* Allocate the first page for all buffers */
- for_each_tracing_cpu(i) {
- global_trace.data[i] = &per_cpu(global_trace_cpu, i);
- max_tr.data[i] = &per_cpu(max_tr_data, i);
- }
-
- set_buffer_entries(&global_trace,
- ring_buffer_size(global_trace.buffer, 0));
-#ifdef CONFIG_TRACER_MAX_TRACE
- set_buffer_entries(&max_tr, 1);
-#endif
-
trace_init_cmdlines();
- init_irq_work(&trace_work_wakeup, trace_wake_up);
register_tracer(&nop_trace);
+ global_trace.current_trace = &nop_trace;
+
/* All seems OK, enable tracing */
tracing_disabled = 0;
@@ -5303,16 +6228,32 @@ __init static int tracer_alloc_buffers(void)
register_die_notifier(&trace_die_notifier);
+ global_trace.flags = TRACE_ARRAY_FL_GLOBAL;
+
+ /* Holder for file callbacks */
+ global_trace.trace_cpu.cpu = RING_BUFFER_ALL_CPUS;
+ global_trace.trace_cpu.tr = &global_trace;
+
+ INIT_LIST_HEAD(&global_trace.systems);
+ INIT_LIST_HEAD(&global_trace.events);
+ list_add(&global_trace.list, &ftrace_trace_arrays);
+
while (trace_boot_options) {
char *option;
option = strsep(&trace_boot_options, ",");
- trace_set_options(option);
+ trace_set_options(&global_trace, option);
}
+ register_snapshot_cmd();
+
return 0;
out_free_cpumask:
+ free_percpu(global_trace.trace_buffer.data);
+#ifdef CONFIG_TRACER_MAX_TRACE
+ free_percpu(global_trace.max_buffer.data);
+#endif
free_cpumask_var(tracing_cpumask);
out_free_buffer_mask:
free_cpumask_var(tracing_buffer_mask);
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 57d7e5397d56..9e014582e763 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -13,6 +13,11 @@
#include <linux/trace_seq.h>
#include <linux/ftrace_event.h>
+#ifdef CONFIG_FTRACE_SYSCALLS
+#include <asm/unistd.h> /* For NR_SYSCALLS */
+#include <asm/syscall.h> /* some archs define it here */
+#endif
+
enum trace_type {
__TRACE_FIRST_TYPE = 0,
@@ -29,6 +34,7 @@ enum trace_type {
TRACE_GRAPH_ENT,
TRACE_USER_STACK,
TRACE_BLK,
+ TRACE_BPUTS,
__TRACE_LAST_TYPE,
};
@@ -127,12 +133,21 @@ enum trace_flag_type {
#define TRACE_BUF_SIZE 1024
+struct trace_array;
+
+struct trace_cpu {
+ struct trace_array *tr;
+ struct dentry *dir;
+ int cpu;
+};
+
/*
* The CPU trace array - it consists of thousands of trace entries
* plus some other descriptor data: (for example which task started
* the trace, etc.)
*/
struct trace_array_cpu {
+ struct trace_cpu trace_cpu;
atomic_t disabled;
void *buffer_page; /* ring buffer spare */
@@ -151,20 +166,83 @@ struct trace_array_cpu {
char comm[TASK_COMM_LEN];
};
+struct tracer;
+
+struct trace_buffer {
+ struct trace_array *tr;
+ struct ring_buffer *buffer;
+ struct trace_array_cpu __percpu *data;
+ cycle_t time_start;
+ int cpu;
+};
+
/*
* The trace array - an array of per-CPU trace arrays. This is the
* highest level data structure that individual tracers deal with.
* They have on/off state as well:
*/
struct trace_array {
- struct ring_buffer *buffer;
- int cpu;
+ struct list_head list;
+ char *name;
+ struct trace_buffer trace_buffer;
+#ifdef CONFIG_TRACER_MAX_TRACE
+ /*
+ * The max_buffer is used to snapshot the trace when a maximum
+ * latency is reached, or when the user initiates a snapshot.
+ * Some tracers will use this to store a maximum trace while
+ * it continues examining live traces.
+ *
+ * The buffers for the max_buffer are set up the same as the trace_buffer
+ * When a snapshot is taken, the buffer of the max_buffer is swapped
+ * with the buffer of the trace_buffer and the buffers are reset for
+ * the trace_buffer so the tracing can continue.
+ */
+ struct trace_buffer max_buffer;
+ bool allocated_snapshot;
+#endif
int buffer_disabled;
- cycle_t time_start;
+ struct trace_cpu trace_cpu; /* place holder */
+#ifdef CONFIG_FTRACE_SYSCALLS
+ int sys_refcount_enter;
+ int sys_refcount_exit;
+ DECLARE_BITMAP(enabled_enter_syscalls, NR_syscalls);
+ DECLARE_BITMAP(enabled_exit_syscalls, NR_syscalls);
+#endif
+ int stop_count;
+ int clock_id;
+ struct tracer *current_trace;
+ unsigned int flags;
+ raw_spinlock_t start_lock;
+ struct dentry *dir;
+ struct dentry *options;
+ struct dentry *percpu_dir;
+ struct dentry *event_dir;
+ struct list_head systems;
+ struct list_head events;
struct task_struct *waiter;
- struct trace_array_cpu *data[NR_CPUS];
+ int ref;
+};
+
+enum {
+ TRACE_ARRAY_FL_GLOBAL = (1 << 0)
};
+extern struct list_head ftrace_trace_arrays;
+
+/*
+ * The global tracer (top) should be the first trace array added,
+ * but we check the flag anyway.
+ */
+static inline struct trace_array *top_trace_array(void)
+{
+ struct trace_array *tr;
+
+ tr = list_entry(ftrace_trace_arrays.prev,
+ typeof(*tr), list);
+ WARN_ON(!(tr->flags & TRACE_ARRAY_FL_GLOBAL));
+ return tr;
+}
+
#define FTRACE_CMP_TYPE(var, type) \
__builtin_types_compatible_p(typeof(var), type *)
@@ -200,6 +278,7 @@ extern void __ftrace_bad_type(void);
IF_ASSIGN(var, ent, struct userstack_entry, TRACE_USER_STACK);\
IF_ASSIGN(var, ent, struct print_entry, TRACE_PRINT); \
IF_ASSIGN(var, ent, struct bprint_entry, TRACE_BPRINT); \
+ IF_ASSIGN(var, ent, struct bputs_entry, TRACE_BPUTS); \
IF_ASSIGN(var, ent, struct trace_mmiotrace_rw, \
TRACE_MMIO_RW); \
IF_ASSIGN(var, ent, struct trace_mmiotrace_map, \
@@ -283,11 +362,16 @@ struct tracer {
enum print_line_t (*print_line)(struct trace_iterator *iter);
/* If you handled the flag setting, return 0 */
int (*set_flag)(u32 old_flags, u32 bit, int set);
+ /* Return 0 if OK with change, else return non-zero */
+ int (*flag_changed)(struct tracer *tracer,
+ u32 mask, int set);
struct tracer *next;
struct tracer_flags *flags;
bool print_max;
+ bool enabled;
+#ifdef CONFIG_TRACER_MAX_TRACE
bool use_max_tr;
- bool allocated_snapshot;
+#endif
};
@@ -423,8 +507,6 @@ static __always_inline void trace_clear_recursion(int bit)
current->trace_recursion = val;
}
-#define TRACE_PIPE_ALL_CPU -1
-
static inline struct ring_buffer_iter *
trace_buffer_iter(struct trace_iterator *iter, int cpu)
{
@@ -435,10 +517,10 @@ trace_buffer_iter(struct trace_iterator *iter, int cpu)
int tracer_init(struct tracer *t, struct trace_array *tr);
int tracing_is_enabled(void);
-void tracing_reset(struct trace_array *tr, int cpu);
-void tracing_reset_online_cpus(struct trace_array *tr);
+void tracing_reset(struct trace_buffer *buf, int cpu);
+void tracing_reset_online_cpus(struct trace_buffer *buf);
void tracing_reset_current(int cpu);
-void tracing_reset_current_online_cpus(void);
+void tracing_reset_all_online_cpus(void);
int tracing_open_generic(struct inode *inode, struct file *filp);
struct dentry *trace_create_file(const char *name,
umode_t mode,
@@ -446,6 +528,7 @@ struct dentry *trace_create_file(const char *name,
void *data,
const struct file_operations *fops);
+struct dentry *tracing_init_dentry_tr(struct trace_array *tr);
struct dentry *tracing_init_dentry(void);
struct ring_buffer_event;
@@ -579,7 +662,7 @@ extern int DYN_FTRACE_TEST_NAME(void);
#define DYN_FTRACE_TEST_NAME2 trace_selftest_dynamic_test_func2
extern int DYN_FTRACE_TEST_NAME2(void);
-extern int ring_buffer_expanded;
+extern bool ring_buffer_expanded;
extern bool tracing_selftest_disabled;
DECLARE_PER_CPU(int, ftrace_cpu_disabled);
@@ -615,6 +698,8 @@ trace_array_vprintk(struct trace_array *tr,
unsigned long ip, const char *fmt, va_list args);
int trace_array_printk(struct trace_array *tr,
unsigned long ip, const char *fmt, ...);
+int trace_array_printk_buf(struct ring_buffer *buffer,
+ unsigned long ip, const char *fmt, ...);
void trace_printk_seq(struct trace_seq *s);
enum print_line_t print_trace_line(struct trace_iterator *iter);
@@ -782,6 +867,7 @@ enum trace_iterator_flags {
TRACE_ITER_STOP_ON_FREE = 0x400000,
TRACE_ITER_IRQ_INFO = 0x800000,
TRACE_ITER_MARKERS = 0x1000000,
+ TRACE_ITER_FUNCTION = 0x2000000,
};
/*
@@ -828,8 +914,8 @@ enum {
struct ftrace_event_field {
struct list_head link;
- char *name;
- char *type;
+ const char *name;
+ const char *type;
int filter_type;
int offset;
int size;
@@ -847,12 +933,19 @@ struct event_filter {
struct event_subsystem {
struct list_head list;
const char *name;
- struct dentry *entry;
struct event_filter *filter;
- int nr_events;
int ref_count;
};
+struct ftrace_subsystem_dir {
+ struct list_head list;
+ struct event_subsystem *subsystem;
+ struct trace_array *tr;
+ struct dentry *entry;
+ int ref_count;
+ int nr_events;
+};
+
#define FILTER_PRED_INVALID ((unsigned short)-1)
#define FILTER_PRED_IS_RIGHT (1 << 15)
#define FILTER_PRED_FOLD (1 << 15)
@@ -902,22 +995,20 @@ struct filter_pred {
unsigned short right;
};
-extern struct list_head ftrace_common_fields;
-
extern enum regex_type
filter_parse_regex(char *buff, int len, char **search, int *not);
extern void print_event_filter(struct ftrace_event_call *call,
struct trace_seq *s);
extern int apply_event_filter(struct ftrace_event_call *call,
char *filter_string);
-extern int apply_subsystem_event_filter(struct event_subsystem *system,
+extern int apply_subsystem_event_filter(struct ftrace_subsystem_dir *dir,
char *filter_string);
extern void print_subsystem_event_filter(struct event_subsystem *system,
struct trace_seq *s);
extern int filter_assign_type(const char *type);
-struct list_head *
-trace_get_fields(struct ftrace_event_call *event_call);
+struct ftrace_event_field *
+trace_find_event_field(struct ftrace_event_call *call, char *name);
static inline int
filter_check_discard(struct ftrace_event_call *call, void *rec,
@@ -934,6 +1025,8 @@ filter_check_discard(struct ftrace_event_call *call, void *rec,
}
extern void trace_event_enable_cmd_record(bool enable);
+extern int event_trace_add_tracer(struct dentry *parent, struct trace_array *tr);
+extern int event_trace_del_tracer(struct trace_array *tr);
extern struct mutex event_mutex;
extern struct list_head ftrace_events;
@@ -943,6 +1036,19 @@ extern const char *__stop___trace_bprintk_fmt[];
void trace_printk_init_buffers(void);
void trace_printk_start_comm(void);
+int trace_keep_overwrite(struct tracer *tracer, u32 mask, int set);
+int set_tracer_flag(struct trace_array *tr, unsigned int mask, int enabled);
+
+/*
+ * Normal trace_printk() and friends allocates special buffers
+ * to do the manipulation, as well as saves the print formats
+ * into sections to display. But the trace infrastructure wants
+ * to use these without the added overhead at the price of being
+ * a bit slower (used mainly for warnings, where we don't care
+ * about performance). The internal_trace_puts() is for such
+ * a purpose.
+ */
+#define internal_trace_puts(str) __trace_puts(_THIS_IP_, str, strlen(str))
#undef FTRACE_ENTRY
#define FTRACE_ENTRY(call, struct_name, id, tstruct, print, filter) \
diff --git a/kernel/trace/trace_branch.c b/kernel/trace/trace_branch.c
index 95e96842ed29..d594da0dc03c 100644
--- a/kernel/trace/trace_branch.c
+++ b/kernel/trace/trace_branch.c
@@ -32,6 +32,7 @@ probe_likely_condition(struct ftrace_branch_data *f, int val, int expect)
{
struct ftrace_event_call *call = &event_branch;
struct trace_array *tr = branch_tracer;
+ struct trace_array_cpu *data;
struct ring_buffer_event *event;
struct trace_branch *entry;
struct ring_buffer *buffer;
@@ -51,11 +52,12 @@ probe_likely_condition(struct ftrace_branch_data *f, int val, int expect)
local_irq_save(flags);
cpu = raw_smp_processor_id();
- if (atomic_inc_return(&tr->data[cpu]->disabled) != 1)
+ data = per_cpu_ptr(tr->trace_buffer.data, cpu);
+ if (atomic_inc_return(&data->disabled) != 1)
goto out;
pc = preempt_count();
- buffer = tr->buffer;
+ buffer = tr->trace_buffer.buffer;
event = trace_buffer_lock_reserve(buffer, TRACE_BRANCH,
sizeof(*entry), flags, pc);
if (!event)
@@ -80,7 +82,7 @@ probe_likely_condition(struct ftrace_branch_data *f, int val, int expect)
__buffer_unlock_commit(buffer, event);
out:
- atomic_dec(&tr->data[cpu]->disabled);
+ atomic_dec(&data->disabled);
local_irq_restore(flags);
}
diff --git a/kernel/trace/trace_clock.c b/kernel/trace/trace_clock.c
index aa8f5f48dae6..26dc348332b7 100644
--- a/kernel/trace/trace_clock.c
+++ b/kernel/trace/trace_clock.c
@@ -57,6 +57,16 @@ u64 notrace trace_clock(void)
return local_clock();
}
+/*
+ * trace_jiffy_clock(): Simply use jiffies as a clock counter.
+ */
+u64 notrace trace_clock_jiffies(void)
+{
+ u64 jiffy = jiffies - INITIAL_JIFFIES;
+
+ /* Return nsecs */
+ return (u64)jiffies_to_usecs(jiffy) * 1000ULL;
+}
/*
* trace_clock_global(): special globally coherent trace clock
diff --git a/kernel/trace/trace_entries.h b/kernel/trace/trace_entries.h
index 4108e1250ca2..e2d027ac66a2 100644
--- a/kernel/trace/trace_entries.h
+++ b/kernel/trace/trace_entries.h
@@ -223,8 +223,8 @@ FTRACE_ENTRY(bprint, bprint_entry,
__dynamic_array( u32, buf )
),
- F_printk("%08lx fmt:%p",
- __entry->ip, __entry->fmt),
+ F_printk("%pf: %s",
+ (void *)__entry->ip, __entry->fmt),
FILTER_OTHER
);
@@ -238,8 +238,23 @@ FTRACE_ENTRY(print, print_entry,
__dynamic_array( char, buf )
),
- F_printk("%08lx %s",
- __entry->ip, __entry->buf),
+ F_printk("%pf: %s",
+ (void *)__entry->ip, __entry->buf),
+
+ FILTER_OTHER
+);
+
+FTRACE_ENTRY(bputs, bputs_entry,
+
+ TRACE_BPUTS,
+
+ F_STRUCT(
+ __field( unsigned long, ip )
+ __field( const char *, str )
+ ),
+
+ F_printk("%pf: %s",
+ (void *)__entry->ip, __entry->str),
FILTER_OTHER
);
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index 57e9b284250c..53582e982e51 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -34,9 +34,27 @@ char event_storage[EVENT_STORAGE_SIZE];
EXPORT_SYMBOL_GPL(event_storage);
LIST_HEAD(ftrace_events);
-LIST_HEAD(ftrace_common_fields);
+static LIST_HEAD(ftrace_common_fields);
-struct list_head *
+#define GFP_TRACE (GFP_KERNEL | __GFP_ZERO)
+
+static struct kmem_cache *field_cachep;
+static struct kmem_cache *file_cachep;
+
+/* Double loops, do not use break, only goto's work */
+#define do_for_each_event_file(tr, file) \
+ list_for_each_entry(tr, &ftrace_trace_arrays, list) { \
+ list_for_each_entry(file, &tr->events, list)
+
+#define do_for_each_event_file_safe(tr, file) \
+ list_for_each_entry(tr, &ftrace_trace_arrays, list) { \
+ struct ftrace_event_file *___n; \
+ list_for_each_entry_safe(file, ___n, &tr->events, list)
+
+#define while_for_each_event_file() \
+ }
+
+static struct list_head *
trace_get_fields(struct ftrace_event_call *event_call)
{
if (!event_call->class->get_fields)
@@ -44,23 +62,45 @@ trace_get_fields(struct ftrace_event_call *event_call)
return event_call->class->get_fields(event_call);
}
+static struct ftrace_event_field *
+__find_event_field(struct list_head *head, char *name)
+{
+ struct ftrace_event_field *field;
+
+ list_for_each_entry(field, head, link) {
+ if (!strcmp(field->name, name))
+ return field;
+ }
+
+ return NULL;
+}
+
+struct ftrace_event_field *
+trace_find_event_field(struct ftrace_event_call *call, char *name)
+{
+ struct ftrace_event_field *field;
+ struct list_head *head;
+
+ field = __find_event_field(&ftrace_common_fields, name);
+ if (field)
+ return field;
+
+ head = trace_get_fields(call);
+ return __find_event_field(head, name);
+}
+
static int __trace_define_field(struct list_head *head, const char *type,
const char *name, int offset, int size,
int is_signed, int filter_type)
{
struct ftrace_event_field *field;
- field = kzalloc(sizeof(*field), GFP_KERNEL);
+ field = kmem_cache_alloc(field_cachep, GFP_TRACE);
if (!field)
goto err;
- field->name = kstrdup(name, GFP_KERNEL);
- if (!field->name)
- goto err;
-
- field->type = kstrdup(type, GFP_KERNEL);
- if (!field->type)
- goto err;
+ field->name = name;
+ field->type = type;
if (filter_type == FILTER_OTHER)
field->filter_type = filter_assign_type(type);
@@ -76,9 +116,7 @@ static int __trace_define_field(struct list_head *head, const char *type,
return 0;
err:
- if (field)
- kfree(field->name);
- kfree(field);
+ kmem_cache_free(field_cachep, field);
return -ENOMEM;
}
@@ -120,7 +158,7 @@ static int trace_define_common_fields(void)
return ret;
}
-void trace_destroy_fields(struct ftrace_event_call *call)
+static void trace_destroy_fields(struct ftrace_event_call *call)
{
struct ftrace_event_field *field, *next;
struct list_head *head;
@@ -128,9 +166,7 @@ void trace_destroy_fields(struct ftrace_event_call *call)
head = trace_get_fields(call);
list_for_each_entry_safe(field, next, head, link) {
list_del(&field->link);
- kfree(field->type);
- kfree(field->name);
- kfree(field);
+ kmem_cache_free(field_cachep, field);
}
}
@@ -149,15 +185,17 @@ EXPORT_SYMBOL_GPL(trace_event_raw_init);
int ftrace_event_reg(struct ftrace_event_call *call,
enum trace_reg type, void *data)
{
+ struct ftrace_event_file *file = data;
+
switch (type) {
case TRACE_REG_REGISTER:
return tracepoint_probe_register(call->name,
call->class->probe,
- call);
+ file);
case TRACE_REG_UNREGISTER:
tracepoint_probe_unregister(call->name,
call->class->probe,
- call);
+ file);
return 0;
#ifdef CONFIG_PERF_EVENTS
@@ -183,54 +221,100 @@ EXPORT_SYMBOL_GPL(ftrace_event_reg);
void trace_event_enable_cmd_record(bool enable)
{
- struct ftrace_event_call *call;
+ struct ftrace_event_file *file;
+ struct trace_array *tr;
mutex_lock(&event_mutex);
- list_for_each_entry(call, &ftrace_events, list) {
- if (!(call->flags & TRACE_EVENT_FL_ENABLED))
+ do_for_each_event_file(tr, file) {
+
+ if (!(file->flags & FTRACE_EVENT_FL_ENABLED))
continue;
if (enable) {
tracing_start_cmdline_record();
- call->flags |= TRACE_EVENT_FL_RECORDED_CMD;
+ set_bit(FTRACE_EVENT_FL_RECORDED_CMD_BIT, &file->flags);
} else {
tracing_stop_cmdline_record();
- call->flags &= ~TRACE_EVENT_FL_RECORDED_CMD;
+ clear_bit(FTRACE_EVENT_FL_RECORDED_CMD_BIT, &file->flags);
}
- }
+ } while_for_each_event_file();
mutex_unlock(&event_mutex);
}
-static int ftrace_event_enable_disable(struct ftrace_event_call *call,
- int enable)
+static int __ftrace_event_enable_disable(struct ftrace_event_file *file,
+ int enable, int soft_disable)
{
+ struct ftrace_event_call *call = file->event_call;
int ret = 0;
+ int disable;
switch (enable) {
case 0:
- if (call->flags & TRACE_EVENT_FL_ENABLED) {
- call->flags &= ~TRACE_EVENT_FL_ENABLED;
- if (call->flags & TRACE_EVENT_FL_RECORDED_CMD) {
+ /*
+ * When soft_disable is set and enable is cleared, we want
+ * to clear the SOFT_DISABLED flag but leave the event in the
+ * state that it was. That is, if the event was enabled and
+ * SOFT_DISABLED isn't set, then do nothing. But if SOFT_DISABLED
+ * is set we do not want the event to be enabled before we
+ * clear the bit.
+ *
+ * When soft_disable is not set but the SOFT_MODE flag is,
+ * we do nothing. Do not disable the tracepoint, otherwise
+ * "soft enable"s (clearing the SOFT_DISABLED bit) wont work.
+ */
+ if (soft_disable) {
+ disable = file->flags & FTRACE_EVENT_FL_SOFT_DISABLED;
+ clear_bit(FTRACE_EVENT_FL_SOFT_MODE_BIT, &file->flags);
+ } else
+ disable = !(file->flags & FTRACE_EVENT_FL_SOFT_MODE);
+
+ if (disable && (file->flags & FTRACE_EVENT_FL_ENABLED)) {
+ clear_bit(FTRACE_EVENT_FL_ENABLED_BIT, &file->flags);
+ if (file->flags & FTRACE_EVENT_FL_RECORDED_CMD) {
tracing_stop_cmdline_record();
- call->flags &= ~TRACE_EVENT_FL_RECORDED_CMD;
+ clear_bit(FTRACE_EVENT_FL_RECORDED_CMD_BIT, &file->flags);
}
- call->class->reg(call, TRACE_REG_UNREGISTER, NULL);
+ call->class->reg(call, TRACE_REG_UNREGISTER, file);
}
+ /* If in SOFT_MODE, just set the SOFT_DISABLE_BIT */
+ if (file->flags & FTRACE_EVENT_FL_SOFT_MODE)
+ set_bit(FTRACE_EVENT_FL_SOFT_DISABLED_BIT, &file->flags);
break;
case 1:
- if (!(call->flags & TRACE_EVENT_FL_ENABLED)) {
+ /*
+ * When soft_disable is set and enable is set, we want to
+ * register the tracepoint for the event, but leave the event
+ * as is. That means, if the event was already enabled, we do
+ * nothing (but set SOFT_MODE). If the event is disabled, we
+ * set SOFT_DISABLED before enabling the event tracepoint, so
+ * it still seems to be disabled.
+ */
+ if (!soft_disable)
+ clear_bit(FTRACE_EVENT_FL_SOFT_DISABLED_BIT, &file->flags);
+ else
+ set_bit(FTRACE_EVENT_FL_SOFT_MODE_BIT, &file->flags);
+
+ if (!(file->flags & FTRACE_EVENT_FL_ENABLED)) {
+
+ /* Keep the event disabled, when going to SOFT_MODE. */
+ if (soft_disable)
+ set_bit(FTRACE_EVENT_FL_SOFT_DISABLED_BIT, &file->flags);
+
if (trace_flags & TRACE_ITER_RECORD_CMD) {
tracing_start_cmdline_record();
- call->flags |= TRACE_EVENT_FL_RECORDED_CMD;
+ set_bit(FTRACE_EVENT_FL_RECORDED_CMD_BIT, &file->flags);
}
- ret = call->class->reg(call, TRACE_REG_REGISTER, NULL);
+ ret = call->class->reg(call, TRACE_REG_REGISTER, file);
if (ret) {
tracing_stop_cmdline_record();
pr_info("event trace: Could not enable event "
"%s\n", call->name);
break;
}
- call->flags |= TRACE_EVENT_FL_ENABLED;
+ set_bit(FTRACE_EVENT_FL_ENABLED_BIT, &file->flags);
+
+ /* WAS_ENABLED gets set but never cleared. */
+ call->flags |= TRACE_EVENT_FL_WAS_ENABLED;
}
break;
}
@@ -238,13 +322,19 @@ static int ftrace_event_enable_disable(struct ftrace_event_call *call,
return ret;
}
-static void ftrace_clear_events(void)
+static int ftrace_event_enable_disable(struct ftrace_event_file *file,
+ int enable)
{
- struct ftrace_event_call *call;
+ return __ftrace_event_enable_disable(file, enable, 0);
+}
+
+static void ftrace_clear_events(struct trace_array *tr)
+{
+ struct ftrace_event_file *file;
mutex_lock(&event_mutex);
- list_for_each_entry(call, &ftrace_events, list) {
- ftrace_event_enable_disable(call, 0);
+ list_for_each_entry(file, &tr->events, list) {
+ ftrace_event_enable_disable(file, 0);
}
mutex_unlock(&event_mutex);
}
@@ -257,11 +347,12 @@ static void __put_system(struct event_subsystem *system)
if (--system->ref_count)
return;
+ list_del(&system->list);
+
if (filter) {
kfree(filter->filter_string);
kfree(filter);
}
- kfree(system->name);
kfree(system);
}
@@ -271,24 +362,45 @@ static void __get_system(struct event_subsystem *system)
system->ref_count++;
}
-static void put_system(struct event_subsystem *system)
+static void __get_system_dir(struct ftrace_subsystem_dir *dir)
+{
+ WARN_ON_ONCE(dir->ref_count == 0);
+ dir->ref_count++;
+ __get_system(dir->subsystem);
+}
+
+static void __put_system_dir(struct ftrace_subsystem_dir *dir)
+{
+ WARN_ON_ONCE(dir->ref_count == 0);
+ /* If the subsystem is about to be freed, the dir must be too */
+ WARN_ON_ONCE(dir->subsystem->ref_count == 1 && dir->ref_count != 1);
+
+ __put_system(dir->subsystem);
+ if (!--dir->ref_count)
+ kfree(dir);
+}
+
+static void put_system(struct ftrace_subsystem_dir *dir)
{
mutex_lock(&event_mutex);
- __put_system(system);
+ __put_system_dir(dir);
mutex_unlock(&event_mutex);
}
/*
* __ftrace_set_clr_event(NULL, NULL, NULL, set) will set/unset all events.
*/
-static int __ftrace_set_clr_event(const char *match, const char *sub,
- const char *event, int set)
+static int __ftrace_set_clr_event(struct trace_array *tr, const char *match,
+ const char *sub, const char *event, int set)
{
+ struct ftrace_event_file *file;
struct ftrace_event_call *call;
int ret = -EINVAL;
mutex_lock(&event_mutex);
- list_for_each_entry(call, &ftrace_events, list) {
+ list_for_each_entry(file, &tr->events, list) {
+
+ call = file->event_call;
if (!call->name || !call->class || !call->class->reg)
continue;
@@ -307,7 +419,7 @@ static int __ftrace_set_clr_event(const char *match, const char *sub,
if (event && strcmp(event, call->name) != 0)
continue;
- ftrace_event_enable_disable(call, set);
+ ftrace_event_enable_disable(file, set);
ret = 0;
}
@@ -316,7 +428,7 @@ static int __ftrace_set_clr_event(const char *match, const char *sub,
return ret;
}
-static int ftrace_set_clr_event(char *buf, int set)
+static int ftrace_set_clr_event(struct trace_array *tr, char *buf, int set)
{
char *event = NULL, *sub = NULL, *match;
@@ -344,7 +456,7 @@ static int ftrace_set_clr_event(char *buf, int set)
event = NULL;
}
- return __ftrace_set_clr_event(match, sub, event, set);
+ return __ftrace_set_clr_event(tr, match, sub, event, set);
}
/**
@@ -361,7 +473,9 @@ static int ftrace_set_clr_event(char *buf, int set)
*/
int trace_set_clr_event(const char *system, const char *event, int set)
{
- return __ftrace_set_clr_event(NULL, system, event, set);
+ struct trace_array *tr = top_trace_array();
+
+ return __ftrace_set_clr_event(tr, NULL, system, event, set);
}
EXPORT_SYMBOL_GPL(trace_set_clr_event);
@@ -373,6 +487,8 @@ ftrace_event_write(struct file *file, const char __user *ubuf,
size_t cnt, loff_t *ppos)
{
struct trace_parser parser;
+ struct seq_file *m = file->private_data;
+ struct trace_array *tr = m->private;
ssize_t read, ret;
if (!cnt)
@@ -395,7 +511,7 @@ ftrace_event_write(struct file *file, const char __user *ubuf,
parser.buffer[parser.idx] = 0;
- ret = ftrace_set_clr_event(parser.buffer + !set, set);
+ ret = ftrace_set_clr_event(tr, parser.buffer + !set, set);
if (ret)
goto out_put;
}
@@ -411,17 +527,20 @@ ftrace_event_write(struct file *file, const char __user *ubuf,
static void *
t_next(struct seq_file *m, void *v, loff_t *pos)
{
- struct ftrace_event_call *call = v;
+ struct ftrace_event_file *file = v;
+ struct ftrace_event_call *call;
+ struct trace_array *tr = m->private;
(*pos)++;
- list_for_each_entry_continue(call, &ftrace_events, list) {
+ list_for_each_entry_continue(file, &tr->events, list) {
+ call = file->event_call;
/*
* The ftrace subsystem is for showing formats only.
* They can not be enabled or disabled via the event files.
*/
if (call->class && call->class->reg)
- return call;
+ return file;
}
return NULL;
@@ -429,30 +548,32 @@ t_next(struct seq_file *m, void *v, loff_t *pos)
static void *t_start(struct seq_file *m, loff_t *pos)
{
- struct ftrace_event_call *call;
+ struct ftrace_event_file *file;
+ struct trace_array *tr = m->private;
loff_t l;
mutex_lock(&event_mutex);
- call = list_entry(&ftrace_events, struct ftrace_event_call, list);
+ file = list_entry(&tr->events, struct ftrace_event_file, list);
for (l = 0; l <= *pos; ) {
- call = t_next(m, call, &l);
- if (!call)
+ file = t_next(m, file, &l);
+ if (!file)
break;
}
- return call;
+ return file;
}
static void *
s_next(struct seq_file *m, void *v, loff_t *pos)
{
- struct ftrace_event_call *call = v;
+ struct ftrace_event_file *file = v;
+ struct trace_array *tr = m->private;
(*pos)++;
- list_for_each_entry_continue(call, &ftrace_events, list) {
- if (call->flags & TRACE_EVENT_FL_ENABLED)
- return call;
+ list_for_each_entry_continue(file, &tr->events, list) {
+ if (file->flags & FTRACE_EVENT_FL_ENABLED)
+ return file;
}
return NULL;
@@ -460,23 +581,25 @@ s_next(struct seq_file *m, void *v, loff_t *pos)
static void *s_start(struct seq_file *m, loff_t *pos)
{
- struct ftrace_event_call *call;
+ struct ftrace_event_file *file;
+ struct trace_array *tr = m->private;
loff_t l;
mutex_lock(&event_mutex);
- call = list_entry(&ftrace_events, struct ftrace_event_call, list);
+ file = list_entry(&tr->events, struct ftrace_event_file, list);
for (l = 0; l <= *pos; ) {
- call = s_next(m, call, &l);
- if (!call)
+ file = s_next(m, file, &l);
+ if (!file)
break;
}
- return call;
+ return file;
}
static int t_show(struct seq_file *m, void *v)
{
- struct ftrace_event_call *call = v;
+ struct ftrace_event_file *file = v;
+ struct ftrace_event_call *call = file->event_call;
if (strcmp(call->class->system, TRACE_SYSTEM) != 0)
seq_printf(m, "%s:", call->class->system);
@@ -494,25 +617,31 @@ static ssize_t
event_enable_read(struct file *filp, char __user *ubuf, size_t cnt,
loff_t *ppos)
{
- struct ftrace_event_call *call = filp->private_data;
+ struct ftrace_event_file *file = filp->private_data;
char *buf;
- if (call->flags & TRACE_EVENT_FL_ENABLED)
- buf = "1\n";
- else
+ if (file->flags & FTRACE_EVENT_FL_ENABLED) {
+ if (file->flags & FTRACE_EVENT_FL_SOFT_DISABLED)
+ buf = "0*\n";
+ else
+ buf = "1\n";
+ } else
buf = "0\n";
- return simple_read_from_buffer(ubuf, cnt, ppos, buf, 2);
+ return simple_read_from_buffer(ubuf, cnt, ppos, buf, strlen(buf));
}
static ssize_t
event_enable_write(struct file *filp, const char __user *ubuf, size_t cnt,
loff_t *ppos)
{
- struct ftrace_event_call *call = filp->private_data;
+ struct ftrace_event_file *file = filp->private_data;
unsigned long val;
int ret;
+ if (!file)
+ return -EINVAL;
+
ret = kstrtoul_from_user(ubuf, cnt, 10, &val);
if (ret)
return ret;
@@ -525,7 +654,7 @@ event_enable_write(struct file *filp, const char __user *ubuf, size_t cnt,
case 0:
case 1:
mutex_lock(&event_mutex);
- ret = ftrace_event_enable_disable(call, val);
+ ret = ftrace_event_enable_disable(file, val);
mutex_unlock(&event_mutex);
break;
@@ -543,14 +672,18 @@ system_enable_read(struct file *filp, char __user *ubuf, size_t cnt,
loff_t *ppos)
{
const char set_to_char[4] = { '?', '0', '1', 'X' };
- struct event_subsystem *system = filp->private_data;
+ struct ftrace_subsystem_dir *dir = filp->private_data;
+ struct event_subsystem *system = dir->subsystem;
struct ftrace_event_call *call;
+ struct ftrace_event_file *file;
+ struct trace_array *tr = dir->tr;
char buf[2];
int set = 0;
int ret;
mutex_lock(&event_mutex);
- list_for_each_entry(call, &ftrace_events, list) {
+ list_for_each_entry(file, &tr->events, list) {
+ call = file->event_call;
if (!call->name || !call->class || !call->class->reg)
continue;
@@ -562,7 +695,7 @@ system_enable_read(struct file *filp, char __user *ubuf, size_t cnt,
* or if all events or cleared, or if we have
* a mixture.
*/
- set |= (1 << !!(call->flags & TRACE_EVENT_FL_ENABLED));
+ set |= (1 << !!(file->flags & FTRACE_EVENT_FL_ENABLED));
/*
* If we have a mixture, no need to look further.
@@ -584,7 +717,8 @@ static ssize_t
system_enable_write(struct file *filp, const char __user *ubuf, size_t cnt,
loff_t *ppos)
{
- struct event_subsystem *system = filp->private_data;
+ struct ftrace_subsystem_dir *dir = filp->private_data;
+ struct event_subsystem *system = dir->subsystem;
const char *name = NULL;
unsigned long val;
ssize_t ret;
@@ -607,7 +741,7 @@ system_enable_write(struct file *filp, const char __user *ubuf, size_t cnt,
if (system)
name = system->name;
- ret = __ftrace_set_clr_event(NULL, name, NULL, val);
+ ret = __ftrace_set_clr_event(dir->tr, NULL, name, NULL, val);
if (ret)
goto out;
@@ -845,43 +979,75 @@ static LIST_HEAD(event_subsystems);
static int subsystem_open(struct inode *inode, struct file *filp)
{
struct event_subsystem *system = NULL;
+ struct ftrace_subsystem_dir *dir = NULL; /* Initialize for gcc */
+ struct trace_array *tr;
int ret;
- if (!inode->i_private)
- goto skip_search;
-
/* Make sure the system still exists */
mutex_lock(&event_mutex);
- list_for_each_entry(system, &event_subsystems, list) {
- if (system == inode->i_private) {
- /* Don't open systems with no events */
- if (!system->nr_events) {
- system = NULL;
- break;
+ list_for_each_entry(tr, &ftrace_trace_arrays, list) {
+ list_for_each_entry(dir, &tr->systems, list) {
+ if (dir == inode->i_private) {
+ /* Don't open systems with no events */
+ if (dir->nr_events) {
+ __get_system_dir(dir);
+ system = dir->subsystem;
+ }
+ goto exit_loop;
}
- __get_system(system);
- break;
}
}
+ exit_loop:
mutex_unlock(&event_mutex);
- if (system != inode->i_private)
+ if (!system)
return -ENODEV;
- skip_search:
+ /* Some versions of gcc think dir can be uninitialized here */
+ WARN_ON(!dir);
+
ret = tracing_open_generic(inode, filp);
- if (ret < 0 && system)
- put_system(system);
+ if (ret < 0)
+ put_system(dir);
+
+ return ret;
+}
+
+static int system_tr_open(struct inode *inode, struct file *filp)
+{
+ struct ftrace_subsystem_dir *dir;
+ struct trace_array *tr = inode->i_private;
+ int ret;
+
+ /* Make a temporary dir that has no system but points to tr */
+ dir = kzalloc(sizeof(*dir), GFP_KERNEL);
+ if (!dir)
+ return -ENOMEM;
+
+ dir->tr = tr;
+
+ ret = tracing_open_generic(inode, filp);
+ if (ret < 0)
+ kfree(dir);
+
+ filp->private_data = dir;
return ret;
}
static int subsystem_release(struct inode *inode, struct file *file)
{
- struct event_subsystem *system = inode->i_private;
+ struct ftrace_subsystem_dir *dir = file->private_data;
- if (system)
- put_system(system);
+ /*
+ * If dir->subsystem is NULL, then this is a temporary
+ * descriptor that was made for a trace_array to enable
+ * all subsystems.
+ */
+ if (dir->subsystem)
+ put_system(dir);
+ else
+ kfree(dir);
return 0;
}
@@ -890,7 +1056,8 @@ static ssize_t
subsystem_filter_read(struct file *filp, char __user *ubuf, size_t cnt,
loff_t *ppos)
{
- struct event_subsystem *system = filp->private_data;
+ struct ftrace_subsystem_dir *dir = filp->private_data;
+ struct event_subsystem *system = dir->subsystem;
struct trace_seq *s;
int r;
@@ -915,7 +1082,7 @@ static ssize_t
subsystem_filter_write(struct file *filp, const char __user *ubuf, size_t cnt,
loff_t *ppos)
{
- struct event_subsystem *system = filp->private_data;
+ struct ftrace_subsystem_dir *dir = filp->private_data;
char *buf;
int err;
@@ -932,7 +1099,7 @@ subsystem_filter_write(struct file *filp, const char __user *ubuf, size_t cnt,
}
buf[cnt] = '\0';
- err = apply_subsystem_event_filter(system, buf);
+ err = apply_subsystem_event_filter(dir, buf);
free_page((unsigned long) buf);
if (err < 0)
return err;
@@ -1041,30 +1208,35 @@ static const struct file_operations ftrace_system_enable_fops = {
.release = subsystem_release,
};
+static const struct file_operations ftrace_tr_enable_fops = {
+ .open = system_tr_open,
+ .read = system_enable_read,
+ .write = system_enable_write,
+ .llseek = default_llseek,
+ .release = subsystem_release,
+};
+
static const struct file_operations ftrace_show_header_fops = {
.open = tracing_open_generic,
.read = show_header,
.llseek = default_llseek,
};
-static struct dentry *event_trace_events_dir(void)
+static int
+ftrace_event_open(struct inode *inode, struct file *file,
+ const struct seq_operations *seq_ops)
{
- static struct dentry *d_tracer;
- static struct dentry *d_events;
-
- if (d_events)
- return d_events;
-
- d_tracer = tracing_init_dentry();
- if (!d_tracer)
- return NULL;
+ struct seq_file *m;
+ int ret;
- d_events = debugfs_create_dir("events", d_tracer);
- if (!d_events)
- pr_warning("Could not create debugfs "
- "'events' directory\n");
+ ret = seq_open(file, seq_ops);
+ if (ret < 0)
+ return ret;
+ m = file->private_data;
+ /* copy tr over to seq ops */
+ m->private = inode->i_private;
- return d_events;
+ return ret;
}
static int
@@ -1072,117 +1244,165 @@ ftrace_event_avail_open(struct inode *inode, struct file *file)
{
const struct seq_operations *seq_ops = &show_event_seq_ops;
- return seq_open(file, seq_ops);
+ return ftrace_event_open(inode, file, seq_ops);
}
static int
ftrace_event_set_open(struct inode *inode, struct file *file)
{
const struct seq_operations *seq_ops = &show_set_event_seq_ops;
+ struct trace_array *tr = inode->i_private;
if ((file->f_mode & FMODE_WRITE) &&
(file->f_flags & O_TRUNC))
- ftrace_clear_events();
+ ftrace_clear_events(tr);
- return seq_open(file, seq_ops);
+ return ftrace_event_open(inode, file, seq_ops);
+}
+
+static struct event_subsystem *
+create_new_subsystem(const char *name)
+{
+ struct event_subsystem *system;
+
+ /* need to create new entry */
+ system = kmalloc(sizeof(*system), GFP_KERNEL);
+ if (!system)
+ return NULL;
+
+ system->ref_count = 1;
+ system->name = name;
+
+ system->filter = NULL;
+
+ system->filter = kzalloc(sizeof(struct event_filter), GFP_KERNEL);
+ if (!system->filter)
+ goto out_free;
+
+ list_add(&system->list, &event_subsystems);
+
+ return system;
+
+ out_free:
+ kfree(system);
+ return NULL;
}
static struct dentry *
-event_subsystem_dir(const char *name, struct dentry *d_events)
+event_subsystem_dir(struct trace_array *tr, const char *name,
+ struct ftrace_event_file *file, struct dentry *parent)
{
+ struct ftrace_subsystem_dir *dir;
struct event_subsystem *system;
struct dentry *entry;
/* First see if we did not already create this dir */
- list_for_each_entry(system, &event_subsystems, list) {
+ list_for_each_entry(dir, &tr->systems, list) {
+ system = dir->subsystem;
if (strcmp(system->name, name) == 0) {
- system->nr_events++;
- return system->entry;
+ dir->nr_events++;
+ file->system = dir;
+ return dir->entry;
}
}
- /* need to create new entry */
- system = kmalloc(sizeof(*system), GFP_KERNEL);
- if (!system) {
- pr_warning("No memory to create event subsystem %s\n",
- name);
- return d_events;
+ /* Now see if the system itself exists. */
+ list_for_each_entry(system, &event_subsystems, list) {
+ if (strcmp(system->name, name) == 0)
+ break;
}
+ /* Reset system variable when not found */
+ if (&system->list == &event_subsystems)
+ system = NULL;
- system->entry = debugfs_create_dir(name, d_events);
- if (!system->entry) {
- pr_warning("Could not create event subsystem %s\n",
- name);
- kfree(system);
- return d_events;
- }
+ dir = kmalloc(sizeof(*dir), GFP_KERNEL);
+ if (!dir)
+ goto out_fail;
- system->nr_events = 1;
- system->ref_count = 1;
- system->name = kstrdup(name, GFP_KERNEL);
- if (!system->name) {
- debugfs_remove(system->entry);
- kfree(system);
- return d_events;
+ if (!system) {
+ system = create_new_subsystem(name);
+ if (!system)
+ goto out_free;
+ } else
+ __get_system(system);
+
+ dir->entry = debugfs_create_dir(name, parent);
+ if (!dir->entry) {
+ pr_warning("Failed to create system directory %s\n", name);
+ __put_system(system);
+ goto out_free;
}
- list_add(&system->list, &event_subsystems);
-
- system->filter = NULL;
-
- system->filter = kzalloc(sizeof(struct event_filter), GFP_KERNEL);
- if (!system->filter) {
- pr_warning("Could not allocate filter for subsystem "
- "'%s'\n", name);
- return system->entry;
- }
+ dir->tr = tr;
+ dir->ref_count = 1;
+ dir->nr_events = 1;
+ dir->subsystem = system;
+ file->system = dir;
- entry = debugfs_create_file("filter", 0644, system->entry, system,
+ entry = debugfs_create_file("filter", 0644, dir->entry, dir,
&ftrace_subsystem_filter_fops);
if (!entry) {
kfree(system->filter);
system->filter = NULL;
- pr_warning("Could not create debugfs "
- "'%s/filter' entry\n", name);
+ pr_warning("Could not create debugfs '%s/filter' entry\n", name);
}
- trace_create_file("enable", 0644, system->entry, system,
+ trace_create_file("enable", 0644, dir->entry, dir,
&ftrace_system_enable_fops);
- return system->entry;
+ list_add(&dir->list, &tr->systems);
+
+ return dir->entry;
+
+ out_free:
+ kfree(dir);
+ out_fail:
+ /* Only print this message if failed on memory allocation */
+ if (!dir || !system)
+ pr_warning("No memory to create event subsystem %s\n",
+ name);
+ return NULL;
}
static int
-event_create_dir(struct ftrace_event_call *call, struct dentry *d_events,
+event_create_dir(struct dentry *parent,
+ struct ftrace_event_file *file,
const struct file_operations *id,
const struct file_operations *enable,
const struct file_operations *filter,
const struct file_operations *format)
{
+ struct ftrace_event_call *call = file->event_call;
+ struct trace_array *tr = file->tr;
struct list_head *head;
+ struct dentry *d_events;
int ret;
/*
* If the trace point header did not define TRACE_SYSTEM
* then the system would be called "TRACE_SYSTEM".
*/
- if (strcmp(call->class->system, TRACE_SYSTEM) != 0)
- d_events = event_subsystem_dir(call->class->system, d_events);
-
- call->dir = debugfs_create_dir(call->name, d_events);
- if (!call->dir) {
- pr_warning("Could not create debugfs "
- "'%s' directory\n", call->name);
+ if (strcmp(call->class->system, TRACE_SYSTEM) != 0) {
+ d_events = event_subsystem_dir(tr, call->class->system, file, parent);
+ if (!d_events)
+ return -ENOMEM;
+ } else
+ d_events = parent;
+
+ file->dir = debugfs_create_dir(call->name, d_events);
+ if (!file->dir) {
+ pr_warning("Could not create debugfs '%s' directory\n",
+ call->name);
return -1;
}
if (call->class->reg && !(call->flags & TRACE_EVENT_FL_IGNORE_ENABLE))
- trace_create_file("enable", 0644, call->dir, call,
+ trace_create_file("enable", 0644, file->dir, file,
enable);
#ifdef CONFIG_PERF_EVENTS
if (call->event.type && call->class->reg)
- trace_create_file("id", 0444, call->dir, call,
+ trace_create_file("id", 0444, file->dir, call,
id);
#endif
@@ -1196,23 +1416,76 @@ event_create_dir(struct ftrace_event_call *call, struct dentry *d_events,
if (ret < 0) {
pr_warning("Could not initialize trace point"
" events/%s\n", call->name);
- return ret;
+ return -1;
}
}
- trace_create_file("filter", 0644, call->dir, call,
+ trace_create_file("filter", 0644, file->dir, call,
filter);
- trace_create_file("format", 0444, call->dir, call,
+ trace_create_file("format", 0444, file->dir, call,
format);
return 0;
}
+static void remove_subsystem(struct ftrace_subsystem_dir *dir)
+{
+ if (!dir)
+ return;
+
+ if (!--dir->nr_events) {
+ debugfs_remove_recursive(dir->entry);
+ list_del(&dir->list);
+ __put_system_dir(dir);
+ }
+}
+
+static void remove_event_from_tracers(struct ftrace_event_call *call)
+{
+ struct ftrace_event_file *file;
+ struct trace_array *tr;
+
+ do_for_each_event_file_safe(tr, file) {
+
+ if (file->event_call != call)
+ continue;
+
+ list_del(&file->list);
+ debugfs_remove_recursive(file->dir);
+ remove_subsystem(file->system);
+ kmem_cache_free(file_cachep, file);
+
+ /*
+ * The do_for_each_event_file_safe() is
+ * a double loop. After finding the call for this
+ * trace_array, we use break to jump to the next
+ * trace_array.
+ */
+ break;
+ } while_for_each_event_file();
+}
+
static void event_remove(struct ftrace_event_call *call)
{
- ftrace_event_enable_disable(call, 0);
+ struct trace_array *tr;
+ struct ftrace_event_file *file;
+
+ do_for_each_event_file(tr, file) {
+ if (file->event_call != call)
+ continue;
+ ftrace_event_enable_disable(file, 0);
+ /*
+ * The do_for_each_event_file() is
+ * a double loop. After finding the call for this
+ * trace_array, we use break to jump to the next
+ * trace_array.
+ */
+ break;
+ } while_for_each_event_file();
+
if (call->event.funcs)
__unregister_ftrace_event(&call->event);
+ remove_event_from_tracers(call);
list_del(&call->list);
}
@@ -1234,82 +1507,99 @@ static int event_init(struct ftrace_event_call *call)
}
static int
-__trace_add_event_call(struct ftrace_event_call *call, struct module *mod,
- const struct file_operations *id,
- const struct file_operations *enable,
- const struct file_operations *filter,
- const struct file_operations *format)
+__register_event(struct ftrace_event_call *call, struct module *mod)
{
- struct dentry *d_events;
int ret;
ret = event_init(call);
if (ret < 0)
return ret;
- d_events = event_trace_events_dir();
- if (!d_events)
- return -ENOENT;
-
- ret = event_create_dir(call, d_events, id, enable, filter, format);
- if (!ret)
- list_add(&call->list, &ftrace_events);
+ list_add(&call->list, &ftrace_events);
call->mod = mod;
- return ret;
+ return 0;
+}
+
+/* Add an event to a trace directory */
+static int
+__trace_add_new_event(struct ftrace_event_call *call,
+ struct trace_array *tr,
+ const struct file_operations *id,
+ const struct file_operations *enable,
+ const struct file_operations *filter,
+ const struct file_operations *format)
+{
+ struct ftrace_event_file *file;
+
+ file = kmem_cache_alloc(file_cachep, GFP_TRACE);
+ if (!file)
+ return -ENOMEM;
+
+ file->event_call = call;
+ file->tr = tr;
+ list_add(&file->list, &tr->events);
+
+ return event_create_dir(tr->event_dir, file, id, enable, filter, format);
}
+/*
+ * Just create a decriptor for early init. A descriptor is required
+ * for enabling events at boot. We want to enable events before
+ * the filesystem is initialized.
+ */
+static __init int
+__trace_early_add_new_event(struct ftrace_event_call *call,
+ struct trace_array *tr)
+{
+ struct ftrace_event_file *file;
+
+ file = kmem_cache_alloc(file_cachep, GFP_TRACE);
+ if (!file)
+ return -ENOMEM;
+
+ file->event_call = call;
+ file->tr = tr;
+ list_add(&file->list, &tr->events);
+
+ return 0;
+}
+
+struct ftrace_module_file_ops;
+static void __add_event_to_tracers(struct ftrace_event_call *call,
+ struct ftrace_module_file_ops *file_ops);
+
/* Add an additional event_call dynamically */
int trace_add_event_call(struct ftrace_event_call *call)
{
int ret;
mutex_lock(&event_mutex);
- ret = __trace_add_event_call(call, NULL, &ftrace_event_id_fops,
- &ftrace_enable_fops,
- &ftrace_event_filter_fops,
- &ftrace_event_format_fops);
- mutex_unlock(&event_mutex);
- return ret;
-}
-static void remove_subsystem_dir(const char *name)
-{
- struct event_subsystem *system;
-
- if (strcmp(name, TRACE_SYSTEM) == 0)
- return;
+ ret = __register_event(call, NULL);
+ if (ret >= 0)
+ __add_event_to_tracers(call, NULL);
- list_for_each_entry(system, &event_subsystems, list) {
- if (strcmp(system->name, name) == 0) {
- if (!--system->nr_events) {
- debugfs_remove_recursive(system->entry);
- list_del(&system->list);
- __put_system(system);
- }
- break;
- }
- }
+ mutex_unlock(&event_mutex);
+ return ret;
}
/*
- * Must be called under locking both of event_mutex and trace_event_mutex.
+ * Must be called under locking both of event_mutex and trace_event_sem.
*/
static void __trace_remove_event_call(struct ftrace_event_call *call)
{
event_remove(call);
trace_destroy_fields(call);
destroy_preds(call);
- debugfs_remove_recursive(call->dir);
- remove_subsystem_dir(call->class->system);
}
/* Remove an event_call */
void trace_remove_event_call(struct ftrace_event_call *call)
{
mutex_lock(&event_mutex);
- down_write(&trace_event_mutex);
+ down_write(&trace_event_sem);
__trace_remove_event_call(call);
- up_write(&trace_event_mutex);
+ up_write(&trace_event_sem);
mutex_unlock(&event_mutex);
}
@@ -1336,6 +1626,26 @@ struct ftrace_module_file_ops {
};
static struct ftrace_module_file_ops *
+find_ftrace_file_ops(struct ftrace_module_file_ops *file_ops, struct module *mod)
+{
+ /*
+ * As event_calls are added in groups by module,
+ * when we find one file_ops, we don't need to search for
+ * each call in that module, as the rest should be the
+ * same. Only search for a new one if the last one did
+ * not match.
+ */
+ if (file_ops && mod == file_ops->mod)
+ return file_ops;
+
+ list_for_each_entry(file_ops, &ftrace_module_file_list, list) {
+ if (file_ops->mod == mod)
+ return file_ops;
+ }
+ return NULL;
+}
+
+static struct ftrace_module_file_ops *
trace_create_file_ops(struct module *mod)
{
struct ftrace_module_file_ops *file_ops;
@@ -1386,9 +1696,8 @@ static void trace_module_add_events(struct module *mod)
return;
for_each_event(call, start, end) {
- __trace_add_event_call(*call, mod,
- &file_ops->id, &file_ops->enable,
- &file_ops->filter, &file_ops->format);
+ __register_event(*call, mod);
+ __add_event_to_tracers(*call, file_ops);
}
}
@@ -1396,12 +1705,13 @@ static void trace_module_remove_events(struct module *mod)
{
struct ftrace_module_file_ops *file_ops;
struct ftrace_event_call *call, *p;
- bool found = false;
+ bool clear_trace = false;
- down_write(&trace_event_mutex);
+ down_write(&trace_event_sem);
list_for_each_entry_safe(call, p, &ftrace_events, list) {
if (call->mod == mod) {
- found = true;
+ if (call->flags & TRACE_EVENT_FL_WAS_ENABLED)
+ clear_trace = true;
__trace_remove_event_call(call);
}
}
@@ -1415,14 +1725,18 @@ static void trace_module_remove_events(struct module *mod)
list_del(&file_ops->list);
kfree(file_ops);
}
+ up_write(&trace_event_sem);
/*
* It is safest to reset the ring buffer if the module being unloaded
- * registered any events.
+ * registered any events that were used. The only worry is if
+ * a new module gets loaded, and takes on the same id as the events
+ * of this module. When printing out the buffer, traced events left
+ * over from this module may be passed to the new module events and
+ * unexpected results may occur.
*/
- if (found)
- tracing_reset_current_online_cpus();
- up_write(&trace_event_mutex);
+ if (clear_trace)
+ tracing_reset_all_online_cpus();
}
static int trace_module_notify(struct notifier_block *self,
@@ -1443,14 +1757,433 @@ static int trace_module_notify(struct notifier_block *self,
return 0;
}
+
+static int
+__trace_add_new_mod_event(struct ftrace_event_call *call,
+ struct trace_array *tr,
+ struct ftrace_module_file_ops *file_ops)
+{
+ return __trace_add_new_event(call, tr,
+ &file_ops->id, &file_ops->enable,
+ &file_ops->filter, &file_ops->format);
+}
+
#else
-static int trace_module_notify(struct notifier_block *self,
- unsigned long val, void *data)
+static inline struct ftrace_module_file_ops *
+find_ftrace_file_ops(struct ftrace_module_file_ops *file_ops, struct module *mod)
+{
+ return NULL;
+}
+static inline int trace_module_notify(struct notifier_block *self,
+ unsigned long val, void *data)
{
return 0;
}
+static inline int
+__trace_add_new_mod_event(struct ftrace_event_call *call,
+ struct trace_array *tr,
+ struct ftrace_module_file_ops *file_ops)
+{
+ return -ENODEV;
+}
#endif /* CONFIG_MODULES */
+/* Create a new event directory structure for a trace directory. */
+static void
+__trace_add_event_dirs(struct trace_array *tr)
+{
+ struct ftrace_module_file_ops *file_ops = NULL;
+ struct ftrace_event_call *call;
+ int ret;
+
+ list_for_each_entry(call, &ftrace_events, list) {
+ if (call->mod) {
+ /*
+ * Directories for events by modules need to
+ * keep module ref counts when opened (as we don't
+ * want the module to disappear when reading one
+ * of these files). The file_ops keep account of
+ * the module ref count.
+ */
+ file_ops = find_ftrace_file_ops(file_ops, call->mod);
+ if (!file_ops)
+ continue; /* Warn? */
+ ret = __trace_add_new_mod_event(call, tr, file_ops);
+ if (ret < 0)
+ pr_warning("Could not create directory for event %s\n",
+ call->name);
+ continue;
+ }
+ ret = __trace_add_new_event(call, tr,
+ &ftrace_event_id_fops,
+ &ftrace_enable_fops,
+ &ftrace_event_filter_fops,
+ &ftrace_event_format_fops);
+ if (ret < 0)
+ pr_warning("Could not create directory for event %s\n",
+ call->name);
+ }
+}
+
+#ifdef CONFIG_DYNAMIC_FTRACE
+
+/* Avoid typos */
+#define ENABLE_EVENT_STR "enable_event"
+#define DISABLE_EVENT_STR "disable_event"
+
+struct event_probe_data {
+ struct ftrace_event_file *file;
+ unsigned long count;
+ int ref;
+ bool enable;
+};
+
+static struct ftrace_event_file *
+find_event_file(struct trace_array *tr, const char *system, const char *event)
+{
+ struct ftrace_event_file *file;
+ struct ftrace_event_call *call;
+
+ list_for_each_entry(file, &tr->events, list) {
+
+ call = file->event_call;
+
+ if (!call->name || !call->class || !call->class->reg)
+ continue;
+
+ if (call->flags & TRACE_EVENT_FL_IGNORE_ENABLE)
+ continue;
+
+ if (strcmp(event, call->name) == 0 &&
+ strcmp(system, call->class->system) == 0)
+ return file;
+ }
+ return NULL;
+}
+
+static void
+event_enable_probe(unsigned long ip, unsigned long parent_ip, void **_data)
+{
+ struct event_probe_data **pdata = (struct event_probe_data **)_data;
+ struct event_probe_data *data = *pdata;
+
+ if (!data)
+ return;
+
+ if (data->enable)
+ clear_bit(FTRACE_EVENT_FL_SOFT_DISABLED_BIT, &data->file->flags);
+ else
+ set_bit(FTRACE_EVENT_FL_SOFT_DISABLED_BIT, &data->file->flags);
+}
+
+static void
+event_enable_count_probe(unsigned long ip, unsigned long parent_ip, void **_data)
+{
+ struct event_probe_data **pdata = (struct event_probe_data **)_data;
+ struct event_probe_data *data = *pdata;
+
+ if (!data)
+ return;
+
+ if (!data->count)
+ return;
+
+ /* Skip if the event is in a state we want to switch to */
+ if (data->enable == !(data->file->flags & FTRACE_EVENT_FL_SOFT_DISABLED))
+ return;
+
+ if (data->count != -1)
+ (data->count)--;
+
+ event_enable_probe(ip, parent_ip, _data);
+}
+
+static int
+event_enable_print(struct seq_file *m, unsigned long ip,
+ struct ftrace_probe_ops *ops, void *_data)
+{
+ struct event_probe_data *data = _data;
+
+ seq_printf(m, "%ps:", (void *)ip);
+
+ seq_printf(m, "%s:%s:%s",
+ data->enable ? ENABLE_EVENT_STR : DISABLE_EVENT_STR,
+ data->file->event_call->class->system,
+ data->file->event_call->name);
+
+ if (data->count == -1)
+ seq_printf(m, ":unlimited\n");
+ else
+ seq_printf(m, ":count=%ld\n", data->count);
+
+ return 0;
+}
+
+static int
+event_enable_init(struct ftrace_probe_ops *ops, unsigned long ip,
+ void **_data)
+{
+ struct event_probe_data **pdata = (struct event_probe_data **)_data;
+ struct event_probe_data *data = *pdata;
+
+ data->ref++;
+ return 0;
+}
+
+static void
+event_enable_free(struct ftrace_probe_ops *ops, unsigned long ip,
+ void **_data)
+{
+ struct event_probe_data **pdata = (struct event_probe_data **)_data;
+ struct event_probe_data *data = *pdata;
+
+ if (WARN_ON_ONCE(data->ref <= 0))
+ return;
+
+ data->ref--;
+ if (!data->ref) {
+ /* Remove the SOFT_MODE flag */
+ __ftrace_event_enable_disable(data->file, 0, 1);
+ module_put(data->file->event_call->mod);
+ kfree(data);
+ }
+ *pdata = NULL;
+}
+
+static struct ftrace_probe_ops event_enable_probe_ops = {
+ .func = event_enable_probe,
+ .print = event_enable_print,
+ .init = event_enable_init,
+ .free = event_enable_free,
+};
+
+static struct ftrace_probe_ops event_enable_count_probe_ops = {
+ .func = event_enable_count_probe,
+ .print = event_enable_print,
+ .init = event_enable_init,
+ .free = event_enable_free,
+};
+
+static struct ftrace_probe_ops event_disable_probe_ops = {
+ .func = event_enable_probe,
+ .print = event_enable_print,
+ .init = event_enable_init,
+ .free = event_enable_free,
+};
+
+static struct ftrace_probe_ops event_disable_count_probe_ops = {
+ .func = event_enable_count_probe,
+ .print = event_enable_print,
+ .init = event_enable_init,
+ .free = event_enable_free,
+};
+
+static int
+event_enable_func(struct ftrace_hash *hash,
+ char *glob, char *cmd, char *param, int enabled)
+{
+ struct trace_array *tr = top_trace_array();
+ struct ftrace_event_file *file;
+ struct ftrace_probe_ops *ops;
+ struct event_probe_data *data;
+ const char *system;
+ const char *event;
+ char *number;
+ bool enable;
+ int ret;
+
+ /* hash funcs only work with set_ftrace_filter */
+ if (!enabled)
+ return -EINVAL;
+
+ if (!param)
+ return -EINVAL;
+
+ system = strsep(&param, ":");
+ if (!param)
+ return -EINVAL;
+
+ event = strsep(&param, ":");
+
+ mutex_lock(&event_mutex);
+
+ ret = -EINVAL;
+ file = find_event_file(tr, system, event);
+ if (!file)
+ goto out;
+
+ enable = strcmp(cmd, ENABLE_EVENT_STR) == 0;
+
+ if (enable)
+ ops = param ? &event_enable_count_probe_ops : &event_enable_probe_ops;
+ else
+ ops = param ? &event_disable_count_probe_ops : &event_disable_probe_ops;
+
+ if (glob[0] == '!') {
+ unregister_ftrace_function_probe_func(glob+1, ops);
+ ret = 0;
+ goto out;
+ }
+
+ ret = -ENOMEM;
+ data = kzalloc(sizeof(*data), GFP_KERNEL);
+ if (!data)
+ goto out;
+
+ data->enable = enable;
+ data->count = -1;
+ data->file = file;
+
+ if (!param)
+ goto out_reg;
+
+ number = strsep(&param, ":");
+
+ ret = -EINVAL;
+ if (!strlen(number))
+ goto out_free;
+
+ /*
+ * We use the callback data field (which is a pointer)
+ * as our counter.
+ */
+ ret = kstrtoul(number, 0, &data->count);
+ if (ret)
+ goto out_free;
+
+ out_reg:
+ /* Don't let event modules unload while probe registered */
+ ret = try_module_get(file->event_call->mod);
+ if (!ret)
+ goto out_free;
+
+ ret = __ftrace_event_enable_disable(file, 1, 1);
+ if (ret < 0)
+ goto out_put;
+ ret = register_ftrace_function_probe(glob, ops, data);
+ if (!ret)
+ goto out_disable;
+ out:
+ mutex_unlock(&event_mutex);
+ return ret;
+
+ out_disable:
+ __ftrace_event_enable_disable(file, 0, 1);
+ out_put:
+ module_put(file->event_call->mod);
+ out_free:
+ kfree(data);
+ goto out;
+}
+
+static struct ftrace_func_command event_enable_cmd = {
+ .name = ENABLE_EVENT_STR,
+ .func = event_enable_func,
+};
+
+static struct ftrace_func_command event_disable_cmd = {
+ .name = DISABLE_EVENT_STR,
+ .func = event_enable_func,
+};
+
+static __init int register_event_cmds(void)
+{
+ int ret;
+
+ ret = register_ftrace_command(&event_enable_cmd);
+ if (WARN_ON(ret < 0))
+ return ret;
+ ret = register_ftrace_command(&event_disable_cmd);
+ if (WARN_ON(ret < 0))
+ unregister_ftrace_command(&event_enable_cmd);
+ return ret;
+}
+#else
+static inline int register_event_cmds(void) { return 0; }
+#endif /* CONFIG_DYNAMIC_FTRACE */
+
+/*
+ * The top level array has already had its ftrace_event_file
+ * descriptors created in order to allow for early events to
+ * be recorded. This function is called after the debugfs has been
+ * initialized, and we now have to create the files associated
+ * to the events.
+ */
+static __init void
+__trace_early_add_event_dirs(struct trace_array *tr)
+{
+ struct ftrace_event_file *file;
+ int ret;
+
+
+ list_for_each_entry(file, &tr->events, list) {
+ ret = event_create_dir(tr->event_dir, file,
+ &ftrace_event_id_fops,
+ &ftrace_enable_fops,
+ &ftrace_event_filter_fops,
+ &ftrace_event_format_fops);
+ if (ret < 0)
+ pr_warning("Could not create directory for event %s\n",
+ file->event_call->name);
+ }
+}
+
+/*
+ * For early boot up, the top trace array requires to have
+ * a list of events that can be enabled. This must be done before
+ * the filesystem is set up in order to allow events to be traced
+ * early.
+ */
+static __init void
+__trace_early_add_events(struct trace_array *tr)
+{
+ struct ftrace_event_call *call;
+ int ret;
+
+ list_for_each_entry(call, &ftrace_events, list) {
+ /* Early boot up should not have any modules loaded */
+ if (WARN_ON_ONCE(call->mod))
+ continue;
+
+ ret = __trace_early_add_new_event(call, tr);
+ if (ret < 0)
+ pr_warning("Could not create early event %s\n",
+ call->name);
+ }
+}
+
+/* Remove the event directory structure for a trace directory. */
+static void
+__trace_remove_event_dirs(struct trace_array *tr)
+{
+ struct ftrace_event_file *file, *next;
+
+ list_for_each_entry_safe(file, next, &tr->events, list) {
+ list_del(&file->list);
+ debugfs_remove_recursive(file->dir);
+ remove_subsystem(file->system);
+ kmem_cache_free(file_cachep, file);
+ }
+}
+
+static void
+__add_event_to_tracers(struct ftrace_event_call *call,
+ struct ftrace_module_file_ops *file_ops)
+{
+ struct trace_array *tr;
+
+ list_for_each_entry(tr, &ftrace_trace_arrays, list) {
+ if (file_ops)
+ __trace_add_new_mod_event(call, tr, file_ops);
+ else
+ __trace_add_new_event(call, tr,
+ &ftrace_event_id_fops,
+ &ftrace_enable_fops,
+ &ftrace_event_filter_fops,
+ &ftrace_event_format_fops);
+ }
+}
+
static struct notifier_block trace_module_nb = {
.notifier_call = trace_module_notify,
.priority = 0,
@@ -1464,15 +2197,135 @@ static char bootup_event_buf[COMMAND_LINE_SIZE] __initdata;
static __init int setup_trace_event(char *str)
{
strlcpy(bootup_event_buf, str, COMMAND_LINE_SIZE);
- ring_buffer_expanded = 1;
- tracing_selftest_disabled = 1;
+ ring_buffer_expanded = true;
+ tracing_selftest_disabled = true;
return 1;
}
__setup("trace_event=", setup_trace_event);
+/* Expects to have event_mutex held when called */
+static int
+create_event_toplevel_files(struct dentry *parent, struct trace_array *tr)
+{
+ struct dentry *d_events;
+ struct dentry *entry;
+
+ entry = debugfs_create_file("set_event", 0644, parent,
+ tr, &ftrace_set_event_fops);
+ if (!entry) {
+ pr_warning("Could not create debugfs 'set_event' entry\n");
+ return -ENOMEM;
+ }
+
+ d_events = debugfs_create_dir("events", parent);
+ if (!d_events) {
+ pr_warning("Could not create debugfs 'events' directory\n");
+ return -ENOMEM;
+ }
+
+ /* ring buffer internal formats */
+ trace_create_file("header_page", 0444, d_events,
+ ring_buffer_print_page_header,
+ &ftrace_show_header_fops);
+
+ trace_create_file("header_event", 0444, d_events,
+ ring_buffer_print_entry_header,
+ &ftrace_show_header_fops);
+
+ trace_create_file("enable", 0644, d_events,
+ tr, &ftrace_tr_enable_fops);
+
+ tr->event_dir = d_events;
+
+ return 0;
+}
+
+/**
+ * event_trace_add_tracer - add a instance of a trace_array to events
+ * @parent: The parent dentry to place the files/directories for events in
+ * @tr: The trace array associated with these events
+ *
+ * When a new instance is created, it needs to set up its events
+ * directory, as well as other files associated with events. It also
+ * creates the event hierachry in the @parent/events directory.
+ *
+ * Returns 0 on success.
+ */
+int event_trace_add_tracer(struct dentry *parent, struct trace_array *tr)
+{
+ int ret;
+
+ mutex_lock(&event_mutex);
+
+ ret = create_event_toplevel_files(parent, tr);
+ if (ret)
+ goto out_unlock;
+
+ down_write(&trace_event_sem);
+ __trace_add_event_dirs(tr);
+ up_write(&trace_event_sem);
+
+ out_unlock:
+ mutex_unlock(&event_mutex);
+
+ return ret;
+}
+
+/*
+ * The top trace array already had its file descriptors created.
+ * Now the files themselves need to be created.
+ */
+static __init int
+early_event_add_tracer(struct dentry *parent, struct trace_array *tr)
+{
+ int ret;
+
+ mutex_lock(&event_mutex);
+
+ ret = create_event_toplevel_files(parent, tr);
+ if (ret)
+ goto out_unlock;
+
+ down_write(&trace_event_sem);
+ __trace_early_add_event_dirs(tr);
+ up_write(&trace_event_sem);
+
+ out_unlock:
+ mutex_unlock(&event_mutex);
+
+ return ret;
+}
+
+int event_trace_del_tracer(struct trace_array *tr)
+{
+ /* Disable any running events */
+ __ftrace_set_clr_event(tr, NULL, NULL, NULL, 0);
+
+ mutex_lock(&event_mutex);
+
+ down_write(&trace_event_sem);
+ __trace_remove_event_dirs(tr);
+ debugfs_remove_recursive(tr->event_dir);
+ up_write(&trace_event_sem);
+
+ tr->event_dir = NULL;
+
+ mutex_unlock(&event_mutex);
+
+ return 0;
+}
+
+static __init int event_trace_memsetup(void)
+{
+ field_cachep = KMEM_CACHE(ftrace_event_field, SLAB_PANIC);
+ file_cachep = KMEM_CACHE(ftrace_event_file, SLAB_PANIC);
+ return 0;
+}
+
static __init int event_trace_enable(void)
{
+ struct trace_array *tr = top_trace_array();
struct ftrace_event_call **iter, *call;
char *buf = bootup_event_buf;
char *token;
@@ -1486,6 +2339,14 @@ static __init int event_trace_enable(void)
list_add(&call->list, &ftrace_events);
}
+ /*
+ * We need the top trace array to have a working set of trace
+ * points at early init, before the debug files and directories
+ * are created. Create the file entries now, and attach them
+ * to the actual file dentries later.
+ */
+ __trace_early_add_events(tr);
+
while (true) {
token = strsep(&buf, ",");
@@ -1494,73 +2355,43 @@ static __init int event_trace_enable(void)
if (!*token)
continue;
- ret = ftrace_set_clr_event(token, 1);
+ ret = ftrace_set_clr_event(tr, token, 1);
if (ret)
pr_warn("Failed to enable trace event: %s\n", token);
}
trace_printk_start_comm();
+ register_event_cmds();
+
return 0;
}
static __init int event_trace_init(void)
{
- struct ftrace_event_call *call;
+ struct trace_array *tr;
struct dentry *d_tracer;
struct dentry *entry;
- struct dentry *d_events;
int ret;
+ tr = top_trace_array();
+
d_tracer = tracing_init_dentry();
if (!d_tracer)
return 0;
entry = debugfs_create_file("available_events", 0444, d_tracer,
- NULL, &ftrace_avail_fops);
+ tr, &ftrace_avail_fops);
if (!entry)
pr_warning("Could not create debugfs "
"'available_events' entry\n");
- entry = debugfs_create_file("set_event", 0644, d_tracer,
- NULL, &ftrace_set_event_fops);
- if (!entry)
- pr_warning("Could not create debugfs "
- "'set_event' entry\n");
-
- d_events = event_trace_events_dir();
- if (!d_events)
- return 0;
-
- /* ring buffer internal formats */
- trace_create_file("header_page", 0444, d_events,
- ring_buffer_print_page_header,
- &ftrace_show_header_fops);
-
- trace_create_file("header_event", 0444, d_events,
- ring_buffer_print_entry_header,
- &ftrace_show_header_fops);
-
- trace_create_file("enable", 0644, d_events,
- NULL, &ftrace_system_enable_fops);
-
if (trace_define_common_fields())
pr_warning("tracing: Failed to allocate common fields");
- /*
- * Early initialization already enabled ftrace event.
- * Now it's only necessary to create the event directory.
- */
- list_for_each_entry(call, &ftrace_events, list) {
-
- ret = event_create_dir(call, d_events,
- &ftrace_event_id_fops,
- &ftrace_enable_fops,
- &ftrace_event_filter_fops,
- &ftrace_event_format_fops);
- if (ret < 0)
- event_remove(call);
- }
+ ret = early_event_add_tracer(d_tracer, tr);
+ if (ret)
+ return ret;
ret = register_module_notifier(&trace_module_nb);
if (ret)
@@ -1568,6 +2399,7 @@ static __init int event_trace_init(void)
return 0;
}
+early_initcall(event_trace_memsetup);
core_initcall(event_trace_enable);
fs_initcall(event_trace_init);
@@ -1627,13 +2459,20 @@ static __init void event_test_stuff(void)
*/
static __init void event_trace_self_tests(void)
{
+ struct ftrace_subsystem_dir *dir;
+ struct ftrace_event_file *file;
struct ftrace_event_call *call;
struct event_subsystem *system;
+ struct trace_array *tr;
int ret;
+ tr = top_trace_array();
+
pr_info("Running tests on trace events:\n");
- list_for_each_entry(call, &ftrace_events, list) {
+ list_for_each_entry(file, &tr->events, list) {
+
+ call = file->event_call;
/* Only test those that have a probe */
if (!call->class || !call->class->probe)
@@ -1657,15 +2496,15 @@ static __init void event_trace_self_tests(void)
* If an event is already enabled, someone is using
* it and the self test should not be on.
*/
- if (call->flags & TRACE_EVENT_FL_ENABLED) {
+ if (file->flags & FTRACE_EVENT_FL_ENABLED) {
pr_warning("Enabled event during self test!\n");
WARN_ON_ONCE(1);
continue;
}
- ftrace_event_enable_disable(call, 1);
+ ftrace_event_enable_disable(file, 1);
event_test_stuff();
- ftrace_event_enable_disable(call, 0);
+ ftrace_event_enable_disable(file, 0);
pr_cont("OK\n");
}
@@ -1674,7 +2513,9 @@ static __init void event_trace_self_tests(void)
pr_info("Running tests on trace event systems:\n");
- list_for_each_entry(system, &event_subsystems, list) {
+ list_for_each_entry(dir, &tr->systems, list) {
+
+ system = dir->subsystem;
/* the ftrace system is special, skip it */
if (strcmp(system->name, "ftrace") == 0)
@@ -1682,7 +2523,7 @@ static __init void event_trace_self_tests(void)
pr_info("Testing event system %s: ", system->name);
- ret = __ftrace_set_clr_event(NULL, system->name, NULL, 1);
+ ret = __ftrace_set_clr_event(tr, NULL, system->name, NULL, 1);
if (WARN_ON_ONCE(ret)) {
pr_warning("error enabling system %s\n",
system->name);
@@ -1691,7 +2532,7 @@ static __init void event_trace_self_tests(void)
event_test_stuff();
- ret = __ftrace_set_clr_event(NULL, system->name, NULL, 0);
+ ret = __ftrace_set_clr_event(tr, NULL, system->name, NULL, 0);
if (WARN_ON_ONCE(ret)) {
pr_warning("error disabling system %s\n",
system->name);
@@ -1706,7 +2547,7 @@ static __init void event_trace_self_tests(void)
pr_info("Running tests on all trace events:\n");
pr_info("Testing all events: ");
- ret = __ftrace_set_clr_event(NULL, NULL, NULL, 1);
+ ret = __ftrace_set_clr_event(tr, NULL, NULL, NULL, 1);
if (WARN_ON_ONCE(ret)) {
pr_warning("error enabling all events\n");
return;
@@ -1715,7 +2556,7 @@ static __init void event_trace_self_tests(void)
event_test_stuff();
/* reset sysname */
- ret = __ftrace_set_clr_event(NULL, NULL, NULL, 0);
+ ret = __ftrace_set_clr_event(tr, NULL, NULL, NULL, 0);
if (WARN_ON_ONCE(ret)) {
pr_warning("error disabling all events\n");
return;
diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c
index e5b0ca8b8d4d..a6361178de5a 100644
--- a/kernel/trace/trace_events_filter.c
+++ b/kernel/trace/trace_events_filter.c
@@ -658,33 +658,6 @@ void print_subsystem_event_filter(struct event_subsystem *system,
mutex_unlock(&event_mutex);
}
-static struct ftrace_event_field *
-__find_event_field(struct list_head *head, char *name)
-{
- struct ftrace_event_field *field;
-
- list_for_each_entry(field, head, link) {
- if (!strcmp(field->name, name))
- return field;
- }
-
- return NULL;
-}
-
-static struct ftrace_event_field *
-find_event_field(struct ftrace_event_call *call, char *name)
-{
- struct ftrace_event_field *field;
- struct list_head *head;
-
- field = __find_event_field(&ftrace_common_fields, name);
- if (field)
- return field;
-
- head = trace_get_fields(call);
- return __find_event_field(head, name);
-}
-
static int __alloc_pred_stack(struct pred_stack *stack, int n_preds)
{
stack->preds = kcalloc(n_preds + 1, sizeof(*stack->preds), GFP_KERNEL);
@@ -1337,7 +1310,7 @@ static struct filter_pred *create_pred(struct filter_parse_state *ps,
return NULL;
}
- field = find_event_field(call, operand1);
+ field = trace_find_event_field(call, operand1);
if (!field) {
parse_error(ps, FILT_ERR_FIELD_NOT_FOUND, 0);
return NULL;
@@ -1907,16 +1880,17 @@ out_unlock:
return err;
}
-int apply_subsystem_event_filter(struct event_subsystem *system,
+int apply_subsystem_event_filter(struct ftrace_subsystem_dir *dir,
char *filter_string)
{
+ struct event_subsystem *system = dir->subsystem;
struct event_filter *filter;
int err = 0;
mutex_lock(&event_mutex);
/* Make sure the system still has events */
- if (!system->nr_events) {
+ if (!dir->nr_events) {
err = -ENODEV;
goto out_unlock;
}
diff --git a/kernel/trace/trace_export.c b/kernel/trace/trace_export.c
index e039906b037d..d21a74670088 100644
--- a/kernel/trace/trace_export.c
+++ b/kernel/trace/trace_export.c
@@ -129,7 +129,7 @@ static void __always_unused ____ftrace_check_##name(void) \
#undef FTRACE_ENTRY
#define FTRACE_ENTRY(name, struct_name, id, tstruct, print, filter) \
-int \
+static int __init \
ftrace_define_fields_##name(struct ftrace_event_call *event_call) \
{ \
struct struct_name field; \
@@ -168,7 +168,7 @@ ftrace_define_fields_##name(struct ftrace_event_call *event_call) \
#define FTRACE_ENTRY_REG(call, struct_name, etype, tstruct, print, filter,\
regfn) \
\
-struct ftrace_event_class event_class_ftrace_##call = { \
+struct ftrace_event_class __refdata event_class_ftrace_##call = { \
.system = __stringify(TRACE_SYSTEM), \
.define_fields = ftrace_define_fields_##call, \
.fields = LIST_HEAD_INIT(event_class_ftrace_##call.fields),\
diff --git a/kernel/trace/trace_functions.c b/kernel/trace/trace_functions.c
index 601152523326..c4d6d7191988 100644
--- a/kernel/trace/trace_functions.c
+++ b/kernel/trace/trace_functions.c
@@ -28,7 +28,7 @@ static void tracing_stop_function_trace(void);
static int function_trace_init(struct trace_array *tr)
{
func_trace = tr;
- tr->cpu = get_cpu();
+ tr->trace_buffer.cpu = get_cpu();
put_cpu();
tracing_start_cmdline_record();
@@ -44,7 +44,7 @@ static void function_trace_reset(struct trace_array *tr)
static void function_trace_start(struct trace_array *tr)
{
- tracing_reset_online_cpus(tr);
+ tracing_reset_online_cpus(&tr->trace_buffer);
}
/* Our option */
@@ -76,7 +76,7 @@ function_trace_call(unsigned long ip, unsigned long parent_ip,
goto out;
cpu = smp_processor_id();
- data = tr->data[cpu];
+ data = per_cpu_ptr(tr->trace_buffer.data, cpu);
if (!atomic_read(&data->disabled)) {
local_save_flags(flags);
trace_function(tr, ip, parent_ip, flags, pc);
@@ -107,7 +107,7 @@ function_stack_trace_call(unsigned long ip, unsigned long parent_ip,
*/
local_irq_save(flags);
cpu = raw_smp_processor_id();
- data = tr->data[cpu];
+ data = per_cpu_ptr(tr->trace_buffer.data, cpu);
disabled = atomic_inc_return(&data->disabled);
if (likely(disabled == 1)) {
@@ -214,66 +214,89 @@ static struct tracer function_trace __read_mostly =
};
#ifdef CONFIG_DYNAMIC_FTRACE
-static void
-ftrace_traceon(unsigned long ip, unsigned long parent_ip, void **data)
+static int update_count(void **data)
{
- long *count = (long *)data;
-
- if (tracing_is_on())
- return;
+ unsigned long *count = (long *)data;
if (!*count)
- return;
+ return 0;
if (*count != -1)
(*count)--;
- tracing_on();
+ return 1;
}
static void
-ftrace_traceoff(unsigned long ip, unsigned long parent_ip, void **data)
+ftrace_traceon_count(unsigned long ip, unsigned long parent_ip, void **data)
{
- long *count = (long *)data;
+ if (tracing_is_on())
+ return;
+
+ if (update_count(data))
+ tracing_on();
+}
+static void
+ftrace_traceoff_count(unsigned long ip, unsigned long parent_ip, void **data)
+{
if (!tracing_is_on())
return;
- if (!*count)
+ if (update_count(data))
+ tracing_off();
+}
+
+static void
+ftrace_traceon(unsigned long ip, unsigned long parent_ip, void **data)
+{
+ if (tracing_is_on())
return;
- if (*count != -1)
- (*count)--;
+ tracing_on();
+}
+
+static void
+ftrace_traceoff(unsigned long ip, unsigned long parent_ip, void **data)
+{
+ if (!tracing_is_on())
+ return;
tracing_off();
}
-static int
-ftrace_trace_onoff_print(struct seq_file *m, unsigned long ip,
- struct ftrace_probe_ops *ops, void *data);
+/*
+ * Skip 4:
+ * ftrace_stacktrace()
+ * function_trace_probe_call()
+ * ftrace_ops_list_func()
+ * ftrace_call()
+ */
+#define STACK_SKIP 4
-static struct ftrace_probe_ops traceon_probe_ops = {
- .func = ftrace_traceon,
- .print = ftrace_trace_onoff_print,
-};
+static void
+ftrace_stacktrace(unsigned long ip, unsigned long parent_ip, void **data)
+{
+ trace_dump_stack(STACK_SKIP);
+}
-static struct ftrace_probe_ops traceoff_probe_ops = {
- .func = ftrace_traceoff,
- .print = ftrace_trace_onoff_print,
-};
+static void
+ftrace_stacktrace_count(unsigned long ip, unsigned long parent_ip, void **data)
+{
+ if (!tracing_is_on())
+ return;
+
+ if (update_count(data))
+ trace_dump_stack(STACK_SKIP);
+}
static int
-ftrace_trace_onoff_print(struct seq_file *m, unsigned long ip,
- struct ftrace_probe_ops *ops, void *data)
+ftrace_probe_print(const char *name, struct seq_file *m,
+ unsigned long ip, void *data)
{
long count = (long)data;
- seq_printf(m, "%ps:", (void *)ip);
-
- if (ops == &traceon_probe_ops)
- seq_printf(m, "traceon");
- else
- seq_printf(m, "traceoff");
+ seq_printf(m, "%ps:%s", (void *)ip, name);
if (count == -1)
seq_printf(m, ":unlimited\n");
@@ -284,26 +307,61 @@ ftrace_trace_onoff_print(struct seq_file *m, unsigned long ip,
}
static int
-ftrace_trace_onoff_unreg(char *glob, char *cmd, char *param)
+ftrace_traceon_print(struct seq_file *m, unsigned long ip,
+ struct ftrace_probe_ops *ops, void *data)
{
- struct ftrace_probe_ops *ops;
-
- /* we register both traceon and traceoff to this callback */
- if (strcmp(cmd, "traceon") == 0)
- ops = &traceon_probe_ops;
- else
- ops = &traceoff_probe_ops;
+ return ftrace_probe_print("traceon", m, ip, data);
+}
- unregister_ftrace_function_probe_func(glob, ops);
+static int
+ftrace_traceoff_print(struct seq_file *m, unsigned long ip,
+ struct ftrace_probe_ops *ops, void *data)
+{
+ return ftrace_probe_print("traceoff", m, ip, data);
+}
- return 0;
+static int
+ftrace_stacktrace_print(struct seq_file *m, unsigned long ip,
+ struct ftrace_probe_ops *ops, void *data)
+{
+ return ftrace_probe_print("stacktrace", m, ip, data);
}
+static struct ftrace_probe_ops traceon_count_probe_ops = {
+ .func = ftrace_traceon_count,
+ .print = ftrace_traceon_print,
+};
+
+static struct ftrace_probe_ops traceoff_count_probe_ops = {
+ .func = ftrace_traceoff_count,
+ .print = ftrace_traceoff_print,
+};
+
+static struct ftrace_probe_ops stacktrace_count_probe_ops = {
+ .func = ftrace_stacktrace_count,
+ .print = ftrace_stacktrace_print,
+};
+
+static struct ftrace_probe_ops traceon_probe_ops = {
+ .func = ftrace_traceon,
+ .print = ftrace_traceon_print,
+};
+
+static struct ftrace_probe_ops traceoff_probe_ops = {
+ .func = ftrace_traceoff,
+ .print = ftrace_traceoff_print,
+};
+
+static struct ftrace_probe_ops stacktrace_probe_ops = {
+ .func = ftrace_stacktrace,
+ .print = ftrace_stacktrace_print,
+};
+
static int
-ftrace_trace_onoff_callback(struct ftrace_hash *hash,
- char *glob, char *cmd, char *param, int enable)
+ftrace_trace_probe_callback(struct ftrace_probe_ops *ops,
+ struct ftrace_hash *hash, char *glob,
+ char *cmd, char *param, int enable)
{
- struct ftrace_probe_ops *ops;
void *count = (void *)-1;
char *number;
int ret;
@@ -312,14 +370,10 @@ ftrace_trace_onoff_callback(struct ftrace_hash *hash,
if (!enable)
return -EINVAL;
- if (glob[0] == '!')
- return ftrace_trace_onoff_unreg(glob+1, cmd, param);
-
- /* we register both traceon and traceoff to this callback */
- if (strcmp(cmd, "traceon") == 0)
- ops = &traceon_probe_ops;
- else
- ops = &traceoff_probe_ops;
+ if (glob[0] == '!') {
+ unregister_ftrace_function_probe_func(glob+1, ops);
+ return 0;
+ }
if (!param)
goto out_reg;
@@ -343,6 +397,34 @@ ftrace_trace_onoff_callback(struct ftrace_hash *hash,
return ret < 0 ? ret : 0;
}
+static int
+ftrace_trace_onoff_callback(struct ftrace_hash *hash,
+ char *glob, char *cmd, char *param, int enable)
+{
+ struct ftrace_probe_ops *ops;
+
+ /* we register both traceon and traceoff to this callback */
+ if (strcmp(cmd, "traceon") == 0)
+ ops = param ? &traceon_count_probe_ops : &traceon_probe_ops;
+ else
+ ops = param ? &traceoff_count_probe_ops : &traceoff_probe_ops;
+
+ return ftrace_trace_probe_callback(ops, hash, glob, cmd,
+ param, enable);
+}
+
+static int
+ftrace_stacktrace_callback(struct ftrace_hash *hash,
+ char *glob, char *cmd, char *param, int enable)
+{
+ struct ftrace_probe_ops *ops;
+
+ ops = param ? &stacktrace_count_probe_ops : &stacktrace_probe_ops;
+
+ return ftrace_trace_probe_callback(ops, hash, glob, cmd,
+ param, enable);
+}
+
static struct ftrace_func_command ftrace_traceon_cmd = {
.name = "traceon",
.func = ftrace_trace_onoff_callback,
@@ -353,6 +435,11 @@ static struct ftrace_func_command ftrace_traceoff_cmd = {
.func = ftrace_trace_onoff_callback,
};
+static struct ftrace_func_command ftrace_stacktrace_cmd = {
+ .name = "stacktrace",
+ .func = ftrace_stacktrace_callback,
+};
+
static int __init init_func_cmd_traceon(void)
{
int ret;
@@ -364,6 +451,12 @@ static int __init init_func_cmd_traceon(void)
ret = register_ftrace_command(&ftrace_traceon_cmd);
if (ret)
unregister_ftrace_command(&ftrace_traceoff_cmd);
+
+ ret = register_ftrace_command(&ftrace_stacktrace_cmd);
+ if (ret) {
+ unregister_ftrace_command(&ftrace_traceoff_cmd);
+ unregister_ftrace_command(&ftrace_traceon_cmd);
+ }
return ret;
}
#else
diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c
index 39ada66389cc..8388bc99f2ee 100644
--- a/kernel/trace/trace_functions_graph.c
+++ b/kernel/trace/trace_functions_graph.c
@@ -218,7 +218,7 @@ int __trace_graph_entry(struct trace_array *tr,
{
struct ftrace_event_call *call = &event_funcgraph_entry;
struct ring_buffer_event *event;
- struct ring_buffer *buffer = tr->buffer;
+ struct ring_buffer *buffer = tr->trace_buffer.buffer;
struct ftrace_graph_ent_entry *entry;
if (unlikely(__this_cpu_read(ftrace_cpu_disabled)))
@@ -265,7 +265,7 @@ int trace_graph_entry(struct ftrace_graph_ent *trace)
local_irq_save(flags);
cpu = raw_smp_processor_id();
- data = tr->data[cpu];
+ data = per_cpu_ptr(tr->trace_buffer.data, cpu);
disabled = atomic_inc_return(&data->disabled);
if (likely(disabled == 1)) {
pc = preempt_count();
@@ -323,7 +323,7 @@ void __trace_graph_return(struct trace_array *tr,
{
struct ftrace_event_call *call = &event_funcgraph_exit;
struct ring_buffer_event *event;
- struct ring_buffer *buffer = tr->buffer;
+ struct ring_buffer *buffer = tr->trace_buffer.buffer;
struct ftrace_graph_ret_entry *entry;
if (unlikely(__this_cpu_read(ftrace_cpu_disabled)))
@@ -350,7 +350,7 @@ void trace_graph_return(struct ftrace_graph_ret *trace)
local_irq_save(flags);
cpu = raw_smp_processor_id();
- data = tr->data[cpu];
+ data = per_cpu_ptr(tr->trace_buffer.data, cpu);
disabled = atomic_inc_return(&data->disabled);
if (likely(disabled == 1)) {
pc = preempt_count();
@@ -560,9 +560,9 @@ get_return_for_leaf(struct trace_iterator *iter,
* We need to consume the current entry to see
* the next one.
*/
- ring_buffer_consume(iter->tr->buffer, iter->cpu,
+ ring_buffer_consume(iter->trace_buffer->buffer, iter->cpu,
NULL, NULL);
- event = ring_buffer_peek(iter->tr->buffer, iter->cpu,
+ event = ring_buffer_peek(iter->trace_buffer->buffer, iter->cpu,
NULL, NULL);
}
diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c
index 713a2cac4881..b19d065a28cb 100644
--- a/kernel/trace/trace_irqsoff.c
+++ b/kernel/trace/trace_irqsoff.c
@@ -32,7 +32,8 @@ enum {
static int trace_type __read_mostly;
-static int save_lat_flag;
+static int save_flags;
+static bool function_enabled;
static void stop_irqsoff_tracer(struct trace_array *tr, int graph);
static int start_irqsoff_tracer(struct trace_array *tr, int graph);
@@ -121,7 +122,7 @@ static int func_prolog_dec(struct trace_array *tr,
if (!irqs_disabled_flags(*flags))
return 0;
- *data = tr->data[cpu];
+ *data = per_cpu_ptr(tr->trace_buffer.data, cpu);
disabled = atomic_inc_return(&(*data)->disabled);
if (likely(disabled == 1))
@@ -175,7 +176,7 @@ static int irqsoff_set_flag(u32 old_flags, u32 bit, int set)
per_cpu(tracing_cpu, cpu) = 0;
tracing_max_latency = 0;
- tracing_reset_online_cpus(irqsoff_trace);
+ tracing_reset_online_cpus(&irqsoff_trace->trace_buffer);
return start_irqsoff_tracer(irqsoff_trace, set);
}
@@ -380,7 +381,7 @@ start_critical_timing(unsigned long ip, unsigned long parent_ip)
if (per_cpu(tracing_cpu, cpu))
return;
- data = tr->data[cpu];
+ data = per_cpu_ptr(tr->trace_buffer.data, cpu);
if (unlikely(!data) || atomic_read(&data->disabled))
return;
@@ -418,7 +419,7 @@ stop_critical_timing(unsigned long ip, unsigned long parent_ip)
if (!tracer_enabled)
return;
- data = tr->data[cpu];
+ data = per_cpu_ptr(tr->trace_buffer.data, cpu);
if (unlikely(!data) ||
!data->critical_start || atomic_read(&data->disabled))
@@ -528,15 +529,60 @@ void trace_preempt_off(unsigned long a0, unsigned long a1)
}
#endif /* CONFIG_PREEMPT_TRACER */
-static int start_irqsoff_tracer(struct trace_array *tr, int graph)
+static int register_irqsoff_function(int graph, int set)
{
- int ret = 0;
+ int ret;
- if (!graph)
- ret = register_ftrace_function(&trace_ops);
- else
+ /* 'set' is set if TRACE_ITER_FUNCTION is about to be set */
+ if (function_enabled || (!set && !(trace_flags & TRACE_ITER_FUNCTION)))
+ return 0;
+
+ if (graph)
ret = register_ftrace_graph(&irqsoff_graph_return,
&irqsoff_graph_entry);
+ else
+ ret = register_ftrace_function(&trace_ops);
+
+ if (!ret)
+ function_enabled = true;
+
+ return ret;
+}
+
+static void unregister_irqsoff_function(int graph)
+{
+ if (!function_enabled)
+ return;
+
+ if (graph)
+ unregister_ftrace_graph();
+ else
+ unregister_ftrace_function(&trace_ops);
+
+ function_enabled = false;
+}
+
+static void irqsoff_function_set(int set)
+{
+ if (set)
+ register_irqsoff_function(is_graph(), 1);
+ else
+ unregister_irqsoff_function(is_graph());
+}
+
+static int irqsoff_flag_changed(struct tracer *tracer, u32 mask, int set)
+{
+ if (mask & TRACE_ITER_FUNCTION)
+ irqsoff_function_set(set);
+
+ return trace_keep_overwrite(tracer, mask, set);
+}
+
+static int start_irqsoff_tracer(struct trace_array *tr, int graph)
+{
+ int ret;
+
+ ret = register_irqsoff_function(graph, 0);
if (!ret && tracing_is_enabled())
tracer_enabled = 1;
@@ -550,22 +596,22 @@ static void stop_irqsoff_tracer(struct trace_array *tr, int graph)
{
tracer_enabled = 0;
- if (!graph)
- unregister_ftrace_function(&trace_ops);
- else
- unregister_ftrace_graph();
+ unregister_irqsoff_function(graph);
}
static void __irqsoff_tracer_init(struct trace_array *tr)
{
- save_lat_flag = trace_flags & TRACE_ITER_LATENCY_FMT;
- trace_flags |= TRACE_ITER_LATENCY_FMT;
+ save_flags = trace_flags;
+
+ /* non overwrite screws up the latency tracers */
+ set_tracer_flag(tr, TRACE_ITER_OVERWRITE, 1);
+ set_tracer_flag(tr, TRACE_ITER_LATENCY_FMT, 1);
tracing_max_latency = 0;
irqsoff_trace = tr;
/* make sure that the tracer is visible */
smp_wmb();
- tracing_reset_online_cpus(tr);
+ tracing_reset_online_cpus(&tr->trace_buffer);
if (start_irqsoff_tracer(tr, is_graph()))
printk(KERN_ERR "failed to start irqsoff tracer\n");
@@ -573,10 +619,13 @@ static void __irqsoff_tracer_init(struct trace_array *tr)
static void irqsoff_tracer_reset(struct trace_array *tr)
{
+ int lat_flag = save_flags & TRACE_ITER_LATENCY_FMT;
+ int overwrite_flag = save_flags & TRACE_ITER_OVERWRITE;
+
stop_irqsoff_tracer(tr, is_graph());
- if (!save_lat_flag)
- trace_flags &= ~TRACE_ITER_LATENCY_FMT;
+ set_tracer_flag(tr, TRACE_ITER_LATENCY_FMT, lat_flag);
+ set_tracer_flag(tr, TRACE_ITER_OVERWRITE, overwrite_flag);
}
static void irqsoff_tracer_start(struct trace_array *tr)
@@ -609,6 +658,7 @@ static struct tracer irqsoff_tracer __read_mostly =
.print_line = irqsoff_print_line,
.flags = &tracer_flags,
.set_flag = irqsoff_set_flag,
+ .flag_changed = irqsoff_flag_changed,
#ifdef CONFIG_FTRACE_SELFTEST
.selftest = trace_selftest_startup_irqsoff,
#endif
@@ -642,6 +692,7 @@ static struct tracer preemptoff_tracer __read_mostly =
.print_line = irqsoff_print_line,
.flags = &tracer_flags,
.set_flag = irqsoff_set_flag,
+ .flag_changed = irqsoff_flag_changed,
#ifdef CONFIG_FTRACE_SELFTEST
.selftest = trace_selftest_startup_preemptoff,
#endif
@@ -677,6 +728,7 @@ static struct tracer preemptirqsoff_tracer __read_mostly =
.print_line = irqsoff_print_line,
.flags = &tracer_flags,
.set_flag = irqsoff_set_flag,
+ .flag_changed = irqsoff_flag_changed,
#ifdef CONFIG_FTRACE_SELFTEST
.selftest = trace_selftest_startup_preemptirqsoff,
#endif
diff --git a/kernel/trace/trace_kdb.c b/kernel/trace/trace_kdb.c
index 3c5c5dfea0b3..bd90e1b06088 100644
--- a/kernel/trace/trace_kdb.c
+++ b/kernel/trace/trace_kdb.c
@@ -26,7 +26,7 @@ static void ftrace_dump_buf(int skip_lines, long cpu_file)
trace_init_global_iter(&iter);
for_each_tracing_cpu(cpu) {
- atomic_inc(&iter.tr->data[cpu]->disabled);
+ atomic_inc(&per_cpu_ptr(iter.trace_buffer->data, cpu)->disabled);
}
old_userobj = trace_flags;
@@ -43,17 +43,17 @@ static void ftrace_dump_buf(int skip_lines, long cpu_file)
iter.iter_flags |= TRACE_FILE_LAT_FMT;
iter.pos = -1;
- if (cpu_file == TRACE_PIPE_ALL_CPU) {
+ if (cpu_file == RING_BUFFER_ALL_CPUS) {
for_each_tracing_cpu(cpu) {
iter.buffer_iter[cpu] =
- ring_buffer_read_prepare(iter.tr->buffer, cpu);
+ ring_buffer_read_prepare(iter.trace_buffer->buffer, cpu);
ring_buffer_read_start(iter.buffer_iter[cpu]);
tracing_iter_reset(&iter, cpu);
}
} else {
iter.cpu_file = cpu_file;
iter.buffer_iter[cpu_file] =
- ring_buffer_read_prepare(iter.tr->buffer, cpu_file);
+ ring_buffer_read_prepare(iter.trace_buffer->buffer, cpu_file);
ring_buffer_read_start(iter.buffer_iter[cpu_file]);
tracing_iter_reset(&iter, cpu_file);
}
@@ -83,7 +83,7 @@ out:
trace_flags = old_userobj;
for_each_tracing_cpu(cpu) {
- atomic_dec(&iter.tr->data[cpu]->disabled);
+ atomic_dec(&per_cpu_ptr(iter.trace_buffer->data, cpu)->disabled);
}
for_each_tracing_cpu(cpu)
@@ -115,7 +115,7 @@ static int kdb_ftdump(int argc, const char **argv)
!cpu_online(cpu_file))
return KDB_BADINT;
} else {
- cpu_file = TRACE_PIPE_ALL_CPU;
+ cpu_file = RING_BUFFER_ALL_CPUS;
}
kdb_trap_printk++;
diff --git a/kernel/trace/trace_mmiotrace.c b/kernel/trace/trace_mmiotrace.c
index fd3c8aae55e5..a5e8f4878bfa 100644
--- a/kernel/trace/trace_mmiotrace.c
+++ b/kernel/trace/trace_mmiotrace.c
@@ -31,7 +31,7 @@ static void mmio_reset_data(struct trace_array *tr)
overrun_detected = false;
prev_overruns = 0;
- tracing_reset_online_cpus(tr);
+ tracing_reset_online_cpus(&tr->trace_buffer);
}
static int mmio_trace_init(struct trace_array *tr)
@@ -128,7 +128,7 @@ static void mmio_close(struct trace_iterator *iter)
static unsigned long count_overruns(struct trace_iterator *iter)
{
unsigned long cnt = atomic_xchg(&dropped_count, 0);
- unsigned long over = ring_buffer_overruns(iter->tr->buffer);
+ unsigned long over = ring_buffer_overruns(iter->trace_buffer->buffer);
if (over > prev_overruns)
cnt += over - prev_overruns;
@@ -309,7 +309,7 @@ static void __trace_mmiotrace_rw(struct trace_array *tr,
struct mmiotrace_rw *rw)
{
struct ftrace_event_call *call = &event_mmiotrace_rw;
- struct ring_buffer *buffer = tr->buffer;
+ struct ring_buffer *buffer = tr->trace_buffer.buffer;
struct ring_buffer_event *event;
struct trace_mmiotrace_rw *entry;
int pc = preempt_count();
@@ -330,7 +330,7 @@ static void __trace_mmiotrace_rw(struct trace_array *tr,
void mmio_trace_rw(struct mmiotrace_rw *rw)
{
struct trace_array *tr = mmio_trace_array;
- struct trace_array_cpu *data = tr->data[smp_processor_id()];
+ struct trace_array_cpu *data = per_cpu_ptr(tr->trace_buffer.data, smp_processor_id());
__trace_mmiotrace_rw(tr, data, rw);
}
@@ -339,7 +339,7 @@ static void __trace_mmiotrace_map(struct trace_array *tr,
struct mmiotrace_map *map)
{
struct ftrace_event_call *call = &event_mmiotrace_map;
- struct ring_buffer *buffer = tr->buffer;
+ struct ring_buffer *buffer = tr->trace_buffer.buffer;
struct ring_buffer_event *event;
struct trace_mmiotrace_map *entry;
int pc = preempt_count();
@@ -363,7 +363,7 @@ void mmio_trace_mapping(struct mmiotrace_map *map)
struct trace_array_cpu *data;
preempt_disable();
- data = tr->data[smp_processor_id()];
+ data = per_cpu_ptr(tr->trace_buffer.data, smp_processor_id());
__trace_mmiotrace_map(tr, data, map);
preempt_enable();
}
diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c
index 697e88d13907..bb922d9ee51b 100644
--- a/kernel/trace/trace_output.c
+++ b/kernel/trace/trace_output.c
@@ -14,7 +14,7 @@
/* must be a power of 2 */
#define EVENT_HASHSIZE 128
-DECLARE_RWSEM(trace_event_mutex);
+DECLARE_RWSEM(trace_event_sem);
static struct hlist_head event_hash[EVENT_HASHSIZE] __read_mostly;
@@ -37,6 +37,22 @@ int trace_print_seq(struct seq_file *m, struct trace_seq *s)
return ret;
}
+enum print_line_t trace_print_bputs_msg_only(struct trace_iterator *iter)
+{
+ struct trace_seq *s = &iter->seq;
+ struct trace_entry *entry = iter->ent;
+ struct bputs_entry *field;
+ int ret;
+
+ trace_assign_type(field, entry);
+
+ ret = trace_seq_puts(s, field->str);
+ if (!ret)
+ return TRACE_TYPE_PARTIAL_LINE;
+
+ return TRACE_TYPE_HANDLED;
+}
+
enum print_line_t trace_print_bprintk_msg_only(struct trace_iterator *iter)
{
struct trace_seq *s = &iter->seq;
@@ -397,6 +413,32 @@ ftrace_print_hex_seq(struct trace_seq *p, const unsigned char *buf, int buf_len)
}
EXPORT_SYMBOL(ftrace_print_hex_seq);
+int ftrace_raw_output_prep(struct trace_iterator *iter,
+ struct trace_event *trace_event)
+{
+ struct ftrace_event_call *event;
+ struct trace_seq *s = &iter->seq;
+ struct trace_seq *p = &iter->tmp_seq;
+ struct trace_entry *entry;
+ int ret;
+
+ event = container_of(trace_event, struct ftrace_event_call, event);
+ entry = iter->ent;
+
+ if (entry->type != event->event.type) {
+ WARN_ON_ONCE(1);
+ return TRACE_TYPE_UNHANDLED;
+ }
+
+ trace_seq_init(p);
+ ret = trace_seq_printf(s, "%s: ", event->name);
+ if (!ret)
+ return TRACE_TYPE_PARTIAL_LINE;
+
+ return 0;
+}
+EXPORT_SYMBOL(ftrace_raw_output_prep);
+
#ifdef CONFIG_KRETPROBES
static inline const char *kretprobed(const char *name)
{
@@ -617,7 +659,7 @@ lat_print_timestamp(struct trace_iterator *iter, u64 next_ts)
{
unsigned long verbose = trace_flags & TRACE_ITER_VERBOSE;
unsigned long in_ns = iter->iter_flags & TRACE_FILE_TIME_IN_NS;
- unsigned long long abs_ts = iter->ts - iter->tr->time_start;
+ unsigned long long abs_ts = iter->ts - iter->trace_buffer->time_start;
unsigned long long rel_ts = next_ts - iter->ts;
struct trace_seq *s = &iter->seq;
@@ -783,12 +825,12 @@ static int trace_search_list(struct list_head **list)
void trace_event_read_lock(void)
{
- down_read(&trace_event_mutex);
+ down_read(&trace_event_sem);
}
void trace_event_read_unlock(void)
{
- up_read(&trace_event_mutex);
+ up_read(&trace_event_sem);
}
/**
@@ -811,7 +853,7 @@ int register_ftrace_event(struct trace_event *event)
unsigned key;
int ret = 0;
- down_write(&trace_event_mutex);
+ down_write(&trace_event_sem);
if (WARN_ON(!event))
goto out;
@@ -866,14 +908,14 @@ int register_ftrace_event(struct trace_event *event)
ret = event->type;
out:
- up_write(&trace_event_mutex);
+ up_write(&trace_event_sem);
return ret;
}
EXPORT_SYMBOL_GPL(register_ftrace_event);
/*
- * Used by module code with the trace_event_mutex held for write.
+ * Used by module code with the trace_event_sem held for write.
*/
int __unregister_ftrace_event(struct trace_event *event)
{
@@ -888,9 +930,9 @@ int __unregister_ftrace_event(struct trace_event *event)
*/
int unregister_ftrace_event(struct trace_event *event)
{
- down_write(&trace_event_mutex);
+ down_write(&trace_event_sem);
__unregister_ftrace_event(event);
- up_write(&trace_event_mutex);
+ up_write(&trace_event_sem);
return 0;
}
@@ -1217,6 +1259,64 @@ static struct trace_event trace_user_stack_event = {
.funcs = &trace_user_stack_funcs,
};
+/* TRACE_BPUTS */
+static enum print_line_t
+trace_bputs_print(struct trace_iterator *iter, int flags,
+ struct trace_event *event)
+{
+ struct trace_entry *entry = iter->ent;
+ struct trace_seq *s = &iter->seq;
+ struct bputs_entry *field;
+
+ trace_assign_type(field, entry);
+
+ if (!seq_print_ip_sym(s, field->ip, flags))
+ goto partial;
+
+ if (!trace_seq_puts(s, ": "))
+ goto partial;
+
+ if (!trace_seq_puts(s, field->str))
+ goto partial;
+
+ return TRACE_TYPE_HANDLED;
+
+ partial:
+ return TRACE_TYPE_PARTIAL_LINE;
+}
+
+
+static enum print_line_t
+trace_bputs_raw(struct trace_iterator *iter, int flags,
+ struct trace_event *event)
+{
+ struct bputs_entry *field;
+ struct trace_seq *s = &iter->seq;
+
+ trace_assign_type(field, iter->ent);
+
+ if (!trace_seq_printf(s, ": %lx : ", field->ip))
+ goto partial;
+
+ if (!trace_seq_puts(s, field->str))
+ goto partial;
+
+ return TRACE_TYPE_HANDLED;
+
+ partial:
+ return TRACE_TYPE_PARTIAL_LINE;
+}
+
+static struct trace_event_functions trace_bputs_funcs = {
+ .trace = trace_bputs_print,
+ .raw = trace_bputs_raw,
+};
+
+static struct trace_event trace_bputs_event = {
+ .type = TRACE_BPUTS,
+ .funcs = &trace_bputs_funcs,
+};
+
/* TRACE_BPRINT */
static enum print_line_t
trace_bprint_print(struct trace_iterator *iter, int flags,
@@ -1329,6 +1429,7 @@ static struct trace_event *events[] __initdata = {
&trace_wake_event,
&trace_stack_event,
&trace_user_stack_event,
+ &trace_bputs_event,
&trace_bprint_event,
&trace_print_event,
NULL
diff --git a/kernel/trace/trace_output.h b/kernel/trace/trace_output.h
index c038eba0492b..127a9d8c8357 100644
--- a/kernel/trace/trace_output.h
+++ b/kernel/trace/trace_output.h
@@ -5,6 +5,8 @@
#include "trace.h"
extern enum print_line_t
+trace_print_bputs_msg_only(struct trace_iterator *iter);
+extern enum print_line_t
trace_print_bprintk_msg_only(struct trace_iterator *iter);
extern enum print_line_t
trace_print_printk_msg_only(struct trace_iterator *iter);
@@ -31,7 +33,7 @@ trace_print_lat_fmt(struct trace_seq *s, struct trace_entry *entry);
/* used by module unregistering */
extern int __unregister_ftrace_event(struct trace_event *event);
-extern struct rw_semaphore trace_event_mutex;
+extern struct rw_semaphore trace_event_sem;
#define MAX_MEMHEX_BYTES 8
#define HEX_CHARS (MAX_MEMHEX_BYTES*2 + 1)
diff --git a/kernel/trace/trace_sched_switch.c b/kernel/trace/trace_sched_switch.c
index 3374c792ccd8..4e98e3b257a3 100644
--- a/kernel/trace/trace_sched_switch.c
+++ b/kernel/trace/trace_sched_switch.c
@@ -28,7 +28,7 @@ tracing_sched_switch_trace(struct trace_array *tr,
unsigned long flags, int pc)
{
struct ftrace_event_call *call = &event_context_switch;
- struct ring_buffer *buffer = tr->buffer;
+ struct ring_buffer *buffer = tr->trace_buffer.buffer;
struct ring_buffer_event *event;
struct ctx_switch_entry *entry;
@@ -69,7 +69,7 @@ probe_sched_switch(void *ignore, struct task_struct *prev, struct task_struct *n
pc = preempt_count();
local_irq_save(flags);
cpu = raw_smp_processor_id();
- data = ctx_trace->data[cpu];
+ data = per_cpu_ptr(ctx_trace->trace_buffer.data, cpu);
if (likely(!atomic_read(&data->disabled)))
tracing_sched_switch_trace(ctx_trace, prev, next, flags, pc);
@@ -86,7 +86,7 @@ tracing_sched_wakeup_trace(struct trace_array *tr,
struct ftrace_event_call *call = &event_wakeup;
struct ring_buffer_event *event;
struct ctx_switch_entry *entry;
- struct ring_buffer *buffer = tr->buffer;
+ struct ring_buffer *buffer = tr->trace_buffer.buffer;
event = trace_buffer_lock_reserve(buffer, TRACE_WAKE,
sizeof(*entry), flags, pc);
@@ -123,7 +123,7 @@ probe_sched_wakeup(void *ignore, struct task_struct *wakee, int success)
pc = preempt_count();
local_irq_save(flags);
cpu = raw_smp_processor_id();
- data = ctx_trace->data[cpu];
+ data = per_cpu_ptr(ctx_trace->trace_buffer.data, cpu);
if (likely(!atomic_read(&data->disabled)))
tracing_sched_wakeup_trace(ctx_trace, wakee, current,
diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c
index 75aa97fbe1a1..fee77e15d815 100644
--- a/kernel/trace/trace_sched_wakeup.c
+++ b/kernel/trace/trace_sched_wakeup.c
@@ -36,7 +36,8 @@ static void __wakeup_reset(struct trace_array *tr);
static int wakeup_graph_entry(struct ftrace_graph_ent *trace);
static void wakeup_graph_return(struct ftrace_graph_ret *trace);
-static int save_lat_flag;
+static int save_flags;
+static bool function_enabled;
#define TRACE_DISPLAY_GRAPH 1
@@ -89,7 +90,7 @@ func_prolog_preempt_disable(struct trace_array *tr,
if (cpu != wakeup_current_cpu)
goto out_enable;
- *data = tr->data[cpu];
+ *data = per_cpu_ptr(tr->trace_buffer.data, cpu);
disabled = atomic_inc_return(&(*data)->disabled);
if (unlikely(disabled != 1))
goto out;
@@ -134,15 +135,60 @@ static struct ftrace_ops trace_ops __read_mostly =
};
#endif /* CONFIG_FUNCTION_TRACER */
-static int start_func_tracer(int graph)
+static int register_wakeup_function(int graph, int set)
{
int ret;
- if (!graph)
- ret = register_ftrace_function(&trace_ops);
- else
+ /* 'set' is set if TRACE_ITER_FUNCTION is about to be set */
+ if (function_enabled || (!set && !(trace_flags & TRACE_ITER_FUNCTION)))
+ return 0;
+
+ if (graph)
ret = register_ftrace_graph(&wakeup_graph_return,
&wakeup_graph_entry);
+ else
+ ret = register_ftrace_function(&trace_ops);
+
+ if (!ret)
+ function_enabled = true;
+
+ return ret;
+}
+
+static void unregister_wakeup_function(int graph)
+{
+ if (!function_enabled)
+ return;
+
+ if (graph)
+ unregister_ftrace_graph();
+ else
+ unregister_ftrace_function(&trace_ops);
+
+ function_enabled = false;
+}
+
+static void wakeup_function_set(int set)
+{
+ if (set)
+ register_wakeup_function(is_graph(), 1);
+ else
+ unregister_wakeup_function(is_graph());
+}
+
+static int wakeup_flag_changed(struct tracer *tracer, u32 mask, int set)
+{
+ if (mask & TRACE_ITER_FUNCTION)
+ wakeup_function_set(set);
+
+ return trace_keep_overwrite(tracer, mask, set);
+}
+
+static int start_func_tracer(int graph)
+{
+ int ret;
+
+ ret = register_wakeup_function(graph, 0);
if (!ret && tracing_is_enabled())
tracer_enabled = 1;
@@ -156,10 +202,7 @@ static void stop_func_tracer(int graph)
{
tracer_enabled = 0;
- if (!graph)
- unregister_ftrace_function(&trace_ops);
- else
- unregister_ftrace_graph();
+ unregister_wakeup_function(graph);
}
#ifdef CONFIG_FUNCTION_GRAPH_TRACER
@@ -353,7 +396,7 @@ probe_wakeup_sched_switch(void *ignore,
/* disable local data, not wakeup_cpu data */
cpu = raw_smp_processor_id();
- disabled = atomic_inc_return(&wakeup_trace->data[cpu]->disabled);
+ disabled = atomic_inc_return(&per_cpu_ptr(wakeup_trace->trace_buffer.data, cpu)->disabled);
if (likely(disabled != 1))
goto out;
@@ -365,7 +408,7 @@ probe_wakeup_sched_switch(void *ignore,
goto out_unlock;
/* The task we are waiting for is waking up */
- data = wakeup_trace->data[wakeup_cpu];
+ data = per_cpu_ptr(wakeup_trace->trace_buffer.data, wakeup_cpu);
__trace_function(wakeup_trace, CALLER_ADDR0, CALLER_ADDR1, flags, pc);
tracing_sched_switch_trace(wakeup_trace, prev, next, flags, pc);
@@ -387,7 +430,7 @@ out_unlock:
arch_spin_unlock(&wakeup_lock);
local_irq_restore(flags);
out:
- atomic_dec(&wakeup_trace->data[cpu]->disabled);
+ atomic_dec(&per_cpu_ptr(wakeup_trace->trace_buffer.data, cpu)->disabled);
}
static void __wakeup_reset(struct trace_array *tr)
@@ -405,7 +448,7 @@ static void wakeup_reset(struct trace_array *tr)
{
unsigned long flags;
- tracing_reset_online_cpus(tr);
+ tracing_reset_online_cpus(&tr->trace_buffer);
local_irq_save(flags);
arch_spin_lock(&wakeup_lock);
@@ -435,7 +478,7 @@ probe_wakeup(void *ignore, struct task_struct *p, int success)
return;
pc = preempt_count();
- disabled = atomic_inc_return(&wakeup_trace->data[cpu]->disabled);
+ disabled = atomic_inc_return(&per_cpu_ptr(wakeup_trace->trace_buffer.data, cpu)->disabled);
if (unlikely(disabled != 1))
goto out;
@@ -458,7 +501,7 @@ probe_wakeup(void *ignore, struct task_struct *p, int success)
local_save_flags(flags);
- data = wakeup_trace->data[wakeup_cpu];
+ data = per_cpu_ptr(wakeup_trace->trace_buffer.data, wakeup_cpu);
data->preempt_timestamp = ftrace_now(cpu);
tracing_sched_wakeup_trace(wakeup_trace, p, current, flags, pc);
@@ -472,7 +515,7 @@ probe_wakeup(void *ignore, struct task_struct *p, int success)
out_locked:
arch_spin_unlock(&wakeup_lock);
out:
- atomic_dec(&wakeup_trace->data[cpu]->disabled);
+ atomic_dec(&per_cpu_ptr(wakeup_trace->trace_buffer.data, cpu)->disabled);
}
static void start_wakeup_tracer(struct trace_array *tr)
@@ -540,8 +583,11 @@ static void stop_wakeup_tracer(struct trace_array *tr)
static int __wakeup_tracer_init(struct trace_array *tr)
{
- save_lat_flag = trace_flags & TRACE_ITER_LATENCY_FMT;
- trace_flags |= TRACE_ITER_LATENCY_FMT;
+ save_flags = trace_flags;
+
+ /* non overwrite screws up the latency tracers */
+ set_tracer_flag(tr, TRACE_ITER_OVERWRITE, 1);
+ set_tracer_flag(tr, TRACE_ITER_LATENCY_FMT, 1);
tracing_max_latency = 0;
wakeup_trace = tr;
@@ -563,12 +609,15 @@ static int wakeup_rt_tracer_init(struct trace_array *tr)
static void wakeup_tracer_reset(struct trace_array *tr)
{
+ int lat_flag = save_flags & TRACE_ITER_LATENCY_FMT;
+ int overwrite_flag = save_flags & TRACE_ITER_OVERWRITE;
+
stop_wakeup_tracer(tr);
/* make sure we put back any tasks we are tracing */
wakeup_reset(tr);
- if (!save_lat_flag)
- trace_flags &= ~TRACE_ITER_LATENCY_FMT;
+ set_tracer_flag(tr, TRACE_ITER_LATENCY_FMT, lat_flag);
+ set_tracer_flag(tr, TRACE_ITER_OVERWRITE, overwrite_flag);
}
static void wakeup_tracer_start(struct trace_array *tr)
@@ -594,6 +643,7 @@ static struct tracer wakeup_tracer __read_mostly =
.print_line = wakeup_print_line,
.flags = &tracer_flags,
.set_flag = wakeup_set_flag,
+ .flag_changed = wakeup_flag_changed,
#ifdef CONFIG_FTRACE_SELFTEST
.selftest = trace_selftest_startup_wakeup,
#endif
@@ -615,6 +665,7 @@ static struct tracer wakeup_rt_tracer __read_mostly =
.print_line = wakeup_print_line,
.flags = &tracer_flags,
.set_flag = wakeup_set_flag,
+ .flag_changed = wakeup_flag_changed,
#ifdef CONFIG_FTRACE_SELFTEST
.selftest = trace_selftest_startup_wakeup,
#endif
diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c
index 51c819c12c29..55e2cf66967b 100644
--- a/kernel/trace/trace_selftest.c
+++ b/kernel/trace/trace_selftest.c
@@ -21,13 +21,13 @@ static inline int trace_valid_entry(struct trace_entry *entry)
return 0;
}
-static int trace_test_buffer_cpu(struct trace_array *tr, int cpu)
+static int trace_test_buffer_cpu(struct trace_buffer *buf, int cpu)
{
struct ring_buffer_event *event;
struct trace_entry *entry;
unsigned int loops = 0;
- while ((event = ring_buffer_consume(tr->buffer, cpu, NULL, NULL))) {
+ while ((event = ring_buffer_consume(buf->buffer, cpu, NULL, NULL))) {
entry = ring_buffer_event_data(event);
/*
@@ -58,7 +58,7 @@ static int trace_test_buffer_cpu(struct trace_array *tr, int cpu)
* Test the trace buffer to see if all the elements
* are still sane.
*/
-static int trace_test_buffer(struct trace_array *tr, unsigned long *count)
+static int trace_test_buffer(struct trace_buffer *buf, unsigned long *count)
{
unsigned long flags, cnt = 0;
int cpu, ret = 0;
@@ -67,7 +67,7 @@ static int trace_test_buffer(struct trace_array *tr, unsigned long *count)
local_irq_save(flags);
arch_spin_lock(&ftrace_max_lock);
- cnt = ring_buffer_entries(tr->buffer);
+ cnt = ring_buffer_entries(buf->buffer);
/*
* The trace_test_buffer_cpu runs a while loop to consume all data.
@@ -78,7 +78,7 @@ static int trace_test_buffer(struct trace_array *tr, unsigned long *count)
*/
tracing_off();
for_each_possible_cpu(cpu) {
- ret = trace_test_buffer_cpu(tr, cpu);
+ ret = trace_test_buffer_cpu(buf, cpu);
if (ret)
break;
}
@@ -355,7 +355,7 @@ int trace_selftest_startup_dynamic_tracing(struct tracer *trace,
msleep(100);
/* we should have nothing in the buffer */
- ret = trace_test_buffer(tr, &count);
+ ret = trace_test_buffer(&tr->trace_buffer, &count);
if (ret)
goto out;
@@ -376,7 +376,7 @@ int trace_selftest_startup_dynamic_tracing(struct tracer *trace,
ftrace_enabled = 0;
/* check the trace buffer */
- ret = trace_test_buffer(tr, &count);
+ ret = trace_test_buffer(&tr->trace_buffer, &count);
tracing_start();
/* we should only have one item */
@@ -666,7 +666,7 @@ trace_selftest_startup_function(struct tracer *trace, struct trace_array *tr)
ftrace_enabled = 0;
/* check the trace buffer */
- ret = trace_test_buffer(tr, &count);
+ ret = trace_test_buffer(&tr->trace_buffer, &count);
trace->reset(tr);
tracing_start();
@@ -703,8 +703,6 @@ trace_selftest_startup_function(struct tracer *trace, struct trace_array *tr)
/* Maximum number of functions to trace before diagnosing a hang */
#define GRAPH_MAX_FUNC_TEST 100000000
-static void
-__ftrace_dump(bool disable_tracing, enum ftrace_dump_mode oops_dump_mode);
static unsigned int graph_hang_thresh;
/* Wrap the real function entry probe to avoid possible hanging */
@@ -714,8 +712,11 @@ static int trace_graph_entry_watchdog(struct ftrace_graph_ent *trace)
if (unlikely(++graph_hang_thresh > GRAPH_MAX_FUNC_TEST)) {
ftrace_graph_stop();
printk(KERN_WARNING "BUG: Function graph tracer hang!\n");
- if (ftrace_dump_on_oops)
- __ftrace_dump(false, DUMP_ALL);
+ if (ftrace_dump_on_oops) {
+ ftrace_dump(DUMP_ALL);
+ /* ftrace_dump() disables tracing */
+ tracing_on();
+ }
return 0;
}
@@ -737,7 +738,7 @@ trace_selftest_startup_function_graph(struct tracer *trace,
* Simulate the init() callback but we attach a watchdog callback
* to detect and recover from possible hangs
*/
- tracing_reset_online_cpus(tr);
+ tracing_reset_online_cpus(&tr->trace_buffer);
set_graph_array(tr);
ret = register_ftrace_graph(&trace_graph_return,
&trace_graph_entry_watchdog);
@@ -760,7 +761,7 @@ trace_selftest_startup_function_graph(struct tracer *trace,
tracing_stop();
/* check the trace buffer */
- ret = trace_test_buffer(tr, &count);
+ ret = trace_test_buffer(&tr->trace_buffer, &count);
trace->reset(tr);
tracing_start();
@@ -815,9 +816,9 @@ trace_selftest_startup_irqsoff(struct tracer *trace, struct trace_array *tr)
/* stop the tracing. */
tracing_stop();
/* check both trace buffers */
- ret = trace_test_buffer(tr, NULL);
+ ret = trace_test_buffer(&tr->trace_buffer, NULL);
if (!ret)
- ret = trace_test_buffer(&max_tr, &count);
+ ret = trace_test_buffer(&tr->max_buffer, &count);
trace->reset(tr);
tracing_start();
@@ -877,9 +878,9 @@ trace_selftest_startup_preemptoff(struct tracer *trace, struct trace_array *tr)
/* stop the tracing. */
tracing_stop();
/* check both trace buffers */
- ret = trace_test_buffer(tr, NULL);
+ ret = trace_test_buffer(&tr->trace_buffer, NULL);
if (!ret)
- ret = trace_test_buffer(&max_tr, &count);
+ ret = trace_test_buffer(&tr->max_buffer, &count);
trace->reset(tr);
tracing_start();
@@ -943,11 +944,11 @@ trace_selftest_startup_preemptirqsoff(struct tracer *trace, struct trace_array *
/* stop the tracing. */
tracing_stop();
/* check both trace buffers */
- ret = trace_test_buffer(tr, NULL);
+ ret = trace_test_buffer(&tr->trace_buffer, NULL);
if (ret)
goto out;
- ret = trace_test_buffer(&max_tr, &count);
+ ret = trace_test_buffer(&tr->max_buffer, &count);
if (ret)
goto out;
@@ -973,11 +974,11 @@ trace_selftest_startup_preemptirqsoff(struct tracer *trace, struct trace_array *
/* stop the tracing. */
tracing_stop();
/* check both trace buffers */
- ret = trace_test_buffer(tr, NULL);
+ ret = trace_test_buffer(&tr->trace_buffer, NULL);
if (ret)
goto out;
- ret = trace_test_buffer(&max_tr, &count);
+ ret = trace_test_buffer(&tr->max_buffer, &count);
if (!ret && !count) {
printk(KERN_CONT ".. no entries found ..");
@@ -1084,10 +1085,10 @@ trace_selftest_startup_wakeup(struct tracer *trace, struct trace_array *tr)
/* stop the tracing. */
tracing_stop();
/* check both trace buffers */
- ret = trace_test_buffer(tr, NULL);
+ ret = trace_test_buffer(&tr->trace_buffer, NULL);
printk("ret = %d\n", ret);
if (!ret)
- ret = trace_test_buffer(&max_tr, &count);
+ ret = trace_test_buffer(&tr->max_buffer, &count);
trace->reset(tr);
@@ -1126,7 +1127,7 @@ trace_selftest_startup_sched_switch(struct tracer *trace, struct trace_array *tr
/* stop the tracing. */
tracing_stop();
/* check the trace buffer */
- ret = trace_test_buffer(tr, &count);
+ ret = trace_test_buffer(&tr->trace_buffer, &count);
trace->reset(tr);
tracing_start();
diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c
index 42ca822fc701..b20428c5efe2 100644
--- a/kernel/trace/trace_stack.c
+++ b/kernel/trace/trace_stack.c
@@ -20,13 +20,24 @@
#define STACK_TRACE_ENTRIES 500
+#ifdef CC_USING_FENTRY
+# define fentry 1
+#else
+# define fentry 0
+#endif
+
static unsigned long stack_dump_trace[STACK_TRACE_ENTRIES+1] =
{ [0 ... (STACK_TRACE_ENTRIES)] = ULONG_MAX };
static unsigned stack_dump_index[STACK_TRACE_ENTRIES];
+/*
+ * Reserve one entry for the passed in ip. This will allow
+ * us to remove most or all of the stack size overhead
+ * added by the stack tracer itself.
+ */
static struct stack_trace max_stack_trace = {
- .max_entries = STACK_TRACE_ENTRIES,
- .entries = stack_dump_trace,
+ .max_entries = STACK_TRACE_ENTRIES - 1,
+ .entries = &stack_dump_trace[1],
};
static unsigned long max_stack_size;
@@ -39,25 +50,34 @@ static DEFINE_MUTEX(stack_sysctl_mutex);
int stack_tracer_enabled;
static int last_stack_tracer_enabled;
-static inline void check_stack(void)
+static inline void
+check_stack(unsigned long ip, unsigned long *stack)
{
unsigned long this_size, flags;
unsigned long *p, *top, *start;
+ static int tracer_frame;
+ int frame_size = ACCESS_ONCE(tracer_frame);
int i;
- this_size = ((unsigned long)&this_size) & (THREAD_SIZE-1);
+ this_size = ((unsigned long)stack) & (THREAD_SIZE-1);
this_size = THREAD_SIZE - this_size;
+ /* Remove the frame of the tracer */
+ this_size -= frame_size;
if (this_size <= max_stack_size)
return;
/* we do not handle interrupt stacks yet */
- if (!object_is_on_stack(&this_size))
+ if (!object_is_on_stack(stack))
return;
local_irq_save(flags);
arch_spin_lock(&max_stack_lock);
+ /* In case another CPU set the tracer_frame on us */
+ if (unlikely(!frame_size))
+ this_size -= tracer_frame;
+
/* a race could have already updated it */
if (this_size <= max_stack_size)
goto out;
@@ -70,10 +90,18 @@ static inline void check_stack(void)
save_stack_trace(&max_stack_trace);
/*
+ * Add the passed in ip from the function tracer.
+ * Searching for this on the stack will skip over
+ * most of the overhead from the stack tracer itself.
+ */
+ stack_dump_trace[0] = ip;
+ max_stack_trace.nr_entries++;
+
+ /*
* Now find where in the stack these are.
*/
i = 0;
- start = &this_size;
+ start = stack;
top = (unsigned long *)
(((unsigned long)start & ~(THREAD_SIZE-1)) + THREAD_SIZE);
@@ -97,6 +125,18 @@ static inline void check_stack(void)
found = 1;
/* Start the search from here */
start = p + 1;
+ /*
+ * We do not want to show the overhead
+ * of the stack tracer stack in the
+ * max stack. If we haven't figured
+ * out what that is, then figure it out
+ * now.
+ */
+ if (unlikely(!tracer_frame) && i == 1) {
+ tracer_frame = (p - stack) *
+ sizeof(unsigned long);
+ max_stack_size -= tracer_frame;
+ }
}
}
@@ -113,6 +153,7 @@ static void
stack_trace_call(unsigned long ip, unsigned long parent_ip,
struct ftrace_ops *op, struct pt_regs *pt_regs)
{
+ unsigned long stack;
int cpu;
preempt_disable_notrace();
@@ -122,7 +163,26 @@ stack_trace_call(unsigned long ip, unsigned long parent_ip,
if (per_cpu(trace_active, cpu)++ != 0)
goto out;
- check_stack();
+ /*
+ * When fentry is used, the traced function does not get
+ * its stack frame set up, and we lose the parent.
+ * The ip is pretty useless because the function tracer
+ * was called before that function set up its stack frame.
+ * In this case, we use the parent ip.
+ *
+ * By adding the return address of either the parent ip
+ * or the current ip we can disregard most of the stack usage
+ * caused by the stack tracer itself.
+ *
+ * The function tracer always reports the address of where the
+ * mcount call was, but the stack will hold the return address.
+ */
+ if (fentry)
+ ip = parent_ip;
+ else
+ ip += MCOUNT_INSN_SIZE;
+
+ check_stack(ip, &stack);
out:
per_cpu(trace_active, cpu)--;
@@ -322,7 +382,7 @@ static const struct file_operations stack_trace_filter_fops = {
.open = stack_trace_filter_open,
.read = seq_read,
.write = ftrace_filter_write,
- .llseek = ftrace_regex_lseek,
+ .llseek = ftrace_filter_lseek,
.release = ftrace_regex_release,
};
@@ -371,6 +431,8 @@ static __init int stack_trace_init(void)
struct dentry *d_tracer;
d_tracer = tracing_init_dentry();
+ if (!d_tracer)
+ return 0;
trace_create_file("stack_max_size", 0644, d_tracer,
&max_stack_size, &stack_max_size_fops);
diff --git a/kernel/trace/trace_stat.c b/kernel/trace/trace_stat.c
index 96cffb269e73..847f88a6194b 100644
--- a/kernel/trace/trace_stat.c
+++ b/kernel/trace/trace_stat.c
@@ -307,6 +307,8 @@ static int tracing_stat_init(void)
struct dentry *d_tracing;
d_tracing = tracing_init_dentry();
+ if (!d_tracing)
+ return 0;
stat_dir = debugfs_create_dir("trace_stat", d_tracing);
if (!stat_dir)
diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c
index 7a809e321058..8f2ac73c7a5f 100644
--- a/kernel/trace/trace_syscalls.c
+++ b/kernel/trace/trace_syscalls.c
@@ -12,10 +12,6 @@
#include "trace.h"
static DEFINE_MUTEX(syscall_trace_lock);
-static int sys_refcount_enter;
-static int sys_refcount_exit;
-static DECLARE_BITMAP(enabled_enter_syscalls, NR_syscalls);
-static DECLARE_BITMAP(enabled_exit_syscalls, NR_syscalls);
static int syscall_enter_register(struct ftrace_event_call *event,
enum trace_reg type, void *data);
@@ -41,7 +37,7 @@ static inline bool arch_syscall_match_sym_name(const char *sym, const char *name
/*
* Only compare after the "sys" prefix. Archs that use
* syscall wrappers may have syscalls symbols aliases prefixed
- * with "SyS" instead of "sys", leading to an unwanted
+ * with ".SyS" or ".sys" instead of "sys", leading to an unwanted
* mismatch.
*/
return !strcmp(sym + 3, name + 3);
@@ -265,7 +261,7 @@ static void free_syscall_print_fmt(struct ftrace_event_call *call)
kfree(call->print_fmt);
}
-static int syscall_enter_define_fields(struct ftrace_event_call *call)
+static int __init syscall_enter_define_fields(struct ftrace_event_call *call)
{
struct syscall_trace_enter trace;
struct syscall_metadata *meta = call->data;
@@ -288,7 +284,7 @@ static int syscall_enter_define_fields(struct ftrace_event_call *call)
return ret;
}
-static int syscall_exit_define_fields(struct ftrace_event_call *call)
+static int __init syscall_exit_define_fields(struct ftrace_event_call *call)
{
struct syscall_trace_exit trace;
int ret;
@@ -303,8 +299,9 @@ static int syscall_exit_define_fields(struct ftrace_event_call *call)
return ret;
}
-static void ftrace_syscall_enter(void *ignore, struct pt_regs *regs, long id)
+static void ftrace_syscall_enter(void *data, struct pt_regs *regs, long id)
{
+ struct trace_array *tr = data;
struct syscall_trace_enter *entry;
struct syscall_metadata *sys_data;
struct ring_buffer_event *event;
@@ -315,7 +312,7 @@ static void ftrace_syscall_enter(void *ignore, struct pt_regs *regs, long id)
syscall_nr = trace_get_syscall_nr(current, regs);
if (syscall_nr < 0)
return;
- if (!test_bit(syscall_nr, enabled_enter_syscalls))
+ if (!test_bit(syscall_nr, tr->enabled_enter_syscalls))
return;
sys_data = syscall_nr_to_meta(syscall_nr);
@@ -324,7 +321,8 @@ static void ftrace_syscall_enter(void *ignore, struct pt_regs *regs, long id)
size = sizeof(*entry) + sizeof(unsigned long) * sys_data->nb_args;
- event = trace_current_buffer_lock_reserve(&buffer,
+ buffer = tr->trace_buffer.buffer;
+ event = trace_buffer_lock_reserve(buffer,
sys_data->enter_event->event.type, size, 0, 0);
if (!event)
return;
@@ -338,8 +336,9 @@ static void ftrace_syscall_enter(void *ignore, struct pt_regs *regs, long id)
trace_current_buffer_unlock_commit(buffer, event, 0, 0);
}
-static void ftrace_syscall_exit(void *ignore, struct pt_regs *regs, long ret)
+static void ftrace_syscall_exit(void *data, struct pt_regs *regs, long ret)
{
+ struct trace_array *tr = data;
struct syscall_trace_exit *entry;
struct syscall_metadata *sys_data;
struct ring_buffer_event *event;
@@ -349,14 +348,15 @@ static void ftrace_syscall_exit(void *ignore, struct pt_regs *regs, long ret)
syscall_nr = trace_get_syscall_nr(current, regs);
if (syscall_nr < 0)
return;
- if (!test_bit(syscall_nr, enabled_exit_syscalls))
+ if (!test_bit(syscall_nr, tr->enabled_exit_syscalls))
return;
sys_data = syscall_nr_to_meta(syscall_nr);
if (!sys_data)
return;
- event = trace_current_buffer_lock_reserve(&buffer,
+ buffer = tr->trace_buffer.buffer;
+ event = trace_buffer_lock_reserve(buffer,
sys_data->exit_event->event.type, sizeof(*entry), 0, 0);
if (!event)
return;
@@ -370,8 +370,10 @@ static void ftrace_syscall_exit(void *ignore, struct pt_regs *regs, long ret)
trace_current_buffer_unlock_commit(buffer, event, 0, 0);
}
-static int reg_event_syscall_enter(struct ftrace_event_call *call)
+static int reg_event_syscall_enter(struct ftrace_event_file *file,
+ struct ftrace_event_call *call)
{
+ struct trace_array *tr = file->tr;
int ret = 0;
int num;
@@ -379,33 +381,37 @@ static int reg_event_syscall_enter(struct ftrace_event_call *call)
if (WARN_ON_ONCE(num < 0 || num >= NR_syscalls))
return -ENOSYS;
mutex_lock(&syscall_trace_lock);
- if (!sys_refcount_enter)
- ret = register_trace_sys_enter(ftrace_syscall_enter, NULL);
+ if (!tr->sys_refcount_enter)
+ ret = register_trace_sys_enter(ftrace_syscall_enter, tr);
if (!ret) {
- set_bit(num, enabled_enter_syscalls);
- sys_refcount_enter++;
+ set_bit(num, tr->enabled_enter_syscalls);
+ tr->sys_refcount_enter++;
}
mutex_unlock(&syscall_trace_lock);
return ret;
}
-static void unreg_event_syscall_enter(struct ftrace_event_call *call)
+static void unreg_event_syscall_enter(struct ftrace_event_file *file,
+ struct ftrace_event_call *call)
{
+ struct trace_array *tr = file->tr;
int num;
num = ((struct syscall_metadata *)call->data)->syscall_nr;
if (WARN_ON_ONCE(num < 0 || num >= NR_syscalls))
return;
mutex_lock(&syscall_trace_lock);
- sys_refcount_enter--;
- clear_bit(num, enabled_enter_syscalls);
- if (!sys_refcount_enter)
- unregister_trace_sys_enter(ftrace_syscall_enter, NULL);
+ tr->sys_refcount_enter--;
+ clear_bit(num, tr->enabled_enter_syscalls);
+ if (!tr->sys_refcount_enter)
+ unregister_trace_sys_enter(ftrace_syscall_enter, tr);
mutex_unlock(&syscall_trace_lock);
}
-static int reg_event_syscall_exit(struct ftrace_event_call *call)
+static int reg_event_syscall_exit(struct ftrace_event_file *file,
+ struct ftrace_event_call *call)
{
+ struct trace_array *tr = file->tr;
int ret = 0;
int num;
@@ -413,28 +419,30 @@ static int reg_event_syscall_exit(struct ftrace_event_call *call)
if (WARN_ON_ONCE(num < 0 || num >= NR_syscalls))
return -ENOSYS;
mutex_lock(&syscall_trace_lock);
- if (!sys_refcount_exit)
- ret = register_trace_sys_exit(ftrace_syscall_exit, NULL);
+ if (!tr->sys_refcount_exit)
+ ret = register_trace_sys_exit(ftrace_syscall_exit, tr);
if (!ret) {
- set_bit(num, enabled_exit_syscalls);
- sys_refcount_exit++;
+ set_bit(num, tr->enabled_exit_syscalls);
+ tr->sys_refcount_exit++;
}
mutex_unlock(&syscall_trace_lock);
return ret;
}
-static void unreg_event_syscall_exit(struct ftrace_event_call *call)
+static void unreg_event_syscall_exit(struct ftrace_event_file *file,
+ struct ftrace_event_call *call)
{
+ struct trace_array *tr = file->tr;
int num;
num = ((struct syscall_metadata *)call->data)->syscall_nr;
if (WARN_ON_ONCE(num < 0 || num >= NR_syscalls))
return;
mutex_lock(&syscall_trace_lock);
- sys_refcount_exit--;
- clear_bit(num, enabled_exit_syscalls);
- if (!sys_refcount_exit)
- unregister_trace_sys_exit(ftrace_syscall_exit, NULL);
+ tr->sys_refcount_exit--;
+ clear_bit(num, tr->enabled_exit_syscalls);
+ if (!tr->sys_refcount_exit)
+ unregister_trace_sys_exit(ftrace_syscall_exit, tr);
mutex_unlock(&syscall_trace_lock);
}
@@ -471,7 +479,7 @@ struct trace_event_functions exit_syscall_print_funcs = {
.trace = print_syscall_exit,
};
-struct ftrace_event_class event_class_syscall_enter = {
+struct ftrace_event_class __refdata event_class_syscall_enter = {
.system = "syscalls",
.reg = syscall_enter_register,
.define_fields = syscall_enter_define_fields,
@@ -479,7 +487,7 @@ struct ftrace_event_class event_class_syscall_enter = {
.raw_init = init_syscall_trace,
};
-struct ftrace_event_class event_class_syscall_exit = {
+struct ftrace_event_class __refdata event_class_syscall_exit = {
.system = "syscalls",
.reg = syscall_exit_register,
.define_fields = syscall_exit_define_fields,
@@ -685,11 +693,13 @@ static void perf_sysexit_disable(struct ftrace_event_call *call)
static int syscall_enter_register(struct ftrace_event_call *event,
enum trace_reg type, void *data)
{
+ struct ftrace_event_file *file = data;
+
switch (type) {
case TRACE_REG_REGISTER:
- return reg_event_syscall_enter(event);
+ return reg_event_syscall_enter(file, event);
case TRACE_REG_UNREGISTER:
- unreg_event_syscall_enter(event);
+ unreg_event_syscall_enter(file, event);
return 0;
#ifdef CONFIG_PERF_EVENTS
@@ -711,11 +721,13 @@ static int syscall_enter_register(struct ftrace_event_call *event,
static int syscall_exit_register(struct ftrace_event_call *event,
enum trace_reg type, void *data)
{
+ struct ftrace_event_file *file = data;
+
switch (type) {
case TRACE_REG_REGISTER:
- return reg_event_syscall_exit(event);
+ return reg_event_syscall_exit(file, event);
case TRACE_REG_UNREGISTER:
- unreg_event_syscall_exit(event);
+ unreg_event_syscall_exit(file, event);
return 0;
#ifdef CONFIG_PERF_EVENTS
diff --git a/kernel/tracepoint.c b/kernel/tracepoint.c
index 0c05a4592047..29f26540e9c9 100644
--- a/kernel/tracepoint.c
+++ b/kernel/tracepoint.c
@@ -112,7 +112,8 @@ tracepoint_entry_add_probe(struct tracepoint_entry *entry,
int nr_probes = 0;
struct tracepoint_func *old, *new;
- WARN_ON(!probe);
+ if (WARN_ON(!probe))
+ return ERR_PTR(-EINVAL);
debug_print_probes(entry);
old = entry->funcs;
@@ -152,13 +153,18 @@ tracepoint_entry_remove_probe(struct tracepoint_entry *entry,
debug_print_probes(entry);
/* (N -> M), (N > 1, M >= 0) probes */
- for (nr_probes = 0; old[nr_probes].func; nr_probes++) {
- if (!probe ||
- (old[nr_probes].func == probe &&
- old[nr_probes].data == data))
- nr_del++;
+ if (probe) {
+ for (nr_probes = 0; old[nr_probes].func; nr_probes++) {
+ if (old[nr_probes].func == probe &&
+ old[nr_probes].data == data)
+ nr_del++;
+ }
}
+ /*
+ * If probe is NULL, then nr_probes = nr_del = 0, and then the
+ * entire entry will be removed.
+ */
if (nr_probes - nr_del == 0) {
/* N -> 0, (N > 1) */
entry->funcs = NULL;
@@ -173,8 +179,7 @@ tracepoint_entry_remove_probe(struct tracepoint_entry *entry,
if (new == NULL)
return ERR_PTR(-ENOMEM);
for (i = 0; old[i].func; i++)
- if (probe &&
- (old[i].func != probe || old[i].data != data))
+ if (old[i].func != probe || old[i].data != data)
new[j++] = old[i];
new[nr_probes - nr_del].func = NULL;
entry->refcount = nr_probes - nr_del;
diff --git a/kernel/user.c b/kernel/user.c
index e81978e8c03b..8e635a18ab52 100644
--- a/kernel/user.c
+++ b/kernel/user.c
@@ -51,6 +51,8 @@ struct user_namespace init_user_ns = {
.owner = GLOBAL_ROOT_UID,
.group = GLOBAL_ROOT_GID,
.proc_inum = PROC_USER_INIT_INO,
+ .may_mount_sysfs = true,
+ .may_mount_proc = true,
};
EXPORT_SYMBOL_GPL(init_user_ns);
diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c
index 8b650837083e..e134d8f365dd 100644
--- a/kernel/user_namespace.c
+++ b/kernel/user_namespace.c
@@ -21,10 +21,12 @@
#include <linux/uaccess.h>
#include <linux/ctype.h>
#include <linux/projid.h>
+#include <linux/fs_struct.h>
static struct kmem_cache *user_ns_cachep __read_mostly;
-static bool new_idmap_permitted(struct user_namespace *ns, int cap_setid,
+static bool new_idmap_permitted(const struct file *file,
+ struct user_namespace *ns, int cap_setid,
struct uid_gid_map *map);
static void set_cred_user_ns(struct cred *cred, struct user_namespace *user_ns)
@@ -60,6 +62,15 @@ int create_user_ns(struct cred *new)
kgid_t group = new->egid;
int ret;
+ /*
+ * Verify that we can not violate the policy of which files
+ * may be accessed that is specified by the root directory,
+ * by verifing that the root directory is at the root of the
+ * mount namespace which allows all files to be accessed.
+ */
+ if (current_chrooted())
+ return -EPERM;
+
/* The creator needs a mapping in the parent user namespace
* or else we won't be able to reasonably tell userspace who
* created a user_namespace.
@@ -86,6 +97,8 @@ int create_user_ns(struct cred *new)
set_cred_user_ns(new, ns);
+ update_mnt_policy(ns);
+
return 0;
}
@@ -600,10 +613,10 @@ static ssize_t map_write(struct file *file, const char __user *buf,
if (map->nr_extents != 0)
goto out;
- /* Require the appropriate privilege CAP_SETUID or CAP_SETGID
- * over the user namespace in order to set the id mapping.
+ /*
+ * Adjusting namespace settings requires capabilities on the target.
*/
- if (cap_valid(cap_setid) && !ns_capable(ns, cap_setid))
+ if (cap_valid(cap_setid) && !file_ns_capable(file, ns, CAP_SYS_ADMIN))
goto out;
/* Get a buffer */
@@ -688,7 +701,7 @@ static ssize_t map_write(struct file *file, const char __user *buf,
ret = -EPERM;
/* Validate the user is allowed to use user id's mapped to. */
- if (!new_idmap_permitted(ns, cap_setid, &new_map))
+ if (!new_idmap_permitted(file, ns, cap_setid, &new_map))
goto out;
/* Map the lower ids from the parent user namespace to the
@@ -775,7 +788,8 @@ ssize_t proc_projid_map_write(struct file *file, const char __user *buf, size_t
&ns->projid_map, &ns->parent->projid_map);
}
-static bool new_idmap_permitted(struct user_namespace *ns, int cap_setid,
+static bool new_idmap_permitted(const struct file *file,
+ struct user_namespace *ns, int cap_setid,
struct uid_gid_map *new_map)
{
/* Allow mapping to your own filesystem ids */
@@ -783,12 +797,12 @@ static bool new_idmap_permitted(struct user_namespace *ns, int cap_setid,
u32 id = new_map->extent[0].lower_first;
if (cap_setid == CAP_SETUID) {
kuid_t uid = make_kuid(ns->parent, id);
- if (uid_eq(uid, current_fsuid()))
+ if (uid_eq(uid, file->f_cred->fsuid))
return true;
}
else if (cap_setid == CAP_SETGID) {
kgid_t gid = make_kgid(ns->parent, id);
- if (gid_eq(gid, current_fsgid()))
+ if (gid_eq(gid, file->f_cred->fsgid))
return true;
}
}
@@ -799,8 +813,10 @@ static bool new_idmap_permitted(struct user_namespace *ns, int cap_setid,
/* Allow the specified ids if we have the appropriate capability
* (CAP_SETUID or CAP_SETGID) over the parent user namespace.
+ * And the opener of the id file also had the approprpiate capability.
*/
- if (ns_capable(ns->parent, cap_setid))
+ if (ns_capable(ns->parent, cap_setid) &&
+ file_ns_capable(file, ns->parent, cap_setid))
return true;
return false;
@@ -837,6 +853,9 @@ static int userns_install(struct nsproxy *nsproxy, void *ns)
if (atomic_read(&current->mm->mm_users) > 1)
return -EINVAL;
+ if (current->fs->users != 1)
+ return -EINVAL;
+
if (!ns_capable(user_ns, CAP_SYS_ADMIN))
return -EPERM;
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 81f2457811eb..154aa12af48e 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -41,7 +41,11 @@
#include <linux/debug_locks.h>
#include <linux/lockdep.h>
#include <linux/idr.h>
+#include <linux/jhash.h>
#include <linux/hashtable.h>
+#include <linux/rculist.h>
+#include <linux/nodemask.h>
+#include <linux/moduleparam.h>
#include "workqueue_internal.h"
@@ -58,12 +62,11 @@ enum {
* %WORKER_UNBOUND set and concurrency management disabled, and may
* be executing on any CPU. The pool behaves as an unbound one.
*
- * Note that DISASSOCIATED can be flipped only while holding
- * assoc_mutex to avoid changing binding state while
+ * Note that DISASSOCIATED should be flipped only while holding
+ * manager_mutex to avoid changing binding state while
* create_worker() is in progress.
*/
POOL_MANAGE_WORKERS = 1 << 0, /* need to manage workers */
- POOL_MANAGING_WORKERS = 1 << 1, /* managing workers */
POOL_DISASSOCIATED = 1 << 2, /* cpu can't serve workers */
POOL_FREEZING = 1 << 3, /* freeze in progress */
@@ -74,12 +77,14 @@ enum {
WORKER_PREP = 1 << 3, /* preparing to run works */
WORKER_CPU_INTENSIVE = 1 << 6, /* cpu intensive */
WORKER_UNBOUND = 1 << 7, /* worker is unbound */
+ WORKER_REBOUND = 1 << 8, /* worker was rebound */
- WORKER_NOT_RUNNING = WORKER_PREP | WORKER_UNBOUND |
- WORKER_CPU_INTENSIVE,
+ WORKER_NOT_RUNNING = WORKER_PREP | WORKER_CPU_INTENSIVE |
+ WORKER_UNBOUND | WORKER_REBOUND,
NR_STD_WORKER_POOLS = 2, /* # standard pools per cpu */
+ UNBOUND_POOL_HASH_ORDER = 6, /* hashed by pool->attrs */
BUSY_WORKER_HASH_ORDER = 6, /* 64 pointers */
MAX_IDLE_WORKERS_RATIO = 4, /* 1/4 of busy can be idle */
@@ -97,6 +102,8 @@ enum {
*/
RESCUER_NICE_LEVEL = -20,
HIGHPRI_NICE_LEVEL = -20,
+
+ WQ_NAME_LEN = 24,
};
/*
@@ -115,16 +122,26 @@ enum {
* cpu or grabbing pool->lock is enough for read access. If
* POOL_DISASSOCIATED is set, it's identical to L.
*
- * F: wq->flush_mutex protected.
+ * MG: pool->manager_mutex and pool->lock protected. Writes require both
+ * locks. Reads can happen under either lock.
+ *
+ * PL: wq_pool_mutex protected.
+ *
+ * PR: wq_pool_mutex protected for writes. Sched-RCU protected for reads.
+ *
+ * WQ: wq->mutex protected.
*
- * W: workqueue_lock protected.
+ * WR: wq->mutex protected for writes. Sched-RCU protected for reads.
+ *
+ * MD: wq_mayday_lock protected.
*/
/* struct worker is defined in workqueue_internal.h */
struct worker_pool {
spinlock_t lock; /* the pool lock */
- unsigned int cpu; /* I: the associated cpu */
+ int cpu; /* I: the associated cpu */
+ int node; /* I: the associated node ID */
int id; /* I: pool ID */
unsigned int flags; /* X: flags */
@@ -138,12 +155,18 @@ struct worker_pool {
struct timer_list idle_timer; /* L: worker idle timeout */
struct timer_list mayday_timer; /* L: SOS timer for workers */
- /* workers are chained either in busy_hash or idle_list */
+ /* a workers is either on busy_hash or idle_list, or the manager */
DECLARE_HASHTABLE(busy_hash, BUSY_WORKER_HASH_ORDER);
/* L: hash of busy workers */
- struct mutex assoc_mutex; /* protect POOL_DISASSOCIATED */
- struct ida worker_ida; /* L: for worker IDs */
+ /* see manage_workers() for details on the two manager mutexes */
+ struct mutex manager_arb; /* manager arbitration */
+ struct mutex manager_mutex; /* manager exclusion */
+ struct idr worker_idr; /* MG: worker IDs and iteration */
+
+ struct workqueue_attrs *attrs; /* I: worker attributes */
+ struct hlist_node hash_node; /* PL: unbound_pool_hash node */
+ int refcnt; /* PL: refcnt for unbound pools */
/*
* The current concurrency level. As it's likely to be accessed
@@ -151,6 +174,12 @@ struct worker_pool {
* cacheline.
*/
atomic_t nr_running ____cacheline_aligned_in_smp;
+
+ /*
+ * Destruction of pool is sched-RCU protected to allow dereferences
+ * from get_work_pool().
+ */
+ struct rcu_head rcu;
} ____cacheline_aligned_in_smp;
/*
@@ -164,75 +193,107 @@ struct pool_workqueue {
struct workqueue_struct *wq; /* I: the owning workqueue */
int work_color; /* L: current color */
int flush_color; /* L: flushing color */
+ int refcnt; /* L: reference count */
int nr_in_flight[WORK_NR_COLORS];
/* L: nr of in_flight works */
int nr_active; /* L: nr of active works */
int max_active; /* L: max active works */
struct list_head delayed_works; /* L: delayed works */
-};
+ struct list_head pwqs_node; /* WR: node on wq->pwqs */
+ struct list_head mayday_node; /* MD: node on wq->maydays */
+
+ /*
+ * Release of unbound pwq is punted to system_wq. See put_pwq()
+ * and pwq_unbound_release_workfn() for details. pool_workqueue
+ * itself is also sched-RCU protected so that the first pwq can be
+ * determined without grabbing wq->mutex.
+ */
+ struct work_struct unbound_release_work;
+ struct rcu_head rcu;
+} __aligned(1 << WORK_STRUCT_FLAG_BITS);
/*
* Structure used to wait for workqueue flush.
*/
struct wq_flusher {
- struct list_head list; /* F: list of flushers */
- int flush_color; /* F: flush color waiting for */
+ struct list_head list; /* WQ: list of flushers */
+ int flush_color; /* WQ: flush color waiting for */
struct completion done; /* flush completion */
};
-/*
- * All cpumasks are assumed to be always set on UP and thus can't be
- * used to determine whether there's something to be done.
- */
-#ifdef CONFIG_SMP
-typedef cpumask_var_t mayday_mask_t;
-#define mayday_test_and_set_cpu(cpu, mask) \
- cpumask_test_and_set_cpu((cpu), (mask))
-#define mayday_clear_cpu(cpu, mask) cpumask_clear_cpu((cpu), (mask))
-#define for_each_mayday_cpu(cpu, mask) for_each_cpu((cpu), (mask))
-#define alloc_mayday_mask(maskp, gfp) zalloc_cpumask_var((maskp), (gfp))
-#define free_mayday_mask(mask) free_cpumask_var((mask))
-#else
-typedef unsigned long mayday_mask_t;
-#define mayday_test_and_set_cpu(cpu, mask) test_and_set_bit(0, &(mask))
-#define mayday_clear_cpu(cpu, mask) clear_bit(0, &(mask))
-#define for_each_mayday_cpu(cpu, mask) if ((cpu) = 0, (mask))
-#define alloc_mayday_mask(maskp, gfp) true
-#define free_mayday_mask(mask) do { } while (0)
-#endif
+struct wq_device;
/*
- * The externally visible workqueue abstraction is an array of
- * per-CPU workqueues:
+ * The externally visible workqueue. It relays the issued work items to
+ * the appropriate worker_pool through its pool_workqueues.
*/
struct workqueue_struct {
- unsigned int flags; /* W: WQ_* flags */
- union {
- struct pool_workqueue __percpu *pcpu;
- struct pool_workqueue *single;
- unsigned long v;
- } pool_wq; /* I: pwq's */
- struct list_head list; /* W: list of all workqueues */
-
- struct mutex flush_mutex; /* protects wq flushing */
- int work_color; /* F: current work color */
- int flush_color; /* F: current flush color */
+ struct list_head pwqs; /* WR: all pwqs of this wq */
+ struct list_head list; /* PL: list of all workqueues */
+
+ struct mutex mutex; /* protects this wq */
+ int work_color; /* WQ: current work color */
+ int flush_color; /* WQ: current flush color */
atomic_t nr_pwqs_to_flush; /* flush in progress */
- struct wq_flusher *first_flusher; /* F: first flusher */
- struct list_head flusher_queue; /* F: flush waiters */
- struct list_head flusher_overflow; /* F: flush overflow list */
+ struct wq_flusher *first_flusher; /* WQ: first flusher */
+ struct list_head flusher_queue; /* WQ: flush waiters */
+ struct list_head flusher_overflow; /* WQ: flush overflow list */
- mayday_mask_t mayday_mask; /* cpus requesting rescue */
+ struct list_head maydays; /* MD: pwqs requesting rescue */
struct worker *rescuer; /* I: rescue worker */
- int nr_drainers; /* W: drain in progress */
- int saved_max_active; /* W: saved pwq max_active */
+ int nr_drainers; /* WQ: drain in progress */
+ int saved_max_active; /* WQ: saved pwq max_active */
+
+ struct workqueue_attrs *unbound_attrs; /* WQ: only for unbound wqs */
+ struct pool_workqueue *dfl_pwq; /* WQ: only for unbound wqs */
+
+#ifdef CONFIG_SYSFS
+ struct wq_device *wq_dev; /* I: for sysfs interface */
+#endif
#ifdef CONFIG_LOCKDEP
struct lockdep_map lockdep_map;
#endif
- char name[]; /* I: workqueue name */
+ char name[WQ_NAME_LEN]; /* I: workqueue name */
+
+ /* hot fields used during command issue, aligned to cacheline */
+ unsigned int flags ____cacheline_aligned; /* WQ: WQ_* flags */
+ struct pool_workqueue __percpu *cpu_pwqs; /* I: per-cpu pwqs */
+ struct pool_workqueue __rcu *numa_pwq_tbl[]; /* FR: unbound pwqs indexed by node */
};
+static struct kmem_cache *pwq_cache;
+
+static int wq_numa_tbl_len; /* highest possible NUMA node id + 1 */
+static cpumask_var_t *wq_numa_possible_cpumask;
+ /* possible CPUs of each node */
+
+static bool wq_disable_numa;
+module_param_named(disable_numa, wq_disable_numa, bool, 0444);
+
+static bool wq_numa_enabled; /* unbound NUMA affinity enabled */
+
+/* buf for wq_update_unbound_numa_attrs(), protected by CPU hotplug exclusion */
+static struct workqueue_attrs *wq_update_unbound_numa_attrs_buf;
+
+static DEFINE_MUTEX(wq_pool_mutex); /* protects pools and workqueues list */
+static DEFINE_SPINLOCK(wq_mayday_lock); /* protects wq->maydays list */
+
+static LIST_HEAD(workqueues); /* PL: list of all workqueues */
+static bool workqueue_freezing; /* PL: have wqs started freezing? */
+
+/* the per-cpu worker pools */
+static DEFINE_PER_CPU_SHARED_ALIGNED(struct worker_pool [NR_STD_WORKER_POOLS],
+ cpu_worker_pools);
+
+static DEFINE_IDR(worker_pool_idr); /* PR: idr of all pools */
+
+/* PL: hash of all unbound pools keyed by pool->attrs */
+static DEFINE_HASHTABLE(unbound_pool_hash, UNBOUND_POOL_HASH_ORDER);
+
+/* I: attributes used when instantiating standard unbound pools on demand */
+static struct workqueue_attrs *unbound_std_wq_attrs[NR_STD_WORKER_POOLS];
+
struct workqueue_struct *system_wq __read_mostly;
EXPORT_SYMBOL_GPL(system_wq);
struct workqueue_struct *system_highpri_wq __read_mostly;
@@ -244,64 +305,87 @@ EXPORT_SYMBOL_GPL(system_unbound_wq);
struct workqueue_struct *system_freezable_wq __read_mostly;
EXPORT_SYMBOL_GPL(system_freezable_wq);
+static int worker_thread(void *__worker);
+static void copy_workqueue_attrs(struct workqueue_attrs *to,
+ const struct workqueue_attrs *from);
+
#define CREATE_TRACE_POINTS
#include <trace/events/workqueue.h>
-#define for_each_std_worker_pool(pool, cpu) \
- for ((pool) = &std_worker_pools(cpu)[0]; \
- (pool) < &std_worker_pools(cpu)[NR_STD_WORKER_POOLS]; (pool)++)
+#define assert_rcu_or_pool_mutex() \
+ rcu_lockdep_assert(rcu_read_lock_sched_held() || \
+ lockdep_is_held(&wq_pool_mutex), \
+ "sched RCU or wq_pool_mutex should be held")
-#define for_each_busy_worker(worker, i, pool) \
- hash_for_each(pool->busy_hash, i, worker, hentry)
+#define assert_rcu_or_wq_mutex(wq) \
+ rcu_lockdep_assert(rcu_read_lock_sched_held() || \
+ lockdep_is_held(&wq->mutex), \
+ "sched RCU or wq->mutex should be held")
-static inline int __next_wq_cpu(int cpu, const struct cpumask *mask,
- unsigned int sw)
-{
- if (cpu < nr_cpu_ids) {
- if (sw & 1) {
- cpu = cpumask_next(cpu, mask);
- if (cpu < nr_cpu_ids)
- return cpu;
- }
- if (sw & 2)
- return WORK_CPU_UNBOUND;
- }
- return WORK_CPU_END;
-}
+#ifdef CONFIG_LOCKDEP
+#define assert_manager_or_pool_lock(pool) \
+ WARN_ONCE(debug_locks && \
+ !lockdep_is_held(&(pool)->manager_mutex) && \
+ !lockdep_is_held(&(pool)->lock), \
+ "pool->manager_mutex or ->lock should be held")
+#else
+#define assert_manager_or_pool_lock(pool) do { } while (0)
+#endif
-static inline int __next_pwq_cpu(int cpu, const struct cpumask *mask,
- struct workqueue_struct *wq)
-{
- return __next_wq_cpu(cpu, mask, !(wq->flags & WQ_UNBOUND) ? 1 : 2);
-}
+#define for_each_cpu_worker_pool(pool, cpu) \
+ for ((pool) = &per_cpu(cpu_worker_pools, cpu)[0]; \
+ (pool) < &per_cpu(cpu_worker_pools, cpu)[NR_STD_WORKER_POOLS]; \
+ (pool)++)
-/*
- * CPU iterators
+/**
+ * for_each_pool - iterate through all worker_pools in the system
+ * @pool: iteration cursor
+ * @pi: integer used for iteration
*
- * An extra cpu number is defined using an invalid cpu number
- * (WORK_CPU_UNBOUND) to host workqueues which are not bound to any
- * specific CPU. The following iterators are similar to for_each_*_cpu()
- * iterators but also considers the unbound CPU.
+ * This must be called either with wq_pool_mutex held or sched RCU read
+ * locked. If the pool needs to be used beyond the locking in effect, the
+ * caller is responsible for guaranteeing that the pool stays online.
*
- * for_each_wq_cpu() : possible CPUs + WORK_CPU_UNBOUND
- * for_each_online_wq_cpu() : online CPUs + WORK_CPU_UNBOUND
- * for_each_pwq_cpu() : possible CPUs for bound workqueues,
- * WORK_CPU_UNBOUND for unbound workqueues
+ * The if/else clause exists only for the lockdep assertion and can be
+ * ignored.
*/
-#define for_each_wq_cpu(cpu) \
- for ((cpu) = __next_wq_cpu(-1, cpu_possible_mask, 3); \
- (cpu) < WORK_CPU_END; \
- (cpu) = __next_wq_cpu((cpu), cpu_possible_mask, 3))
+#define for_each_pool(pool, pi) \
+ idr_for_each_entry(&worker_pool_idr, pool, pi) \
+ if (({ assert_rcu_or_pool_mutex(); false; })) { } \
+ else
-#define for_each_online_wq_cpu(cpu) \
- for ((cpu) = __next_wq_cpu(-1, cpu_online_mask, 3); \
- (cpu) < WORK_CPU_END; \
- (cpu) = __next_wq_cpu((cpu), cpu_online_mask, 3))
+/**
+ * for_each_pool_worker - iterate through all workers of a worker_pool
+ * @worker: iteration cursor
+ * @wi: integer used for iteration
+ * @pool: worker_pool to iterate workers of
+ *
+ * This must be called with either @pool->manager_mutex or ->lock held.
+ *
+ * The if/else clause exists only for the lockdep assertion and can be
+ * ignored.
+ */
+#define for_each_pool_worker(worker, wi, pool) \
+ idr_for_each_entry(&(pool)->worker_idr, (worker), (wi)) \
+ if (({ assert_manager_or_pool_lock((pool)); false; })) { } \
+ else
-#define for_each_pwq_cpu(cpu, wq) \
- for ((cpu) = __next_pwq_cpu(-1, cpu_possible_mask, (wq)); \
- (cpu) < WORK_CPU_END; \
- (cpu) = __next_pwq_cpu((cpu), cpu_possible_mask, (wq)))
+/**
+ * for_each_pwq - iterate through all pool_workqueues of the specified workqueue
+ * @pwq: iteration cursor
+ * @wq: the target workqueue
+ *
+ * This must be called either with wq->mutex held or sched RCU read locked.
+ * If the pwq needs to be used beyond the locking in effect, the caller is
+ * responsible for guaranteeing that the pwq stays online.
+ *
+ * The if/else clause exists only for the lockdep assertion and can be
+ * ignored.
+ */
+#define for_each_pwq(pwq, wq) \
+ list_for_each_entry_rcu((pwq), &(wq)->pwqs, pwqs_node) \
+ if (({ assert_rcu_or_wq_mutex(wq); false; })) { } \
+ else
#ifdef CONFIG_DEBUG_OBJECTS_WORK
@@ -419,76 +503,35 @@ static inline void debug_work_activate(struct work_struct *work) { }
static inline void debug_work_deactivate(struct work_struct *work) { }
#endif
-/* Serializes the accesses to the list of workqueues. */
-static DEFINE_SPINLOCK(workqueue_lock);
-static LIST_HEAD(workqueues);
-static bool workqueue_freezing; /* W: have wqs started freezing? */
-
-/*
- * The CPU and unbound standard worker pools. The unbound ones have
- * POOL_DISASSOCIATED set, and their workers have WORKER_UNBOUND set.
- */
-static DEFINE_PER_CPU_SHARED_ALIGNED(struct worker_pool [NR_STD_WORKER_POOLS],
- cpu_std_worker_pools);
-static struct worker_pool unbound_std_worker_pools[NR_STD_WORKER_POOLS];
-
-/* idr of all pools */
-static DEFINE_MUTEX(worker_pool_idr_mutex);
-static DEFINE_IDR(worker_pool_idr);
-
-static int worker_thread(void *__worker);
-
-static struct worker_pool *std_worker_pools(int cpu)
-{
- if (cpu != WORK_CPU_UNBOUND)
- return per_cpu(cpu_std_worker_pools, cpu);
- else
- return unbound_std_worker_pools;
-}
-
-static int std_worker_pool_pri(struct worker_pool *pool)
-{
- return pool - std_worker_pools(pool->cpu);
-}
-
/* allocate ID and assign it to @pool */
static int worker_pool_assign_id(struct worker_pool *pool)
{
int ret;
- mutex_lock(&worker_pool_idr_mutex);
- idr_pre_get(&worker_pool_idr, GFP_KERNEL);
- ret = idr_get_new(&worker_pool_idr, pool, &pool->id);
- mutex_unlock(&worker_pool_idr_mutex);
+ lockdep_assert_held(&wq_pool_mutex);
+ ret = idr_alloc(&worker_pool_idr, pool, 0, 0, GFP_KERNEL);
+ if (ret >= 0) {
+ pool->id = ret;
+ return 0;
+ }
return ret;
}
-/*
- * Lookup worker_pool by id. The idr currently is built during boot and
- * never modified. Don't worry about locking for now.
+/**
+ * unbound_pwq_by_node - return the unbound pool_workqueue for the given node
+ * @wq: the target workqueue
+ * @node: the node ID
+ *
+ * This must be called either with pwq_lock held or sched RCU read locked.
+ * If the pwq needs to be used beyond the locking in effect, the caller is
+ * responsible for guaranteeing that the pwq stays online.
*/
-static struct worker_pool *worker_pool_by_id(int pool_id)
-{
- return idr_find(&worker_pool_idr, pool_id);
-}
-
-static struct worker_pool *get_std_worker_pool(int cpu, bool highpri)
+static struct pool_workqueue *unbound_pwq_by_node(struct workqueue_struct *wq,
+ int node)
{
- struct worker_pool *pools = std_worker_pools(cpu);
-
- return &pools[highpri];
-}
-
-static struct pool_workqueue *get_pwq(unsigned int cpu,
- struct workqueue_struct *wq)
-{
- if (!(wq->flags & WQ_UNBOUND)) {
- if (likely(cpu < nr_cpu_ids))
- return per_cpu_ptr(wq->pool_wq.pcpu, cpu);
- } else if (likely(cpu == WORK_CPU_UNBOUND))
- return wq->pool_wq.single;
- return NULL;
+ assert_rcu_or_wq_mutex(wq);
+ return rcu_dereference_raw(wq->numa_pwq_tbl[node]);
}
static unsigned int work_color_to_flags(int color)
@@ -530,7 +573,7 @@ static int work_next_color(int color)
static inline void set_work_data(struct work_struct *work, unsigned long data,
unsigned long flags)
{
- BUG_ON(!work_pending(work));
+ WARN_ON_ONCE(!work_pending(work));
atomic_long_set(&work->data, data | flags | work_static(work));
}
@@ -582,13 +625,23 @@ static struct pool_workqueue *get_work_pwq(struct work_struct *work)
* @work: the work item of interest
*
* Return the worker_pool @work was last associated with. %NULL if none.
+ *
+ * Pools are created and destroyed under wq_pool_mutex, and allows read
+ * access under sched-RCU read lock. As such, this function should be
+ * called under wq_pool_mutex or with preemption disabled.
+ *
+ * All fields of the returned pool are accessible as long as the above
+ * mentioned locking is in effect. If the returned pool needs to be used
+ * beyond the critical section, the caller is responsible for ensuring the
+ * returned pool is and stays online.
*/
static struct worker_pool *get_work_pool(struct work_struct *work)
{
unsigned long data = atomic_long_read(&work->data);
- struct worker_pool *pool;
int pool_id;
+ assert_rcu_or_pool_mutex();
+
if (data & WORK_STRUCT_PWQ)
return ((struct pool_workqueue *)
(data & WORK_STRUCT_WQ_DATA_MASK))->pool;
@@ -597,9 +650,7 @@ static struct worker_pool *get_work_pool(struct work_struct *work)
if (pool_id == WORK_OFFQ_POOL_NONE)
return NULL;
- pool = worker_pool_by_id(pool_id);
- WARN_ON_ONCE(!pool);
- return pool;
+ return idr_find(&worker_pool_idr, pool_id);
}
/**
@@ -688,7 +739,7 @@ static bool need_to_manage_workers(struct worker_pool *pool)
/* Do we have too many workers and should some go away? */
static bool too_many_workers(struct worker_pool *pool)
{
- bool managing = pool->flags & POOL_MANAGING_WORKERS;
+ bool managing = mutex_is_locked(&pool->manager_arb);
int nr_idle = pool->nr_idle + managing; /* manager is considered idle */
int nr_busy = pool->nr_workers - nr_idle;
@@ -743,7 +794,7 @@ static void wake_up_worker(struct worker_pool *pool)
* CONTEXT:
* spin_lock_irq(rq->lock)
*/
-void wq_worker_waking_up(struct task_struct *task, unsigned int cpu)
+void wq_worker_waking_up(struct task_struct *task, int cpu)
{
struct worker *worker = kthread_data(task);
@@ -768,8 +819,7 @@ void wq_worker_waking_up(struct task_struct *task, unsigned int cpu)
* RETURNS:
* Worker task on @cpu to wake up, %NULL if none.
*/
-struct task_struct *wq_worker_sleeping(struct task_struct *task,
- unsigned int cpu)
+struct task_struct *wq_worker_sleeping(struct task_struct *task, int cpu)
{
struct worker *worker = kthread_data(task), *to_wakeup = NULL;
struct worker_pool *pool;
@@ -785,7 +835,8 @@ struct task_struct *wq_worker_sleeping(struct task_struct *task,
pool = worker->pool;
/* this can only happen on the local cpu */
- BUG_ON(cpu != raw_smp_processor_id());
+ if (WARN_ON_ONCE(cpu != raw_smp_processor_id()))
+ return NULL;
/*
* The counterpart of the following dec_and_test, implied mb,
@@ -890,13 +941,12 @@ static inline void worker_clr_flags(struct worker *worker, unsigned int flags)
* recycled work item as currently executing and make it wait until the
* current execution finishes, introducing an unwanted dependency.
*
- * This function checks the work item address, work function and workqueue
- * to avoid false positives. Note that this isn't complete as one may
- * construct a work function which can introduce dependency onto itself
- * through a recycled work item. Well, if somebody wants to shoot oneself
- * in the foot that badly, there's only so much we can do, and if such
- * deadlock actually occurs, it should be easy to locate the culprit work
- * function.
+ * This function checks the work item address and work function to avoid
+ * false positives. Note that this isn't complete as one may construct a
+ * work function which can introduce dependency onto itself through a
+ * recycled work item. Well, if somebody wants to shoot oneself in the
+ * foot that badly, there's only so much we can do, and if such deadlock
+ * actually occurs, it should be easy to locate the culprit work function.
*
* CONTEXT:
* spin_lock_irq(pool->lock).
@@ -960,6 +1010,64 @@ static void move_linked_works(struct work_struct *work, struct list_head *head,
*nextp = n;
}
+/**
+ * get_pwq - get an extra reference on the specified pool_workqueue
+ * @pwq: pool_workqueue to get
+ *
+ * Obtain an extra reference on @pwq. The caller should guarantee that
+ * @pwq has positive refcnt and be holding the matching pool->lock.
+ */
+static void get_pwq(struct pool_workqueue *pwq)
+{
+ lockdep_assert_held(&pwq->pool->lock);
+ WARN_ON_ONCE(pwq->refcnt <= 0);
+ pwq->refcnt++;
+}
+
+/**
+ * put_pwq - put a pool_workqueue reference
+ * @pwq: pool_workqueue to put
+ *
+ * Drop a reference of @pwq. If its refcnt reaches zero, schedule its
+ * destruction. The caller should be holding the matching pool->lock.
+ */
+static void put_pwq(struct pool_workqueue *pwq)
+{
+ lockdep_assert_held(&pwq->pool->lock);
+ if (likely(--pwq->refcnt))
+ return;
+ if (WARN_ON_ONCE(!(pwq->wq->flags & WQ_UNBOUND)))
+ return;
+ /*
+ * @pwq can't be released under pool->lock, bounce to
+ * pwq_unbound_release_workfn(). This never recurses on the same
+ * pool->lock as this path is taken only for unbound workqueues and
+ * the release work item is scheduled on a per-cpu workqueue. To
+ * avoid lockdep warning, unbound pool->locks are given lockdep
+ * subclass of 1 in get_unbound_pool().
+ */
+ schedule_work(&pwq->unbound_release_work);
+}
+
+/**
+ * put_pwq_unlocked - put_pwq() with surrounding pool lock/unlock
+ * @pwq: pool_workqueue to put (can be %NULL)
+ *
+ * put_pwq() with locking. This function also allows %NULL @pwq.
+ */
+static void put_pwq_unlocked(struct pool_workqueue *pwq)
+{
+ if (pwq) {
+ /*
+ * As both pwqs and pools are sched-RCU protected, the
+ * following lock operations are safe.
+ */
+ spin_lock_irq(&pwq->pool->lock);
+ put_pwq(pwq);
+ spin_unlock_irq(&pwq->pool->lock);
+ }
+}
+
static void pwq_activate_delayed_work(struct work_struct *work)
{
struct pool_workqueue *pwq = get_work_pwq(work);
@@ -991,9 +1099,9 @@ static void pwq_activate_first_delayed(struct pool_workqueue *pwq)
*/
static void pwq_dec_nr_in_flight(struct pool_workqueue *pwq, int color)
{
- /* ignore uncolored works */
+ /* uncolored work items don't participate in flushing or nr_active */
if (color == WORK_NO_COLOR)
- return;
+ goto out_put;
pwq->nr_in_flight[color]--;
@@ -1006,11 +1114,11 @@ static void pwq_dec_nr_in_flight(struct pool_workqueue *pwq, int color)
/* is flush in progress and are we at the flushing tip? */
if (likely(pwq->flush_color != color))
- return;
+ goto out_put;
/* are there still in-flight works? */
if (pwq->nr_in_flight[color])
- return;
+ goto out_put;
/* this pwq is done, clear flush_color */
pwq->flush_color = -1;
@@ -1021,6 +1129,8 @@ static void pwq_dec_nr_in_flight(struct pool_workqueue *pwq, int color)
*/
if (atomic_dec_and_test(&pwq->wq->nr_pwqs_to_flush))
complete(&pwq->wq->first_flusher->done);
+out_put:
+ put_pwq(pwq);
}
/**
@@ -1143,11 +1253,12 @@ static void insert_work(struct pool_workqueue *pwq, struct work_struct *work,
/* we own @work, set data and link */
set_work_pwq(work, pwq, extra_flags);
list_add_tail(&work->entry, head);
+ get_pwq(pwq);
/*
- * Ensure either worker_sched_deactivated() sees the above
- * list_add_tail() or we see zero nr_running to avoid workers
- * lying around lazily while there are works to be processed.
+ * Ensure either wq_worker_sleeping() sees the above
+ * list_add_tail() or we see zero nr_running to avoid workers lying
+ * around lazily while there are works to be processed.
*/
smp_mb();
@@ -1171,10 +1282,11 @@ static bool is_chained_work(struct workqueue_struct *wq)
return worker && worker->current_pwq->wq == wq;
}
-static void __queue_work(unsigned int cpu, struct workqueue_struct *wq,
+static void __queue_work(int cpu, struct workqueue_struct *wq,
struct work_struct *work)
{
struct pool_workqueue *pwq;
+ struct worker_pool *last_pool;
struct list_head *worklist;
unsigned int work_flags;
unsigned int req_cpu = cpu;
@@ -1190,48 +1302,62 @@ static void __queue_work(unsigned int cpu, struct workqueue_struct *wq,
debug_work_activate(work);
/* if dying, only works from the same workqueue are allowed */
- if (unlikely(wq->flags & WQ_DRAINING) &&
+ if (unlikely(wq->flags & __WQ_DRAINING) &&
WARN_ON_ONCE(!is_chained_work(wq)))
return;
+retry:
+ if (req_cpu == WORK_CPU_UNBOUND)
+ cpu = raw_smp_processor_id();
- /* determine the pwq to use */
- if (!(wq->flags & WQ_UNBOUND)) {
- struct worker_pool *last_pool;
-
- if (cpu == WORK_CPU_UNBOUND)
- cpu = raw_smp_processor_id();
-
- /*
- * It's multi cpu. If @work was previously on a different
- * cpu, it might still be running there, in which case the
- * work needs to be queued on that cpu to guarantee
- * non-reentrancy.
- */
- pwq = get_pwq(cpu, wq);
- last_pool = get_work_pool(work);
+ /* pwq which will be used unless @work is executing elsewhere */
+ if (!(wq->flags & WQ_UNBOUND))
+ pwq = per_cpu_ptr(wq->cpu_pwqs, cpu);
+ else
+ pwq = unbound_pwq_by_node(wq, cpu_to_node(cpu));
- if (last_pool && last_pool != pwq->pool) {
- struct worker *worker;
+ /*
+ * If @work was previously on a different pool, it might still be
+ * running there, in which case the work needs to be queued on that
+ * pool to guarantee non-reentrancy.
+ */
+ last_pool = get_work_pool(work);
+ if (last_pool && last_pool != pwq->pool) {
+ struct worker *worker;
- spin_lock(&last_pool->lock);
+ spin_lock(&last_pool->lock);
- worker = find_worker_executing_work(last_pool, work);
+ worker = find_worker_executing_work(last_pool, work);
- if (worker && worker->current_pwq->wq == wq) {
- pwq = get_pwq(last_pool->cpu, wq);
- } else {
- /* meh... not running there, queue here */
- spin_unlock(&last_pool->lock);
- spin_lock(&pwq->pool->lock);
- }
+ if (worker && worker->current_pwq->wq == wq) {
+ pwq = worker->current_pwq;
} else {
+ /* meh... not running there, queue here */
+ spin_unlock(&last_pool->lock);
spin_lock(&pwq->pool->lock);
}
} else {
- pwq = get_pwq(WORK_CPU_UNBOUND, wq);
spin_lock(&pwq->pool->lock);
}
+ /*
+ * pwq is determined and locked. For unbound pools, we could have
+ * raced with pwq release and it could already be dead. If its
+ * refcnt is zero, repeat pwq selection. Note that pwqs never die
+ * without another pwq replacing it in the numa_pwq_tbl or while
+ * work items are executing on it, so the retrying is guaranteed to
+ * make forward-progress.
+ */
+ if (unlikely(!pwq->refcnt)) {
+ if (wq->flags & WQ_UNBOUND) {
+ spin_unlock(&pwq->pool->lock);
+ cpu_relax();
+ goto retry;
+ }
+ /* oops */
+ WARN_ONCE(true, "workqueue: per-cpu pwq for %s on cpu%d has 0 refcnt",
+ wq->name, cpu);
+ }
+
/* pwq determined, queue */
trace_workqueue_queue_work(req_cpu, pwq, work);
@@ -1286,22 +1412,6 @@ bool queue_work_on(int cpu, struct workqueue_struct *wq,
}
EXPORT_SYMBOL_GPL(queue_work_on);
-/**
- * queue_work - queue work on a workqueue
- * @wq: workqueue to use
- * @work: work to queue
- *
- * Returns %false if @work was already on a queue, %true otherwise.
- *
- * We queue the work to the CPU on which it was submitted, but if the CPU dies
- * it can be processed by another CPU.
- */
-bool queue_work(struct workqueue_struct *wq, struct work_struct *work)
-{
- return queue_work_on(WORK_CPU_UNBOUND, wq, work);
-}
-EXPORT_SYMBOL_GPL(queue_work);
-
void delayed_work_timer_fn(unsigned long __data)
{
struct delayed_work *dwork = (struct delayed_work *)__data;
@@ -1377,21 +1487,6 @@ bool queue_delayed_work_on(int cpu, struct workqueue_struct *wq,
EXPORT_SYMBOL_GPL(queue_delayed_work_on);
/**
- * queue_delayed_work - queue work on a workqueue after delay
- * @wq: workqueue to use
- * @dwork: delayable work to queue
- * @delay: number of jiffies to wait before queueing
- *
- * Equivalent to queue_delayed_work_on() but tries to use the local CPU.
- */
-bool queue_delayed_work(struct workqueue_struct *wq,
- struct delayed_work *dwork, unsigned long delay)
-{
- return queue_delayed_work_on(WORK_CPU_UNBOUND, wq, dwork, delay);
-}
-EXPORT_SYMBOL_GPL(queue_delayed_work);
-
-/**
* mod_delayed_work_on - modify delay of or queue a delayed work on specific CPU
* @cpu: CPU number to execute work on
* @wq: workqueue to use
@@ -1430,21 +1525,6 @@ bool mod_delayed_work_on(int cpu, struct workqueue_struct *wq,
EXPORT_SYMBOL_GPL(mod_delayed_work_on);
/**
- * mod_delayed_work - modify delay of or queue a delayed work
- * @wq: workqueue to use
- * @dwork: work to queue
- * @delay: number of jiffies to wait before queueing
- *
- * mod_delayed_work_on() on local CPU.
- */
-bool mod_delayed_work(struct workqueue_struct *wq, struct delayed_work *dwork,
- unsigned long delay)
-{
- return mod_delayed_work_on(WORK_CPU_UNBOUND, wq, dwork, delay);
-}
-EXPORT_SYMBOL_GPL(mod_delayed_work);
-
-/**
* worker_enter_idle - enter idle state
* @worker: worker which is entering idle state
*
@@ -1458,9 +1538,10 @@ static void worker_enter_idle(struct worker *worker)
{
struct worker_pool *pool = worker->pool;
- BUG_ON(worker->flags & WORKER_IDLE);
- BUG_ON(!list_empty(&worker->entry) &&
- (worker->hentry.next || worker->hentry.pprev));
+ if (WARN_ON_ONCE(worker->flags & WORKER_IDLE) ||
+ WARN_ON_ONCE(!list_empty(&worker->entry) &&
+ (worker->hentry.next || worker->hentry.pprev)))
+ return;
/* can't use worker_set_flags(), also called from start_worker() */
worker->flags |= WORKER_IDLE;
@@ -1497,22 +1578,25 @@ static void worker_leave_idle(struct worker *worker)
{
struct worker_pool *pool = worker->pool;
- BUG_ON(!(worker->flags & WORKER_IDLE));
+ if (WARN_ON_ONCE(!(worker->flags & WORKER_IDLE)))
+ return;
worker_clr_flags(worker, WORKER_IDLE);
pool->nr_idle--;
list_del_init(&worker->entry);
}
/**
- * worker_maybe_bind_and_lock - bind worker to its cpu if possible and lock pool
- * @worker: self
+ * worker_maybe_bind_and_lock - try to bind %current to worker_pool and lock it
+ * @pool: target worker_pool
+ *
+ * Bind %current to the cpu of @pool if it is associated and lock @pool.
*
* Works which are scheduled while the cpu is online must at least be
* scheduled to a worker which is bound to the cpu so that if they are
* flushed from cpu callbacks while cpu is going down, they are
* guaranteed to execute on the cpu.
*
- * This function is to be used by rogue workers and rescuers to bind
+ * This function is to be used by unbound workers and rescuers to bind
* themselves to the target cpu and may race with cpu going down or
* coming online. kthread_bind() can't be used because it may put the
* worker to already dead cpu and set_cpus_allowed_ptr() can't be used
@@ -1533,12 +1617,9 @@ static void worker_leave_idle(struct worker *worker)
* %true if the associated pool is online (@worker is successfully
* bound), %false if offline.
*/
-static bool worker_maybe_bind_and_lock(struct worker *worker)
+static bool worker_maybe_bind_and_lock(struct worker_pool *pool)
__acquires(&pool->lock)
{
- struct worker_pool *pool = worker->pool;
- struct task_struct *task = worker->task;
-
while (true) {
/*
* The following call may fail, succeed or succeed
@@ -1547,14 +1628,13 @@ __acquires(&pool->lock)
* against POOL_DISASSOCIATED.
*/
if (!(pool->flags & POOL_DISASSOCIATED))
- set_cpus_allowed_ptr(task, get_cpu_mask(pool->cpu));
+ set_cpus_allowed_ptr(current, pool->attrs->cpumask);
spin_lock_irq(&pool->lock);
if (pool->flags & POOL_DISASSOCIATED)
return false;
- if (task_cpu(task) == pool->cpu &&
- cpumask_equal(&current->cpus_allowed,
- get_cpu_mask(pool->cpu)))
+ if (task_cpu(current) == pool->cpu &&
+ cpumask_equal(&current->cpus_allowed, pool->attrs->cpumask))
return true;
spin_unlock_irq(&pool->lock);
@@ -1569,108 +1649,6 @@ __acquires(&pool->lock)
}
}
-/*
- * Rebind an idle @worker to its CPU. worker_thread() will test
- * list_empty(@worker->entry) before leaving idle and call this function.
- */
-static void idle_worker_rebind(struct worker *worker)
-{
- /* CPU may go down again inbetween, clear UNBOUND only on success */
- if (worker_maybe_bind_and_lock(worker))
- worker_clr_flags(worker, WORKER_UNBOUND);
-
- /* rebind complete, become available again */
- list_add(&worker->entry, &worker->pool->idle_list);
- spin_unlock_irq(&worker->pool->lock);
-}
-
-/*
- * Function for @worker->rebind.work used to rebind unbound busy workers to
- * the associated cpu which is coming back online. This is scheduled by
- * cpu up but can race with other cpu hotplug operations and may be
- * executed twice without intervening cpu down.
- */
-static void busy_worker_rebind_fn(struct work_struct *work)
-{
- struct worker *worker = container_of(work, struct worker, rebind_work);
-
- if (worker_maybe_bind_and_lock(worker))
- worker_clr_flags(worker, WORKER_UNBOUND);
-
- spin_unlock_irq(&worker->pool->lock);
-}
-
-/**
- * rebind_workers - rebind all workers of a pool to the associated CPU
- * @pool: pool of interest
- *
- * @pool->cpu is coming online. Rebind all workers to the CPU. Rebinding
- * is different for idle and busy ones.
- *
- * Idle ones will be removed from the idle_list and woken up. They will
- * add themselves back after completing rebind. This ensures that the
- * idle_list doesn't contain any unbound workers when re-bound busy workers
- * try to perform local wake-ups for concurrency management.
- *
- * Busy workers can rebind after they finish their current work items.
- * Queueing the rebind work item at the head of the scheduled list is
- * enough. Note that nr_running will be properly bumped as busy workers
- * rebind.
- *
- * On return, all non-manager workers are scheduled for rebind - see
- * manage_workers() for the manager special case. Any idle worker
- * including the manager will not appear on @idle_list until rebind is
- * complete, making local wake-ups safe.
- */
-static void rebind_workers(struct worker_pool *pool)
-{
- struct worker *worker, *n;
- int i;
-
- lockdep_assert_held(&pool->assoc_mutex);
- lockdep_assert_held(&pool->lock);
-
- /* dequeue and kick idle ones */
- list_for_each_entry_safe(worker, n, &pool->idle_list, entry) {
- /*
- * idle workers should be off @pool->idle_list until rebind
- * is complete to avoid receiving premature local wake-ups.
- */
- list_del_init(&worker->entry);
-
- /*
- * worker_thread() will see the above dequeuing and call
- * idle_worker_rebind().
- */
- wake_up_process(worker->task);
- }
-
- /* rebind busy workers */
- for_each_busy_worker(worker, i, pool) {
- struct work_struct *rebind_work = &worker->rebind_work;
- struct workqueue_struct *wq;
-
- if (test_and_set_bit(WORK_STRUCT_PENDING_BIT,
- work_data_bits(rebind_work)))
- continue;
-
- debug_work_activate(rebind_work);
-
- /*
- * wq doesn't really matter but let's keep @worker->pool
- * and @pwq->pool consistent for sanity.
- */
- if (std_worker_pool_pri(worker->pool))
- wq = system_highpri_wq;
- else
- wq = system_wq;
-
- insert_work(get_pwq(pool->cpu, wq), rebind_work,
- worker->scheduled.next,
- work_color_to_flags(WORK_NO_COLOR));
- }
-}
-
static struct worker *alloc_worker(void)
{
struct worker *worker;
@@ -1679,7 +1657,6 @@ static struct worker *alloc_worker(void)
if (worker) {
INIT_LIST_HEAD(&worker->entry);
INIT_LIST_HEAD(&worker->scheduled);
- INIT_WORK(&worker->rebind_work, busy_worker_rebind_fn);
/* on creation a worker is in !idle && prep state */
worker->flags = WORKER_PREP;
}
@@ -1702,18 +1679,25 @@ static struct worker *alloc_worker(void)
*/
static struct worker *create_worker(struct worker_pool *pool)
{
- const char *pri = std_worker_pool_pri(pool) ? "H" : "";
struct worker *worker = NULL;
int id = -1;
+ char id_buf[16];
+ lockdep_assert_held(&pool->manager_mutex);
+
+ /*
+ * ID is needed to determine kthread name. Allocate ID first
+ * without installing the pointer.
+ */
+ idr_preload(GFP_KERNEL);
spin_lock_irq(&pool->lock);
- while (ida_get_new(&pool->worker_ida, &id)) {
- spin_unlock_irq(&pool->lock);
- if (!ida_pre_get(&pool->worker_ida, GFP_KERNEL))
- goto fail;
- spin_lock_irq(&pool->lock);
- }
+
+ id = idr_alloc(&pool->worker_idr, NULL, 0, 0, GFP_NOWAIT);
+
spin_unlock_irq(&pool->lock);
+ idr_preload_end();
+ if (id < 0)
+ goto fail;
worker = alloc_worker();
if (!worker)
@@ -1722,40 +1706,46 @@ static struct worker *create_worker(struct worker_pool *pool)
worker->pool = pool;
worker->id = id;
- if (pool->cpu != WORK_CPU_UNBOUND)
- worker->task = kthread_create_on_node(worker_thread,
- worker, cpu_to_node(pool->cpu),
- "kworker/%u:%d%s", pool->cpu, id, pri);
+ if (pool->cpu >= 0)
+ snprintf(id_buf, sizeof(id_buf), "%d:%d%s", pool->cpu, id,
+ pool->attrs->nice < 0 ? "H" : "");
else
- worker->task = kthread_create(worker_thread, worker,
- "kworker/u:%d%s", id, pri);
+ snprintf(id_buf, sizeof(id_buf), "u%d:%d", pool->id, id);
+
+ worker->task = kthread_create_on_node(worker_thread, worker, pool->node,
+ "kworker/%s", id_buf);
if (IS_ERR(worker->task))
goto fail;
- if (std_worker_pool_pri(pool))
- set_user_nice(worker->task, HIGHPRI_NICE_LEVEL);
+ /*
+ * set_cpus_allowed_ptr() will fail if the cpumask doesn't have any
+ * online CPUs. It'll be re-applied when any of the CPUs come up.
+ */
+ set_user_nice(worker->task, pool->attrs->nice);
+ set_cpus_allowed_ptr(worker->task, pool->attrs->cpumask);
+
+ /* prevent userland from meddling with cpumask of workqueue workers */
+ worker->task->flags |= PF_NO_SETAFFINITY;
/*
- * Determine CPU binding of the new worker depending on
- * %POOL_DISASSOCIATED. The caller is responsible for ensuring the
- * flag remains stable across this function. See the comments
- * above the flag definition for details.
- *
- * As an unbound worker may later become a regular one if CPU comes
- * online, make sure every worker has %PF_THREAD_BOUND set.
+ * The caller is responsible for ensuring %POOL_DISASSOCIATED
+ * remains stable across this function. See the comments above the
+ * flag definition for details.
*/
- if (!(pool->flags & POOL_DISASSOCIATED)) {
- kthread_bind(worker->task, pool->cpu);
- } else {
- worker->task->flags |= PF_THREAD_BOUND;
+ if (pool->flags & POOL_DISASSOCIATED)
worker->flags |= WORKER_UNBOUND;
- }
+
+ /* successful, commit the pointer to idr */
+ spin_lock_irq(&pool->lock);
+ idr_replace(&pool->worker_idr, worker, worker->id);
+ spin_unlock_irq(&pool->lock);
return worker;
+
fail:
if (id >= 0) {
spin_lock_irq(&pool->lock);
- ida_remove(&pool->worker_ida, id);
+ idr_remove(&pool->worker_idr, id);
spin_unlock_irq(&pool->lock);
}
kfree(worker);
@@ -1780,6 +1770,30 @@ static void start_worker(struct worker *worker)
}
/**
+ * create_and_start_worker - create and start a worker for a pool
+ * @pool: the target pool
+ *
+ * Grab the managership of @pool and create and start a new worker for it.
+ */
+static int create_and_start_worker(struct worker_pool *pool)
+{
+ struct worker *worker;
+
+ mutex_lock(&pool->manager_mutex);
+
+ worker = create_worker(pool);
+ if (worker) {
+ spin_lock_irq(&pool->lock);
+ start_worker(worker);
+ spin_unlock_irq(&pool->lock);
+ }
+
+ mutex_unlock(&pool->manager_mutex);
+
+ return worker ? 0 : -ENOMEM;
+}
+
+/**
* destroy_worker - destroy a workqueue worker
* @worker: worker to be destroyed
*
@@ -1791,11 +1805,14 @@ static void start_worker(struct worker *worker)
static void destroy_worker(struct worker *worker)
{
struct worker_pool *pool = worker->pool;
- int id = worker->id;
+
+ lockdep_assert_held(&pool->manager_mutex);
+ lockdep_assert_held(&pool->lock);
/* sanity check frenzy */
- BUG_ON(worker->current_work);
- BUG_ON(!list_empty(&worker->scheduled));
+ if (WARN_ON(worker->current_work) ||
+ WARN_ON(!list_empty(&worker->scheduled)))
+ return;
if (worker->flags & WORKER_STARTED)
pool->nr_workers--;
@@ -1805,13 +1822,14 @@ static void destroy_worker(struct worker *worker)
list_del_init(&worker->entry);
worker->flags |= WORKER_DIE;
+ idr_remove(&pool->worker_idr, worker->id);
+
spin_unlock_irq(&pool->lock);
kthread_stop(worker->task);
kfree(worker);
spin_lock_irq(&pool->lock);
- ida_remove(&pool->worker_ida, id);
}
static void idle_worker_timeout(unsigned long __pool)
@@ -1840,23 +1858,21 @@ static void idle_worker_timeout(unsigned long __pool)
spin_unlock_irq(&pool->lock);
}
-static bool send_mayday(struct work_struct *work)
+static void send_mayday(struct work_struct *work)
{
struct pool_workqueue *pwq = get_work_pwq(work);
struct workqueue_struct *wq = pwq->wq;
- unsigned int cpu;
- if (!(wq->flags & WQ_RESCUER))
- return false;
+ lockdep_assert_held(&wq_mayday_lock);
+
+ if (!wq->rescuer)
+ return;
/* mayday mayday mayday */
- cpu = pwq->pool->cpu;
- /* WORK_CPU_UNBOUND can't be set in cpumask, use cpu 0 instead */
- if (cpu == WORK_CPU_UNBOUND)
- cpu = 0;
- if (!mayday_test_and_set_cpu(cpu, wq->mayday_mask))
+ if (list_empty(&pwq->mayday_node)) {
+ list_add_tail(&pwq->mayday_node, &wq->maydays);
wake_up_process(wq->rescuer->task);
- return true;
+ }
}
static void pool_mayday_timeout(unsigned long __pool)
@@ -1864,7 +1880,8 @@ static void pool_mayday_timeout(unsigned long __pool)
struct worker_pool *pool = (void *)__pool;
struct work_struct *work;
- spin_lock_irq(&pool->lock);
+ spin_lock_irq(&wq_mayday_lock); /* for wq->maydays */
+ spin_lock(&pool->lock);
if (need_to_create_worker(pool)) {
/*
@@ -1877,7 +1894,8 @@ static void pool_mayday_timeout(unsigned long __pool)
send_mayday(work);
}
- spin_unlock_irq(&pool->lock);
+ spin_unlock(&pool->lock);
+ spin_unlock_irq(&wq_mayday_lock);
mod_timer(&pool->mayday_timer, jiffies + MAYDAY_INTERVAL);
}
@@ -1892,8 +1910,8 @@ static void pool_mayday_timeout(unsigned long __pool)
* sent to all rescuers with works scheduled on @pool to resolve
* possible allocation deadlock.
*
- * On return, need_to_create_worker() is guaranteed to be false and
- * may_start_working() true.
+ * On return, need_to_create_worker() is guaranteed to be %false and
+ * may_start_working() %true.
*
* LOCKING:
* spin_lock_irq(pool->lock) which may be released and regrabbed
@@ -1901,7 +1919,7 @@ static void pool_mayday_timeout(unsigned long __pool)
* manager.
*
* RETURNS:
- * false if no action was taken and pool->lock stayed locked, true
+ * %false if no action was taken and pool->lock stayed locked, %true
* otherwise.
*/
static bool maybe_create_worker(struct worker_pool *pool)
@@ -1924,7 +1942,8 @@ restart:
del_timer_sync(&pool->mayday_timer);
spin_lock_irq(&pool->lock);
start_worker(worker);
- BUG_ON(need_to_create_worker(pool));
+ if (WARN_ON_ONCE(need_to_create_worker(pool)))
+ goto restart;
return true;
}
@@ -1957,7 +1976,7 @@ restart:
* multiple times. Called only from manager.
*
* RETURNS:
- * false if no action was taken and pool->lock stayed locked, true
+ * %false if no action was taken and pool->lock stayed locked, %true
* otherwise.
*/
static bool maybe_destroy_workers(struct worker_pool *pool)
@@ -2008,42 +2027,37 @@ static bool manage_workers(struct worker *worker)
struct worker_pool *pool = worker->pool;
bool ret = false;
- if (pool->flags & POOL_MANAGING_WORKERS)
+ /*
+ * Managership is governed by two mutexes - manager_arb and
+ * manager_mutex. manager_arb handles arbitration of manager role.
+ * Anyone who successfully grabs manager_arb wins the arbitration
+ * and becomes the manager. mutex_trylock() on pool->manager_arb
+ * failure while holding pool->lock reliably indicates that someone
+ * else is managing the pool and the worker which failed trylock
+ * can proceed to executing work items. This means that anyone
+ * grabbing manager_arb is responsible for actually performing
+ * manager duties. If manager_arb is grabbed and released without
+ * actual management, the pool may stall indefinitely.
+ *
+ * manager_mutex is used for exclusion of actual management
+ * operations. The holder of manager_mutex can be sure that none
+ * of management operations, including creation and destruction of
+ * workers, won't take place until the mutex is released. Because
+ * manager_mutex doesn't interfere with manager role arbitration,
+ * it is guaranteed that the pool's management, while may be
+ * delayed, won't be disturbed by someone else grabbing
+ * manager_mutex.
+ */
+ if (!mutex_trylock(&pool->manager_arb))
return ret;
- pool->flags |= POOL_MANAGING_WORKERS;
-
/*
- * To simplify both worker management and CPU hotplug, hold off
- * management while hotplug is in progress. CPU hotplug path can't
- * grab %POOL_MANAGING_WORKERS to achieve this because that can
- * lead to idle worker depletion (all become busy thinking someone
- * else is managing) which in turn can result in deadlock under
- * extreme circumstances. Use @pool->assoc_mutex to synchronize
- * manager against CPU hotplug.
- *
- * assoc_mutex would always be free unless CPU hotplug is in
- * progress. trylock first without dropping @pool->lock.
+ * With manager arbitration won, manager_mutex would be free in
+ * most cases. trylock first without dropping @pool->lock.
*/
- if (unlikely(!mutex_trylock(&pool->assoc_mutex))) {
+ if (unlikely(!mutex_trylock(&pool->manager_mutex))) {
spin_unlock_irq(&pool->lock);
- mutex_lock(&pool->assoc_mutex);
- /*
- * CPU hotplug could have happened while we were waiting
- * for assoc_mutex. Hotplug itself can't handle us
- * because manager isn't either on idle or busy list, and
- * @pool's state and ours could have deviated.
- *
- * As hotplug is now excluded via assoc_mutex, we can
- * simply try to bind. It will succeed or fail depending
- * on @pool's current state. Try it and adjust
- * %WORKER_UNBOUND accordingly.
- */
- if (worker_maybe_bind_and_lock(worker))
- worker->flags &= ~WORKER_UNBOUND;
- else
- worker->flags |= WORKER_UNBOUND;
-
+ mutex_lock(&pool->manager_mutex);
ret = true;
}
@@ -2056,8 +2070,8 @@ static bool manage_workers(struct worker *worker)
ret |= maybe_destroy_workers(pool);
ret |= maybe_create_worker(pool);
- pool->flags &= ~POOL_MANAGING_WORKERS;
- mutex_unlock(&pool->assoc_mutex);
+ mutex_unlock(&pool->manager_mutex);
+ mutex_unlock(&pool->manager_arb);
return ret;
}
@@ -2211,11 +2225,11 @@ static void process_scheduled_works(struct worker *worker)
* worker_thread - the worker thread function
* @__worker: self
*
- * The worker thread function. There are NR_CPU_WORKER_POOLS dynamic pools
- * of these per each cpu. These workers process all works regardless of
- * their specific target workqueue. The only exception is works which
- * belong to workqueues with a rescuer which will be explained in
- * rescuer_thread().
+ * The worker thread function. All workers belong to a worker_pool -
+ * either a per-cpu one or dynamic unbound one. These workers process all
+ * work items regardless of their specific target workqueue. The only
+ * exception is work items which belong to workqueues with a rescuer which
+ * will be explained in rescuer_thread().
*/
static int worker_thread(void *__worker)
{
@@ -2227,19 +2241,12 @@ static int worker_thread(void *__worker)
woke_up:
spin_lock_irq(&pool->lock);
- /* we are off idle list if destruction or rebind is requested */
- if (unlikely(list_empty(&worker->entry))) {
+ /* am I supposed to die? */
+ if (unlikely(worker->flags & WORKER_DIE)) {
spin_unlock_irq(&pool->lock);
-
- /* if DIE is set, destruction is requested */
- if (worker->flags & WORKER_DIE) {
- worker->task->flags &= ~PF_WQ_WORKER;
- return 0;
- }
-
- /* otherwise, rebind */
- idle_worker_rebind(worker);
- goto woke_up;
+ WARN_ON_ONCE(!list_empty(&worker->entry));
+ worker->task->flags &= ~PF_WQ_WORKER;
+ return 0;
}
worker_leave_idle(worker);
@@ -2257,14 +2264,16 @@ recheck:
* preparing to process a work or actually processing it.
* Make sure nobody diddled with it while I was sleeping.
*/
- BUG_ON(!list_empty(&worker->scheduled));
+ WARN_ON_ONCE(!list_empty(&worker->scheduled));
/*
- * When control reaches this point, we're guaranteed to have
- * at least one idle worker or that someone else has already
- * assumed the manager role.
+ * Finish PREP stage. We're guaranteed to have at least one idle
+ * worker or that someone else has already assumed the manager
+ * role. This is where @worker starts participating in concurrency
+ * management if applicable and concurrency management is restored
+ * after being rebound. See rebind_workers() for details.
*/
- worker_clr_flags(worker, WORKER_PREP);
+ worker_clr_flags(worker, WORKER_PREP | WORKER_REBOUND);
do {
struct work_struct *work =
@@ -2306,7 +2315,7 @@ sleep:
* @__rescuer: self
*
* Workqueue rescuer thread function. There's one rescuer for each
- * workqueue which has WQ_RESCUER set.
+ * workqueue which has WQ_MEM_RECLAIM set.
*
* Regular work processing on a pool may block trying to create a new
* worker which uses GFP_KERNEL allocation which has slight chance of
@@ -2325,8 +2334,6 @@ static int rescuer_thread(void *__rescuer)
struct worker *rescuer = __rescuer;
struct workqueue_struct *wq = rescuer->rescue_wq;
struct list_head *scheduled = &rescuer->scheduled;
- bool is_unbound = wq->flags & WQ_UNBOUND;
- unsigned int cpu;
set_user_nice(current, RESCUER_NICE_LEVEL);
@@ -2344,28 +2351,29 @@ repeat:
return 0;
}
- /*
- * See whether any cpu is asking for help. Unbounded
- * workqueues use cpu 0 in mayday_mask for CPU_UNBOUND.
- */
- for_each_mayday_cpu(cpu, wq->mayday_mask) {
- unsigned int tcpu = is_unbound ? WORK_CPU_UNBOUND : cpu;
- struct pool_workqueue *pwq = get_pwq(tcpu, wq);
+ /* see whether any pwq is asking for help */
+ spin_lock_irq(&wq_mayday_lock);
+
+ while (!list_empty(&wq->maydays)) {
+ struct pool_workqueue *pwq = list_first_entry(&wq->maydays,
+ struct pool_workqueue, mayday_node);
struct worker_pool *pool = pwq->pool;
struct work_struct *work, *n;
__set_current_state(TASK_RUNNING);
- mayday_clear_cpu(cpu, wq->mayday_mask);
+ list_del_init(&pwq->mayday_node);
+
+ spin_unlock_irq(&wq_mayday_lock);
/* migrate to the target cpu if possible */
+ worker_maybe_bind_and_lock(pool);
rescuer->pool = pool;
- worker_maybe_bind_and_lock(rescuer);
/*
* Slurp in all works issued via this workqueue and
* process'em.
*/
- BUG_ON(!list_empty(&rescuer->scheduled));
+ WARN_ON_ONCE(!list_empty(&rescuer->scheduled));
list_for_each_entry_safe(work, n, &pool->worklist, entry)
if (get_work_pwq(work) == pwq)
move_linked_works(work, scheduled, &n);
@@ -2380,9 +2388,13 @@ repeat:
if (keep_working(pool))
wake_up_worker(pool);
- spin_unlock_irq(&pool->lock);
+ rescuer->pool = NULL;
+ spin_unlock(&pool->lock);
+ spin_lock(&wq_mayday_lock);
}
+ spin_unlock_irq(&wq_mayday_lock);
+
/* rescuers should never participate in concurrency management */
WARN_ON_ONCE(!(rescuer->flags & WORKER_NOT_RUNNING));
schedule();
@@ -2486,7 +2498,7 @@ static void insert_wq_barrier(struct pool_workqueue *pwq,
* advanced to @work_color.
*
* CONTEXT:
- * mutex_lock(wq->flush_mutex).
+ * mutex_lock(wq->mutex).
*
* RETURNS:
* %true if @flush_color >= 0 and there's something to flush. %false
@@ -2496,21 +2508,20 @@ static bool flush_workqueue_prep_pwqs(struct workqueue_struct *wq,
int flush_color, int work_color)
{
bool wait = false;
- unsigned int cpu;
+ struct pool_workqueue *pwq;
if (flush_color >= 0) {
- BUG_ON(atomic_read(&wq->nr_pwqs_to_flush));
+ WARN_ON_ONCE(atomic_read(&wq->nr_pwqs_to_flush));
atomic_set(&wq->nr_pwqs_to_flush, 1);
}
- for_each_pwq_cpu(cpu, wq) {
- struct pool_workqueue *pwq = get_pwq(cpu, wq);
+ for_each_pwq(pwq, wq) {
struct worker_pool *pool = pwq->pool;
spin_lock_irq(&pool->lock);
if (flush_color >= 0) {
- BUG_ON(pwq->flush_color != -1);
+ WARN_ON_ONCE(pwq->flush_color != -1);
if (pwq->nr_in_flight[flush_color]) {
pwq->flush_color = flush_color;
@@ -2520,7 +2531,7 @@ static bool flush_workqueue_prep_pwqs(struct workqueue_struct *wq,
}
if (work_color >= 0) {
- BUG_ON(work_color != work_next_color(pwq->work_color));
+ WARN_ON_ONCE(work_color != work_next_color(pwq->work_color));
pwq->work_color = work_color;
}
@@ -2537,11 +2548,8 @@ static bool flush_workqueue_prep_pwqs(struct workqueue_struct *wq,
* flush_workqueue - ensure that any scheduled work has run to completion.
* @wq: workqueue to flush
*
- * Forces execution of the workqueue and blocks until its completion.
- * This is typically used in driver shutdown handlers.
- *
- * We sleep until all works which were queued on entry have been handled,
- * but we are not livelocked by new incoming ones.
+ * This function sleeps until all work items which were queued on entry
+ * have finished execution, but it is not livelocked by new incoming ones.
*/
void flush_workqueue(struct workqueue_struct *wq)
{
@@ -2555,7 +2563,7 @@ void flush_workqueue(struct workqueue_struct *wq)
lock_map_acquire(&wq->lockdep_map);
lock_map_release(&wq->lockdep_map);
- mutex_lock(&wq->flush_mutex);
+ mutex_lock(&wq->mutex);
/*
* Start-to-wait phase
@@ -2568,13 +2576,13 @@ void flush_workqueue(struct workqueue_struct *wq)
* becomes our flush_color and work_color is advanced
* by one.
*/
- BUG_ON(!list_empty(&wq->flusher_overflow));
+ WARN_ON_ONCE(!list_empty(&wq->flusher_overflow));
this_flusher.flush_color = wq->work_color;
wq->work_color = next_color;
if (!wq->first_flusher) {
/* no flush in progress, become the first flusher */
- BUG_ON(wq->flush_color != this_flusher.flush_color);
+ WARN_ON_ONCE(wq->flush_color != this_flusher.flush_color);
wq->first_flusher = &this_flusher;
@@ -2587,7 +2595,7 @@ void flush_workqueue(struct workqueue_struct *wq)
}
} else {
/* wait in queue */
- BUG_ON(wq->flush_color == this_flusher.flush_color);
+ WARN_ON_ONCE(wq->flush_color == this_flusher.flush_color);
list_add_tail(&this_flusher.list, &wq->flusher_queue);
flush_workqueue_prep_pwqs(wq, -1, wq->work_color);
}
@@ -2600,7 +2608,7 @@ void flush_workqueue(struct workqueue_struct *wq)
list_add_tail(&this_flusher.list, &wq->flusher_overflow);
}
- mutex_unlock(&wq->flush_mutex);
+ mutex_unlock(&wq->mutex);
wait_for_completion(&this_flusher.done);
@@ -2613,7 +2621,7 @@ void flush_workqueue(struct workqueue_struct *wq)
if (wq->first_flusher != &this_flusher)
return;
- mutex_lock(&wq->flush_mutex);
+ mutex_lock(&wq->mutex);
/* we might have raced, check again with mutex held */
if (wq->first_flusher != &this_flusher)
@@ -2621,8 +2629,8 @@ void flush_workqueue(struct workqueue_struct *wq)
wq->first_flusher = NULL;
- BUG_ON(!list_empty(&this_flusher.list));
- BUG_ON(wq->flush_color != this_flusher.flush_color);
+ WARN_ON_ONCE(!list_empty(&this_flusher.list));
+ WARN_ON_ONCE(wq->flush_color != this_flusher.flush_color);
while (true) {
struct wq_flusher *next, *tmp;
@@ -2635,8 +2643,8 @@ void flush_workqueue(struct workqueue_struct *wq)
complete(&next->done);
}
- BUG_ON(!list_empty(&wq->flusher_overflow) &&
- wq->flush_color != work_next_color(wq->work_color));
+ WARN_ON_ONCE(!list_empty(&wq->flusher_overflow) &&
+ wq->flush_color != work_next_color(wq->work_color));
/* this flush_color is finished, advance by one */
wq->flush_color = work_next_color(wq->flush_color);
@@ -2660,7 +2668,7 @@ void flush_workqueue(struct workqueue_struct *wq)
}
if (list_empty(&wq->flusher_queue)) {
- BUG_ON(wq->flush_color != wq->work_color);
+ WARN_ON_ONCE(wq->flush_color != wq->work_color);
break;
}
@@ -2668,8 +2676,8 @@ void flush_workqueue(struct workqueue_struct *wq)
* Need to flush more colors. Make the next flusher
* the new first flusher and arm pwqs.
*/
- BUG_ON(wq->flush_color == wq->work_color);
- BUG_ON(wq->flush_color != next->flush_color);
+ WARN_ON_ONCE(wq->flush_color == wq->work_color);
+ WARN_ON_ONCE(wq->flush_color != next->flush_color);
list_del_init(&next->list);
wq->first_flusher = next;
@@ -2685,7 +2693,7 @@ void flush_workqueue(struct workqueue_struct *wq)
}
out_unlock:
- mutex_unlock(&wq->flush_mutex);
+ mutex_unlock(&wq->mutex);
}
EXPORT_SYMBOL_GPL(flush_workqueue);
@@ -2703,22 +2711,23 @@ EXPORT_SYMBOL_GPL(flush_workqueue);
void drain_workqueue(struct workqueue_struct *wq)
{
unsigned int flush_cnt = 0;
- unsigned int cpu;
+ struct pool_workqueue *pwq;
/*
* __queue_work() needs to test whether there are drainers, is much
* hotter than drain_workqueue() and already looks at @wq->flags.
- * Use WQ_DRAINING so that queue doesn't have to check nr_drainers.
+ * Use __WQ_DRAINING so that queue doesn't have to check nr_drainers.
*/
- spin_lock(&workqueue_lock);
+ mutex_lock(&wq->mutex);
if (!wq->nr_drainers++)
- wq->flags |= WQ_DRAINING;
- spin_unlock(&workqueue_lock);
+ wq->flags |= __WQ_DRAINING;
+ mutex_unlock(&wq->mutex);
reflush:
flush_workqueue(wq);
- for_each_pwq_cpu(cpu, wq) {
- struct pool_workqueue *pwq = get_pwq(cpu, wq);
+ mutex_lock(&wq->mutex);
+
+ for_each_pwq(pwq, wq) {
bool drained;
spin_lock_irq(&pwq->pool->lock);
@@ -2730,15 +2739,16 @@ reflush:
if (++flush_cnt == 10 ||
(flush_cnt % 100 == 0 && flush_cnt <= 1000))
- pr_warn("workqueue %s: flush on destruction isn't complete after %u tries\n",
+ pr_warn("workqueue %s: drain_workqueue() isn't complete after %u tries\n",
wq->name, flush_cnt);
+
+ mutex_unlock(&wq->mutex);
goto reflush;
}
- spin_lock(&workqueue_lock);
if (!--wq->nr_drainers)
- wq->flags &= ~WQ_DRAINING;
- spin_unlock(&workqueue_lock);
+ wq->flags &= ~__WQ_DRAINING;
+ mutex_unlock(&wq->mutex);
}
EXPORT_SYMBOL_GPL(drain_workqueue);
@@ -2749,11 +2759,15 @@ static bool start_flush_work(struct work_struct *work, struct wq_barrier *barr)
struct pool_workqueue *pwq;
might_sleep();
+
+ local_irq_disable();
pool = get_work_pool(work);
- if (!pool)
+ if (!pool) {
+ local_irq_enable();
return false;
+ }
- spin_lock_irq(&pool->lock);
+ spin_lock(&pool->lock);
/* see the comment in try_to_grab_pending() with the same code */
pwq = get_work_pwq(work);
if (pwq) {
@@ -2775,7 +2789,7 @@ static bool start_flush_work(struct work_struct *work, struct wq_barrier *barr)
* flusher is not running on the same workqueue by verifying write
* access.
*/
- if (pwq->wq->saved_max_active == 1 || pwq->wq->flags & WQ_RESCUER)
+ if (pwq->wq->saved_max_active == 1 || pwq->wq->rescuer)
lock_map_acquire(&pwq->wq->lockdep_map);
else
lock_map_acquire_read(&pwq->wq->lockdep_map);
@@ -2932,66 +2946,6 @@ bool cancel_delayed_work_sync(struct delayed_work *dwork)
EXPORT_SYMBOL(cancel_delayed_work_sync);
/**
- * schedule_work_on - put work task on a specific cpu
- * @cpu: cpu to put the work task on
- * @work: job to be done
- *
- * This puts a job on a specific cpu
- */
-bool schedule_work_on(int cpu, struct work_struct *work)
-{
- return queue_work_on(cpu, system_wq, work);
-}
-EXPORT_SYMBOL(schedule_work_on);
-
-/**
- * schedule_work - put work task in global workqueue
- * @work: job to be done
- *
- * Returns %false if @work was already on the kernel-global workqueue and
- * %true otherwise.
- *
- * This puts a job in the kernel-global workqueue if it was not already
- * queued and leaves it in the same position on the kernel-global
- * workqueue otherwise.
- */
-bool schedule_work(struct work_struct *work)
-{
- return queue_work(system_wq, work);
-}
-EXPORT_SYMBOL(schedule_work);
-
-/**
- * schedule_delayed_work_on - queue work in global workqueue on CPU after delay
- * @cpu: cpu to use
- * @dwork: job to be done
- * @delay: number of jiffies to wait
- *
- * After waiting for a given time this puts a job in the kernel-global
- * workqueue on the specified CPU.
- */
-bool schedule_delayed_work_on(int cpu, struct delayed_work *dwork,
- unsigned long delay)
-{
- return queue_delayed_work_on(cpu, system_wq, dwork, delay);
-}
-EXPORT_SYMBOL(schedule_delayed_work_on);
-
-/**
- * schedule_delayed_work - put work task in global workqueue after delay
- * @dwork: job to be done
- * @delay: number of jiffies to wait or 0 for immediate execution
- *
- * After waiting for a given time this puts a job in the kernel-global
- * workqueue.
- */
-bool schedule_delayed_work(struct delayed_work *dwork, unsigned long delay)
-{
- return queue_delayed_work(system_wq, dwork, delay);
-}
-EXPORT_SYMBOL(schedule_delayed_work);
-
-/**
* schedule_on_each_cpu - execute a function synchronously on each online CPU
* @func: the function to call
*
@@ -3084,51 +3038,1025 @@ int execute_in_process_context(work_func_t fn, struct execute_work *ew)
}
EXPORT_SYMBOL_GPL(execute_in_process_context);
-int keventd_up(void)
+#ifdef CONFIG_SYSFS
+/*
+ * Workqueues with WQ_SYSFS flag set is visible to userland via
+ * /sys/bus/workqueue/devices/WQ_NAME. All visible workqueues have the
+ * following attributes.
+ *
+ * per_cpu RO bool : whether the workqueue is per-cpu or unbound
+ * max_active RW int : maximum number of in-flight work items
+ *
+ * Unbound workqueues have the following extra attributes.
+ *
+ * id RO int : the associated pool ID
+ * nice RW int : nice value of the workers
+ * cpumask RW mask : bitmask of allowed CPUs for the workers
+ */
+struct wq_device {
+ struct workqueue_struct *wq;
+ struct device dev;
+};
+
+static struct workqueue_struct *dev_to_wq(struct device *dev)
+{
+ struct wq_device *wq_dev = container_of(dev, struct wq_device, dev);
+
+ return wq_dev->wq;
+}
+
+static ssize_t wq_per_cpu_show(struct device *dev,
+ struct device_attribute *attr, char *buf)
+{
+ struct workqueue_struct *wq = dev_to_wq(dev);
+
+ return scnprintf(buf, PAGE_SIZE, "%d\n", (bool)!(wq->flags & WQ_UNBOUND));
+}
+
+static ssize_t wq_max_active_show(struct device *dev,
+ struct device_attribute *attr, char *buf)
+{
+ struct workqueue_struct *wq = dev_to_wq(dev);
+
+ return scnprintf(buf, PAGE_SIZE, "%d\n", wq->saved_max_active);
+}
+
+static ssize_t wq_max_active_store(struct device *dev,
+ struct device_attribute *attr,
+ const char *buf, size_t count)
+{
+ struct workqueue_struct *wq = dev_to_wq(dev);
+ int val;
+
+ if (sscanf(buf, "%d", &val) != 1 || val <= 0)
+ return -EINVAL;
+
+ workqueue_set_max_active(wq, val);
+ return count;
+}
+
+static struct device_attribute wq_sysfs_attrs[] = {
+ __ATTR(per_cpu, 0444, wq_per_cpu_show, NULL),
+ __ATTR(max_active, 0644, wq_max_active_show, wq_max_active_store),
+ __ATTR_NULL,
+};
+
+static ssize_t wq_pool_ids_show(struct device *dev,
+ struct device_attribute *attr, char *buf)
+{
+ struct workqueue_struct *wq = dev_to_wq(dev);
+ const char *delim = "";
+ int node, written = 0;
+
+ rcu_read_lock_sched();
+ for_each_node(node) {
+ written += scnprintf(buf + written, PAGE_SIZE - written,
+ "%s%d:%d", delim, node,
+ unbound_pwq_by_node(wq, node)->pool->id);
+ delim = " ";
+ }
+ written += scnprintf(buf + written, PAGE_SIZE - written, "\n");
+ rcu_read_unlock_sched();
+
+ return written;
+}
+
+static ssize_t wq_nice_show(struct device *dev, struct device_attribute *attr,
+ char *buf)
+{
+ struct workqueue_struct *wq = dev_to_wq(dev);
+ int written;
+
+ mutex_lock(&wq->mutex);
+ written = scnprintf(buf, PAGE_SIZE, "%d\n", wq->unbound_attrs->nice);
+ mutex_unlock(&wq->mutex);
+
+ return written;
+}
+
+/* prepare workqueue_attrs for sysfs store operations */
+static struct workqueue_attrs *wq_sysfs_prep_attrs(struct workqueue_struct *wq)
+{
+ struct workqueue_attrs *attrs;
+
+ attrs = alloc_workqueue_attrs(GFP_KERNEL);
+ if (!attrs)
+ return NULL;
+
+ mutex_lock(&wq->mutex);
+ copy_workqueue_attrs(attrs, wq->unbound_attrs);
+ mutex_unlock(&wq->mutex);
+ return attrs;
+}
+
+static ssize_t wq_nice_store(struct device *dev, struct device_attribute *attr,
+ const char *buf, size_t count)
+{
+ struct workqueue_struct *wq = dev_to_wq(dev);
+ struct workqueue_attrs *attrs;
+ int ret;
+
+ attrs = wq_sysfs_prep_attrs(wq);
+ if (!attrs)
+ return -ENOMEM;
+
+ if (sscanf(buf, "%d", &attrs->nice) == 1 &&
+ attrs->nice >= -20 && attrs->nice <= 19)
+ ret = apply_workqueue_attrs(wq, attrs);
+ else
+ ret = -EINVAL;
+
+ free_workqueue_attrs(attrs);
+ return ret ?: count;
+}
+
+static ssize_t wq_cpumask_show(struct device *dev,
+ struct device_attribute *attr, char *buf)
+{
+ struct workqueue_struct *wq = dev_to_wq(dev);
+ int written;
+
+ mutex_lock(&wq->mutex);
+ written = cpumask_scnprintf(buf, PAGE_SIZE, wq->unbound_attrs->cpumask);
+ mutex_unlock(&wq->mutex);
+
+ written += scnprintf(buf + written, PAGE_SIZE - written, "\n");
+ return written;
+}
+
+static ssize_t wq_cpumask_store(struct device *dev,
+ struct device_attribute *attr,
+ const char *buf, size_t count)
+{
+ struct workqueue_struct *wq = dev_to_wq(dev);
+ struct workqueue_attrs *attrs;
+ int ret;
+
+ attrs = wq_sysfs_prep_attrs(wq);
+ if (!attrs)
+ return -ENOMEM;
+
+ ret = cpumask_parse(buf, attrs->cpumask);
+ if (!ret)
+ ret = apply_workqueue_attrs(wq, attrs);
+
+ free_workqueue_attrs(attrs);
+ return ret ?: count;
+}
+
+static ssize_t wq_numa_show(struct device *dev, struct device_attribute *attr,
+ char *buf)
+{
+ struct workqueue_struct *wq = dev_to_wq(dev);
+ int written;
+
+ mutex_lock(&wq->mutex);
+ written = scnprintf(buf, PAGE_SIZE, "%d\n",
+ !wq->unbound_attrs->no_numa);
+ mutex_unlock(&wq->mutex);
+
+ return written;
+}
+
+static ssize_t wq_numa_store(struct device *dev, struct device_attribute *attr,
+ const char *buf, size_t count)
+{
+ struct workqueue_struct *wq = dev_to_wq(dev);
+ struct workqueue_attrs *attrs;
+ int v, ret;
+
+ attrs = wq_sysfs_prep_attrs(wq);
+ if (!attrs)
+ return -ENOMEM;
+
+ ret = -EINVAL;
+ if (sscanf(buf, "%d", &v) == 1) {
+ attrs->no_numa = !v;
+ ret = apply_workqueue_attrs(wq, attrs);
+ }
+
+ free_workqueue_attrs(attrs);
+ return ret ?: count;
+}
+
+static struct device_attribute wq_sysfs_unbound_attrs[] = {
+ __ATTR(pool_ids, 0444, wq_pool_ids_show, NULL),
+ __ATTR(nice, 0644, wq_nice_show, wq_nice_store),
+ __ATTR(cpumask, 0644, wq_cpumask_show, wq_cpumask_store),
+ __ATTR(numa, 0644, wq_numa_show, wq_numa_store),
+ __ATTR_NULL,
+};
+
+static struct bus_type wq_subsys = {
+ .name = "workqueue",
+ .dev_attrs = wq_sysfs_attrs,
+};
+
+static int __init wq_sysfs_init(void)
{
- return system_wq != NULL;
+ return subsys_virtual_register(&wq_subsys, NULL);
}
+core_initcall(wq_sysfs_init);
-static int alloc_pwqs(struct workqueue_struct *wq)
+static void wq_device_release(struct device *dev)
{
+ struct wq_device *wq_dev = container_of(dev, struct wq_device, dev);
+
+ kfree(wq_dev);
+}
+
+/**
+ * workqueue_sysfs_register - make a workqueue visible in sysfs
+ * @wq: the workqueue to register
+ *
+ * Expose @wq in sysfs under /sys/bus/workqueue/devices.
+ * alloc_workqueue*() automatically calls this function if WQ_SYSFS is set
+ * which is the preferred method.
+ *
+ * Workqueue user should use this function directly iff it wants to apply
+ * workqueue_attrs before making the workqueue visible in sysfs; otherwise,
+ * apply_workqueue_attrs() may race against userland updating the
+ * attributes.
+ *
+ * Returns 0 on success, -errno on failure.
+ */
+int workqueue_sysfs_register(struct workqueue_struct *wq)
+{
+ struct wq_device *wq_dev;
+ int ret;
+
/*
- * pwqs are forced aligned according to WORK_STRUCT_FLAG_BITS.
- * Make sure that the alignment isn't lower than that of
- * unsigned long long.
+ * Adjusting max_active or creating new pwqs by applyting
+ * attributes breaks ordering guarantee. Disallow exposing ordered
+ * workqueues.
*/
- const size_t size = sizeof(struct pool_workqueue);
- const size_t align = max_t(size_t, 1 << WORK_STRUCT_FLAG_BITS,
- __alignof__(unsigned long long));
+ if (WARN_ON(wq->flags & __WQ_ORDERED))
+ return -EINVAL;
- if (!(wq->flags & WQ_UNBOUND))
- wq->pool_wq.pcpu = __alloc_percpu(size, align);
- else {
- void *ptr;
+ wq->wq_dev = wq_dev = kzalloc(sizeof(*wq_dev), GFP_KERNEL);
+ if (!wq_dev)
+ return -ENOMEM;
+
+ wq_dev->wq = wq;
+ wq_dev->dev.bus = &wq_subsys;
+ wq_dev->dev.init_name = wq->name;
+ wq_dev->dev.release = wq_device_release;
+
+ /*
+ * unbound_attrs are created separately. Suppress uevent until
+ * everything is ready.
+ */
+ dev_set_uevent_suppress(&wq_dev->dev, true);
+
+ ret = device_register(&wq_dev->dev);
+ if (ret) {
+ kfree(wq_dev);
+ wq->wq_dev = NULL;
+ return ret;
+ }
+
+ if (wq->flags & WQ_UNBOUND) {
+ struct device_attribute *attr;
+
+ for (attr = wq_sysfs_unbound_attrs; attr->attr.name; attr++) {
+ ret = device_create_file(&wq_dev->dev, attr);
+ if (ret) {
+ device_unregister(&wq_dev->dev);
+ wq->wq_dev = NULL;
+ return ret;
+ }
+ }
+ }
+
+ kobject_uevent(&wq_dev->dev.kobj, KOBJ_ADD);
+ return 0;
+}
+
+/**
+ * workqueue_sysfs_unregister - undo workqueue_sysfs_register()
+ * @wq: the workqueue to unregister
+ *
+ * If @wq is registered to sysfs by workqueue_sysfs_register(), unregister.
+ */
+static void workqueue_sysfs_unregister(struct workqueue_struct *wq)
+{
+ struct wq_device *wq_dev = wq->wq_dev;
+
+ if (!wq->wq_dev)
+ return;
+
+ wq->wq_dev = NULL;
+ device_unregister(&wq_dev->dev);
+}
+#else /* CONFIG_SYSFS */
+static void workqueue_sysfs_unregister(struct workqueue_struct *wq) { }
+#endif /* CONFIG_SYSFS */
+
+/**
+ * free_workqueue_attrs - free a workqueue_attrs
+ * @attrs: workqueue_attrs to free
+ *
+ * Undo alloc_workqueue_attrs().
+ */
+void free_workqueue_attrs(struct workqueue_attrs *attrs)
+{
+ if (attrs) {
+ free_cpumask_var(attrs->cpumask);
+ kfree(attrs);
+ }
+}
+
+/**
+ * alloc_workqueue_attrs - allocate a workqueue_attrs
+ * @gfp_mask: allocation mask to use
+ *
+ * Allocate a new workqueue_attrs, initialize with default settings and
+ * return it. Returns NULL on failure.
+ */
+struct workqueue_attrs *alloc_workqueue_attrs(gfp_t gfp_mask)
+{
+ struct workqueue_attrs *attrs;
+
+ attrs = kzalloc(sizeof(*attrs), gfp_mask);
+ if (!attrs)
+ goto fail;
+ if (!alloc_cpumask_var(&attrs->cpumask, gfp_mask))
+ goto fail;
+
+ cpumask_copy(attrs->cpumask, cpu_possible_mask);
+ return attrs;
+fail:
+ free_workqueue_attrs(attrs);
+ return NULL;
+}
+
+static void copy_workqueue_attrs(struct workqueue_attrs *to,
+ const struct workqueue_attrs *from)
+{
+ to->nice = from->nice;
+ cpumask_copy(to->cpumask, from->cpumask);
+}
+
+/* hash value of the content of @attr */
+static u32 wqattrs_hash(const struct workqueue_attrs *attrs)
+{
+ u32 hash = 0;
+
+ hash = jhash_1word(attrs->nice, hash);
+ hash = jhash(cpumask_bits(attrs->cpumask),
+ BITS_TO_LONGS(nr_cpumask_bits) * sizeof(long), hash);
+ return hash;
+}
+
+/* content equality test */
+static bool wqattrs_equal(const struct workqueue_attrs *a,
+ const struct workqueue_attrs *b)
+{
+ if (a->nice != b->nice)
+ return false;
+ if (!cpumask_equal(a->cpumask, b->cpumask))
+ return false;
+ return true;
+}
+
+/**
+ * init_worker_pool - initialize a newly zalloc'd worker_pool
+ * @pool: worker_pool to initialize
+ *
+ * Initiailize a newly zalloc'd @pool. It also allocates @pool->attrs.
+ * Returns 0 on success, -errno on failure. Even on failure, all fields
+ * inside @pool proper are initialized and put_unbound_pool() can be called
+ * on @pool safely to release it.
+ */
+static int init_worker_pool(struct worker_pool *pool)
+{
+ spin_lock_init(&pool->lock);
+ pool->id = -1;
+ pool->cpu = -1;
+ pool->node = NUMA_NO_NODE;
+ pool->flags |= POOL_DISASSOCIATED;
+ INIT_LIST_HEAD(&pool->worklist);
+ INIT_LIST_HEAD(&pool->idle_list);
+ hash_init(pool->busy_hash);
+
+ init_timer_deferrable(&pool->idle_timer);
+ pool->idle_timer.function = idle_worker_timeout;
+ pool->idle_timer.data = (unsigned long)pool;
+
+ setup_timer(&pool->mayday_timer, pool_mayday_timeout,
+ (unsigned long)pool);
+
+ mutex_init(&pool->manager_arb);
+ mutex_init(&pool->manager_mutex);
+ idr_init(&pool->worker_idr);
+
+ INIT_HLIST_NODE(&pool->hash_node);
+ pool->refcnt = 1;
+
+ /* shouldn't fail above this point */
+ pool->attrs = alloc_workqueue_attrs(GFP_KERNEL);
+ if (!pool->attrs)
+ return -ENOMEM;
+ return 0;
+}
+
+static void rcu_free_pool(struct rcu_head *rcu)
+{
+ struct worker_pool *pool = container_of(rcu, struct worker_pool, rcu);
+
+ idr_destroy(&pool->worker_idr);
+ free_workqueue_attrs(pool->attrs);
+ kfree(pool);
+}
+
+/**
+ * put_unbound_pool - put a worker_pool
+ * @pool: worker_pool to put
+ *
+ * Put @pool. If its refcnt reaches zero, it gets destroyed in sched-RCU
+ * safe manner. get_unbound_pool() calls this function on its failure path
+ * and this function should be able to release pools which went through,
+ * successfully or not, init_worker_pool().
+ *
+ * Should be called with wq_pool_mutex held.
+ */
+static void put_unbound_pool(struct worker_pool *pool)
+{
+ struct worker *worker;
+
+ lockdep_assert_held(&wq_pool_mutex);
+
+ if (--pool->refcnt)
+ return;
+
+ /* sanity checks */
+ if (WARN_ON(!(pool->flags & POOL_DISASSOCIATED)) ||
+ WARN_ON(!list_empty(&pool->worklist)))
+ return;
+
+ /* release id and unhash */
+ if (pool->id >= 0)
+ idr_remove(&worker_pool_idr, pool->id);
+ hash_del(&pool->hash_node);
+
+ /*
+ * Become the manager and destroy all workers. Grabbing
+ * manager_arb prevents @pool's workers from blocking on
+ * manager_mutex.
+ */
+ mutex_lock(&pool->manager_arb);
+ mutex_lock(&pool->manager_mutex);
+ spin_lock_irq(&pool->lock);
+
+ while ((worker = first_worker(pool)))
+ destroy_worker(worker);
+ WARN_ON(pool->nr_workers || pool->nr_idle);
+
+ spin_unlock_irq(&pool->lock);
+ mutex_unlock(&pool->manager_mutex);
+ mutex_unlock(&pool->manager_arb);
+
+ /* shut down the timers */
+ del_timer_sync(&pool->idle_timer);
+ del_timer_sync(&pool->mayday_timer);
+
+ /* sched-RCU protected to allow dereferences from get_work_pool() */
+ call_rcu_sched(&pool->rcu, rcu_free_pool);
+}
+
+/**
+ * get_unbound_pool - get a worker_pool with the specified attributes
+ * @attrs: the attributes of the worker_pool to get
+ *
+ * Obtain a worker_pool which has the same attributes as @attrs, bump the
+ * reference count and return it. If there already is a matching
+ * worker_pool, it will be used; otherwise, this function attempts to
+ * create a new one. On failure, returns NULL.
+ *
+ * Should be called with wq_pool_mutex held.
+ */
+static struct worker_pool *get_unbound_pool(const struct workqueue_attrs *attrs)
+{
+ u32 hash = wqattrs_hash(attrs);
+ struct worker_pool *pool;
+ int node;
+
+ lockdep_assert_held(&wq_pool_mutex);
+
+ /* do we already have a matching pool? */
+ hash_for_each_possible(unbound_pool_hash, pool, hash_node, hash) {
+ if (wqattrs_equal(pool->attrs, attrs)) {
+ pool->refcnt++;
+ goto out_unlock;
+ }
+ }
+
+ /* nope, create a new one */
+ pool = kzalloc(sizeof(*pool), GFP_KERNEL);
+ if (!pool || init_worker_pool(pool) < 0)
+ goto fail;
+
+ if (workqueue_freezing)
+ pool->flags |= POOL_FREEZING;
+
+ lockdep_set_subclass(&pool->lock, 1); /* see put_pwq() */
+ copy_workqueue_attrs(pool->attrs, attrs);
+
+ /* if cpumask is contained inside a NUMA node, we belong to that node */
+ if (wq_numa_enabled) {
+ for_each_node(node) {
+ if (cpumask_subset(pool->attrs->cpumask,
+ wq_numa_possible_cpumask[node])) {
+ pool->node = node;
+ break;
+ }
+ }
+ }
+
+ if (worker_pool_assign_id(pool) < 0)
+ goto fail;
+
+ /* create and start the initial worker */
+ if (create_and_start_worker(pool) < 0)
+ goto fail;
+
+ /* install */
+ hash_add(unbound_pool_hash, &pool->hash_node, hash);
+out_unlock:
+ return pool;
+fail:
+ if (pool)
+ put_unbound_pool(pool);
+ return NULL;
+}
+
+static void rcu_free_pwq(struct rcu_head *rcu)
+{
+ kmem_cache_free(pwq_cache,
+ container_of(rcu, struct pool_workqueue, rcu));
+}
+
+/*
+ * Scheduled on system_wq by put_pwq() when an unbound pwq hits zero refcnt
+ * and needs to be destroyed.
+ */
+static void pwq_unbound_release_workfn(struct work_struct *work)
+{
+ struct pool_workqueue *pwq = container_of(work, struct pool_workqueue,
+ unbound_release_work);
+ struct workqueue_struct *wq = pwq->wq;
+ struct worker_pool *pool = pwq->pool;
+ bool is_last;
+
+ if (WARN_ON_ONCE(!(wq->flags & WQ_UNBOUND)))
+ return;
+
+ /*
+ * Unlink @pwq. Synchronization against wq->mutex isn't strictly
+ * necessary on release but do it anyway. It's easier to verify
+ * and consistent with the linking path.
+ */
+ mutex_lock(&wq->mutex);
+ list_del_rcu(&pwq->pwqs_node);
+ is_last = list_empty(&wq->pwqs);
+ mutex_unlock(&wq->mutex);
+
+ mutex_lock(&wq_pool_mutex);
+ put_unbound_pool(pool);
+ mutex_unlock(&wq_pool_mutex);
+
+ call_rcu_sched(&pwq->rcu, rcu_free_pwq);
+
+ /*
+ * If we're the last pwq going away, @wq is already dead and no one
+ * is gonna access it anymore. Free it.
+ */
+ if (is_last) {
+ free_workqueue_attrs(wq->unbound_attrs);
+ kfree(wq);
+ }
+}
+
+/**
+ * pwq_adjust_max_active - update a pwq's max_active to the current setting
+ * @pwq: target pool_workqueue
+ *
+ * If @pwq isn't freezing, set @pwq->max_active to the associated
+ * workqueue's saved_max_active and activate delayed work items
+ * accordingly. If @pwq is freezing, clear @pwq->max_active to zero.
+ */
+static void pwq_adjust_max_active(struct pool_workqueue *pwq)
+{
+ struct workqueue_struct *wq = pwq->wq;
+ bool freezable = wq->flags & WQ_FREEZABLE;
+
+ /* for @wq->saved_max_active */
+ lockdep_assert_held(&wq->mutex);
+
+ /* fast exit for non-freezable wqs */
+ if (!freezable && pwq->max_active == wq->saved_max_active)
+ return;
+
+ spin_lock_irq(&pwq->pool->lock);
+
+ if (!freezable || !(pwq->pool->flags & POOL_FREEZING)) {
+ pwq->max_active = wq->saved_max_active;
+
+ while (!list_empty(&pwq->delayed_works) &&
+ pwq->nr_active < pwq->max_active)
+ pwq_activate_first_delayed(pwq);
/*
- * Allocate enough room to align pwq and put an extra
- * pointer at the end pointing back to the originally
- * allocated pointer which will be used for free.
+ * Need to kick a worker after thawed or an unbound wq's
+ * max_active is bumped. It's a slow path. Do it always.
*/
- ptr = kzalloc(size + align + sizeof(void *), GFP_KERNEL);
- if (ptr) {
- wq->pool_wq.single = PTR_ALIGN(ptr, align);
- *(void **)(wq->pool_wq.single + 1) = ptr;
+ wake_up_worker(pwq->pool);
+ } else {
+ pwq->max_active = 0;
+ }
+
+ spin_unlock_irq(&pwq->pool->lock);
+}
+
+/* initialize newly alloced @pwq which is associated with @wq and @pool */
+static void init_pwq(struct pool_workqueue *pwq, struct workqueue_struct *wq,
+ struct worker_pool *pool)
+{
+ BUG_ON((unsigned long)pwq & WORK_STRUCT_FLAG_MASK);
+
+ memset(pwq, 0, sizeof(*pwq));
+
+ pwq->pool = pool;
+ pwq->wq = wq;
+ pwq->flush_color = -1;
+ pwq->refcnt = 1;
+ INIT_LIST_HEAD(&pwq->delayed_works);
+ INIT_LIST_HEAD(&pwq->pwqs_node);
+ INIT_LIST_HEAD(&pwq->mayday_node);
+ INIT_WORK(&pwq->unbound_release_work, pwq_unbound_release_workfn);
+}
+
+/* sync @pwq with the current state of its associated wq and link it */
+static void link_pwq(struct pool_workqueue *pwq)
+{
+ struct workqueue_struct *wq = pwq->wq;
+
+ lockdep_assert_held(&wq->mutex);
+
+ /* may be called multiple times, ignore if already linked */
+ if (!list_empty(&pwq->pwqs_node))
+ return;
+
+ /*
+ * Set the matching work_color. This is synchronized with
+ * wq->mutex to avoid confusing flush_workqueue().
+ */
+ pwq->work_color = wq->work_color;
+
+ /* sync max_active to the current setting */
+ pwq_adjust_max_active(pwq);
+
+ /* link in @pwq */
+ list_add_rcu(&pwq->pwqs_node, &wq->pwqs);
+}
+
+/* obtain a pool matching @attr and create a pwq associating the pool and @wq */
+static struct pool_workqueue *alloc_unbound_pwq(struct workqueue_struct *wq,
+ const struct workqueue_attrs *attrs)
+{
+ struct worker_pool *pool;
+ struct pool_workqueue *pwq;
+
+ lockdep_assert_held(&wq_pool_mutex);
+
+ pool = get_unbound_pool(attrs);
+ if (!pool)
+ return NULL;
+
+ pwq = kmem_cache_alloc_node(pwq_cache, GFP_KERNEL, pool->node);
+ if (!pwq) {
+ put_unbound_pool(pool);
+ return NULL;
+ }
+
+ init_pwq(pwq, wq, pool);
+ return pwq;
+}
+
+/* undo alloc_unbound_pwq(), used only in the error path */
+static void free_unbound_pwq(struct pool_workqueue *pwq)
+{
+ lockdep_assert_held(&wq_pool_mutex);
+
+ if (pwq) {
+ put_unbound_pool(pwq->pool);
+ kmem_cache_free(pwq_cache, pwq);
+ }
+}
+
+/**
+ * wq_calc_node_mask - calculate a wq_attrs' cpumask for the specified node
+ * @attrs: the wq_attrs of interest
+ * @node: the target NUMA node
+ * @cpu_going_down: if >= 0, the CPU to consider as offline
+ * @cpumask: outarg, the resulting cpumask
+ *
+ * Calculate the cpumask a workqueue with @attrs should use on @node. If
+ * @cpu_going_down is >= 0, that cpu is considered offline during
+ * calculation. The result is stored in @cpumask. This function returns
+ * %true if the resulting @cpumask is different from @attrs->cpumask,
+ * %false if equal.
+ *
+ * If NUMA affinity is not enabled, @attrs->cpumask is always used. If
+ * enabled and @node has online CPUs requested by @attrs, the returned
+ * cpumask is the intersection of the possible CPUs of @node and
+ * @attrs->cpumask.
+ *
+ * The caller is responsible for ensuring that the cpumask of @node stays
+ * stable.
+ */
+static bool wq_calc_node_cpumask(const struct workqueue_attrs *attrs, int node,
+ int cpu_going_down, cpumask_t *cpumask)
+{
+ if (!wq_numa_enabled || attrs->no_numa)
+ goto use_dfl;
+
+ /* does @node have any online CPUs @attrs wants? */
+ cpumask_and(cpumask, cpumask_of_node(node), attrs->cpumask);
+ if (cpu_going_down >= 0)
+ cpumask_clear_cpu(cpu_going_down, cpumask);
+
+ if (cpumask_empty(cpumask))
+ goto use_dfl;
+
+ /* yeap, return possible CPUs in @node that @attrs wants */
+ cpumask_and(cpumask, attrs->cpumask, wq_numa_possible_cpumask[node]);
+ return !cpumask_equal(cpumask, attrs->cpumask);
+
+use_dfl:
+ cpumask_copy(cpumask, attrs->cpumask);
+ return false;
+}
+
+/* install @pwq into @wq's numa_pwq_tbl[] for @node and return the old pwq */
+static struct pool_workqueue *numa_pwq_tbl_install(struct workqueue_struct *wq,
+ int node,
+ struct pool_workqueue *pwq)
+{
+ struct pool_workqueue *old_pwq;
+
+ lockdep_assert_held(&wq->mutex);
+
+ /* link_pwq() can handle duplicate calls */
+ link_pwq(pwq);
+
+ old_pwq = rcu_access_pointer(wq->numa_pwq_tbl[node]);
+ rcu_assign_pointer(wq->numa_pwq_tbl[node], pwq);
+ return old_pwq;
+}
+
+/**
+ * apply_workqueue_attrs - apply new workqueue_attrs to an unbound workqueue
+ * @wq: the target workqueue
+ * @attrs: the workqueue_attrs to apply, allocated with alloc_workqueue_attrs()
+ *
+ * Apply @attrs to an unbound workqueue @wq. Unless disabled, on NUMA
+ * machines, this function maps a separate pwq to each NUMA node with
+ * possibles CPUs in @attrs->cpumask so that work items are affine to the
+ * NUMA node it was issued on. Older pwqs are released as in-flight work
+ * items finish. Note that a work item which repeatedly requeues itself
+ * back-to-back will stay on its current pwq.
+ *
+ * Performs GFP_KERNEL allocations. Returns 0 on success and -errno on
+ * failure.
+ */
+int apply_workqueue_attrs(struct workqueue_struct *wq,
+ const struct workqueue_attrs *attrs)
+{
+ struct workqueue_attrs *new_attrs, *tmp_attrs;
+ struct pool_workqueue **pwq_tbl, *dfl_pwq;
+ int node, ret;
+
+ /* only unbound workqueues can change attributes */
+ if (WARN_ON(!(wq->flags & WQ_UNBOUND)))
+ return -EINVAL;
+
+ /* creating multiple pwqs breaks ordering guarantee */
+ if (WARN_ON((wq->flags & __WQ_ORDERED) && !list_empty(&wq->pwqs)))
+ return -EINVAL;
+
+ pwq_tbl = kzalloc(wq_numa_tbl_len * sizeof(pwq_tbl[0]), GFP_KERNEL);
+ new_attrs = alloc_workqueue_attrs(GFP_KERNEL);
+ tmp_attrs = alloc_workqueue_attrs(GFP_KERNEL);
+ if (!pwq_tbl || !new_attrs || !tmp_attrs)
+ goto enomem;
+
+ /* make a copy of @attrs and sanitize it */
+ copy_workqueue_attrs(new_attrs, attrs);
+ cpumask_and(new_attrs->cpumask, new_attrs->cpumask, cpu_possible_mask);
+
+ /*
+ * We may create multiple pwqs with differing cpumasks. Make a
+ * copy of @new_attrs which will be modified and used to obtain
+ * pools.
+ */
+ copy_workqueue_attrs(tmp_attrs, new_attrs);
+
+ /*
+ * CPUs should stay stable across pwq creations and installations.
+ * Pin CPUs, determine the target cpumask for each node and create
+ * pwqs accordingly.
+ */
+ get_online_cpus();
+
+ mutex_lock(&wq_pool_mutex);
+
+ /*
+ * If something goes wrong during CPU up/down, we'll fall back to
+ * the default pwq covering whole @attrs->cpumask. Always create
+ * it even if we don't use it immediately.
+ */
+ dfl_pwq = alloc_unbound_pwq(wq, new_attrs);
+ if (!dfl_pwq)
+ goto enomem_pwq;
+
+ for_each_node(node) {
+ if (wq_calc_node_cpumask(attrs, node, -1, tmp_attrs->cpumask)) {
+ pwq_tbl[node] = alloc_unbound_pwq(wq, tmp_attrs);
+ if (!pwq_tbl[node])
+ goto enomem_pwq;
+ } else {
+ dfl_pwq->refcnt++;
+ pwq_tbl[node] = dfl_pwq;
}
}
- /* just in case, make sure it's actually aligned */
- BUG_ON(!IS_ALIGNED(wq->pool_wq.v, align));
- return wq->pool_wq.v ? 0 : -ENOMEM;
+ mutex_unlock(&wq_pool_mutex);
+
+ /* all pwqs have been created successfully, let's install'em */
+ mutex_lock(&wq->mutex);
+
+ copy_workqueue_attrs(wq->unbound_attrs, new_attrs);
+
+ /* save the previous pwq and install the new one */
+ for_each_node(node)
+ pwq_tbl[node] = numa_pwq_tbl_install(wq, node, pwq_tbl[node]);
+
+ /* @dfl_pwq might not have been used, ensure it's linked */
+ link_pwq(dfl_pwq);
+ swap(wq->dfl_pwq, dfl_pwq);
+
+ mutex_unlock(&wq->mutex);
+
+ /* put the old pwqs */
+ for_each_node(node)
+ put_pwq_unlocked(pwq_tbl[node]);
+ put_pwq_unlocked(dfl_pwq);
+
+ put_online_cpus();
+ ret = 0;
+ /* fall through */
+out_free:
+ free_workqueue_attrs(tmp_attrs);
+ free_workqueue_attrs(new_attrs);
+ kfree(pwq_tbl);
+ return ret;
+
+enomem_pwq:
+ free_unbound_pwq(dfl_pwq);
+ for_each_node(node)
+ if (pwq_tbl && pwq_tbl[node] != dfl_pwq)
+ free_unbound_pwq(pwq_tbl[node]);
+ mutex_unlock(&wq_pool_mutex);
+ put_online_cpus();
+enomem:
+ ret = -ENOMEM;
+ goto out_free;
}
-static void free_pwqs(struct workqueue_struct *wq)
+/**
+ * wq_update_unbound_numa - update NUMA affinity of a wq for CPU hot[un]plug
+ * @wq: the target workqueue
+ * @cpu: the CPU coming up or going down
+ * @online: whether @cpu is coming up or going down
+ *
+ * This function is to be called from %CPU_DOWN_PREPARE, %CPU_ONLINE and
+ * %CPU_DOWN_FAILED. @cpu is being hot[un]plugged, update NUMA affinity of
+ * @wq accordingly.
+ *
+ * If NUMA affinity can't be adjusted due to memory allocation failure, it
+ * falls back to @wq->dfl_pwq which may not be optimal but is always
+ * correct.
+ *
+ * Note that when the last allowed CPU of a NUMA node goes offline for a
+ * workqueue with a cpumask spanning multiple nodes, the workers which were
+ * already executing the work items for the workqueue will lose their CPU
+ * affinity and may execute on any CPU. This is similar to how per-cpu
+ * workqueues behave on CPU_DOWN. If a workqueue user wants strict
+ * affinity, it's the user's responsibility to flush the work item from
+ * CPU_DOWN_PREPARE.
+ */
+static void wq_update_unbound_numa(struct workqueue_struct *wq, int cpu,
+ bool online)
{
- if (!(wq->flags & WQ_UNBOUND))
- free_percpu(wq->pool_wq.pcpu);
- else if (wq->pool_wq.single) {
- /* the pointer to free is stored right after the pwq */
- kfree(*(void **)(wq->pool_wq.single + 1));
+ int node = cpu_to_node(cpu);
+ int cpu_off = online ? -1 : cpu;
+ struct pool_workqueue *old_pwq = NULL, *pwq;
+ struct workqueue_attrs *target_attrs;
+ cpumask_t *cpumask;
+
+ lockdep_assert_held(&wq_pool_mutex);
+
+ if (!wq_numa_enabled || !(wq->flags & WQ_UNBOUND))
+ return;
+
+ /*
+ * We don't wanna alloc/free wq_attrs for each wq for each CPU.
+ * Let's use a preallocated one. The following buf is protected by
+ * CPU hotplug exclusion.
+ */
+ target_attrs = wq_update_unbound_numa_attrs_buf;
+ cpumask = target_attrs->cpumask;
+
+ mutex_lock(&wq->mutex);
+ if (wq->unbound_attrs->no_numa)
+ goto out_unlock;
+
+ copy_workqueue_attrs(target_attrs, wq->unbound_attrs);
+ pwq = unbound_pwq_by_node(wq, node);
+
+ /*
+ * Let's determine what needs to be done. If the target cpumask is
+ * different from wq's, we need to compare it to @pwq's and create
+ * a new one if they don't match. If the target cpumask equals
+ * wq's, the default pwq should be used. If @pwq is already the
+ * default one, nothing to do; otherwise, install the default one.
+ */
+ if (wq_calc_node_cpumask(wq->unbound_attrs, node, cpu_off, cpumask)) {
+ if (cpumask_equal(cpumask, pwq->pool->attrs->cpumask))
+ goto out_unlock;
+ } else {
+ if (pwq == wq->dfl_pwq)
+ goto out_unlock;
+ else
+ goto use_dfl_pwq;
+ }
+
+ mutex_unlock(&wq->mutex);
+
+ /* create a new pwq */
+ pwq = alloc_unbound_pwq(wq, target_attrs);
+ if (!pwq) {
+ pr_warning("workqueue: allocation failed while updating NUMA affinity of \"%s\"\n",
+ wq->name);
+ goto out_unlock;
+ }
+
+ /*
+ * Install the new pwq. As this function is called only from CPU
+ * hotplug callbacks and applying a new attrs is wrapped with
+ * get/put_online_cpus(), @wq->unbound_attrs couldn't have changed
+ * inbetween.
+ */
+ mutex_lock(&wq->mutex);
+ old_pwq = numa_pwq_tbl_install(wq, node, pwq);
+ goto out_unlock;
+
+use_dfl_pwq:
+ spin_lock_irq(&wq->dfl_pwq->pool->lock);
+ get_pwq(wq->dfl_pwq);
+ spin_unlock_irq(&wq->dfl_pwq->pool->lock);
+ old_pwq = numa_pwq_tbl_install(wq, node, wq->dfl_pwq);
+out_unlock:
+ mutex_unlock(&wq->mutex);
+ put_pwq_unlocked(old_pwq);
+}
+
+static int alloc_and_link_pwqs(struct workqueue_struct *wq)
+{
+ bool highpri = wq->flags & WQ_HIGHPRI;
+ int cpu;
+
+ if (!(wq->flags & WQ_UNBOUND)) {
+ wq->cpu_pwqs = alloc_percpu(struct pool_workqueue);
+ if (!wq->cpu_pwqs)
+ return -ENOMEM;
+
+ for_each_possible_cpu(cpu) {
+ struct pool_workqueue *pwq =
+ per_cpu_ptr(wq->cpu_pwqs, cpu);
+ struct worker_pool *cpu_pools =
+ per_cpu(cpu_worker_pools, cpu);
+
+ init_pwq(pwq, wq, &cpu_pools[highpri]);
+
+ mutex_lock(&wq->mutex);
+ link_pwq(pwq);
+ mutex_unlock(&wq->mutex);
+ }
+ return 0;
+ } else {
+ return apply_workqueue_attrs(wq, unbound_std_wq_attrs[highpri]);
}
}
@@ -3150,30 +4078,28 @@ struct workqueue_struct *__alloc_workqueue_key(const char *fmt,
struct lock_class_key *key,
const char *lock_name, ...)
{
- va_list args, args1;
+ size_t tbl_size = 0;
+ va_list args;
struct workqueue_struct *wq;
- unsigned int cpu;
- size_t namelen;
+ struct pool_workqueue *pwq;
- /* determine namelen, allocate wq and format name */
- va_start(args, lock_name);
- va_copy(args1, args);
- namelen = vsnprintf(NULL, 0, fmt, args) + 1;
+ /* allocate wq and format name */
+ if (flags & WQ_UNBOUND)
+ tbl_size = wq_numa_tbl_len * sizeof(wq->numa_pwq_tbl[0]);
- wq = kzalloc(sizeof(*wq) + namelen, GFP_KERNEL);
+ wq = kzalloc(sizeof(*wq) + tbl_size, GFP_KERNEL);
if (!wq)
- goto err;
+ return NULL;
- vsnprintf(wq->name, namelen, fmt, args1);
- va_end(args);
- va_end(args1);
+ if (flags & WQ_UNBOUND) {
+ wq->unbound_attrs = alloc_workqueue_attrs(GFP_KERNEL);
+ if (!wq->unbound_attrs)
+ goto err_free_wq;
+ }
- /*
- * Workqueues which may be used during memory reclaim should
- * have a rescuer to guarantee forward progress.
- */
- if (flags & WQ_MEM_RECLAIM)
- flags |= WQ_RESCUER;
+ va_start(args, lock_name);
+ vsnprintf(wq->name, sizeof(wq->name), fmt, args);
+ va_end(args);
max_active = max_active ?: WQ_DFL_ACTIVE;
max_active = wq_clamp_max_active(max_active, flags, wq->name);
@@ -3181,71 +4107,70 @@ struct workqueue_struct *__alloc_workqueue_key(const char *fmt,
/* init wq */
wq->flags = flags;
wq->saved_max_active = max_active;
- mutex_init(&wq->flush_mutex);
+ mutex_init(&wq->mutex);
atomic_set(&wq->nr_pwqs_to_flush, 0);
+ INIT_LIST_HEAD(&wq->pwqs);
INIT_LIST_HEAD(&wq->flusher_queue);
INIT_LIST_HEAD(&wq->flusher_overflow);
+ INIT_LIST_HEAD(&wq->maydays);
lockdep_init_map(&wq->lockdep_map, lock_name, key, 0);
INIT_LIST_HEAD(&wq->list);
- if (alloc_pwqs(wq) < 0)
- goto err;
-
- for_each_pwq_cpu(cpu, wq) {
- struct pool_workqueue *pwq = get_pwq(cpu, wq);
+ if (alloc_and_link_pwqs(wq) < 0)
+ goto err_free_wq;
- BUG_ON((unsigned long)pwq & WORK_STRUCT_FLAG_MASK);
- pwq->pool = get_std_worker_pool(cpu, flags & WQ_HIGHPRI);
- pwq->wq = wq;
- pwq->flush_color = -1;
- pwq->max_active = max_active;
- INIT_LIST_HEAD(&pwq->delayed_works);
- }
-
- if (flags & WQ_RESCUER) {
+ /*
+ * Workqueues which may be used during memory reclaim should
+ * have a rescuer to guarantee forward progress.
+ */
+ if (flags & WQ_MEM_RECLAIM) {
struct worker *rescuer;
- if (!alloc_mayday_mask(&wq->mayday_mask, GFP_KERNEL))
- goto err;
-
- wq->rescuer = rescuer = alloc_worker();
+ rescuer = alloc_worker();
if (!rescuer)
- goto err;
+ goto err_destroy;
rescuer->rescue_wq = wq;
rescuer->task = kthread_create(rescuer_thread, rescuer, "%s",
wq->name);
- if (IS_ERR(rescuer->task))
- goto err;
+ if (IS_ERR(rescuer->task)) {
+ kfree(rescuer);
+ goto err_destroy;
+ }
- rescuer->task->flags |= PF_THREAD_BOUND;
+ wq->rescuer = rescuer;
+ rescuer->task->flags |= PF_NO_SETAFFINITY;
wake_up_process(rescuer->task);
}
+ if ((wq->flags & WQ_SYSFS) && workqueue_sysfs_register(wq))
+ goto err_destroy;
+
/*
- * workqueue_lock protects global freeze state and workqueues
- * list. Grab it, set max_active accordingly and add the new
- * workqueue to workqueues list.
+ * wq_pool_mutex protects global freeze state and workqueues list.
+ * Grab it, adjust max_active and add the new @wq to workqueues
+ * list.
*/
- spin_lock(&workqueue_lock);
+ mutex_lock(&wq_pool_mutex);
- if (workqueue_freezing && wq->flags & WQ_FREEZABLE)
- for_each_pwq_cpu(cpu, wq)
- get_pwq(cpu, wq)->max_active = 0;
+ mutex_lock(&wq->mutex);
+ for_each_pwq(pwq, wq)
+ pwq_adjust_max_active(pwq);
+ mutex_unlock(&wq->mutex);
list_add(&wq->list, &workqueues);
- spin_unlock(&workqueue_lock);
+ mutex_unlock(&wq_pool_mutex);
return wq;
-err:
- if (wq) {
- free_pwqs(wq);
- free_mayday_mask(wq->mayday_mask);
- kfree(wq->rescuer);
- kfree(wq);
- }
+
+err_free_wq:
+ free_workqueue_attrs(wq->unbound_attrs);
+ kfree(wq);
+ return NULL;
+err_destroy:
+ destroy_workqueue(wq);
return NULL;
}
EXPORT_SYMBOL_GPL(__alloc_workqueue_key);
@@ -3258,60 +4183,78 @@ EXPORT_SYMBOL_GPL(__alloc_workqueue_key);
*/
void destroy_workqueue(struct workqueue_struct *wq)
{
- unsigned int cpu;
+ struct pool_workqueue *pwq;
+ int node;
/* drain it before proceeding with destruction */
drain_workqueue(wq);
+ /* sanity checks */
+ mutex_lock(&wq->mutex);
+ for_each_pwq(pwq, wq) {
+ int i;
+
+ for (i = 0; i < WORK_NR_COLORS; i++) {
+ if (WARN_ON(pwq->nr_in_flight[i])) {
+ mutex_unlock(&wq->mutex);
+ return;
+ }
+ }
+
+ if (WARN_ON((pwq != wq->dfl_pwq) && (pwq->refcnt > 1)) ||
+ WARN_ON(pwq->nr_active) ||
+ WARN_ON(!list_empty(&pwq->delayed_works))) {
+ mutex_unlock(&wq->mutex);
+ return;
+ }
+ }
+ mutex_unlock(&wq->mutex);
+
/*
* wq list is used to freeze wq, remove from list after
* flushing is complete in case freeze races us.
*/
- spin_lock(&workqueue_lock);
- list_del(&wq->list);
- spin_unlock(&workqueue_lock);
+ mutex_lock(&wq_pool_mutex);
+ list_del_init(&wq->list);
+ mutex_unlock(&wq_pool_mutex);
- /* sanity check */
- for_each_pwq_cpu(cpu, wq) {
- struct pool_workqueue *pwq = get_pwq(cpu, wq);
- int i;
+ workqueue_sysfs_unregister(wq);
- for (i = 0; i < WORK_NR_COLORS; i++)
- BUG_ON(pwq->nr_in_flight[i]);
- BUG_ON(pwq->nr_active);
- BUG_ON(!list_empty(&pwq->delayed_works));
- }
-
- if (wq->flags & WQ_RESCUER) {
+ if (wq->rescuer) {
kthread_stop(wq->rescuer->task);
- free_mayday_mask(wq->mayday_mask);
kfree(wq->rescuer);
+ wq->rescuer = NULL;
}
- free_pwqs(wq);
- kfree(wq);
-}
-EXPORT_SYMBOL_GPL(destroy_workqueue);
-
-/**
- * pwq_set_max_active - adjust max_active of a pwq
- * @pwq: target pool_workqueue
- * @max_active: new max_active value.
- *
- * Set @pwq->max_active to @max_active and activate delayed works if
- * increased.
- *
- * CONTEXT:
- * spin_lock_irq(pool->lock).
- */
-static void pwq_set_max_active(struct pool_workqueue *pwq, int max_active)
-{
- pwq->max_active = max_active;
+ if (!(wq->flags & WQ_UNBOUND)) {
+ /*
+ * The base ref is never dropped on per-cpu pwqs. Directly
+ * free the pwqs and wq.
+ */
+ free_percpu(wq->cpu_pwqs);
+ kfree(wq);
+ } else {
+ /*
+ * We're the sole accessor of @wq at this point. Directly
+ * access numa_pwq_tbl[] and dfl_pwq to put the base refs.
+ * @wq will be freed when the last pwq is released.
+ */
+ for_each_node(node) {
+ pwq = rcu_access_pointer(wq->numa_pwq_tbl[node]);
+ RCU_INIT_POINTER(wq->numa_pwq_tbl[node], NULL);
+ put_pwq_unlocked(pwq);
+ }
- while (!list_empty(&pwq->delayed_works) &&
- pwq->nr_active < pwq->max_active)
- pwq_activate_first_delayed(pwq);
+ /*
+ * Put dfl_pwq. @wq may be freed any time after dfl_pwq is
+ * put. Don't access it afterwards.
+ */
+ pwq = wq->dfl_pwq;
+ wq->dfl_pwq = NULL;
+ put_pwq_unlocked(pwq);
+ }
}
+EXPORT_SYMBOL_GPL(destroy_workqueue);
/**
* workqueue_set_max_active - adjust max_active of a workqueue
@@ -3325,30 +4268,37 @@ static void pwq_set_max_active(struct pool_workqueue *pwq, int max_active)
*/
void workqueue_set_max_active(struct workqueue_struct *wq, int max_active)
{
- unsigned int cpu;
+ struct pool_workqueue *pwq;
+
+ /* disallow meddling with max_active for ordered workqueues */
+ if (WARN_ON(wq->flags & __WQ_ORDERED))
+ return;
max_active = wq_clamp_max_active(max_active, wq->flags, wq->name);
- spin_lock(&workqueue_lock);
+ mutex_lock(&wq->mutex);
wq->saved_max_active = max_active;
- for_each_pwq_cpu(cpu, wq) {
- struct pool_workqueue *pwq = get_pwq(cpu, wq);
- struct worker_pool *pool = pwq->pool;
-
- spin_lock_irq(&pool->lock);
+ for_each_pwq(pwq, wq)
+ pwq_adjust_max_active(pwq);
- if (!(wq->flags & WQ_FREEZABLE) ||
- !(pool->flags & POOL_FREEZING))
- pwq_set_max_active(pwq, max_active);
+ mutex_unlock(&wq->mutex);
+}
+EXPORT_SYMBOL_GPL(workqueue_set_max_active);
- spin_unlock_irq(&pool->lock);
- }
+/**
+ * current_is_workqueue_rescuer - is %current workqueue rescuer?
+ *
+ * Determine whether %current is a workqueue rescuer. Can be used from
+ * work functions to determine whether it's being run off the rescuer task.
+ */
+bool current_is_workqueue_rescuer(void)
+{
+ struct worker *worker = current_wq_worker();
- spin_unlock(&workqueue_lock);
+ return worker && worker->rescue_wq;
}
-EXPORT_SYMBOL_GPL(workqueue_set_max_active);
/**
* workqueue_congested - test whether a workqueue is congested
@@ -3362,11 +4312,22 @@ EXPORT_SYMBOL_GPL(workqueue_set_max_active);
* RETURNS:
* %true if congested, %false otherwise.
*/
-bool workqueue_congested(unsigned int cpu, struct workqueue_struct *wq)
+bool workqueue_congested(int cpu, struct workqueue_struct *wq)
{
- struct pool_workqueue *pwq = get_pwq(cpu, wq);
+ struct pool_workqueue *pwq;
+ bool ret;
+
+ rcu_read_lock_sched();
+
+ if (!(wq->flags & WQ_UNBOUND))
+ pwq = per_cpu_ptr(wq->cpu_pwqs, cpu);
+ else
+ pwq = unbound_pwq_by_node(wq, cpu_to_node(cpu));
+
+ ret = !list_empty(&pwq->delayed_works);
+ rcu_read_unlock_sched();
- return !list_empty(&pwq->delayed_works);
+ return ret;
}
EXPORT_SYMBOL_GPL(workqueue_congested);
@@ -3383,19 +4344,22 @@ EXPORT_SYMBOL_GPL(workqueue_congested);
*/
unsigned int work_busy(struct work_struct *work)
{
- struct worker_pool *pool = get_work_pool(work);
+ struct worker_pool *pool;
unsigned long flags;
unsigned int ret = 0;
if (work_pending(work))
ret |= WORK_BUSY_PENDING;
+ local_irq_save(flags);
+ pool = get_work_pool(work);
if (pool) {
- spin_lock_irqsave(&pool->lock, flags);
+ spin_lock(&pool->lock);
if (find_worker_executing_work(pool, work))
ret |= WORK_BUSY_RUNNING;
- spin_unlock_irqrestore(&pool->lock, flags);
+ spin_unlock(&pool->lock);
}
+ local_irq_restore(flags);
return ret;
}
@@ -3421,53 +4385,153 @@ static void wq_unbind_fn(struct work_struct *work)
int cpu = smp_processor_id();
struct worker_pool *pool;
struct worker *worker;
- int i;
+ int wi;
- for_each_std_worker_pool(pool, cpu) {
- BUG_ON(cpu != smp_processor_id());
+ for_each_cpu_worker_pool(pool, cpu) {
+ WARN_ON_ONCE(cpu != smp_processor_id());
- mutex_lock(&pool->assoc_mutex);
+ mutex_lock(&pool->manager_mutex);
spin_lock_irq(&pool->lock);
/*
- * We've claimed all manager positions. Make all workers
+ * We've blocked all manager operations. Make all workers
* unbound and set DISASSOCIATED. Before this, all workers
* except for the ones which are still executing works from
* before the last CPU down must be on the cpu. After
* this, they may become diasporas.
*/
- list_for_each_entry(worker, &pool->idle_list, entry)
- worker->flags |= WORKER_UNBOUND;
-
- for_each_busy_worker(worker, i, pool)
+ for_each_pool_worker(worker, wi, pool)
worker->flags |= WORKER_UNBOUND;
pool->flags |= POOL_DISASSOCIATED;
spin_unlock_irq(&pool->lock);
- mutex_unlock(&pool->assoc_mutex);
+ mutex_unlock(&pool->manager_mutex);
+
+ /*
+ * Call schedule() so that we cross rq->lock and thus can
+ * guarantee sched callbacks see the %WORKER_UNBOUND flag.
+ * This is necessary as scheduler callbacks may be invoked
+ * from other cpus.
+ */
+ schedule();
+
+ /*
+ * Sched callbacks are disabled now. Zap nr_running.
+ * After this, nr_running stays zero and need_more_worker()
+ * and keep_working() are always true as long as the
+ * worklist is not empty. This pool now behaves as an
+ * unbound (in terms of concurrency management) pool which
+ * are served by workers tied to the pool.
+ */
+ atomic_set(&pool->nr_running, 0);
+
+ /*
+ * With concurrency management just turned off, a busy
+ * worker blocking could lead to lengthy stalls. Kick off
+ * unbound chain execution of currently pending work items.
+ */
+ spin_lock_irq(&pool->lock);
+ wake_up_worker(pool);
+ spin_unlock_irq(&pool->lock);
}
+}
- /*
- * Call schedule() so that we cross rq->lock and thus can guarantee
- * sched callbacks see the %WORKER_UNBOUND flag. This is necessary
- * as scheduler callbacks may be invoked from other cpus.
- */
- schedule();
+/**
+ * rebind_workers - rebind all workers of a pool to the associated CPU
+ * @pool: pool of interest
+ *
+ * @pool->cpu is coming online. Rebind all workers to the CPU.
+ */
+static void rebind_workers(struct worker_pool *pool)
+{
+ struct worker *worker;
+ int wi;
+
+ lockdep_assert_held(&pool->manager_mutex);
/*
- * Sched callbacks are disabled now. Zap nr_running. After this,
- * nr_running stays zero and need_more_worker() and keep_working()
- * are always true as long as the worklist is not empty. Pools on
- * @cpu now behave as unbound (in terms of concurrency management)
- * pools which are served by workers tied to the CPU.
- *
- * On return from this function, the current worker would trigger
- * unbound chain execution of pending work items if other workers
- * didn't already.
+ * Restore CPU affinity of all workers. As all idle workers should
+ * be on the run-queue of the associated CPU before any local
+ * wake-ups for concurrency management happen, restore CPU affinty
+ * of all workers first and then clear UNBOUND. As we're called
+ * from CPU_ONLINE, the following shouldn't fail.
*/
- for_each_std_worker_pool(pool, cpu)
- atomic_set(&pool->nr_running, 0);
+ for_each_pool_worker(worker, wi, pool)
+ WARN_ON_ONCE(set_cpus_allowed_ptr(worker->task,
+ pool->attrs->cpumask) < 0);
+
+ spin_lock_irq(&pool->lock);
+
+ for_each_pool_worker(worker, wi, pool) {
+ unsigned int worker_flags = worker->flags;
+
+ /*
+ * A bound idle worker should actually be on the runqueue
+ * of the associated CPU for local wake-ups targeting it to
+ * work. Kick all idle workers so that they migrate to the
+ * associated CPU. Doing this in the same loop as
+ * replacing UNBOUND with REBOUND is safe as no worker will
+ * be bound before @pool->lock is released.
+ */
+ if (worker_flags & WORKER_IDLE)
+ wake_up_process(worker->task);
+
+ /*
+ * We want to clear UNBOUND but can't directly call
+ * worker_clr_flags() or adjust nr_running. Atomically
+ * replace UNBOUND with another NOT_RUNNING flag REBOUND.
+ * @worker will clear REBOUND using worker_clr_flags() when
+ * it initiates the next execution cycle thus restoring
+ * concurrency management. Note that when or whether
+ * @worker clears REBOUND doesn't affect correctness.
+ *
+ * ACCESS_ONCE() is necessary because @worker->flags may be
+ * tested without holding any lock in
+ * wq_worker_waking_up(). Without it, NOT_RUNNING test may
+ * fail incorrectly leading to premature concurrency
+ * management operations.
+ */
+ WARN_ON_ONCE(!(worker_flags & WORKER_UNBOUND));
+ worker_flags |= WORKER_REBOUND;
+ worker_flags &= ~WORKER_UNBOUND;
+ ACCESS_ONCE(worker->flags) = worker_flags;
+ }
+
+ spin_unlock_irq(&pool->lock);
+}
+
+/**
+ * restore_unbound_workers_cpumask - restore cpumask of unbound workers
+ * @pool: unbound pool of interest
+ * @cpu: the CPU which is coming up
+ *
+ * An unbound pool may end up with a cpumask which doesn't have any online
+ * CPUs. When a worker of such pool get scheduled, the scheduler resets
+ * its cpus_allowed. If @cpu is in @pool's cpumask which didn't have any
+ * online CPU before, cpus_allowed of all its workers should be restored.
+ */
+static void restore_unbound_workers_cpumask(struct worker_pool *pool, int cpu)
+{
+ static cpumask_t cpumask;
+ struct worker *worker;
+ int wi;
+
+ lockdep_assert_held(&pool->manager_mutex);
+
+ /* is @cpu allowed for @pool? */
+ if (!cpumask_test_cpu(cpu, pool->attrs->cpumask))
+ return;
+
+ /* is @cpu the only online CPU? */
+ cpumask_and(&cpumask, pool->attrs->cpumask, cpu_online_mask);
+ if (cpumask_weight(&cpumask) != 1)
+ return;
+
+ /* as we're called from CPU_ONLINE, the following shouldn't fail */
+ for_each_pool_worker(worker, wi, pool)
+ WARN_ON_ONCE(set_cpus_allowed_ptr(worker->task,
+ pool->attrs->cpumask) < 0);
}
/*
@@ -3478,39 +4542,46 @@ static int __cpuinit workqueue_cpu_up_callback(struct notifier_block *nfb,
unsigned long action,
void *hcpu)
{
- unsigned int cpu = (unsigned long)hcpu;
+ int cpu = (unsigned long)hcpu;
struct worker_pool *pool;
+ struct workqueue_struct *wq;
+ int pi;
switch (action & ~CPU_TASKS_FROZEN) {
case CPU_UP_PREPARE:
- for_each_std_worker_pool(pool, cpu) {
- struct worker *worker;
-
+ for_each_cpu_worker_pool(pool, cpu) {
if (pool->nr_workers)
continue;
-
- worker = create_worker(pool);
- if (!worker)
+ if (create_and_start_worker(pool) < 0)
return NOTIFY_BAD;
-
- spin_lock_irq(&pool->lock);
- start_worker(worker);
- spin_unlock_irq(&pool->lock);
}
break;
case CPU_DOWN_FAILED:
case CPU_ONLINE:
- for_each_std_worker_pool(pool, cpu) {
- mutex_lock(&pool->assoc_mutex);
- spin_lock_irq(&pool->lock);
+ mutex_lock(&wq_pool_mutex);
- pool->flags &= ~POOL_DISASSOCIATED;
- rebind_workers(pool);
+ for_each_pool(pool, pi) {
+ mutex_lock(&pool->manager_mutex);
+
+ if (pool->cpu == cpu) {
+ spin_lock_irq(&pool->lock);
+ pool->flags &= ~POOL_DISASSOCIATED;
+ spin_unlock_irq(&pool->lock);
+
+ rebind_workers(pool);
+ } else if (pool->cpu < 0) {
+ restore_unbound_workers_cpumask(pool, cpu);
+ }
- spin_unlock_irq(&pool->lock);
- mutex_unlock(&pool->assoc_mutex);
+ mutex_unlock(&pool->manager_mutex);
}
+
+ /* update NUMA affinity of unbound workqueues */
+ list_for_each_entry(wq, &workqueues, list)
+ wq_update_unbound_numa(wq, cpu, true);
+
+ mutex_unlock(&wq_pool_mutex);
break;
}
return NOTIFY_OK;
@@ -3524,14 +4595,23 @@ static int __cpuinit workqueue_cpu_down_callback(struct notifier_block *nfb,
unsigned long action,
void *hcpu)
{
- unsigned int cpu = (unsigned long)hcpu;
+ int cpu = (unsigned long)hcpu;
struct work_struct unbind_work;
+ struct workqueue_struct *wq;
switch (action & ~CPU_TASKS_FROZEN) {
case CPU_DOWN_PREPARE:
- /* unbinding should happen on the local CPU */
+ /* unbinding per-cpu workers should happen on the local CPU */
INIT_WORK_ONSTACK(&unbind_work, wq_unbind_fn);
queue_work_on(cpu, system_highpri_wq, &unbind_work);
+
+ /* update NUMA affinity of unbound workqueues */
+ mutex_lock(&wq_pool_mutex);
+ list_for_each_entry(wq, &workqueues, list)
+ wq_update_unbound_numa(wq, cpu, false);
+ mutex_unlock(&wq_pool_mutex);
+
+ /* wait for per-cpu unbinding to finish */
flush_work(&unbind_work);
break;
}
@@ -3564,7 +4644,7 @@ static void work_for_cpu_fn(struct work_struct *work)
* It is up to the caller to ensure that the cpu doesn't go offline.
* The caller must not hold any locks which would prevent @fn from completing.
*/
-long work_on_cpu(unsigned int cpu, long (*fn)(void *), void *arg)
+long work_on_cpu(int cpu, long (*fn)(void *), void *arg)
{
struct work_for_cpu wfc = { .fn = fn, .arg = arg };
@@ -3582,44 +4662,40 @@ EXPORT_SYMBOL_GPL(work_on_cpu);
* freeze_workqueues_begin - begin freezing workqueues
*
* Start freezing workqueues. After this function returns, all freezable
- * workqueues will queue new works to their frozen_works list instead of
+ * workqueues will queue new works to their delayed_works list instead of
* pool->worklist.
*
* CONTEXT:
- * Grabs and releases workqueue_lock and pool->lock's.
+ * Grabs and releases wq_pool_mutex, wq->mutex and pool->lock's.
*/
void freeze_workqueues_begin(void)
{
- unsigned int cpu;
+ struct worker_pool *pool;
+ struct workqueue_struct *wq;
+ struct pool_workqueue *pwq;
+ int pi;
- spin_lock(&workqueue_lock);
+ mutex_lock(&wq_pool_mutex);
- BUG_ON(workqueue_freezing);
+ WARN_ON_ONCE(workqueue_freezing);
workqueue_freezing = true;
- for_each_wq_cpu(cpu) {
- struct worker_pool *pool;
- struct workqueue_struct *wq;
-
- for_each_std_worker_pool(pool, cpu) {
- spin_lock_irq(&pool->lock);
-
- WARN_ON_ONCE(pool->flags & POOL_FREEZING);
- pool->flags |= POOL_FREEZING;
-
- list_for_each_entry(wq, &workqueues, list) {
- struct pool_workqueue *pwq = get_pwq(cpu, wq);
-
- if (pwq && pwq->pool == pool &&
- (wq->flags & WQ_FREEZABLE))
- pwq->max_active = 0;
- }
+ /* set FREEZING */
+ for_each_pool(pool, pi) {
+ spin_lock_irq(&pool->lock);
+ WARN_ON_ONCE(pool->flags & POOL_FREEZING);
+ pool->flags |= POOL_FREEZING;
+ spin_unlock_irq(&pool->lock);
+ }
- spin_unlock_irq(&pool->lock);
- }
+ list_for_each_entry(wq, &workqueues, list) {
+ mutex_lock(&wq->mutex);
+ for_each_pwq(pwq, wq)
+ pwq_adjust_max_active(pwq);
+ mutex_unlock(&wq->mutex);
}
- spin_unlock(&workqueue_lock);
+ mutex_unlock(&wq_pool_mutex);
}
/**
@@ -3629,7 +4705,7 @@ void freeze_workqueues_begin(void)
* between freeze_workqueues_begin() and thaw_workqueues().
*
* CONTEXT:
- * Grabs and releases workqueue_lock.
+ * Grabs and releases wq_pool_mutex.
*
* RETURNS:
* %true if some freezable workqueues are still busy. %false if freezing
@@ -3637,34 +4713,34 @@ void freeze_workqueues_begin(void)
*/
bool freeze_workqueues_busy(void)
{
- unsigned int cpu;
bool busy = false;
+ struct workqueue_struct *wq;
+ struct pool_workqueue *pwq;
- spin_lock(&workqueue_lock);
+ mutex_lock(&wq_pool_mutex);
- BUG_ON(!workqueue_freezing);
+ WARN_ON_ONCE(!workqueue_freezing);
- for_each_wq_cpu(cpu) {
- struct workqueue_struct *wq;
+ list_for_each_entry(wq, &workqueues, list) {
+ if (!(wq->flags & WQ_FREEZABLE))
+ continue;
/*
* nr_active is monotonically decreasing. It's safe
* to peek without lock.
*/
- list_for_each_entry(wq, &workqueues, list) {
- struct pool_workqueue *pwq = get_pwq(cpu, wq);
-
- if (!pwq || !(wq->flags & WQ_FREEZABLE))
- continue;
-
- BUG_ON(pwq->nr_active < 0);
+ rcu_read_lock_sched();
+ for_each_pwq(pwq, wq) {
+ WARN_ON_ONCE(pwq->nr_active < 0);
if (pwq->nr_active) {
busy = true;
+ rcu_read_unlock_sched();
goto out_unlock;
}
}
+ rcu_read_unlock_sched();
}
out_unlock:
- spin_unlock(&workqueue_lock);
+ mutex_unlock(&wq_pool_mutex);
return busy;
}
@@ -3675,104 +4751,141 @@ out_unlock:
* frozen works are transferred to their respective pool worklists.
*
* CONTEXT:
- * Grabs and releases workqueue_lock and pool->lock's.
+ * Grabs and releases wq_pool_mutex, wq->mutex and pool->lock's.
*/
void thaw_workqueues(void)
{
- unsigned int cpu;
+ struct workqueue_struct *wq;
+ struct pool_workqueue *pwq;
+ struct worker_pool *pool;
+ int pi;
- spin_lock(&workqueue_lock);
+ mutex_lock(&wq_pool_mutex);
if (!workqueue_freezing)
goto out_unlock;
- for_each_wq_cpu(cpu) {
- struct worker_pool *pool;
- struct workqueue_struct *wq;
+ /* clear FREEZING */
+ for_each_pool(pool, pi) {
+ spin_lock_irq(&pool->lock);
+ WARN_ON_ONCE(!(pool->flags & POOL_FREEZING));
+ pool->flags &= ~POOL_FREEZING;
+ spin_unlock_irq(&pool->lock);
+ }
- for_each_std_worker_pool(pool, cpu) {
- spin_lock_irq(&pool->lock);
+ /* restore max_active and repopulate worklist */
+ list_for_each_entry(wq, &workqueues, list) {
+ mutex_lock(&wq->mutex);
+ for_each_pwq(pwq, wq)
+ pwq_adjust_max_active(pwq);
+ mutex_unlock(&wq->mutex);
+ }
+
+ workqueue_freezing = false;
+out_unlock:
+ mutex_unlock(&wq_pool_mutex);
+}
+#endif /* CONFIG_FREEZER */
- WARN_ON_ONCE(!(pool->flags & POOL_FREEZING));
- pool->flags &= ~POOL_FREEZING;
+static void __init wq_numa_init(void)
+{
+ cpumask_var_t *tbl;
+ int node, cpu;
- list_for_each_entry(wq, &workqueues, list) {
- struct pool_workqueue *pwq = get_pwq(cpu, wq);
+ /* determine NUMA pwq table len - highest node id + 1 */
+ for_each_node(node)
+ wq_numa_tbl_len = max(wq_numa_tbl_len, node + 1);
- if (!pwq || pwq->pool != pool ||
- !(wq->flags & WQ_FREEZABLE))
- continue;
+ if (num_possible_nodes() <= 1)
+ return;
- /* restore max_active and repopulate worklist */
- pwq_set_max_active(pwq, wq->saved_max_active);
- }
+ if (wq_disable_numa) {
+ pr_info("workqueue: NUMA affinity support disabled\n");
+ return;
+ }
- wake_up_worker(pool);
+ wq_update_unbound_numa_attrs_buf = alloc_workqueue_attrs(GFP_KERNEL);
+ BUG_ON(!wq_update_unbound_numa_attrs_buf);
- spin_unlock_irq(&pool->lock);
+ /*
+ * We want masks of possible CPUs of each node which isn't readily
+ * available. Build one from cpu_to_node() which should have been
+ * fully initialized by now.
+ */
+ tbl = kzalloc(wq_numa_tbl_len * sizeof(tbl[0]), GFP_KERNEL);
+ BUG_ON(!tbl);
+
+ for_each_node(node)
+ BUG_ON(!alloc_cpumask_var_node(&tbl[node], GFP_KERNEL, node));
+
+ for_each_possible_cpu(cpu) {
+ node = cpu_to_node(cpu);
+ if (WARN_ON(node == NUMA_NO_NODE)) {
+ pr_warn("workqueue: NUMA node mapping not available for cpu%d, disabling NUMA support\n", cpu);
+ /* happens iff arch is bonkers, let's just proceed */
+ return;
}
+ cpumask_set_cpu(cpu, tbl[node]);
}
- workqueue_freezing = false;
-out_unlock:
- spin_unlock(&workqueue_lock);
+ wq_numa_possible_cpumask = tbl;
+ wq_numa_enabled = true;
}
-#endif /* CONFIG_FREEZER */
static int __init init_workqueues(void)
{
- unsigned int cpu;
+ int std_nice[NR_STD_WORKER_POOLS] = { 0, HIGHPRI_NICE_LEVEL };
+ int i, cpu;
/* make sure we have enough bits for OFFQ pool ID */
BUILD_BUG_ON((1LU << (BITS_PER_LONG - WORK_OFFQ_POOL_SHIFT)) <
WORK_CPU_END * NR_STD_WORKER_POOLS);
+ WARN_ON(__alignof__(struct pool_workqueue) < __alignof__(long long));
+
+ pwq_cache = KMEM_CACHE(pool_workqueue, SLAB_PANIC);
+
cpu_notifier(workqueue_cpu_up_callback, CPU_PRI_WORKQUEUE_UP);
hotcpu_notifier(workqueue_cpu_down_callback, CPU_PRI_WORKQUEUE_DOWN);
+ wq_numa_init();
+
/* initialize CPU pools */
- for_each_wq_cpu(cpu) {
+ for_each_possible_cpu(cpu) {
struct worker_pool *pool;
- for_each_std_worker_pool(pool, cpu) {
- spin_lock_init(&pool->lock);
+ i = 0;
+ for_each_cpu_worker_pool(pool, cpu) {
+ BUG_ON(init_worker_pool(pool));
pool->cpu = cpu;
- pool->flags |= POOL_DISASSOCIATED;
- INIT_LIST_HEAD(&pool->worklist);
- INIT_LIST_HEAD(&pool->idle_list);
- hash_init(pool->busy_hash);
-
- init_timer_deferrable(&pool->idle_timer);
- pool->idle_timer.function = idle_worker_timeout;
- pool->idle_timer.data = (unsigned long)pool;
-
- setup_timer(&pool->mayday_timer, pool_mayday_timeout,
- (unsigned long)pool);
-
- mutex_init(&pool->assoc_mutex);
- ida_init(&pool->worker_ida);
+ cpumask_copy(pool->attrs->cpumask, cpumask_of(cpu));
+ pool->attrs->nice = std_nice[i++];
+ pool->node = cpu_to_node(cpu);
/* alloc pool ID */
+ mutex_lock(&wq_pool_mutex);
BUG_ON(worker_pool_assign_id(pool));
+ mutex_unlock(&wq_pool_mutex);
}
}
/* create the initial worker */
- for_each_online_wq_cpu(cpu) {
+ for_each_online_cpu(cpu) {
struct worker_pool *pool;
- for_each_std_worker_pool(pool, cpu) {
- struct worker *worker;
+ for_each_cpu_worker_pool(pool, cpu) {
+ pool->flags &= ~POOL_DISASSOCIATED;
+ BUG_ON(create_and_start_worker(pool) < 0);
+ }
+ }
- if (cpu != WORK_CPU_UNBOUND)
- pool->flags &= ~POOL_DISASSOCIATED;
+ /* create default unbound wq attrs */
+ for (i = 0; i < NR_STD_WORKER_POOLS; i++) {
+ struct workqueue_attrs *attrs;
- worker = create_worker(pool);
- BUG_ON(!worker);
- spin_lock_irq(&pool->lock);
- start_worker(worker);
- spin_unlock_irq(&pool->lock);
- }
+ BUG_ON(!(attrs = alloc_workqueue_attrs(GFP_KERNEL)));
+ attrs->nice = std_nice[i];
+ unbound_std_wq_attrs[i] = attrs;
}
system_wq = alloc_workqueue("events", 0, 0);
diff --git a/kernel/workqueue_internal.h b/kernel/workqueue_internal.h
index 07650264ec15..84ab6e1dc6fb 100644
--- a/kernel/workqueue_internal.h
+++ b/kernel/workqueue_internal.h
@@ -32,14 +32,12 @@ struct worker {
struct list_head scheduled; /* L: scheduled works */
struct task_struct *task; /* I: worker task */
struct worker_pool *pool; /* I: the associated pool */
+ /* L: for rescuers */
/* 64 bytes boundary on 64bit, 32 on 32bit */
unsigned long last_active; /* L: last active timestamp */
unsigned int flags; /* X: flags */
int id; /* I: worker id */
- /* for rebinding worker to CPU */
- struct work_struct rebind_work; /* L: for busy worker */
-
/* used only by rescuers to point to the target workqueue */
struct workqueue_struct *rescue_wq; /* I: the workqueue to rescue */
};
@@ -58,8 +56,7 @@ static inline struct worker *current_wq_worker(void)
* Scheduler hooks for concurrency managed workqueue. Only to be used from
* sched.c and workqueue.c.
*/
-void wq_worker_waking_up(struct task_struct *task, unsigned int cpu);
-struct task_struct *wq_worker_sleeping(struct task_struct *task,
- unsigned int cpu);
+void wq_worker_waking_up(struct task_struct *task, int cpu);
+struct task_struct *wq_worker_sleeping(struct task_struct *task, int cpu);
#endif /* _KERNEL_WORKQUEUE_INTERNAL_H */