diff options
Diffstat (limited to 'kernel')
67 files changed, 6969 insertions, 2610 deletions
diff --git a/kernel/.gitignore b/kernel/.gitignore index ab4f1090f437..b3097bde4e9c 100644 --- a/kernel/.gitignore +++ b/kernel/.gitignore @@ -4,3 +4,4 @@ config_data.h config_data.gz timeconst.h +hz.bc diff --git a/kernel/async.c b/kernel/async.c index 8ddee2c3e5b0..61f023ce0228 100644 --- a/kernel/async.c +++ b/kernel/async.c @@ -73,7 +73,7 @@ struct async_entry { struct list_head global_list; struct work_struct work; async_cookie_t cookie; - async_func_ptr *func; + async_func_t func; void *data; struct async_domain *domain; }; @@ -84,24 +84,20 @@ static atomic_t entry_count; static async_cookie_t lowest_in_progress(struct async_domain *domain) { - struct async_entry *first = NULL; + struct list_head *pending; async_cookie_t ret = ASYNC_COOKIE_MAX; unsigned long flags; spin_lock_irqsave(&async_lock, flags); - if (domain) { - if (!list_empty(&domain->pending)) - first = list_first_entry(&domain->pending, - struct async_entry, domain_list); - } else { - if (!list_empty(&async_global_pending)) - first = list_first_entry(&async_global_pending, - struct async_entry, global_list); - } + if (domain) + pending = &domain->pending; + else + pending = &async_global_pending; - if (first) - ret = first->cookie; + if (!list_empty(pending)) + ret = list_first_entry(pending, struct async_entry, + domain_list)->cookie; spin_unlock_irqrestore(&async_lock, flags); return ret; @@ -149,7 +145,7 @@ static void async_run_entry_fn(struct work_struct *work) wake_up(&async_done); } -static async_cookie_t __async_schedule(async_func_ptr *ptr, void *data, struct async_domain *domain) +static async_cookie_t __async_schedule(async_func_t func, void *data, struct async_domain *domain) { struct async_entry *entry; unsigned long flags; @@ -169,13 +165,13 @@ static async_cookie_t __async_schedule(async_func_ptr *ptr, void *data, struct a spin_unlock_irqrestore(&async_lock, flags); /* low on memory.. run synchronously */ - ptr(data, newcookie); + func(data, newcookie); return newcookie; } INIT_LIST_HEAD(&entry->domain_list); INIT_LIST_HEAD(&entry->global_list); INIT_WORK(&entry->work, async_run_entry_fn); - entry->func = ptr; + entry->func = func; entry->data = data; entry->domain = domain; @@ -202,21 +198,21 @@ static async_cookie_t __async_schedule(async_func_ptr *ptr, void *data, struct a /** * async_schedule - schedule a function for asynchronous execution - * @ptr: function to execute asynchronously + * @func: function to execute asynchronously * @data: data pointer to pass to the function * * Returns an async_cookie_t that may be used for checkpointing later. * Note: This function may be called from atomic or non-atomic contexts. */ -async_cookie_t async_schedule(async_func_ptr *ptr, void *data) +async_cookie_t async_schedule(async_func_t func, void *data) { - return __async_schedule(ptr, data, &async_dfl_domain); + return __async_schedule(func, data, &async_dfl_domain); } EXPORT_SYMBOL_GPL(async_schedule); /** * async_schedule_domain - schedule a function for asynchronous execution within a certain domain - * @ptr: function to execute asynchronously + * @func: function to execute asynchronously * @data: data pointer to pass to the function * @domain: the domain * @@ -226,10 +222,10 @@ EXPORT_SYMBOL_GPL(async_schedule); * synchronization domain is specified via @domain. Note: This function * may be called from atomic or non-atomic contexts. */ -async_cookie_t async_schedule_domain(async_func_ptr *ptr, void *data, +async_cookie_t async_schedule_domain(async_func_t func, void *data, struct async_domain *domain) { - return __async_schedule(ptr, data, domain); + return __async_schedule(func, data, domain); } EXPORT_SYMBOL_GPL(async_schedule_domain); diff --git a/kernel/audit.c b/kernel/audit.c index d596e5355f15..9816a1b96cfc 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -660,14 +660,14 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) /* As soon as there's any sign of userspace auditd, * start kauditd to talk to it */ - if (!kauditd_task) + if (!kauditd_task) { kauditd_task = kthread_run(kauditd_thread, NULL, "kauditd"); - if (IS_ERR(kauditd_task)) { - err = PTR_ERR(kauditd_task); - kauditd_task = NULL; - return err; + if (IS_ERR(kauditd_task)) { + err = PTR_ERR(kauditd_task); + kauditd_task = NULL; + return err; + } } - loginuid = audit_get_loginuid(current); sessionid = audit_get_sessionid(current); security_task_getsecid(current, &sid); diff --git a/kernel/audit.h b/kernel/audit.h index d51cba868e1b..11468d99dad0 100644 --- a/kernel/audit.h +++ b/kernel/audit.h @@ -59,10 +59,7 @@ struct audit_entry { struct audit_krule rule; }; -#ifdef CONFIG_AUDIT -extern int audit_enabled; extern int audit_ever_enabled; -#endif extern int audit_pid; diff --git a/kernel/audit_tree.c b/kernel/audit_tree.c index 642a89c4f3d6..a291aa23fb3f 100644 --- a/kernel/audit_tree.c +++ b/kernel/audit_tree.c @@ -617,9 +617,9 @@ void audit_trim_trees(void) } spin_unlock(&hash_lock); trim_marked(tree); - put_tree(tree); drop_collected_mounts(root_mnt); skip_it: + put_tree(tree); mutex_lock(&audit_filter_mutex); } list_del(&cursor); diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c index f9fc54bbe06f..267436826c3b 100644 --- a/kernel/auditfilter.c +++ b/kernel/auditfilter.c @@ -594,6 +594,10 @@ exit_nofree: return entry; exit_free: + if (entry->rule.watch) + audit_put_watch(entry->rule.watch); /* matches initial get */ + if (entry->rule.tree) + audit_put_tree(entry->rule.tree); /* that's the temporary one */ audit_free_rule(entry); return ERR_PTR(err); } diff --git a/kernel/auditsc.c b/kernel/auditsc.c index a371f857a0a9..c68229411a7c 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -1034,21 +1034,15 @@ static inline void audit_free_aux(struct audit_context *context) } } -static inline void audit_zero_context(struct audit_context *context, - enum audit_state state) -{ - memset(context, 0, sizeof(*context)); - context->state = state; - context->prio = state == AUDIT_RECORD_CONTEXT ? ~0ULL : 0; -} - static inline struct audit_context *audit_alloc_context(enum audit_state state) { struct audit_context *context; - if (!(context = kmalloc(sizeof(*context), GFP_KERNEL))) + context = kzalloc(sizeof(*context), GFP_KERNEL); + if (!context) return NULL; - audit_zero_context(context, state); + context->state = state; + context->prio = state == AUDIT_RECORD_CONTEXT ? ~0ULL : 0; INIT_LIST_HEAD(&context->killed_trees); INIT_LIST_HEAD(&context->names_list); return context; diff --git a/kernel/capability.c b/kernel/capability.c index 493d97259484..f6c2ce5701e1 100644 --- a/kernel/capability.c +++ b/kernel/capability.c @@ -393,6 +393,30 @@ bool ns_capable(struct user_namespace *ns, int cap) EXPORT_SYMBOL(ns_capable); /** + * file_ns_capable - Determine if the file's opener had a capability in effect + * @file: The file we want to check + * @ns: The usernamespace we want the capability in + * @cap: The capability to be tested for + * + * Return true if task that opened the file had a capability in effect + * when the file was opened. + * + * This does not set PF_SUPERPRIV because the caller may not + * actually be privileged. + */ +bool file_ns_capable(const struct file *file, struct user_namespace *ns, int cap) +{ + if (WARN_ON_ONCE(!cap_valid(cap))) + return false; + + if (security_capable(file->f_cred, ns, cap) == 0) + return true; + + return false; +} +EXPORT_SYMBOL(file_ns_capable); + +/** * capable - Determine if the current task has a superior capability in effect * @cap: The capability to be tested for * diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 3f14a1310d23..eeb7e49946b2 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -2137,11 +2137,11 @@ retry_find_task: tsk = tsk->group_leader; /* - * Workqueue threads may acquire PF_THREAD_BOUND and become + * Workqueue threads may acquire PF_NO_SETAFFINITY and become * trapped in a cpuset, or RT worker may be born in a cgroup * with no rt_runtime allocated. Just say no. */ - if (tsk == kthreadd_task || (tsk->flags & PF_THREAD_BOUND)) { + if (tsk == kthreadd_task || (tsk->flags & PF_NO_SETAFFINITY)) { ret = -EINVAL; rcu_read_unlock(); goto out_unlock_cgroup; @@ -5290,55 +5290,6 @@ struct cgroup_subsys_state *css_lookup(struct cgroup_subsys *ss, int id) } EXPORT_SYMBOL_GPL(css_lookup); -/** - * css_get_next - lookup next cgroup under specified hierarchy. - * @ss: pointer to subsystem - * @id: current position of iteration. - * @root: pointer to css. search tree under this. - * @foundid: position of found object. - * - * Search next css under the specified hierarchy of rootid. Calling under - * rcu_read_lock() is necessary. Returns NULL if it reaches the end. - */ -struct cgroup_subsys_state * -css_get_next(struct cgroup_subsys *ss, int id, - struct cgroup_subsys_state *root, int *foundid) -{ - struct cgroup_subsys_state *ret = NULL; - struct css_id *tmp; - int tmpid; - int rootid = css_id(root); - int depth = css_depth(root); - - if (!rootid) - return NULL; - - BUG_ON(!ss->use_id); - WARN_ON_ONCE(!rcu_read_lock_held()); - - /* fill start point for scan */ - tmpid = id; - while (1) { - /* - * scan next entry from bitmap(tree), tmpid is updated after - * idr_get_next(). - */ - tmp = idr_get_next(&ss->idr, &tmpid); - if (!tmp) - break; - if (tmp->depth >= depth && tmp->stack[depth] == rootid) { - ret = rcu_dereference(tmp->css); - if (ret) { - *foundid = tmpid; - break; - } - } - /* continue to scan from next id */ - tmpid = tmpid + 1; - } - return ret; -} - /* * get corresponding css from file open on cgroupfs directory */ diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 9bd30b9c430c..12331120767c 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -1378,16 +1378,16 @@ static int cpuset_can_attach(struct cgroup *cgrp, struct cgroup_taskset *tset) cgroup_taskset_for_each(task, cgrp, tset) { /* - * Kthreads bound to specific cpus cannot be moved to a new - * cpuset; we cannot change their cpu affinity and - * isolating such threads by their set of allowed nodes is - * unnecessary. Thus, cpusets are not applicable for such - * threads. This prevents checking for success of - * set_cpus_allowed_ptr() on all attached tasks before - * cpus_allowed may be changed. + * Kthreads which disallow setaffinity shouldn't be moved + * to a new cpuset; we don't want to change their cpu + * affinity and isolating such threads by their set of + * allowed nodes is unnecessary. Thus, cpusets are not + * applicable for such threads. This prevents checking for + * success of set_cpus_allowed_ptr() on all attached tasks + * before cpus_allowed may be changed. */ ret = -EINVAL; - if (task->flags & PF_THREAD_BOUND) + if (task->flags & PF_NO_SETAFFINITY) goto out_unlock; ret = security_task_setscheduler(task); if (ret) @@ -2193,7 +2193,6 @@ void cpuset_update_active_cpus(bool cpu_online) schedule_work(&cpuset_hotplug_work); } -#ifdef CONFIG_MEMORY_HOTPLUG /* * Keep top_cpuset.mems_allowed tracking node_states[N_MEMORY]. * Call this routine anytime after node_states[N_MEMORY] changes. @@ -2205,20 +2204,23 @@ static int cpuset_track_online_nodes(struct notifier_block *self, schedule_work(&cpuset_hotplug_work); return NOTIFY_OK; } -#endif + +static struct notifier_block cpuset_track_online_nodes_nb = { + .notifier_call = cpuset_track_online_nodes, + .priority = 10, /* ??! */ +}; /** * cpuset_init_smp - initialize cpus_allowed * * Description: Finish top cpuset after cpu, node maps are initialized - **/ - + */ void __init cpuset_init_smp(void) { cpumask_copy(top_cpuset.cpus_allowed, cpu_active_mask); top_cpuset.mems_allowed = node_states[N_MEMORY]; - hotplug_memory_notifier(cpuset_track_online_nodes, 10); + register_hotmemory_notifier(&cpuset_track_online_nodes_nb); cpuset_propagate_hotplug_wq = alloc_ordered_workqueue("cpuset_hotplug", 0); diff --git a/kernel/events/core.c b/kernel/events/core.c index 310ec19d968a..dce6e13cf9d7 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -4449,12 +4449,15 @@ static void perf_event_task_event(struct perf_task_event *task_event) if (ctxn < 0) goto next; ctx = rcu_dereference(current->perf_event_ctxp[ctxn]); + if (ctx) + perf_event_task_ctx(ctx, task_event); } - if (ctx) - perf_event_task_ctx(ctx, task_event); next: put_cpu_ptr(pmu->pmu_cpu_context); } + if (task_event->task_ctx) + perf_event_task_ctx(task_event->task_ctx, task_event); + rcu_read_unlock(); } @@ -4608,6 +4611,7 @@ void perf_event_comm(struct task_struct *task) struct perf_event_context *ctx; int ctxn; + rcu_read_lock(); for_each_task_context_nr(ctxn) { ctx = task->perf_event_ctxp[ctxn]; if (!ctx) @@ -4615,6 +4619,7 @@ void perf_event_comm(struct task_struct *task) perf_event_enable_on_exec(ctx); } + rcu_read_unlock(); if (!atomic_read(&nr_comm_events)) return; @@ -4749,7 +4754,8 @@ static void perf_event_mmap_event(struct perf_mmap_event *mmap_event) } else { if (arch_vma_name(mmap_event->vma)) { name = strncpy(tmp, arch_vma_name(mmap_event->vma), - sizeof(tmp)); + sizeof(tmp) - 1); + tmp[sizeof(tmp) - 1] = '\0'; goto got_name; } @@ -5342,7 +5348,7 @@ static void sw_perf_event_destroy(struct perf_event *event) static int perf_swevent_init(struct perf_event *event) { - int event_id = event->attr.config; + u64 event_id = event->attr.config; if (event->attr.type != PERF_TYPE_SOFTWARE) return -ENOENT; @@ -5662,6 +5668,7 @@ static void perf_swevent_init_hrtimer(struct perf_event *event) event->attr.sample_period = NSEC_PER_SEC / freq; hwc->sample_period = event->attr.sample_period; local64_set(&hwc->period_left, hwc->sample_period); + hwc->last_period = hwc->sample_period; event->attr.freq = 0; } } @@ -5997,6 +6004,7 @@ skip_type: if (pmu->pmu_cpu_context) goto got_cpu_context; + ret = -ENOMEM; pmu->pmu_cpu_context = alloc_percpu(struct perf_cpu_context); if (!pmu->pmu_cpu_context) goto free_dev; diff --git a/kernel/events/internal.h b/kernel/events/internal.h index d56a64c99a8b..eb675c4d59df 100644 --- a/kernel/events/internal.h +++ b/kernel/events/internal.h @@ -16,7 +16,7 @@ struct ring_buffer { int page_order; /* allocation order */ #endif int nr_pages; /* nr of data pages */ - int writable; /* are we writable */ + int overwrite; /* can overwrite itself */ atomic_t poll; /* POLL_ for wakeups */ diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c index 23cb34ff3973..97fddb09762b 100644 --- a/kernel/events/ring_buffer.c +++ b/kernel/events/ring_buffer.c @@ -18,12 +18,24 @@ static bool perf_output_space(struct ring_buffer *rb, unsigned long tail, unsigned long offset, unsigned long head) { - unsigned long mask; + unsigned long sz = perf_data_size(rb); + unsigned long mask = sz - 1; - if (!rb->writable) + /* + * check if user-writable + * overwrite : over-write its own tail + * !overwrite: buffer possibly drops events. + */ + if (rb->overwrite) return true; - mask = perf_data_size(rb) - 1; + /* + * verify that payload is not bigger than buffer + * otherwise masking logic may fail to detect + * the "not enough space" condition + */ + if ((head - offset) > sz) + return false; offset = (offset - tail) & mask; head = (head - tail) & mask; @@ -212,7 +224,9 @@ ring_buffer_init(struct ring_buffer *rb, long watermark, int flags) rb->watermark = max_size / 2; if (flags & RING_BUFFER_WRITABLE) - rb->writable = 1; + rb->overwrite = 0; + else + rb->overwrite = 1; atomic_set(&rb->refcount, 1); diff --git a/kernel/exit.c b/kernel/exit.c index 51e485ca9935..60bc027c61c3 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -835,7 +835,7 @@ void do_exit(long code) /* * Make sure we are holding no locks: */ - debug_check_no_locks_held(); + debug_check_no_locks_held(tsk); /* * We can do this unlocked here. The futex code uses this flag * just to verify whether the pi state cleanup has been done diff --git a/kernel/fork.c b/kernel/fork.c index 8d932b1c9056..1766d324d5e3 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1141,6 +1141,9 @@ static struct task_struct *copy_process(unsigned long clone_flags, if ((clone_flags & (CLONE_NEWNS|CLONE_FS)) == (CLONE_NEWNS|CLONE_FS)) return ERR_PTR(-EINVAL); + if ((clone_flags & (CLONE_NEWUSER|CLONE_FS)) == (CLONE_NEWUSER|CLONE_FS)) + return ERR_PTR(-EINVAL); + /* * Thread groups must share signals as well, and detached threads * can only be started up within the thread group. @@ -1807,7 +1810,7 @@ SYSCALL_DEFINE1(unshare, unsigned long, unshare_flags) * If unsharing a user namespace must also unshare the thread. */ if (unshare_flags & CLONE_NEWUSER) - unshare_flags |= CLONE_THREAD; + unshare_flags |= CLONE_THREAD | CLONE_FS; /* * If unsharing a pid namespace must also unshare the thread. */ diff --git a/kernel/futex.c b/kernel/futex.c index f0090a993dab..b26dcfc02c94 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -223,7 +223,8 @@ static void drop_futex_key_refs(union futex_key *key) * @rw: mapping needs to be read/write (values: VERIFY_READ, * VERIFY_WRITE) * - * Returns a negative error code or 0 + * Return: a negative error code or 0 + * * The key words are stored in *key on success. * * For shared mappings, it's (page->index, file_inode(vma->vm_file), @@ -705,9 +706,9 @@ lookup_pi_state(u32 uval, struct futex_hash_bucket *hb, * be "current" except in the case of requeue pi. * @set_waiters: force setting the FUTEX_WAITERS bit (1) or not (0) * - * Returns: - * 0 - ready to wait - * 1 - acquired the lock + * Return: + * 0 - ready to wait; + * 1 - acquired the lock; * <0 - error * * The hb->lock and futex_key refs shall be held by the caller. @@ -1191,9 +1192,9 @@ void requeue_pi_wake_futex(struct futex_q *q, union futex_key *key, * then direct futex_lock_pi_atomic() to force setting the FUTEX_WAITERS bit. * hb1 and hb2 must be held by the caller. * - * Returns: - * 0 - failed to acquire the lock atomicly - * 1 - acquired the lock + * Return: + * 0 - failed to acquire the lock atomically; + * 1 - acquired the lock; * <0 - error */ static int futex_proxy_trylock_atomic(u32 __user *pifutex, @@ -1254,8 +1255,8 @@ static int futex_proxy_trylock_atomic(u32 __user *pifutex, * Requeue waiters on uaddr1 to uaddr2. In the requeue_pi case, try to acquire * uaddr2 atomically on behalf of the top waiter. * - * Returns: - * >=0 - on success, the number of tasks requeued or woken + * Return: + * >=0 - on success, the number of tasks requeued or woken; * <0 - on error */ static int futex_requeue(u32 __user *uaddr1, unsigned int flags, @@ -1536,8 +1537,8 @@ static inline void queue_me(struct futex_q *q, struct futex_hash_bucket *hb) * The q->lock_ptr must not be held by the caller. A call to unqueue_me() must * be paired with exactly one earlier call to queue_me(). * - * Returns: - * 1 - if the futex_q was still queued (and we removed unqueued it) + * Return: + * 1 - if the futex_q was still queued (and we removed unqueued it); * 0 - if the futex_q was already removed by the waking thread */ static int unqueue_me(struct futex_q *q) @@ -1707,9 +1708,9 @@ static long futex_wait_restart(struct restart_block *restart); * the pi_state owner as well as handle race conditions that may allow us to * acquire the lock. Must be called with the hb lock held. * - * Returns: - * 1 - success, lock taken - * 0 - success, lock not taken + * Return: + * 1 - success, lock taken; + * 0 - success, lock not taken; * <0 - on error (-EFAULT) */ static int fixup_owner(u32 __user *uaddr, struct futex_q *q, int locked) @@ -1824,8 +1825,8 @@ static void futex_wait_queue_me(struct futex_hash_bucket *hb, struct futex_q *q, * Return with the hb lock held and a q.key reference on success, and unlocked * with no q.key reference on failure. * - * Returns: - * 0 - uaddr contains val and hb has been locked + * Return: + * 0 - uaddr contains val and hb has been locked; * <1 - -EFAULT or -EWOULDBLOCK (uaddr does not contain val) and hb is unlocked */ static int futex_wait_setup(u32 __user *uaddr, u32 val, unsigned int flags, @@ -2203,9 +2204,9 @@ pi_faulted: * the wakeup and return the appropriate error code to the caller. Must be * called with the hb lock held. * - * Returns - * 0 - no early wakeup detected - * <0 - -ETIMEDOUT or -ERESTARTNOINTR + * Return: + * 0 = no early wakeup detected; + * <0 = -ETIMEDOUT or -ERESTARTNOINTR */ static inline int handle_early_requeue_pi_wakeup(struct futex_hash_bucket *hb, @@ -2247,7 +2248,6 @@ int handle_early_requeue_pi_wakeup(struct futex_hash_bucket *hb, * @val: the expected value of uaddr * @abs_time: absolute timeout * @bitset: 32 bit wakeup bitset set by userspace, defaults to all - * @clockrt: whether to use CLOCK_REALTIME (1) or CLOCK_MONOTONIC (0) * @uaddr2: the pi futex we will take prior to returning to user-space * * The caller will wait on uaddr and will be requeued by futex_requeue() to @@ -2258,7 +2258,7 @@ int handle_early_requeue_pi_wakeup(struct futex_hash_bucket *hb, * there was a need to. * * We call schedule in futex_wait_queue_me() when we enqueue and return there - * via the following: + * via the following-- * 1) wakeup on uaddr2 after an atomic lock acquisition by futex_requeue() * 2) wakeup on uaddr2 after a requeue * 3) signal @@ -2276,8 +2276,8 @@ int handle_early_requeue_pi_wakeup(struct futex_hash_bucket *hb, * * If 4 or 7, we cleanup and return with -ETIMEDOUT. * - * Returns: - * 0 - On success + * Return: + * 0 - On success; * <0 - On error */ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags, diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c index cc47812d3feb..14be27feda49 100644 --- a/kernel/hrtimer.c +++ b/kernel/hrtimer.c @@ -63,6 +63,7 @@ DEFINE_PER_CPU(struct hrtimer_cpu_base, hrtimer_bases) = { + .lock = __RAW_SPIN_LOCK_UNLOCKED(hrtimer_bases.lock), .clock_base = { { @@ -1642,8 +1643,6 @@ static void __cpuinit init_hrtimers_cpu(int cpu) struct hrtimer_cpu_base *cpu_base = &per_cpu(hrtimer_bases, cpu); int i; - raw_spin_lock_init(&cpu_base->lock); - for (i = 0; i < HRTIMER_MAX_CLOCK_BASES; i++) { cpu_base->clock_base[i].cpu_base = cpu_base; timerqueue_init_head(&cpu_base->clock_base[i].active); diff --git a/kernel/kexec.c b/kernel/kexec.c index bddd3d7a74b6..b574920cbd4b 100644 --- a/kernel/kexec.c +++ b/kernel/kexec.c @@ -55,7 +55,7 @@ struct resource crashk_res = { .flags = IORESOURCE_BUSY | IORESOURCE_MEM }; struct resource crashk_low_res = { - .name = "Crash kernel low", + .name = "Crash kernel", .start = 0, .end = 0, .flags = IORESOURCE_BUSY | IORESOURCE_MEM @@ -1118,12 +1118,8 @@ void __weak crash_free_reserved_phys_range(unsigned long begin, { unsigned long addr; - for (addr = begin; addr < end; addr += PAGE_SIZE) { - ClearPageReserved(pfn_to_page(addr >> PAGE_SHIFT)); - init_page_count(pfn_to_page(addr >> PAGE_SHIFT)); - free_page((unsigned long)__va(addr)); - totalram_pages++; - } + for (addr = begin; addr < end; addr += PAGE_SIZE) + free_reserved_page(pfn_to_page(addr >> PAGE_SHIFT)); } int crash_shrink_memory(unsigned long new_size) @@ -1368,35 +1364,114 @@ static int __init parse_crashkernel_simple(char *cmdline, return 0; } +#define SUFFIX_HIGH 0 +#define SUFFIX_LOW 1 +#define SUFFIX_NULL 2 +static __initdata char *suffix_tbl[] = { + [SUFFIX_HIGH] = ",high", + [SUFFIX_LOW] = ",low", + [SUFFIX_NULL] = NULL, +}; + /* - * That function is the entry point for command line parsing and should be - * called from the arch-specific code. + * That function parses "suffix" crashkernel command lines like + * + * crashkernel=size,[high|low] + * + * It returns 0 on success and -EINVAL on failure. */ +static int __init parse_crashkernel_suffix(char *cmdline, + unsigned long long *crash_size, + unsigned long long *crash_base, + const char *suffix) +{ + char *cur = cmdline; + + *crash_size = memparse(cmdline, &cur); + if (cmdline == cur) { + pr_warn("crashkernel: memory value expected\n"); + return -EINVAL; + } + + /* check with suffix */ + if (strncmp(cur, suffix, strlen(suffix))) { + pr_warn("crashkernel: unrecognized char\n"); + return -EINVAL; + } + cur += strlen(suffix); + if (*cur != ' ' && *cur != '\0') { + pr_warn("crashkernel: unrecognized char\n"); + return -EINVAL; + } + + return 0; +} + +static __init char *get_last_crashkernel(char *cmdline, + const char *name, + const char *suffix) +{ + char *p = cmdline, *ck_cmdline = NULL; + + /* find crashkernel and use the last one if there are more */ + p = strstr(p, name); + while (p) { + char *end_p = strchr(p, ' '); + char *q; + + if (!end_p) + end_p = p + strlen(p); + + if (!suffix) { + int i; + + /* skip the one with any known suffix */ + for (i = 0; suffix_tbl[i]; i++) { + q = end_p - strlen(suffix_tbl[i]); + if (!strncmp(q, suffix_tbl[i], + strlen(suffix_tbl[i]))) + goto next; + } + ck_cmdline = p; + } else { + q = end_p - strlen(suffix); + if (!strncmp(q, suffix, strlen(suffix))) + ck_cmdline = p; + } +next: + p = strstr(p+1, name); + } + + if (!ck_cmdline) + return NULL; + + return ck_cmdline; +} + static int __init __parse_crashkernel(char *cmdline, unsigned long long system_ram, unsigned long long *crash_size, unsigned long long *crash_base, - const char *name) + const char *name, + const char *suffix) { - char *p = cmdline, *ck_cmdline = NULL; char *first_colon, *first_space; + char *ck_cmdline; BUG_ON(!crash_size || !crash_base); *crash_size = 0; *crash_base = 0; - /* find crashkernel and use the last one if there are more */ - p = strstr(p, name); - while (p) { - ck_cmdline = p; - p = strstr(p+1, name); - } + ck_cmdline = get_last_crashkernel(cmdline, name, suffix); if (!ck_cmdline) return -EINVAL; ck_cmdline += strlen(name); + if (suffix) + return parse_crashkernel_suffix(ck_cmdline, crash_size, + crash_base, suffix); /* * if the commandline contains a ':', then that's the extended * syntax -- if not, it must be the classic syntax @@ -1413,13 +1488,26 @@ static int __init __parse_crashkernel(char *cmdline, return 0; } +/* + * That function is the entry point for command line parsing and should be + * called from the arch-specific code. + */ int __init parse_crashkernel(char *cmdline, unsigned long long system_ram, unsigned long long *crash_size, unsigned long long *crash_base) { return __parse_crashkernel(cmdline, system_ram, crash_size, crash_base, - "crashkernel="); + "crashkernel=", NULL); +} + +int __init parse_crashkernel_high(char *cmdline, + unsigned long long system_ram, + unsigned long long *crash_size, + unsigned long long *crash_base) +{ + return __parse_crashkernel(cmdline, system_ram, crash_size, crash_base, + "crashkernel=", suffix_tbl[SUFFIX_HIGH]); } int __init parse_crashkernel_low(char *cmdline, @@ -1428,7 +1516,7 @@ int __init parse_crashkernel_low(char *cmdline, unsigned long long *crash_base) { return __parse_crashkernel(cmdline, system_ram, crash_size, crash_base, - "crashkernel_low="); + "crashkernel=", suffix_tbl[SUFFIX_LOW]); } static void update_vmcoreinfo_note(void) @@ -1489,7 +1577,7 @@ static int __init crash_save_vmcoreinfo_init(void) VMCOREINFO_SYMBOL(swapper_pg_dir); #endif VMCOREINFO_SYMBOL(_stext); - VMCOREINFO_SYMBOL(vmlist); + VMCOREINFO_SYMBOL(vmap_area_list); #ifndef CONFIG_NEED_MULTIPLE_NODES VMCOREINFO_SYMBOL(mem_map); @@ -1527,7 +1615,8 @@ static int __init crash_save_vmcoreinfo_init(void) VMCOREINFO_OFFSET(free_area, free_list); VMCOREINFO_OFFSET(list_head, next); VMCOREINFO_OFFSET(list_head, prev); - VMCOREINFO_OFFSET(vm_struct, addr); + VMCOREINFO_OFFSET(vmap_area, va_start); + VMCOREINFO_OFFSET(vmap_area, list); VMCOREINFO_LENGTH(zone.free_area, MAX_ORDER); log_buf_kexec_setup(); VMCOREINFO_LENGTH(free_area.free_list, MIGRATE_TYPES); diff --git a/kernel/kprobes.c b/kernel/kprobes.c index e35be53f6613..3fed7f0cbcdf 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -794,16 +794,16 @@ out: } #ifdef CONFIG_SYSCTL -/* This should be called with kprobe_mutex locked */ static void __kprobes optimize_all_kprobes(void) { struct hlist_head *head; struct kprobe *p; unsigned int i; + mutex_lock(&kprobe_mutex); /* If optimization is already allowed, just return */ if (kprobes_allow_optimization) - return; + goto out; kprobes_allow_optimization = true; for (i = 0; i < KPROBE_TABLE_SIZE; i++) { @@ -813,18 +813,22 @@ static void __kprobes optimize_all_kprobes(void) optimize_kprobe(p); } printk(KERN_INFO "Kprobes globally optimized\n"); +out: + mutex_unlock(&kprobe_mutex); } -/* This should be called with kprobe_mutex locked */ static void __kprobes unoptimize_all_kprobes(void) { struct hlist_head *head; struct kprobe *p; unsigned int i; + mutex_lock(&kprobe_mutex); /* If optimization is already prohibited, just return */ - if (!kprobes_allow_optimization) + if (!kprobes_allow_optimization) { + mutex_unlock(&kprobe_mutex); return; + } kprobes_allow_optimization = false; for (i = 0; i < KPROBE_TABLE_SIZE; i++) { @@ -834,11 +838,14 @@ static void __kprobes unoptimize_all_kprobes(void) unoptimize_kprobe(p, false); } } + mutex_unlock(&kprobe_mutex); + /* Wait for unoptimizing completion */ wait_for_kprobe_optimizer(); printk(KERN_INFO "Kprobes globally unoptimized\n"); } +static DEFINE_MUTEX(kprobe_sysctl_mutex); int sysctl_kprobes_optimization; int proc_kprobes_optimization_handler(struct ctl_table *table, int write, void __user *buffer, size_t *length, @@ -846,7 +853,7 @@ int proc_kprobes_optimization_handler(struct ctl_table *table, int write, { int ret; - mutex_lock(&kprobe_mutex); + mutex_lock(&kprobe_sysctl_mutex); sysctl_kprobes_optimization = kprobes_allow_optimization ? 1 : 0; ret = proc_dointvec_minmax(table, write, buffer, length, ppos); @@ -854,7 +861,7 @@ int proc_kprobes_optimization_handler(struct ctl_table *table, int write, optimize_all_kprobes(); else unoptimize_all_kprobes(); - mutex_unlock(&kprobe_mutex); + mutex_unlock(&kprobe_sysctl_mutex); return ret; } diff --git a/kernel/kthread.c b/kernel/kthread.c index 691dc2ef9baf..16d8ddd268b1 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c @@ -52,8 +52,21 @@ enum KTHREAD_BITS { KTHREAD_IS_PARKED, }; -#define to_kthread(tsk) \ - container_of((tsk)->vfork_done, struct kthread, exited) +#define __to_kthread(vfork) \ + container_of(vfork, struct kthread, exited) + +static inline struct kthread *to_kthread(struct task_struct *k) +{ + return __to_kthread(k->vfork_done); +} + +static struct kthread *to_live_kthread(struct task_struct *k) +{ + struct completion *vfork = ACCESS_ONCE(k->vfork_done); + if (likely(vfork)) + return __to_kthread(vfork); + return NULL; +} /** * kthread_should_stop - should this kthread return now? @@ -124,12 +137,12 @@ void *kthread_data(struct task_struct *task) static void __kthread_parkme(struct kthread *self) { - __set_current_state(TASK_INTERRUPTIBLE); + __set_current_state(TASK_PARKED); while (test_bit(KTHREAD_SHOULD_PARK, &self->flags)) { if (!test_and_set_bit(KTHREAD_IS_PARKED, &self->flags)) complete(&self->parked); schedule(); - __set_current_state(TASK_INTERRUPTIBLE); + __set_current_state(TASK_PARKED); } clear_bit(KTHREAD_IS_PARKED, &self->flags); __set_current_state(TASK_RUNNING); @@ -256,11 +269,16 @@ struct task_struct *kthread_create_on_node(int (*threadfn)(void *data), } EXPORT_SYMBOL(kthread_create_on_node); -static void __kthread_bind(struct task_struct *p, unsigned int cpu) +static void __kthread_bind(struct task_struct *p, unsigned int cpu, long state) { + /* Must have done schedule() in kthread() before we set_task_cpu */ + if (!wait_task_inactive(p, state)) { + WARN_ON(1); + return; + } /* It's safe because the task is inactive. */ do_set_cpus_allowed(p, cpumask_of(cpu)); - p->flags |= PF_THREAD_BOUND; + p->flags |= PF_NO_SETAFFINITY; } /** @@ -274,12 +292,7 @@ static void __kthread_bind(struct task_struct *p, unsigned int cpu) */ void kthread_bind(struct task_struct *p, unsigned int cpu) { - /* Must have done schedule() in kthread() before we set_task_cpu */ - if (!wait_task_inactive(p, TASK_UNINTERRUPTIBLE)) { - WARN_ON(1); - return; - } - __kthread_bind(p, cpu); + __kthread_bind(p, cpu, TASK_UNINTERRUPTIBLE); } EXPORT_SYMBOL(kthread_bind); @@ -311,17 +324,20 @@ struct task_struct *kthread_create_on_cpu(int (*threadfn)(void *data), return p; } -static struct kthread *task_get_live_kthread(struct task_struct *k) +static void __kthread_unpark(struct task_struct *k, struct kthread *kthread) { - struct kthread *kthread; - - get_task_struct(k); - kthread = to_kthread(k); - /* It might have exited */ - barrier(); - if (k->vfork_done != NULL) - return kthread; - return NULL; + clear_bit(KTHREAD_SHOULD_PARK, &kthread->flags); + /* + * We clear the IS_PARKED bit here as we don't wait + * until the task has left the park code. So if we'd + * park before that happens we'd see the IS_PARKED bit + * which might be about to be cleared. + */ + if (test_and_clear_bit(KTHREAD_IS_PARKED, &kthread->flags)) { + if (test_bit(KTHREAD_IS_PER_CPU, &kthread->flags)) + __kthread_bind(k, kthread->cpu, TASK_PARKED); + wake_up_state(k, TASK_PARKED); + } } /** @@ -334,23 +350,10 @@ static struct kthread *task_get_live_kthread(struct task_struct *k) */ void kthread_unpark(struct task_struct *k) { - struct kthread *kthread = task_get_live_kthread(k); + struct kthread *kthread = to_live_kthread(k); - if (kthread) { - clear_bit(KTHREAD_SHOULD_PARK, &kthread->flags); - /* - * We clear the IS_PARKED bit here as we don't wait - * until the task has left the park code. So if we'd - * park before that happens we'd see the IS_PARKED bit - * which might be about to be cleared. - */ - if (test_and_clear_bit(KTHREAD_IS_PARKED, &kthread->flags)) { - if (test_bit(KTHREAD_IS_PER_CPU, &kthread->flags)) - __kthread_bind(k, kthread->cpu); - wake_up_process(k); - } - } - put_task_struct(k); + if (kthread) + __kthread_unpark(k, kthread); } /** @@ -367,7 +370,7 @@ void kthread_unpark(struct task_struct *k) */ int kthread_park(struct task_struct *k) { - struct kthread *kthread = task_get_live_kthread(k); + struct kthread *kthread = to_live_kthread(k); int ret = -ENOSYS; if (kthread) { @@ -380,7 +383,6 @@ int kthread_park(struct task_struct *k) } ret = 0; } - put_task_struct(k); return ret; } @@ -401,21 +403,23 @@ int kthread_park(struct task_struct *k) */ int kthread_stop(struct task_struct *k) { - struct kthread *kthread = task_get_live_kthread(k); + struct kthread *kthread; int ret; trace_sched_kthread_stop(k); + + get_task_struct(k); + kthread = to_live_kthread(k); if (kthread) { set_bit(KTHREAD_SHOULD_STOP, &kthread->flags); - clear_bit(KTHREAD_SHOULD_PARK, &kthread->flags); + __kthread_unpark(k, kthread); wake_up_process(k); wait_for_completion(&kthread->exited); } ret = k->exit_code; - put_task_struct(k); - trace_sched_kthread_stop_ret(ret); + trace_sched_kthread_stop_ret(ret); return ret; } EXPORT_SYMBOL(kthread_stop); diff --git a/kernel/lockdep.c b/kernel/lockdep.c index 259db207b5d9..6a3bccba7e7d 100644 --- a/kernel/lockdep.c +++ b/kernel/lockdep.c @@ -380,6 +380,13 @@ static int verbose(struct lock_class *class) unsigned long nr_stack_trace_entries; static unsigned long stack_trace[MAX_STACK_TRACE_ENTRIES]; +static void print_lockdep_off(const char *bug_msg) +{ + printk(KERN_DEBUG "%s\n", bug_msg); + printk(KERN_DEBUG "turning off the locking correctness validator.\n"); + printk(KERN_DEBUG "Please attach the output of /proc/lock_stat to the bug report\n"); +} + static int save_trace(struct stack_trace *trace) { trace->nr_entries = 0; @@ -409,8 +416,7 @@ static int save_trace(struct stack_trace *trace) if (!debug_locks_off_graph_unlock()) return 0; - printk("BUG: MAX_STACK_TRACE_ENTRIES too low!\n"); - printk("turning off the locking correctness validator.\n"); + print_lockdep_off("BUG: MAX_STACK_TRACE_ENTRIES too low!"); dump_stack(); return 0; @@ -763,8 +769,7 @@ register_lock_class(struct lockdep_map *lock, unsigned int subclass, int force) } raw_local_irq_restore(flags); - printk("BUG: MAX_LOCKDEP_KEYS too low!\n"); - printk("turning off the locking correctness validator.\n"); + print_lockdep_off("BUG: MAX_LOCKDEP_KEYS too low!"); dump_stack(); return NULL; } @@ -834,8 +839,7 @@ static struct lock_list *alloc_list_entry(void) if (!debug_locks_off_graph_unlock()) return NULL; - printk("BUG: MAX_LOCKDEP_ENTRIES too low!\n"); - printk("turning off the locking correctness validator.\n"); + print_lockdep_off("BUG: MAX_LOCKDEP_ENTRIES too low!"); dump_stack(); return NULL; } @@ -2000,7 +2004,7 @@ static inline int lookup_chain_cache(struct task_struct *curr, struct lock_class *class = hlock_class(hlock); struct list_head *hash_head = chainhashentry(chain_key); struct lock_chain *chain; - struct held_lock *hlock_curr, *hlock_next; + struct held_lock *hlock_curr; int i, j; /* @@ -2048,8 +2052,7 @@ cache_hit: if (!debug_locks_off_graph_unlock()) return 0; - printk("BUG: MAX_LOCKDEP_CHAINS too low!\n"); - printk("turning off the locking correctness validator.\n"); + print_lockdep_off("BUG: MAX_LOCKDEP_CHAINS too low!"); dump_stack(); return 0; } @@ -2057,12 +2060,10 @@ cache_hit: chain->chain_key = chain_key; chain->irq_context = hlock->irq_context; /* Find the first held_lock of current chain */ - hlock_next = hlock; for (i = curr->lockdep_depth - 1; i >= 0; i--) { hlock_curr = curr->held_locks + i; - if (hlock_curr->irq_context != hlock_next->irq_context) + if (hlock_curr->irq_context != hlock->irq_context) break; - hlock_next = hlock; } i++; chain->depth = curr->lockdep_depth + 1 - i; @@ -3190,9 +3191,9 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass, #endif if (unlikely(curr->lockdep_depth >= MAX_LOCK_DEPTH)) { debug_locks_off(); - printk("BUG: MAX_LOCK_DEPTH too low, depth: %i max: %lu!\n", + print_lockdep_off("BUG: MAX_LOCK_DEPTH too low!"); + printk(KERN_DEBUG "depth: %i max: %lu!\n", curr->lockdep_depth, MAX_LOCK_DEPTH); - printk("turning off the locking correctness validator.\n"); lockdep_print_held_locks(current); debug_show_all_locks(); @@ -4088,7 +4089,7 @@ void debug_check_no_locks_freed(const void *mem_from, unsigned long mem_len) } EXPORT_SYMBOL_GPL(debug_check_no_locks_freed); -static void print_held_locks_bug(void) +static void print_held_locks_bug(struct task_struct *curr) { if (!debug_locks_off()) return; @@ -4097,21 +4098,22 @@ static void print_held_locks_bug(void) printk("\n"); printk("=====================================\n"); - printk("[ BUG: %s/%d still has locks held! ]\n", - current->comm, task_pid_nr(current)); + printk("[ BUG: lock held at task exit time! ]\n"); print_kernel_ident(); printk("-------------------------------------\n"); - lockdep_print_held_locks(current); + printk("%s/%d is exiting with locks still held!\n", + curr->comm, task_pid_nr(curr)); + lockdep_print_held_locks(curr); + printk("\nstack backtrace:\n"); dump_stack(); } -void debug_check_no_locks_held(void) +void debug_check_no_locks_held(struct task_struct *task) { - if (unlikely(current->lockdep_depth > 0)) - print_held_locks_bug(); + if (unlikely(task->lockdep_depth > 0)) + print_held_locks_bug(task); } -EXPORT_SYMBOL_GPL(debug_check_no_locks_held); void debug_show_all_locks(void) { diff --git a/kernel/mutex.c b/kernel/mutex.c index 52f23011b6e0..ad53a664f113 100644 --- a/kernel/mutex.c +++ b/kernel/mutex.c @@ -37,6 +37,12 @@ # include <asm/mutex.h> #endif +/* + * A negative mutex count indicates that waiters are sleeping waiting for the + * mutex. + */ +#define MUTEX_SHOW_NO_WAITER(mutex) (atomic_read(&(mutex)->count) >= 0) + void __mutex_init(struct mutex *lock, const char *name, struct lock_class_key *key) { @@ -44,6 +50,9 @@ __mutex_init(struct mutex *lock, const char *name, struct lock_class_key *key) spin_lock_init(&lock->wait_lock); INIT_LIST_HEAD(&lock->wait_list); mutex_clear_owner(lock); +#ifdef CONFIG_MUTEX_SPIN_ON_OWNER + lock->spin_mlock = NULL; +#endif debug_mutex_init(lock, name, key); } @@ -95,6 +104,124 @@ void __sched mutex_lock(struct mutex *lock) EXPORT_SYMBOL(mutex_lock); #endif +#ifdef CONFIG_MUTEX_SPIN_ON_OWNER +/* + * In order to avoid a stampede of mutex spinners from acquiring the mutex + * more or less simultaneously, the spinners need to acquire a MCS lock + * first before spinning on the owner field. + * + * We don't inline mspin_lock() so that perf can correctly account for the + * time spent in this lock function. + */ +struct mspin_node { + struct mspin_node *next ; + int locked; /* 1 if lock acquired */ +}; +#define MLOCK(mutex) ((struct mspin_node **)&((mutex)->spin_mlock)) + +static noinline +void mspin_lock(struct mspin_node **lock, struct mspin_node *node) +{ + struct mspin_node *prev; + + /* Init node */ + node->locked = 0; + node->next = NULL; + + prev = xchg(lock, node); + if (likely(prev == NULL)) { + /* Lock acquired */ + node->locked = 1; + return; + } + ACCESS_ONCE(prev->next) = node; + smp_wmb(); + /* Wait until the lock holder passes the lock down */ + while (!ACCESS_ONCE(node->locked)) + arch_mutex_cpu_relax(); +} + +static void mspin_unlock(struct mspin_node **lock, struct mspin_node *node) +{ + struct mspin_node *next = ACCESS_ONCE(node->next); + + if (likely(!next)) { + /* + * Release the lock by setting it to NULL + */ + if (cmpxchg(lock, node, NULL) == node) + return; + /* Wait until the next pointer is set */ + while (!(next = ACCESS_ONCE(node->next))) + arch_mutex_cpu_relax(); + } + ACCESS_ONCE(next->locked) = 1; + smp_wmb(); +} + +/* + * Mutex spinning code migrated from kernel/sched/core.c + */ + +static inline bool owner_running(struct mutex *lock, struct task_struct *owner) +{ + if (lock->owner != owner) + return false; + + /* + * Ensure we emit the owner->on_cpu, dereference _after_ checking + * lock->owner still matches owner, if that fails, owner might + * point to free()d memory, if it still matches, the rcu_read_lock() + * ensures the memory stays valid. + */ + barrier(); + + return owner->on_cpu; +} + +/* + * Look out! "owner" is an entirely speculative pointer + * access and not reliable. + */ +static noinline +int mutex_spin_on_owner(struct mutex *lock, struct task_struct *owner) +{ + rcu_read_lock(); + while (owner_running(lock, owner)) { + if (need_resched()) + break; + + arch_mutex_cpu_relax(); + } + rcu_read_unlock(); + + /* + * We break out the loop above on need_resched() and when the + * owner changed, which is a sign for heavy contention. Return + * success only when lock->owner is NULL. + */ + return lock->owner == NULL; +} + +/* + * Initial check for entering the mutex spinning loop + */ +static inline int mutex_can_spin_on_owner(struct mutex *lock) +{ + int retval = 1; + + rcu_read_lock(); + if (lock->owner) + retval = lock->owner->on_cpu; + rcu_read_unlock(); + /* + * if lock->owner is not set, the mutex owner may have just acquired + * it and not set the owner yet or the mutex has been released. + */ + return retval; +} +#endif + static __used noinline void __sched __mutex_unlock_slowpath(atomic_t *lock_count); /** @@ -158,25 +285,39 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass, * * We can't do this for DEBUG_MUTEXES because that relies on wait_lock * to serialize everything. + * + * The mutex spinners are queued up using MCS lock so that only one + * spinner can compete for the mutex. However, if mutex spinning isn't + * going to happen, there is no point in going through the lock/unlock + * overhead. */ + if (!mutex_can_spin_on_owner(lock)) + goto slowpath; for (;;) { struct task_struct *owner; + struct mspin_node node; /* * If there's an owner, wait for it to either * release the lock or go to sleep. */ + mspin_lock(MLOCK(lock), &node); owner = ACCESS_ONCE(lock->owner); - if (owner && !mutex_spin_on_owner(lock, owner)) + if (owner && !mutex_spin_on_owner(lock, owner)) { + mspin_unlock(MLOCK(lock), &node); break; + } - if (atomic_cmpxchg(&lock->count, 1, 0) == 1) { + if ((atomic_read(&lock->count) == 1) && + (atomic_cmpxchg(&lock->count, 1, 0) == 1)) { lock_acquired(&lock->dep_map, ip); mutex_set_owner(lock); + mspin_unlock(MLOCK(lock), &node); preempt_enable(); return 0; } + mspin_unlock(MLOCK(lock), &node); /* * When there's no owner, we might have preempted between the @@ -195,6 +336,7 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass, */ arch_mutex_cpu_relax(); } +slowpath: #endif spin_lock_mutex(&lock->wait_lock, flags); @@ -205,7 +347,7 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass, list_add_tail(&waiter.list, &lock->wait_list); waiter.task = task; - if (atomic_xchg(&lock->count, -1) == 1) + if (MUTEX_SHOW_NO_WAITER(lock) && (atomic_xchg(&lock->count, -1) == 1)) goto done; lock_contended(&lock->dep_map, ip); @@ -220,7 +362,8 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass, * that when we release the lock, we properly wake up the * other waiters: */ - if (atomic_xchg(&lock->count, -1) == 1) + if (MUTEX_SHOW_NO_WAITER(lock) && + (atomic_xchg(&lock->count, -1) == 1)) break; /* diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c index c1c3dc1c6023..bea15bdf82b0 100644 --- a/kernel/pid_namespace.c +++ b/kernel/pid_namespace.c @@ -181,6 +181,7 @@ void zap_pid_ns_processes(struct pid_namespace *pid_ns) int nr; int rc; struct task_struct *task, *me = current; + int init_pids = thread_group_leader(me) ? 1 : 2; /* Don't allow any more processes into the pid namespace */ disable_pid_allocation(pid_ns); @@ -230,7 +231,7 @@ void zap_pid_ns_processes(struct pid_namespace *pid_ns) */ for (;;) { set_current_state(TASK_UNINTERRUPTIBLE); - if (pid_ns->nr_hashed == 1) + if (pid_ns->nr_hashed == init_pids) break; schedule(); } diff --git a/kernel/printk.c b/kernel/printk.c index 0b31715f335a..abbdd9e2ac82 100644 --- a/kernel/printk.c +++ b/kernel/printk.c @@ -63,8 +63,6 @@ void asmlinkage __attribute__((weak)) early_printk(const char *fmt, ...) #define MINIMUM_CONSOLE_LOGLEVEL 1 /* Minimum loglevel we let people use */ #define DEFAULT_CONSOLE_LOGLEVEL 7 /* anything MORE serious than KERN_DEBUG */ -DECLARE_WAIT_QUEUE_HEAD(log_wait); - int console_printk[4] = { DEFAULT_CONSOLE_LOGLEVEL, /* console_loglevel */ DEFAULT_MESSAGE_LOGLEVEL, /* default_message_loglevel */ @@ -224,6 +222,7 @@ struct log { static DEFINE_RAW_SPINLOCK(logbuf_lock); #ifdef CONFIG_PRINTK +DECLARE_WAIT_QUEUE_HEAD(log_wait); /* the next printk record to read by syslog(READ) or /proc/kmsg */ static u64 syslog_seq; static u32 syslog_idx; @@ -1957,45 +1956,6 @@ int is_console_locked(void) return console_locked; } -/* - * Delayed printk version, for scheduler-internal messages: - */ -#define PRINTK_BUF_SIZE 512 - -#define PRINTK_PENDING_WAKEUP 0x01 -#define PRINTK_PENDING_SCHED 0x02 - -static DEFINE_PER_CPU(int, printk_pending); -static DEFINE_PER_CPU(char [PRINTK_BUF_SIZE], printk_sched_buf); - -static void wake_up_klogd_work_func(struct irq_work *irq_work) -{ - int pending = __this_cpu_xchg(printk_pending, 0); - - if (pending & PRINTK_PENDING_SCHED) { - char *buf = __get_cpu_var(printk_sched_buf); - printk(KERN_WARNING "[sched_delayed] %s", buf); - } - - if (pending & PRINTK_PENDING_WAKEUP) - wake_up_interruptible(&log_wait); -} - -static DEFINE_PER_CPU(struct irq_work, wake_up_klogd_work) = { - .func = wake_up_klogd_work_func, - .flags = IRQ_WORK_LAZY, -}; - -void wake_up_klogd(void) -{ - preempt_disable(); - if (waitqueue_active(&log_wait)) { - this_cpu_or(printk_pending, PRINTK_PENDING_WAKEUP); - irq_work_queue(&__get_cpu_var(wake_up_klogd_work)); - } - preempt_enable(); -} - static void console_cont_flush(char *text, size_t size) { unsigned long flags; @@ -2458,6 +2418,44 @@ static int __init printk_late_init(void) late_initcall(printk_late_init); #if defined CONFIG_PRINTK +/* + * Delayed printk version, for scheduler-internal messages: + */ +#define PRINTK_BUF_SIZE 512 + +#define PRINTK_PENDING_WAKEUP 0x01 +#define PRINTK_PENDING_SCHED 0x02 + +static DEFINE_PER_CPU(int, printk_pending); +static DEFINE_PER_CPU(char [PRINTK_BUF_SIZE], printk_sched_buf); + +static void wake_up_klogd_work_func(struct irq_work *irq_work) +{ + int pending = __this_cpu_xchg(printk_pending, 0); + + if (pending & PRINTK_PENDING_SCHED) { + char *buf = __get_cpu_var(printk_sched_buf); + printk(KERN_WARNING "[sched_delayed] %s", buf); + } + + if (pending & PRINTK_PENDING_WAKEUP) + wake_up_interruptible(&log_wait); +} + +static DEFINE_PER_CPU(struct irq_work, wake_up_klogd_work) = { + .func = wake_up_klogd_work_func, + .flags = IRQ_WORK_LAZY, +}; + +void wake_up_klogd(void) +{ + preempt_disable(); + if (waitqueue_active(&log_wait)) { + this_cpu_or(printk_pending, PRINTK_PENDING_WAKEUP); + irq_work_queue(&__get_cpu_var(wake_up_klogd_work)); + } + preempt_enable(); +} int printk_sched(const char *fmt, ...) { diff --git a/kernel/resource.c b/kernel/resource.c index 73f35d4b30b9..d7386986e10e 100644 --- a/kernel/resource.c +++ b/kernel/resource.c @@ -21,6 +21,7 @@ #include <linux/seq_file.h> #include <linux/device.h> #include <linux/pfn.h> +#include <linux/mm.h> #include <asm/io.h> @@ -50,6 +51,14 @@ struct resource_constraint { static DEFINE_RWLOCK(resource_lock); +/* + * For memory hotplug, there is no way to free resource entries allocated + * by boot mem after the system is up. So for reusing the resource entry + * we need to remember the resource. + */ +static struct resource *bootmem_resource_free; +static DEFINE_SPINLOCK(bootmem_resource_lock); + static void *r_next(struct seq_file *m, void *v, loff_t *pos) { struct resource *p = v; @@ -151,6 +160,40 @@ __initcall(ioresources_init); #endif /* CONFIG_PROC_FS */ +static void free_resource(struct resource *res) +{ + if (!res) + return; + + if (!PageSlab(virt_to_head_page(res))) { + spin_lock(&bootmem_resource_lock); + res->sibling = bootmem_resource_free; + bootmem_resource_free = res; + spin_unlock(&bootmem_resource_lock); + } else { + kfree(res); + } +} + +static struct resource *alloc_resource(gfp_t flags) +{ + struct resource *res = NULL; + + spin_lock(&bootmem_resource_lock); + if (bootmem_resource_free) { + res = bootmem_resource_free; + bootmem_resource_free = res->sibling; + } + spin_unlock(&bootmem_resource_lock); + + if (res) + memset(res, 0, sizeof(struct resource)); + else + res = kzalloc(sizeof(struct resource), flags); + + return res; +} + /* Return the conflict entry if you can't request it */ static struct resource * __request_resource(struct resource *root, struct resource *new) { @@ -706,24 +749,13 @@ void insert_resource_expand_to_fit(struct resource *root, struct resource *new) write_unlock(&resource_lock); } -/** - * adjust_resource - modify a resource's start and size - * @res: resource to modify - * @start: new start value - * @size: new size - * - * Given an existing resource, change its start and size to match the - * arguments. Returns 0 on success, -EBUSY if it can't fit. - * Existing children of the resource are assumed to be immutable. - */ -int adjust_resource(struct resource *res, resource_size_t start, resource_size_t size) +static int __adjust_resource(struct resource *res, resource_size_t start, + resource_size_t size) { struct resource *tmp, *parent = res->parent; resource_size_t end = start + size - 1; int result = -EBUSY; - write_lock(&resource_lock); - if (!parent) goto skip; @@ -751,6 +783,26 @@ skip: result = 0; out: + return result; +} + +/** + * adjust_resource - modify a resource's start and size + * @res: resource to modify + * @start: new start value + * @size: new size + * + * Given an existing resource, change its start and size to match the + * arguments. Returns 0 on success, -EBUSY if it can't fit. + * Existing children of the resource are assumed to be immutable. + */ +int adjust_resource(struct resource *res, resource_size_t start, + resource_size_t size) +{ + int result; + + write_lock(&resource_lock); + result = __adjust_resource(res, start, size); write_unlock(&resource_lock); return result; } @@ -762,7 +814,7 @@ static void __init __reserve_region_with_split(struct resource *root, { struct resource *parent = root; struct resource *conflict; - struct resource *res = kzalloc(sizeof(*res), GFP_ATOMIC); + struct resource *res = alloc_resource(GFP_ATOMIC); struct resource *next_res = NULL; if (!res) @@ -787,7 +839,7 @@ static void __init __reserve_region_with_split(struct resource *root, /* conflict covered whole area */ if (conflict->start <= res->start && conflict->end >= res->end) { - kfree(res); + free_resource(res); WARN_ON(next_res); break; } @@ -797,10 +849,9 @@ static void __init __reserve_region_with_split(struct resource *root, end = res->end; res->end = conflict->start - 1; if (conflict->end < end) { - next_res = kzalloc(sizeof(*next_res), - GFP_ATOMIC); + next_res = alloc_resource(GFP_ATOMIC); if (!next_res) { - kfree(res); + free_resource(res); break; } next_res->name = name; @@ -890,7 +941,7 @@ struct resource * __request_region(struct resource *parent, const char *name, int flags) { DECLARE_WAITQUEUE(wait, current); - struct resource *res = kzalloc(sizeof(*res), GFP_KERNEL); + struct resource *res = alloc_resource(GFP_KERNEL); if (!res) return NULL; @@ -924,7 +975,7 @@ struct resource * __request_region(struct resource *parent, continue; } /* Uhhuh, that didn't work out.. */ - kfree(res); + free_resource(res); res = NULL; break; } @@ -958,7 +1009,7 @@ int __check_region(struct resource *parent, resource_size_t start, return -EBUSY; release_resource(res); - kfree(res); + free_resource(res); return 0; } EXPORT_SYMBOL(__check_region); @@ -998,7 +1049,7 @@ void __release_region(struct resource *parent, resource_size_t start, write_unlock(&resource_lock); if (res->flags & IORESOURCE_MUXED) wake_up(&muxed_resource_wait); - kfree(res); + free_resource(res); return; } p = &res->sibling; @@ -1012,6 +1063,109 @@ void __release_region(struct resource *parent, resource_size_t start, } EXPORT_SYMBOL(__release_region); +#ifdef CONFIG_MEMORY_HOTREMOVE +/** + * release_mem_region_adjustable - release a previously reserved memory region + * @parent: parent resource descriptor + * @start: resource start address + * @size: resource region size + * + * This interface is intended for memory hot-delete. The requested region + * is released from a currently busy memory resource. The requested region + * must either match exactly or fit into a single busy resource entry. In + * the latter case, the remaining resource is adjusted accordingly. + * Existing children of the busy memory resource must be immutable in the + * request. + * + * Note: + * - Additional release conditions, such as overlapping region, can be + * supported after they are confirmed as valid cases. + * - When a busy memory resource gets split into two entries, the code + * assumes that all children remain in the lower address entry for + * simplicity. Enhance this logic when necessary. + */ +int release_mem_region_adjustable(struct resource *parent, + resource_size_t start, resource_size_t size) +{ + struct resource **p; + struct resource *res; + struct resource *new_res; + resource_size_t end; + int ret = -EINVAL; + + end = start + size - 1; + if ((start < parent->start) || (end > parent->end)) + return ret; + + /* The alloc_resource() result gets checked later */ + new_res = alloc_resource(GFP_KERNEL); + + p = &parent->child; + write_lock(&resource_lock); + + while ((res = *p)) { + if (res->start >= end) + break; + + /* look for the next resource if it does not fit into */ + if (res->start > start || res->end < end) { + p = &res->sibling; + continue; + } + + if (!(res->flags & IORESOURCE_MEM)) + break; + + if (!(res->flags & IORESOURCE_BUSY)) { + p = &res->child; + continue; + } + + /* found the target resource; let's adjust accordingly */ + if (res->start == start && res->end == end) { + /* free the whole entry */ + *p = res->sibling; + free_resource(res); + ret = 0; + } else if (res->start == start && res->end != end) { + /* adjust the start */ + ret = __adjust_resource(res, end + 1, + res->end - end); + } else if (res->start != start && res->end == end) { + /* adjust the end */ + ret = __adjust_resource(res, res->start, + start - res->start); + } else { + /* split into two entries */ + if (!new_res) { + ret = -ENOMEM; + break; + } + new_res->name = res->name; + new_res->start = end + 1; + new_res->end = res->end; + new_res->flags = res->flags; + new_res->parent = res->parent; + new_res->sibling = res->sibling; + new_res->child = NULL; + + ret = __adjust_resource(res, res->start, + start - res->start); + if (ret) + break; + res->sibling = new_res; + new_res = NULL; + } + + break; + } + + write_unlock(&resource_lock); + free_resource(new_res); + return ret; +} +#endif /* CONFIG_MEMORY_HOTREMOVE */ + /* * Managed region resource */ diff --git a/kernel/rtmutex-tester.c b/kernel/rtmutex-tester.c index 7890b10084a7..1d96dd0d93c1 100644 --- a/kernel/rtmutex-tester.c +++ b/kernel/rtmutex-tester.c @@ -14,6 +14,7 @@ #include <linux/spinlock.h> #include <linux/timer.h> #include <linux/freezer.h> +#include <linux/stat.h> #include "rtmutex.h" @@ -366,8 +367,8 @@ static ssize_t sysfs_test_status(struct device *dev, struct device_attribute *at return curr - buf; } -static DEVICE_ATTR(status, 0600, sysfs_test_status, NULL); -static DEVICE_ATTR(command, 0600, NULL, sysfs_test_command); +static DEVICE_ATTR(status, S_IRUSR, sysfs_test_status, NULL); +static DEVICE_ATTR(command, S_IWUSR, NULL, sysfs_test_command); static struct bus_type rttest_subsys = { .name = "rttest", diff --git a/kernel/sched/clock.c b/kernel/sched/clock.c index c685e31492df..c3ae1446461c 100644 --- a/kernel/sched/clock.c +++ b/kernel/sched/clock.c @@ -176,10 +176,36 @@ static u64 sched_clock_remote(struct sched_clock_data *scd) u64 this_clock, remote_clock; u64 *ptr, old_val, val; +#if BITS_PER_LONG != 64 +again: + /* + * Careful here: The local and the remote clock values need to + * be read out atomic as we need to compare the values and + * then update either the local or the remote side. So the + * cmpxchg64 below only protects one readout. + * + * We must reread via sched_clock_local() in the retry case on + * 32bit as an NMI could use sched_clock_local() via the + * tracer and hit between the readout of + * the low32bit and the high 32bit portion. + */ + this_clock = sched_clock_local(my_scd); + /* + * We must enforce atomic readout on 32bit, otherwise the + * update on the remote cpu can hit inbetween the readout of + * the low32bit and the high 32bit portion. + */ + remote_clock = cmpxchg64(&scd->clock, 0, 0); +#else + /* + * On 64bit the read of [my]scd->clock is atomic versus the + * update, so we can avoid the above 32bit dance. + */ sched_clock_local(my_scd); again: this_clock = my_scd->clock; remote_clock = scd->clock; +#endif /* * Use the opportunity that we have both locks diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 7f12624a393c..d8285eb0cde6 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -1498,8 +1498,10 @@ static void try_to_wake_up_local(struct task_struct *p) { struct rq *rq = task_rq(p); - BUG_ON(rq != this_rq()); - BUG_ON(p == current); + if (WARN_ON_ONCE(rq != this_rq()) || + WARN_ON_ONCE(p == current)) + return; + lockdep_assert_held(&rq->lock); if (!raw_spin_trylock(&p->pi_lock)) { @@ -2997,51 +2999,6 @@ void __sched schedule_preempt_disabled(void) preempt_disable(); } -#ifdef CONFIG_MUTEX_SPIN_ON_OWNER - -static inline bool owner_running(struct mutex *lock, struct task_struct *owner) -{ - if (lock->owner != owner) - return false; - - /* - * Ensure we emit the owner->on_cpu, dereference _after_ checking - * lock->owner still matches owner, if that fails, owner might - * point to free()d memory, if it still matches, the rcu_read_lock() - * ensures the memory stays valid. - */ - barrier(); - - return owner->on_cpu; -} - -/* - * Look out! "owner" is an entirely speculative pointer - * access and not reliable. - */ -int mutex_spin_on_owner(struct mutex *lock, struct task_struct *owner) -{ - if (!sched_feat(OWNER_SPIN)) - return 0; - - rcu_read_lock(); - while (owner_running(lock, owner)) { - if (need_resched()) - break; - - arch_mutex_cpu_relax(); - } - rcu_read_unlock(); - - /* - * We break out the loop above on need_resched() and when the - * owner changed, which is a sign for heavy contention. Return - * success only when lock->owner is NULL. - */ - return lock->owner == NULL; -} -#endif - #ifdef CONFIG_PREEMPT /* * this is the entry point to schedule() from in-kernel preemption @@ -4126,6 +4083,10 @@ long sched_setaffinity(pid_t pid, const struct cpumask *in_mask) get_task_struct(p); rcu_read_unlock(); + if (p->flags & PF_NO_SETAFFINITY) { + retval = -EINVAL; + goto out_put_task; + } if (!alloc_cpumask_var(&cpus_allowed, GFP_KERNEL)) { retval = -ENOMEM; goto out_put_task; @@ -4773,11 +4734,6 @@ int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask) goto out; } - if (unlikely((p->flags & PF_THREAD_BOUND) && p != current)) { - ret = -EINVAL; - goto out; - } - do_set_cpus_allowed(p, new_mask); /* Can the task run on the task's current CPU? If so, we're done */ @@ -4999,7 +4955,7 @@ static void sd_free_ctl_entry(struct ctl_table **tablep) } static int min_load_idx = 0; -static int max_load_idx = CPU_LOAD_IDX_MAX; +static int max_load_idx = CPU_LOAD_IDX_MAX-1; static void set_table_entry(struct ctl_table *entry, diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c index ed12cbb135f4..e93cca92f38b 100644 --- a/kernel/sched/cputime.c +++ b/kernel/sched/cputime.c @@ -310,7 +310,7 @@ void thread_group_cputime(struct task_struct *tsk, struct task_cputime *times) t = tsk; do { - task_cputime(tsk, &utime, &stime); + task_cputime(t, &utime, &stime); times->utime += utime; times->stime += stime; times->sum_exec_runtime += task_sched_runtime(t); diff --git a/kernel/sched/features.h b/kernel/sched/features.h index 1ad1d2b5395f..99399f8e4799 100644 --- a/kernel/sched/features.h +++ b/kernel/sched/features.h @@ -46,13 +46,6 @@ SCHED_FEAT(DOUBLE_TICK, false) SCHED_FEAT(LB_BIAS, true) /* - * Spin-wait on mutex acquisition when the mutex owner is running on - * another cpu -- assumes that when the owner is running, it will soon - * release the lock. Decreases scheduling overhead. - */ -SCHED_FEAT(OWNER_SPIN, true) - -/* * Decrement CPU power based on time not spent running tasks */ SCHED_FEAT(NONTASK_POWER, true) diff --git a/kernel/signal.c b/kernel/signal.c index 2ec870a4c3c4..598dc06be421 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -485,6 +485,9 @@ flush_signal_handlers(struct task_struct *t, int force_default) if (force_default || ka->sa.sa_handler != SIG_IGN) ka->sa.sa_handler = SIG_DFL; ka->sa.sa_flags = 0; +#ifdef __ARCH_HAS_SA_RESTORER + ka->sa.sa_restorer = NULL; +#endif sigemptyset(&ka->sa.sa_mask); ka++; } @@ -2682,7 +2685,7 @@ static int do_sigpending(void *set, unsigned long sigsetsize) /** * sys_rt_sigpending - examine a pending signal that has been raised * while blocked - * @set: stores pending signals + * @uset: stores pending signals * @sigsetsize: size of sigset_t type or larger */ SYSCALL_DEFINE2(rt_sigpending, sigset_t __user *, uset, size_t, sigsetsize) @@ -2945,7 +2948,7 @@ do_send_specific(pid_t tgid, pid_t pid, int sig, struct siginfo *info) static int do_tkill(pid_t tgid, pid_t pid, int sig) { - struct siginfo info; + struct siginfo info = {}; info.si_signo = sig; info.si_errno = 0; diff --git a/kernel/smpboot.c b/kernel/smpboot.c index b9bde5727829..02fc5c933673 100644 --- a/kernel/smpboot.c +++ b/kernel/smpboot.c @@ -131,7 +131,7 @@ static int smpboot_thread_fn(void *data) continue; } - //BUG_ON(td->cpu != smp_processor_id()); + BUG_ON(td->cpu != smp_processor_id()); /* Check for state change setup */ switch (td->status) { @@ -185,8 +185,18 @@ __smpboot_create_thread(struct smp_hotplug_thread *ht, unsigned int cpu) } get_task_struct(tsk); *per_cpu_ptr(ht->store, cpu) = tsk; - if (ht->create) - ht->create(cpu); + if (ht->create) { + /* + * Make sure that the task has actually scheduled out + * into park position, before calling the create + * callback. At least the migration thread callback + * requires that the task is off the runqueue. + */ + if (!wait_task_inactive(tsk, TASK_PARKED)) + WARN_ON(1); + else + ht->create(cpu); + } return 0; } @@ -209,6 +219,8 @@ static void smpboot_unpark_thread(struct smp_hotplug_thread *ht, unsigned int cp { struct task_struct *tsk = *per_cpu_ptr(ht->store, cpu); + if (ht->pre_unpark) + ht->pre_unpark(cpu); kthread_unpark(tsk); } diff --git a/kernel/softirq.c b/kernel/softirq.c index b4d252fd195b..14d7758074aa 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -323,18 +323,10 @@ void irq_enter(void) static inline void invoke_softirq(void) { - if (!force_irqthreads) { -#ifdef __ARCH_IRQ_EXIT_IRQS_DISABLED + if (!force_irqthreads) __do_softirq(); -#else - do_softirq(); -#endif - } else { - __local_bh_disable((unsigned long)__builtin_return_address(0), - SOFTIRQ_OFFSET); + else wakeup_softirqd(); - __local_bh_enable(SOFTIRQ_OFFSET); - } } /* @@ -342,9 +334,15 @@ static inline void invoke_softirq(void) */ void irq_exit(void) { +#ifndef __ARCH_IRQ_EXIT_IRQS_DISABLED + local_irq_disable(); +#else + WARN_ON_ONCE(!irqs_disabled()); +#endif + account_irq_exit_time(current); trace_hardirq_exit(); - sub_preempt_count(IRQ_EXIT_OFFSET); + sub_preempt_count(HARDIRQ_OFFSET); if (!in_interrupt() && local_softirq_pending()) invoke_softirq(); @@ -354,7 +352,6 @@ void irq_exit(void) tick_nohz_irq_exit(); #endif rcu_irq_exit(); - sched_preempt_enable_no_resched(); } /* diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c index 95d178c62d5a..c09f2955ae30 100644 --- a/kernel/stop_machine.c +++ b/kernel/stop_machine.c @@ -336,7 +336,7 @@ static struct smp_hotplug_thread cpu_stop_threads = { .create = cpu_stop_create, .setup = cpu_stop_unpark, .park = cpu_stop_park, - .unpark = cpu_stop_unpark, + .pre_unpark = cpu_stop_unpark, .selfparking = true, }; diff --git a/kernel/sys.c b/kernel/sys.c index 81f56445fba9..0da73cf73e60 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -324,7 +324,6 @@ void kernel_restart_prepare(char *cmd) system_state = SYSTEM_RESTART; usermodehelper_disable(); device_shutdown(); - syscore_shutdown(); } /** @@ -370,6 +369,7 @@ void kernel_restart(char *cmd) { kernel_restart_prepare(cmd); disable_nonboot_cpus(); + syscore_shutdown(); if (!cmd) printk(KERN_EMERG "Restarting system.\n"); else @@ -395,6 +395,7 @@ static void kernel_shutdown_prepare(enum system_states state) void kernel_halt(void) { kernel_shutdown_prepare(SYSTEM_HALT); + disable_nonboot_cpus(); syscore_shutdown(); printk(KERN_EMERG "System halted.\n"); kmsg_dump(KMSG_DUMP_HALT); @@ -2185,9 +2186,8 @@ SYSCALL_DEFINE3(getcpu, unsigned __user *, cpup, unsigned __user *, nodep, char poweroff_cmd[POWEROFF_CMD_PATH_LEN] = "/sbin/poweroff"; -static int __orderly_poweroff(void) +static int __orderly_poweroff(bool force) { - int argc; char **argv; static char *envp[] = { "HOME=/", @@ -2196,20 +2196,40 @@ static int __orderly_poweroff(void) }; int ret; - argv = argv_split(GFP_ATOMIC, poweroff_cmd, &argc); - if (argv == NULL) { + argv = argv_split(GFP_KERNEL, poweroff_cmd, NULL); + if (argv) { + ret = call_usermodehelper(argv[0], argv, envp, UMH_WAIT_EXEC); + argv_free(argv); + } else { printk(KERN_WARNING "%s failed to allocate memory for \"%s\"\n", - __func__, poweroff_cmd); - return -ENOMEM; + __func__, poweroff_cmd); + ret = -ENOMEM; } - ret = call_usermodehelper_fns(argv[0], argv, envp, UMH_WAIT_EXEC, - NULL, NULL, NULL); - argv_free(argv); + if (ret && force) { + printk(KERN_WARNING "Failed to start orderly shutdown: " + "forcing the issue\n"); + /* + * I guess this should try to kick off some daemon to sync and + * poweroff asap. Or not even bother syncing if we're doing an + * emergency shutdown? + */ + emergency_sync(); + kernel_power_off(); + } return ret; } +static bool poweroff_force; + +static void poweroff_work_func(struct work_struct *work) +{ + __orderly_poweroff(poweroff_force); +} + +static DECLARE_WORK(poweroff_work, poweroff_work_func); + /** * orderly_poweroff - Trigger an orderly system poweroff * @force: force poweroff if command execution fails @@ -2219,21 +2239,9 @@ static int __orderly_poweroff(void) */ int orderly_poweroff(bool force) { - int ret = __orderly_poweroff(); - - if (ret && force) { - printk(KERN_WARNING "Failed to start orderly shutdown: " - "forcing the issue\n"); - - /* - * I guess this should try to kick off some daemon to sync and - * poweroff asap. Or not even bother syncing if we're doing an - * emergency shutdown? - */ - emergency_sync(); - kernel_power_off(); - } - - return ret; + if (force) /* do not override the pending "true" */ + poweroff_force = true; + schedule_work(&poweroff_work); + return 0; } EXPORT_SYMBOL_GPL(orderly_poweroff); diff --git a/kernel/sysctl.c b/kernel/sysctl.c index afc1dc60f3f8..9edcf456e0fc 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -106,7 +106,6 @@ extern unsigned int core_pipe_limit; #endif extern int pid_max; extern int pid_max_min, pid_max_max; -extern int sysctl_drop_caches; extern int percpu_pagelist_fraction; extern int compat_log; extern int latencytop_enabled; @@ -1430,6 +1429,20 @@ static struct ctl_table vm_table[] = { .extra2 = &one, }, #endif + { + .procname = "user_reserve_kbytes", + .data = &sysctl_user_reserve_kbytes, + .maxlen = sizeof(sysctl_user_reserve_kbytes), + .mode = 0644, + .proc_handler = proc_doulongvec_minmax, + }, + { + .procname = "admin_reserve_kbytes", + .data = &sysctl_admin_reserve_kbytes, + .maxlen = sizeof(sysctl_admin_reserve_kbytes), + .mode = 0644, + .proc_handler = proc_doulongvec_minmax, + }, { } }; diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c index 2fb8cb88df8d..7f32fe0e52cd 100644 --- a/kernel/time/tick-broadcast.c +++ b/kernel/time/tick-broadcast.c @@ -67,7 +67,8 @@ static void tick_broadcast_start_periodic(struct clock_event_device *bc) */ int tick_check_broadcast_device(struct clock_event_device *dev) { - if ((tick_broadcast_device.evtdev && + if ((dev->features & CLOCK_EVT_FEAT_DUMMY) || + (tick_broadcast_device.evtdev && tick_broadcast_device.evtdev->rating >= dev->rating) || (dev->features & CLOCK_EVT_FEAT_C3STOP)) return 0; diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index 192473b22799..5e9efd4b83a4 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -176,6 +176,8 @@ config IRQSOFF_TRACER select GENERIC_TRACER select TRACER_MAX_TRACE select RING_BUFFER_ALLOW_SWAP + select TRACER_SNAPSHOT + select TRACER_SNAPSHOT_PER_CPU_SWAP help This option measures the time spent in irqs-off critical sections, with microsecond accuracy. @@ -198,6 +200,8 @@ config PREEMPT_TRACER select GENERIC_TRACER select TRACER_MAX_TRACE select RING_BUFFER_ALLOW_SWAP + select TRACER_SNAPSHOT + select TRACER_SNAPSHOT_PER_CPU_SWAP help This option measures the time spent in preemption-off critical sections, with microsecond accuracy. @@ -217,6 +221,7 @@ config SCHED_TRACER select GENERIC_TRACER select CONTEXT_SWITCH_TRACER select TRACER_MAX_TRACE + select TRACER_SNAPSHOT help This tracer tracks the latency of the highest priority task to be scheduled in, starting from the point it has woken up. @@ -248,6 +253,27 @@ config TRACER_SNAPSHOT echo 1 > /sys/kernel/debug/tracing/snapshot cat snapshot +config TRACER_SNAPSHOT_PER_CPU_SWAP + bool "Allow snapshot to swap per CPU" + depends on TRACER_SNAPSHOT + select RING_BUFFER_ALLOW_SWAP + help + Allow doing a snapshot of a single CPU buffer instead of a + full swap (all buffers). If this is set, then the following is + allowed: + + echo 1 > /sys/kernel/debug/tracing/per_cpu/cpu2/snapshot + + After which, only the tracing buffer for CPU 2 was swapped with + the main tracing buffer, and the other CPU buffers remain the same. + + When this is enabled, this adds a little more overhead to the + trace recording, as it needs to add some checks to synchronize + recording with swaps. But this does not affect the performance + of the overall system. This is enabled by default when the preempt + or irq latency tracers are enabled, as those need to swap as well + and already adds the overhead (plus a lot more). + config TRACE_BRANCH_PROFILING bool select GENERIC_TRACER @@ -414,24 +440,28 @@ config PROBE_EVENTS def_bool n config DYNAMIC_FTRACE - bool "enable/disable ftrace tracepoints dynamically" + bool "enable/disable function tracing dynamically" depends on FUNCTION_TRACER depends on HAVE_DYNAMIC_FTRACE default y help - This option will modify all the calls to ftrace dynamically - (will patch them out of the binary image and replace them - with a No-Op instruction) as they are called. A table is - created to dynamically enable them again. + This option will modify all the calls to function tracing + dynamically (will patch them out of the binary image and + replace them with a No-Op instruction) on boot up. During + compile time, a table is made of all the locations that ftrace + can function trace, and this table is linked into the kernel + image. When this is enabled, functions can be individually + enabled, and the functions not enabled will not affect + performance of the system. + + See the files in /sys/kernel/debug/tracing: + available_filter_functions + set_ftrace_filter + set_ftrace_notrace This way a CONFIG_FUNCTION_TRACER kernel is slightly larger, but otherwise has native performance as long as no tracing is active. - The changes to the code are done by a kernel thread that - wakes up once a second and checks to see if any ftrace calls - were made. If so, it runs stop_machine (stops all CPUS) - and modifies the code to jump over the call to ftrace. - config DYNAMIC_FTRACE_WITH_REGS def_bool y depends on DYNAMIC_FTRACE @@ -520,6 +550,29 @@ config RING_BUFFER_BENCHMARK If unsure, say N. +config RING_BUFFER_STARTUP_TEST + bool "Ring buffer startup self test" + depends on RING_BUFFER + help + Run a simple self test on the ring buffer on boot up. Late in the + kernel boot sequence, the test will start that kicks off + a thread per cpu. Each thread will write various size events + into the ring buffer. Another thread is created to send IPIs + to each of the threads, where the IPI handler will also write + to the ring buffer, to test/stress the nesting ability. + If any anomalies are discovered, a warning will be displayed + and all ring buffers will be disabled. + + The test runs for 10 seconds. This will slow your boot time + by at least 10 more seconds. + + At the end of the test, statics and more checks are done. + It will output the stats of each per cpu buffer. What + was written, the sizes, what was read, what was lost, and + other similar details. + + If unsure, say N + endif # FTRACE endif # TRACING_SUPPORT diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index 9e5b8c272eec..ed58a3216a6d 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -72,7 +72,7 @@ static void trace_note(struct blk_trace *bt, pid_t pid, int action, bool blk_tracer = blk_tracer_enabled; if (blk_tracer) { - buffer = blk_tr->buffer; + buffer = blk_tr->trace_buffer.buffer; pc = preempt_count(); event = trace_buffer_lock_reserve(buffer, TRACE_BLK, sizeof(*t) + len, @@ -218,7 +218,7 @@ static void __blk_add_trace(struct blk_trace *bt, sector_t sector, int bytes, if (blk_tracer) { tracing_record_cmdline(current); - buffer = blk_tr->buffer; + buffer = blk_tr->trace_buffer.buffer; pc = preempt_count(); event = trace_buffer_lock_reserve(buffer, TRACE_BLK, sizeof(*t) + pdu_len, @@ -739,12 +739,6 @@ static void blk_add_trace_rq_complete(void *ignore, struct request_queue *q, struct request *rq) { - struct blk_trace *bt = q->blk_trace; - - /* if control ever passes through here, it's a request based driver */ - if (unlikely(bt && !bt->rq_based)) - bt->rq_based = true; - blk_add_trace_rq(q, rq, BLK_TA_COMPLETE); } @@ -780,24 +774,10 @@ static void blk_add_trace_bio_bounce(void *ignore, blk_add_trace_bio(q, bio, BLK_TA_BOUNCE, 0); } -static void blk_add_trace_bio_complete(void *ignore, struct bio *bio, int error) +static void blk_add_trace_bio_complete(void *ignore, + struct request_queue *q, struct bio *bio, + int error) { - struct request_queue *q; - struct blk_trace *bt; - - if (!bio->bi_bdev) - return; - - q = bdev_get_queue(bio->bi_bdev); - bt = q->blk_trace; - - /* - * Request based drivers will generate both rq and bio completions. - * Ignore bio ones. - */ - if (likely(!bt) || bt->rq_based) - return; - blk_add_trace_bio(q, bio, BLK_TA_COMPLETE, error); } diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index ab25b88aae56..8a5c017bb50c 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -66,7 +66,7 @@ static struct ftrace_ops ftrace_list_end __read_mostly = { .func = ftrace_stub, - .flags = FTRACE_OPS_FL_RECURSION_SAFE, + .flags = FTRACE_OPS_FL_RECURSION_SAFE | FTRACE_OPS_FL_STUB, }; /* ftrace_enabled is a method to turn ftrace on or off */ @@ -486,7 +486,6 @@ struct ftrace_profile_stat { #define PROFILES_PER_PAGE \ (PROFILE_RECORDS_SIZE / sizeof(struct ftrace_profile)) -static int ftrace_profile_bits __read_mostly; static int ftrace_profile_enabled __read_mostly; /* ftrace_profile_lock - synchronize the enable and disable of the profiler */ @@ -494,7 +493,8 @@ static DEFINE_MUTEX(ftrace_profile_lock); static DEFINE_PER_CPU(struct ftrace_profile_stat, ftrace_profile_stats); -#define FTRACE_PROFILE_HASH_SIZE 1024 /* must be power of 2 */ +#define FTRACE_PROFILE_HASH_BITS 10 +#define FTRACE_PROFILE_HASH_SIZE (1 << FTRACE_PROFILE_HASH_BITS) static void * function_stat_next(void *v, int idx) @@ -676,7 +676,7 @@ int ftrace_profile_pages_init(struct ftrace_profile_stat *stat) pages = DIV_ROUND_UP(functions, PROFILES_PER_PAGE); - for (i = 0; i < pages; i++) { + for (i = 1; i < pages; i++) { pg->next = (void *)get_zeroed_page(GFP_KERNEL); if (!pg->next) goto out_free; @@ -694,7 +694,6 @@ int ftrace_profile_pages_init(struct ftrace_profile_stat *stat) free_page(tmp); } - free_page((unsigned long)stat->pages); stat->pages = NULL; stat->start = NULL; @@ -725,13 +724,6 @@ static int ftrace_profile_init_cpu(int cpu) if (!stat->hash) return -ENOMEM; - if (!ftrace_profile_bits) { - size--; - - for (; size; size >>= 1) - ftrace_profile_bits++; - } - /* Preallocate the function profiling pages */ if (ftrace_profile_pages_init(stat) < 0) { kfree(stat->hash); @@ -764,7 +756,7 @@ ftrace_find_profiled_func(struct ftrace_profile_stat *stat, unsigned long ip) struct hlist_head *hhd; unsigned long key; - key = hash_long(ip, ftrace_profile_bits); + key = hash_long(ip, FTRACE_PROFILE_HASH_BITS); hhd = &stat->hash[key]; if (hlist_empty(hhd)) @@ -783,7 +775,7 @@ static void ftrace_add_profile(struct ftrace_profile_stat *stat, { unsigned long key; - key = hash_long(rec->ip, ftrace_profile_bits); + key = hash_long(rec->ip, FTRACE_PROFILE_HASH_BITS); hlist_add_head_rcu(&rec->node, &stat->hash[key]); } @@ -1053,6 +1045,19 @@ static __init void ftrace_profile_debugfs(struct dentry *d_tracer) static struct pid * const ftrace_swapper_pid = &init_struct_pid; +loff_t +ftrace_filter_lseek(struct file *file, loff_t offset, int whence) +{ + loff_t ret; + + if (file->f_mode & FMODE_READ) + ret = seq_lseek(file, offset, whence); + else + file->f_pos = ret = 1; + + return ret; +} + #ifdef CONFIG_DYNAMIC_FTRACE #ifndef CONFIG_FTRACE_MCOUNT_RECORD @@ -1067,7 +1072,7 @@ struct ftrace_func_probe { unsigned long flags; unsigned long ip; void *data; - struct rcu_head rcu; + struct list_head free_list; }; struct ftrace_func_entry { @@ -1317,7 +1322,6 @@ ftrace_hash_move(struct ftrace_ops *ops, int enable, struct hlist_head *hhd; struct ftrace_hash *old_hash; struct ftrace_hash *new_hash; - unsigned long key; int size = src->count; int bits = 0; int ret; @@ -1360,10 +1364,6 @@ ftrace_hash_move(struct ftrace_ops *ops, int enable, for (i = 0; i < size; i++) { hhd = &src->buckets[i]; hlist_for_each_entry_safe(entry, tn, hhd, hlist) { - if (bits > 0) - key = hash_long(entry->ip, bits); - else - key = 0; remove_hash_entry(src, entry); __add_hash_entry(new_hash, entry); } @@ -2613,7 +2613,7 @@ static void ftrace_filter_reset(struct ftrace_hash *hash) * routine, you can use ftrace_filter_write() for the write * routine if @flag has FTRACE_ITER_FILTER set, or * ftrace_notrace_write() if @flag has FTRACE_ITER_NOTRACE set. - * ftrace_regex_lseek() should be used as the lseek routine, and + * ftrace_filter_lseek() should be used as the lseek routine, and * release must call ftrace_regex_release(). */ int @@ -2697,19 +2697,6 @@ ftrace_notrace_open(struct inode *inode, struct file *file) inode, file); } -loff_t -ftrace_regex_lseek(struct file *file, loff_t offset, int whence) -{ - loff_t ret; - - if (file->f_mode & FMODE_READ) - ret = seq_lseek(file, offset, whence); - else - file->f_pos = ret = 1; - - return ret; -} - static int ftrace_match(char *str, char *regex, int len, int type) { int matched = 0; @@ -2974,28 +2961,27 @@ static void __disable_ftrace_function_probe(void) } -static void ftrace_free_entry_rcu(struct rcu_head *rhp) +static void ftrace_free_entry(struct ftrace_func_probe *entry) { - struct ftrace_func_probe *entry = - container_of(rhp, struct ftrace_func_probe, rcu); - if (entry->ops->free) - entry->ops->free(&entry->data); + entry->ops->free(entry->ops, entry->ip, &entry->data); kfree(entry); } - int register_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops, void *data) { struct ftrace_func_probe *entry; + struct ftrace_hash **orig_hash = &trace_probe_ops.filter_hash; + struct ftrace_hash *hash; struct ftrace_page *pg; struct dyn_ftrace *rec; int type, len, not; unsigned long key; int count = 0; char *search; + int ret; type = filter_parse_regex(glob, strlen(glob), &search, ¬); len = strlen(search); @@ -3006,8 +2992,16 @@ register_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops, mutex_lock(&ftrace_lock); - if (unlikely(ftrace_disabled)) + hash = alloc_and_copy_ftrace_hash(FTRACE_HASH_DEFAULT_BITS, *orig_hash); + if (!hash) { + count = -ENOMEM; + goto out_unlock; + } + + if (unlikely(ftrace_disabled)) { + count = -ENODEV; goto out_unlock; + } do_for_each_ftrace_rec(pg, rec) { @@ -3031,14 +3025,21 @@ register_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops, * for each function we find. We call the callback * to give the caller an opportunity to do so. */ - if (ops->callback) { - if (ops->callback(rec->ip, &entry->data) < 0) { + if (ops->init) { + if (ops->init(ops, rec->ip, &entry->data) < 0) { /* caller does not like this func */ kfree(entry); continue; } } + ret = enter_record(hash, rec, 0); + if (ret < 0) { + kfree(entry); + count = ret; + goto out_unlock; + } + entry->ops = ops; entry->ip = rec->ip; @@ -3046,10 +3047,16 @@ register_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops, hlist_add_head_rcu(&entry->node, &ftrace_func_hash[key]); } while_for_each_ftrace_rec(); + + ret = ftrace_hash_move(&trace_probe_ops, 1, orig_hash, hash); + if (ret < 0) + count = ret; + __enable_ftrace_function_probe(); out_unlock: mutex_unlock(&ftrace_lock); + free_ftrace_hash(hash); return count; } @@ -3063,7 +3070,12 @@ static void __unregister_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops, void *data, int flags) { + struct ftrace_func_entry *rec_entry; struct ftrace_func_probe *entry; + struct ftrace_func_probe *p; + struct ftrace_hash **orig_hash = &trace_probe_ops.filter_hash; + struct list_head free_list; + struct ftrace_hash *hash; struct hlist_node *tmp; char str[KSYM_SYMBOL_LEN]; int type = MATCH_FULL; @@ -3084,6 +3096,14 @@ __unregister_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops, } mutex_lock(&ftrace_lock); + + hash = alloc_and_copy_ftrace_hash(FTRACE_HASH_DEFAULT_BITS, *orig_hash); + if (!hash) + /* Hmm, should report this somehow */ + goto out_unlock; + + INIT_LIST_HEAD(&free_list); + for (i = 0; i < FTRACE_FUNC_HASHSIZE; i++) { struct hlist_head *hhd = &ftrace_func_hash[i]; @@ -3104,12 +3124,30 @@ __unregister_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops, continue; } - hlist_del(&entry->node); - call_rcu(&entry->rcu, ftrace_free_entry_rcu); + rec_entry = ftrace_lookup_ip(hash, entry->ip); + /* It is possible more than one entry had this ip */ + if (rec_entry) + free_hash_entry(hash, rec_entry); + + hlist_del_rcu(&entry->node); + list_add(&entry->free_list, &free_list); } } __disable_ftrace_function_probe(); + /* + * Remove after the disable is called. Otherwise, if the last + * probe is removed, a null hash means *all enabled*. + */ + ftrace_hash_move(&trace_probe_ops, 1, orig_hash, hash); + synchronize_sched(); + list_for_each_entry_safe(entry, p, &free_list, free_list) { + list_del(&entry->free_list); + ftrace_free_entry(entry); + } + + out_unlock: mutex_unlock(&ftrace_lock); + free_ftrace_hash(hash); } void @@ -3441,14 +3479,14 @@ static char ftrace_filter_buf[FTRACE_FILTER_SIZE] __initdata; static int __init set_ftrace_notrace(char *str) { - strncpy(ftrace_notrace_buf, str, FTRACE_FILTER_SIZE); + strlcpy(ftrace_notrace_buf, str, FTRACE_FILTER_SIZE); return 1; } __setup("ftrace_notrace=", set_ftrace_notrace); static int __init set_ftrace_filter(char *str) { - strncpy(ftrace_filter_buf, str, FTRACE_FILTER_SIZE); + strlcpy(ftrace_filter_buf, str, FTRACE_FILTER_SIZE); return 1; } __setup("ftrace_filter=", set_ftrace_filter); @@ -3571,7 +3609,7 @@ static const struct file_operations ftrace_filter_fops = { .open = ftrace_filter_open, .read = seq_read, .write = ftrace_filter_write, - .llseek = ftrace_regex_lseek, + .llseek = ftrace_filter_lseek, .release = ftrace_regex_release, }; @@ -3579,7 +3617,7 @@ static const struct file_operations ftrace_notrace_fops = { .open = ftrace_notrace_open, .read = seq_read, .write = ftrace_notrace_write, - .llseek = ftrace_regex_lseek, + .llseek = ftrace_filter_lseek, .release = ftrace_regex_release, }; @@ -3737,7 +3775,8 @@ out: if (fail) return -EINVAL; - ftrace_graph_filter_enabled = 1; + ftrace_graph_filter_enabled = !!(*idx); + return 0; } @@ -3784,8 +3823,8 @@ static const struct file_operations ftrace_graph_fops = { .open = ftrace_graph_open, .read = seq_read, .write = ftrace_graph_write, + .llseek = ftrace_filter_lseek, .release = ftrace_graph_release, - .llseek = seq_lseek, }; #endif /* CONFIG_FUNCTION_GRAPH_TRACER */ @@ -4131,7 +4170,8 @@ ftrace_ops_control_func(unsigned long ip, unsigned long parent_ip, preempt_disable_notrace(); trace_recursion_set(TRACE_CONTROL_BIT); do_for_each_ftrace_op(op, ftrace_control_list) { - if (!ftrace_function_local_disabled(op) && + if (!(op->flags & FTRACE_OPS_FL_STUB) && + !ftrace_function_local_disabled(op) && ftrace_ops_test(op, ip)) op->func(ip, parent_ip, op, regs); } while_for_each_ftrace_op(op); @@ -4439,7 +4479,7 @@ static const struct file_operations ftrace_pid_fops = { .open = ftrace_pid_open, .write = ftrace_pid_write, .read = seq_read, - .llseek = seq_lseek, + .llseek = ftrace_filter_lseek, .release = ftrace_pid_release, }; @@ -4555,12 +4595,8 @@ ftrace_enable_sysctl(struct ctl_table *table, int write, ftrace_startup_sysctl(); /* we are starting ftrace again */ - if (ftrace_ops_list != &ftrace_list_end) { - if (ftrace_ops_list->next == &ftrace_list_end) - ftrace_trace_function = ftrace_ops_list->func; - else - ftrace_trace_function = ftrace_ops_list_func; - } + if (ftrace_ops_list != &ftrace_list_end) + update_ftrace_function(); } else { /* stopping ftrace calls (just send to ftrace_stub) */ diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 6989df2ba194..b59aea2c48c2 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -8,13 +8,16 @@ #include <linux/trace_clock.h> #include <linux/trace_seq.h> #include <linux/spinlock.h> +#include <linux/irq_work.h> #include <linux/debugfs.h> #include <linux/uaccess.h> #include <linux/hardirq.h> +#include <linux/kthread.h> /* for self test */ #include <linux/kmemcheck.h> #include <linux/module.h> #include <linux/percpu.h> #include <linux/mutex.h> +#include <linux/delay.h> #include <linux/slab.h> #include <linux/init.h> #include <linux/hash.h> @@ -444,6 +447,12 @@ int ring_buffer_print_page_header(struct trace_seq *s) return ret; } +struct rb_irq_work { + struct irq_work work; + wait_queue_head_t waiters; + bool waiters_pending; +}; + /* * head_page == tail_page && head == tail then buffer is empty. */ @@ -478,6 +487,8 @@ struct ring_buffer_per_cpu { struct list_head new_pages; /* new pages to add */ struct work_struct update_pages_work; struct completion update_done; + + struct rb_irq_work irq_work; }; struct ring_buffer { @@ -497,6 +508,8 @@ struct ring_buffer { struct notifier_block cpu_notify; #endif u64 (*clock)(void); + + struct rb_irq_work irq_work; }; struct ring_buffer_iter { @@ -508,6 +521,118 @@ struct ring_buffer_iter { u64 read_stamp; }; +/* + * rb_wake_up_waiters - wake up tasks waiting for ring buffer input + * + * Schedules a delayed work to wake up any task that is blocked on the + * ring buffer waiters queue. + */ +static void rb_wake_up_waiters(struct irq_work *work) +{ + struct rb_irq_work *rbwork = container_of(work, struct rb_irq_work, work); + + wake_up_all(&rbwork->waiters); +} + +/** + * ring_buffer_wait - wait for input to the ring buffer + * @buffer: buffer to wait on + * @cpu: the cpu buffer to wait on + * + * If @cpu == RING_BUFFER_ALL_CPUS then the task will wake up as soon + * as data is added to any of the @buffer's cpu buffers. Otherwise + * it will wait for data to be added to a specific cpu buffer. + */ +void ring_buffer_wait(struct ring_buffer *buffer, int cpu) +{ + struct ring_buffer_per_cpu *cpu_buffer; + DEFINE_WAIT(wait); + struct rb_irq_work *work; + + /* + * Depending on what the caller is waiting for, either any + * data in any cpu buffer, or a specific buffer, put the + * caller on the appropriate wait queue. + */ + if (cpu == RING_BUFFER_ALL_CPUS) + work = &buffer->irq_work; + else { + cpu_buffer = buffer->buffers[cpu]; + work = &cpu_buffer->irq_work; + } + + + prepare_to_wait(&work->waiters, &wait, TASK_INTERRUPTIBLE); + + /* + * The events can happen in critical sections where + * checking a work queue can cause deadlocks. + * After adding a task to the queue, this flag is set + * only to notify events to try to wake up the queue + * using irq_work. + * + * We don't clear it even if the buffer is no longer + * empty. The flag only causes the next event to run + * irq_work to do the work queue wake up. The worse + * that can happen if we race with !trace_empty() is that + * an event will cause an irq_work to try to wake up + * an empty queue. + * + * There's no reason to protect this flag either, as + * the work queue and irq_work logic will do the necessary + * synchronization for the wake ups. The only thing + * that is necessary is that the wake up happens after + * a task has been queued. It's OK for spurious wake ups. + */ + work->waiters_pending = true; + + if ((cpu == RING_BUFFER_ALL_CPUS && ring_buffer_empty(buffer)) || + (cpu != RING_BUFFER_ALL_CPUS && ring_buffer_empty_cpu(buffer, cpu))) + schedule(); + + finish_wait(&work->waiters, &wait); +} + +/** + * ring_buffer_poll_wait - poll on buffer input + * @buffer: buffer to wait on + * @cpu: the cpu buffer to wait on + * @filp: the file descriptor + * @poll_table: The poll descriptor + * + * If @cpu == RING_BUFFER_ALL_CPUS then the task will wake up as soon + * as data is added to any of the @buffer's cpu buffers. Otherwise + * it will wait for data to be added to a specific cpu buffer. + * + * Returns POLLIN | POLLRDNORM if data exists in the buffers, + * zero otherwise. + */ +int ring_buffer_poll_wait(struct ring_buffer *buffer, int cpu, + struct file *filp, poll_table *poll_table) +{ + struct ring_buffer_per_cpu *cpu_buffer; + struct rb_irq_work *work; + + if ((cpu == RING_BUFFER_ALL_CPUS && !ring_buffer_empty(buffer)) || + (cpu != RING_BUFFER_ALL_CPUS && !ring_buffer_empty_cpu(buffer, cpu))) + return POLLIN | POLLRDNORM; + + if (cpu == RING_BUFFER_ALL_CPUS) + work = &buffer->irq_work; + else { + cpu_buffer = buffer->buffers[cpu]; + work = &cpu_buffer->irq_work; + } + + work->waiters_pending = true; + poll_wait(filp, &work->waiters, poll_table); + + if ((cpu == RING_BUFFER_ALL_CPUS && !ring_buffer_empty(buffer)) || + (cpu != RING_BUFFER_ALL_CPUS && !ring_buffer_empty_cpu(buffer, cpu))) + return POLLIN | POLLRDNORM; + return 0; +} + /* buffer may be either ring_buffer or ring_buffer_per_cpu */ #define RB_WARN_ON(b, cond) \ ({ \ @@ -1063,6 +1188,8 @@ rb_allocate_cpu_buffer(struct ring_buffer *buffer, int nr_pages, int cpu) cpu_buffer->lock = (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED; INIT_WORK(&cpu_buffer->update_pages_work, update_pages_handler); init_completion(&cpu_buffer->update_done); + init_irq_work(&cpu_buffer->irq_work.work, rb_wake_up_waiters); + init_waitqueue_head(&cpu_buffer->irq_work.waiters); bpage = kzalloc_node(ALIGN(sizeof(*bpage), cache_line_size()), GFP_KERNEL, cpu_to_node(cpu)); @@ -1158,6 +1285,9 @@ struct ring_buffer *__ring_buffer_alloc(unsigned long size, unsigned flags, buffer->clock = trace_clock_local; buffer->reader_lock_key = key; + init_irq_work(&buffer->irq_work.work, rb_wake_up_waiters); + init_waitqueue_head(&buffer->irq_work.waiters); + /* need at least two pages */ if (nr_pages < 2) nr_pages = 2; @@ -1553,11 +1683,22 @@ int ring_buffer_resize(struct ring_buffer *buffer, unsigned long size, if (!cpu_buffer->nr_pages_to_update) continue; - if (cpu_online(cpu)) + /* The update must run on the CPU that is being updated. */ + preempt_disable(); + if (cpu == smp_processor_id() || !cpu_online(cpu)) { + rb_update_pages(cpu_buffer); + cpu_buffer->nr_pages_to_update = 0; + } else { + /* + * Can not disable preemption for schedule_work_on() + * on PREEMPT_RT. + */ + preempt_enable(); schedule_work_on(cpu, &cpu_buffer->update_pages_work); - else - rb_update_pages(cpu_buffer); + preempt_disable(); + } + preempt_enable(); } /* wait for all the updates to complete */ @@ -1595,12 +1736,22 @@ int ring_buffer_resize(struct ring_buffer *buffer, unsigned long size, get_online_cpus(); - if (cpu_online(cpu_id)) { + preempt_disable(); + /* The update must run on the CPU that is being updated. */ + if (cpu_id == smp_processor_id() || !cpu_online(cpu_id)) + rb_update_pages(cpu_buffer); + else { + /* + * Can not disable preemption for schedule_work_on() + * on PREEMPT_RT. + */ + preempt_enable(); schedule_work_on(cpu_id, &cpu_buffer->update_pages_work); wait_for_completion(&cpu_buffer->update_done); - } else - rb_update_pages(cpu_buffer); + preempt_disable(); + } + preempt_enable(); cpu_buffer->nr_pages_to_update = 0; put_online_cpus(); @@ -2612,6 +2763,22 @@ static void rb_commit(struct ring_buffer_per_cpu *cpu_buffer, rb_end_commit(cpu_buffer); } +static __always_inline void +rb_wakeups(struct ring_buffer *buffer, struct ring_buffer_per_cpu *cpu_buffer) +{ + if (buffer->irq_work.waiters_pending) { + buffer->irq_work.waiters_pending = false; + /* irq_work_queue() supplies it's own memory barriers */ + irq_work_queue(&buffer->irq_work.work); + } + + if (cpu_buffer->irq_work.waiters_pending) { + cpu_buffer->irq_work.waiters_pending = false; + /* irq_work_queue() supplies it's own memory barriers */ + irq_work_queue(&cpu_buffer->irq_work.work); + } +} + /** * ring_buffer_unlock_commit - commit a reserved * @buffer: The buffer to commit to @@ -2631,6 +2798,8 @@ int ring_buffer_unlock_commit(struct ring_buffer *buffer, rb_commit(cpu_buffer, event); + rb_wakeups(buffer, cpu_buffer); + trace_recursive_unlock(); preempt_enable_notrace(); @@ -2803,6 +2972,8 @@ int ring_buffer_write(struct ring_buffer *buffer, rb_commit(cpu_buffer, event); + rb_wakeups(buffer, cpu_buffer); + ret = 0; out: preempt_enable_notrace(); @@ -4467,3 +4638,320 @@ static int rb_cpu_notify(struct notifier_block *self, return NOTIFY_OK; } #endif + +#ifdef CONFIG_RING_BUFFER_STARTUP_TEST +/* + * This is a basic integrity check of the ring buffer. + * Late in the boot cycle this test will run when configured in. + * It will kick off a thread per CPU that will go into a loop + * writing to the per cpu ring buffer various sizes of data. + * Some of the data will be large items, some small. + * + * Another thread is created that goes into a spin, sending out + * IPIs to the other CPUs to also write into the ring buffer. + * this is to test the nesting ability of the buffer. + * + * Basic stats are recorded and reported. If something in the + * ring buffer should happen that's not expected, a big warning + * is displayed and all ring buffers are disabled. + */ +static struct task_struct *rb_threads[NR_CPUS] __initdata; + +struct rb_test_data { + struct ring_buffer *buffer; + unsigned long events; + unsigned long bytes_written; + unsigned long bytes_alloc; + unsigned long bytes_dropped; + unsigned long events_nested; + unsigned long bytes_written_nested; + unsigned long bytes_alloc_nested; + unsigned long bytes_dropped_nested; + int min_size_nested; + int max_size_nested; + int max_size; + int min_size; + int cpu; + int cnt; +}; + +static struct rb_test_data rb_data[NR_CPUS] __initdata; + +/* 1 meg per cpu */ +#define RB_TEST_BUFFER_SIZE 1048576 + +static char rb_string[] __initdata = + "abcdefghijklmnopqrstuvwxyz1234567890!@#$%^&*()?+\\" + "?+|:';\",.<>/?abcdefghijklmnopqrstuvwxyz1234567890" + "!@#$%^&*()?+\\?+|:';\",.<>/?abcdefghijklmnopqrstuv"; + +static bool rb_test_started __initdata; + +struct rb_item { + int size; + char str[]; +}; + +static __init int rb_write_something(struct rb_test_data *data, bool nested) +{ + struct ring_buffer_event *event; + struct rb_item *item; + bool started; + int event_len; + int size; + int len; + int cnt; + + /* Have nested writes different that what is written */ + cnt = data->cnt + (nested ? 27 : 0); + + /* Multiply cnt by ~e, to make some unique increment */ + size = (data->cnt * 68 / 25) % (sizeof(rb_string) - 1); + + len = size + sizeof(struct rb_item); + + started = rb_test_started; + /* read rb_test_started before checking buffer enabled */ + smp_rmb(); + + event = ring_buffer_lock_reserve(data->buffer, len); + if (!event) { + /* Ignore dropped events before test starts. */ + if (started) { + if (nested) + data->bytes_dropped += len; + else + data->bytes_dropped_nested += len; + } + return len; + } + + event_len = ring_buffer_event_length(event); + + if (RB_WARN_ON(data->buffer, event_len < len)) + goto out; + + item = ring_buffer_event_data(event); + item->size = size; + memcpy(item->str, rb_string, size); + + if (nested) { + data->bytes_alloc_nested += event_len; + data->bytes_written_nested += len; + data->events_nested++; + if (!data->min_size_nested || len < data->min_size_nested) + data->min_size_nested = len; + if (len > data->max_size_nested) + data->max_size_nested = len; + } else { + data->bytes_alloc += event_len; + data->bytes_written += len; + data->events++; + if (!data->min_size || len < data->min_size) + data->max_size = len; + if (len > data->max_size) + data->max_size = len; + } + + out: + ring_buffer_unlock_commit(data->buffer, event); + + return 0; +} + +static __init int rb_test(void *arg) +{ + struct rb_test_data *data = arg; + + while (!kthread_should_stop()) { + rb_write_something(data, false); + data->cnt++; + + set_current_state(TASK_INTERRUPTIBLE); + /* Now sleep between a min of 100-300us and a max of 1ms */ + usleep_range(((data->cnt % 3) + 1) * 100, 1000); + } + + return 0; +} + +static __init void rb_ipi(void *ignore) +{ + struct rb_test_data *data; + int cpu = smp_processor_id(); + + data = &rb_data[cpu]; + rb_write_something(data, true); +} + +static __init int rb_hammer_test(void *arg) +{ + while (!kthread_should_stop()) { + + /* Send an IPI to all cpus to write data! */ + smp_call_function(rb_ipi, NULL, 1); + /* No sleep, but for non preempt, let others run */ + schedule(); + } + + return 0; +} + +static __init int test_ringbuffer(void) +{ + struct task_struct *rb_hammer; + struct ring_buffer *buffer; + int cpu; + int ret = 0; + + pr_info("Running ring buffer tests...\n"); + + buffer = ring_buffer_alloc(RB_TEST_BUFFER_SIZE, RB_FL_OVERWRITE); + if (WARN_ON(!buffer)) + return 0; + + /* Disable buffer so that threads can't write to it yet */ + ring_buffer_record_off(buffer); + + for_each_online_cpu(cpu) { + rb_data[cpu].buffer = buffer; + rb_data[cpu].cpu = cpu; + rb_data[cpu].cnt = cpu; + rb_threads[cpu] = kthread_create(rb_test, &rb_data[cpu], + "rbtester/%d", cpu); + if (WARN_ON(!rb_threads[cpu])) { + pr_cont("FAILED\n"); + ret = -1; + goto out_free; + } + + kthread_bind(rb_threads[cpu], cpu); + wake_up_process(rb_threads[cpu]); + } + + /* Now create the rb hammer! */ + rb_hammer = kthread_run(rb_hammer_test, NULL, "rbhammer"); + if (WARN_ON(!rb_hammer)) { + pr_cont("FAILED\n"); + ret = -1; + goto out_free; + } + + ring_buffer_record_on(buffer); + /* + * Show buffer is enabled before setting rb_test_started. + * Yes there's a small race window where events could be + * dropped and the thread wont catch it. But when a ring + * buffer gets enabled, there will always be some kind of + * delay before other CPUs see it. Thus, we don't care about + * those dropped events. We care about events dropped after + * the threads see that the buffer is active. + */ + smp_wmb(); + rb_test_started = true; + + set_current_state(TASK_INTERRUPTIBLE); + /* Just run for 10 seconds */; + schedule_timeout(10 * HZ); + + kthread_stop(rb_hammer); + + out_free: + for_each_online_cpu(cpu) { + if (!rb_threads[cpu]) + break; + kthread_stop(rb_threads[cpu]); + } + if (ret) { + ring_buffer_free(buffer); + return ret; + } + + /* Report! */ + pr_info("finished\n"); + for_each_online_cpu(cpu) { + struct ring_buffer_event *event; + struct rb_test_data *data = &rb_data[cpu]; + struct rb_item *item; + unsigned long total_events; + unsigned long total_dropped; + unsigned long total_written; + unsigned long total_alloc; + unsigned long total_read = 0; + unsigned long total_size = 0; + unsigned long total_len = 0; + unsigned long total_lost = 0; + unsigned long lost; + int big_event_size; + int small_event_size; + + ret = -1; + + total_events = data->events + data->events_nested; + total_written = data->bytes_written + data->bytes_written_nested; + total_alloc = data->bytes_alloc + data->bytes_alloc_nested; + total_dropped = data->bytes_dropped + data->bytes_dropped_nested; + + big_event_size = data->max_size + data->max_size_nested; + small_event_size = data->min_size + data->min_size_nested; + + pr_info("CPU %d:\n", cpu); + pr_info(" events: %ld\n", total_events); + pr_info(" dropped bytes: %ld\n", total_dropped); + pr_info(" alloced bytes: %ld\n", total_alloc); + pr_info(" written bytes: %ld\n", total_written); + pr_info(" biggest event: %d\n", big_event_size); + pr_info(" smallest event: %d\n", small_event_size); + + if (RB_WARN_ON(buffer, total_dropped)) + break; + + ret = 0; + + while ((event = ring_buffer_consume(buffer, cpu, NULL, &lost))) { + total_lost += lost; + item = ring_buffer_event_data(event); + total_len += ring_buffer_event_length(event); + total_size += item->size + sizeof(struct rb_item); + if (memcmp(&item->str[0], rb_string, item->size) != 0) { + pr_info("FAILED!\n"); + pr_info("buffer had: %.*s\n", item->size, item->str); + pr_info("expected: %.*s\n", item->size, rb_string); + RB_WARN_ON(buffer, 1); + ret = -1; + break; + } + total_read++; + } + if (ret) + break; + + ret = -1; + + pr_info(" read events: %ld\n", total_read); + pr_info(" lost events: %ld\n", total_lost); + pr_info(" total events: %ld\n", total_lost + total_read); + pr_info(" recorded len bytes: %ld\n", total_len); + pr_info(" recorded size bytes: %ld\n", total_size); + if (total_lost) + pr_info(" With dropped events, record len and size may not match\n" + " alloced and written from above\n"); + if (!total_lost) { + if (RB_WARN_ON(buffer, total_len != total_alloc || + total_size != total_written)) + break; + } + if (RB_WARN_ON(buffer, total_lost + total_read != total_events)) + break; + + ret = 0; + } + if (!ret) + pr_info("Ring buffer PASSED!\n"); + + ring_buffer_free(buffer); + return 0; +} + +late_initcall(test_ringbuffer); +#endif /* CONFIG_RING_BUFFER_STARTUP_TEST */ diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index c2e2c2310374..581630a6387d 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -1,7 +1,7 @@ /* * ring buffer based function tracer * - * Copyright (C) 2007-2008 Steven Rostedt <srostedt@redhat.com> + * Copyright (C) 2007-2012 Steven Rostedt <srostedt@redhat.com> * Copyright (C) 2008 Ingo Molnar <mingo@redhat.com> * * Originally taken from the RT patch by: @@ -19,7 +19,6 @@ #include <linux/seq_file.h> #include <linux/notifier.h> #include <linux/irqflags.h> -#include <linux/irq_work.h> #include <linux/debugfs.h> #include <linux/pagemap.h> #include <linux/hardirq.h> @@ -48,7 +47,7 @@ * On boot up, the ring buffer is set to the minimum size, so that * we do not waste memory on systems that are not using tracing. */ -int ring_buffer_expanded; +bool ring_buffer_expanded; /* * We need to change this state when a selftest is running. @@ -87,14 +86,6 @@ static int dummy_set_flag(u32 old_flags, u32 bit, int set) static DEFINE_PER_CPU(bool, trace_cmdline_save); /* - * When a reader is waiting for data, then this variable is - * set to true. - */ -static bool trace_wakeup_needed; - -static struct irq_work trace_work_wakeup; - -/* * Kill all tracing for good (never come back). * It is initialized to 1 but will turn to zero if the initialization * of the tracer is successful. But that is the only place that sets @@ -130,12 +121,14 @@ static int tracing_set_tracer(const char *buf); static char bootup_tracer_buf[MAX_TRACER_SIZE] __initdata; static char *default_bootup_tracer; +static bool allocate_snapshot; + static int __init set_cmdline_ftrace(char *str) { - strncpy(bootup_tracer_buf, str, MAX_TRACER_SIZE); + strlcpy(bootup_tracer_buf, str, MAX_TRACER_SIZE); default_bootup_tracer = bootup_tracer_buf; /* We are using ftrace early, expand it */ - ring_buffer_expanded = 1; + ring_buffer_expanded = true; return 1; } __setup("ftrace=", set_cmdline_ftrace); @@ -156,13 +149,22 @@ static int __init set_ftrace_dump_on_oops(char *str) } __setup("ftrace_dump_on_oops", set_ftrace_dump_on_oops); +static int __init boot_alloc_snapshot(char *str) +{ + allocate_snapshot = true; + /* We also need the main ring buffer expanded */ + ring_buffer_expanded = true; + return 1; +} +__setup("alloc_snapshot", boot_alloc_snapshot); + static char trace_boot_options_buf[MAX_TRACER_SIZE] __initdata; static char *trace_boot_options __initdata; static int __init set_trace_boot_options(char *str) { - strncpy(trace_boot_options_buf, str, MAX_TRACER_SIZE); + strlcpy(trace_boot_options_buf, str, MAX_TRACER_SIZE); trace_boot_options = trace_boot_options_buf; return 0; } @@ -189,7 +191,7 @@ unsigned long long ns2usecs(cycle_t nsec) */ static struct trace_array global_trace; -static DEFINE_PER_CPU(struct trace_array_cpu, global_trace_cpu); +LIST_HEAD(ftrace_trace_arrays); int filter_current_check_discard(struct ring_buffer *buffer, struct ftrace_event_call *call, void *rec, @@ -204,29 +206,15 @@ cycle_t ftrace_now(int cpu) u64 ts; /* Early boot up does not have a buffer yet */ - if (!global_trace.buffer) + if (!global_trace.trace_buffer.buffer) return trace_clock_local(); - ts = ring_buffer_time_stamp(global_trace.buffer, cpu); - ring_buffer_normalize_time_stamp(global_trace.buffer, cpu, &ts); + ts = ring_buffer_time_stamp(global_trace.trace_buffer.buffer, cpu); + ring_buffer_normalize_time_stamp(global_trace.trace_buffer.buffer, cpu, &ts); return ts; } -/* - * The max_tr is used to snapshot the global_trace when a maximum - * latency is reached. Some tracers will use this to store a maximum - * trace while it continues examining live traces. - * - * The buffers for the max_tr are set up the same as the global_trace. - * When a snapshot is taken, the link list of the max_tr is swapped - * with the link list of the global_trace and the buffers are reset for - * the global_trace so the tracing can continue. - */ -static struct trace_array max_tr; - -static DEFINE_PER_CPU(struct trace_array_cpu, max_tr_data); - int tracing_is_enabled(void) { return tracing_is_on(); @@ -249,9 +237,6 @@ static unsigned long trace_buf_size = TRACE_BUF_SIZE_DEFAULT; /* trace_types holds a link list of available tracers. */ static struct tracer *trace_types __read_mostly; -/* current_trace points to the tracer that is currently active */ -static struct tracer *current_trace __read_mostly = &nop_trace; - /* * trace_types_lock is used to protect the trace_types list. */ @@ -285,13 +270,13 @@ static DEFINE_PER_CPU(struct mutex, cpu_access_lock); static inline void trace_access_lock(int cpu) { - if (cpu == TRACE_PIPE_ALL_CPU) { + if (cpu == RING_BUFFER_ALL_CPUS) { /* gain it for accessing the whole ring buffer. */ down_write(&all_cpu_access_lock); } else { /* gain it for accessing a cpu ring buffer. */ - /* Firstly block other trace_access_lock(TRACE_PIPE_ALL_CPU). */ + /* Firstly block other trace_access_lock(RING_BUFFER_ALL_CPUS). */ down_read(&all_cpu_access_lock); /* Secondly block other access to this @cpu ring buffer. */ @@ -301,7 +286,7 @@ static inline void trace_access_lock(int cpu) static inline void trace_access_unlock(int cpu) { - if (cpu == TRACE_PIPE_ALL_CPU) { + if (cpu == RING_BUFFER_ALL_CPUS) { up_write(&all_cpu_access_lock); } else { mutex_unlock(&per_cpu(cpu_access_lock, cpu)); @@ -339,30 +324,11 @@ static inline void trace_access_lock_init(void) #endif -/* trace_wait is a waitqueue for tasks blocked on trace_poll */ -static DECLARE_WAIT_QUEUE_HEAD(trace_wait); - /* trace_flags holds trace_options default values */ unsigned long trace_flags = TRACE_ITER_PRINT_PARENT | TRACE_ITER_PRINTK | TRACE_ITER_ANNOTATE | TRACE_ITER_CONTEXT_INFO | TRACE_ITER_SLEEP_TIME | TRACE_ITER_GRAPH_TIME | TRACE_ITER_RECORD_CMD | TRACE_ITER_OVERWRITE | - TRACE_ITER_IRQ_INFO | TRACE_ITER_MARKERS; - -static int trace_stop_count; -static DEFINE_RAW_SPINLOCK(tracing_start_lock); - -/** - * trace_wake_up - wake up tasks waiting for trace input - * - * Schedules a delayed work to wake up any task that is blocked on the - * trace_wait queue. These is used with trace_poll for tasks polling the - * trace. - */ -static void trace_wake_up(struct irq_work *work) -{ - wake_up_all(&trace_wait); - -} + TRACE_ITER_IRQ_INFO | TRACE_ITER_MARKERS | TRACE_ITER_FUNCTION; /** * tracing_on - enable tracing buffers @@ -372,8 +338,8 @@ static void trace_wake_up(struct irq_work *work) */ void tracing_on(void) { - if (global_trace.buffer) - ring_buffer_record_on(global_trace.buffer); + if (global_trace.trace_buffer.buffer) + ring_buffer_record_on(global_trace.trace_buffer.buffer); /* * This flag is only looked at when buffers haven't been * allocated yet. We don't really care about the race @@ -385,6 +351,196 @@ void tracing_on(void) EXPORT_SYMBOL_GPL(tracing_on); /** + * __trace_puts - write a constant string into the trace buffer. + * @ip: The address of the caller + * @str: The constant string to write + * @size: The size of the string. + */ +int __trace_puts(unsigned long ip, const char *str, int size) +{ + struct ring_buffer_event *event; + struct ring_buffer *buffer; + struct print_entry *entry; + unsigned long irq_flags; + int alloc; + + alloc = sizeof(*entry) + size + 2; /* possible \n added */ + + local_save_flags(irq_flags); + buffer = global_trace.trace_buffer.buffer; + event = trace_buffer_lock_reserve(buffer, TRACE_PRINT, alloc, + irq_flags, preempt_count()); + if (!event) + return 0; + + entry = ring_buffer_event_data(event); + entry->ip = ip; + + memcpy(&entry->buf, str, size); + + /* Add a newline if necessary */ + if (entry->buf[size - 1] != '\n') { + entry->buf[size] = '\n'; + entry->buf[size + 1] = '\0'; + } else + entry->buf[size] = '\0'; + + __buffer_unlock_commit(buffer, event); + + return size; +} +EXPORT_SYMBOL_GPL(__trace_puts); + +/** + * __trace_bputs - write the pointer to a constant string into trace buffer + * @ip: The address of the caller + * @str: The constant string to write to the buffer to + */ +int __trace_bputs(unsigned long ip, const char *str) +{ + struct ring_buffer_event *event; + struct ring_buffer *buffer; + struct bputs_entry *entry; + unsigned long irq_flags; + int size = sizeof(struct bputs_entry); + + local_save_flags(irq_flags); + buffer = global_trace.trace_buffer.buffer; + event = trace_buffer_lock_reserve(buffer, TRACE_BPUTS, size, + irq_flags, preempt_count()); + if (!event) + return 0; + + entry = ring_buffer_event_data(event); + entry->ip = ip; + entry->str = str; + + __buffer_unlock_commit(buffer, event); + + return 1; +} +EXPORT_SYMBOL_GPL(__trace_bputs); + +#ifdef CONFIG_TRACER_SNAPSHOT +/** + * trace_snapshot - take a snapshot of the current buffer. + * + * This causes a swap between the snapshot buffer and the current live + * tracing buffer. You can use this to take snapshots of the live + * trace when some condition is triggered, but continue to trace. + * + * Note, make sure to allocate the snapshot with either + * a tracing_snapshot_alloc(), or by doing it manually + * with: echo 1 > /sys/kernel/debug/tracing/snapshot + * + * If the snapshot buffer is not allocated, it will stop tracing. + * Basically making a permanent snapshot. + */ +void tracing_snapshot(void) +{ + struct trace_array *tr = &global_trace; + struct tracer *tracer = tr->current_trace; + unsigned long flags; + + if (in_nmi()) { + internal_trace_puts("*** SNAPSHOT CALLED FROM NMI CONTEXT ***\n"); + internal_trace_puts("*** snapshot is being ignored ***\n"); + return; + } + + if (!tr->allocated_snapshot) { + internal_trace_puts("*** SNAPSHOT NOT ALLOCATED ***\n"); + internal_trace_puts("*** stopping trace here! ***\n"); + tracing_off(); + return; + } + + /* Note, snapshot can not be used when the tracer uses it */ + if (tracer->use_max_tr) { + internal_trace_puts("*** LATENCY TRACER ACTIVE ***\n"); + internal_trace_puts("*** Can not use snapshot (sorry) ***\n"); + return; + } + + local_irq_save(flags); + update_max_tr(tr, current, smp_processor_id()); + local_irq_restore(flags); +} +EXPORT_SYMBOL_GPL(tracing_snapshot); + +static int resize_buffer_duplicate_size(struct trace_buffer *trace_buf, + struct trace_buffer *size_buf, int cpu_id); +static void set_buffer_entries(struct trace_buffer *buf, unsigned long val); + +static int alloc_snapshot(struct trace_array *tr) +{ + int ret; + + if (!tr->allocated_snapshot) { + + /* allocate spare buffer */ + ret = resize_buffer_duplicate_size(&tr->max_buffer, + &tr->trace_buffer, RING_BUFFER_ALL_CPUS); + if (ret < 0) + return ret; + + tr->allocated_snapshot = true; + } + + return 0; +} + +void free_snapshot(struct trace_array *tr) +{ + /* + * We don't free the ring buffer. instead, resize it because + * The max_tr ring buffer has some state (e.g. ring->clock) and + * we want preserve it. + */ + ring_buffer_resize(tr->max_buffer.buffer, 1, RING_BUFFER_ALL_CPUS); + set_buffer_entries(&tr->max_buffer, 1); + tracing_reset_online_cpus(&tr->max_buffer); + tr->allocated_snapshot = false; +} + +/** + * trace_snapshot_alloc - allocate and take a snapshot of the current buffer. + * + * This is similar to trace_snapshot(), but it will allocate the + * snapshot buffer if it isn't already allocated. Use this only + * where it is safe to sleep, as the allocation may sleep. + * + * This causes a swap between the snapshot buffer and the current live + * tracing buffer. You can use this to take snapshots of the live + * trace when some condition is triggered, but continue to trace. + */ +void tracing_snapshot_alloc(void) +{ + struct trace_array *tr = &global_trace; + int ret; + + ret = alloc_snapshot(tr); + if (WARN_ON(ret < 0)) + return; + + tracing_snapshot(); +} +EXPORT_SYMBOL_GPL(tracing_snapshot_alloc); +#else +void tracing_snapshot(void) +{ + WARN_ONCE(1, "Snapshot feature not enabled, but internal snapshot used"); +} +EXPORT_SYMBOL_GPL(tracing_snapshot); +void tracing_snapshot_alloc(void) +{ + /* Give warning */ + tracing_snapshot(); +} +EXPORT_SYMBOL_GPL(tracing_snapshot_alloc); +#endif /* CONFIG_TRACER_SNAPSHOT */ + +/** * tracing_off - turn off tracing buffers * * This function stops the tracing buffers from recording data. @@ -394,8 +550,8 @@ EXPORT_SYMBOL_GPL(tracing_on); */ void tracing_off(void) { - if (global_trace.buffer) - ring_buffer_record_off(global_trace.buffer); + if (global_trace.trace_buffer.buffer) + ring_buffer_record_off(global_trace.trace_buffer.buffer); /* * This flag is only looked at when buffers haven't been * allocated yet. We don't really care about the race @@ -411,8 +567,8 @@ EXPORT_SYMBOL_GPL(tracing_off); */ int tracing_is_on(void) { - if (global_trace.buffer) - return ring_buffer_record_is_on(global_trace.buffer); + if (global_trace.trace_buffer.buffer) + return ring_buffer_record_is_on(global_trace.trace_buffer.buffer); return !global_trace.buffer_disabled; } EXPORT_SYMBOL_GPL(tracing_is_on); @@ -479,6 +635,7 @@ static const char *trace_options[] = { "disable_on_free", "irq-info", "markers", + "function-trace", NULL }; @@ -490,6 +647,8 @@ static struct { { trace_clock_local, "local", 1 }, { trace_clock_global, "global", 1 }, { trace_clock_counter, "counter", 0 }, + { trace_clock_jiffies, "uptime", 1 }, + { trace_clock, "perf", 1 }, ARCH_TRACE_CLOCKS }; @@ -670,13 +829,14 @@ unsigned long __read_mostly tracing_max_latency; static void __update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu) { - struct trace_array_cpu *data = tr->data[cpu]; - struct trace_array_cpu *max_data; + struct trace_buffer *trace_buf = &tr->trace_buffer; + struct trace_buffer *max_buf = &tr->max_buffer; + struct trace_array_cpu *data = per_cpu_ptr(trace_buf->data, cpu); + struct trace_array_cpu *max_data = per_cpu_ptr(max_buf->data, cpu); - max_tr.cpu = cpu; - max_tr.time_start = data->preempt_timestamp; + max_buf->cpu = cpu; + max_buf->time_start = data->preempt_timestamp; - max_data = max_tr.data[cpu]; max_data->saved_latency = tracing_max_latency; max_data->critical_start = data->critical_start; max_data->critical_end = data->critical_end; @@ -704,23 +864,24 @@ __update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu) void update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu) { - struct ring_buffer *buf = tr->buffer; + struct ring_buffer *buf; - if (trace_stop_count) + if (tr->stop_count) return; WARN_ON_ONCE(!irqs_disabled()); - if (!current_trace->allocated_snapshot) { + if (!tr->allocated_snapshot) { /* Only the nop tracer should hit this when disabling */ - WARN_ON_ONCE(current_trace != &nop_trace); + WARN_ON_ONCE(tr->current_trace != &nop_trace); return; } arch_spin_lock(&ftrace_max_lock); - tr->buffer = max_tr.buffer; - max_tr.buffer = buf; + buf = tr->trace_buffer.buffer; + tr->trace_buffer.buffer = tr->max_buffer.buffer; + tr->max_buffer.buffer = buf; __update_max_tr(tr, tsk, cpu); arch_spin_unlock(&ftrace_max_lock); @@ -739,16 +900,19 @@ update_max_tr_single(struct trace_array *tr, struct task_struct *tsk, int cpu) { int ret; - if (trace_stop_count) + if (tr->stop_count) return; WARN_ON_ONCE(!irqs_disabled()); - if (WARN_ON_ONCE(!current_trace->allocated_snapshot)) + if (tr->allocated_snapshot) { + /* Only the nop tracer should hit this when disabling */ + WARN_ON_ONCE(tr->current_trace != &nop_trace); return; + } arch_spin_lock(&ftrace_max_lock); - ret = ring_buffer_swap_cpu(max_tr.buffer, tr->buffer, cpu); + ret = ring_buffer_swap_cpu(tr->max_buffer.buffer, tr->trace_buffer.buffer, cpu); if (ret == -EBUSY) { /* @@ -757,7 +921,7 @@ update_max_tr_single(struct trace_array *tr, struct task_struct *tsk, int cpu) * the max trace buffer (no one writes directly to it) * and flag that it failed. */ - trace_array_printk(&max_tr, _THIS_IP_, + trace_array_printk_buf(tr->max_buffer.buffer, _THIS_IP_, "Failed to swap buffers due to commit in progress\n"); } @@ -770,37 +934,78 @@ update_max_tr_single(struct trace_array *tr, struct task_struct *tsk, int cpu) static void default_wait_pipe(struct trace_iterator *iter) { - DEFINE_WAIT(wait); + /* Iterators are static, they should be filled or empty */ + if (trace_buffer_iter(iter, iter->cpu_file)) + return; - prepare_to_wait(&trace_wait, &wait, TASK_INTERRUPTIBLE); + ring_buffer_wait(iter->trace_buffer->buffer, iter->cpu_file); +} + +#ifdef CONFIG_FTRACE_STARTUP_TEST +static int run_tracer_selftest(struct tracer *type) +{ + struct trace_array *tr = &global_trace; + struct tracer *saved_tracer = tr->current_trace; + int ret; + + if (!type->selftest || tracing_selftest_disabled) + return 0; /* - * The events can happen in critical sections where - * checking a work queue can cause deadlocks. - * After adding a task to the queue, this flag is set - * only to notify events to try to wake up the queue - * using irq_work. - * - * We don't clear it even if the buffer is no longer - * empty. The flag only causes the next event to run - * irq_work to do the work queue wake up. The worse - * that can happen if we race with !trace_empty() is that - * an event will cause an irq_work to try to wake up - * an empty queue. - * - * There's no reason to protect this flag either, as - * the work queue and irq_work logic will do the necessary - * synchronization for the wake ups. The only thing - * that is necessary is that the wake up happens after - * a task has been queued. It's OK for spurious wake ups. + * Run a selftest on this tracer. + * Here we reset the trace buffer, and set the current + * tracer to be this tracer. The tracer can then run some + * internal tracing to verify that everything is in order. + * If we fail, we do not register this tracer. */ - trace_wakeup_needed = true; + tracing_reset_online_cpus(&tr->trace_buffer); - if (trace_empty(iter)) - schedule(); + tr->current_trace = type; + +#ifdef CONFIG_TRACER_MAX_TRACE + if (type->use_max_tr) { + /* If we expanded the buffers, make sure the max is expanded too */ + if (ring_buffer_expanded) + ring_buffer_resize(tr->max_buffer.buffer, trace_buf_size, + RING_BUFFER_ALL_CPUS); + tr->allocated_snapshot = true; + } +#endif - finish_wait(&trace_wait, &wait); + /* the test is responsible for initializing and enabling */ + pr_info("Testing tracer %s: ", type->name); + ret = type->selftest(type, tr); + /* the test is responsible for resetting too */ + tr->current_trace = saved_tracer; + if (ret) { + printk(KERN_CONT "FAILED!\n"); + /* Add the warning after printing 'FAILED' */ + WARN_ON(1); + return -1; + } + /* Only reset on passing, to avoid touching corrupted buffers */ + tracing_reset_online_cpus(&tr->trace_buffer); + +#ifdef CONFIG_TRACER_MAX_TRACE + if (type->use_max_tr) { + tr->allocated_snapshot = false; + + /* Shrink the max buffer again */ + if (ring_buffer_expanded) + ring_buffer_resize(tr->max_buffer.buffer, 1, + RING_BUFFER_ALL_CPUS); + } +#endif + + printk(KERN_CONT "PASSED\n"); + return 0; } +#else +static inline int run_tracer_selftest(struct tracer *type) +{ + return 0; +} +#endif /* CONFIG_FTRACE_STARTUP_TEST */ /** * register_tracer - register a tracer with the ftrace system. @@ -847,57 +1052,9 @@ int register_tracer(struct tracer *type) if (!type->wait_pipe) type->wait_pipe = default_wait_pipe; - -#ifdef CONFIG_FTRACE_STARTUP_TEST - if (type->selftest && !tracing_selftest_disabled) { - struct tracer *saved_tracer = current_trace; - struct trace_array *tr = &global_trace; - - /* - * Run a selftest on this tracer. - * Here we reset the trace buffer, and set the current - * tracer to be this tracer. The tracer can then run some - * internal tracing to verify that everything is in order. - * If we fail, we do not register this tracer. - */ - tracing_reset_online_cpus(tr); - - current_trace = type; - - if (type->use_max_tr) { - /* If we expanded the buffers, make sure the max is expanded too */ - if (ring_buffer_expanded) - ring_buffer_resize(max_tr.buffer, trace_buf_size, - RING_BUFFER_ALL_CPUS); - type->allocated_snapshot = true; - } - - /* the test is responsible for initializing and enabling */ - pr_info("Testing tracer %s: ", type->name); - ret = type->selftest(type, tr); - /* the test is responsible for resetting too */ - current_trace = saved_tracer; - if (ret) { - printk(KERN_CONT "FAILED!\n"); - /* Add the warning after printing 'FAILED' */ - WARN_ON(1); - goto out; - } - /* Only reset on passing, to avoid touching corrupted buffers */ - tracing_reset_online_cpus(tr); - - if (type->use_max_tr) { - type->allocated_snapshot = false; - - /* Shrink the max buffer again */ - if (ring_buffer_expanded) - ring_buffer_resize(max_tr.buffer, 1, - RING_BUFFER_ALL_CPUS); - } - - printk(KERN_CONT "PASSED\n"); - } -#endif + ret = run_tracer_selftest(type); + if (ret < 0) + goto out; type->next = trace_types; trace_types = type; @@ -917,7 +1074,7 @@ int register_tracer(struct tracer *type) tracing_set_tracer(type->name); default_bootup_tracer = NULL; /* disable other selftests, since this will break it. */ - tracing_selftest_disabled = 1; + tracing_selftest_disabled = true; #ifdef CONFIG_FTRACE_STARTUP_TEST printk(KERN_INFO "Disabling FTRACE selftests due to running tracer '%s'\n", type->name); @@ -927,9 +1084,9 @@ int register_tracer(struct tracer *type) return ret; } -void tracing_reset(struct trace_array *tr, int cpu) +void tracing_reset(struct trace_buffer *buf, int cpu) { - struct ring_buffer *buffer = tr->buffer; + struct ring_buffer *buffer = buf->buffer; if (!buffer) return; @@ -943,9 +1100,9 @@ void tracing_reset(struct trace_array *tr, int cpu) ring_buffer_record_enable(buffer); } -void tracing_reset_online_cpus(struct trace_array *tr) +void tracing_reset_online_cpus(struct trace_buffer *buf) { - struct ring_buffer *buffer = tr->buffer; + struct ring_buffer *buffer = buf->buffer; int cpu; if (!buffer) @@ -956,7 +1113,7 @@ void tracing_reset_online_cpus(struct trace_array *tr) /* Make sure all commits have finished */ synchronize_sched(); - tr->time_start = ftrace_now(tr->cpu); + buf->time_start = ftrace_now(buf->cpu); for_each_online_cpu(cpu) ring_buffer_reset_cpu(buffer, cpu); @@ -966,12 +1123,21 @@ void tracing_reset_online_cpus(struct trace_array *tr) void tracing_reset_current(int cpu) { - tracing_reset(&global_trace, cpu); + tracing_reset(&global_trace.trace_buffer, cpu); } -void tracing_reset_current_online_cpus(void) +void tracing_reset_all_online_cpus(void) { - tracing_reset_online_cpus(&global_trace); + struct trace_array *tr; + + mutex_lock(&trace_types_lock); + list_for_each_entry(tr, &ftrace_trace_arrays, list) { + tracing_reset_online_cpus(&tr->trace_buffer); +#ifdef CONFIG_TRACER_MAX_TRACE + tracing_reset_online_cpus(&tr->max_buffer); +#endif + } + mutex_unlock(&trace_types_lock); } #define SAVED_CMDLINES 128 @@ -994,7 +1160,7 @@ static void trace_init_cmdlines(void) int is_tracing_stopped(void) { - return trace_stop_count; + return global_trace.stop_count; } /** @@ -1026,12 +1192,12 @@ void tracing_start(void) if (tracing_disabled) return; - raw_spin_lock_irqsave(&tracing_start_lock, flags); - if (--trace_stop_count) { - if (trace_stop_count < 0) { + raw_spin_lock_irqsave(&global_trace.start_lock, flags); + if (--global_trace.stop_count) { + if (global_trace.stop_count < 0) { /* Someone screwed up their debugging */ WARN_ON_ONCE(1); - trace_stop_count = 0; + global_trace.stop_count = 0; } goto out; } @@ -1039,19 +1205,52 @@ void tracing_start(void) /* Prevent the buffers from switching */ arch_spin_lock(&ftrace_max_lock); - buffer = global_trace.buffer; + buffer = global_trace.trace_buffer.buffer; if (buffer) ring_buffer_record_enable(buffer); - buffer = max_tr.buffer; +#ifdef CONFIG_TRACER_MAX_TRACE + buffer = global_trace.max_buffer.buffer; if (buffer) ring_buffer_record_enable(buffer); +#endif arch_spin_unlock(&ftrace_max_lock); ftrace_start(); out: - raw_spin_unlock_irqrestore(&tracing_start_lock, flags); + raw_spin_unlock_irqrestore(&global_trace.start_lock, flags); +} + +static void tracing_start_tr(struct trace_array *tr) +{ + struct ring_buffer *buffer; + unsigned long flags; + + if (tracing_disabled) + return; + + /* If global, we need to also start the max tracer */ + if (tr->flags & TRACE_ARRAY_FL_GLOBAL) + return tracing_start(); + + raw_spin_lock_irqsave(&tr->start_lock, flags); + + if (--tr->stop_count) { + if (tr->stop_count < 0) { + /* Someone screwed up their debugging */ + WARN_ON_ONCE(1); + tr->stop_count = 0; + } + goto out; + } + + buffer = tr->trace_buffer.buffer; + if (buffer) + ring_buffer_record_enable(buffer); + + out: + raw_spin_unlock_irqrestore(&tr->start_lock, flags); } /** @@ -1066,25 +1265,48 @@ void tracing_stop(void) unsigned long flags; ftrace_stop(); - raw_spin_lock_irqsave(&tracing_start_lock, flags); - if (trace_stop_count++) + raw_spin_lock_irqsave(&global_trace.start_lock, flags); + if (global_trace.stop_count++) goto out; /* Prevent the buffers from switching */ arch_spin_lock(&ftrace_max_lock); - buffer = global_trace.buffer; + buffer = global_trace.trace_buffer.buffer; if (buffer) ring_buffer_record_disable(buffer); - buffer = max_tr.buffer; +#ifdef CONFIG_TRACER_MAX_TRACE + buffer = global_trace.max_buffer.buffer; if (buffer) ring_buffer_record_disable(buffer); +#endif arch_spin_unlock(&ftrace_max_lock); out: - raw_spin_unlock_irqrestore(&tracing_start_lock, flags); + raw_spin_unlock_irqrestore(&global_trace.start_lock, flags); +} + +static void tracing_stop_tr(struct trace_array *tr) +{ + struct ring_buffer *buffer; + unsigned long flags; + + /* If global, we need to also stop the max tracer */ + if (tr->flags & TRACE_ARRAY_FL_GLOBAL) + return tracing_stop(); + + raw_spin_lock_irqsave(&tr->start_lock, flags); + if (tr->stop_count++) + goto out; + + buffer = tr->trace_buffer.buffer; + if (buffer) + ring_buffer_record_disable(buffer); + + out: + raw_spin_unlock_irqrestore(&tr->start_lock, flags); } void trace_stop_cmdline_recording(void); @@ -1217,11 +1439,6 @@ void __buffer_unlock_commit(struct ring_buffer *buffer, struct ring_buffer_event *event) { __this_cpu_write(trace_cmdline_save, true); - if (trace_wakeup_needed) { - trace_wakeup_needed = false; - /* irq_work_queue() supplies it's own memory barriers */ - irq_work_queue(&trace_work_wakeup); - } ring_buffer_unlock_commit(buffer, event); } @@ -1245,11 +1462,23 @@ void trace_buffer_unlock_commit(struct ring_buffer *buffer, EXPORT_SYMBOL_GPL(trace_buffer_unlock_commit); struct ring_buffer_event * +trace_event_buffer_lock_reserve(struct ring_buffer **current_rb, + struct ftrace_event_file *ftrace_file, + int type, unsigned long len, + unsigned long flags, int pc) +{ + *current_rb = ftrace_file->tr->trace_buffer.buffer; + return trace_buffer_lock_reserve(*current_rb, + type, len, flags, pc); +} +EXPORT_SYMBOL_GPL(trace_event_buffer_lock_reserve); + +struct ring_buffer_event * trace_current_buffer_lock_reserve(struct ring_buffer **current_rb, int type, unsigned long len, unsigned long flags, int pc) { - *current_rb = global_trace.buffer; + *current_rb = global_trace.trace_buffer.buffer; return trace_buffer_lock_reserve(*current_rb, type, len, flags, pc); } @@ -1288,7 +1517,7 @@ trace_function(struct trace_array *tr, int pc) { struct ftrace_event_call *call = &event_function; - struct ring_buffer *buffer = tr->buffer; + struct ring_buffer *buffer = tr->trace_buffer.buffer; struct ring_buffer_event *event; struct ftrace_entry *entry; @@ -1429,13 +1658,14 @@ void ftrace_trace_stack(struct ring_buffer *buffer, unsigned long flags, void __trace_stack(struct trace_array *tr, unsigned long flags, int skip, int pc) { - __ftrace_trace_stack(tr->buffer, flags, skip, pc, NULL); + __ftrace_trace_stack(tr->trace_buffer.buffer, flags, skip, pc, NULL); } /** * trace_dump_stack - record a stack back trace in the trace buffer + * @skip: Number of functions to skip (helper handlers) */ -void trace_dump_stack(void) +void trace_dump_stack(int skip) { unsigned long flags; @@ -1444,8 +1674,13 @@ void trace_dump_stack(void) local_save_flags(flags); - /* skipping 3 traces, seems to get us at the caller of this function */ - __ftrace_trace_stack(global_trace.buffer, flags, 3, preempt_count(), NULL); + /* + * Skip 3 more, seems to get us at the caller of + * this function. + */ + skip += 3; + __ftrace_trace_stack(global_trace.trace_buffer.buffer, + flags, skip, preempt_count(), NULL); } static DEFINE_PER_CPU(int, user_stack_count); @@ -1615,7 +1850,7 @@ void trace_printk_init_buffers(void) * directly here. If the global_trace.buffer is already * allocated here, then this was called by module code. */ - if (global_trace.buffer) + if (global_trace.trace_buffer.buffer) tracing_start_cmdline_record(); } @@ -1675,7 +1910,7 @@ int trace_vbprintk(unsigned long ip, const char *fmt, va_list args) local_save_flags(flags); size = sizeof(*entry) + sizeof(u32) * len; - buffer = tr->buffer; + buffer = tr->trace_buffer.buffer; event = trace_buffer_lock_reserve(buffer, TRACE_BPRINT, size, flags, pc); if (!event) @@ -1698,27 +1933,12 @@ out: } EXPORT_SYMBOL_GPL(trace_vbprintk); -int trace_array_printk(struct trace_array *tr, - unsigned long ip, const char *fmt, ...) -{ - int ret; - va_list ap; - - if (!(trace_flags & TRACE_ITER_PRINTK)) - return 0; - - va_start(ap, fmt); - ret = trace_array_vprintk(tr, ip, fmt, ap); - va_end(ap); - return ret; -} - -int trace_array_vprintk(struct trace_array *tr, - unsigned long ip, const char *fmt, va_list args) +static int +__trace_array_vprintk(struct ring_buffer *buffer, + unsigned long ip, const char *fmt, va_list args) { struct ftrace_event_call *call = &event_print; struct ring_buffer_event *event; - struct ring_buffer *buffer; int len = 0, size, pc; struct print_entry *entry; unsigned long flags; @@ -1746,7 +1966,6 @@ int trace_array_vprintk(struct trace_array *tr, local_save_flags(flags); size = sizeof(*entry) + len + 1; - buffer = tr->buffer; event = trace_buffer_lock_reserve(buffer, TRACE_PRINT, size, flags, pc); if (!event) @@ -1767,6 +1986,42 @@ int trace_array_vprintk(struct trace_array *tr, return len; } +int trace_array_vprintk(struct trace_array *tr, + unsigned long ip, const char *fmt, va_list args) +{ + return __trace_array_vprintk(tr->trace_buffer.buffer, ip, fmt, args); +} + +int trace_array_printk(struct trace_array *tr, + unsigned long ip, const char *fmt, ...) +{ + int ret; + va_list ap; + + if (!(trace_flags & TRACE_ITER_PRINTK)) + return 0; + + va_start(ap, fmt); + ret = trace_array_vprintk(tr, ip, fmt, ap); + va_end(ap); + return ret; +} + +int trace_array_printk_buf(struct ring_buffer *buffer, + unsigned long ip, const char *fmt, ...) +{ + int ret; + va_list ap; + + if (!(trace_flags & TRACE_ITER_PRINTK)) + return 0; + + va_start(ap, fmt); + ret = __trace_array_vprintk(buffer, ip, fmt, ap); + va_end(ap); + return ret; +} + int trace_vprintk(unsigned long ip, const char *fmt, va_list args) { return trace_array_vprintk(&global_trace, ip, fmt, args); @@ -1792,7 +2047,7 @@ peek_next_entry(struct trace_iterator *iter, int cpu, u64 *ts, if (buf_iter) event = ring_buffer_iter_peek(buf_iter, ts); else - event = ring_buffer_peek(iter->tr->buffer, cpu, ts, + event = ring_buffer_peek(iter->trace_buffer->buffer, cpu, ts, lost_events); if (event) { @@ -1807,7 +2062,7 @@ static struct trace_entry * __find_next_entry(struct trace_iterator *iter, int *ent_cpu, unsigned long *missing_events, u64 *ent_ts) { - struct ring_buffer *buffer = iter->tr->buffer; + struct ring_buffer *buffer = iter->trace_buffer->buffer; struct trace_entry *ent, *next = NULL; unsigned long lost_events = 0, next_lost = 0; int cpu_file = iter->cpu_file; @@ -1820,7 +2075,7 @@ __find_next_entry(struct trace_iterator *iter, int *ent_cpu, * If we are in a per_cpu trace file, don't bother by iterating over * all cpu and peek directly. */ - if (cpu_file > TRACE_PIPE_ALL_CPU) { + if (cpu_file > RING_BUFFER_ALL_CPUS) { if (ring_buffer_empty_cpu(buffer, cpu_file)) return NULL; ent = peek_next_entry(iter, cpu_file, ent_ts, missing_events); @@ -1884,7 +2139,7 @@ void *trace_find_next_entry_inc(struct trace_iterator *iter) static void trace_consume(struct trace_iterator *iter) { - ring_buffer_consume(iter->tr->buffer, iter->cpu, &iter->ts, + ring_buffer_consume(iter->trace_buffer->buffer, iter->cpu, &iter->ts, &iter->lost_events); } @@ -1917,13 +2172,12 @@ static void *s_next(struct seq_file *m, void *v, loff_t *pos) void tracing_iter_reset(struct trace_iterator *iter, int cpu) { - struct trace_array *tr = iter->tr; struct ring_buffer_event *event; struct ring_buffer_iter *buf_iter; unsigned long entries = 0; u64 ts; - tr->data[cpu]->skipped_entries = 0; + per_cpu_ptr(iter->trace_buffer->data, cpu)->skipped_entries = 0; buf_iter = trace_buffer_iter(iter, cpu); if (!buf_iter) @@ -1937,13 +2191,13 @@ void tracing_iter_reset(struct trace_iterator *iter, int cpu) * by the timestamp being before the start of the buffer. */ while ((event = ring_buffer_iter_peek(buf_iter, &ts))) { - if (ts >= iter->tr->time_start) + if (ts >= iter->trace_buffer->time_start) break; entries++; ring_buffer_read(buf_iter, NULL); } - tr->data[cpu]->skipped_entries = entries; + per_cpu_ptr(iter->trace_buffer->data, cpu)->skipped_entries = entries; } /* @@ -1953,6 +2207,7 @@ void tracing_iter_reset(struct trace_iterator *iter, int cpu) static void *s_start(struct seq_file *m, loff_t *pos) { struct trace_iterator *iter = m->private; + struct trace_array *tr = iter->tr; int cpu_file = iter->cpu_file; void *p = NULL; loff_t l = 0; @@ -1965,12 +2220,14 @@ static void *s_start(struct seq_file *m, loff_t *pos) * will point to the same string as current_trace->name. */ mutex_lock(&trace_types_lock); - if (unlikely(current_trace && iter->trace->name != current_trace->name)) - *iter->trace = *current_trace; + if (unlikely(tr->current_trace && iter->trace->name != tr->current_trace->name)) + *iter->trace = *tr->current_trace; mutex_unlock(&trace_types_lock); +#ifdef CONFIG_TRACER_MAX_TRACE if (iter->snapshot && iter->trace->use_max_tr) return ERR_PTR(-EBUSY); +#endif if (!iter->snapshot) atomic_inc(&trace_record_cmdline_disabled); @@ -1980,7 +2237,7 @@ static void *s_start(struct seq_file *m, loff_t *pos) iter->cpu = 0; iter->idx = -1; - if (cpu_file == TRACE_PIPE_ALL_CPU) { + if (cpu_file == RING_BUFFER_ALL_CPUS) { for_each_tracing_cpu(cpu) tracing_iter_reset(iter, cpu); } else @@ -2012,17 +2269,21 @@ static void s_stop(struct seq_file *m, void *p) { struct trace_iterator *iter = m->private; +#ifdef CONFIG_TRACER_MAX_TRACE if (iter->snapshot && iter->trace->use_max_tr) return; +#endif if (!iter->snapshot) atomic_dec(&trace_record_cmdline_disabled); + trace_access_unlock(iter->cpu_file); trace_event_read_unlock(); } static void -get_total_entries(struct trace_array *tr, unsigned long *total, unsigned long *entries) +get_total_entries(struct trace_buffer *buf, + unsigned long *total, unsigned long *entries) { unsigned long count; int cpu; @@ -2031,19 +2292,19 @@ get_total_entries(struct trace_array *tr, unsigned long *total, unsigned long *e *entries = 0; for_each_tracing_cpu(cpu) { - count = ring_buffer_entries_cpu(tr->buffer, cpu); + count = ring_buffer_entries_cpu(buf->buffer, cpu); /* * If this buffer has skipped entries, then we hold all * entries for the trace and we need to ignore the * ones before the time stamp. */ - if (tr->data[cpu]->skipped_entries) { - count -= tr->data[cpu]->skipped_entries; + if (per_cpu_ptr(buf->data, cpu)->skipped_entries) { + count -= per_cpu_ptr(buf->data, cpu)->skipped_entries; /* total is the same as the entries */ *total += count; } else *total += count + - ring_buffer_overrun_cpu(tr->buffer, cpu); + ring_buffer_overrun_cpu(buf->buffer, cpu); *entries += count; } } @@ -2060,27 +2321,27 @@ static void print_lat_help_header(struct seq_file *m) seq_puts(m, "# \\ / ||||| \\ | / \n"); } -static void print_event_info(struct trace_array *tr, struct seq_file *m) +static void print_event_info(struct trace_buffer *buf, struct seq_file *m) { unsigned long total; unsigned long entries; - get_total_entries(tr, &total, &entries); + get_total_entries(buf, &total, &entries); seq_printf(m, "# entries-in-buffer/entries-written: %lu/%lu #P:%d\n", entries, total, num_online_cpus()); seq_puts(m, "#\n"); } -static void print_func_help_header(struct trace_array *tr, struct seq_file *m) +static void print_func_help_header(struct trace_buffer *buf, struct seq_file *m) { - print_event_info(tr, m); + print_event_info(buf, m); seq_puts(m, "# TASK-PID CPU# TIMESTAMP FUNCTION\n"); seq_puts(m, "# | | | | |\n"); } -static void print_func_help_header_irq(struct trace_array *tr, struct seq_file *m) +static void print_func_help_header_irq(struct trace_buffer *buf, struct seq_file *m) { - print_event_info(tr, m); + print_event_info(buf, m); seq_puts(m, "# _-----=> irqs-off\n"); seq_puts(m, "# / _----=> need-resched\n"); seq_puts(m, "# | / _---=> hardirq/softirq\n"); @@ -2094,16 +2355,16 @@ void print_trace_header(struct seq_file *m, struct trace_iterator *iter) { unsigned long sym_flags = (trace_flags & TRACE_ITER_SYM_MASK); - struct trace_array *tr = iter->tr; - struct trace_array_cpu *data = tr->data[tr->cpu]; - struct tracer *type = current_trace; + struct trace_buffer *buf = iter->trace_buffer; + struct trace_array_cpu *data = per_cpu_ptr(buf->data, buf->cpu); + struct tracer *type = iter->trace; unsigned long entries; unsigned long total; const char *name = "preemption"; name = type->name; - get_total_entries(tr, &total, &entries); + get_total_entries(buf, &total, &entries); seq_printf(m, "# %s latency trace v1.1.5 on %s\n", name, UTS_RELEASE); @@ -2114,7 +2375,7 @@ print_trace_header(struct seq_file *m, struct trace_iterator *iter) nsecs_to_usecs(data->saved_latency), entries, total, - tr->cpu, + buf->cpu, #if defined(CONFIG_PREEMPT_NONE) "server", #elif defined(CONFIG_PREEMPT_VOLUNTARY) @@ -2165,7 +2426,7 @@ static void test_cpu_buff_start(struct trace_iterator *iter) if (cpumask_test_cpu(iter->cpu, iter->started)) return; - if (iter->tr->data[iter->cpu]->skipped_entries) + if (per_cpu_ptr(iter->trace_buffer->data, iter->cpu)->skipped_entries) return; cpumask_set_cpu(iter->cpu, iter->started); @@ -2288,14 +2549,14 @@ int trace_empty(struct trace_iterator *iter) int cpu; /* If we are looking at one CPU buffer, only check that one */ - if (iter->cpu_file != TRACE_PIPE_ALL_CPU) { + if (iter->cpu_file != RING_BUFFER_ALL_CPUS) { cpu = iter->cpu_file; buf_iter = trace_buffer_iter(iter, cpu); if (buf_iter) { if (!ring_buffer_iter_empty(buf_iter)) return 0; } else { - if (!ring_buffer_empty_cpu(iter->tr->buffer, cpu)) + if (!ring_buffer_empty_cpu(iter->trace_buffer->buffer, cpu)) return 0; } return 1; @@ -2307,7 +2568,7 @@ int trace_empty(struct trace_iterator *iter) if (!ring_buffer_iter_empty(buf_iter)) return 0; } else { - if (!ring_buffer_empty_cpu(iter->tr->buffer, cpu)) + if (!ring_buffer_empty_cpu(iter->trace_buffer->buffer, cpu)) return 0; } } @@ -2331,6 +2592,11 @@ enum print_line_t print_trace_line(struct trace_iterator *iter) return ret; } + if (iter->ent->type == TRACE_BPUTS && + trace_flags & TRACE_ITER_PRINTK && + trace_flags & TRACE_ITER_PRINTK_MSGONLY) + return trace_print_bputs_msg_only(iter); + if (iter->ent->type == TRACE_BPRINT && trace_flags & TRACE_ITER_PRINTK && trace_flags & TRACE_ITER_PRINTK_MSGONLY) @@ -2385,9 +2651,9 @@ void trace_default_header(struct seq_file *m) } else { if (!(trace_flags & TRACE_ITER_VERBOSE)) { if (trace_flags & TRACE_ITER_IRQ_INFO) - print_func_help_header_irq(iter->tr, m); + print_func_help_header_irq(iter->trace_buffer, m); else - print_func_help_header(iter->tr, m); + print_func_help_header(iter->trace_buffer, m); } } } @@ -2400,6 +2666,50 @@ static void test_ftrace_alive(struct seq_file *m) seq_printf(m, "# MAY BE MISSING FUNCTION EVENTS\n"); } +#ifdef CONFIG_TRACER_MAX_TRACE +static void show_snapshot_main_help(struct seq_file *m) +{ + seq_printf(m, "# echo 0 > snapshot : Clears and frees snapshot buffer\n"); + seq_printf(m, "# echo 1 > snapshot : Allocates snapshot buffer, if not already allocated.\n"); + seq_printf(m, "# Takes a snapshot of the main buffer.\n"); + seq_printf(m, "# echo 2 > snapshot : Clears snapshot buffer (but does not allocate)\n"); + seq_printf(m, "# (Doesn't have to be '2' works with any number that\n"); + seq_printf(m, "# is not a '0' or '1')\n"); +} + +static void show_snapshot_percpu_help(struct seq_file *m) +{ + seq_printf(m, "# echo 0 > snapshot : Invalid for per_cpu snapshot file.\n"); +#ifdef CONFIG_RING_BUFFER_ALLOW_SWAP + seq_printf(m, "# echo 1 > snapshot : Allocates snapshot buffer, if not already allocated.\n"); + seq_printf(m, "# Takes a snapshot of the main buffer for this cpu.\n"); +#else + seq_printf(m, "# echo 1 > snapshot : Not supported with this kernel.\n"); + seq_printf(m, "# Must use main snapshot file to allocate.\n"); +#endif + seq_printf(m, "# echo 2 > snapshot : Clears this cpu's snapshot buffer (but does not allocate)\n"); + seq_printf(m, "# (Doesn't have to be '2' works with any number that\n"); + seq_printf(m, "# is not a '0' or '1')\n"); +} + +static void print_snapshot_help(struct seq_file *m, struct trace_iterator *iter) +{ + if (iter->tr->allocated_snapshot) + seq_printf(m, "#\n# * Snapshot is allocated *\n#\n"); + else + seq_printf(m, "#\n# * Snapshot is freed *\n#\n"); + + seq_printf(m, "# Snapshot commands:\n"); + if (iter->cpu_file == RING_BUFFER_ALL_CPUS) + show_snapshot_main_help(m); + else + show_snapshot_percpu_help(m); +} +#else +/* Should never be called */ +static inline void print_snapshot_help(struct seq_file *m, struct trace_iterator *iter) { } +#endif + static int s_show(struct seq_file *m, void *v) { struct trace_iterator *iter = v; @@ -2411,7 +2721,9 @@ static int s_show(struct seq_file *m, void *v) seq_puts(m, "#\n"); test_ftrace_alive(m); } - if (iter->trace && iter->trace->print_header) + if (iter->snapshot && trace_empty(iter)) + print_snapshot_help(m, iter); + else if (iter->trace && iter->trace->print_header) iter->trace->print_header(m); else trace_default_header(m); @@ -2452,7 +2764,8 @@ static const struct seq_operations tracer_seq_ops = { static struct trace_iterator * __tracing_open(struct inode *inode, struct file *file, bool snapshot) { - long cpu_file = (long) inode->i_private; + struct trace_cpu *tc = inode->i_private; + struct trace_array *tr = tc->tr; struct trace_iterator *iter; int cpu; @@ -2477,26 +2790,31 @@ __tracing_open(struct inode *inode, struct file *file, bool snapshot) if (!iter->trace) goto fail; - *iter->trace = *current_trace; + *iter->trace = *tr->current_trace; if (!zalloc_cpumask_var(&iter->started, GFP_KERNEL)) goto fail; - if (current_trace->print_max || snapshot) - iter->tr = &max_tr; + iter->tr = tr; + +#ifdef CONFIG_TRACER_MAX_TRACE + /* Currently only the top directory has a snapshot */ + if (tr->current_trace->print_max || snapshot) + iter->trace_buffer = &tr->max_buffer; else - iter->tr = &global_trace; +#endif + iter->trace_buffer = &tr->trace_buffer; iter->snapshot = snapshot; iter->pos = -1; mutex_init(&iter->mutex); - iter->cpu_file = cpu_file; + iter->cpu_file = tc->cpu; /* Notify the tracer early; before we stop tracing. */ if (iter->trace && iter->trace->open) iter->trace->open(iter); /* Annotate start of buffers if we had overruns */ - if (ring_buffer_overruns(iter->tr->buffer)) + if (ring_buffer_overruns(iter->trace_buffer->buffer)) iter->iter_flags |= TRACE_FILE_ANNOTATE; /* Output in nanoseconds only if we are using a clock in nanoseconds. */ @@ -2505,12 +2823,12 @@ __tracing_open(struct inode *inode, struct file *file, bool snapshot) /* stop the trace while dumping if we are not opening "snapshot" */ if (!iter->snapshot) - tracing_stop(); + tracing_stop_tr(tr); - if (iter->cpu_file == TRACE_PIPE_ALL_CPU) { + if (iter->cpu_file == RING_BUFFER_ALL_CPUS) { for_each_tracing_cpu(cpu) { iter->buffer_iter[cpu] = - ring_buffer_read_prepare(iter->tr->buffer, cpu); + ring_buffer_read_prepare(iter->trace_buffer->buffer, cpu); } ring_buffer_read_prepare_sync(); for_each_tracing_cpu(cpu) { @@ -2520,12 +2838,14 @@ __tracing_open(struct inode *inode, struct file *file, bool snapshot) } else { cpu = iter->cpu_file; iter->buffer_iter[cpu] = - ring_buffer_read_prepare(iter->tr->buffer, cpu); + ring_buffer_read_prepare(iter->trace_buffer->buffer, cpu); ring_buffer_read_prepare_sync(); ring_buffer_read_start(iter->buffer_iter[cpu]); tracing_iter_reset(iter, cpu); } + tr->ref++; + mutex_unlock(&trace_types_lock); return iter; @@ -2552,14 +2872,20 @@ static int tracing_release(struct inode *inode, struct file *file) { struct seq_file *m = file->private_data; struct trace_iterator *iter; + struct trace_array *tr; int cpu; if (!(file->f_mode & FMODE_READ)) return 0; iter = m->private; + tr = iter->tr; mutex_lock(&trace_types_lock); + + WARN_ON(!tr->ref); + tr->ref--; + for_each_tracing_cpu(cpu) { if (iter->buffer_iter[cpu]) ring_buffer_read_finish(iter->buffer_iter[cpu]); @@ -2570,7 +2896,7 @@ static int tracing_release(struct inode *inode, struct file *file) if (!iter->snapshot) /* reenable tracing if it was previously enabled */ - tracing_start(); + tracing_start_tr(tr); mutex_unlock(&trace_types_lock); mutex_destroy(&iter->mutex); @@ -2589,12 +2915,13 @@ static int tracing_open(struct inode *inode, struct file *file) /* If this file was open for write, then erase contents */ if ((file->f_mode & FMODE_WRITE) && (file->f_flags & O_TRUNC)) { - long cpu = (long) inode->i_private; + struct trace_cpu *tc = inode->i_private; + struct trace_array *tr = tc->tr; - if (cpu == TRACE_PIPE_ALL_CPU) - tracing_reset_online_cpus(&global_trace); + if (tc->cpu == RING_BUFFER_ALL_CPUS) + tracing_reset_online_cpus(&tr->trace_buffer); else - tracing_reset(&global_trace, cpu); + tracing_reset(&tr->trace_buffer, tc->cpu); } if (file->f_mode & FMODE_READ) { @@ -2741,8 +3068,9 @@ static ssize_t tracing_cpumask_write(struct file *filp, const char __user *ubuf, size_t count, loff_t *ppos) { - int err, cpu; + struct trace_array *tr = filp->private_data; cpumask_var_t tracing_cpumask_new; + int err, cpu; if (!alloc_cpumask_var(&tracing_cpumask_new, GFP_KERNEL)) return -ENOMEM; @@ -2762,13 +3090,13 @@ tracing_cpumask_write(struct file *filp, const char __user *ubuf, */ if (cpumask_test_cpu(cpu, tracing_cpumask) && !cpumask_test_cpu(cpu, tracing_cpumask_new)) { - atomic_inc(&global_trace.data[cpu]->disabled); - ring_buffer_record_disable_cpu(global_trace.buffer, cpu); + atomic_inc(&per_cpu_ptr(tr->trace_buffer.data, cpu)->disabled); + ring_buffer_record_disable_cpu(tr->trace_buffer.buffer, cpu); } if (!cpumask_test_cpu(cpu, tracing_cpumask) && cpumask_test_cpu(cpu, tracing_cpumask_new)) { - atomic_dec(&global_trace.data[cpu]->disabled); - ring_buffer_record_enable_cpu(global_trace.buffer, cpu); + atomic_dec(&per_cpu_ptr(tr->trace_buffer.data, cpu)->disabled); + ring_buffer_record_enable_cpu(tr->trace_buffer.buffer, cpu); } } arch_spin_unlock(&ftrace_max_lock); @@ -2797,12 +3125,13 @@ static const struct file_operations tracing_cpumask_fops = { static int tracing_trace_options_show(struct seq_file *m, void *v) { struct tracer_opt *trace_opts; + struct trace_array *tr = m->private; u32 tracer_flags; int i; mutex_lock(&trace_types_lock); - tracer_flags = current_trace->flags->val; - trace_opts = current_trace->flags->opts; + tracer_flags = tr->current_trace->flags->val; + trace_opts = tr->current_trace->flags->opts; for (i = 0; trace_options[i]; i++) { if (trace_flags & (1 << i)) @@ -2857,11 +3186,25 @@ static int set_tracer_option(struct tracer *trace, char *cmp, int neg) return -EINVAL; } -static void set_tracer_flags(unsigned int mask, int enabled) +/* Some tracers require overwrite to stay enabled */ +int trace_keep_overwrite(struct tracer *tracer, u32 mask, int set) +{ + if (tracer->enabled && (mask & TRACE_ITER_OVERWRITE) && !set) + return -1; + + return 0; +} + +int set_tracer_flag(struct trace_array *tr, unsigned int mask, int enabled) { /* do nothing if flag is already set */ if (!!(trace_flags & mask) == !!enabled) - return; + return 0; + + /* Give the tracer a chance to approve the change */ + if (tr->current_trace->flag_changed) + if (tr->current_trace->flag_changed(tr->current_trace, mask, !!enabled)) + return -EINVAL; if (enabled) trace_flags |= mask; @@ -2871,18 +3214,24 @@ static void set_tracer_flags(unsigned int mask, int enabled) if (mask == TRACE_ITER_RECORD_CMD) trace_event_enable_cmd_record(enabled); - if (mask == TRACE_ITER_OVERWRITE) - ring_buffer_change_overwrite(global_trace.buffer, enabled); + if (mask == TRACE_ITER_OVERWRITE) { + ring_buffer_change_overwrite(tr->trace_buffer.buffer, enabled); +#ifdef CONFIG_TRACER_MAX_TRACE + ring_buffer_change_overwrite(tr->max_buffer.buffer, enabled); +#endif + } if (mask == TRACE_ITER_PRINTK) trace_printk_start_stop_comm(enabled); + + return 0; } -static int trace_set_options(char *option) +static int trace_set_options(struct trace_array *tr, char *option) { char *cmp; int neg = 0; - int ret = 0; + int ret = -ENODEV; int i; cmp = strstrip(option); @@ -2892,19 +3241,20 @@ static int trace_set_options(char *option) cmp += 2; } + mutex_lock(&trace_types_lock); + for (i = 0; trace_options[i]; i++) { if (strcmp(cmp, trace_options[i]) == 0) { - set_tracer_flags(1 << i, !neg); + ret = set_tracer_flag(tr, 1 << i, !neg); break; } } /* If no option could be set, test the specific tracer options */ - if (!trace_options[i]) { - mutex_lock(&trace_types_lock); - ret = set_tracer_option(current_trace, cmp, neg); - mutex_unlock(&trace_types_lock); - } + if (!trace_options[i]) + ret = set_tracer_option(tr->current_trace, cmp, neg); + + mutex_unlock(&trace_types_lock); return ret; } @@ -2913,7 +3263,10 @@ static ssize_t tracing_trace_options_write(struct file *filp, const char __user *ubuf, size_t cnt, loff_t *ppos) { + struct seq_file *m = filp->private_data; + struct trace_array *tr = m->private; char buf[64]; + int ret; if (cnt >= sizeof(buf)) return -EINVAL; @@ -2923,7 +3276,9 @@ tracing_trace_options_write(struct file *filp, const char __user *ubuf, buf[cnt] = 0; - trace_set_options(buf); + ret = trace_set_options(tr, buf); + if (ret < 0) + return ret; *ppos += cnt; @@ -2934,7 +3289,8 @@ static int tracing_trace_options_open(struct inode *inode, struct file *file) { if (tracing_disabled) return -ENODEV; - return single_open(file, tracing_trace_options_show, NULL); + + return single_open(file, tracing_trace_options_show, inode->i_private); } static const struct file_operations tracing_iter_fops = { @@ -2947,20 +3303,84 @@ static const struct file_operations tracing_iter_fops = { static const char readme_msg[] = "tracing mini-HOWTO:\n\n" - "# mount -t debugfs nodev /sys/kernel/debug\n\n" - "# cat /sys/kernel/debug/tracing/available_tracers\n" - "wakeup wakeup_rt preemptirqsoff preemptoff irqsoff function nop\n\n" - "# cat /sys/kernel/debug/tracing/current_tracer\n" - "nop\n" - "# echo wakeup > /sys/kernel/debug/tracing/current_tracer\n" - "# cat /sys/kernel/debug/tracing/current_tracer\n" - "wakeup\n" - "# cat /sys/kernel/debug/tracing/trace_options\n" - "noprint-parent nosym-offset nosym-addr noverbose\n" - "# echo print-parent > /sys/kernel/debug/tracing/trace_options\n" - "# echo 1 > /sys/kernel/debug/tracing/tracing_on\n" - "# cat /sys/kernel/debug/tracing/trace > /tmp/trace.txt\n" - "# echo 0 > /sys/kernel/debug/tracing/tracing_on\n" + "# echo 0 > tracing_on : quick way to disable tracing\n" + "# echo 1 > tracing_on : quick way to re-enable tracing\n\n" + " Important files:\n" + " trace\t\t\t- The static contents of the buffer\n" + "\t\t\t To clear the buffer write into this file: echo > trace\n" + " trace_pipe\t\t- A consuming read to see the contents of the buffer\n" + " current_tracer\t- function and latency tracers\n" + " available_tracers\t- list of configured tracers for current_tracer\n" + " buffer_size_kb\t- view and modify size of per cpu buffer\n" + " buffer_total_size_kb - view total size of all cpu buffers\n\n" + " trace_clock\t\t-change the clock used to order events\n" + " local: Per cpu clock but may not be synced across CPUs\n" + " global: Synced across CPUs but slows tracing down.\n" + " counter: Not a clock, but just an increment\n" + " uptime: Jiffy counter from time of boot\n" + " perf: Same clock that perf events use\n" +#ifdef CONFIG_X86_64 + " x86-tsc: TSC cycle counter\n" +#endif + "\n trace_marker\t\t- Writes into this file writes into the kernel buffer\n" + " tracing_cpumask\t- Limit which CPUs to trace\n" + " instances\t\t- Make sub-buffers with: mkdir instances/foo\n" + "\t\t\t Remove sub-buffer with rmdir\n" + " trace_options\t\t- Set format or modify how tracing happens\n" + "\t\t\t Disable an option by adding a suffix 'no' to the option name\n" +#ifdef CONFIG_DYNAMIC_FTRACE + "\n available_filter_functions - list of functions that can be filtered on\n" + " set_ftrace_filter\t- echo function name in here to only trace these functions\n" + " accepts: func_full_name, *func_end, func_begin*, *func_middle*\n" + " modules: Can select a group via module\n" + " Format: :mod:<module-name>\n" + " example: echo :mod:ext3 > set_ftrace_filter\n" + " triggers: a command to perform when function is hit\n" + " Format: <function>:<trigger>[:count]\n" + " trigger: traceon, traceoff\n" + " enable_event:<system>:<event>\n" + " disable_event:<system>:<event>\n" +#ifdef CONFIG_STACKTRACE + " stacktrace\n" +#endif +#ifdef CONFIG_TRACER_SNAPSHOT + " snapshot\n" +#endif + " example: echo do_fault:traceoff > set_ftrace_filter\n" + " echo do_trap:traceoff:3 > set_ftrace_filter\n" + " The first one will disable tracing every time do_fault is hit\n" + " The second will disable tracing at most 3 times when do_trap is hit\n" + " The first time do trap is hit and it disables tracing, the counter\n" + " will decrement to 2. If tracing is already disabled, the counter\n" + " will not decrement. It only decrements when the trigger did work\n" + " To remove trigger without count:\n" + " echo '!<function>:<trigger> > set_ftrace_filter\n" + " To remove trigger with a count:\n" + " echo '!<function>:<trigger>:0 > set_ftrace_filter\n" + " set_ftrace_notrace\t- echo function name in here to never trace.\n" + " accepts: func_full_name, *func_end, func_begin*, *func_middle*\n" + " modules: Can select a group via module command :mod:\n" + " Does not accept triggers\n" +#endif /* CONFIG_DYNAMIC_FTRACE */ +#ifdef CONFIG_FUNCTION_TRACER + " set_ftrace_pid\t- Write pid(s) to only function trace those pids (function)\n" +#endif +#ifdef CONFIG_FUNCTION_GRAPH_TRACER + " set_graph_function\t- Trace the nested calls of a function (function_graph)\n" + " max_graph_depth\t- Trace a limited depth of nested calls (0 is unlimited)\n" +#endif +#ifdef CONFIG_TRACER_SNAPSHOT + "\n snapshot\t\t- Like 'trace' but shows the content of the static snapshot buffer\n" + "\t\t\t Read the contents for more information\n" +#endif +#ifdef CONFIG_STACKTRACE + " stack_trace\t\t- Shows the max stack trace when active\n" + " stack_max_size\t- Shows current max stack size that was traced\n" + "\t\t\t Write into this file to reset the max size (trigger a new trace)\n" +#ifdef CONFIG_DYNAMIC_FTRACE + " stack_trace_filter\t- Like set_ftrace_filter but limits what stack_trace traces\n" +#endif +#endif /* CONFIG_STACKTRACE */ ; static ssize_t @@ -3032,11 +3452,12 @@ static ssize_t tracing_set_trace_read(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos) { + struct trace_array *tr = filp->private_data; char buf[MAX_TRACER_SIZE+2]; int r; mutex_lock(&trace_types_lock); - r = sprintf(buf, "%s\n", current_trace->name); + r = sprintf(buf, "%s\n", tr->current_trace->name); mutex_unlock(&trace_types_lock); return simple_read_from_buffer(ubuf, cnt, ppos, buf, r); @@ -3044,43 +3465,48 @@ tracing_set_trace_read(struct file *filp, char __user *ubuf, int tracer_init(struct tracer *t, struct trace_array *tr) { - tracing_reset_online_cpus(tr); + tracing_reset_online_cpus(&tr->trace_buffer); return t->init(tr); } -static void set_buffer_entries(struct trace_array *tr, unsigned long val) +static void set_buffer_entries(struct trace_buffer *buf, unsigned long val) { int cpu; + for_each_tracing_cpu(cpu) - tr->data[cpu]->entries = val; + per_cpu_ptr(buf->data, cpu)->entries = val; } +#ifdef CONFIG_TRACER_MAX_TRACE /* resize @tr's buffer to the size of @size_tr's entries */ -static int resize_buffer_duplicate_size(struct trace_array *tr, - struct trace_array *size_tr, int cpu_id) +static int resize_buffer_duplicate_size(struct trace_buffer *trace_buf, + struct trace_buffer *size_buf, int cpu_id) { int cpu, ret = 0; if (cpu_id == RING_BUFFER_ALL_CPUS) { for_each_tracing_cpu(cpu) { - ret = ring_buffer_resize(tr->buffer, - size_tr->data[cpu]->entries, cpu); + ret = ring_buffer_resize(trace_buf->buffer, + per_cpu_ptr(size_buf->data, cpu)->entries, cpu); if (ret < 0) break; - tr->data[cpu]->entries = size_tr->data[cpu]->entries; + per_cpu_ptr(trace_buf->data, cpu)->entries = + per_cpu_ptr(size_buf->data, cpu)->entries; } } else { - ret = ring_buffer_resize(tr->buffer, - size_tr->data[cpu_id]->entries, cpu_id); + ret = ring_buffer_resize(trace_buf->buffer, + per_cpu_ptr(size_buf->data, cpu_id)->entries, cpu_id); if (ret == 0) - tr->data[cpu_id]->entries = - size_tr->data[cpu_id]->entries; + per_cpu_ptr(trace_buf->data, cpu_id)->entries = + per_cpu_ptr(size_buf->data, cpu_id)->entries; } return ret; } +#endif /* CONFIG_TRACER_MAX_TRACE */ -static int __tracing_resize_ring_buffer(unsigned long size, int cpu) +static int __tracing_resize_ring_buffer(struct trace_array *tr, + unsigned long size, int cpu) { int ret; @@ -3089,23 +3515,25 @@ static int __tracing_resize_ring_buffer(unsigned long size, int cpu) * we use the size that was given, and we can forget about * expanding it later. */ - ring_buffer_expanded = 1; + ring_buffer_expanded = true; /* May be called before buffers are initialized */ - if (!global_trace.buffer) + if (!tr->trace_buffer.buffer) return 0; - ret = ring_buffer_resize(global_trace.buffer, size, cpu); + ret = ring_buffer_resize(tr->trace_buffer.buffer, size, cpu); if (ret < 0) return ret; - if (!current_trace->use_max_tr) +#ifdef CONFIG_TRACER_MAX_TRACE + if (!(tr->flags & TRACE_ARRAY_FL_GLOBAL) || + !tr->current_trace->use_max_tr) goto out; - ret = ring_buffer_resize(max_tr.buffer, size, cpu); + ret = ring_buffer_resize(tr->max_buffer.buffer, size, cpu); if (ret < 0) { - int r = resize_buffer_duplicate_size(&global_trace, - &global_trace, cpu); + int r = resize_buffer_duplicate_size(&tr->trace_buffer, + &tr->trace_buffer, cpu); if (r < 0) { /* * AARGH! We are left with different @@ -3128,20 +3556,23 @@ static int __tracing_resize_ring_buffer(unsigned long size, int cpu) } if (cpu == RING_BUFFER_ALL_CPUS) - set_buffer_entries(&max_tr, size); + set_buffer_entries(&tr->max_buffer, size); else - max_tr.data[cpu]->entries = size; + per_cpu_ptr(tr->max_buffer.data, cpu)->entries = size; out: +#endif /* CONFIG_TRACER_MAX_TRACE */ + if (cpu == RING_BUFFER_ALL_CPUS) - set_buffer_entries(&global_trace, size); + set_buffer_entries(&tr->trace_buffer, size); else - global_trace.data[cpu]->entries = size; + per_cpu_ptr(tr->trace_buffer.data, cpu)->entries = size; return ret; } -static ssize_t tracing_resize_ring_buffer(unsigned long size, int cpu_id) +static ssize_t tracing_resize_ring_buffer(struct trace_array *tr, + unsigned long size, int cpu_id) { int ret = size; @@ -3155,7 +3586,7 @@ static ssize_t tracing_resize_ring_buffer(unsigned long size, int cpu_id) } } - ret = __tracing_resize_ring_buffer(size, cpu_id); + ret = __tracing_resize_ring_buffer(tr, size, cpu_id); if (ret < 0) ret = -ENOMEM; @@ -3182,7 +3613,7 @@ int tracing_update_buffers(void) mutex_lock(&trace_types_lock); if (!ring_buffer_expanded) - ret = __tracing_resize_ring_buffer(trace_buf_size, + ret = __tracing_resize_ring_buffer(&global_trace, trace_buf_size, RING_BUFFER_ALL_CPUS); mutex_unlock(&trace_types_lock); @@ -3192,7 +3623,7 @@ int tracing_update_buffers(void) struct trace_option_dentry; static struct trace_option_dentry * -create_trace_option_files(struct tracer *tracer); +create_trace_option_files(struct trace_array *tr, struct tracer *tracer); static void destroy_trace_option_files(struct trace_option_dentry *topts); @@ -3202,13 +3633,15 @@ static int tracing_set_tracer(const char *buf) static struct trace_option_dentry *topts; struct trace_array *tr = &global_trace; struct tracer *t; +#ifdef CONFIG_TRACER_MAX_TRACE bool had_max_tr; +#endif int ret = 0; mutex_lock(&trace_types_lock); if (!ring_buffer_expanded) { - ret = __tracing_resize_ring_buffer(trace_buf_size, + ret = __tracing_resize_ring_buffer(tr, trace_buf_size, RING_BUFFER_ALL_CPUS); if (ret < 0) goto out; @@ -3223,15 +3656,21 @@ static int tracing_set_tracer(const char *buf) ret = -EINVAL; goto out; } - if (t == current_trace) + if (t == tr->current_trace) goto out; trace_branch_disable(); - if (current_trace->reset) - current_trace->reset(tr); - had_max_tr = current_trace->allocated_snapshot; - current_trace = &nop_trace; + tr->current_trace->enabled = false; + + if (tr->current_trace->reset) + tr->current_trace->reset(tr); + + /* Current trace needs to be nop_trace before synchronize_sched */ + tr->current_trace = &nop_trace; + +#ifdef CONFIG_TRACER_MAX_TRACE + had_max_tr = tr->allocated_snapshot; if (had_max_tr && !t->use_max_tr) { /* @@ -3242,27 +3681,20 @@ static int tracing_set_tracer(const char *buf) * so a synchronized_sched() is sufficient. */ synchronize_sched(); - /* - * We don't free the ring buffer. instead, resize it because - * The max_tr ring buffer has some state (e.g. ring->clock) and - * we want preserve it. - */ - ring_buffer_resize(max_tr.buffer, 1, RING_BUFFER_ALL_CPUS); - set_buffer_entries(&max_tr, 1); - tracing_reset_online_cpus(&max_tr); - current_trace->allocated_snapshot = false; + free_snapshot(tr); } +#endif destroy_trace_option_files(topts); - topts = create_trace_option_files(t); + topts = create_trace_option_files(tr, t); + +#ifdef CONFIG_TRACER_MAX_TRACE if (t->use_max_tr && !had_max_tr) { - /* we need to make per cpu buffer sizes equivalent */ - ret = resize_buffer_duplicate_size(&max_tr, &global_trace, - RING_BUFFER_ALL_CPUS); + ret = alloc_snapshot(tr); if (ret < 0) goto out; - t->allocated_snapshot = true; } +#endif if (t->init) { ret = tracer_init(t, tr); @@ -3270,7 +3702,8 @@ static int tracing_set_tracer(const char *buf) goto out; } - current_trace = t; + tr->current_trace = t; + tr->current_trace->enabled = true; trace_branch_enable(tr); out: mutex_unlock(&trace_types_lock); @@ -3344,7 +3777,8 @@ tracing_max_lat_write(struct file *filp, const char __user *ubuf, static int tracing_open_pipe(struct inode *inode, struct file *filp) { - long cpu_file = (long) inode->i_private; + struct trace_cpu *tc = inode->i_private; + struct trace_array *tr = tc->tr; struct trace_iterator *iter; int ret = 0; @@ -3369,7 +3803,7 @@ static int tracing_open_pipe(struct inode *inode, struct file *filp) ret = -ENOMEM; goto fail; } - *iter->trace = *current_trace; + *iter->trace = *tr->current_trace; if (!alloc_cpumask_var(&iter->started, GFP_KERNEL)) { ret = -ENOMEM; @@ -3386,8 +3820,9 @@ static int tracing_open_pipe(struct inode *inode, struct file *filp) if (trace_clocks[trace_clock_id].in_ns) iter->iter_flags |= TRACE_FILE_TIME_IN_NS; - iter->cpu_file = cpu_file; - iter->tr = &global_trace; + iter->cpu_file = tc->cpu; + iter->tr = tc->tr; + iter->trace_buffer = &tc->tr->trace_buffer; mutex_init(&iter->mutex); filp->private_data = iter; @@ -3426,24 +3861,28 @@ static int tracing_release_pipe(struct inode *inode, struct file *file) } static unsigned int -tracing_poll_pipe(struct file *filp, poll_table *poll_table) +trace_poll(struct trace_iterator *iter, struct file *filp, poll_table *poll_table) { - struct trace_iterator *iter = filp->private_data; + /* Iterators are static, they should be filled or empty */ + if (trace_buffer_iter(iter, iter->cpu_file)) + return POLLIN | POLLRDNORM; - if (trace_flags & TRACE_ITER_BLOCK) { + if (trace_flags & TRACE_ITER_BLOCK) /* * Always select as readable when in blocking mode */ return POLLIN | POLLRDNORM; - } else { - if (!trace_empty(iter)) - return POLLIN | POLLRDNORM; - poll_wait(filp, &trace_wait, poll_table); - if (!trace_empty(iter)) - return POLLIN | POLLRDNORM; + else + return ring_buffer_poll_wait(iter->trace_buffer->buffer, iter->cpu_file, + filp, poll_table); +} - return 0; - } +static unsigned int +tracing_poll_pipe(struct file *filp, poll_table *poll_table) +{ + struct trace_iterator *iter = filp->private_data; + + return trace_poll(iter, filp, poll_table); } /* @@ -3509,6 +3948,7 @@ tracing_read_pipe(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos) { struct trace_iterator *iter = filp->private_data; + struct trace_array *tr = iter->tr; ssize_t sret; /* return any leftover data */ @@ -3520,8 +3960,8 @@ tracing_read_pipe(struct file *filp, char __user *ubuf, /* copy the tracer to avoid using a global lock all around */ mutex_lock(&trace_types_lock); - if (unlikely(iter->trace->name != current_trace->name)) - *iter->trace = *current_trace; + if (unlikely(iter->trace->name != tr->current_trace->name)) + *iter->trace = *tr->current_trace; mutex_unlock(&trace_types_lock); /* @@ -3677,6 +4117,7 @@ static ssize_t tracing_splice_read_pipe(struct file *filp, .ops = &tracing_pipe_buf_ops, .spd_release = tracing_spd_release_pipe, }; + struct trace_array *tr = iter->tr; ssize_t ret; size_t rem; unsigned int i; @@ -3686,8 +4127,8 @@ static ssize_t tracing_splice_read_pipe(struct file *filp, /* copy the tracer to avoid using a global lock all around */ mutex_lock(&trace_types_lock); - if (unlikely(iter->trace->name != current_trace->name)) - *iter->trace = *current_trace; + if (unlikely(iter->trace->name != tr->current_trace->name)) + *iter->trace = *tr->current_trace; mutex_unlock(&trace_types_lock); mutex_lock(&iter->mutex); @@ -3749,43 +4190,19 @@ out_err: goto out; } -struct ftrace_entries_info { - struct trace_array *tr; - int cpu; -}; - -static int tracing_entries_open(struct inode *inode, struct file *filp) -{ - struct ftrace_entries_info *info; - - if (tracing_disabled) - return -ENODEV; - - info = kzalloc(sizeof(*info), GFP_KERNEL); - if (!info) - return -ENOMEM; - - info->tr = &global_trace; - info->cpu = (unsigned long)inode->i_private; - - filp->private_data = info; - - return 0; -} - static ssize_t tracing_entries_read(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos) { - struct ftrace_entries_info *info = filp->private_data; - struct trace_array *tr = info->tr; + struct trace_cpu *tc = filp->private_data; + struct trace_array *tr = tc->tr; char buf[64]; int r = 0; ssize_t ret; mutex_lock(&trace_types_lock); - if (info->cpu == RING_BUFFER_ALL_CPUS) { + if (tc->cpu == RING_BUFFER_ALL_CPUS) { int cpu, buf_size_same; unsigned long size; @@ -3795,8 +4212,8 @@ tracing_entries_read(struct file *filp, char __user *ubuf, for_each_tracing_cpu(cpu) { /* fill in the size from first enabled cpu */ if (size == 0) - size = tr->data[cpu]->entries; - if (size != tr->data[cpu]->entries) { + size = per_cpu_ptr(tr->trace_buffer.data, cpu)->entries; + if (size != per_cpu_ptr(tr->trace_buffer.data, cpu)->entries) { buf_size_same = 0; break; } @@ -3812,7 +4229,7 @@ tracing_entries_read(struct file *filp, char __user *ubuf, } else r = sprintf(buf, "X\n"); } else - r = sprintf(buf, "%lu\n", tr->data[info->cpu]->entries >> 10); + r = sprintf(buf, "%lu\n", per_cpu_ptr(tr->trace_buffer.data, tc->cpu)->entries >> 10); mutex_unlock(&trace_types_lock); @@ -3824,7 +4241,7 @@ static ssize_t tracing_entries_write(struct file *filp, const char __user *ubuf, size_t cnt, loff_t *ppos) { - struct ftrace_entries_info *info = filp->private_data; + struct trace_cpu *tc = filp->private_data; unsigned long val; int ret; @@ -3839,7 +4256,7 @@ tracing_entries_write(struct file *filp, const char __user *ubuf, /* value is in KB */ val <<= 10; - ret = tracing_resize_ring_buffer(val, info->cpu); + ret = tracing_resize_ring_buffer(tc->tr, val, tc->cpu); if (ret < 0) return ret; @@ -3848,16 +4265,6 @@ tracing_entries_write(struct file *filp, const char __user *ubuf, return cnt; } -static int -tracing_entries_release(struct inode *inode, struct file *filp) -{ - struct ftrace_entries_info *info = filp->private_data; - - kfree(info); - - return 0; -} - static ssize_t tracing_total_entries_read(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos) @@ -3869,7 +4276,7 @@ tracing_total_entries_read(struct file *filp, char __user *ubuf, mutex_lock(&trace_types_lock); for_each_tracing_cpu(cpu) { - size += tr->data[cpu]->entries >> 10; + size += per_cpu_ptr(tr->trace_buffer.data, cpu)->entries >> 10; if (!ring_buffer_expanded) expanded_size += trace_buf_size >> 10; } @@ -3899,11 +4306,13 @@ tracing_free_buffer_write(struct file *filp, const char __user *ubuf, static int tracing_free_buffer_release(struct inode *inode, struct file *filp) { + struct trace_array *tr = inode->i_private; + /* disable tracing ? */ if (trace_flags & TRACE_ITER_STOP_ON_FREE) tracing_off(); /* resize the ring buffer to 0 */ - tracing_resize_ring_buffer(0, RING_BUFFER_ALL_CPUS); + tracing_resize_ring_buffer(tr, 0, RING_BUFFER_ALL_CPUS); return 0; } @@ -3972,7 +4381,7 @@ tracing_mark_write(struct file *filp, const char __user *ubuf, local_save_flags(irq_flags); size = sizeof(*entry) + cnt + 2; /* possible \n added */ - buffer = global_trace.buffer; + buffer = global_trace.trace_buffer.buffer; event = trace_buffer_lock_reserve(buffer, TRACE_PRINT, size, irq_flags, preempt_count()); if (!event) { @@ -4014,13 +4423,14 @@ tracing_mark_write(struct file *filp, const char __user *ubuf, static int tracing_clock_show(struct seq_file *m, void *v) { + struct trace_array *tr = m->private; int i; for (i = 0; i < ARRAY_SIZE(trace_clocks); i++) seq_printf(m, "%s%s%s%s", i ? " " : "", - i == trace_clock_id ? "[" : "", trace_clocks[i].name, - i == trace_clock_id ? "]" : ""); + i == tr->clock_id ? "[" : "", trace_clocks[i].name, + i == tr->clock_id ? "]" : ""); seq_putc(m, '\n'); return 0; @@ -4029,6 +4439,8 @@ static int tracing_clock_show(struct seq_file *m, void *v) static ssize_t tracing_clock_write(struct file *filp, const char __user *ubuf, size_t cnt, loff_t *fpos) { + struct seq_file *m = filp->private_data; + struct trace_array *tr = m->private; char buf[64]; const char *clockstr; int i; @@ -4050,20 +4462,23 @@ static ssize_t tracing_clock_write(struct file *filp, const char __user *ubuf, if (i == ARRAY_SIZE(trace_clocks)) return -EINVAL; - trace_clock_id = i; - mutex_lock(&trace_types_lock); - ring_buffer_set_clock(global_trace.buffer, trace_clocks[i].func); - if (max_tr.buffer) - ring_buffer_set_clock(max_tr.buffer, trace_clocks[i].func); + tr->clock_id = i; + + ring_buffer_set_clock(tr->trace_buffer.buffer, trace_clocks[i].func); /* * New clock may not be consistent with the previous clock. * Reset the buffer so that it doesn't have incomparable timestamps. */ - tracing_reset_online_cpus(&global_trace); - tracing_reset_online_cpus(&max_tr); + tracing_reset_online_cpus(&global_trace.trace_buffer); + +#ifdef CONFIG_TRACER_MAX_TRACE + if (tr->flags & TRACE_ARRAY_FL_GLOBAL && tr->max_buffer.buffer) + ring_buffer_set_clock(tr->max_buffer.buffer, trace_clocks[i].func); + tracing_reset_online_cpus(&global_trace.max_buffer); +#endif mutex_unlock(&trace_types_lock); @@ -4076,20 +4491,45 @@ static int tracing_clock_open(struct inode *inode, struct file *file) { if (tracing_disabled) return -ENODEV; - return single_open(file, tracing_clock_show, NULL); + + return single_open(file, tracing_clock_show, inode->i_private); } +struct ftrace_buffer_info { + struct trace_iterator iter; + void *spare; + unsigned int read; +}; + #ifdef CONFIG_TRACER_SNAPSHOT static int tracing_snapshot_open(struct inode *inode, struct file *file) { + struct trace_cpu *tc = inode->i_private; struct trace_iterator *iter; + struct seq_file *m; int ret = 0; if (file->f_mode & FMODE_READ) { iter = __tracing_open(inode, file, true); if (IS_ERR(iter)) ret = PTR_ERR(iter); + } else { + /* Writes still need the seq_file to hold the private data */ + m = kzalloc(sizeof(*m), GFP_KERNEL); + if (!m) + return -ENOMEM; + iter = kzalloc(sizeof(*iter), GFP_KERNEL); + if (!iter) { + kfree(m); + return -ENOMEM; + } + iter->tr = tc->tr; + iter->trace_buffer = &tc->tr->max_buffer; + iter->cpu_file = tc->cpu; + m->private = iter; + file->private_data = m; } + return ret; } @@ -4097,6 +4537,9 @@ static ssize_t tracing_snapshot_write(struct file *filp, const char __user *ubuf, size_t cnt, loff_t *ppos) { + struct seq_file *m = filp->private_data; + struct trace_iterator *iter = m->private; + struct trace_array *tr = iter->tr; unsigned long val; int ret; @@ -4110,42 +4553,48 @@ tracing_snapshot_write(struct file *filp, const char __user *ubuf, size_t cnt, mutex_lock(&trace_types_lock); - if (current_trace->use_max_tr) { + if (tr->current_trace->use_max_tr) { ret = -EBUSY; goto out; } switch (val) { case 0: - if (current_trace->allocated_snapshot) { - /* free spare buffer */ - ring_buffer_resize(max_tr.buffer, 1, - RING_BUFFER_ALL_CPUS); - set_buffer_entries(&max_tr, 1); - tracing_reset_online_cpus(&max_tr); - current_trace->allocated_snapshot = false; + if (iter->cpu_file != RING_BUFFER_ALL_CPUS) { + ret = -EINVAL; + break; } + if (tr->allocated_snapshot) + free_snapshot(tr); break; case 1: - if (!current_trace->allocated_snapshot) { - /* allocate spare buffer */ - ret = resize_buffer_duplicate_size(&max_tr, - &global_trace, RING_BUFFER_ALL_CPUS); +/* Only allow per-cpu swap if the ring buffer supports it */ +#ifndef CONFIG_RING_BUFFER_ALLOW_SWAP + if (iter->cpu_file != RING_BUFFER_ALL_CPUS) { + ret = -EINVAL; + break; + } +#endif + if (!tr->allocated_snapshot) { + ret = alloc_snapshot(tr); if (ret < 0) break; - current_trace->allocated_snapshot = true; } - local_irq_disable(); /* Now, we're going to swap */ - update_max_tr(&global_trace, current, smp_processor_id()); + if (iter->cpu_file == RING_BUFFER_ALL_CPUS) + update_max_tr(tr, current, smp_processor_id()); + else + update_max_tr_single(tr, current, iter->cpu_file); local_irq_enable(); break; default: - if (current_trace->allocated_snapshot) - tracing_reset_online_cpus(&max_tr); - else - ret = -EINVAL; + if (tr->allocated_snapshot) { + if (iter->cpu_file == RING_BUFFER_ALL_CPUS) + tracing_reset_online_cpus(&tr->max_buffer); + else + tracing_reset(&tr->max_buffer, iter->cpu_file); + } break; } @@ -4157,6 +4606,51 @@ out: mutex_unlock(&trace_types_lock); return ret; } + +static int tracing_snapshot_release(struct inode *inode, struct file *file) +{ + struct seq_file *m = file->private_data; + + if (file->f_mode & FMODE_READ) + return tracing_release(inode, file); + + /* If write only, the seq_file is just a stub */ + if (m) + kfree(m->private); + kfree(m); + + return 0; +} + +static int tracing_buffers_open(struct inode *inode, struct file *filp); +static ssize_t tracing_buffers_read(struct file *filp, char __user *ubuf, + size_t count, loff_t *ppos); +static int tracing_buffers_release(struct inode *inode, struct file *file); +static ssize_t tracing_buffers_splice_read(struct file *file, loff_t *ppos, + struct pipe_inode_info *pipe, size_t len, unsigned int flags); + +static int snapshot_raw_open(struct inode *inode, struct file *filp) +{ + struct ftrace_buffer_info *info; + int ret; + + ret = tracing_buffers_open(inode, filp); + if (ret < 0) + return ret; + + info = filp->private_data; + + if (info->iter.trace->use_max_tr) { + tracing_buffers_release(inode, filp); + return -EBUSY; + } + + info->iter.snapshot = true; + info->iter.trace_buffer = &info->iter.tr->max_buffer; + + return ret; +} + #endif /* CONFIG_TRACER_SNAPSHOT */ @@ -4184,10 +4678,9 @@ static const struct file_operations tracing_pipe_fops = { }; static const struct file_operations tracing_entries_fops = { - .open = tracing_entries_open, + .open = tracing_open_generic, .read = tracing_entries_read, .write = tracing_entries_write, - .release = tracing_entries_release, .llseek = generic_file_llseek, }; @@ -4222,20 +4715,23 @@ static const struct file_operations snapshot_fops = { .read = seq_read, .write = tracing_snapshot_write, .llseek = tracing_seek, - .release = tracing_release, + .release = tracing_snapshot_release, }; -#endif /* CONFIG_TRACER_SNAPSHOT */ -struct ftrace_buffer_info { - struct trace_array *tr; - void *spare; - int cpu; - unsigned int read; +static const struct file_operations snapshot_raw_fops = { + .open = snapshot_raw_open, + .read = tracing_buffers_read, + .release = tracing_buffers_release, + .splice_read = tracing_buffers_splice_read, + .llseek = no_llseek, }; +#endif /* CONFIG_TRACER_SNAPSHOT */ + static int tracing_buffers_open(struct inode *inode, struct file *filp) { - int cpu = (int)(long)inode->i_private; + struct trace_cpu *tc = inode->i_private; + struct trace_array *tr = tc->tr; struct ftrace_buffer_info *info; if (tracing_disabled) @@ -4245,72 +4741,131 @@ static int tracing_buffers_open(struct inode *inode, struct file *filp) if (!info) return -ENOMEM; - info->tr = &global_trace; - info->cpu = cpu; - info->spare = NULL; + mutex_lock(&trace_types_lock); + + tr->ref++; + + info->iter.tr = tr; + info->iter.cpu_file = tc->cpu; + info->iter.trace = tr->current_trace; + info->iter.trace_buffer = &tr->trace_buffer; + info->spare = NULL; /* Force reading ring buffer for first read */ - info->read = (unsigned int)-1; + info->read = (unsigned int)-1; filp->private_data = info; + mutex_unlock(&trace_types_lock); + return nonseekable_open(inode, filp); } +static unsigned int +tracing_buffers_poll(struct file *filp, poll_table *poll_table) +{ + struct ftrace_buffer_info *info = filp->private_data; + struct trace_iterator *iter = &info->iter; + + return trace_poll(iter, filp, poll_table); +} + static ssize_t tracing_buffers_read(struct file *filp, char __user *ubuf, size_t count, loff_t *ppos) { struct ftrace_buffer_info *info = filp->private_data; + struct trace_iterator *iter = &info->iter; ssize_t ret; - size_t size; + ssize_t size; if (!count) return 0; + mutex_lock(&trace_types_lock); + +#ifdef CONFIG_TRACER_MAX_TRACE + if (iter->snapshot && iter->tr->current_trace->use_max_tr) { + size = -EBUSY; + goto out_unlock; + } +#endif + if (!info->spare) - info->spare = ring_buffer_alloc_read_page(info->tr->buffer, info->cpu); + info->spare = ring_buffer_alloc_read_page(iter->trace_buffer->buffer, + iter->cpu_file); + size = -ENOMEM; if (!info->spare) - return -ENOMEM; + goto out_unlock; /* Do we have previous read data to read? */ if (info->read < PAGE_SIZE) goto read; - trace_access_lock(info->cpu); - ret = ring_buffer_read_page(info->tr->buffer, + again: + trace_access_lock(iter->cpu_file); + ret = ring_buffer_read_page(iter->trace_buffer->buffer, &info->spare, count, - info->cpu, 0); - trace_access_unlock(info->cpu); - if (ret < 0) - return 0; + iter->cpu_file, 0); + trace_access_unlock(iter->cpu_file); - info->read = 0; + if (ret < 0) { + if (trace_empty(iter)) { + if ((filp->f_flags & O_NONBLOCK)) { + size = -EAGAIN; + goto out_unlock; + } + mutex_unlock(&trace_types_lock); + iter->trace->wait_pipe(iter); + mutex_lock(&trace_types_lock); + if (signal_pending(current)) { + size = -EINTR; + goto out_unlock; + } + goto again; + } + size = 0; + goto out_unlock; + } -read: + info->read = 0; + read: size = PAGE_SIZE - info->read; if (size > count) size = count; ret = copy_to_user(ubuf, info->spare + info->read, size); - if (ret == size) - return -EFAULT; + if (ret == size) { + size = -EFAULT; + goto out_unlock; + } size -= ret; *ppos += size; info->read += size; + out_unlock: + mutex_unlock(&trace_types_lock); + return size; } static int tracing_buffers_release(struct inode *inode, struct file *file) { struct ftrace_buffer_info *info = file->private_data; + struct trace_iterator *iter = &info->iter; + + mutex_lock(&trace_types_lock); + + WARN_ON(!iter->tr->ref); + iter->tr->ref--; if (info->spare) - ring_buffer_free_read_page(info->tr->buffer, info->spare); + ring_buffer_free_read_page(iter->trace_buffer->buffer, info->spare); kfree(info); + mutex_unlock(&trace_types_lock); + return 0; } @@ -4375,6 +4930,7 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos, unsigned int flags) { struct ftrace_buffer_info *info = file->private_data; + struct trace_iterator *iter = &info->iter; struct partial_page partial_def[PIPE_DEF_BUFFERS]; struct page *pages_def[PIPE_DEF_BUFFERS]; struct splice_pipe_desc spd = { @@ -4387,10 +4943,21 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos, }; struct buffer_ref *ref; int entries, size, i; - size_t ret; + ssize_t ret; - if (splice_grow_spd(pipe, &spd)) - return -ENOMEM; + mutex_lock(&trace_types_lock); + +#ifdef CONFIG_TRACER_MAX_TRACE + if (iter->snapshot && iter->tr->current_trace->use_max_tr) { + ret = -EBUSY; + goto out; + } +#endif + + if (splice_grow_spd(pipe, &spd)) { + ret = -ENOMEM; + goto out; + } if (*ppos & (PAGE_SIZE - 1)) { ret = -EINVAL; @@ -4405,8 +4972,9 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos, len &= PAGE_MASK; } - trace_access_lock(info->cpu); - entries = ring_buffer_entries_cpu(info->tr->buffer, info->cpu); + again: + trace_access_lock(iter->cpu_file); + entries = ring_buffer_entries_cpu(iter->trace_buffer->buffer, iter->cpu_file); for (i = 0; i < pipe->buffers && len && entries; i++, len -= PAGE_SIZE) { struct page *page; @@ -4417,15 +4985,15 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos, break; ref->ref = 1; - ref->buffer = info->tr->buffer; - ref->page = ring_buffer_alloc_read_page(ref->buffer, info->cpu); + ref->buffer = iter->trace_buffer->buffer; + ref->page = ring_buffer_alloc_read_page(ref->buffer, iter->cpu_file); if (!ref->page) { kfree(ref); break; } r = ring_buffer_read_page(ref->buffer, &ref->page, - len, info->cpu, 1); + len, iter->cpu_file, 1); if (r < 0) { ring_buffer_free_read_page(ref->buffer, ref->page); kfree(ref); @@ -4449,31 +5017,40 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos, spd.nr_pages++; *ppos += PAGE_SIZE; - entries = ring_buffer_entries_cpu(info->tr->buffer, info->cpu); + entries = ring_buffer_entries_cpu(iter->trace_buffer->buffer, iter->cpu_file); } - trace_access_unlock(info->cpu); + trace_access_unlock(iter->cpu_file); spd.nr_pages = i; /* did we read anything? */ if (!spd.nr_pages) { - if (flags & SPLICE_F_NONBLOCK) + if ((file->f_flags & O_NONBLOCK) || (flags & SPLICE_F_NONBLOCK)) { ret = -EAGAIN; - else - ret = 0; - /* TODO: block */ - goto out; + goto out; + } + mutex_unlock(&trace_types_lock); + iter->trace->wait_pipe(iter); + mutex_lock(&trace_types_lock); + if (signal_pending(current)) { + ret = -EINTR; + goto out; + } + goto again; } ret = splice_to_pipe(pipe, &spd); splice_shrink_spd(&spd); out: + mutex_unlock(&trace_types_lock); + return ret; } static const struct file_operations tracing_buffers_fops = { .open = tracing_buffers_open, .read = tracing_buffers_read, + .poll = tracing_buffers_poll, .release = tracing_buffers_release, .splice_read = tracing_buffers_splice_read, .llseek = no_llseek, @@ -4483,12 +5060,14 @@ static ssize_t tracing_stats_read(struct file *filp, char __user *ubuf, size_t count, loff_t *ppos) { - unsigned long cpu = (unsigned long)filp->private_data; - struct trace_array *tr = &global_trace; + struct trace_cpu *tc = filp->private_data; + struct trace_array *tr = tc->tr; + struct trace_buffer *trace_buf = &tr->trace_buffer; struct trace_seq *s; unsigned long cnt; unsigned long long t; unsigned long usec_rem; + int cpu = tc->cpu; s = kmalloc(sizeof(*s), GFP_KERNEL); if (!s) @@ -4496,41 +5075,41 @@ tracing_stats_read(struct file *filp, char __user *ubuf, trace_seq_init(s); - cnt = ring_buffer_entries_cpu(tr->buffer, cpu); + cnt = ring_buffer_entries_cpu(trace_buf->buffer, cpu); trace_seq_printf(s, "entries: %ld\n", cnt); - cnt = ring_buffer_overrun_cpu(tr->buffer, cpu); + cnt = ring_buffer_overrun_cpu(trace_buf->buffer, cpu); trace_seq_printf(s, "overrun: %ld\n", cnt); - cnt = ring_buffer_commit_overrun_cpu(tr->buffer, cpu); + cnt = ring_buffer_commit_overrun_cpu(trace_buf->buffer, cpu); trace_seq_printf(s, "commit overrun: %ld\n", cnt); - cnt = ring_buffer_bytes_cpu(tr->buffer, cpu); + cnt = ring_buffer_bytes_cpu(trace_buf->buffer, cpu); trace_seq_printf(s, "bytes: %ld\n", cnt); if (trace_clocks[trace_clock_id].in_ns) { /* local or global for trace_clock */ - t = ns2usecs(ring_buffer_oldest_event_ts(tr->buffer, cpu)); + t = ns2usecs(ring_buffer_oldest_event_ts(trace_buf->buffer, cpu)); usec_rem = do_div(t, USEC_PER_SEC); trace_seq_printf(s, "oldest event ts: %5llu.%06lu\n", t, usec_rem); - t = ns2usecs(ring_buffer_time_stamp(tr->buffer, cpu)); + t = ns2usecs(ring_buffer_time_stamp(trace_buf->buffer, cpu)); usec_rem = do_div(t, USEC_PER_SEC); trace_seq_printf(s, "now ts: %5llu.%06lu\n", t, usec_rem); } else { /* counter or tsc mode for trace_clock */ trace_seq_printf(s, "oldest event ts: %llu\n", - ring_buffer_oldest_event_ts(tr->buffer, cpu)); + ring_buffer_oldest_event_ts(trace_buf->buffer, cpu)); trace_seq_printf(s, "now ts: %llu\n", - ring_buffer_time_stamp(tr->buffer, cpu)); + ring_buffer_time_stamp(trace_buf->buffer, cpu)); } - cnt = ring_buffer_dropped_events_cpu(tr->buffer, cpu); + cnt = ring_buffer_dropped_events_cpu(trace_buf->buffer, cpu); trace_seq_printf(s, "dropped events: %ld\n", cnt); - cnt = ring_buffer_read_events_cpu(tr->buffer, cpu); + cnt = ring_buffer_read_events_cpu(trace_buf->buffer, cpu); trace_seq_printf(s, "read events: %ld\n", cnt); count = simple_read_from_buffer(ubuf, count, ppos, s->buffer, s->len); @@ -4582,60 +5161,161 @@ static const struct file_operations tracing_dyn_info_fops = { .read = tracing_read_dyn_info, .llseek = generic_file_llseek, }; -#endif +#endif /* CONFIG_DYNAMIC_FTRACE */ -static struct dentry *d_tracer; +#if defined(CONFIG_TRACER_SNAPSHOT) && defined(CONFIG_DYNAMIC_FTRACE) +static void +ftrace_snapshot(unsigned long ip, unsigned long parent_ip, void **data) +{ + tracing_snapshot(); +} -struct dentry *tracing_init_dentry(void) +static void +ftrace_count_snapshot(unsigned long ip, unsigned long parent_ip, void **data) +{ + unsigned long *count = (long *)data; + + if (!*count) + return; + + if (*count != -1) + (*count)--; + + tracing_snapshot(); +} + +static int +ftrace_snapshot_print(struct seq_file *m, unsigned long ip, + struct ftrace_probe_ops *ops, void *data) +{ + long count = (long)data; + + seq_printf(m, "%ps:", (void *)ip); + + seq_printf(m, "snapshot"); + + if (count == -1) + seq_printf(m, ":unlimited\n"); + else + seq_printf(m, ":count=%ld\n", count); + + return 0; +} + +static struct ftrace_probe_ops snapshot_probe_ops = { + .func = ftrace_snapshot, + .print = ftrace_snapshot_print, +}; + +static struct ftrace_probe_ops snapshot_count_probe_ops = { + .func = ftrace_count_snapshot, + .print = ftrace_snapshot_print, +}; + +static int +ftrace_trace_snapshot_callback(struct ftrace_hash *hash, + char *glob, char *cmd, char *param, int enable) +{ + struct ftrace_probe_ops *ops; + void *count = (void *)-1; + char *number; + int ret; + + /* hash funcs only work with set_ftrace_filter */ + if (!enable) + return -EINVAL; + + ops = param ? &snapshot_count_probe_ops : &snapshot_probe_ops; + + if (glob[0] == '!') { + unregister_ftrace_function_probe_func(glob+1, ops); + return 0; + } + + if (!param) + goto out_reg; + + number = strsep(¶m, ":"); + + if (!strlen(number)) + goto out_reg; + + /* + * We use the callback data field (which is a pointer) + * as our counter. + */ + ret = kstrtoul(number, 0, (unsigned long *)&count); + if (ret) + return ret; + + out_reg: + ret = register_ftrace_function_probe(glob, ops, count); + + if (ret >= 0) + alloc_snapshot(&global_trace); + + return ret < 0 ? ret : 0; +} + +static struct ftrace_func_command ftrace_snapshot_cmd = { + .name = "snapshot", + .func = ftrace_trace_snapshot_callback, +}; + +static int register_snapshot_cmd(void) { - static int once; + return register_ftrace_command(&ftrace_snapshot_cmd); +} +#else +static inline int register_snapshot_cmd(void) { return 0; } +#endif /* defined(CONFIG_TRACER_SNAPSHOT) && defined(CONFIG_DYNAMIC_FTRACE) */ - if (d_tracer) - return d_tracer; +struct dentry *tracing_init_dentry_tr(struct trace_array *tr) +{ + if (tr->dir) + return tr->dir; if (!debugfs_initialized()) return NULL; - d_tracer = debugfs_create_dir("tracing", NULL); + if (tr->flags & TRACE_ARRAY_FL_GLOBAL) + tr->dir = debugfs_create_dir("tracing", NULL); - if (!d_tracer && !once) { - once = 1; - pr_warning("Could not create debugfs directory 'tracing'\n"); - return NULL; - } + if (!tr->dir) + pr_warn_once("Could not create debugfs directory 'tracing'\n"); - return d_tracer; + return tr->dir; } -static struct dentry *d_percpu; +struct dentry *tracing_init_dentry(void) +{ + return tracing_init_dentry_tr(&global_trace); +} -static struct dentry *tracing_dentry_percpu(void) +static struct dentry *tracing_dentry_percpu(struct trace_array *tr, int cpu) { - static int once; struct dentry *d_tracer; - if (d_percpu) - return d_percpu; - - d_tracer = tracing_init_dentry(); + if (tr->percpu_dir) + return tr->percpu_dir; + d_tracer = tracing_init_dentry_tr(tr); if (!d_tracer) return NULL; - d_percpu = debugfs_create_dir("per_cpu", d_tracer); + tr->percpu_dir = debugfs_create_dir("per_cpu", d_tracer); - if (!d_percpu && !once) { - once = 1; - pr_warning("Could not create debugfs directory 'per_cpu'\n"); - return NULL; - } + WARN_ONCE(!tr->percpu_dir, + "Could not create debugfs directory 'per_cpu/%d'\n", cpu); - return d_percpu; + return tr->percpu_dir; } -static void tracing_init_debugfs_percpu(long cpu) +static void +tracing_init_debugfs_percpu(struct trace_array *tr, long cpu) { - struct dentry *d_percpu = tracing_dentry_percpu(); + struct trace_array_cpu *data = per_cpu_ptr(tr->trace_buffer.data, cpu); + struct dentry *d_percpu = tracing_dentry_percpu(tr, cpu); struct dentry *d_cpu; char cpu_dir[30]; /* 30 characters should be more than enough */ @@ -4651,20 +5331,28 @@ static void tracing_init_debugfs_percpu(long cpu) /* per cpu trace_pipe */ trace_create_file("trace_pipe", 0444, d_cpu, - (void *) cpu, &tracing_pipe_fops); + (void *)&data->trace_cpu, &tracing_pipe_fops); /* per cpu trace */ trace_create_file("trace", 0644, d_cpu, - (void *) cpu, &tracing_fops); + (void *)&data->trace_cpu, &tracing_fops); trace_create_file("trace_pipe_raw", 0444, d_cpu, - (void *) cpu, &tracing_buffers_fops); + (void *)&data->trace_cpu, &tracing_buffers_fops); trace_create_file("stats", 0444, d_cpu, - (void *) cpu, &tracing_stats_fops); + (void *)&data->trace_cpu, &tracing_stats_fops); trace_create_file("buffer_size_kb", 0444, d_cpu, - (void *) cpu, &tracing_entries_fops); + (void *)&data->trace_cpu, &tracing_entries_fops); + +#ifdef CONFIG_TRACER_SNAPSHOT + trace_create_file("snapshot", 0644, d_cpu, + (void *)&data->trace_cpu, &snapshot_fops); + + trace_create_file("snapshot_raw", 0444, d_cpu, + (void *)&data->trace_cpu, &snapshot_raw_fops); +#endif } #ifdef CONFIG_FTRACE_SELFTEST @@ -4675,6 +5363,7 @@ static void tracing_init_debugfs_percpu(long cpu) struct trace_option_dentry { struct tracer_opt *opt; struct tracer_flags *flags; + struct trace_array *tr; struct dentry *entry; }; @@ -4710,7 +5399,7 @@ trace_options_write(struct file *filp, const char __user *ubuf, size_t cnt, if (!!(topt->flags->val & topt->opt->bit) != val) { mutex_lock(&trace_types_lock); - ret = __set_tracer_option(current_trace, topt->flags, + ret = __set_tracer_option(topt->tr->current_trace, topt->flags, topt->opt, !val); mutex_unlock(&trace_types_lock); if (ret) @@ -4749,6 +5438,7 @@ static ssize_t trace_options_core_write(struct file *filp, const char __user *ubuf, size_t cnt, loff_t *ppos) { + struct trace_array *tr = &global_trace; long index = (long)filp->private_data; unsigned long val; int ret; @@ -4759,7 +5449,13 @@ trace_options_core_write(struct file *filp, const char __user *ubuf, size_t cnt, if (val != 0 && val != 1) return -EINVAL; - set_tracer_flags(1 << index, val); + + mutex_lock(&trace_types_lock); + ret = set_tracer_flag(tr, 1 << index, val); + mutex_unlock(&trace_types_lock); + + if (ret < 0) + return ret; *ppos += cnt; @@ -4789,40 +5485,41 @@ struct dentry *trace_create_file(const char *name, } -static struct dentry *trace_options_init_dentry(void) +static struct dentry *trace_options_init_dentry(struct trace_array *tr) { struct dentry *d_tracer; - static struct dentry *t_options; - if (t_options) - return t_options; + if (tr->options) + return tr->options; - d_tracer = tracing_init_dentry(); + d_tracer = tracing_init_dentry_tr(tr); if (!d_tracer) return NULL; - t_options = debugfs_create_dir("options", d_tracer); - if (!t_options) { + tr->options = debugfs_create_dir("options", d_tracer); + if (!tr->options) { pr_warning("Could not create debugfs directory 'options'\n"); return NULL; } - return t_options; + return tr->options; } static void -create_trace_option_file(struct trace_option_dentry *topt, +create_trace_option_file(struct trace_array *tr, + struct trace_option_dentry *topt, struct tracer_flags *flags, struct tracer_opt *opt) { struct dentry *t_options; - t_options = trace_options_init_dentry(); + t_options = trace_options_init_dentry(tr); if (!t_options) return; topt->flags = flags; topt->opt = opt; + topt->tr = tr; topt->entry = trace_create_file(opt->name, 0644, t_options, topt, &trace_options_fops); @@ -4830,7 +5527,7 @@ create_trace_option_file(struct trace_option_dentry *topt, } static struct trace_option_dentry * -create_trace_option_files(struct tracer *tracer) +create_trace_option_files(struct trace_array *tr, struct tracer *tracer) { struct trace_option_dentry *topts; struct tracer_flags *flags; @@ -4855,7 +5552,7 @@ create_trace_option_files(struct tracer *tracer) return NULL; for (cnt = 0; opts[cnt].name; cnt++) - create_trace_option_file(&topts[cnt], flags, + create_trace_option_file(tr, &topts[cnt], flags, &opts[cnt]); return topts; @@ -4878,11 +5575,12 @@ destroy_trace_option_files(struct trace_option_dentry *topts) } static struct dentry * -create_trace_option_core_file(const char *option, long index) +create_trace_option_core_file(struct trace_array *tr, + const char *option, long index) { struct dentry *t_options; - t_options = trace_options_init_dentry(); + t_options = trace_options_init_dentry(tr); if (!t_options) return NULL; @@ -4890,17 +5588,17 @@ create_trace_option_core_file(const char *option, long index) &trace_options_core_fops); } -static __init void create_trace_options_dir(void) +static __init void create_trace_options_dir(struct trace_array *tr) { struct dentry *t_options; int i; - t_options = trace_options_init_dentry(); + t_options = trace_options_init_dentry(tr); if (!t_options) return; for (i = 0; trace_options[i]; i++) - create_trace_option_core_file(trace_options[i], i); + create_trace_option_core_file(tr, trace_options[i], i); } static ssize_t @@ -4908,7 +5606,7 @@ rb_simple_read(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos) { struct trace_array *tr = filp->private_data; - struct ring_buffer *buffer = tr->buffer; + struct ring_buffer *buffer = tr->trace_buffer.buffer; char buf[64]; int r; @@ -4927,7 +5625,7 @@ rb_simple_write(struct file *filp, const char __user *ubuf, size_t cnt, loff_t *ppos) { struct trace_array *tr = filp->private_data; - struct ring_buffer *buffer = tr->buffer; + struct ring_buffer *buffer = tr->trace_buffer.buffer; unsigned long val; int ret; @@ -4939,12 +5637,12 @@ rb_simple_write(struct file *filp, const char __user *ubuf, mutex_lock(&trace_types_lock); if (val) { ring_buffer_record_on(buffer); - if (current_trace->start) - current_trace->start(tr); + if (tr->current_trace->start) + tr->current_trace->start(tr); } else { ring_buffer_record_off(buffer); - if (current_trace->stop) - current_trace->stop(tr); + if (tr->current_trace->stop) + tr->current_trace->stop(tr); } mutex_unlock(&trace_types_lock); } @@ -4961,23 +5659,310 @@ static const struct file_operations rb_simple_fops = { .llseek = default_llseek, }; +struct dentry *trace_instance_dir; + +static void +init_tracer_debugfs(struct trace_array *tr, struct dentry *d_tracer); + +static void init_trace_buffers(struct trace_array *tr, struct trace_buffer *buf) +{ + int cpu; + + for_each_tracing_cpu(cpu) { + memset(per_cpu_ptr(buf->data, cpu), 0, sizeof(struct trace_array_cpu)); + per_cpu_ptr(buf->data, cpu)->trace_cpu.cpu = cpu; + per_cpu_ptr(buf->data, cpu)->trace_cpu.tr = tr; + } +} + +static int +allocate_trace_buffer(struct trace_array *tr, struct trace_buffer *buf, int size) +{ + enum ring_buffer_flags rb_flags; + + rb_flags = trace_flags & TRACE_ITER_OVERWRITE ? RB_FL_OVERWRITE : 0; + + buf->buffer = ring_buffer_alloc(size, rb_flags); + if (!buf->buffer) + return -ENOMEM; + + buf->data = alloc_percpu(struct trace_array_cpu); + if (!buf->data) { + ring_buffer_free(buf->buffer); + return -ENOMEM; + } + + init_trace_buffers(tr, buf); + + /* Allocate the first page for all buffers */ + set_buffer_entries(&tr->trace_buffer, + ring_buffer_size(tr->trace_buffer.buffer, 0)); + + return 0; +} + +static int allocate_trace_buffers(struct trace_array *tr, int size) +{ + int ret; + + ret = allocate_trace_buffer(tr, &tr->trace_buffer, size); + if (ret) + return ret; + +#ifdef CONFIG_TRACER_MAX_TRACE + ret = allocate_trace_buffer(tr, &tr->max_buffer, + allocate_snapshot ? size : 1); + if (WARN_ON(ret)) { + ring_buffer_free(tr->trace_buffer.buffer); + free_percpu(tr->trace_buffer.data); + return -ENOMEM; + } + tr->allocated_snapshot = allocate_snapshot; + + /* + * Only the top level trace array gets its snapshot allocated + * from the kernel command line. + */ + allocate_snapshot = false; +#endif + return 0; +} + +static int new_instance_create(const char *name) +{ + struct trace_array *tr; + int ret; + + mutex_lock(&trace_types_lock); + + ret = -EEXIST; + list_for_each_entry(tr, &ftrace_trace_arrays, list) { + if (tr->name && strcmp(tr->name, name) == 0) + goto out_unlock; + } + + ret = -ENOMEM; + tr = kzalloc(sizeof(*tr), GFP_KERNEL); + if (!tr) + goto out_unlock; + + tr->name = kstrdup(name, GFP_KERNEL); + if (!tr->name) + goto out_free_tr; + + raw_spin_lock_init(&tr->start_lock); + + tr->current_trace = &nop_trace; + + INIT_LIST_HEAD(&tr->systems); + INIT_LIST_HEAD(&tr->events); + + if (allocate_trace_buffers(tr, trace_buf_size) < 0) + goto out_free_tr; + + /* Holder for file callbacks */ + tr->trace_cpu.cpu = RING_BUFFER_ALL_CPUS; + tr->trace_cpu.tr = tr; + + tr->dir = debugfs_create_dir(name, trace_instance_dir); + if (!tr->dir) + goto out_free_tr; + + ret = event_trace_add_tracer(tr->dir, tr); + if (ret) + goto out_free_tr; + + init_tracer_debugfs(tr, tr->dir); + + list_add(&tr->list, &ftrace_trace_arrays); + + mutex_unlock(&trace_types_lock); + + return 0; + + out_free_tr: + if (tr->trace_buffer.buffer) + ring_buffer_free(tr->trace_buffer.buffer); + kfree(tr->name); + kfree(tr); + + out_unlock: + mutex_unlock(&trace_types_lock); + + return ret; + +} + +static int instance_delete(const char *name) +{ + struct trace_array *tr; + int found = 0; + int ret; + + mutex_lock(&trace_types_lock); + + ret = -ENODEV; + list_for_each_entry(tr, &ftrace_trace_arrays, list) { + if (tr->name && strcmp(tr->name, name) == 0) { + found = 1; + break; + } + } + if (!found) + goto out_unlock; + + ret = -EBUSY; + if (tr->ref) + goto out_unlock; + + list_del(&tr->list); + + event_trace_del_tracer(tr); + debugfs_remove_recursive(tr->dir); + free_percpu(tr->trace_buffer.data); + ring_buffer_free(tr->trace_buffer.buffer); + + kfree(tr->name); + kfree(tr); + + ret = 0; + + out_unlock: + mutex_unlock(&trace_types_lock); + + return ret; +} + +static int instance_mkdir (struct inode *inode, struct dentry *dentry, umode_t mode) +{ + struct dentry *parent; + int ret; + + /* Paranoid: Make sure the parent is the "instances" directory */ + parent = hlist_entry(inode->i_dentry.first, struct dentry, d_alias); + if (WARN_ON_ONCE(parent != trace_instance_dir)) + return -ENOENT; + + /* + * The inode mutex is locked, but debugfs_create_dir() will also + * take the mutex. As the instances directory can not be destroyed + * or changed in any other way, it is safe to unlock it, and + * let the dentry try. If two users try to make the same dir at + * the same time, then the new_instance_create() will determine the + * winner. + */ + mutex_unlock(&inode->i_mutex); + + ret = new_instance_create(dentry->d_iname); + + mutex_lock(&inode->i_mutex); + + return ret; +} + +static int instance_rmdir(struct inode *inode, struct dentry *dentry) +{ + struct dentry *parent; + int ret; + + /* Paranoid: Make sure the parent is the "instances" directory */ + parent = hlist_entry(inode->i_dentry.first, struct dentry, d_alias); + if (WARN_ON_ONCE(parent != trace_instance_dir)) + return -ENOENT; + + /* The caller did a dget() on dentry */ + mutex_unlock(&dentry->d_inode->i_mutex); + + /* + * The inode mutex is locked, but debugfs_create_dir() will also + * take the mutex. As the instances directory can not be destroyed + * or changed in any other way, it is safe to unlock it, and + * let the dentry try. If two users try to make the same dir at + * the same time, then the instance_delete() will determine the + * winner. + */ + mutex_unlock(&inode->i_mutex); + + ret = instance_delete(dentry->d_iname); + + mutex_lock_nested(&inode->i_mutex, I_MUTEX_PARENT); + mutex_lock(&dentry->d_inode->i_mutex); + + return ret; +} + +static const struct inode_operations instance_dir_inode_operations = { + .lookup = simple_lookup, + .mkdir = instance_mkdir, + .rmdir = instance_rmdir, +}; + +static __init void create_trace_instances(struct dentry *d_tracer) +{ + trace_instance_dir = debugfs_create_dir("instances", d_tracer); + if (WARN_ON(!trace_instance_dir)) + return; + + /* Hijack the dir inode operations, to allow mkdir */ + trace_instance_dir->d_inode->i_op = &instance_dir_inode_operations; +} + +static void +init_tracer_debugfs(struct trace_array *tr, struct dentry *d_tracer) +{ + int cpu; + + trace_create_file("trace_options", 0644, d_tracer, + tr, &tracing_iter_fops); + + trace_create_file("trace", 0644, d_tracer, + (void *)&tr->trace_cpu, &tracing_fops); + + trace_create_file("trace_pipe", 0444, d_tracer, + (void *)&tr->trace_cpu, &tracing_pipe_fops); + + trace_create_file("buffer_size_kb", 0644, d_tracer, + (void *)&tr->trace_cpu, &tracing_entries_fops); + + trace_create_file("buffer_total_size_kb", 0444, d_tracer, + tr, &tracing_total_entries_fops); + + trace_create_file("free_buffer", 0644, d_tracer, + tr, &tracing_free_buffer_fops); + + trace_create_file("trace_marker", 0220, d_tracer, + tr, &tracing_mark_fops); + + trace_create_file("trace_clock", 0644, d_tracer, tr, + &trace_clock_fops); + + trace_create_file("tracing_on", 0644, d_tracer, + tr, &rb_simple_fops); + +#ifdef CONFIG_TRACER_SNAPSHOT + trace_create_file("snapshot", 0644, d_tracer, + (void *)&tr->trace_cpu, &snapshot_fops); +#endif + + for_each_tracing_cpu(cpu) + tracing_init_debugfs_percpu(tr, cpu); + +} + static __init int tracer_init_debugfs(void) { struct dentry *d_tracer; - int cpu; trace_access_lock_init(); d_tracer = tracing_init_dentry(); + if (!d_tracer) + return 0; - trace_create_file("trace_options", 0644, d_tracer, - NULL, &tracing_iter_fops); + init_tracer_debugfs(&global_trace, d_tracer); trace_create_file("tracing_cpumask", 0644, d_tracer, - NULL, &tracing_cpumask_fops); - - trace_create_file("trace", 0644, d_tracer, - (void *) TRACE_PIPE_ALL_CPU, &tracing_fops); + &global_trace, &tracing_cpumask_fops); trace_create_file("available_tracers", 0444, d_tracer, &global_trace, &show_traces_fops); @@ -4996,44 +5981,17 @@ static __init int tracer_init_debugfs(void) trace_create_file("README", 0444, d_tracer, NULL, &tracing_readme_fops); - trace_create_file("trace_pipe", 0444, d_tracer, - (void *) TRACE_PIPE_ALL_CPU, &tracing_pipe_fops); - - trace_create_file("buffer_size_kb", 0644, d_tracer, - (void *) RING_BUFFER_ALL_CPUS, &tracing_entries_fops); - - trace_create_file("buffer_total_size_kb", 0444, d_tracer, - &global_trace, &tracing_total_entries_fops); - - trace_create_file("free_buffer", 0644, d_tracer, - &global_trace, &tracing_free_buffer_fops); - - trace_create_file("trace_marker", 0220, d_tracer, - NULL, &tracing_mark_fops); - trace_create_file("saved_cmdlines", 0444, d_tracer, NULL, &tracing_saved_cmdlines_fops); - trace_create_file("trace_clock", 0644, d_tracer, NULL, - &trace_clock_fops); - - trace_create_file("tracing_on", 0644, d_tracer, - &global_trace, &rb_simple_fops); - #ifdef CONFIG_DYNAMIC_FTRACE trace_create_file("dyn_ftrace_total_info", 0444, d_tracer, &ftrace_update_tot_cnt, &tracing_dyn_info_fops); #endif -#ifdef CONFIG_TRACER_SNAPSHOT - trace_create_file("snapshot", 0644, d_tracer, - (void *) TRACE_PIPE_ALL_CPU, &snapshot_fops); -#endif - - create_trace_options_dir(); + create_trace_instances(d_tracer); - for_each_tracing_cpu(cpu) - tracing_init_debugfs_percpu(cpu); + create_trace_options_dir(&global_trace); return 0; } @@ -5089,8 +6047,8 @@ void trace_printk_seq(struct trace_seq *s) { /* Probably should print a warning here. */ - if (s->len >= 1000) - s->len = 1000; + if (s->len >= TRACE_MAX_PRINT) + s->len = TRACE_MAX_PRINT; /* should be zero ended, but we are paranoid. */ s->buffer[s->len] = 0; @@ -5103,46 +6061,43 @@ trace_printk_seq(struct trace_seq *s) void trace_init_global_iter(struct trace_iterator *iter) { iter->tr = &global_trace; - iter->trace = current_trace; - iter->cpu_file = TRACE_PIPE_ALL_CPU; + iter->trace = iter->tr->current_trace; + iter->cpu_file = RING_BUFFER_ALL_CPUS; + iter->trace_buffer = &global_trace.trace_buffer; } -static void -__ftrace_dump(bool disable_tracing, enum ftrace_dump_mode oops_dump_mode) +void ftrace_dump(enum ftrace_dump_mode oops_dump_mode) { - static arch_spinlock_t ftrace_dump_lock = - (arch_spinlock_t)__ARCH_SPIN_LOCK_UNLOCKED; /* use static because iter can be a bit big for the stack */ static struct trace_iterator iter; + static atomic_t dump_running; unsigned int old_userobj; - static int dump_ran; unsigned long flags; int cnt = 0, cpu; - /* only one dump */ - local_irq_save(flags); - arch_spin_lock(&ftrace_dump_lock); - if (dump_ran) - goto out; - - dump_ran = 1; + /* Only allow one dump user at a time. */ + if (atomic_inc_return(&dump_running) != 1) { + atomic_dec(&dump_running); + return; + } + /* + * Always turn off tracing when we dump. + * We don't need to show trace output of what happens + * between multiple crashes. + * + * If the user does a sysrq-z, then they can re-enable + * tracing with echo 1 > tracing_on. + */ tracing_off(); - /* Did function tracer already get disabled? */ - if (ftrace_is_dead()) { - printk("# WARNING: FUNCTION TRACING IS CORRUPTED\n"); - printk("# MAY BE MISSING FUNCTION EVENTS\n"); - } - - if (disable_tracing) - ftrace_kill(); + local_irq_save(flags); /* Simulate the iterator */ trace_init_global_iter(&iter); for_each_tracing_cpu(cpu) { - atomic_inc(&iter.tr->data[cpu]->disabled); + atomic_inc(&per_cpu_ptr(iter.tr->trace_buffer.data, cpu)->disabled); } old_userobj = trace_flags & TRACE_ITER_SYM_USEROBJ; @@ -5152,7 +6107,7 @@ __ftrace_dump(bool disable_tracing, enum ftrace_dump_mode oops_dump_mode) switch (oops_dump_mode) { case DUMP_ALL: - iter.cpu_file = TRACE_PIPE_ALL_CPU; + iter.cpu_file = RING_BUFFER_ALL_CPUS; break; case DUMP_ORIG: iter.cpu_file = raw_smp_processor_id(); @@ -5161,11 +6116,17 @@ __ftrace_dump(bool disable_tracing, enum ftrace_dump_mode oops_dump_mode) goto out_enable; default: printk(KERN_TRACE "Bad dumping mode, switching to all CPUs dump\n"); - iter.cpu_file = TRACE_PIPE_ALL_CPU; + iter.cpu_file = RING_BUFFER_ALL_CPUS; } printk(KERN_TRACE "Dumping ftrace buffer:\n"); + /* Did function tracer already get disabled? */ + if (ftrace_is_dead()) { + printk("# WARNING: FUNCTION TRACING IS CORRUPTED\n"); + printk("# MAY BE MISSING FUNCTION EVENTS\n"); + } + /* * We need to stop all tracing on all CPUS to read the * the next buffer. This is a bit expensive, but is @@ -5205,33 +6166,19 @@ __ftrace_dump(bool disable_tracing, enum ftrace_dump_mode oops_dump_mode) printk(KERN_TRACE "---------------------------------\n"); out_enable: - /* Re-enable tracing if requested */ - if (!disable_tracing) { - trace_flags |= old_userobj; + trace_flags |= old_userobj; - for_each_tracing_cpu(cpu) { - atomic_dec(&iter.tr->data[cpu]->disabled); - } - tracing_on(); + for_each_tracing_cpu(cpu) { + atomic_dec(&per_cpu_ptr(iter.trace_buffer->data, cpu)->disabled); } - - out: - arch_spin_unlock(&ftrace_dump_lock); + atomic_dec(&dump_running); local_irq_restore(flags); } - -/* By default: disable tracing after the dump */ -void ftrace_dump(enum ftrace_dump_mode oops_dump_mode) -{ - __ftrace_dump(true, oops_dump_mode); -} EXPORT_SYMBOL_GPL(ftrace_dump); __init static int tracer_alloc_buffers(void) { int ring_buf_size; - enum ring_buffer_flags rb_flags; - int i; int ret = -ENOMEM; @@ -5252,49 +6199,27 @@ __init static int tracer_alloc_buffers(void) else ring_buf_size = 1; - rb_flags = trace_flags & TRACE_ITER_OVERWRITE ? RB_FL_OVERWRITE : 0; - cpumask_copy(tracing_buffer_mask, cpu_possible_mask); cpumask_copy(tracing_cpumask, cpu_all_mask); + raw_spin_lock_init(&global_trace.start_lock); + /* TODO: make the number of buffers hot pluggable with CPUS */ - global_trace.buffer = ring_buffer_alloc(ring_buf_size, rb_flags); - if (!global_trace.buffer) { + if (allocate_trace_buffers(&global_trace, ring_buf_size) < 0) { printk(KERN_ERR "tracer: failed to allocate ring buffer!\n"); WARN_ON(1); goto out_free_cpumask; } + if (global_trace.buffer_disabled) tracing_off(); - -#ifdef CONFIG_TRACER_MAX_TRACE - max_tr.buffer = ring_buffer_alloc(1, rb_flags); - if (!max_tr.buffer) { - printk(KERN_ERR "tracer: failed to allocate max ring buffer!\n"); - WARN_ON(1); - ring_buffer_free(global_trace.buffer); - goto out_free_cpumask; - } -#endif - - /* Allocate the first page for all buffers */ - for_each_tracing_cpu(i) { - global_trace.data[i] = &per_cpu(global_trace_cpu, i); - max_tr.data[i] = &per_cpu(max_tr_data, i); - } - - set_buffer_entries(&global_trace, - ring_buffer_size(global_trace.buffer, 0)); -#ifdef CONFIG_TRACER_MAX_TRACE - set_buffer_entries(&max_tr, 1); -#endif - trace_init_cmdlines(); - init_irq_work(&trace_work_wakeup, trace_wake_up); register_tracer(&nop_trace); + global_trace.current_trace = &nop_trace; + /* All seems OK, enable tracing */ tracing_disabled = 0; @@ -5303,16 +6228,32 @@ __init static int tracer_alloc_buffers(void) register_die_notifier(&trace_die_notifier); + global_trace.flags = TRACE_ARRAY_FL_GLOBAL; + + /* Holder for file callbacks */ + global_trace.trace_cpu.cpu = RING_BUFFER_ALL_CPUS; + global_trace.trace_cpu.tr = &global_trace; + + INIT_LIST_HEAD(&global_trace.systems); + INIT_LIST_HEAD(&global_trace.events); + list_add(&global_trace.list, &ftrace_trace_arrays); + while (trace_boot_options) { char *option; option = strsep(&trace_boot_options, ","); - trace_set_options(option); + trace_set_options(&global_trace, option); } + register_snapshot_cmd(); + return 0; out_free_cpumask: + free_percpu(global_trace.trace_buffer.data); +#ifdef CONFIG_TRACER_MAX_TRACE + free_percpu(global_trace.max_buffer.data); +#endif free_cpumask_var(tracing_cpumask); out_free_buffer_mask: free_cpumask_var(tracing_buffer_mask); diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 57d7e5397d56..9e014582e763 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -13,6 +13,11 @@ #include <linux/trace_seq.h> #include <linux/ftrace_event.h> +#ifdef CONFIG_FTRACE_SYSCALLS +#include <asm/unistd.h> /* For NR_SYSCALLS */ +#include <asm/syscall.h> /* some archs define it here */ +#endif + enum trace_type { __TRACE_FIRST_TYPE = 0, @@ -29,6 +34,7 @@ enum trace_type { TRACE_GRAPH_ENT, TRACE_USER_STACK, TRACE_BLK, + TRACE_BPUTS, __TRACE_LAST_TYPE, }; @@ -127,12 +133,21 @@ enum trace_flag_type { #define TRACE_BUF_SIZE 1024 +struct trace_array; + +struct trace_cpu { + struct trace_array *tr; + struct dentry *dir; + int cpu; +}; + /* * The CPU trace array - it consists of thousands of trace entries * plus some other descriptor data: (for example which task started * the trace, etc.) */ struct trace_array_cpu { + struct trace_cpu trace_cpu; atomic_t disabled; void *buffer_page; /* ring buffer spare */ @@ -151,20 +166,83 @@ struct trace_array_cpu { char comm[TASK_COMM_LEN]; }; +struct tracer; + +struct trace_buffer { + struct trace_array *tr; + struct ring_buffer *buffer; + struct trace_array_cpu __percpu *data; + cycle_t time_start; + int cpu; +}; + /* * The trace array - an array of per-CPU trace arrays. This is the * highest level data structure that individual tracers deal with. * They have on/off state as well: */ struct trace_array { - struct ring_buffer *buffer; - int cpu; + struct list_head list; + char *name; + struct trace_buffer trace_buffer; +#ifdef CONFIG_TRACER_MAX_TRACE + /* + * The max_buffer is used to snapshot the trace when a maximum + * latency is reached, or when the user initiates a snapshot. + * Some tracers will use this to store a maximum trace while + * it continues examining live traces. + * + * The buffers for the max_buffer are set up the same as the trace_buffer + * When a snapshot is taken, the buffer of the max_buffer is swapped + * with the buffer of the trace_buffer and the buffers are reset for + * the trace_buffer so the tracing can continue. + */ + struct trace_buffer max_buffer; + bool allocated_snapshot; +#endif int buffer_disabled; - cycle_t time_start; + struct trace_cpu trace_cpu; /* place holder */ +#ifdef CONFIG_FTRACE_SYSCALLS + int sys_refcount_enter; + int sys_refcount_exit; + DECLARE_BITMAP(enabled_enter_syscalls, NR_syscalls); + DECLARE_BITMAP(enabled_exit_syscalls, NR_syscalls); +#endif + int stop_count; + int clock_id; + struct tracer *current_trace; + unsigned int flags; + raw_spinlock_t start_lock; + struct dentry *dir; + struct dentry *options; + struct dentry *percpu_dir; + struct dentry *event_dir; + struct list_head systems; + struct list_head events; struct task_struct *waiter; - struct trace_array_cpu *data[NR_CPUS]; + int ref; +}; + +enum { + TRACE_ARRAY_FL_GLOBAL = (1 << 0) }; +extern struct list_head ftrace_trace_arrays; + +/* + * The global tracer (top) should be the first trace array added, + * but we check the flag anyway. + */ +static inline struct trace_array *top_trace_array(void) +{ + struct trace_array *tr; + + tr = list_entry(ftrace_trace_arrays.prev, + typeof(*tr), list); + WARN_ON(!(tr->flags & TRACE_ARRAY_FL_GLOBAL)); + return tr; +} + #define FTRACE_CMP_TYPE(var, type) \ __builtin_types_compatible_p(typeof(var), type *) @@ -200,6 +278,7 @@ extern void __ftrace_bad_type(void); IF_ASSIGN(var, ent, struct userstack_entry, TRACE_USER_STACK);\ IF_ASSIGN(var, ent, struct print_entry, TRACE_PRINT); \ IF_ASSIGN(var, ent, struct bprint_entry, TRACE_BPRINT); \ + IF_ASSIGN(var, ent, struct bputs_entry, TRACE_BPUTS); \ IF_ASSIGN(var, ent, struct trace_mmiotrace_rw, \ TRACE_MMIO_RW); \ IF_ASSIGN(var, ent, struct trace_mmiotrace_map, \ @@ -283,11 +362,16 @@ struct tracer { enum print_line_t (*print_line)(struct trace_iterator *iter); /* If you handled the flag setting, return 0 */ int (*set_flag)(u32 old_flags, u32 bit, int set); + /* Return 0 if OK with change, else return non-zero */ + int (*flag_changed)(struct tracer *tracer, + u32 mask, int set); struct tracer *next; struct tracer_flags *flags; bool print_max; + bool enabled; +#ifdef CONFIG_TRACER_MAX_TRACE bool use_max_tr; - bool allocated_snapshot; +#endif }; @@ -423,8 +507,6 @@ static __always_inline void trace_clear_recursion(int bit) current->trace_recursion = val; } -#define TRACE_PIPE_ALL_CPU -1 - static inline struct ring_buffer_iter * trace_buffer_iter(struct trace_iterator *iter, int cpu) { @@ -435,10 +517,10 @@ trace_buffer_iter(struct trace_iterator *iter, int cpu) int tracer_init(struct tracer *t, struct trace_array *tr); int tracing_is_enabled(void); -void tracing_reset(struct trace_array *tr, int cpu); -void tracing_reset_online_cpus(struct trace_array *tr); +void tracing_reset(struct trace_buffer *buf, int cpu); +void tracing_reset_online_cpus(struct trace_buffer *buf); void tracing_reset_current(int cpu); -void tracing_reset_current_online_cpus(void); +void tracing_reset_all_online_cpus(void); int tracing_open_generic(struct inode *inode, struct file *filp); struct dentry *trace_create_file(const char *name, umode_t mode, @@ -446,6 +528,7 @@ struct dentry *trace_create_file(const char *name, void *data, const struct file_operations *fops); +struct dentry *tracing_init_dentry_tr(struct trace_array *tr); struct dentry *tracing_init_dentry(void); struct ring_buffer_event; @@ -579,7 +662,7 @@ extern int DYN_FTRACE_TEST_NAME(void); #define DYN_FTRACE_TEST_NAME2 trace_selftest_dynamic_test_func2 extern int DYN_FTRACE_TEST_NAME2(void); -extern int ring_buffer_expanded; +extern bool ring_buffer_expanded; extern bool tracing_selftest_disabled; DECLARE_PER_CPU(int, ftrace_cpu_disabled); @@ -615,6 +698,8 @@ trace_array_vprintk(struct trace_array *tr, unsigned long ip, const char *fmt, va_list args); int trace_array_printk(struct trace_array *tr, unsigned long ip, const char *fmt, ...); +int trace_array_printk_buf(struct ring_buffer *buffer, + unsigned long ip, const char *fmt, ...); void trace_printk_seq(struct trace_seq *s); enum print_line_t print_trace_line(struct trace_iterator *iter); @@ -782,6 +867,7 @@ enum trace_iterator_flags { TRACE_ITER_STOP_ON_FREE = 0x400000, TRACE_ITER_IRQ_INFO = 0x800000, TRACE_ITER_MARKERS = 0x1000000, + TRACE_ITER_FUNCTION = 0x2000000, }; /* @@ -828,8 +914,8 @@ enum { struct ftrace_event_field { struct list_head link; - char *name; - char *type; + const char *name; + const char *type; int filter_type; int offset; int size; @@ -847,12 +933,19 @@ struct event_filter { struct event_subsystem { struct list_head list; const char *name; - struct dentry *entry; struct event_filter *filter; - int nr_events; int ref_count; }; +struct ftrace_subsystem_dir { + struct list_head list; + struct event_subsystem *subsystem; + struct trace_array *tr; + struct dentry *entry; + int ref_count; + int nr_events; +}; + #define FILTER_PRED_INVALID ((unsigned short)-1) #define FILTER_PRED_IS_RIGHT (1 << 15) #define FILTER_PRED_FOLD (1 << 15) @@ -902,22 +995,20 @@ struct filter_pred { unsigned short right; }; -extern struct list_head ftrace_common_fields; - extern enum regex_type filter_parse_regex(char *buff, int len, char **search, int *not); extern void print_event_filter(struct ftrace_event_call *call, struct trace_seq *s); extern int apply_event_filter(struct ftrace_event_call *call, char *filter_string); -extern int apply_subsystem_event_filter(struct event_subsystem *system, +extern int apply_subsystem_event_filter(struct ftrace_subsystem_dir *dir, char *filter_string); extern void print_subsystem_event_filter(struct event_subsystem *system, struct trace_seq *s); extern int filter_assign_type(const char *type); -struct list_head * -trace_get_fields(struct ftrace_event_call *event_call); +struct ftrace_event_field * +trace_find_event_field(struct ftrace_event_call *call, char *name); static inline int filter_check_discard(struct ftrace_event_call *call, void *rec, @@ -934,6 +1025,8 @@ filter_check_discard(struct ftrace_event_call *call, void *rec, } extern void trace_event_enable_cmd_record(bool enable); +extern int event_trace_add_tracer(struct dentry *parent, struct trace_array *tr); +extern int event_trace_del_tracer(struct trace_array *tr); extern struct mutex event_mutex; extern struct list_head ftrace_events; @@ -943,6 +1036,19 @@ extern const char *__stop___trace_bprintk_fmt[]; void trace_printk_init_buffers(void); void trace_printk_start_comm(void); +int trace_keep_overwrite(struct tracer *tracer, u32 mask, int set); +int set_tracer_flag(struct trace_array *tr, unsigned int mask, int enabled); + +/* + * Normal trace_printk() and friends allocates special buffers + * to do the manipulation, as well as saves the print formats + * into sections to display. But the trace infrastructure wants + * to use these without the added overhead at the price of being + * a bit slower (used mainly for warnings, where we don't care + * about performance). The internal_trace_puts() is for such + * a purpose. + */ +#define internal_trace_puts(str) __trace_puts(_THIS_IP_, str, strlen(str)) #undef FTRACE_ENTRY #define FTRACE_ENTRY(call, struct_name, id, tstruct, print, filter) \ diff --git a/kernel/trace/trace_branch.c b/kernel/trace/trace_branch.c index 95e96842ed29..d594da0dc03c 100644 --- a/kernel/trace/trace_branch.c +++ b/kernel/trace/trace_branch.c @@ -32,6 +32,7 @@ probe_likely_condition(struct ftrace_branch_data *f, int val, int expect) { struct ftrace_event_call *call = &event_branch; struct trace_array *tr = branch_tracer; + struct trace_array_cpu *data; struct ring_buffer_event *event; struct trace_branch *entry; struct ring_buffer *buffer; @@ -51,11 +52,12 @@ probe_likely_condition(struct ftrace_branch_data *f, int val, int expect) local_irq_save(flags); cpu = raw_smp_processor_id(); - if (atomic_inc_return(&tr->data[cpu]->disabled) != 1) + data = per_cpu_ptr(tr->trace_buffer.data, cpu); + if (atomic_inc_return(&data->disabled) != 1) goto out; pc = preempt_count(); - buffer = tr->buffer; + buffer = tr->trace_buffer.buffer; event = trace_buffer_lock_reserve(buffer, TRACE_BRANCH, sizeof(*entry), flags, pc); if (!event) @@ -80,7 +82,7 @@ probe_likely_condition(struct ftrace_branch_data *f, int val, int expect) __buffer_unlock_commit(buffer, event); out: - atomic_dec(&tr->data[cpu]->disabled); + atomic_dec(&data->disabled); local_irq_restore(flags); } diff --git a/kernel/trace/trace_clock.c b/kernel/trace/trace_clock.c index aa8f5f48dae6..26dc348332b7 100644 --- a/kernel/trace/trace_clock.c +++ b/kernel/trace/trace_clock.c @@ -57,6 +57,16 @@ u64 notrace trace_clock(void) return local_clock(); } +/* + * trace_jiffy_clock(): Simply use jiffies as a clock counter. + */ +u64 notrace trace_clock_jiffies(void) +{ + u64 jiffy = jiffies - INITIAL_JIFFIES; + + /* Return nsecs */ + return (u64)jiffies_to_usecs(jiffy) * 1000ULL; +} /* * trace_clock_global(): special globally coherent trace clock diff --git a/kernel/trace/trace_entries.h b/kernel/trace/trace_entries.h index 4108e1250ca2..e2d027ac66a2 100644 --- a/kernel/trace/trace_entries.h +++ b/kernel/trace/trace_entries.h @@ -223,8 +223,8 @@ FTRACE_ENTRY(bprint, bprint_entry, __dynamic_array( u32, buf ) ), - F_printk("%08lx fmt:%p", - __entry->ip, __entry->fmt), + F_printk("%pf: %s", + (void *)__entry->ip, __entry->fmt), FILTER_OTHER ); @@ -238,8 +238,23 @@ FTRACE_ENTRY(print, print_entry, __dynamic_array( char, buf ) ), - F_printk("%08lx %s", - __entry->ip, __entry->buf), + F_printk("%pf: %s", + (void *)__entry->ip, __entry->buf), + + FILTER_OTHER +); + +FTRACE_ENTRY(bputs, bputs_entry, + + TRACE_BPUTS, + + F_STRUCT( + __field( unsigned long, ip ) + __field( const char *, str ) + ), + + F_printk("%pf: %s", + (void *)__entry->ip, __entry->str), FILTER_OTHER ); diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 57e9b284250c..53582e982e51 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -34,9 +34,27 @@ char event_storage[EVENT_STORAGE_SIZE]; EXPORT_SYMBOL_GPL(event_storage); LIST_HEAD(ftrace_events); -LIST_HEAD(ftrace_common_fields); +static LIST_HEAD(ftrace_common_fields); -struct list_head * +#define GFP_TRACE (GFP_KERNEL | __GFP_ZERO) + +static struct kmem_cache *field_cachep; +static struct kmem_cache *file_cachep; + +/* Double loops, do not use break, only goto's work */ +#define do_for_each_event_file(tr, file) \ + list_for_each_entry(tr, &ftrace_trace_arrays, list) { \ + list_for_each_entry(file, &tr->events, list) + +#define do_for_each_event_file_safe(tr, file) \ + list_for_each_entry(tr, &ftrace_trace_arrays, list) { \ + struct ftrace_event_file *___n; \ + list_for_each_entry_safe(file, ___n, &tr->events, list) + +#define while_for_each_event_file() \ + } + +static struct list_head * trace_get_fields(struct ftrace_event_call *event_call) { if (!event_call->class->get_fields) @@ -44,23 +62,45 @@ trace_get_fields(struct ftrace_event_call *event_call) return event_call->class->get_fields(event_call); } +static struct ftrace_event_field * +__find_event_field(struct list_head *head, char *name) +{ + struct ftrace_event_field *field; + + list_for_each_entry(field, head, link) { + if (!strcmp(field->name, name)) + return field; + } + + return NULL; +} + +struct ftrace_event_field * +trace_find_event_field(struct ftrace_event_call *call, char *name) +{ + struct ftrace_event_field *field; + struct list_head *head; + + field = __find_event_field(&ftrace_common_fields, name); + if (field) + return field; + + head = trace_get_fields(call); + return __find_event_field(head, name); +} + static int __trace_define_field(struct list_head *head, const char *type, const char *name, int offset, int size, int is_signed, int filter_type) { struct ftrace_event_field *field; - field = kzalloc(sizeof(*field), GFP_KERNEL); + field = kmem_cache_alloc(field_cachep, GFP_TRACE); if (!field) goto err; - field->name = kstrdup(name, GFP_KERNEL); - if (!field->name) - goto err; - - field->type = kstrdup(type, GFP_KERNEL); - if (!field->type) - goto err; + field->name = name; + field->type = type; if (filter_type == FILTER_OTHER) field->filter_type = filter_assign_type(type); @@ -76,9 +116,7 @@ static int __trace_define_field(struct list_head *head, const char *type, return 0; err: - if (field) - kfree(field->name); - kfree(field); + kmem_cache_free(field_cachep, field); return -ENOMEM; } @@ -120,7 +158,7 @@ static int trace_define_common_fields(void) return ret; } -void trace_destroy_fields(struct ftrace_event_call *call) +static void trace_destroy_fields(struct ftrace_event_call *call) { struct ftrace_event_field *field, *next; struct list_head *head; @@ -128,9 +166,7 @@ void trace_destroy_fields(struct ftrace_event_call *call) head = trace_get_fields(call); list_for_each_entry_safe(field, next, head, link) { list_del(&field->link); - kfree(field->type); - kfree(field->name); - kfree(field); + kmem_cache_free(field_cachep, field); } } @@ -149,15 +185,17 @@ EXPORT_SYMBOL_GPL(trace_event_raw_init); int ftrace_event_reg(struct ftrace_event_call *call, enum trace_reg type, void *data) { + struct ftrace_event_file *file = data; + switch (type) { case TRACE_REG_REGISTER: return tracepoint_probe_register(call->name, call->class->probe, - call); + file); case TRACE_REG_UNREGISTER: tracepoint_probe_unregister(call->name, call->class->probe, - call); + file); return 0; #ifdef CONFIG_PERF_EVENTS @@ -183,54 +221,100 @@ EXPORT_SYMBOL_GPL(ftrace_event_reg); void trace_event_enable_cmd_record(bool enable) { - struct ftrace_event_call *call; + struct ftrace_event_file *file; + struct trace_array *tr; mutex_lock(&event_mutex); - list_for_each_entry(call, &ftrace_events, list) { - if (!(call->flags & TRACE_EVENT_FL_ENABLED)) + do_for_each_event_file(tr, file) { + + if (!(file->flags & FTRACE_EVENT_FL_ENABLED)) continue; if (enable) { tracing_start_cmdline_record(); - call->flags |= TRACE_EVENT_FL_RECORDED_CMD; + set_bit(FTRACE_EVENT_FL_RECORDED_CMD_BIT, &file->flags); } else { tracing_stop_cmdline_record(); - call->flags &= ~TRACE_EVENT_FL_RECORDED_CMD; + clear_bit(FTRACE_EVENT_FL_RECORDED_CMD_BIT, &file->flags); } - } + } while_for_each_event_file(); mutex_unlock(&event_mutex); } -static int ftrace_event_enable_disable(struct ftrace_event_call *call, - int enable) +static int __ftrace_event_enable_disable(struct ftrace_event_file *file, + int enable, int soft_disable) { + struct ftrace_event_call *call = file->event_call; int ret = 0; + int disable; switch (enable) { case 0: - if (call->flags & TRACE_EVENT_FL_ENABLED) { - call->flags &= ~TRACE_EVENT_FL_ENABLED; - if (call->flags & TRACE_EVENT_FL_RECORDED_CMD) { + /* + * When soft_disable is set and enable is cleared, we want + * to clear the SOFT_DISABLED flag but leave the event in the + * state that it was. That is, if the event was enabled and + * SOFT_DISABLED isn't set, then do nothing. But if SOFT_DISABLED + * is set we do not want the event to be enabled before we + * clear the bit. + * + * When soft_disable is not set but the SOFT_MODE flag is, + * we do nothing. Do not disable the tracepoint, otherwise + * "soft enable"s (clearing the SOFT_DISABLED bit) wont work. + */ + if (soft_disable) { + disable = file->flags & FTRACE_EVENT_FL_SOFT_DISABLED; + clear_bit(FTRACE_EVENT_FL_SOFT_MODE_BIT, &file->flags); + } else + disable = !(file->flags & FTRACE_EVENT_FL_SOFT_MODE); + + if (disable && (file->flags & FTRACE_EVENT_FL_ENABLED)) { + clear_bit(FTRACE_EVENT_FL_ENABLED_BIT, &file->flags); + if (file->flags & FTRACE_EVENT_FL_RECORDED_CMD) { tracing_stop_cmdline_record(); - call->flags &= ~TRACE_EVENT_FL_RECORDED_CMD; + clear_bit(FTRACE_EVENT_FL_RECORDED_CMD_BIT, &file->flags); } - call->class->reg(call, TRACE_REG_UNREGISTER, NULL); + call->class->reg(call, TRACE_REG_UNREGISTER, file); } + /* If in SOFT_MODE, just set the SOFT_DISABLE_BIT */ + if (file->flags & FTRACE_EVENT_FL_SOFT_MODE) + set_bit(FTRACE_EVENT_FL_SOFT_DISABLED_BIT, &file->flags); break; case 1: - if (!(call->flags & TRACE_EVENT_FL_ENABLED)) { + /* + * When soft_disable is set and enable is set, we want to + * register the tracepoint for the event, but leave the event + * as is. That means, if the event was already enabled, we do + * nothing (but set SOFT_MODE). If the event is disabled, we + * set SOFT_DISABLED before enabling the event tracepoint, so + * it still seems to be disabled. + */ + if (!soft_disable) + clear_bit(FTRACE_EVENT_FL_SOFT_DISABLED_BIT, &file->flags); + else + set_bit(FTRACE_EVENT_FL_SOFT_MODE_BIT, &file->flags); + + if (!(file->flags & FTRACE_EVENT_FL_ENABLED)) { + + /* Keep the event disabled, when going to SOFT_MODE. */ + if (soft_disable) + set_bit(FTRACE_EVENT_FL_SOFT_DISABLED_BIT, &file->flags); + if (trace_flags & TRACE_ITER_RECORD_CMD) { tracing_start_cmdline_record(); - call->flags |= TRACE_EVENT_FL_RECORDED_CMD; + set_bit(FTRACE_EVENT_FL_RECORDED_CMD_BIT, &file->flags); } - ret = call->class->reg(call, TRACE_REG_REGISTER, NULL); + ret = call->class->reg(call, TRACE_REG_REGISTER, file); if (ret) { tracing_stop_cmdline_record(); pr_info("event trace: Could not enable event " "%s\n", call->name); break; } - call->flags |= TRACE_EVENT_FL_ENABLED; + set_bit(FTRACE_EVENT_FL_ENABLED_BIT, &file->flags); + + /* WAS_ENABLED gets set but never cleared. */ + call->flags |= TRACE_EVENT_FL_WAS_ENABLED; } break; } @@ -238,13 +322,19 @@ static int ftrace_event_enable_disable(struct ftrace_event_call *call, return ret; } -static void ftrace_clear_events(void) +static int ftrace_event_enable_disable(struct ftrace_event_file *file, + int enable) { - struct ftrace_event_call *call; + return __ftrace_event_enable_disable(file, enable, 0); +} + +static void ftrace_clear_events(struct trace_array *tr) +{ + struct ftrace_event_file *file; mutex_lock(&event_mutex); - list_for_each_entry(call, &ftrace_events, list) { - ftrace_event_enable_disable(call, 0); + list_for_each_entry(file, &tr->events, list) { + ftrace_event_enable_disable(file, 0); } mutex_unlock(&event_mutex); } @@ -257,11 +347,12 @@ static void __put_system(struct event_subsystem *system) if (--system->ref_count) return; + list_del(&system->list); + if (filter) { kfree(filter->filter_string); kfree(filter); } - kfree(system->name); kfree(system); } @@ -271,24 +362,45 @@ static void __get_system(struct event_subsystem *system) system->ref_count++; } -static void put_system(struct event_subsystem *system) +static void __get_system_dir(struct ftrace_subsystem_dir *dir) +{ + WARN_ON_ONCE(dir->ref_count == 0); + dir->ref_count++; + __get_system(dir->subsystem); +} + +static void __put_system_dir(struct ftrace_subsystem_dir *dir) +{ + WARN_ON_ONCE(dir->ref_count == 0); + /* If the subsystem is about to be freed, the dir must be too */ + WARN_ON_ONCE(dir->subsystem->ref_count == 1 && dir->ref_count != 1); + + __put_system(dir->subsystem); + if (!--dir->ref_count) + kfree(dir); +} + +static void put_system(struct ftrace_subsystem_dir *dir) { mutex_lock(&event_mutex); - __put_system(system); + __put_system_dir(dir); mutex_unlock(&event_mutex); } /* * __ftrace_set_clr_event(NULL, NULL, NULL, set) will set/unset all events. */ -static int __ftrace_set_clr_event(const char *match, const char *sub, - const char *event, int set) +static int __ftrace_set_clr_event(struct trace_array *tr, const char *match, + const char *sub, const char *event, int set) { + struct ftrace_event_file *file; struct ftrace_event_call *call; int ret = -EINVAL; mutex_lock(&event_mutex); - list_for_each_entry(call, &ftrace_events, list) { + list_for_each_entry(file, &tr->events, list) { + + call = file->event_call; if (!call->name || !call->class || !call->class->reg) continue; @@ -307,7 +419,7 @@ static int __ftrace_set_clr_event(const char *match, const char *sub, if (event && strcmp(event, call->name) != 0) continue; - ftrace_event_enable_disable(call, set); + ftrace_event_enable_disable(file, set); ret = 0; } @@ -316,7 +428,7 @@ static int __ftrace_set_clr_event(const char *match, const char *sub, return ret; } -static int ftrace_set_clr_event(char *buf, int set) +static int ftrace_set_clr_event(struct trace_array *tr, char *buf, int set) { char *event = NULL, *sub = NULL, *match; @@ -344,7 +456,7 @@ static int ftrace_set_clr_event(char *buf, int set) event = NULL; } - return __ftrace_set_clr_event(match, sub, event, set); + return __ftrace_set_clr_event(tr, match, sub, event, set); } /** @@ -361,7 +473,9 @@ static int ftrace_set_clr_event(char *buf, int set) */ int trace_set_clr_event(const char *system, const char *event, int set) { - return __ftrace_set_clr_event(NULL, system, event, set); + struct trace_array *tr = top_trace_array(); + + return __ftrace_set_clr_event(tr, NULL, system, event, set); } EXPORT_SYMBOL_GPL(trace_set_clr_event); @@ -373,6 +487,8 @@ ftrace_event_write(struct file *file, const char __user *ubuf, size_t cnt, loff_t *ppos) { struct trace_parser parser; + struct seq_file *m = file->private_data; + struct trace_array *tr = m->private; ssize_t read, ret; if (!cnt) @@ -395,7 +511,7 @@ ftrace_event_write(struct file *file, const char __user *ubuf, parser.buffer[parser.idx] = 0; - ret = ftrace_set_clr_event(parser.buffer + !set, set); + ret = ftrace_set_clr_event(tr, parser.buffer + !set, set); if (ret) goto out_put; } @@ -411,17 +527,20 @@ ftrace_event_write(struct file *file, const char __user *ubuf, static void * t_next(struct seq_file *m, void *v, loff_t *pos) { - struct ftrace_event_call *call = v; + struct ftrace_event_file *file = v; + struct ftrace_event_call *call; + struct trace_array *tr = m->private; (*pos)++; - list_for_each_entry_continue(call, &ftrace_events, list) { + list_for_each_entry_continue(file, &tr->events, list) { + call = file->event_call; /* * The ftrace subsystem is for showing formats only. * They can not be enabled or disabled via the event files. */ if (call->class && call->class->reg) - return call; + return file; } return NULL; @@ -429,30 +548,32 @@ t_next(struct seq_file *m, void *v, loff_t *pos) static void *t_start(struct seq_file *m, loff_t *pos) { - struct ftrace_event_call *call; + struct ftrace_event_file *file; + struct trace_array *tr = m->private; loff_t l; mutex_lock(&event_mutex); - call = list_entry(&ftrace_events, struct ftrace_event_call, list); + file = list_entry(&tr->events, struct ftrace_event_file, list); for (l = 0; l <= *pos; ) { - call = t_next(m, call, &l); - if (!call) + file = t_next(m, file, &l); + if (!file) break; } - return call; + return file; } static void * s_next(struct seq_file *m, void *v, loff_t *pos) { - struct ftrace_event_call *call = v; + struct ftrace_event_file *file = v; + struct trace_array *tr = m->private; (*pos)++; - list_for_each_entry_continue(call, &ftrace_events, list) { - if (call->flags & TRACE_EVENT_FL_ENABLED) - return call; + list_for_each_entry_continue(file, &tr->events, list) { + if (file->flags & FTRACE_EVENT_FL_ENABLED) + return file; } return NULL; @@ -460,23 +581,25 @@ s_next(struct seq_file *m, void *v, loff_t *pos) static void *s_start(struct seq_file *m, loff_t *pos) { - struct ftrace_event_call *call; + struct ftrace_event_file *file; + struct trace_array *tr = m->private; loff_t l; mutex_lock(&event_mutex); - call = list_entry(&ftrace_events, struct ftrace_event_call, list); + file = list_entry(&tr->events, struct ftrace_event_file, list); for (l = 0; l <= *pos; ) { - call = s_next(m, call, &l); - if (!call) + file = s_next(m, file, &l); + if (!file) break; } - return call; + return file; } static int t_show(struct seq_file *m, void *v) { - struct ftrace_event_call *call = v; + struct ftrace_event_file *file = v; + struct ftrace_event_call *call = file->event_call; if (strcmp(call->class->system, TRACE_SYSTEM) != 0) seq_printf(m, "%s:", call->class->system); @@ -494,25 +617,31 @@ static ssize_t event_enable_read(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos) { - struct ftrace_event_call *call = filp->private_data; + struct ftrace_event_file *file = filp->private_data; char *buf; - if (call->flags & TRACE_EVENT_FL_ENABLED) - buf = "1\n"; - else + if (file->flags & FTRACE_EVENT_FL_ENABLED) { + if (file->flags & FTRACE_EVENT_FL_SOFT_DISABLED) + buf = "0*\n"; + else + buf = "1\n"; + } else buf = "0\n"; - return simple_read_from_buffer(ubuf, cnt, ppos, buf, 2); + return simple_read_from_buffer(ubuf, cnt, ppos, buf, strlen(buf)); } static ssize_t event_enable_write(struct file *filp, const char __user *ubuf, size_t cnt, loff_t *ppos) { - struct ftrace_event_call *call = filp->private_data; + struct ftrace_event_file *file = filp->private_data; unsigned long val; int ret; + if (!file) + return -EINVAL; + ret = kstrtoul_from_user(ubuf, cnt, 10, &val); if (ret) return ret; @@ -525,7 +654,7 @@ event_enable_write(struct file *filp, const char __user *ubuf, size_t cnt, case 0: case 1: mutex_lock(&event_mutex); - ret = ftrace_event_enable_disable(call, val); + ret = ftrace_event_enable_disable(file, val); mutex_unlock(&event_mutex); break; @@ -543,14 +672,18 @@ system_enable_read(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos) { const char set_to_char[4] = { '?', '0', '1', 'X' }; - struct event_subsystem *system = filp->private_data; + struct ftrace_subsystem_dir *dir = filp->private_data; + struct event_subsystem *system = dir->subsystem; struct ftrace_event_call *call; + struct ftrace_event_file *file; + struct trace_array *tr = dir->tr; char buf[2]; int set = 0; int ret; mutex_lock(&event_mutex); - list_for_each_entry(call, &ftrace_events, list) { + list_for_each_entry(file, &tr->events, list) { + call = file->event_call; if (!call->name || !call->class || !call->class->reg) continue; @@ -562,7 +695,7 @@ system_enable_read(struct file *filp, char __user *ubuf, size_t cnt, * or if all events or cleared, or if we have * a mixture. */ - set |= (1 << !!(call->flags & TRACE_EVENT_FL_ENABLED)); + set |= (1 << !!(file->flags & FTRACE_EVENT_FL_ENABLED)); /* * If we have a mixture, no need to look further. @@ -584,7 +717,8 @@ static ssize_t system_enable_write(struct file *filp, const char __user *ubuf, size_t cnt, loff_t *ppos) { - struct event_subsystem *system = filp->private_data; + struct ftrace_subsystem_dir *dir = filp->private_data; + struct event_subsystem *system = dir->subsystem; const char *name = NULL; unsigned long val; ssize_t ret; @@ -607,7 +741,7 @@ system_enable_write(struct file *filp, const char __user *ubuf, size_t cnt, if (system) name = system->name; - ret = __ftrace_set_clr_event(NULL, name, NULL, val); + ret = __ftrace_set_clr_event(dir->tr, NULL, name, NULL, val); if (ret) goto out; @@ -845,43 +979,75 @@ static LIST_HEAD(event_subsystems); static int subsystem_open(struct inode *inode, struct file *filp) { struct event_subsystem *system = NULL; + struct ftrace_subsystem_dir *dir = NULL; /* Initialize for gcc */ + struct trace_array *tr; int ret; - if (!inode->i_private) - goto skip_search; - /* Make sure the system still exists */ mutex_lock(&event_mutex); - list_for_each_entry(system, &event_subsystems, list) { - if (system == inode->i_private) { - /* Don't open systems with no events */ - if (!system->nr_events) { - system = NULL; - break; + list_for_each_entry(tr, &ftrace_trace_arrays, list) { + list_for_each_entry(dir, &tr->systems, list) { + if (dir == inode->i_private) { + /* Don't open systems with no events */ + if (dir->nr_events) { + __get_system_dir(dir); + system = dir->subsystem; + } + goto exit_loop; } - __get_system(system); - break; } } + exit_loop: mutex_unlock(&event_mutex); - if (system != inode->i_private) + if (!system) return -ENODEV; - skip_search: + /* Some versions of gcc think dir can be uninitialized here */ + WARN_ON(!dir); + ret = tracing_open_generic(inode, filp); - if (ret < 0 && system) - put_system(system); + if (ret < 0) + put_system(dir); + + return ret; +} + +static int system_tr_open(struct inode *inode, struct file *filp) +{ + struct ftrace_subsystem_dir *dir; + struct trace_array *tr = inode->i_private; + int ret; + + /* Make a temporary dir that has no system but points to tr */ + dir = kzalloc(sizeof(*dir), GFP_KERNEL); + if (!dir) + return -ENOMEM; + + dir->tr = tr; + + ret = tracing_open_generic(inode, filp); + if (ret < 0) + kfree(dir); + + filp->private_data = dir; return ret; } static int subsystem_release(struct inode *inode, struct file *file) { - struct event_subsystem *system = inode->i_private; + struct ftrace_subsystem_dir *dir = file->private_data; - if (system) - put_system(system); + /* + * If dir->subsystem is NULL, then this is a temporary + * descriptor that was made for a trace_array to enable + * all subsystems. + */ + if (dir->subsystem) + put_system(dir); + else + kfree(dir); return 0; } @@ -890,7 +1056,8 @@ static ssize_t subsystem_filter_read(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos) { - struct event_subsystem *system = filp->private_data; + struct ftrace_subsystem_dir *dir = filp->private_data; + struct event_subsystem *system = dir->subsystem; struct trace_seq *s; int r; @@ -915,7 +1082,7 @@ static ssize_t subsystem_filter_write(struct file *filp, const char __user *ubuf, size_t cnt, loff_t *ppos) { - struct event_subsystem *system = filp->private_data; + struct ftrace_subsystem_dir *dir = filp->private_data; char *buf; int err; @@ -932,7 +1099,7 @@ subsystem_filter_write(struct file *filp, const char __user *ubuf, size_t cnt, } buf[cnt] = '\0'; - err = apply_subsystem_event_filter(system, buf); + err = apply_subsystem_event_filter(dir, buf); free_page((unsigned long) buf); if (err < 0) return err; @@ -1041,30 +1208,35 @@ static const struct file_operations ftrace_system_enable_fops = { .release = subsystem_release, }; +static const struct file_operations ftrace_tr_enable_fops = { + .open = system_tr_open, + .read = system_enable_read, + .write = system_enable_write, + .llseek = default_llseek, + .release = subsystem_release, +}; + static const struct file_operations ftrace_show_header_fops = { .open = tracing_open_generic, .read = show_header, .llseek = default_llseek, }; -static struct dentry *event_trace_events_dir(void) +static int +ftrace_event_open(struct inode *inode, struct file *file, + const struct seq_operations *seq_ops) { - static struct dentry *d_tracer; - static struct dentry *d_events; - - if (d_events) - return d_events; - - d_tracer = tracing_init_dentry(); - if (!d_tracer) - return NULL; + struct seq_file *m; + int ret; - d_events = debugfs_create_dir("events", d_tracer); - if (!d_events) - pr_warning("Could not create debugfs " - "'events' directory\n"); + ret = seq_open(file, seq_ops); + if (ret < 0) + return ret; + m = file->private_data; + /* copy tr over to seq ops */ + m->private = inode->i_private; - return d_events; + return ret; } static int @@ -1072,117 +1244,165 @@ ftrace_event_avail_open(struct inode *inode, struct file *file) { const struct seq_operations *seq_ops = &show_event_seq_ops; - return seq_open(file, seq_ops); + return ftrace_event_open(inode, file, seq_ops); } static int ftrace_event_set_open(struct inode *inode, struct file *file) { const struct seq_operations *seq_ops = &show_set_event_seq_ops; + struct trace_array *tr = inode->i_private; if ((file->f_mode & FMODE_WRITE) && (file->f_flags & O_TRUNC)) - ftrace_clear_events(); + ftrace_clear_events(tr); - return seq_open(file, seq_ops); + return ftrace_event_open(inode, file, seq_ops); +} + +static struct event_subsystem * +create_new_subsystem(const char *name) +{ + struct event_subsystem *system; + + /* need to create new entry */ + system = kmalloc(sizeof(*system), GFP_KERNEL); + if (!system) + return NULL; + + system->ref_count = 1; + system->name = name; + + system->filter = NULL; + + system->filter = kzalloc(sizeof(struct event_filter), GFP_KERNEL); + if (!system->filter) + goto out_free; + + list_add(&system->list, &event_subsystems); + + return system; + + out_free: + kfree(system); + return NULL; } static struct dentry * -event_subsystem_dir(const char *name, struct dentry *d_events) +event_subsystem_dir(struct trace_array *tr, const char *name, + struct ftrace_event_file *file, struct dentry *parent) { + struct ftrace_subsystem_dir *dir; struct event_subsystem *system; struct dentry *entry; /* First see if we did not already create this dir */ - list_for_each_entry(system, &event_subsystems, list) { + list_for_each_entry(dir, &tr->systems, list) { + system = dir->subsystem; if (strcmp(system->name, name) == 0) { - system->nr_events++; - return system->entry; + dir->nr_events++; + file->system = dir; + return dir->entry; } } - /* need to create new entry */ - system = kmalloc(sizeof(*system), GFP_KERNEL); - if (!system) { - pr_warning("No memory to create event subsystem %s\n", - name); - return d_events; + /* Now see if the system itself exists. */ + list_for_each_entry(system, &event_subsystems, list) { + if (strcmp(system->name, name) == 0) + break; } + /* Reset system variable when not found */ + if (&system->list == &event_subsystems) + system = NULL; - system->entry = debugfs_create_dir(name, d_events); - if (!system->entry) { - pr_warning("Could not create event subsystem %s\n", - name); - kfree(system); - return d_events; - } + dir = kmalloc(sizeof(*dir), GFP_KERNEL); + if (!dir) + goto out_fail; - system->nr_events = 1; - system->ref_count = 1; - system->name = kstrdup(name, GFP_KERNEL); - if (!system->name) { - debugfs_remove(system->entry); - kfree(system); - return d_events; + if (!system) { + system = create_new_subsystem(name); + if (!system) + goto out_free; + } else + __get_system(system); + + dir->entry = debugfs_create_dir(name, parent); + if (!dir->entry) { + pr_warning("Failed to create system directory %s\n", name); + __put_system(system); + goto out_free; } - list_add(&system->list, &event_subsystems); - - system->filter = NULL; - - system->filter = kzalloc(sizeof(struct event_filter), GFP_KERNEL); - if (!system->filter) { - pr_warning("Could not allocate filter for subsystem " - "'%s'\n", name); - return system->entry; - } + dir->tr = tr; + dir->ref_count = 1; + dir->nr_events = 1; + dir->subsystem = system; + file->system = dir; - entry = debugfs_create_file("filter", 0644, system->entry, system, + entry = debugfs_create_file("filter", 0644, dir->entry, dir, &ftrace_subsystem_filter_fops); if (!entry) { kfree(system->filter); system->filter = NULL; - pr_warning("Could not create debugfs " - "'%s/filter' entry\n", name); + pr_warning("Could not create debugfs '%s/filter' entry\n", name); } - trace_create_file("enable", 0644, system->entry, system, + trace_create_file("enable", 0644, dir->entry, dir, &ftrace_system_enable_fops); - return system->entry; + list_add(&dir->list, &tr->systems); + + return dir->entry; + + out_free: + kfree(dir); + out_fail: + /* Only print this message if failed on memory allocation */ + if (!dir || !system) + pr_warning("No memory to create event subsystem %s\n", + name); + return NULL; } static int -event_create_dir(struct ftrace_event_call *call, struct dentry *d_events, +event_create_dir(struct dentry *parent, + struct ftrace_event_file *file, const struct file_operations *id, const struct file_operations *enable, const struct file_operations *filter, const struct file_operations *format) { + struct ftrace_event_call *call = file->event_call; + struct trace_array *tr = file->tr; struct list_head *head; + struct dentry *d_events; int ret; /* * If the trace point header did not define TRACE_SYSTEM * then the system would be called "TRACE_SYSTEM". */ - if (strcmp(call->class->system, TRACE_SYSTEM) != 0) - d_events = event_subsystem_dir(call->class->system, d_events); - - call->dir = debugfs_create_dir(call->name, d_events); - if (!call->dir) { - pr_warning("Could not create debugfs " - "'%s' directory\n", call->name); + if (strcmp(call->class->system, TRACE_SYSTEM) != 0) { + d_events = event_subsystem_dir(tr, call->class->system, file, parent); + if (!d_events) + return -ENOMEM; + } else + d_events = parent; + + file->dir = debugfs_create_dir(call->name, d_events); + if (!file->dir) { + pr_warning("Could not create debugfs '%s' directory\n", + call->name); return -1; } if (call->class->reg && !(call->flags & TRACE_EVENT_FL_IGNORE_ENABLE)) - trace_create_file("enable", 0644, call->dir, call, + trace_create_file("enable", 0644, file->dir, file, enable); #ifdef CONFIG_PERF_EVENTS if (call->event.type && call->class->reg) - trace_create_file("id", 0444, call->dir, call, + trace_create_file("id", 0444, file->dir, call, id); #endif @@ -1196,23 +1416,76 @@ event_create_dir(struct ftrace_event_call *call, struct dentry *d_events, if (ret < 0) { pr_warning("Could not initialize trace point" " events/%s\n", call->name); - return ret; + return -1; } } - trace_create_file("filter", 0644, call->dir, call, + trace_create_file("filter", 0644, file->dir, call, filter); - trace_create_file("format", 0444, call->dir, call, + trace_create_file("format", 0444, file->dir, call, format); return 0; } +static void remove_subsystem(struct ftrace_subsystem_dir *dir) +{ + if (!dir) + return; + + if (!--dir->nr_events) { + debugfs_remove_recursive(dir->entry); + list_del(&dir->list); + __put_system_dir(dir); + } +} + +static void remove_event_from_tracers(struct ftrace_event_call *call) +{ + struct ftrace_event_file *file; + struct trace_array *tr; + + do_for_each_event_file_safe(tr, file) { + + if (file->event_call != call) + continue; + + list_del(&file->list); + debugfs_remove_recursive(file->dir); + remove_subsystem(file->system); + kmem_cache_free(file_cachep, file); + + /* + * The do_for_each_event_file_safe() is + * a double loop. After finding the call for this + * trace_array, we use break to jump to the next + * trace_array. + */ + break; + } while_for_each_event_file(); +} + static void event_remove(struct ftrace_event_call *call) { - ftrace_event_enable_disable(call, 0); + struct trace_array *tr; + struct ftrace_event_file *file; + + do_for_each_event_file(tr, file) { + if (file->event_call != call) + continue; + ftrace_event_enable_disable(file, 0); + /* + * The do_for_each_event_file() is + * a double loop. After finding the call for this + * trace_array, we use break to jump to the next + * trace_array. + */ + break; + } while_for_each_event_file(); + if (call->event.funcs) __unregister_ftrace_event(&call->event); + remove_event_from_tracers(call); list_del(&call->list); } @@ -1234,82 +1507,99 @@ static int event_init(struct ftrace_event_call *call) } static int -__trace_add_event_call(struct ftrace_event_call *call, struct module *mod, - const struct file_operations *id, - const struct file_operations *enable, - const struct file_operations *filter, - const struct file_operations *format) +__register_event(struct ftrace_event_call *call, struct module *mod) { - struct dentry *d_events; int ret; ret = event_init(call); if (ret < 0) return ret; - d_events = event_trace_events_dir(); - if (!d_events) - return -ENOENT; - - ret = event_create_dir(call, d_events, id, enable, filter, format); - if (!ret) - list_add(&call->list, &ftrace_events); + list_add(&call->list, &ftrace_events); call->mod = mod; - return ret; + return 0; +} + +/* Add an event to a trace directory */ +static int +__trace_add_new_event(struct ftrace_event_call *call, + struct trace_array *tr, + const struct file_operations *id, + const struct file_operations *enable, + const struct file_operations *filter, + const struct file_operations *format) +{ + struct ftrace_event_file *file; + + file = kmem_cache_alloc(file_cachep, GFP_TRACE); + if (!file) + return -ENOMEM; + + file->event_call = call; + file->tr = tr; + list_add(&file->list, &tr->events); + + return event_create_dir(tr->event_dir, file, id, enable, filter, format); } +/* + * Just create a decriptor for early init. A descriptor is required + * for enabling events at boot. We want to enable events before + * the filesystem is initialized. + */ +static __init int +__trace_early_add_new_event(struct ftrace_event_call *call, + struct trace_array *tr) +{ + struct ftrace_event_file *file; + + file = kmem_cache_alloc(file_cachep, GFP_TRACE); + if (!file) + return -ENOMEM; + + file->event_call = call; + file->tr = tr; + list_add(&file->list, &tr->events); + + return 0; +} + +struct ftrace_module_file_ops; +static void __add_event_to_tracers(struct ftrace_event_call *call, + struct ftrace_module_file_ops *file_ops); + /* Add an additional event_call dynamically */ int trace_add_event_call(struct ftrace_event_call *call) { int ret; mutex_lock(&event_mutex); - ret = __trace_add_event_call(call, NULL, &ftrace_event_id_fops, - &ftrace_enable_fops, - &ftrace_event_filter_fops, - &ftrace_event_format_fops); - mutex_unlock(&event_mutex); - return ret; -} -static void remove_subsystem_dir(const char *name) -{ - struct event_subsystem *system; - - if (strcmp(name, TRACE_SYSTEM) == 0) - return; + ret = __register_event(call, NULL); + if (ret >= 0) + __add_event_to_tracers(call, NULL); - list_for_each_entry(system, &event_subsystems, list) { - if (strcmp(system->name, name) == 0) { - if (!--system->nr_events) { - debugfs_remove_recursive(system->entry); - list_del(&system->list); - __put_system(system); - } - break; - } - } + mutex_unlock(&event_mutex); + return ret; } /* - * Must be called under locking both of event_mutex and trace_event_mutex. + * Must be called under locking both of event_mutex and trace_event_sem. */ static void __trace_remove_event_call(struct ftrace_event_call *call) { event_remove(call); trace_destroy_fields(call); destroy_preds(call); - debugfs_remove_recursive(call->dir); - remove_subsystem_dir(call->class->system); } /* Remove an event_call */ void trace_remove_event_call(struct ftrace_event_call *call) { mutex_lock(&event_mutex); - down_write(&trace_event_mutex); + down_write(&trace_event_sem); __trace_remove_event_call(call); - up_write(&trace_event_mutex); + up_write(&trace_event_sem); mutex_unlock(&event_mutex); } @@ -1336,6 +1626,26 @@ struct ftrace_module_file_ops { }; static struct ftrace_module_file_ops * +find_ftrace_file_ops(struct ftrace_module_file_ops *file_ops, struct module *mod) +{ + /* + * As event_calls are added in groups by module, + * when we find one file_ops, we don't need to search for + * each call in that module, as the rest should be the + * same. Only search for a new one if the last one did + * not match. + */ + if (file_ops && mod == file_ops->mod) + return file_ops; + + list_for_each_entry(file_ops, &ftrace_module_file_list, list) { + if (file_ops->mod == mod) + return file_ops; + } + return NULL; +} + +static struct ftrace_module_file_ops * trace_create_file_ops(struct module *mod) { struct ftrace_module_file_ops *file_ops; @@ -1386,9 +1696,8 @@ static void trace_module_add_events(struct module *mod) return; for_each_event(call, start, end) { - __trace_add_event_call(*call, mod, - &file_ops->id, &file_ops->enable, - &file_ops->filter, &file_ops->format); + __register_event(*call, mod); + __add_event_to_tracers(*call, file_ops); } } @@ -1396,12 +1705,13 @@ static void trace_module_remove_events(struct module *mod) { struct ftrace_module_file_ops *file_ops; struct ftrace_event_call *call, *p; - bool found = false; + bool clear_trace = false; - down_write(&trace_event_mutex); + down_write(&trace_event_sem); list_for_each_entry_safe(call, p, &ftrace_events, list) { if (call->mod == mod) { - found = true; + if (call->flags & TRACE_EVENT_FL_WAS_ENABLED) + clear_trace = true; __trace_remove_event_call(call); } } @@ -1415,14 +1725,18 @@ static void trace_module_remove_events(struct module *mod) list_del(&file_ops->list); kfree(file_ops); } + up_write(&trace_event_sem); /* * It is safest to reset the ring buffer if the module being unloaded - * registered any events. + * registered any events that were used. The only worry is if + * a new module gets loaded, and takes on the same id as the events + * of this module. When printing out the buffer, traced events left + * over from this module may be passed to the new module events and + * unexpected results may occur. */ - if (found) - tracing_reset_current_online_cpus(); - up_write(&trace_event_mutex); + if (clear_trace) + tracing_reset_all_online_cpus(); } static int trace_module_notify(struct notifier_block *self, @@ -1443,14 +1757,433 @@ static int trace_module_notify(struct notifier_block *self, return 0; } + +static int +__trace_add_new_mod_event(struct ftrace_event_call *call, + struct trace_array *tr, + struct ftrace_module_file_ops *file_ops) +{ + return __trace_add_new_event(call, tr, + &file_ops->id, &file_ops->enable, + &file_ops->filter, &file_ops->format); +} + #else -static int trace_module_notify(struct notifier_block *self, - unsigned long val, void *data) +static inline struct ftrace_module_file_ops * +find_ftrace_file_ops(struct ftrace_module_file_ops *file_ops, struct module *mod) +{ + return NULL; +} +static inline int trace_module_notify(struct notifier_block *self, + unsigned long val, void *data) { return 0; } +static inline int +__trace_add_new_mod_event(struct ftrace_event_call *call, + struct trace_array *tr, + struct ftrace_module_file_ops *file_ops) +{ + return -ENODEV; +} #endif /* CONFIG_MODULES */ +/* Create a new event directory structure for a trace directory. */ +static void +__trace_add_event_dirs(struct trace_array *tr) +{ + struct ftrace_module_file_ops *file_ops = NULL; + struct ftrace_event_call *call; + int ret; + + list_for_each_entry(call, &ftrace_events, list) { + if (call->mod) { + /* + * Directories for events by modules need to + * keep module ref counts when opened (as we don't + * want the module to disappear when reading one + * of these files). The file_ops keep account of + * the module ref count. + */ + file_ops = find_ftrace_file_ops(file_ops, call->mod); + if (!file_ops) + continue; /* Warn? */ + ret = __trace_add_new_mod_event(call, tr, file_ops); + if (ret < 0) + pr_warning("Could not create directory for event %s\n", + call->name); + continue; + } + ret = __trace_add_new_event(call, tr, + &ftrace_event_id_fops, + &ftrace_enable_fops, + &ftrace_event_filter_fops, + &ftrace_event_format_fops); + if (ret < 0) + pr_warning("Could not create directory for event %s\n", + call->name); + } +} + +#ifdef CONFIG_DYNAMIC_FTRACE + +/* Avoid typos */ +#define ENABLE_EVENT_STR "enable_event" +#define DISABLE_EVENT_STR "disable_event" + +struct event_probe_data { + struct ftrace_event_file *file; + unsigned long count; + int ref; + bool enable; +}; + +static struct ftrace_event_file * +find_event_file(struct trace_array *tr, const char *system, const char *event) +{ + struct ftrace_event_file *file; + struct ftrace_event_call *call; + + list_for_each_entry(file, &tr->events, list) { + + call = file->event_call; + + if (!call->name || !call->class || !call->class->reg) + continue; + + if (call->flags & TRACE_EVENT_FL_IGNORE_ENABLE) + continue; + + if (strcmp(event, call->name) == 0 && + strcmp(system, call->class->system) == 0) + return file; + } + return NULL; +} + +static void +event_enable_probe(unsigned long ip, unsigned long parent_ip, void **_data) +{ + struct event_probe_data **pdata = (struct event_probe_data **)_data; + struct event_probe_data *data = *pdata; + + if (!data) + return; + + if (data->enable) + clear_bit(FTRACE_EVENT_FL_SOFT_DISABLED_BIT, &data->file->flags); + else + set_bit(FTRACE_EVENT_FL_SOFT_DISABLED_BIT, &data->file->flags); +} + +static void +event_enable_count_probe(unsigned long ip, unsigned long parent_ip, void **_data) +{ + struct event_probe_data **pdata = (struct event_probe_data **)_data; + struct event_probe_data *data = *pdata; + + if (!data) + return; + + if (!data->count) + return; + + /* Skip if the event is in a state we want to switch to */ + if (data->enable == !(data->file->flags & FTRACE_EVENT_FL_SOFT_DISABLED)) + return; + + if (data->count != -1) + (data->count)--; + + event_enable_probe(ip, parent_ip, _data); +} + +static int +event_enable_print(struct seq_file *m, unsigned long ip, + struct ftrace_probe_ops *ops, void *_data) +{ + struct event_probe_data *data = _data; + + seq_printf(m, "%ps:", (void *)ip); + + seq_printf(m, "%s:%s:%s", + data->enable ? ENABLE_EVENT_STR : DISABLE_EVENT_STR, + data->file->event_call->class->system, + data->file->event_call->name); + + if (data->count == -1) + seq_printf(m, ":unlimited\n"); + else + seq_printf(m, ":count=%ld\n", data->count); + + return 0; +} + +static int +event_enable_init(struct ftrace_probe_ops *ops, unsigned long ip, + void **_data) +{ + struct event_probe_data **pdata = (struct event_probe_data **)_data; + struct event_probe_data *data = *pdata; + + data->ref++; + return 0; +} + +static void +event_enable_free(struct ftrace_probe_ops *ops, unsigned long ip, + void **_data) +{ + struct event_probe_data **pdata = (struct event_probe_data **)_data; + struct event_probe_data *data = *pdata; + + if (WARN_ON_ONCE(data->ref <= 0)) + return; + + data->ref--; + if (!data->ref) { + /* Remove the SOFT_MODE flag */ + __ftrace_event_enable_disable(data->file, 0, 1); + module_put(data->file->event_call->mod); + kfree(data); + } + *pdata = NULL; +} + +static struct ftrace_probe_ops event_enable_probe_ops = { + .func = event_enable_probe, + .print = event_enable_print, + .init = event_enable_init, + .free = event_enable_free, +}; + +static struct ftrace_probe_ops event_enable_count_probe_ops = { + .func = event_enable_count_probe, + .print = event_enable_print, + .init = event_enable_init, + .free = event_enable_free, +}; + +static struct ftrace_probe_ops event_disable_probe_ops = { + .func = event_enable_probe, + .print = event_enable_print, + .init = event_enable_init, + .free = event_enable_free, +}; + +static struct ftrace_probe_ops event_disable_count_probe_ops = { + .func = event_enable_count_probe, + .print = event_enable_print, + .init = event_enable_init, + .free = event_enable_free, +}; + +static int +event_enable_func(struct ftrace_hash *hash, + char *glob, char *cmd, char *param, int enabled) +{ + struct trace_array *tr = top_trace_array(); + struct ftrace_event_file *file; + struct ftrace_probe_ops *ops; + struct event_probe_data *data; + const char *system; + const char *event; + char *number; + bool enable; + int ret; + + /* hash funcs only work with set_ftrace_filter */ + if (!enabled) + return -EINVAL; + + if (!param) + return -EINVAL; + + system = strsep(¶m, ":"); + if (!param) + return -EINVAL; + + event = strsep(¶m, ":"); + + mutex_lock(&event_mutex); + + ret = -EINVAL; + file = find_event_file(tr, system, event); + if (!file) + goto out; + + enable = strcmp(cmd, ENABLE_EVENT_STR) == 0; + + if (enable) + ops = param ? &event_enable_count_probe_ops : &event_enable_probe_ops; + else + ops = param ? &event_disable_count_probe_ops : &event_disable_probe_ops; + + if (glob[0] == '!') { + unregister_ftrace_function_probe_func(glob+1, ops); + ret = 0; + goto out; + } + + ret = -ENOMEM; + data = kzalloc(sizeof(*data), GFP_KERNEL); + if (!data) + goto out; + + data->enable = enable; + data->count = -1; + data->file = file; + + if (!param) + goto out_reg; + + number = strsep(¶m, ":"); + + ret = -EINVAL; + if (!strlen(number)) + goto out_free; + + /* + * We use the callback data field (which is a pointer) + * as our counter. + */ + ret = kstrtoul(number, 0, &data->count); + if (ret) + goto out_free; + + out_reg: + /* Don't let event modules unload while probe registered */ + ret = try_module_get(file->event_call->mod); + if (!ret) + goto out_free; + + ret = __ftrace_event_enable_disable(file, 1, 1); + if (ret < 0) + goto out_put; + ret = register_ftrace_function_probe(glob, ops, data); + if (!ret) + goto out_disable; + out: + mutex_unlock(&event_mutex); + return ret; + + out_disable: + __ftrace_event_enable_disable(file, 0, 1); + out_put: + module_put(file->event_call->mod); + out_free: + kfree(data); + goto out; +} + +static struct ftrace_func_command event_enable_cmd = { + .name = ENABLE_EVENT_STR, + .func = event_enable_func, +}; + +static struct ftrace_func_command event_disable_cmd = { + .name = DISABLE_EVENT_STR, + .func = event_enable_func, +}; + +static __init int register_event_cmds(void) +{ + int ret; + + ret = register_ftrace_command(&event_enable_cmd); + if (WARN_ON(ret < 0)) + return ret; + ret = register_ftrace_command(&event_disable_cmd); + if (WARN_ON(ret < 0)) + unregister_ftrace_command(&event_enable_cmd); + return ret; +} +#else +static inline int register_event_cmds(void) { return 0; } +#endif /* CONFIG_DYNAMIC_FTRACE */ + +/* + * The top level array has already had its ftrace_event_file + * descriptors created in order to allow for early events to + * be recorded. This function is called after the debugfs has been + * initialized, and we now have to create the files associated + * to the events. + */ +static __init void +__trace_early_add_event_dirs(struct trace_array *tr) +{ + struct ftrace_event_file *file; + int ret; + + + list_for_each_entry(file, &tr->events, list) { + ret = event_create_dir(tr->event_dir, file, + &ftrace_event_id_fops, + &ftrace_enable_fops, + &ftrace_event_filter_fops, + &ftrace_event_format_fops); + if (ret < 0) + pr_warning("Could not create directory for event %s\n", + file->event_call->name); + } +} + +/* + * For early boot up, the top trace array requires to have + * a list of events that can be enabled. This must be done before + * the filesystem is set up in order to allow events to be traced + * early. + */ +static __init void +__trace_early_add_events(struct trace_array *tr) +{ + struct ftrace_event_call *call; + int ret; + + list_for_each_entry(call, &ftrace_events, list) { + /* Early boot up should not have any modules loaded */ + if (WARN_ON_ONCE(call->mod)) + continue; + + ret = __trace_early_add_new_event(call, tr); + if (ret < 0) + pr_warning("Could not create early event %s\n", + call->name); + } +} + +/* Remove the event directory structure for a trace directory. */ +static void +__trace_remove_event_dirs(struct trace_array *tr) +{ + struct ftrace_event_file *file, *next; + + list_for_each_entry_safe(file, next, &tr->events, list) { + list_del(&file->list); + debugfs_remove_recursive(file->dir); + remove_subsystem(file->system); + kmem_cache_free(file_cachep, file); + } +} + +static void +__add_event_to_tracers(struct ftrace_event_call *call, + struct ftrace_module_file_ops *file_ops) +{ + struct trace_array *tr; + + list_for_each_entry(tr, &ftrace_trace_arrays, list) { + if (file_ops) + __trace_add_new_mod_event(call, tr, file_ops); + else + __trace_add_new_event(call, tr, + &ftrace_event_id_fops, + &ftrace_enable_fops, + &ftrace_event_filter_fops, + &ftrace_event_format_fops); + } +} + static struct notifier_block trace_module_nb = { .notifier_call = trace_module_notify, .priority = 0, @@ -1464,15 +2197,135 @@ static char bootup_event_buf[COMMAND_LINE_SIZE] __initdata; static __init int setup_trace_event(char *str) { strlcpy(bootup_event_buf, str, COMMAND_LINE_SIZE); - ring_buffer_expanded = 1; - tracing_selftest_disabled = 1; + ring_buffer_expanded = true; + tracing_selftest_disabled = true; return 1; } __setup("trace_event=", setup_trace_event); +/* Expects to have event_mutex held when called */ +static int +create_event_toplevel_files(struct dentry *parent, struct trace_array *tr) +{ + struct dentry *d_events; + struct dentry *entry; + + entry = debugfs_create_file("set_event", 0644, parent, + tr, &ftrace_set_event_fops); + if (!entry) { + pr_warning("Could not create debugfs 'set_event' entry\n"); + return -ENOMEM; + } + + d_events = debugfs_create_dir("events", parent); + if (!d_events) { + pr_warning("Could not create debugfs 'events' directory\n"); + return -ENOMEM; + } + + /* ring buffer internal formats */ + trace_create_file("header_page", 0444, d_events, + ring_buffer_print_page_header, + &ftrace_show_header_fops); + + trace_create_file("header_event", 0444, d_events, + ring_buffer_print_entry_header, + &ftrace_show_header_fops); + + trace_create_file("enable", 0644, d_events, + tr, &ftrace_tr_enable_fops); + + tr->event_dir = d_events; + + return 0; +} + +/** + * event_trace_add_tracer - add a instance of a trace_array to events + * @parent: The parent dentry to place the files/directories for events in + * @tr: The trace array associated with these events + * + * When a new instance is created, it needs to set up its events + * directory, as well as other files associated with events. It also + * creates the event hierachry in the @parent/events directory. + * + * Returns 0 on success. + */ +int event_trace_add_tracer(struct dentry *parent, struct trace_array *tr) +{ + int ret; + + mutex_lock(&event_mutex); + + ret = create_event_toplevel_files(parent, tr); + if (ret) + goto out_unlock; + + down_write(&trace_event_sem); + __trace_add_event_dirs(tr); + up_write(&trace_event_sem); + + out_unlock: + mutex_unlock(&event_mutex); + + return ret; +} + +/* + * The top trace array already had its file descriptors created. + * Now the files themselves need to be created. + */ +static __init int +early_event_add_tracer(struct dentry *parent, struct trace_array *tr) +{ + int ret; + + mutex_lock(&event_mutex); + + ret = create_event_toplevel_files(parent, tr); + if (ret) + goto out_unlock; + + down_write(&trace_event_sem); + __trace_early_add_event_dirs(tr); + up_write(&trace_event_sem); + + out_unlock: + mutex_unlock(&event_mutex); + + return ret; +} + +int event_trace_del_tracer(struct trace_array *tr) +{ + /* Disable any running events */ + __ftrace_set_clr_event(tr, NULL, NULL, NULL, 0); + + mutex_lock(&event_mutex); + + down_write(&trace_event_sem); + __trace_remove_event_dirs(tr); + debugfs_remove_recursive(tr->event_dir); + up_write(&trace_event_sem); + + tr->event_dir = NULL; + + mutex_unlock(&event_mutex); + + return 0; +} + +static __init int event_trace_memsetup(void) +{ + field_cachep = KMEM_CACHE(ftrace_event_field, SLAB_PANIC); + file_cachep = KMEM_CACHE(ftrace_event_file, SLAB_PANIC); + return 0; +} + static __init int event_trace_enable(void) { + struct trace_array *tr = top_trace_array(); struct ftrace_event_call **iter, *call; char *buf = bootup_event_buf; char *token; @@ -1486,6 +2339,14 @@ static __init int event_trace_enable(void) list_add(&call->list, &ftrace_events); } + /* + * We need the top trace array to have a working set of trace + * points at early init, before the debug files and directories + * are created. Create the file entries now, and attach them + * to the actual file dentries later. + */ + __trace_early_add_events(tr); + while (true) { token = strsep(&buf, ","); @@ -1494,73 +2355,43 @@ static __init int event_trace_enable(void) if (!*token) continue; - ret = ftrace_set_clr_event(token, 1); + ret = ftrace_set_clr_event(tr, token, 1); if (ret) pr_warn("Failed to enable trace event: %s\n", token); } trace_printk_start_comm(); + register_event_cmds(); + return 0; } static __init int event_trace_init(void) { - struct ftrace_event_call *call; + struct trace_array *tr; struct dentry *d_tracer; struct dentry *entry; - struct dentry *d_events; int ret; + tr = top_trace_array(); + d_tracer = tracing_init_dentry(); if (!d_tracer) return 0; entry = debugfs_create_file("available_events", 0444, d_tracer, - NULL, &ftrace_avail_fops); + tr, &ftrace_avail_fops); if (!entry) pr_warning("Could not create debugfs " "'available_events' entry\n"); - entry = debugfs_create_file("set_event", 0644, d_tracer, - NULL, &ftrace_set_event_fops); - if (!entry) - pr_warning("Could not create debugfs " - "'set_event' entry\n"); - - d_events = event_trace_events_dir(); - if (!d_events) - return 0; - - /* ring buffer internal formats */ - trace_create_file("header_page", 0444, d_events, - ring_buffer_print_page_header, - &ftrace_show_header_fops); - - trace_create_file("header_event", 0444, d_events, - ring_buffer_print_entry_header, - &ftrace_show_header_fops); - - trace_create_file("enable", 0644, d_events, - NULL, &ftrace_system_enable_fops); - if (trace_define_common_fields()) pr_warning("tracing: Failed to allocate common fields"); - /* - * Early initialization already enabled ftrace event. - * Now it's only necessary to create the event directory. - */ - list_for_each_entry(call, &ftrace_events, list) { - - ret = event_create_dir(call, d_events, - &ftrace_event_id_fops, - &ftrace_enable_fops, - &ftrace_event_filter_fops, - &ftrace_event_format_fops); - if (ret < 0) - event_remove(call); - } + ret = early_event_add_tracer(d_tracer, tr); + if (ret) + return ret; ret = register_module_notifier(&trace_module_nb); if (ret) @@ -1568,6 +2399,7 @@ static __init int event_trace_init(void) return 0; } +early_initcall(event_trace_memsetup); core_initcall(event_trace_enable); fs_initcall(event_trace_init); @@ -1627,13 +2459,20 @@ static __init void event_test_stuff(void) */ static __init void event_trace_self_tests(void) { + struct ftrace_subsystem_dir *dir; + struct ftrace_event_file *file; struct ftrace_event_call *call; struct event_subsystem *system; + struct trace_array *tr; int ret; + tr = top_trace_array(); + pr_info("Running tests on trace events:\n"); - list_for_each_entry(call, &ftrace_events, list) { + list_for_each_entry(file, &tr->events, list) { + + call = file->event_call; /* Only test those that have a probe */ if (!call->class || !call->class->probe) @@ -1657,15 +2496,15 @@ static __init void event_trace_self_tests(void) * If an event is already enabled, someone is using * it and the self test should not be on. */ - if (call->flags & TRACE_EVENT_FL_ENABLED) { + if (file->flags & FTRACE_EVENT_FL_ENABLED) { pr_warning("Enabled event during self test!\n"); WARN_ON_ONCE(1); continue; } - ftrace_event_enable_disable(call, 1); + ftrace_event_enable_disable(file, 1); event_test_stuff(); - ftrace_event_enable_disable(call, 0); + ftrace_event_enable_disable(file, 0); pr_cont("OK\n"); } @@ -1674,7 +2513,9 @@ static __init void event_trace_self_tests(void) pr_info("Running tests on trace event systems:\n"); - list_for_each_entry(system, &event_subsystems, list) { + list_for_each_entry(dir, &tr->systems, list) { + + system = dir->subsystem; /* the ftrace system is special, skip it */ if (strcmp(system->name, "ftrace") == 0) @@ -1682,7 +2523,7 @@ static __init void event_trace_self_tests(void) pr_info("Testing event system %s: ", system->name); - ret = __ftrace_set_clr_event(NULL, system->name, NULL, 1); + ret = __ftrace_set_clr_event(tr, NULL, system->name, NULL, 1); if (WARN_ON_ONCE(ret)) { pr_warning("error enabling system %s\n", system->name); @@ -1691,7 +2532,7 @@ static __init void event_trace_self_tests(void) event_test_stuff(); - ret = __ftrace_set_clr_event(NULL, system->name, NULL, 0); + ret = __ftrace_set_clr_event(tr, NULL, system->name, NULL, 0); if (WARN_ON_ONCE(ret)) { pr_warning("error disabling system %s\n", system->name); @@ -1706,7 +2547,7 @@ static __init void event_trace_self_tests(void) pr_info("Running tests on all trace events:\n"); pr_info("Testing all events: "); - ret = __ftrace_set_clr_event(NULL, NULL, NULL, 1); + ret = __ftrace_set_clr_event(tr, NULL, NULL, NULL, 1); if (WARN_ON_ONCE(ret)) { pr_warning("error enabling all events\n"); return; @@ -1715,7 +2556,7 @@ static __init void event_trace_self_tests(void) event_test_stuff(); /* reset sysname */ - ret = __ftrace_set_clr_event(NULL, NULL, NULL, 0); + ret = __ftrace_set_clr_event(tr, NULL, NULL, NULL, 0); if (WARN_ON_ONCE(ret)) { pr_warning("error disabling all events\n"); return; diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c index e5b0ca8b8d4d..a6361178de5a 100644 --- a/kernel/trace/trace_events_filter.c +++ b/kernel/trace/trace_events_filter.c @@ -658,33 +658,6 @@ void print_subsystem_event_filter(struct event_subsystem *system, mutex_unlock(&event_mutex); } -static struct ftrace_event_field * -__find_event_field(struct list_head *head, char *name) -{ - struct ftrace_event_field *field; - - list_for_each_entry(field, head, link) { - if (!strcmp(field->name, name)) - return field; - } - - return NULL; -} - -static struct ftrace_event_field * -find_event_field(struct ftrace_event_call *call, char *name) -{ - struct ftrace_event_field *field; - struct list_head *head; - - field = __find_event_field(&ftrace_common_fields, name); - if (field) - return field; - - head = trace_get_fields(call); - return __find_event_field(head, name); -} - static int __alloc_pred_stack(struct pred_stack *stack, int n_preds) { stack->preds = kcalloc(n_preds + 1, sizeof(*stack->preds), GFP_KERNEL); @@ -1337,7 +1310,7 @@ static struct filter_pred *create_pred(struct filter_parse_state *ps, return NULL; } - field = find_event_field(call, operand1); + field = trace_find_event_field(call, operand1); if (!field) { parse_error(ps, FILT_ERR_FIELD_NOT_FOUND, 0); return NULL; @@ -1907,16 +1880,17 @@ out_unlock: return err; } -int apply_subsystem_event_filter(struct event_subsystem *system, +int apply_subsystem_event_filter(struct ftrace_subsystem_dir *dir, char *filter_string) { + struct event_subsystem *system = dir->subsystem; struct event_filter *filter; int err = 0; mutex_lock(&event_mutex); /* Make sure the system still has events */ - if (!system->nr_events) { + if (!dir->nr_events) { err = -ENODEV; goto out_unlock; } diff --git a/kernel/trace/trace_export.c b/kernel/trace/trace_export.c index e039906b037d..d21a74670088 100644 --- a/kernel/trace/trace_export.c +++ b/kernel/trace/trace_export.c @@ -129,7 +129,7 @@ static void __always_unused ____ftrace_check_##name(void) \ #undef FTRACE_ENTRY #define FTRACE_ENTRY(name, struct_name, id, tstruct, print, filter) \ -int \ +static int __init \ ftrace_define_fields_##name(struct ftrace_event_call *event_call) \ { \ struct struct_name field; \ @@ -168,7 +168,7 @@ ftrace_define_fields_##name(struct ftrace_event_call *event_call) \ #define FTRACE_ENTRY_REG(call, struct_name, etype, tstruct, print, filter,\ regfn) \ \ -struct ftrace_event_class event_class_ftrace_##call = { \ +struct ftrace_event_class __refdata event_class_ftrace_##call = { \ .system = __stringify(TRACE_SYSTEM), \ .define_fields = ftrace_define_fields_##call, \ .fields = LIST_HEAD_INIT(event_class_ftrace_##call.fields),\ diff --git a/kernel/trace/trace_functions.c b/kernel/trace/trace_functions.c index 601152523326..c4d6d7191988 100644 --- a/kernel/trace/trace_functions.c +++ b/kernel/trace/trace_functions.c @@ -28,7 +28,7 @@ static void tracing_stop_function_trace(void); static int function_trace_init(struct trace_array *tr) { func_trace = tr; - tr->cpu = get_cpu(); + tr->trace_buffer.cpu = get_cpu(); put_cpu(); tracing_start_cmdline_record(); @@ -44,7 +44,7 @@ static void function_trace_reset(struct trace_array *tr) static void function_trace_start(struct trace_array *tr) { - tracing_reset_online_cpus(tr); + tracing_reset_online_cpus(&tr->trace_buffer); } /* Our option */ @@ -76,7 +76,7 @@ function_trace_call(unsigned long ip, unsigned long parent_ip, goto out; cpu = smp_processor_id(); - data = tr->data[cpu]; + data = per_cpu_ptr(tr->trace_buffer.data, cpu); if (!atomic_read(&data->disabled)) { local_save_flags(flags); trace_function(tr, ip, parent_ip, flags, pc); @@ -107,7 +107,7 @@ function_stack_trace_call(unsigned long ip, unsigned long parent_ip, */ local_irq_save(flags); cpu = raw_smp_processor_id(); - data = tr->data[cpu]; + data = per_cpu_ptr(tr->trace_buffer.data, cpu); disabled = atomic_inc_return(&data->disabled); if (likely(disabled == 1)) { @@ -214,66 +214,89 @@ static struct tracer function_trace __read_mostly = }; #ifdef CONFIG_DYNAMIC_FTRACE -static void -ftrace_traceon(unsigned long ip, unsigned long parent_ip, void **data) +static int update_count(void **data) { - long *count = (long *)data; - - if (tracing_is_on()) - return; + unsigned long *count = (long *)data; if (!*count) - return; + return 0; if (*count != -1) (*count)--; - tracing_on(); + return 1; } static void -ftrace_traceoff(unsigned long ip, unsigned long parent_ip, void **data) +ftrace_traceon_count(unsigned long ip, unsigned long parent_ip, void **data) { - long *count = (long *)data; + if (tracing_is_on()) + return; + + if (update_count(data)) + tracing_on(); +} +static void +ftrace_traceoff_count(unsigned long ip, unsigned long parent_ip, void **data) +{ if (!tracing_is_on()) return; - if (!*count) + if (update_count(data)) + tracing_off(); +} + +static void +ftrace_traceon(unsigned long ip, unsigned long parent_ip, void **data) +{ + if (tracing_is_on()) return; - if (*count != -1) - (*count)--; + tracing_on(); +} + +static void +ftrace_traceoff(unsigned long ip, unsigned long parent_ip, void **data) +{ + if (!tracing_is_on()) + return; tracing_off(); } -static int -ftrace_trace_onoff_print(struct seq_file *m, unsigned long ip, - struct ftrace_probe_ops *ops, void *data); +/* + * Skip 4: + * ftrace_stacktrace() + * function_trace_probe_call() + * ftrace_ops_list_func() + * ftrace_call() + */ +#define STACK_SKIP 4 -static struct ftrace_probe_ops traceon_probe_ops = { - .func = ftrace_traceon, - .print = ftrace_trace_onoff_print, -}; +static void +ftrace_stacktrace(unsigned long ip, unsigned long parent_ip, void **data) +{ + trace_dump_stack(STACK_SKIP); +} -static struct ftrace_probe_ops traceoff_probe_ops = { - .func = ftrace_traceoff, - .print = ftrace_trace_onoff_print, -}; +static void +ftrace_stacktrace_count(unsigned long ip, unsigned long parent_ip, void **data) +{ + if (!tracing_is_on()) + return; + + if (update_count(data)) + trace_dump_stack(STACK_SKIP); +} static int -ftrace_trace_onoff_print(struct seq_file *m, unsigned long ip, - struct ftrace_probe_ops *ops, void *data) +ftrace_probe_print(const char *name, struct seq_file *m, + unsigned long ip, void *data) { long count = (long)data; - seq_printf(m, "%ps:", (void *)ip); - - if (ops == &traceon_probe_ops) - seq_printf(m, "traceon"); - else - seq_printf(m, "traceoff"); + seq_printf(m, "%ps:%s", (void *)ip, name); if (count == -1) seq_printf(m, ":unlimited\n"); @@ -284,26 +307,61 @@ ftrace_trace_onoff_print(struct seq_file *m, unsigned long ip, } static int -ftrace_trace_onoff_unreg(char *glob, char *cmd, char *param) +ftrace_traceon_print(struct seq_file *m, unsigned long ip, + struct ftrace_probe_ops *ops, void *data) { - struct ftrace_probe_ops *ops; - - /* we register both traceon and traceoff to this callback */ - if (strcmp(cmd, "traceon") == 0) - ops = &traceon_probe_ops; - else - ops = &traceoff_probe_ops; + return ftrace_probe_print("traceon", m, ip, data); +} - unregister_ftrace_function_probe_func(glob, ops); +static int +ftrace_traceoff_print(struct seq_file *m, unsigned long ip, + struct ftrace_probe_ops *ops, void *data) +{ + return ftrace_probe_print("traceoff", m, ip, data); +} - return 0; +static int +ftrace_stacktrace_print(struct seq_file *m, unsigned long ip, + struct ftrace_probe_ops *ops, void *data) +{ + return ftrace_probe_print("stacktrace", m, ip, data); } +static struct ftrace_probe_ops traceon_count_probe_ops = { + .func = ftrace_traceon_count, + .print = ftrace_traceon_print, +}; + +static struct ftrace_probe_ops traceoff_count_probe_ops = { + .func = ftrace_traceoff_count, + .print = ftrace_traceoff_print, +}; + +static struct ftrace_probe_ops stacktrace_count_probe_ops = { + .func = ftrace_stacktrace_count, + .print = ftrace_stacktrace_print, +}; + +static struct ftrace_probe_ops traceon_probe_ops = { + .func = ftrace_traceon, + .print = ftrace_traceon_print, +}; + +static struct ftrace_probe_ops traceoff_probe_ops = { + .func = ftrace_traceoff, + .print = ftrace_traceoff_print, +}; + +static struct ftrace_probe_ops stacktrace_probe_ops = { + .func = ftrace_stacktrace, + .print = ftrace_stacktrace_print, +}; + static int -ftrace_trace_onoff_callback(struct ftrace_hash *hash, - char *glob, char *cmd, char *param, int enable) +ftrace_trace_probe_callback(struct ftrace_probe_ops *ops, + struct ftrace_hash *hash, char *glob, + char *cmd, char *param, int enable) { - struct ftrace_probe_ops *ops; void *count = (void *)-1; char *number; int ret; @@ -312,14 +370,10 @@ ftrace_trace_onoff_callback(struct ftrace_hash *hash, if (!enable) return -EINVAL; - if (glob[0] == '!') - return ftrace_trace_onoff_unreg(glob+1, cmd, param); - - /* we register both traceon and traceoff to this callback */ - if (strcmp(cmd, "traceon") == 0) - ops = &traceon_probe_ops; - else - ops = &traceoff_probe_ops; + if (glob[0] == '!') { + unregister_ftrace_function_probe_func(glob+1, ops); + return 0; + } if (!param) goto out_reg; @@ -343,6 +397,34 @@ ftrace_trace_onoff_callback(struct ftrace_hash *hash, return ret < 0 ? ret : 0; } +static int +ftrace_trace_onoff_callback(struct ftrace_hash *hash, + char *glob, char *cmd, char *param, int enable) +{ + struct ftrace_probe_ops *ops; + + /* we register both traceon and traceoff to this callback */ + if (strcmp(cmd, "traceon") == 0) + ops = param ? &traceon_count_probe_ops : &traceon_probe_ops; + else + ops = param ? &traceoff_count_probe_ops : &traceoff_probe_ops; + + return ftrace_trace_probe_callback(ops, hash, glob, cmd, + param, enable); +} + +static int +ftrace_stacktrace_callback(struct ftrace_hash *hash, + char *glob, char *cmd, char *param, int enable) +{ + struct ftrace_probe_ops *ops; + + ops = param ? &stacktrace_count_probe_ops : &stacktrace_probe_ops; + + return ftrace_trace_probe_callback(ops, hash, glob, cmd, + param, enable); +} + static struct ftrace_func_command ftrace_traceon_cmd = { .name = "traceon", .func = ftrace_trace_onoff_callback, @@ -353,6 +435,11 @@ static struct ftrace_func_command ftrace_traceoff_cmd = { .func = ftrace_trace_onoff_callback, }; +static struct ftrace_func_command ftrace_stacktrace_cmd = { + .name = "stacktrace", + .func = ftrace_stacktrace_callback, +}; + static int __init init_func_cmd_traceon(void) { int ret; @@ -364,6 +451,12 @@ static int __init init_func_cmd_traceon(void) ret = register_ftrace_command(&ftrace_traceon_cmd); if (ret) unregister_ftrace_command(&ftrace_traceoff_cmd); + + ret = register_ftrace_command(&ftrace_stacktrace_cmd); + if (ret) { + unregister_ftrace_command(&ftrace_traceoff_cmd); + unregister_ftrace_command(&ftrace_traceon_cmd); + } return ret; } #else diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c index 39ada66389cc..8388bc99f2ee 100644 --- a/kernel/trace/trace_functions_graph.c +++ b/kernel/trace/trace_functions_graph.c @@ -218,7 +218,7 @@ int __trace_graph_entry(struct trace_array *tr, { struct ftrace_event_call *call = &event_funcgraph_entry; struct ring_buffer_event *event; - struct ring_buffer *buffer = tr->buffer; + struct ring_buffer *buffer = tr->trace_buffer.buffer; struct ftrace_graph_ent_entry *entry; if (unlikely(__this_cpu_read(ftrace_cpu_disabled))) @@ -265,7 +265,7 @@ int trace_graph_entry(struct ftrace_graph_ent *trace) local_irq_save(flags); cpu = raw_smp_processor_id(); - data = tr->data[cpu]; + data = per_cpu_ptr(tr->trace_buffer.data, cpu); disabled = atomic_inc_return(&data->disabled); if (likely(disabled == 1)) { pc = preempt_count(); @@ -323,7 +323,7 @@ void __trace_graph_return(struct trace_array *tr, { struct ftrace_event_call *call = &event_funcgraph_exit; struct ring_buffer_event *event; - struct ring_buffer *buffer = tr->buffer; + struct ring_buffer *buffer = tr->trace_buffer.buffer; struct ftrace_graph_ret_entry *entry; if (unlikely(__this_cpu_read(ftrace_cpu_disabled))) @@ -350,7 +350,7 @@ void trace_graph_return(struct ftrace_graph_ret *trace) local_irq_save(flags); cpu = raw_smp_processor_id(); - data = tr->data[cpu]; + data = per_cpu_ptr(tr->trace_buffer.data, cpu); disabled = atomic_inc_return(&data->disabled); if (likely(disabled == 1)) { pc = preempt_count(); @@ -560,9 +560,9 @@ get_return_for_leaf(struct trace_iterator *iter, * We need to consume the current entry to see * the next one. */ - ring_buffer_consume(iter->tr->buffer, iter->cpu, + ring_buffer_consume(iter->trace_buffer->buffer, iter->cpu, NULL, NULL); - event = ring_buffer_peek(iter->tr->buffer, iter->cpu, + event = ring_buffer_peek(iter->trace_buffer->buffer, iter->cpu, NULL, NULL); } diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c index 713a2cac4881..b19d065a28cb 100644 --- a/kernel/trace/trace_irqsoff.c +++ b/kernel/trace/trace_irqsoff.c @@ -32,7 +32,8 @@ enum { static int trace_type __read_mostly; -static int save_lat_flag; +static int save_flags; +static bool function_enabled; static void stop_irqsoff_tracer(struct trace_array *tr, int graph); static int start_irqsoff_tracer(struct trace_array *tr, int graph); @@ -121,7 +122,7 @@ static int func_prolog_dec(struct trace_array *tr, if (!irqs_disabled_flags(*flags)) return 0; - *data = tr->data[cpu]; + *data = per_cpu_ptr(tr->trace_buffer.data, cpu); disabled = atomic_inc_return(&(*data)->disabled); if (likely(disabled == 1)) @@ -175,7 +176,7 @@ static int irqsoff_set_flag(u32 old_flags, u32 bit, int set) per_cpu(tracing_cpu, cpu) = 0; tracing_max_latency = 0; - tracing_reset_online_cpus(irqsoff_trace); + tracing_reset_online_cpus(&irqsoff_trace->trace_buffer); return start_irqsoff_tracer(irqsoff_trace, set); } @@ -380,7 +381,7 @@ start_critical_timing(unsigned long ip, unsigned long parent_ip) if (per_cpu(tracing_cpu, cpu)) return; - data = tr->data[cpu]; + data = per_cpu_ptr(tr->trace_buffer.data, cpu); if (unlikely(!data) || atomic_read(&data->disabled)) return; @@ -418,7 +419,7 @@ stop_critical_timing(unsigned long ip, unsigned long parent_ip) if (!tracer_enabled) return; - data = tr->data[cpu]; + data = per_cpu_ptr(tr->trace_buffer.data, cpu); if (unlikely(!data) || !data->critical_start || atomic_read(&data->disabled)) @@ -528,15 +529,60 @@ void trace_preempt_off(unsigned long a0, unsigned long a1) } #endif /* CONFIG_PREEMPT_TRACER */ -static int start_irqsoff_tracer(struct trace_array *tr, int graph) +static int register_irqsoff_function(int graph, int set) { - int ret = 0; + int ret; - if (!graph) - ret = register_ftrace_function(&trace_ops); - else + /* 'set' is set if TRACE_ITER_FUNCTION is about to be set */ + if (function_enabled || (!set && !(trace_flags & TRACE_ITER_FUNCTION))) + return 0; + + if (graph) ret = register_ftrace_graph(&irqsoff_graph_return, &irqsoff_graph_entry); + else + ret = register_ftrace_function(&trace_ops); + + if (!ret) + function_enabled = true; + + return ret; +} + +static void unregister_irqsoff_function(int graph) +{ + if (!function_enabled) + return; + + if (graph) + unregister_ftrace_graph(); + else + unregister_ftrace_function(&trace_ops); + + function_enabled = false; +} + +static void irqsoff_function_set(int set) +{ + if (set) + register_irqsoff_function(is_graph(), 1); + else + unregister_irqsoff_function(is_graph()); +} + +static int irqsoff_flag_changed(struct tracer *tracer, u32 mask, int set) +{ + if (mask & TRACE_ITER_FUNCTION) + irqsoff_function_set(set); + + return trace_keep_overwrite(tracer, mask, set); +} + +static int start_irqsoff_tracer(struct trace_array *tr, int graph) +{ + int ret; + + ret = register_irqsoff_function(graph, 0); if (!ret && tracing_is_enabled()) tracer_enabled = 1; @@ -550,22 +596,22 @@ static void stop_irqsoff_tracer(struct trace_array *tr, int graph) { tracer_enabled = 0; - if (!graph) - unregister_ftrace_function(&trace_ops); - else - unregister_ftrace_graph(); + unregister_irqsoff_function(graph); } static void __irqsoff_tracer_init(struct trace_array *tr) { - save_lat_flag = trace_flags & TRACE_ITER_LATENCY_FMT; - trace_flags |= TRACE_ITER_LATENCY_FMT; + save_flags = trace_flags; + + /* non overwrite screws up the latency tracers */ + set_tracer_flag(tr, TRACE_ITER_OVERWRITE, 1); + set_tracer_flag(tr, TRACE_ITER_LATENCY_FMT, 1); tracing_max_latency = 0; irqsoff_trace = tr; /* make sure that the tracer is visible */ smp_wmb(); - tracing_reset_online_cpus(tr); + tracing_reset_online_cpus(&tr->trace_buffer); if (start_irqsoff_tracer(tr, is_graph())) printk(KERN_ERR "failed to start irqsoff tracer\n"); @@ -573,10 +619,13 @@ static void __irqsoff_tracer_init(struct trace_array *tr) static void irqsoff_tracer_reset(struct trace_array *tr) { + int lat_flag = save_flags & TRACE_ITER_LATENCY_FMT; + int overwrite_flag = save_flags & TRACE_ITER_OVERWRITE; + stop_irqsoff_tracer(tr, is_graph()); - if (!save_lat_flag) - trace_flags &= ~TRACE_ITER_LATENCY_FMT; + set_tracer_flag(tr, TRACE_ITER_LATENCY_FMT, lat_flag); + set_tracer_flag(tr, TRACE_ITER_OVERWRITE, overwrite_flag); } static void irqsoff_tracer_start(struct trace_array *tr) @@ -609,6 +658,7 @@ static struct tracer irqsoff_tracer __read_mostly = .print_line = irqsoff_print_line, .flags = &tracer_flags, .set_flag = irqsoff_set_flag, + .flag_changed = irqsoff_flag_changed, #ifdef CONFIG_FTRACE_SELFTEST .selftest = trace_selftest_startup_irqsoff, #endif @@ -642,6 +692,7 @@ static struct tracer preemptoff_tracer __read_mostly = .print_line = irqsoff_print_line, .flags = &tracer_flags, .set_flag = irqsoff_set_flag, + .flag_changed = irqsoff_flag_changed, #ifdef CONFIG_FTRACE_SELFTEST .selftest = trace_selftest_startup_preemptoff, #endif @@ -677,6 +728,7 @@ static struct tracer preemptirqsoff_tracer __read_mostly = .print_line = irqsoff_print_line, .flags = &tracer_flags, .set_flag = irqsoff_set_flag, + .flag_changed = irqsoff_flag_changed, #ifdef CONFIG_FTRACE_SELFTEST .selftest = trace_selftest_startup_preemptirqsoff, #endif diff --git a/kernel/trace/trace_kdb.c b/kernel/trace/trace_kdb.c index 3c5c5dfea0b3..bd90e1b06088 100644 --- a/kernel/trace/trace_kdb.c +++ b/kernel/trace/trace_kdb.c @@ -26,7 +26,7 @@ static void ftrace_dump_buf(int skip_lines, long cpu_file) trace_init_global_iter(&iter); for_each_tracing_cpu(cpu) { - atomic_inc(&iter.tr->data[cpu]->disabled); + atomic_inc(&per_cpu_ptr(iter.trace_buffer->data, cpu)->disabled); } old_userobj = trace_flags; @@ -43,17 +43,17 @@ static void ftrace_dump_buf(int skip_lines, long cpu_file) iter.iter_flags |= TRACE_FILE_LAT_FMT; iter.pos = -1; - if (cpu_file == TRACE_PIPE_ALL_CPU) { + if (cpu_file == RING_BUFFER_ALL_CPUS) { for_each_tracing_cpu(cpu) { iter.buffer_iter[cpu] = - ring_buffer_read_prepare(iter.tr->buffer, cpu); + ring_buffer_read_prepare(iter.trace_buffer->buffer, cpu); ring_buffer_read_start(iter.buffer_iter[cpu]); tracing_iter_reset(&iter, cpu); } } else { iter.cpu_file = cpu_file; iter.buffer_iter[cpu_file] = - ring_buffer_read_prepare(iter.tr->buffer, cpu_file); + ring_buffer_read_prepare(iter.trace_buffer->buffer, cpu_file); ring_buffer_read_start(iter.buffer_iter[cpu_file]); tracing_iter_reset(&iter, cpu_file); } @@ -83,7 +83,7 @@ out: trace_flags = old_userobj; for_each_tracing_cpu(cpu) { - atomic_dec(&iter.tr->data[cpu]->disabled); + atomic_dec(&per_cpu_ptr(iter.trace_buffer->data, cpu)->disabled); } for_each_tracing_cpu(cpu) @@ -115,7 +115,7 @@ static int kdb_ftdump(int argc, const char **argv) !cpu_online(cpu_file)) return KDB_BADINT; } else { - cpu_file = TRACE_PIPE_ALL_CPU; + cpu_file = RING_BUFFER_ALL_CPUS; } kdb_trap_printk++; diff --git a/kernel/trace/trace_mmiotrace.c b/kernel/trace/trace_mmiotrace.c index fd3c8aae55e5..a5e8f4878bfa 100644 --- a/kernel/trace/trace_mmiotrace.c +++ b/kernel/trace/trace_mmiotrace.c @@ -31,7 +31,7 @@ static void mmio_reset_data(struct trace_array *tr) overrun_detected = false; prev_overruns = 0; - tracing_reset_online_cpus(tr); + tracing_reset_online_cpus(&tr->trace_buffer); } static int mmio_trace_init(struct trace_array *tr) @@ -128,7 +128,7 @@ static void mmio_close(struct trace_iterator *iter) static unsigned long count_overruns(struct trace_iterator *iter) { unsigned long cnt = atomic_xchg(&dropped_count, 0); - unsigned long over = ring_buffer_overruns(iter->tr->buffer); + unsigned long over = ring_buffer_overruns(iter->trace_buffer->buffer); if (over > prev_overruns) cnt += over - prev_overruns; @@ -309,7 +309,7 @@ static void __trace_mmiotrace_rw(struct trace_array *tr, struct mmiotrace_rw *rw) { struct ftrace_event_call *call = &event_mmiotrace_rw; - struct ring_buffer *buffer = tr->buffer; + struct ring_buffer *buffer = tr->trace_buffer.buffer; struct ring_buffer_event *event; struct trace_mmiotrace_rw *entry; int pc = preempt_count(); @@ -330,7 +330,7 @@ static void __trace_mmiotrace_rw(struct trace_array *tr, void mmio_trace_rw(struct mmiotrace_rw *rw) { struct trace_array *tr = mmio_trace_array; - struct trace_array_cpu *data = tr->data[smp_processor_id()]; + struct trace_array_cpu *data = per_cpu_ptr(tr->trace_buffer.data, smp_processor_id()); __trace_mmiotrace_rw(tr, data, rw); } @@ -339,7 +339,7 @@ static void __trace_mmiotrace_map(struct trace_array *tr, struct mmiotrace_map *map) { struct ftrace_event_call *call = &event_mmiotrace_map; - struct ring_buffer *buffer = tr->buffer; + struct ring_buffer *buffer = tr->trace_buffer.buffer; struct ring_buffer_event *event; struct trace_mmiotrace_map *entry; int pc = preempt_count(); @@ -363,7 +363,7 @@ void mmio_trace_mapping(struct mmiotrace_map *map) struct trace_array_cpu *data; preempt_disable(); - data = tr->data[smp_processor_id()]; + data = per_cpu_ptr(tr->trace_buffer.data, smp_processor_id()); __trace_mmiotrace_map(tr, data, map); preempt_enable(); } diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c index 697e88d13907..bb922d9ee51b 100644 --- a/kernel/trace/trace_output.c +++ b/kernel/trace/trace_output.c @@ -14,7 +14,7 @@ /* must be a power of 2 */ #define EVENT_HASHSIZE 128 -DECLARE_RWSEM(trace_event_mutex); +DECLARE_RWSEM(trace_event_sem); static struct hlist_head event_hash[EVENT_HASHSIZE] __read_mostly; @@ -37,6 +37,22 @@ int trace_print_seq(struct seq_file *m, struct trace_seq *s) return ret; } +enum print_line_t trace_print_bputs_msg_only(struct trace_iterator *iter) +{ + struct trace_seq *s = &iter->seq; + struct trace_entry *entry = iter->ent; + struct bputs_entry *field; + int ret; + + trace_assign_type(field, entry); + + ret = trace_seq_puts(s, field->str); + if (!ret) + return TRACE_TYPE_PARTIAL_LINE; + + return TRACE_TYPE_HANDLED; +} + enum print_line_t trace_print_bprintk_msg_only(struct trace_iterator *iter) { struct trace_seq *s = &iter->seq; @@ -397,6 +413,32 @@ ftrace_print_hex_seq(struct trace_seq *p, const unsigned char *buf, int buf_len) } EXPORT_SYMBOL(ftrace_print_hex_seq); +int ftrace_raw_output_prep(struct trace_iterator *iter, + struct trace_event *trace_event) +{ + struct ftrace_event_call *event; + struct trace_seq *s = &iter->seq; + struct trace_seq *p = &iter->tmp_seq; + struct trace_entry *entry; + int ret; + + event = container_of(trace_event, struct ftrace_event_call, event); + entry = iter->ent; + + if (entry->type != event->event.type) { + WARN_ON_ONCE(1); + return TRACE_TYPE_UNHANDLED; + } + + trace_seq_init(p); + ret = trace_seq_printf(s, "%s: ", event->name); + if (!ret) + return TRACE_TYPE_PARTIAL_LINE; + + return 0; +} +EXPORT_SYMBOL(ftrace_raw_output_prep); + #ifdef CONFIG_KRETPROBES static inline const char *kretprobed(const char *name) { @@ -617,7 +659,7 @@ lat_print_timestamp(struct trace_iterator *iter, u64 next_ts) { unsigned long verbose = trace_flags & TRACE_ITER_VERBOSE; unsigned long in_ns = iter->iter_flags & TRACE_FILE_TIME_IN_NS; - unsigned long long abs_ts = iter->ts - iter->tr->time_start; + unsigned long long abs_ts = iter->ts - iter->trace_buffer->time_start; unsigned long long rel_ts = next_ts - iter->ts; struct trace_seq *s = &iter->seq; @@ -783,12 +825,12 @@ static int trace_search_list(struct list_head **list) void trace_event_read_lock(void) { - down_read(&trace_event_mutex); + down_read(&trace_event_sem); } void trace_event_read_unlock(void) { - up_read(&trace_event_mutex); + up_read(&trace_event_sem); } /** @@ -811,7 +853,7 @@ int register_ftrace_event(struct trace_event *event) unsigned key; int ret = 0; - down_write(&trace_event_mutex); + down_write(&trace_event_sem); if (WARN_ON(!event)) goto out; @@ -866,14 +908,14 @@ int register_ftrace_event(struct trace_event *event) ret = event->type; out: - up_write(&trace_event_mutex); + up_write(&trace_event_sem); return ret; } EXPORT_SYMBOL_GPL(register_ftrace_event); /* - * Used by module code with the trace_event_mutex held for write. + * Used by module code with the trace_event_sem held for write. */ int __unregister_ftrace_event(struct trace_event *event) { @@ -888,9 +930,9 @@ int __unregister_ftrace_event(struct trace_event *event) */ int unregister_ftrace_event(struct trace_event *event) { - down_write(&trace_event_mutex); + down_write(&trace_event_sem); __unregister_ftrace_event(event); - up_write(&trace_event_mutex); + up_write(&trace_event_sem); return 0; } @@ -1217,6 +1259,64 @@ static struct trace_event trace_user_stack_event = { .funcs = &trace_user_stack_funcs, }; +/* TRACE_BPUTS */ +static enum print_line_t +trace_bputs_print(struct trace_iterator *iter, int flags, + struct trace_event *event) +{ + struct trace_entry *entry = iter->ent; + struct trace_seq *s = &iter->seq; + struct bputs_entry *field; + + trace_assign_type(field, entry); + + if (!seq_print_ip_sym(s, field->ip, flags)) + goto partial; + + if (!trace_seq_puts(s, ": ")) + goto partial; + + if (!trace_seq_puts(s, field->str)) + goto partial; + + return TRACE_TYPE_HANDLED; + + partial: + return TRACE_TYPE_PARTIAL_LINE; +} + + +static enum print_line_t +trace_bputs_raw(struct trace_iterator *iter, int flags, + struct trace_event *event) +{ + struct bputs_entry *field; + struct trace_seq *s = &iter->seq; + + trace_assign_type(field, iter->ent); + + if (!trace_seq_printf(s, ": %lx : ", field->ip)) + goto partial; + + if (!trace_seq_puts(s, field->str)) + goto partial; + + return TRACE_TYPE_HANDLED; + + partial: + return TRACE_TYPE_PARTIAL_LINE; +} + +static struct trace_event_functions trace_bputs_funcs = { + .trace = trace_bputs_print, + .raw = trace_bputs_raw, +}; + +static struct trace_event trace_bputs_event = { + .type = TRACE_BPUTS, + .funcs = &trace_bputs_funcs, +}; + /* TRACE_BPRINT */ static enum print_line_t trace_bprint_print(struct trace_iterator *iter, int flags, @@ -1329,6 +1429,7 @@ static struct trace_event *events[] __initdata = { &trace_wake_event, &trace_stack_event, &trace_user_stack_event, + &trace_bputs_event, &trace_bprint_event, &trace_print_event, NULL diff --git a/kernel/trace/trace_output.h b/kernel/trace/trace_output.h index c038eba0492b..127a9d8c8357 100644 --- a/kernel/trace/trace_output.h +++ b/kernel/trace/trace_output.h @@ -5,6 +5,8 @@ #include "trace.h" extern enum print_line_t +trace_print_bputs_msg_only(struct trace_iterator *iter); +extern enum print_line_t trace_print_bprintk_msg_only(struct trace_iterator *iter); extern enum print_line_t trace_print_printk_msg_only(struct trace_iterator *iter); @@ -31,7 +33,7 @@ trace_print_lat_fmt(struct trace_seq *s, struct trace_entry *entry); /* used by module unregistering */ extern int __unregister_ftrace_event(struct trace_event *event); -extern struct rw_semaphore trace_event_mutex; +extern struct rw_semaphore trace_event_sem; #define MAX_MEMHEX_BYTES 8 #define HEX_CHARS (MAX_MEMHEX_BYTES*2 + 1) diff --git a/kernel/trace/trace_sched_switch.c b/kernel/trace/trace_sched_switch.c index 3374c792ccd8..4e98e3b257a3 100644 --- a/kernel/trace/trace_sched_switch.c +++ b/kernel/trace/trace_sched_switch.c @@ -28,7 +28,7 @@ tracing_sched_switch_trace(struct trace_array *tr, unsigned long flags, int pc) { struct ftrace_event_call *call = &event_context_switch; - struct ring_buffer *buffer = tr->buffer; + struct ring_buffer *buffer = tr->trace_buffer.buffer; struct ring_buffer_event *event; struct ctx_switch_entry *entry; @@ -69,7 +69,7 @@ probe_sched_switch(void *ignore, struct task_struct *prev, struct task_struct *n pc = preempt_count(); local_irq_save(flags); cpu = raw_smp_processor_id(); - data = ctx_trace->data[cpu]; + data = per_cpu_ptr(ctx_trace->trace_buffer.data, cpu); if (likely(!atomic_read(&data->disabled))) tracing_sched_switch_trace(ctx_trace, prev, next, flags, pc); @@ -86,7 +86,7 @@ tracing_sched_wakeup_trace(struct trace_array *tr, struct ftrace_event_call *call = &event_wakeup; struct ring_buffer_event *event; struct ctx_switch_entry *entry; - struct ring_buffer *buffer = tr->buffer; + struct ring_buffer *buffer = tr->trace_buffer.buffer; event = trace_buffer_lock_reserve(buffer, TRACE_WAKE, sizeof(*entry), flags, pc); @@ -123,7 +123,7 @@ probe_sched_wakeup(void *ignore, struct task_struct *wakee, int success) pc = preempt_count(); local_irq_save(flags); cpu = raw_smp_processor_id(); - data = ctx_trace->data[cpu]; + data = per_cpu_ptr(ctx_trace->trace_buffer.data, cpu); if (likely(!atomic_read(&data->disabled))) tracing_sched_wakeup_trace(ctx_trace, wakee, current, diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c index 75aa97fbe1a1..fee77e15d815 100644 --- a/kernel/trace/trace_sched_wakeup.c +++ b/kernel/trace/trace_sched_wakeup.c @@ -36,7 +36,8 @@ static void __wakeup_reset(struct trace_array *tr); static int wakeup_graph_entry(struct ftrace_graph_ent *trace); static void wakeup_graph_return(struct ftrace_graph_ret *trace); -static int save_lat_flag; +static int save_flags; +static bool function_enabled; #define TRACE_DISPLAY_GRAPH 1 @@ -89,7 +90,7 @@ func_prolog_preempt_disable(struct trace_array *tr, if (cpu != wakeup_current_cpu) goto out_enable; - *data = tr->data[cpu]; + *data = per_cpu_ptr(tr->trace_buffer.data, cpu); disabled = atomic_inc_return(&(*data)->disabled); if (unlikely(disabled != 1)) goto out; @@ -134,15 +135,60 @@ static struct ftrace_ops trace_ops __read_mostly = }; #endif /* CONFIG_FUNCTION_TRACER */ -static int start_func_tracer(int graph) +static int register_wakeup_function(int graph, int set) { int ret; - if (!graph) - ret = register_ftrace_function(&trace_ops); - else + /* 'set' is set if TRACE_ITER_FUNCTION is about to be set */ + if (function_enabled || (!set && !(trace_flags & TRACE_ITER_FUNCTION))) + return 0; + + if (graph) ret = register_ftrace_graph(&wakeup_graph_return, &wakeup_graph_entry); + else + ret = register_ftrace_function(&trace_ops); + + if (!ret) + function_enabled = true; + + return ret; +} + +static void unregister_wakeup_function(int graph) +{ + if (!function_enabled) + return; + + if (graph) + unregister_ftrace_graph(); + else + unregister_ftrace_function(&trace_ops); + + function_enabled = false; +} + +static void wakeup_function_set(int set) +{ + if (set) + register_wakeup_function(is_graph(), 1); + else + unregister_wakeup_function(is_graph()); +} + +static int wakeup_flag_changed(struct tracer *tracer, u32 mask, int set) +{ + if (mask & TRACE_ITER_FUNCTION) + wakeup_function_set(set); + + return trace_keep_overwrite(tracer, mask, set); +} + +static int start_func_tracer(int graph) +{ + int ret; + + ret = register_wakeup_function(graph, 0); if (!ret && tracing_is_enabled()) tracer_enabled = 1; @@ -156,10 +202,7 @@ static void stop_func_tracer(int graph) { tracer_enabled = 0; - if (!graph) - unregister_ftrace_function(&trace_ops); - else - unregister_ftrace_graph(); + unregister_wakeup_function(graph); } #ifdef CONFIG_FUNCTION_GRAPH_TRACER @@ -353,7 +396,7 @@ probe_wakeup_sched_switch(void *ignore, /* disable local data, not wakeup_cpu data */ cpu = raw_smp_processor_id(); - disabled = atomic_inc_return(&wakeup_trace->data[cpu]->disabled); + disabled = atomic_inc_return(&per_cpu_ptr(wakeup_trace->trace_buffer.data, cpu)->disabled); if (likely(disabled != 1)) goto out; @@ -365,7 +408,7 @@ probe_wakeup_sched_switch(void *ignore, goto out_unlock; /* The task we are waiting for is waking up */ - data = wakeup_trace->data[wakeup_cpu]; + data = per_cpu_ptr(wakeup_trace->trace_buffer.data, wakeup_cpu); __trace_function(wakeup_trace, CALLER_ADDR0, CALLER_ADDR1, flags, pc); tracing_sched_switch_trace(wakeup_trace, prev, next, flags, pc); @@ -387,7 +430,7 @@ out_unlock: arch_spin_unlock(&wakeup_lock); local_irq_restore(flags); out: - atomic_dec(&wakeup_trace->data[cpu]->disabled); + atomic_dec(&per_cpu_ptr(wakeup_trace->trace_buffer.data, cpu)->disabled); } static void __wakeup_reset(struct trace_array *tr) @@ -405,7 +448,7 @@ static void wakeup_reset(struct trace_array *tr) { unsigned long flags; - tracing_reset_online_cpus(tr); + tracing_reset_online_cpus(&tr->trace_buffer); local_irq_save(flags); arch_spin_lock(&wakeup_lock); @@ -435,7 +478,7 @@ probe_wakeup(void *ignore, struct task_struct *p, int success) return; pc = preempt_count(); - disabled = atomic_inc_return(&wakeup_trace->data[cpu]->disabled); + disabled = atomic_inc_return(&per_cpu_ptr(wakeup_trace->trace_buffer.data, cpu)->disabled); if (unlikely(disabled != 1)) goto out; @@ -458,7 +501,7 @@ probe_wakeup(void *ignore, struct task_struct *p, int success) local_save_flags(flags); - data = wakeup_trace->data[wakeup_cpu]; + data = per_cpu_ptr(wakeup_trace->trace_buffer.data, wakeup_cpu); data->preempt_timestamp = ftrace_now(cpu); tracing_sched_wakeup_trace(wakeup_trace, p, current, flags, pc); @@ -472,7 +515,7 @@ probe_wakeup(void *ignore, struct task_struct *p, int success) out_locked: arch_spin_unlock(&wakeup_lock); out: - atomic_dec(&wakeup_trace->data[cpu]->disabled); + atomic_dec(&per_cpu_ptr(wakeup_trace->trace_buffer.data, cpu)->disabled); } static void start_wakeup_tracer(struct trace_array *tr) @@ -540,8 +583,11 @@ static void stop_wakeup_tracer(struct trace_array *tr) static int __wakeup_tracer_init(struct trace_array *tr) { - save_lat_flag = trace_flags & TRACE_ITER_LATENCY_FMT; - trace_flags |= TRACE_ITER_LATENCY_FMT; + save_flags = trace_flags; + + /* non overwrite screws up the latency tracers */ + set_tracer_flag(tr, TRACE_ITER_OVERWRITE, 1); + set_tracer_flag(tr, TRACE_ITER_LATENCY_FMT, 1); tracing_max_latency = 0; wakeup_trace = tr; @@ -563,12 +609,15 @@ static int wakeup_rt_tracer_init(struct trace_array *tr) static void wakeup_tracer_reset(struct trace_array *tr) { + int lat_flag = save_flags & TRACE_ITER_LATENCY_FMT; + int overwrite_flag = save_flags & TRACE_ITER_OVERWRITE; + stop_wakeup_tracer(tr); /* make sure we put back any tasks we are tracing */ wakeup_reset(tr); - if (!save_lat_flag) - trace_flags &= ~TRACE_ITER_LATENCY_FMT; + set_tracer_flag(tr, TRACE_ITER_LATENCY_FMT, lat_flag); + set_tracer_flag(tr, TRACE_ITER_OVERWRITE, overwrite_flag); } static void wakeup_tracer_start(struct trace_array *tr) @@ -594,6 +643,7 @@ static struct tracer wakeup_tracer __read_mostly = .print_line = wakeup_print_line, .flags = &tracer_flags, .set_flag = wakeup_set_flag, + .flag_changed = wakeup_flag_changed, #ifdef CONFIG_FTRACE_SELFTEST .selftest = trace_selftest_startup_wakeup, #endif @@ -615,6 +665,7 @@ static struct tracer wakeup_rt_tracer __read_mostly = .print_line = wakeup_print_line, .flags = &tracer_flags, .set_flag = wakeup_set_flag, + .flag_changed = wakeup_flag_changed, #ifdef CONFIG_FTRACE_SELFTEST .selftest = trace_selftest_startup_wakeup, #endif diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c index 51c819c12c29..55e2cf66967b 100644 --- a/kernel/trace/trace_selftest.c +++ b/kernel/trace/trace_selftest.c @@ -21,13 +21,13 @@ static inline int trace_valid_entry(struct trace_entry *entry) return 0; } -static int trace_test_buffer_cpu(struct trace_array *tr, int cpu) +static int trace_test_buffer_cpu(struct trace_buffer *buf, int cpu) { struct ring_buffer_event *event; struct trace_entry *entry; unsigned int loops = 0; - while ((event = ring_buffer_consume(tr->buffer, cpu, NULL, NULL))) { + while ((event = ring_buffer_consume(buf->buffer, cpu, NULL, NULL))) { entry = ring_buffer_event_data(event); /* @@ -58,7 +58,7 @@ static int trace_test_buffer_cpu(struct trace_array *tr, int cpu) * Test the trace buffer to see if all the elements * are still sane. */ -static int trace_test_buffer(struct trace_array *tr, unsigned long *count) +static int trace_test_buffer(struct trace_buffer *buf, unsigned long *count) { unsigned long flags, cnt = 0; int cpu, ret = 0; @@ -67,7 +67,7 @@ static int trace_test_buffer(struct trace_array *tr, unsigned long *count) local_irq_save(flags); arch_spin_lock(&ftrace_max_lock); - cnt = ring_buffer_entries(tr->buffer); + cnt = ring_buffer_entries(buf->buffer); /* * The trace_test_buffer_cpu runs a while loop to consume all data. @@ -78,7 +78,7 @@ static int trace_test_buffer(struct trace_array *tr, unsigned long *count) */ tracing_off(); for_each_possible_cpu(cpu) { - ret = trace_test_buffer_cpu(tr, cpu); + ret = trace_test_buffer_cpu(buf, cpu); if (ret) break; } @@ -355,7 +355,7 @@ int trace_selftest_startup_dynamic_tracing(struct tracer *trace, msleep(100); /* we should have nothing in the buffer */ - ret = trace_test_buffer(tr, &count); + ret = trace_test_buffer(&tr->trace_buffer, &count); if (ret) goto out; @@ -376,7 +376,7 @@ int trace_selftest_startup_dynamic_tracing(struct tracer *trace, ftrace_enabled = 0; /* check the trace buffer */ - ret = trace_test_buffer(tr, &count); + ret = trace_test_buffer(&tr->trace_buffer, &count); tracing_start(); /* we should only have one item */ @@ -666,7 +666,7 @@ trace_selftest_startup_function(struct tracer *trace, struct trace_array *tr) ftrace_enabled = 0; /* check the trace buffer */ - ret = trace_test_buffer(tr, &count); + ret = trace_test_buffer(&tr->trace_buffer, &count); trace->reset(tr); tracing_start(); @@ -703,8 +703,6 @@ trace_selftest_startup_function(struct tracer *trace, struct trace_array *tr) /* Maximum number of functions to trace before diagnosing a hang */ #define GRAPH_MAX_FUNC_TEST 100000000 -static void -__ftrace_dump(bool disable_tracing, enum ftrace_dump_mode oops_dump_mode); static unsigned int graph_hang_thresh; /* Wrap the real function entry probe to avoid possible hanging */ @@ -714,8 +712,11 @@ static int trace_graph_entry_watchdog(struct ftrace_graph_ent *trace) if (unlikely(++graph_hang_thresh > GRAPH_MAX_FUNC_TEST)) { ftrace_graph_stop(); printk(KERN_WARNING "BUG: Function graph tracer hang!\n"); - if (ftrace_dump_on_oops) - __ftrace_dump(false, DUMP_ALL); + if (ftrace_dump_on_oops) { + ftrace_dump(DUMP_ALL); + /* ftrace_dump() disables tracing */ + tracing_on(); + } return 0; } @@ -737,7 +738,7 @@ trace_selftest_startup_function_graph(struct tracer *trace, * Simulate the init() callback but we attach a watchdog callback * to detect and recover from possible hangs */ - tracing_reset_online_cpus(tr); + tracing_reset_online_cpus(&tr->trace_buffer); set_graph_array(tr); ret = register_ftrace_graph(&trace_graph_return, &trace_graph_entry_watchdog); @@ -760,7 +761,7 @@ trace_selftest_startup_function_graph(struct tracer *trace, tracing_stop(); /* check the trace buffer */ - ret = trace_test_buffer(tr, &count); + ret = trace_test_buffer(&tr->trace_buffer, &count); trace->reset(tr); tracing_start(); @@ -815,9 +816,9 @@ trace_selftest_startup_irqsoff(struct tracer *trace, struct trace_array *tr) /* stop the tracing. */ tracing_stop(); /* check both trace buffers */ - ret = trace_test_buffer(tr, NULL); + ret = trace_test_buffer(&tr->trace_buffer, NULL); if (!ret) - ret = trace_test_buffer(&max_tr, &count); + ret = trace_test_buffer(&tr->max_buffer, &count); trace->reset(tr); tracing_start(); @@ -877,9 +878,9 @@ trace_selftest_startup_preemptoff(struct tracer *trace, struct trace_array *tr) /* stop the tracing. */ tracing_stop(); /* check both trace buffers */ - ret = trace_test_buffer(tr, NULL); + ret = trace_test_buffer(&tr->trace_buffer, NULL); if (!ret) - ret = trace_test_buffer(&max_tr, &count); + ret = trace_test_buffer(&tr->max_buffer, &count); trace->reset(tr); tracing_start(); @@ -943,11 +944,11 @@ trace_selftest_startup_preemptirqsoff(struct tracer *trace, struct trace_array * /* stop the tracing. */ tracing_stop(); /* check both trace buffers */ - ret = trace_test_buffer(tr, NULL); + ret = trace_test_buffer(&tr->trace_buffer, NULL); if (ret) goto out; - ret = trace_test_buffer(&max_tr, &count); + ret = trace_test_buffer(&tr->max_buffer, &count); if (ret) goto out; @@ -973,11 +974,11 @@ trace_selftest_startup_preemptirqsoff(struct tracer *trace, struct trace_array * /* stop the tracing. */ tracing_stop(); /* check both trace buffers */ - ret = trace_test_buffer(tr, NULL); + ret = trace_test_buffer(&tr->trace_buffer, NULL); if (ret) goto out; - ret = trace_test_buffer(&max_tr, &count); + ret = trace_test_buffer(&tr->max_buffer, &count); if (!ret && !count) { printk(KERN_CONT ".. no entries found .."); @@ -1084,10 +1085,10 @@ trace_selftest_startup_wakeup(struct tracer *trace, struct trace_array *tr) /* stop the tracing. */ tracing_stop(); /* check both trace buffers */ - ret = trace_test_buffer(tr, NULL); + ret = trace_test_buffer(&tr->trace_buffer, NULL); printk("ret = %d\n", ret); if (!ret) - ret = trace_test_buffer(&max_tr, &count); + ret = trace_test_buffer(&tr->max_buffer, &count); trace->reset(tr); @@ -1126,7 +1127,7 @@ trace_selftest_startup_sched_switch(struct tracer *trace, struct trace_array *tr /* stop the tracing. */ tracing_stop(); /* check the trace buffer */ - ret = trace_test_buffer(tr, &count); + ret = trace_test_buffer(&tr->trace_buffer, &count); trace->reset(tr); tracing_start(); diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c index 42ca822fc701..b20428c5efe2 100644 --- a/kernel/trace/trace_stack.c +++ b/kernel/trace/trace_stack.c @@ -20,13 +20,24 @@ #define STACK_TRACE_ENTRIES 500 +#ifdef CC_USING_FENTRY +# define fentry 1 +#else +# define fentry 0 +#endif + static unsigned long stack_dump_trace[STACK_TRACE_ENTRIES+1] = { [0 ... (STACK_TRACE_ENTRIES)] = ULONG_MAX }; static unsigned stack_dump_index[STACK_TRACE_ENTRIES]; +/* + * Reserve one entry for the passed in ip. This will allow + * us to remove most or all of the stack size overhead + * added by the stack tracer itself. + */ static struct stack_trace max_stack_trace = { - .max_entries = STACK_TRACE_ENTRIES, - .entries = stack_dump_trace, + .max_entries = STACK_TRACE_ENTRIES - 1, + .entries = &stack_dump_trace[1], }; static unsigned long max_stack_size; @@ -39,25 +50,34 @@ static DEFINE_MUTEX(stack_sysctl_mutex); int stack_tracer_enabled; static int last_stack_tracer_enabled; -static inline void check_stack(void) +static inline void +check_stack(unsigned long ip, unsigned long *stack) { unsigned long this_size, flags; unsigned long *p, *top, *start; + static int tracer_frame; + int frame_size = ACCESS_ONCE(tracer_frame); int i; - this_size = ((unsigned long)&this_size) & (THREAD_SIZE-1); + this_size = ((unsigned long)stack) & (THREAD_SIZE-1); this_size = THREAD_SIZE - this_size; + /* Remove the frame of the tracer */ + this_size -= frame_size; if (this_size <= max_stack_size) return; /* we do not handle interrupt stacks yet */ - if (!object_is_on_stack(&this_size)) + if (!object_is_on_stack(stack)) return; local_irq_save(flags); arch_spin_lock(&max_stack_lock); + /* In case another CPU set the tracer_frame on us */ + if (unlikely(!frame_size)) + this_size -= tracer_frame; + /* a race could have already updated it */ if (this_size <= max_stack_size) goto out; @@ -70,10 +90,18 @@ static inline void check_stack(void) save_stack_trace(&max_stack_trace); /* + * Add the passed in ip from the function tracer. + * Searching for this on the stack will skip over + * most of the overhead from the stack tracer itself. + */ + stack_dump_trace[0] = ip; + max_stack_trace.nr_entries++; + + /* * Now find where in the stack these are. */ i = 0; - start = &this_size; + start = stack; top = (unsigned long *) (((unsigned long)start & ~(THREAD_SIZE-1)) + THREAD_SIZE); @@ -97,6 +125,18 @@ static inline void check_stack(void) found = 1; /* Start the search from here */ start = p + 1; + /* + * We do not want to show the overhead + * of the stack tracer stack in the + * max stack. If we haven't figured + * out what that is, then figure it out + * now. + */ + if (unlikely(!tracer_frame) && i == 1) { + tracer_frame = (p - stack) * + sizeof(unsigned long); + max_stack_size -= tracer_frame; + } } } @@ -113,6 +153,7 @@ static void stack_trace_call(unsigned long ip, unsigned long parent_ip, struct ftrace_ops *op, struct pt_regs *pt_regs) { + unsigned long stack; int cpu; preempt_disable_notrace(); @@ -122,7 +163,26 @@ stack_trace_call(unsigned long ip, unsigned long parent_ip, if (per_cpu(trace_active, cpu)++ != 0) goto out; - check_stack(); + /* + * When fentry is used, the traced function does not get + * its stack frame set up, and we lose the parent. + * The ip is pretty useless because the function tracer + * was called before that function set up its stack frame. + * In this case, we use the parent ip. + * + * By adding the return address of either the parent ip + * or the current ip we can disregard most of the stack usage + * caused by the stack tracer itself. + * + * The function tracer always reports the address of where the + * mcount call was, but the stack will hold the return address. + */ + if (fentry) + ip = parent_ip; + else + ip += MCOUNT_INSN_SIZE; + + check_stack(ip, &stack); out: per_cpu(trace_active, cpu)--; @@ -322,7 +382,7 @@ static const struct file_operations stack_trace_filter_fops = { .open = stack_trace_filter_open, .read = seq_read, .write = ftrace_filter_write, - .llseek = ftrace_regex_lseek, + .llseek = ftrace_filter_lseek, .release = ftrace_regex_release, }; @@ -371,6 +431,8 @@ static __init int stack_trace_init(void) struct dentry *d_tracer; d_tracer = tracing_init_dentry(); + if (!d_tracer) + return 0; trace_create_file("stack_max_size", 0644, d_tracer, &max_stack_size, &stack_max_size_fops); diff --git a/kernel/trace/trace_stat.c b/kernel/trace/trace_stat.c index 96cffb269e73..847f88a6194b 100644 --- a/kernel/trace/trace_stat.c +++ b/kernel/trace/trace_stat.c @@ -307,6 +307,8 @@ static int tracing_stat_init(void) struct dentry *d_tracing; d_tracing = tracing_init_dentry(); + if (!d_tracing) + return 0; stat_dir = debugfs_create_dir("trace_stat", d_tracing); if (!stat_dir) diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c index 7a809e321058..8f2ac73c7a5f 100644 --- a/kernel/trace/trace_syscalls.c +++ b/kernel/trace/trace_syscalls.c @@ -12,10 +12,6 @@ #include "trace.h" static DEFINE_MUTEX(syscall_trace_lock); -static int sys_refcount_enter; -static int sys_refcount_exit; -static DECLARE_BITMAP(enabled_enter_syscalls, NR_syscalls); -static DECLARE_BITMAP(enabled_exit_syscalls, NR_syscalls); static int syscall_enter_register(struct ftrace_event_call *event, enum trace_reg type, void *data); @@ -41,7 +37,7 @@ static inline bool arch_syscall_match_sym_name(const char *sym, const char *name /* * Only compare after the "sys" prefix. Archs that use * syscall wrappers may have syscalls symbols aliases prefixed - * with "SyS" instead of "sys", leading to an unwanted + * with ".SyS" or ".sys" instead of "sys", leading to an unwanted * mismatch. */ return !strcmp(sym + 3, name + 3); @@ -265,7 +261,7 @@ static void free_syscall_print_fmt(struct ftrace_event_call *call) kfree(call->print_fmt); } -static int syscall_enter_define_fields(struct ftrace_event_call *call) +static int __init syscall_enter_define_fields(struct ftrace_event_call *call) { struct syscall_trace_enter trace; struct syscall_metadata *meta = call->data; @@ -288,7 +284,7 @@ static int syscall_enter_define_fields(struct ftrace_event_call *call) return ret; } -static int syscall_exit_define_fields(struct ftrace_event_call *call) +static int __init syscall_exit_define_fields(struct ftrace_event_call *call) { struct syscall_trace_exit trace; int ret; @@ -303,8 +299,9 @@ static int syscall_exit_define_fields(struct ftrace_event_call *call) return ret; } -static void ftrace_syscall_enter(void *ignore, struct pt_regs *regs, long id) +static void ftrace_syscall_enter(void *data, struct pt_regs *regs, long id) { + struct trace_array *tr = data; struct syscall_trace_enter *entry; struct syscall_metadata *sys_data; struct ring_buffer_event *event; @@ -315,7 +312,7 @@ static void ftrace_syscall_enter(void *ignore, struct pt_regs *regs, long id) syscall_nr = trace_get_syscall_nr(current, regs); if (syscall_nr < 0) return; - if (!test_bit(syscall_nr, enabled_enter_syscalls)) + if (!test_bit(syscall_nr, tr->enabled_enter_syscalls)) return; sys_data = syscall_nr_to_meta(syscall_nr); @@ -324,7 +321,8 @@ static void ftrace_syscall_enter(void *ignore, struct pt_regs *regs, long id) size = sizeof(*entry) + sizeof(unsigned long) * sys_data->nb_args; - event = trace_current_buffer_lock_reserve(&buffer, + buffer = tr->trace_buffer.buffer; + event = trace_buffer_lock_reserve(buffer, sys_data->enter_event->event.type, size, 0, 0); if (!event) return; @@ -338,8 +336,9 @@ static void ftrace_syscall_enter(void *ignore, struct pt_regs *regs, long id) trace_current_buffer_unlock_commit(buffer, event, 0, 0); } -static void ftrace_syscall_exit(void *ignore, struct pt_regs *regs, long ret) +static void ftrace_syscall_exit(void *data, struct pt_regs *regs, long ret) { + struct trace_array *tr = data; struct syscall_trace_exit *entry; struct syscall_metadata *sys_data; struct ring_buffer_event *event; @@ -349,14 +348,15 @@ static void ftrace_syscall_exit(void *ignore, struct pt_regs *regs, long ret) syscall_nr = trace_get_syscall_nr(current, regs); if (syscall_nr < 0) return; - if (!test_bit(syscall_nr, enabled_exit_syscalls)) + if (!test_bit(syscall_nr, tr->enabled_exit_syscalls)) return; sys_data = syscall_nr_to_meta(syscall_nr); if (!sys_data) return; - event = trace_current_buffer_lock_reserve(&buffer, + buffer = tr->trace_buffer.buffer; + event = trace_buffer_lock_reserve(buffer, sys_data->exit_event->event.type, sizeof(*entry), 0, 0); if (!event) return; @@ -370,8 +370,10 @@ static void ftrace_syscall_exit(void *ignore, struct pt_regs *regs, long ret) trace_current_buffer_unlock_commit(buffer, event, 0, 0); } -static int reg_event_syscall_enter(struct ftrace_event_call *call) +static int reg_event_syscall_enter(struct ftrace_event_file *file, + struct ftrace_event_call *call) { + struct trace_array *tr = file->tr; int ret = 0; int num; @@ -379,33 +381,37 @@ static int reg_event_syscall_enter(struct ftrace_event_call *call) if (WARN_ON_ONCE(num < 0 || num >= NR_syscalls)) return -ENOSYS; mutex_lock(&syscall_trace_lock); - if (!sys_refcount_enter) - ret = register_trace_sys_enter(ftrace_syscall_enter, NULL); + if (!tr->sys_refcount_enter) + ret = register_trace_sys_enter(ftrace_syscall_enter, tr); if (!ret) { - set_bit(num, enabled_enter_syscalls); - sys_refcount_enter++; + set_bit(num, tr->enabled_enter_syscalls); + tr->sys_refcount_enter++; } mutex_unlock(&syscall_trace_lock); return ret; } -static void unreg_event_syscall_enter(struct ftrace_event_call *call) +static void unreg_event_syscall_enter(struct ftrace_event_file *file, + struct ftrace_event_call *call) { + struct trace_array *tr = file->tr; int num; num = ((struct syscall_metadata *)call->data)->syscall_nr; if (WARN_ON_ONCE(num < 0 || num >= NR_syscalls)) return; mutex_lock(&syscall_trace_lock); - sys_refcount_enter--; - clear_bit(num, enabled_enter_syscalls); - if (!sys_refcount_enter) - unregister_trace_sys_enter(ftrace_syscall_enter, NULL); + tr->sys_refcount_enter--; + clear_bit(num, tr->enabled_enter_syscalls); + if (!tr->sys_refcount_enter) + unregister_trace_sys_enter(ftrace_syscall_enter, tr); mutex_unlock(&syscall_trace_lock); } -static int reg_event_syscall_exit(struct ftrace_event_call *call) +static int reg_event_syscall_exit(struct ftrace_event_file *file, + struct ftrace_event_call *call) { + struct trace_array *tr = file->tr; int ret = 0; int num; @@ -413,28 +419,30 @@ static int reg_event_syscall_exit(struct ftrace_event_call *call) if (WARN_ON_ONCE(num < 0 || num >= NR_syscalls)) return -ENOSYS; mutex_lock(&syscall_trace_lock); - if (!sys_refcount_exit) - ret = register_trace_sys_exit(ftrace_syscall_exit, NULL); + if (!tr->sys_refcount_exit) + ret = register_trace_sys_exit(ftrace_syscall_exit, tr); if (!ret) { - set_bit(num, enabled_exit_syscalls); - sys_refcount_exit++; + set_bit(num, tr->enabled_exit_syscalls); + tr->sys_refcount_exit++; } mutex_unlock(&syscall_trace_lock); return ret; } -static void unreg_event_syscall_exit(struct ftrace_event_call *call) +static void unreg_event_syscall_exit(struct ftrace_event_file *file, + struct ftrace_event_call *call) { + struct trace_array *tr = file->tr; int num; num = ((struct syscall_metadata *)call->data)->syscall_nr; if (WARN_ON_ONCE(num < 0 || num >= NR_syscalls)) return; mutex_lock(&syscall_trace_lock); - sys_refcount_exit--; - clear_bit(num, enabled_exit_syscalls); - if (!sys_refcount_exit) - unregister_trace_sys_exit(ftrace_syscall_exit, NULL); + tr->sys_refcount_exit--; + clear_bit(num, tr->enabled_exit_syscalls); + if (!tr->sys_refcount_exit) + unregister_trace_sys_exit(ftrace_syscall_exit, tr); mutex_unlock(&syscall_trace_lock); } @@ -471,7 +479,7 @@ struct trace_event_functions exit_syscall_print_funcs = { .trace = print_syscall_exit, }; -struct ftrace_event_class event_class_syscall_enter = { +struct ftrace_event_class __refdata event_class_syscall_enter = { .system = "syscalls", .reg = syscall_enter_register, .define_fields = syscall_enter_define_fields, @@ -479,7 +487,7 @@ struct ftrace_event_class event_class_syscall_enter = { .raw_init = init_syscall_trace, }; -struct ftrace_event_class event_class_syscall_exit = { +struct ftrace_event_class __refdata event_class_syscall_exit = { .system = "syscalls", .reg = syscall_exit_register, .define_fields = syscall_exit_define_fields, @@ -685,11 +693,13 @@ static void perf_sysexit_disable(struct ftrace_event_call *call) static int syscall_enter_register(struct ftrace_event_call *event, enum trace_reg type, void *data) { + struct ftrace_event_file *file = data; + switch (type) { case TRACE_REG_REGISTER: - return reg_event_syscall_enter(event); + return reg_event_syscall_enter(file, event); case TRACE_REG_UNREGISTER: - unreg_event_syscall_enter(event); + unreg_event_syscall_enter(file, event); return 0; #ifdef CONFIG_PERF_EVENTS @@ -711,11 +721,13 @@ static int syscall_enter_register(struct ftrace_event_call *event, static int syscall_exit_register(struct ftrace_event_call *event, enum trace_reg type, void *data) { + struct ftrace_event_file *file = data; + switch (type) { case TRACE_REG_REGISTER: - return reg_event_syscall_exit(event); + return reg_event_syscall_exit(file, event); case TRACE_REG_UNREGISTER: - unreg_event_syscall_exit(event); + unreg_event_syscall_exit(file, event); return 0; #ifdef CONFIG_PERF_EVENTS diff --git a/kernel/tracepoint.c b/kernel/tracepoint.c index 0c05a4592047..29f26540e9c9 100644 --- a/kernel/tracepoint.c +++ b/kernel/tracepoint.c @@ -112,7 +112,8 @@ tracepoint_entry_add_probe(struct tracepoint_entry *entry, int nr_probes = 0; struct tracepoint_func *old, *new; - WARN_ON(!probe); + if (WARN_ON(!probe)) + return ERR_PTR(-EINVAL); debug_print_probes(entry); old = entry->funcs; @@ -152,13 +153,18 @@ tracepoint_entry_remove_probe(struct tracepoint_entry *entry, debug_print_probes(entry); /* (N -> M), (N > 1, M >= 0) probes */ - for (nr_probes = 0; old[nr_probes].func; nr_probes++) { - if (!probe || - (old[nr_probes].func == probe && - old[nr_probes].data == data)) - nr_del++; + if (probe) { + for (nr_probes = 0; old[nr_probes].func; nr_probes++) { + if (old[nr_probes].func == probe && + old[nr_probes].data == data) + nr_del++; + } } + /* + * If probe is NULL, then nr_probes = nr_del = 0, and then the + * entire entry will be removed. + */ if (nr_probes - nr_del == 0) { /* N -> 0, (N > 1) */ entry->funcs = NULL; @@ -173,8 +179,7 @@ tracepoint_entry_remove_probe(struct tracepoint_entry *entry, if (new == NULL) return ERR_PTR(-ENOMEM); for (i = 0; old[i].func; i++) - if (probe && - (old[i].func != probe || old[i].data != data)) + if (old[i].func != probe || old[i].data != data) new[j++] = old[i]; new[nr_probes - nr_del].func = NULL; entry->refcount = nr_probes - nr_del; diff --git a/kernel/user.c b/kernel/user.c index e81978e8c03b..8e635a18ab52 100644 --- a/kernel/user.c +++ b/kernel/user.c @@ -51,6 +51,8 @@ struct user_namespace init_user_ns = { .owner = GLOBAL_ROOT_UID, .group = GLOBAL_ROOT_GID, .proc_inum = PROC_USER_INIT_INO, + .may_mount_sysfs = true, + .may_mount_proc = true, }; EXPORT_SYMBOL_GPL(init_user_ns); diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c index 8b650837083e..e134d8f365dd 100644 --- a/kernel/user_namespace.c +++ b/kernel/user_namespace.c @@ -21,10 +21,12 @@ #include <linux/uaccess.h> #include <linux/ctype.h> #include <linux/projid.h> +#include <linux/fs_struct.h> static struct kmem_cache *user_ns_cachep __read_mostly; -static bool new_idmap_permitted(struct user_namespace *ns, int cap_setid, +static bool new_idmap_permitted(const struct file *file, + struct user_namespace *ns, int cap_setid, struct uid_gid_map *map); static void set_cred_user_ns(struct cred *cred, struct user_namespace *user_ns) @@ -60,6 +62,15 @@ int create_user_ns(struct cred *new) kgid_t group = new->egid; int ret; + /* + * Verify that we can not violate the policy of which files + * may be accessed that is specified by the root directory, + * by verifing that the root directory is at the root of the + * mount namespace which allows all files to be accessed. + */ + if (current_chrooted()) + return -EPERM; + /* The creator needs a mapping in the parent user namespace * or else we won't be able to reasonably tell userspace who * created a user_namespace. @@ -86,6 +97,8 @@ int create_user_ns(struct cred *new) set_cred_user_ns(new, ns); + update_mnt_policy(ns); + return 0; } @@ -600,10 +613,10 @@ static ssize_t map_write(struct file *file, const char __user *buf, if (map->nr_extents != 0) goto out; - /* Require the appropriate privilege CAP_SETUID or CAP_SETGID - * over the user namespace in order to set the id mapping. + /* + * Adjusting namespace settings requires capabilities on the target. */ - if (cap_valid(cap_setid) && !ns_capable(ns, cap_setid)) + if (cap_valid(cap_setid) && !file_ns_capable(file, ns, CAP_SYS_ADMIN)) goto out; /* Get a buffer */ @@ -688,7 +701,7 @@ static ssize_t map_write(struct file *file, const char __user *buf, ret = -EPERM; /* Validate the user is allowed to use user id's mapped to. */ - if (!new_idmap_permitted(ns, cap_setid, &new_map)) + if (!new_idmap_permitted(file, ns, cap_setid, &new_map)) goto out; /* Map the lower ids from the parent user namespace to the @@ -775,7 +788,8 @@ ssize_t proc_projid_map_write(struct file *file, const char __user *buf, size_t &ns->projid_map, &ns->parent->projid_map); } -static bool new_idmap_permitted(struct user_namespace *ns, int cap_setid, +static bool new_idmap_permitted(const struct file *file, + struct user_namespace *ns, int cap_setid, struct uid_gid_map *new_map) { /* Allow mapping to your own filesystem ids */ @@ -783,12 +797,12 @@ static bool new_idmap_permitted(struct user_namespace *ns, int cap_setid, u32 id = new_map->extent[0].lower_first; if (cap_setid == CAP_SETUID) { kuid_t uid = make_kuid(ns->parent, id); - if (uid_eq(uid, current_fsuid())) + if (uid_eq(uid, file->f_cred->fsuid)) return true; } else if (cap_setid == CAP_SETGID) { kgid_t gid = make_kgid(ns->parent, id); - if (gid_eq(gid, current_fsgid())) + if (gid_eq(gid, file->f_cred->fsgid)) return true; } } @@ -799,8 +813,10 @@ static bool new_idmap_permitted(struct user_namespace *ns, int cap_setid, /* Allow the specified ids if we have the appropriate capability * (CAP_SETUID or CAP_SETGID) over the parent user namespace. + * And the opener of the id file also had the approprpiate capability. */ - if (ns_capable(ns->parent, cap_setid)) + if (ns_capable(ns->parent, cap_setid) && + file_ns_capable(file, ns->parent, cap_setid)) return true; return false; @@ -837,6 +853,9 @@ static int userns_install(struct nsproxy *nsproxy, void *ns) if (atomic_read(¤t->mm->mm_users) > 1) return -EINVAL; + if (current->fs->users != 1) + return -EINVAL; + if (!ns_capable(user_ns, CAP_SYS_ADMIN)) return -EPERM; diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 81f2457811eb..154aa12af48e 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -41,7 +41,11 @@ #include <linux/debug_locks.h> #include <linux/lockdep.h> #include <linux/idr.h> +#include <linux/jhash.h> #include <linux/hashtable.h> +#include <linux/rculist.h> +#include <linux/nodemask.h> +#include <linux/moduleparam.h> #include "workqueue_internal.h" @@ -58,12 +62,11 @@ enum { * %WORKER_UNBOUND set and concurrency management disabled, and may * be executing on any CPU. The pool behaves as an unbound one. * - * Note that DISASSOCIATED can be flipped only while holding - * assoc_mutex to avoid changing binding state while + * Note that DISASSOCIATED should be flipped only while holding + * manager_mutex to avoid changing binding state while * create_worker() is in progress. */ POOL_MANAGE_WORKERS = 1 << 0, /* need to manage workers */ - POOL_MANAGING_WORKERS = 1 << 1, /* managing workers */ POOL_DISASSOCIATED = 1 << 2, /* cpu can't serve workers */ POOL_FREEZING = 1 << 3, /* freeze in progress */ @@ -74,12 +77,14 @@ enum { WORKER_PREP = 1 << 3, /* preparing to run works */ WORKER_CPU_INTENSIVE = 1 << 6, /* cpu intensive */ WORKER_UNBOUND = 1 << 7, /* worker is unbound */ + WORKER_REBOUND = 1 << 8, /* worker was rebound */ - WORKER_NOT_RUNNING = WORKER_PREP | WORKER_UNBOUND | - WORKER_CPU_INTENSIVE, + WORKER_NOT_RUNNING = WORKER_PREP | WORKER_CPU_INTENSIVE | + WORKER_UNBOUND | WORKER_REBOUND, NR_STD_WORKER_POOLS = 2, /* # standard pools per cpu */ + UNBOUND_POOL_HASH_ORDER = 6, /* hashed by pool->attrs */ BUSY_WORKER_HASH_ORDER = 6, /* 64 pointers */ MAX_IDLE_WORKERS_RATIO = 4, /* 1/4 of busy can be idle */ @@ -97,6 +102,8 @@ enum { */ RESCUER_NICE_LEVEL = -20, HIGHPRI_NICE_LEVEL = -20, + + WQ_NAME_LEN = 24, }; /* @@ -115,16 +122,26 @@ enum { * cpu or grabbing pool->lock is enough for read access. If * POOL_DISASSOCIATED is set, it's identical to L. * - * F: wq->flush_mutex protected. + * MG: pool->manager_mutex and pool->lock protected. Writes require both + * locks. Reads can happen under either lock. + * + * PL: wq_pool_mutex protected. + * + * PR: wq_pool_mutex protected for writes. Sched-RCU protected for reads. + * + * WQ: wq->mutex protected. * - * W: workqueue_lock protected. + * WR: wq->mutex protected for writes. Sched-RCU protected for reads. + * + * MD: wq_mayday_lock protected. */ /* struct worker is defined in workqueue_internal.h */ struct worker_pool { spinlock_t lock; /* the pool lock */ - unsigned int cpu; /* I: the associated cpu */ + int cpu; /* I: the associated cpu */ + int node; /* I: the associated node ID */ int id; /* I: pool ID */ unsigned int flags; /* X: flags */ @@ -138,12 +155,18 @@ struct worker_pool { struct timer_list idle_timer; /* L: worker idle timeout */ struct timer_list mayday_timer; /* L: SOS timer for workers */ - /* workers are chained either in busy_hash or idle_list */ + /* a workers is either on busy_hash or idle_list, or the manager */ DECLARE_HASHTABLE(busy_hash, BUSY_WORKER_HASH_ORDER); /* L: hash of busy workers */ - struct mutex assoc_mutex; /* protect POOL_DISASSOCIATED */ - struct ida worker_ida; /* L: for worker IDs */ + /* see manage_workers() for details on the two manager mutexes */ + struct mutex manager_arb; /* manager arbitration */ + struct mutex manager_mutex; /* manager exclusion */ + struct idr worker_idr; /* MG: worker IDs and iteration */ + + struct workqueue_attrs *attrs; /* I: worker attributes */ + struct hlist_node hash_node; /* PL: unbound_pool_hash node */ + int refcnt; /* PL: refcnt for unbound pools */ /* * The current concurrency level. As it's likely to be accessed @@ -151,6 +174,12 @@ struct worker_pool { * cacheline. */ atomic_t nr_running ____cacheline_aligned_in_smp; + + /* + * Destruction of pool is sched-RCU protected to allow dereferences + * from get_work_pool(). + */ + struct rcu_head rcu; } ____cacheline_aligned_in_smp; /* @@ -164,75 +193,107 @@ struct pool_workqueue { struct workqueue_struct *wq; /* I: the owning workqueue */ int work_color; /* L: current color */ int flush_color; /* L: flushing color */ + int refcnt; /* L: reference count */ int nr_in_flight[WORK_NR_COLORS]; /* L: nr of in_flight works */ int nr_active; /* L: nr of active works */ int max_active; /* L: max active works */ struct list_head delayed_works; /* L: delayed works */ -}; + struct list_head pwqs_node; /* WR: node on wq->pwqs */ + struct list_head mayday_node; /* MD: node on wq->maydays */ + + /* + * Release of unbound pwq is punted to system_wq. See put_pwq() + * and pwq_unbound_release_workfn() for details. pool_workqueue + * itself is also sched-RCU protected so that the first pwq can be + * determined without grabbing wq->mutex. + */ + struct work_struct unbound_release_work; + struct rcu_head rcu; +} __aligned(1 << WORK_STRUCT_FLAG_BITS); /* * Structure used to wait for workqueue flush. */ struct wq_flusher { - struct list_head list; /* F: list of flushers */ - int flush_color; /* F: flush color waiting for */ + struct list_head list; /* WQ: list of flushers */ + int flush_color; /* WQ: flush color waiting for */ struct completion done; /* flush completion */ }; -/* - * All cpumasks are assumed to be always set on UP and thus can't be - * used to determine whether there's something to be done. - */ -#ifdef CONFIG_SMP -typedef cpumask_var_t mayday_mask_t; -#define mayday_test_and_set_cpu(cpu, mask) \ - cpumask_test_and_set_cpu((cpu), (mask)) -#define mayday_clear_cpu(cpu, mask) cpumask_clear_cpu((cpu), (mask)) -#define for_each_mayday_cpu(cpu, mask) for_each_cpu((cpu), (mask)) -#define alloc_mayday_mask(maskp, gfp) zalloc_cpumask_var((maskp), (gfp)) -#define free_mayday_mask(mask) free_cpumask_var((mask)) -#else -typedef unsigned long mayday_mask_t; -#define mayday_test_and_set_cpu(cpu, mask) test_and_set_bit(0, &(mask)) -#define mayday_clear_cpu(cpu, mask) clear_bit(0, &(mask)) -#define for_each_mayday_cpu(cpu, mask) if ((cpu) = 0, (mask)) -#define alloc_mayday_mask(maskp, gfp) true -#define free_mayday_mask(mask) do { } while (0) -#endif +struct wq_device; /* - * The externally visible workqueue abstraction is an array of - * per-CPU workqueues: + * The externally visible workqueue. It relays the issued work items to + * the appropriate worker_pool through its pool_workqueues. */ struct workqueue_struct { - unsigned int flags; /* W: WQ_* flags */ - union { - struct pool_workqueue __percpu *pcpu; - struct pool_workqueue *single; - unsigned long v; - } pool_wq; /* I: pwq's */ - struct list_head list; /* W: list of all workqueues */ - - struct mutex flush_mutex; /* protects wq flushing */ - int work_color; /* F: current work color */ - int flush_color; /* F: current flush color */ + struct list_head pwqs; /* WR: all pwqs of this wq */ + struct list_head list; /* PL: list of all workqueues */ + + struct mutex mutex; /* protects this wq */ + int work_color; /* WQ: current work color */ + int flush_color; /* WQ: current flush color */ atomic_t nr_pwqs_to_flush; /* flush in progress */ - struct wq_flusher *first_flusher; /* F: first flusher */ - struct list_head flusher_queue; /* F: flush waiters */ - struct list_head flusher_overflow; /* F: flush overflow list */ + struct wq_flusher *first_flusher; /* WQ: first flusher */ + struct list_head flusher_queue; /* WQ: flush waiters */ + struct list_head flusher_overflow; /* WQ: flush overflow list */ - mayday_mask_t mayday_mask; /* cpus requesting rescue */ + struct list_head maydays; /* MD: pwqs requesting rescue */ struct worker *rescuer; /* I: rescue worker */ - int nr_drainers; /* W: drain in progress */ - int saved_max_active; /* W: saved pwq max_active */ + int nr_drainers; /* WQ: drain in progress */ + int saved_max_active; /* WQ: saved pwq max_active */ + + struct workqueue_attrs *unbound_attrs; /* WQ: only for unbound wqs */ + struct pool_workqueue *dfl_pwq; /* WQ: only for unbound wqs */ + +#ifdef CONFIG_SYSFS + struct wq_device *wq_dev; /* I: for sysfs interface */ +#endif #ifdef CONFIG_LOCKDEP struct lockdep_map lockdep_map; #endif - char name[]; /* I: workqueue name */ + char name[WQ_NAME_LEN]; /* I: workqueue name */ + + /* hot fields used during command issue, aligned to cacheline */ + unsigned int flags ____cacheline_aligned; /* WQ: WQ_* flags */ + struct pool_workqueue __percpu *cpu_pwqs; /* I: per-cpu pwqs */ + struct pool_workqueue __rcu *numa_pwq_tbl[]; /* FR: unbound pwqs indexed by node */ }; +static struct kmem_cache *pwq_cache; + +static int wq_numa_tbl_len; /* highest possible NUMA node id + 1 */ +static cpumask_var_t *wq_numa_possible_cpumask; + /* possible CPUs of each node */ + +static bool wq_disable_numa; +module_param_named(disable_numa, wq_disable_numa, bool, 0444); + +static bool wq_numa_enabled; /* unbound NUMA affinity enabled */ + +/* buf for wq_update_unbound_numa_attrs(), protected by CPU hotplug exclusion */ +static struct workqueue_attrs *wq_update_unbound_numa_attrs_buf; + +static DEFINE_MUTEX(wq_pool_mutex); /* protects pools and workqueues list */ +static DEFINE_SPINLOCK(wq_mayday_lock); /* protects wq->maydays list */ + +static LIST_HEAD(workqueues); /* PL: list of all workqueues */ +static bool workqueue_freezing; /* PL: have wqs started freezing? */ + +/* the per-cpu worker pools */ +static DEFINE_PER_CPU_SHARED_ALIGNED(struct worker_pool [NR_STD_WORKER_POOLS], + cpu_worker_pools); + +static DEFINE_IDR(worker_pool_idr); /* PR: idr of all pools */ + +/* PL: hash of all unbound pools keyed by pool->attrs */ +static DEFINE_HASHTABLE(unbound_pool_hash, UNBOUND_POOL_HASH_ORDER); + +/* I: attributes used when instantiating standard unbound pools on demand */ +static struct workqueue_attrs *unbound_std_wq_attrs[NR_STD_WORKER_POOLS]; + struct workqueue_struct *system_wq __read_mostly; EXPORT_SYMBOL_GPL(system_wq); struct workqueue_struct *system_highpri_wq __read_mostly; @@ -244,64 +305,87 @@ EXPORT_SYMBOL_GPL(system_unbound_wq); struct workqueue_struct *system_freezable_wq __read_mostly; EXPORT_SYMBOL_GPL(system_freezable_wq); +static int worker_thread(void *__worker); +static void copy_workqueue_attrs(struct workqueue_attrs *to, + const struct workqueue_attrs *from); + #define CREATE_TRACE_POINTS #include <trace/events/workqueue.h> -#define for_each_std_worker_pool(pool, cpu) \ - for ((pool) = &std_worker_pools(cpu)[0]; \ - (pool) < &std_worker_pools(cpu)[NR_STD_WORKER_POOLS]; (pool)++) +#define assert_rcu_or_pool_mutex() \ + rcu_lockdep_assert(rcu_read_lock_sched_held() || \ + lockdep_is_held(&wq_pool_mutex), \ + "sched RCU or wq_pool_mutex should be held") -#define for_each_busy_worker(worker, i, pool) \ - hash_for_each(pool->busy_hash, i, worker, hentry) +#define assert_rcu_or_wq_mutex(wq) \ + rcu_lockdep_assert(rcu_read_lock_sched_held() || \ + lockdep_is_held(&wq->mutex), \ + "sched RCU or wq->mutex should be held") -static inline int __next_wq_cpu(int cpu, const struct cpumask *mask, - unsigned int sw) -{ - if (cpu < nr_cpu_ids) { - if (sw & 1) { - cpu = cpumask_next(cpu, mask); - if (cpu < nr_cpu_ids) - return cpu; - } - if (sw & 2) - return WORK_CPU_UNBOUND; - } - return WORK_CPU_END; -} +#ifdef CONFIG_LOCKDEP +#define assert_manager_or_pool_lock(pool) \ + WARN_ONCE(debug_locks && \ + !lockdep_is_held(&(pool)->manager_mutex) && \ + !lockdep_is_held(&(pool)->lock), \ + "pool->manager_mutex or ->lock should be held") +#else +#define assert_manager_or_pool_lock(pool) do { } while (0) +#endif -static inline int __next_pwq_cpu(int cpu, const struct cpumask *mask, - struct workqueue_struct *wq) -{ - return __next_wq_cpu(cpu, mask, !(wq->flags & WQ_UNBOUND) ? 1 : 2); -} +#define for_each_cpu_worker_pool(pool, cpu) \ + for ((pool) = &per_cpu(cpu_worker_pools, cpu)[0]; \ + (pool) < &per_cpu(cpu_worker_pools, cpu)[NR_STD_WORKER_POOLS]; \ + (pool)++) -/* - * CPU iterators +/** + * for_each_pool - iterate through all worker_pools in the system + * @pool: iteration cursor + * @pi: integer used for iteration * - * An extra cpu number is defined using an invalid cpu number - * (WORK_CPU_UNBOUND) to host workqueues which are not bound to any - * specific CPU. The following iterators are similar to for_each_*_cpu() - * iterators but also considers the unbound CPU. + * This must be called either with wq_pool_mutex held or sched RCU read + * locked. If the pool needs to be used beyond the locking in effect, the + * caller is responsible for guaranteeing that the pool stays online. * - * for_each_wq_cpu() : possible CPUs + WORK_CPU_UNBOUND - * for_each_online_wq_cpu() : online CPUs + WORK_CPU_UNBOUND - * for_each_pwq_cpu() : possible CPUs for bound workqueues, - * WORK_CPU_UNBOUND for unbound workqueues + * The if/else clause exists only for the lockdep assertion and can be + * ignored. */ -#define for_each_wq_cpu(cpu) \ - for ((cpu) = __next_wq_cpu(-1, cpu_possible_mask, 3); \ - (cpu) < WORK_CPU_END; \ - (cpu) = __next_wq_cpu((cpu), cpu_possible_mask, 3)) +#define for_each_pool(pool, pi) \ + idr_for_each_entry(&worker_pool_idr, pool, pi) \ + if (({ assert_rcu_or_pool_mutex(); false; })) { } \ + else -#define for_each_online_wq_cpu(cpu) \ - for ((cpu) = __next_wq_cpu(-1, cpu_online_mask, 3); \ - (cpu) < WORK_CPU_END; \ - (cpu) = __next_wq_cpu((cpu), cpu_online_mask, 3)) +/** + * for_each_pool_worker - iterate through all workers of a worker_pool + * @worker: iteration cursor + * @wi: integer used for iteration + * @pool: worker_pool to iterate workers of + * + * This must be called with either @pool->manager_mutex or ->lock held. + * + * The if/else clause exists only for the lockdep assertion and can be + * ignored. + */ +#define for_each_pool_worker(worker, wi, pool) \ + idr_for_each_entry(&(pool)->worker_idr, (worker), (wi)) \ + if (({ assert_manager_or_pool_lock((pool)); false; })) { } \ + else -#define for_each_pwq_cpu(cpu, wq) \ - for ((cpu) = __next_pwq_cpu(-1, cpu_possible_mask, (wq)); \ - (cpu) < WORK_CPU_END; \ - (cpu) = __next_pwq_cpu((cpu), cpu_possible_mask, (wq))) +/** + * for_each_pwq - iterate through all pool_workqueues of the specified workqueue + * @pwq: iteration cursor + * @wq: the target workqueue + * + * This must be called either with wq->mutex held or sched RCU read locked. + * If the pwq needs to be used beyond the locking in effect, the caller is + * responsible for guaranteeing that the pwq stays online. + * + * The if/else clause exists only for the lockdep assertion and can be + * ignored. + */ +#define for_each_pwq(pwq, wq) \ + list_for_each_entry_rcu((pwq), &(wq)->pwqs, pwqs_node) \ + if (({ assert_rcu_or_wq_mutex(wq); false; })) { } \ + else #ifdef CONFIG_DEBUG_OBJECTS_WORK @@ -419,76 +503,35 @@ static inline void debug_work_activate(struct work_struct *work) { } static inline void debug_work_deactivate(struct work_struct *work) { } #endif -/* Serializes the accesses to the list of workqueues. */ -static DEFINE_SPINLOCK(workqueue_lock); -static LIST_HEAD(workqueues); -static bool workqueue_freezing; /* W: have wqs started freezing? */ - -/* - * The CPU and unbound standard worker pools. The unbound ones have - * POOL_DISASSOCIATED set, and their workers have WORKER_UNBOUND set. - */ -static DEFINE_PER_CPU_SHARED_ALIGNED(struct worker_pool [NR_STD_WORKER_POOLS], - cpu_std_worker_pools); -static struct worker_pool unbound_std_worker_pools[NR_STD_WORKER_POOLS]; - -/* idr of all pools */ -static DEFINE_MUTEX(worker_pool_idr_mutex); -static DEFINE_IDR(worker_pool_idr); - -static int worker_thread(void *__worker); - -static struct worker_pool *std_worker_pools(int cpu) -{ - if (cpu != WORK_CPU_UNBOUND) - return per_cpu(cpu_std_worker_pools, cpu); - else - return unbound_std_worker_pools; -} - -static int std_worker_pool_pri(struct worker_pool *pool) -{ - return pool - std_worker_pools(pool->cpu); -} - /* allocate ID and assign it to @pool */ static int worker_pool_assign_id(struct worker_pool *pool) { int ret; - mutex_lock(&worker_pool_idr_mutex); - idr_pre_get(&worker_pool_idr, GFP_KERNEL); - ret = idr_get_new(&worker_pool_idr, pool, &pool->id); - mutex_unlock(&worker_pool_idr_mutex); + lockdep_assert_held(&wq_pool_mutex); + ret = idr_alloc(&worker_pool_idr, pool, 0, 0, GFP_KERNEL); + if (ret >= 0) { + pool->id = ret; + return 0; + } return ret; } -/* - * Lookup worker_pool by id. The idr currently is built during boot and - * never modified. Don't worry about locking for now. +/** + * unbound_pwq_by_node - return the unbound pool_workqueue for the given node + * @wq: the target workqueue + * @node: the node ID + * + * This must be called either with pwq_lock held or sched RCU read locked. + * If the pwq needs to be used beyond the locking in effect, the caller is + * responsible for guaranteeing that the pwq stays online. */ -static struct worker_pool *worker_pool_by_id(int pool_id) -{ - return idr_find(&worker_pool_idr, pool_id); -} - -static struct worker_pool *get_std_worker_pool(int cpu, bool highpri) +static struct pool_workqueue *unbound_pwq_by_node(struct workqueue_struct *wq, + int node) { - struct worker_pool *pools = std_worker_pools(cpu); - - return &pools[highpri]; -} - -static struct pool_workqueue *get_pwq(unsigned int cpu, - struct workqueue_struct *wq) -{ - if (!(wq->flags & WQ_UNBOUND)) { - if (likely(cpu < nr_cpu_ids)) - return per_cpu_ptr(wq->pool_wq.pcpu, cpu); - } else if (likely(cpu == WORK_CPU_UNBOUND)) - return wq->pool_wq.single; - return NULL; + assert_rcu_or_wq_mutex(wq); + return rcu_dereference_raw(wq->numa_pwq_tbl[node]); } static unsigned int work_color_to_flags(int color) @@ -530,7 +573,7 @@ static int work_next_color(int color) static inline void set_work_data(struct work_struct *work, unsigned long data, unsigned long flags) { - BUG_ON(!work_pending(work)); + WARN_ON_ONCE(!work_pending(work)); atomic_long_set(&work->data, data | flags | work_static(work)); } @@ -582,13 +625,23 @@ static struct pool_workqueue *get_work_pwq(struct work_struct *work) * @work: the work item of interest * * Return the worker_pool @work was last associated with. %NULL if none. + * + * Pools are created and destroyed under wq_pool_mutex, and allows read + * access under sched-RCU read lock. As such, this function should be + * called under wq_pool_mutex or with preemption disabled. + * + * All fields of the returned pool are accessible as long as the above + * mentioned locking is in effect. If the returned pool needs to be used + * beyond the critical section, the caller is responsible for ensuring the + * returned pool is and stays online. */ static struct worker_pool *get_work_pool(struct work_struct *work) { unsigned long data = atomic_long_read(&work->data); - struct worker_pool *pool; int pool_id; + assert_rcu_or_pool_mutex(); + if (data & WORK_STRUCT_PWQ) return ((struct pool_workqueue *) (data & WORK_STRUCT_WQ_DATA_MASK))->pool; @@ -597,9 +650,7 @@ static struct worker_pool *get_work_pool(struct work_struct *work) if (pool_id == WORK_OFFQ_POOL_NONE) return NULL; - pool = worker_pool_by_id(pool_id); - WARN_ON_ONCE(!pool); - return pool; + return idr_find(&worker_pool_idr, pool_id); } /** @@ -688,7 +739,7 @@ static bool need_to_manage_workers(struct worker_pool *pool) /* Do we have too many workers and should some go away? */ static bool too_many_workers(struct worker_pool *pool) { - bool managing = pool->flags & POOL_MANAGING_WORKERS; + bool managing = mutex_is_locked(&pool->manager_arb); int nr_idle = pool->nr_idle + managing; /* manager is considered idle */ int nr_busy = pool->nr_workers - nr_idle; @@ -743,7 +794,7 @@ static void wake_up_worker(struct worker_pool *pool) * CONTEXT: * spin_lock_irq(rq->lock) */ -void wq_worker_waking_up(struct task_struct *task, unsigned int cpu) +void wq_worker_waking_up(struct task_struct *task, int cpu) { struct worker *worker = kthread_data(task); @@ -768,8 +819,7 @@ void wq_worker_waking_up(struct task_struct *task, unsigned int cpu) * RETURNS: * Worker task on @cpu to wake up, %NULL if none. */ -struct task_struct *wq_worker_sleeping(struct task_struct *task, - unsigned int cpu) +struct task_struct *wq_worker_sleeping(struct task_struct *task, int cpu) { struct worker *worker = kthread_data(task), *to_wakeup = NULL; struct worker_pool *pool; @@ -785,7 +835,8 @@ struct task_struct *wq_worker_sleeping(struct task_struct *task, pool = worker->pool; /* this can only happen on the local cpu */ - BUG_ON(cpu != raw_smp_processor_id()); + if (WARN_ON_ONCE(cpu != raw_smp_processor_id())) + return NULL; /* * The counterpart of the following dec_and_test, implied mb, @@ -890,13 +941,12 @@ static inline void worker_clr_flags(struct worker *worker, unsigned int flags) * recycled work item as currently executing and make it wait until the * current execution finishes, introducing an unwanted dependency. * - * This function checks the work item address, work function and workqueue - * to avoid false positives. Note that this isn't complete as one may - * construct a work function which can introduce dependency onto itself - * through a recycled work item. Well, if somebody wants to shoot oneself - * in the foot that badly, there's only so much we can do, and if such - * deadlock actually occurs, it should be easy to locate the culprit work - * function. + * This function checks the work item address and work function to avoid + * false positives. Note that this isn't complete as one may construct a + * work function which can introduce dependency onto itself through a + * recycled work item. Well, if somebody wants to shoot oneself in the + * foot that badly, there's only so much we can do, and if such deadlock + * actually occurs, it should be easy to locate the culprit work function. * * CONTEXT: * spin_lock_irq(pool->lock). @@ -960,6 +1010,64 @@ static void move_linked_works(struct work_struct *work, struct list_head *head, *nextp = n; } +/** + * get_pwq - get an extra reference on the specified pool_workqueue + * @pwq: pool_workqueue to get + * + * Obtain an extra reference on @pwq. The caller should guarantee that + * @pwq has positive refcnt and be holding the matching pool->lock. + */ +static void get_pwq(struct pool_workqueue *pwq) +{ + lockdep_assert_held(&pwq->pool->lock); + WARN_ON_ONCE(pwq->refcnt <= 0); + pwq->refcnt++; +} + +/** + * put_pwq - put a pool_workqueue reference + * @pwq: pool_workqueue to put + * + * Drop a reference of @pwq. If its refcnt reaches zero, schedule its + * destruction. The caller should be holding the matching pool->lock. + */ +static void put_pwq(struct pool_workqueue *pwq) +{ + lockdep_assert_held(&pwq->pool->lock); + if (likely(--pwq->refcnt)) + return; + if (WARN_ON_ONCE(!(pwq->wq->flags & WQ_UNBOUND))) + return; + /* + * @pwq can't be released under pool->lock, bounce to + * pwq_unbound_release_workfn(). This never recurses on the same + * pool->lock as this path is taken only for unbound workqueues and + * the release work item is scheduled on a per-cpu workqueue. To + * avoid lockdep warning, unbound pool->locks are given lockdep + * subclass of 1 in get_unbound_pool(). + */ + schedule_work(&pwq->unbound_release_work); +} + +/** + * put_pwq_unlocked - put_pwq() with surrounding pool lock/unlock + * @pwq: pool_workqueue to put (can be %NULL) + * + * put_pwq() with locking. This function also allows %NULL @pwq. + */ +static void put_pwq_unlocked(struct pool_workqueue *pwq) +{ + if (pwq) { + /* + * As both pwqs and pools are sched-RCU protected, the + * following lock operations are safe. + */ + spin_lock_irq(&pwq->pool->lock); + put_pwq(pwq); + spin_unlock_irq(&pwq->pool->lock); + } +} + static void pwq_activate_delayed_work(struct work_struct *work) { struct pool_workqueue *pwq = get_work_pwq(work); @@ -991,9 +1099,9 @@ static void pwq_activate_first_delayed(struct pool_workqueue *pwq) */ static void pwq_dec_nr_in_flight(struct pool_workqueue *pwq, int color) { - /* ignore uncolored works */ + /* uncolored work items don't participate in flushing or nr_active */ if (color == WORK_NO_COLOR) - return; + goto out_put; pwq->nr_in_flight[color]--; @@ -1006,11 +1114,11 @@ static void pwq_dec_nr_in_flight(struct pool_workqueue *pwq, int color) /* is flush in progress and are we at the flushing tip? */ if (likely(pwq->flush_color != color)) - return; + goto out_put; /* are there still in-flight works? */ if (pwq->nr_in_flight[color]) - return; + goto out_put; /* this pwq is done, clear flush_color */ pwq->flush_color = -1; @@ -1021,6 +1129,8 @@ static void pwq_dec_nr_in_flight(struct pool_workqueue *pwq, int color) */ if (atomic_dec_and_test(&pwq->wq->nr_pwqs_to_flush)) complete(&pwq->wq->first_flusher->done); +out_put: + put_pwq(pwq); } /** @@ -1143,11 +1253,12 @@ static void insert_work(struct pool_workqueue *pwq, struct work_struct *work, /* we own @work, set data and link */ set_work_pwq(work, pwq, extra_flags); list_add_tail(&work->entry, head); + get_pwq(pwq); /* - * Ensure either worker_sched_deactivated() sees the above - * list_add_tail() or we see zero nr_running to avoid workers - * lying around lazily while there are works to be processed. + * Ensure either wq_worker_sleeping() sees the above + * list_add_tail() or we see zero nr_running to avoid workers lying + * around lazily while there are works to be processed. */ smp_mb(); @@ -1171,10 +1282,11 @@ static bool is_chained_work(struct workqueue_struct *wq) return worker && worker->current_pwq->wq == wq; } -static void __queue_work(unsigned int cpu, struct workqueue_struct *wq, +static void __queue_work(int cpu, struct workqueue_struct *wq, struct work_struct *work) { struct pool_workqueue *pwq; + struct worker_pool *last_pool; struct list_head *worklist; unsigned int work_flags; unsigned int req_cpu = cpu; @@ -1190,48 +1302,62 @@ static void __queue_work(unsigned int cpu, struct workqueue_struct *wq, debug_work_activate(work); /* if dying, only works from the same workqueue are allowed */ - if (unlikely(wq->flags & WQ_DRAINING) && + if (unlikely(wq->flags & __WQ_DRAINING) && WARN_ON_ONCE(!is_chained_work(wq))) return; +retry: + if (req_cpu == WORK_CPU_UNBOUND) + cpu = raw_smp_processor_id(); - /* determine the pwq to use */ - if (!(wq->flags & WQ_UNBOUND)) { - struct worker_pool *last_pool; - - if (cpu == WORK_CPU_UNBOUND) - cpu = raw_smp_processor_id(); - - /* - * It's multi cpu. If @work was previously on a different - * cpu, it might still be running there, in which case the - * work needs to be queued on that cpu to guarantee - * non-reentrancy. - */ - pwq = get_pwq(cpu, wq); - last_pool = get_work_pool(work); + /* pwq which will be used unless @work is executing elsewhere */ + if (!(wq->flags & WQ_UNBOUND)) + pwq = per_cpu_ptr(wq->cpu_pwqs, cpu); + else + pwq = unbound_pwq_by_node(wq, cpu_to_node(cpu)); - if (last_pool && last_pool != pwq->pool) { - struct worker *worker; + /* + * If @work was previously on a different pool, it might still be + * running there, in which case the work needs to be queued on that + * pool to guarantee non-reentrancy. + */ + last_pool = get_work_pool(work); + if (last_pool && last_pool != pwq->pool) { + struct worker *worker; - spin_lock(&last_pool->lock); + spin_lock(&last_pool->lock); - worker = find_worker_executing_work(last_pool, work); + worker = find_worker_executing_work(last_pool, work); - if (worker && worker->current_pwq->wq == wq) { - pwq = get_pwq(last_pool->cpu, wq); - } else { - /* meh... not running there, queue here */ - spin_unlock(&last_pool->lock); - spin_lock(&pwq->pool->lock); - } + if (worker && worker->current_pwq->wq == wq) { + pwq = worker->current_pwq; } else { + /* meh... not running there, queue here */ + spin_unlock(&last_pool->lock); spin_lock(&pwq->pool->lock); } } else { - pwq = get_pwq(WORK_CPU_UNBOUND, wq); spin_lock(&pwq->pool->lock); } + /* + * pwq is determined and locked. For unbound pools, we could have + * raced with pwq release and it could already be dead. If its + * refcnt is zero, repeat pwq selection. Note that pwqs never die + * without another pwq replacing it in the numa_pwq_tbl or while + * work items are executing on it, so the retrying is guaranteed to + * make forward-progress. + */ + if (unlikely(!pwq->refcnt)) { + if (wq->flags & WQ_UNBOUND) { + spin_unlock(&pwq->pool->lock); + cpu_relax(); + goto retry; + } + /* oops */ + WARN_ONCE(true, "workqueue: per-cpu pwq for %s on cpu%d has 0 refcnt", + wq->name, cpu); + } + /* pwq determined, queue */ trace_workqueue_queue_work(req_cpu, pwq, work); @@ -1286,22 +1412,6 @@ bool queue_work_on(int cpu, struct workqueue_struct *wq, } EXPORT_SYMBOL_GPL(queue_work_on); -/** - * queue_work - queue work on a workqueue - * @wq: workqueue to use - * @work: work to queue - * - * Returns %false if @work was already on a queue, %true otherwise. - * - * We queue the work to the CPU on which it was submitted, but if the CPU dies - * it can be processed by another CPU. - */ -bool queue_work(struct workqueue_struct *wq, struct work_struct *work) -{ - return queue_work_on(WORK_CPU_UNBOUND, wq, work); -} -EXPORT_SYMBOL_GPL(queue_work); - void delayed_work_timer_fn(unsigned long __data) { struct delayed_work *dwork = (struct delayed_work *)__data; @@ -1377,21 +1487,6 @@ bool queue_delayed_work_on(int cpu, struct workqueue_struct *wq, EXPORT_SYMBOL_GPL(queue_delayed_work_on); /** - * queue_delayed_work - queue work on a workqueue after delay - * @wq: workqueue to use - * @dwork: delayable work to queue - * @delay: number of jiffies to wait before queueing - * - * Equivalent to queue_delayed_work_on() but tries to use the local CPU. - */ -bool queue_delayed_work(struct workqueue_struct *wq, - struct delayed_work *dwork, unsigned long delay) -{ - return queue_delayed_work_on(WORK_CPU_UNBOUND, wq, dwork, delay); -} -EXPORT_SYMBOL_GPL(queue_delayed_work); - -/** * mod_delayed_work_on - modify delay of or queue a delayed work on specific CPU * @cpu: CPU number to execute work on * @wq: workqueue to use @@ -1430,21 +1525,6 @@ bool mod_delayed_work_on(int cpu, struct workqueue_struct *wq, EXPORT_SYMBOL_GPL(mod_delayed_work_on); /** - * mod_delayed_work - modify delay of or queue a delayed work - * @wq: workqueue to use - * @dwork: work to queue - * @delay: number of jiffies to wait before queueing - * - * mod_delayed_work_on() on local CPU. - */ -bool mod_delayed_work(struct workqueue_struct *wq, struct delayed_work *dwork, - unsigned long delay) -{ - return mod_delayed_work_on(WORK_CPU_UNBOUND, wq, dwork, delay); -} -EXPORT_SYMBOL_GPL(mod_delayed_work); - -/** * worker_enter_idle - enter idle state * @worker: worker which is entering idle state * @@ -1458,9 +1538,10 @@ static void worker_enter_idle(struct worker *worker) { struct worker_pool *pool = worker->pool; - BUG_ON(worker->flags & WORKER_IDLE); - BUG_ON(!list_empty(&worker->entry) && - (worker->hentry.next || worker->hentry.pprev)); + if (WARN_ON_ONCE(worker->flags & WORKER_IDLE) || + WARN_ON_ONCE(!list_empty(&worker->entry) && + (worker->hentry.next || worker->hentry.pprev))) + return; /* can't use worker_set_flags(), also called from start_worker() */ worker->flags |= WORKER_IDLE; @@ -1497,22 +1578,25 @@ static void worker_leave_idle(struct worker *worker) { struct worker_pool *pool = worker->pool; - BUG_ON(!(worker->flags & WORKER_IDLE)); + if (WARN_ON_ONCE(!(worker->flags & WORKER_IDLE))) + return; worker_clr_flags(worker, WORKER_IDLE); pool->nr_idle--; list_del_init(&worker->entry); } /** - * worker_maybe_bind_and_lock - bind worker to its cpu if possible and lock pool - * @worker: self + * worker_maybe_bind_and_lock - try to bind %current to worker_pool and lock it + * @pool: target worker_pool + * + * Bind %current to the cpu of @pool if it is associated and lock @pool. * * Works which are scheduled while the cpu is online must at least be * scheduled to a worker which is bound to the cpu so that if they are * flushed from cpu callbacks while cpu is going down, they are * guaranteed to execute on the cpu. * - * This function is to be used by rogue workers and rescuers to bind + * This function is to be used by unbound workers and rescuers to bind * themselves to the target cpu and may race with cpu going down or * coming online. kthread_bind() can't be used because it may put the * worker to already dead cpu and set_cpus_allowed_ptr() can't be used @@ -1533,12 +1617,9 @@ static void worker_leave_idle(struct worker *worker) * %true if the associated pool is online (@worker is successfully * bound), %false if offline. */ -static bool worker_maybe_bind_and_lock(struct worker *worker) +static bool worker_maybe_bind_and_lock(struct worker_pool *pool) __acquires(&pool->lock) { - struct worker_pool *pool = worker->pool; - struct task_struct *task = worker->task; - while (true) { /* * The following call may fail, succeed or succeed @@ -1547,14 +1628,13 @@ __acquires(&pool->lock) * against POOL_DISASSOCIATED. */ if (!(pool->flags & POOL_DISASSOCIATED)) - set_cpus_allowed_ptr(task, get_cpu_mask(pool->cpu)); + set_cpus_allowed_ptr(current, pool->attrs->cpumask); spin_lock_irq(&pool->lock); if (pool->flags & POOL_DISASSOCIATED) return false; - if (task_cpu(task) == pool->cpu && - cpumask_equal(¤t->cpus_allowed, - get_cpu_mask(pool->cpu))) + if (task_cpu(current) == pool->cpu && + cpumask_equal(¤t->cpus_allowed, pool->attrs->cpumask)) return true; spin_unlock_irq(&pool->lock); @@ -1569,108 +1649,6 @@ __acquires(&pool->lock) } } -/* - * Rebind an idle @worker to its CPU. worker_thread() will test - * list_empty(@worker->entry) before leaving idle and call this function. - */ -static void idle_worker_rebind(struct worker *worker) -{ - /* CPU may go down again inbetween, clear UNBOUND only on success */ - if (worker_maybe_bind_and_lock(worker)) - worker_clr_flags(worker, WORKER_UNBOUND); - - /* rebind complete, become available again */ - list_add(&worker->entry, &worker->pool->idle_list); - spin_unlock_irq(&worker->pool->lock); -} - -/* - * Function for @worker->rebind.work used to rebind unbound busy workers to - * the associated cpu which is coming back online. This is scheduled by - * cpu up but can race with other cpu hotplug operations and may be - * executed twice without intervening cpu down. - */ -static void busy_worker_rebind_fn(struct work_struct *work) -{ - struct worker *worker = container_of(work, struct worker, rebind_work); - - if (worker_maybe_bind_and_lock(worker)) - worker_clr_flags(worker, WORKER_UNBOUND); - - spin_unlock_irq(&worker->pool->lock); -} - -/** - * rebind_workers - rebind all workers of a pool to the associated CPU - * @pool: pool of interest - * - * @pool->cpu is coming online. Rebind all workers to the CPU. Rebinding - * is different for idle and busy ones. - * - * Idle ones will be removed from the idle_list and woken up. They will - * add themselves back after completing rebind. This ensures that the - * idle_list doesn't contain any unbound workers when re-bound busy workers - * try to perform local wake-ups for concurrency management. - * - * Busy workers can rebind after they finish their current work items. - * Queueing the rebind work item at the head of the scheduled list is - * enough. Note that nr_running will be properly bumped as busy workers - * rebind. - * - * On return, all non-manager workers are scheduled for rebind - see - * manage_workers() for the manager special case. Any idle worker - * including the manager will not appear on @idle_list until rebind is - * complete, making local wake-ups safe. - */ -static void rebind_workers(struct worker_pool *pool) -{ - struct worker *worker, *n; - int i; - - lockdep_assert_held(&pool->assoc_mutex); - lockdep_assert_held(&pool->lock); - - /* dequeue and kick idle ones */ - list_for_each_entry_safe(worker, n, &pool->idle_list, entry) { - /* - * idle workers should be off @pool->idle_list until rebind - * is complete to avoid receiving premature local wake-ups. - */ - list_del_init(&worker->entry); - - /* - * worker_thread() will see the above dequeuing and call - * idle_worker_rebind(). - */ - wake_up_process(worker->task); - } - - /* rebind busy workers */ - for_each_busy_worker(worker, i, pool) { - struct work_struct *rebind_work = &worker->rebind_work; - struct workqueue_struct *wq; - - if (test_and_set_bit(WORK_STRUCT_PENDING_BIT, - work_data_bits(rebind_work))) - continue; - - debug_work_activate(rebind_work); - - /* - * wq doesn't really matter but let's keep @worker->pool - * and @pwq->pool consistent for sanity. - */ - if (std_worker_pool_pri(worker->pool)) - wq = system_highpri_wq; - else - wq = system_wq; - - insert_work(get_pwq(pool->cpu, wq), rebind_work, - worker->scheduled.next, - work_color_to_flags(WORK_NO_COLOR)); - } -} - static struct worker *alloc_worker(void) { struct worker *worker; @@ -1679,7 +1657,6 @@ static struct worker *alloc_worker(void) if (worker) { INIT_LIST_HEAD(&worker->entry); INIT_LIST_HEAD(&worker->scheduled); - INIT_WORK(&worker->rebind_work, busy_worker_rebind_fn); /* on creation a worker is in !idle && prep state */ worker->flags = WORKER_PREP; } @@ -1702,18 +1679,25 @@ static struct worker *alloc_worker(void) */ static struct worker *create_worker(struct worker_pool *pool) { - const char *pri = std_worker_pool_pri(pool) ? "H" : ""; struct worker *worker = NULL; int id = -1; + char id_buf[16]; + lockdep_assert_held(&pool->manager_mutex); + + /* + * ID is needed to determine kthread name. Allocate ID first + * without installing the pointer. + */ + idr_preload(GFP_KERNEL); spin_lock_irq(&pool->lock); - while (ida_get_new(&pool->worker_ida, &id)) { - spin_unlock_irq(&pool->lock); - if (!ida_pre_get(&pool->worker_ida, GFP_KERNEL)) - goto fail; - spin_lock_irq(&pool->lock); - } + + id = idr_alloc(&pool->worker_idr, NULL, 0, 0, GFP_NOWAIT); + spin_unlock_irq(&pool->lock); + idr_preload_end(); + if (id < 0) + goto fail; worker = alloc_worker(); if (!worker) @@ -1722,40 +1706,46 @@ static struct worker *create_worker(struct worker_pool *pool) worker->pool = pool; worker->id = id; - if (pool->cpu != WORK_CPU_UNBOUND) - worker->task = kthread_create_on_node(worker_thread, - worker, cpu_to_node(pool->cpu), - "kworker/%u:%d%s", pool->cpu, id, pri); + if (pool->cpu >= 0) + snprintf(id_buf, sizeof(id_buf), "%d:%d%s", pool->cpu, id, + pool->attrs->nice < 0 ? "H" : ""); else - worker->task = kthread_create(worker_thread, worker, - "kworker/u:%d%s", id, pri); + snprintf(id_buf, sizeof(id_buf), "u%d:%d", pool->id, id); + + worker->task = kthread_create_on_node(worker_thread, worker, pool->node, + "kworker/%s", id_buf); if (IS_ERR(worker->task)) goto fail; - if (std_worker_pool_pri(pool)) - set_user_nice(worker->task, HIGHPRI_NICE_LEVEL); + /* + * set_cpus_allowed_ptr() will fail if the cpumask doesn't have any + * online CPUs. It'll be re-applied when any of the CPUs come up. + */ + set_user_nice(worker->task, pool->attrs->nice); + set_cpus_allowed_ptr(worker->task, pool->attrs->cpumask); + + /* prevent userland from meddling with cpumask of workqueue workers */ + worker->task->flags |= PF_NO_SETAFFINITY; /* - * Determine CPU binding of the new worker depending on - * %POOL_DISASSOCIATED. The caller is responsible for ensuring the - * flag remains stable across this function. See the comments - * above the flag definition for details. - * - * As an unbound worker may later become a regular one if CPU comes - * online, make sure every worker has %PF_THREAD_BOUND set. + * The caller is responsible for ensuring %POOL_DISASSOCIATED + * remains stable across this function. See the comments above the + * flag definition for details. */ - if (!(pool->flags & POOL_DISASSOCIATED)) { - kthread_bind(worker->task, pool->cpu); - } else { - worker->task->flags |= PF_THREAD_BOUND; + if (pool->flags & POOL_DISASSOCIATED) worker->flags |= WORKER_UNBOUND; - } + + /* successful, commit the pointer to idr */ + spin_lock_irq(&pool->lock); + idr_replace(&pool->worker_idr, worker, worker->id); + spin_unlock_irq(&pool->lock); return worker; + fail: if (id >= 0) { spin_lock_irq(&pool->lock); - ida_remove(&pool->worker_ida, id); + idr_remove(&pool->worker_idr, id); spin_unlock_irq(&pool->lock); } kfree(worker); @@ -1780,6 +1770,30 @@ static void start_worker(struct worker *worker) } /** + * create_and_start_worker - create and start a worker for a pool + * @pool: the target pool + * + * Grab the managership of @pool and create and start a new worker for it. + */ +static int create_and_start_worker(struct worker_pool *pool) +{ + struct worker *worker; + + mutex_lock(&pool->manager_mutex); + + worker = create_worker(pool); + if (worker) { + spin_lock_irq(&pool->lock); + start_worker(worker); + spin_unlock_irq(&pool->lock); + } + + mutex_unlock(&pool->manager_mutex); + + return worker ? 0 : -ENOMEM; +} + +/** * destroy_worker - destroy a workqueue worker * @worker: worker to be destroyed * @@ -1791,11 +1805,14 @@ static void start_worker(struct worker *worker) static void destroy_worker(struct worker *worker) { struct worker_pool *pool = worker->pool; - int id = worker->id; + + lockdep_assert_held(&pool->manager_mutex); + lockdep_assert_held(&pool->lock); /* sanity check frenzy */ - BUG_ON(worker->current_work); - BUG_ON(!list_empty(&worker->scheduled)); + if (WARN_ON(worker->current_work) || + WARN_ON(!list_empty(&worker->scheduled))) + return; if (worker->flags & WORKER_STARTED) pool->nr_workers--; @@ -1805,13 +1822,14 @@ static void destroy_worker(struct worker *worker) list_del_init(&worker->entry); worker->flags |= WORKER_DIE; + idr_remove(&pool->worker_idr, worker->id); + spin_unlock_irq(&pool->lock); kthread_stop(worker->task); kfree(worker); spin_lock_irq(&pool->lock); - ida_remove(&pool->worker_ida, id); } static void idle_worker_timeout(unsigned long __pool) @@ -1840,23 +1858,21 @@ static void idle_worker_timeout(unsigned long __pool) spin_unlock_irq(&pool->lock); } -static bool send_mayday(struct work_struct *work) +static void send_mayday(struct work_struct *work) { struct pool_workqueue *pwq = get_work_pwq(work); struct workqueue_struct *wq = pwq->wq; - unsigned int cpu; - if (!(wq->flags & WQ_RESCUER)) - return false; + lockdep_assert_held(&wq_mayday_lock); + + if (!wq->rescuer) + return; /* mayday mayday mayday */ - cpu = pwq->pool->cpu; - /* WORK_CPU_UNBOUND can't be set in cpumask, use cpu 0 instead */ - if (cpu == WORK_CPU_UNBOUND) - cpu = 0; - if (!mayday_test_and_set_cpu(cpu, wq->mayday_mask)) + if (list_empty(&pwq->mayday_node)) { + list_add_tail(&pwq->mayday_node, &wq->maydays); wake_up_process(wq->rescuer->task); - return true; + } } static void pool_mayday_timeout(unsigned long __pool) @@ -1864,7 +1880,8 @@ static void pool_mayday_timeout(unsigned long __pool) struct worker_pool *pool = (void *)__pool; struct work_struct *work; - spin_lock_irq(&pool->lock); + spin_lock_irq(&wq_mayday_lock); /* for wq->maydays */ + spin_lock(&pool->lock); if (need_to_create_worker(pool)) { /* @@ -1877,7 +1894,8 @@ static void pool_mayday_timeout(unsigned long __pool) send_mayday(work); } - spin_unlock_irq(&pool->lock); + spin_unlock(&pool->lock); + spin_unlock_irq(&wq_mayday_lock); mod_timer(&pool->mayday_timer, jiffies + MAYDAY_INTERVAL); } @@ -1892,8 +1910,8 @@ static void pool_mayday_timeout(unsigned long __pool) * sent to all rescuers with works scheduled on @pool to resolve * possible allocation deadlock. * - * On return, need_to_create_worker() is guaranteed to be false and - * may_start_working() true. + * On return, need_to_create_worker() is guaranteed to be %false and + * may_start_working() %true. * * LOCKING: * spin_lock_irq(pool->lock) which may be released and regrabbed @@ -1901,7 +1919,7 @@ static void pool_mayday_timeout(unsigned long __pool) * manager. * * RETURNS: - * false if no action was taken and pool->lock stayed locked, true + * %false if no action was taken and pool->lock stayed locked, %true * otherwise. */ static bool maybe_create_worker(struct worker_pool *pool) @@ -1924,7 +1942,8 @@ restart: del_timer_sync(&pool->mayday_timer); spin_lock_irq(&pool->lock); start_worker(worker); - BUG_ON(need_to_create_worker(pool)); + if (WARN_ON_ONCE(need_to_create_worker(pool))) + goto restart; return true; } @@ -1957,7 +1976,7 @@ restart: * multiple times. Called only from manager. * * RETURNS: - * false if no action was taken and pool->lock stayed locked, true + * %false if no action was taken and pool->lock stayed locked, %true * otherwise. */ static bool maybe_destroy_workers(struct worker_pool *pool) @@ -2008,42 +2027,37 @@ static bool manage_workers(struct worker *worker) struct worker_pool *pool = worker->pool; bool ret = false; - if (pool->flags & POOL_MANAGING_WORKERS) + /* + * Managership is governed by two mutexes - manager_arb and + * manager_mutex. manager_arb handles arbitration of manager role. + * Anyone who successfully grabs manager_arb wins the arbitration + * and becomes the manager. mutex_trylock() on pool->manager_arb + * failure while holding pool->lock reliably indicates that someone + * else is managing the pool and the worker which failed trylock + * can proceed to executing work items. This means that anyone + * grabbing manager_arb is responsible for actually performing + * manager duties. If manager_arb is grabbed and released without + * actual management, the pool may stall indefinitely. + * + * manager_mutex is used for exclusion of actual management + * operations. The holder of manager_mutex can be sure that none + * of management operations, including creation and destruction of + * workers, won't take place until the mutex is released. Because + * manager_mutex doesn't interfere with manager role arbitration, + * it is guaranteed that the pool's management, while may be + * delayed, won't be disturbed by someone else grabbing + * manager_mutex. + */ + if (!mutex_trylock(&pool->manager_arb)) return ret; - pool->flags |= POOL_MANAGING_WORKERS; - /* - * To simplify both worker management and CPU hotplug, hold off - * management while hotplug is in progress. CPU hotplug path can't - * grab %POOL_MANAGING_WORKERS to achieve this because that can - * lead to idle worker depletion (all become busy thinking someone - * else is managing) which in turn can result in deadlock under - * extreme circumstances. Use @pool->assoc_mutex to synchronize - * manager against CPU hotplug. - * - * assoc_mutex would always be free unless CPU hotplug is in - * progress. trylock first without dropping @pool->lock. + * With manager arbitration won, manager_mutex would be free in + * most cases. trylock first without dropping @pool->lock. */ - if (unlikely(!mutex_trylock(&pool->assoc_mutex))) { + if (unlikely(!mutex_trylock(&pool->manager_mutex))) { spin_unlock_irq(&pool->lock); - mutex_lock(&pool->assoc_mutex); - /* - * CPU hotplug could have happened while we were waiting - * for assoc_mutex. Hotplug itself can't handle us - * because manager isn't either on idle or busy list, and - * @pool's state and ours could have deviated. - * - * As hotplug is now excluded via assoc_mutex, we can - * simply try to bind. It will succeed or fail depending - * on @pool's current state. Try it and adjust - * %WORKER_UNBOUND accordingly. - */ - if (worker_maybe_bind_and_lock(worker)) - worker->flags &= ~WORKER_UNBOUND; - else - worker->flags |= WORKER_UNBOUND; - + mutex_lock(&pool->manager_mutex); ret = true; } @@ -2056,8 +2070,8 @@ static bool manage_workers(struct worker *worker) ret |= maybe_destroy_workers(pool); ret |= maybe_create_worker(pool); - pool->flags &= ~POOL_MANAGING_WORKERS; - mutex_unlock(&pool->assoc_mutex); + mutex_unlock(&pool->manager_mutex); + mutex_unlock(&pool->manager_arb); return ret; } @@ -2211,11 +2225,11 @@ static void process_scheduled_works(struct worker *worker) * worker_thread - the worker thread function * @__worker: self * - * The worker thread function. There are NR_CPU_WORKER_POOLS dynamic pools - * of these per each cpu. These workers process all works regardless of - * their specific target workqueue. The only exception is works which - * belong to workqueues with a rescuer which will be explained in - * rescuer_thread(). + * The worker thread function. All workers belong to a worker_pool - + * either a per-cpu one or dynamic unbound one. These workers process all + * work items regardless of their specific target workqueue. The only + * exception is work items which belong to workqueues with a rescuer which + * will be explained in rescuer_thread(). */ static int worker_thread(void *__worker) { @@ -2227,19 +2241,12 @@ static int worker_thread(void *__worker) woke_up: spin_lock_irq(&pool->lock); - /* we are off idle list if destruction or rebind is requested */ - if (unlikely(list_empty(&worker->entry))) { + /* am I supposed to die? */ + if (unlikely(worker->flags & WORKER_DIE)) { spin_unlock_irq(&pool->lock); - - /* if DIE is set, destruction is requested */ - if (worker->flags & WORKER_DIE) { - worker->task->flags &= ~PF_WQ_WORKER; - return 0; - } - - /* otherwise, rebind */ - idle_worker_rebind(worker); - goto woke_up; + WARN_ON_ONCE(!list_empty(&worker->entry)); + worker->task->flags &= ~PF_WQ_WORKER; + return 0; } worker_leave_idle(worker); @@ -2257,14 +2264,16 @@ recheck: * preparing to process a work or actually processing it. * Make sure nobody diddled with it while I was sleeping. */ - BUG_ON(!list_empty(&worker->scheduled)); + WARN_ON_ONCE(!list_empty(&worker->scheduled)); /* - * When control reaches this point, we're guaranteed to have - * at least one idle worker or that someone else has already - * assumed the manager role. + * Finish PREP stage. We're guaranteed to have at least one idle + * worker or that someone else has already assumed the manager + * role. This is where @worker starts participating in concurrency + * management if applicable and concurrency management is restored + * after being rebound. See rebind_workers() for details. */ - worker_clr_flags(worker, WORKER_PREP); + worker_clr_flags(worker, WORKER_PREP | WORKER_REBOUND); do { struct work_struct *work = @@ -2306,7 +2315,7 @@ sleep: * @__rescuer: self * * Workqueue rescuer thread function. There's one rescuer for each - * workqueue which has WQ_RESCUER set. + * workqueue which has WQ_MEM_RECLAIM set. * * Regular work processing on a pool may block trying to create a new * worker which uses GFP_KERNEL allocation which has slight chance of @@ -2325,8 +2334,6 @@ static int rescuer_thread(void *__rescuer) struct worker *rescuer = __rescuer; struct workqueue_struct *wq = rescuer->rescue_wq; struct list_head *scheduled = &rescuer->scheduled; - bool is_unbound = wq->flags & WQ_UNBOUND; - unsigned int cpu; set_user_nice(current, RESCUER_NICE_LEVEL); @@ -2344,28 +2351,29 @@ repeat: return 0; } - /* - * See whether any cpu is asking for help. Unbounded - * workqueues use cpu 0 in mayday_mask for CPU_UNBOUND. - */ - for_each_mayday_cpu(cpu, wq->mayday_mask) { - unsigned int tcpu = is_unbound ? WORK_CPU_UNBOUND : cpu; - struct pool_workqueue *pwq = get_pwq(tcpu, wq); + /* see whether any pwq is asking for help */ + spin_lock_irq(&wq_mayday_lock); + + while (!list_empty(&wq->maydays)) { + struct pool_workqueue *pwq = list_first_entry(&wq->maydays, + struct pool_workqueue, mayday_node); struct worker_pool *pool = pwq->pool; struct work_struct *work, *n; __set_current_state(TASK_RUNNING); - mayday_clear_cpu(cpu, wq->mayday_mask); + list_del_init(&pwq->mayday_node); + + spin_unlock_irq(&wq_mayday_lock); /* migrate to the target cpu if possible */ + worker_maybe_bind_and_lock(pool); rescuer->pool = pool; - worker_maybe_bind_and_lock(rescuer); /* * Slurp in all works issued via this workqueue and * process'em. */ - BUG_ON(!list_empty(&rescuer->scheduled)); + WARN_ON_ONCE(!list_empty(&rescuer->scheduled)); list_for_each_entry_safe(work, n, &pool->worklist, entry) if (get_work_pwq(work) == pwq) move_linked_works(work, scheduled, &n); @@ -2380,9 +2388,13 @@ repeat: if (keep_working(pool)) wake_up_worker(pool); - spin_unlock_irq(&pool->lock); + rescuer->pool = NULL; + spin_unlock(&pool->lock); + spin_lock(&wq_mayday_lock); } + spin_unlock_irq(&wq_mayday_lock); + /* rescuers should never participate in concurrency management */ WARN_ON_ONCE(!(rescuer->flags & WORKER_NOT_RUNNING)); schedule(); @@ -2486,7 +2498,7 @@ static void insert_wq_barrier(struct pool_workqueue *pwq, * advanced to @work_color. * * CONTEXT: - * mutex_lock(wq->flush_mutex). + * mutex_lock(wq->mutex). * * RETURNS: * %true if @flush_color >= 0 and there's something to flush. %false @@ -2496,21 +2508,20 @@ static bool flush_workqueue_prep_pwqs(struct workqueue_struct *wq, int flush_color, int work_color) { bool wait = false; - unsigned int cpu; + struct pool_workqueue *pwq; if (flush_color >= 0) { - BUG_ON(atomic_read(&wq->nr_pwqs_to_flush)); + WARN_ON_ONCE(atomic_read(&wq->nr_pwqs_to_flush)); atomic_set(&wq->nr_pwqs_to_flush, 1); } - for_each_pwq_cpu(cpu, wq) { - struct pool_workqueue *pwq = get_pwq(cpu, wq); + for_each_pwq(pwq, wq) { struct worker_pool *pool = pwq->pool; spin_lock_irq(&pool->lock); if (flush_color >= 0) { - BUG_ON(pwq->flush_color != -1); + WARN_ON_ONCE(pwq->flush_color != -1); if (pwq->nr_in_flight[flush_color]) { pwq->flush_color = flush_color; @@ -2520,7 +2531,7 @@ static bool flush_workqueue_prep_pwqs(struct workqueue_struct *wq, } if (work_color >= 0) { - BUG_ON(work_color != work_next_color(pwq->work_color)); + WARN_ON_ONCE(work_color != work_next_color(pwq->work_color)); pwq->work_color = work_color; } @@ -2537,11 +2548,8 @@ static bool flush_workqueue_prep_pwqs(struct workqueue_struct *wq, * flush_workqueue - ensure that any scheduled work has run to completion. * @wq: workqueue to flush * - * Forces execution of the workqueue and blocks until its completion. - * This is typically used in driver shutdown handlers. - * - * We sleep until all works which were queued on entry have been handled, - * but we are not livelocked by new incoming ones. + * This function sleeps until all work items which were queued on entry + * have finished execution, but it is not livelocked by new incoming ones. */ void flush_workqueue(struct workqueue_struct *wq) { @@ -2555,7 +2563,7 @@ void flush_workqueue(struct workqueue_struct *wq) lock_map_acquire(&wq->lockdep_map); lock_map_release(&wq->lockdep_map); - mutex_lock(&wq->flush_mutex); + mutex_lock(&wq->mutex); /* * Start-to-wait phase @@ -2568,13 +2576,13 @@ void flush_workqueue(struct workqueue_struct *wq) * becomes our flush_color and work_color is advanced * by one. */ - BUG_ON(!list_empty(&wq->flusher_overflow)); + WARN_ON_ONCE(!list_empty(&wq->flusher_overflow)); this_flusher.flush_color = wq->work_color; wq->work_color = next_color; if (!wq->first_flusher) { /* no flush in progress, become the first flusher */ - BUG_ON(wq->flush_color != this_flusher.flush_color); + WARN_ON_ONCE(wq->flush_color != this_flusher.flush_color); wq->first_flusher = &this_flusher; @@ -2587,7 +2595,7 @@ void flush_workqueue(struct workqueue_struct *wq) } } else { /* wait in queue */ - BUG_ON(wq->flush_color == this_flusher.flush_color); + WARN_ON_ONCE(wq->flush_color == this_flusher.flush_color); list_add_tail(&this_flusher.list, &wq->flusher_queue); flush_workqueue_prep_pwqs(wq, -1, wq->work_color); } @@ -2600,7 +2608,7 @@ void flush_workqueue(struct workqueue_struct *wq) list_add_tail(&this_flusher.list, &wq->flusher_overflow); } - mutex_unlock(&wq->flush_mutex); + mutex_unlock(&wq->mutex); wait_for_completion(&this_flusher.done); @@ -2613,7 +2621,7 @@ void flush_workqueue(struct workqueue_struct *wq) if (wq->first_flusher != &this_flusher) return; - mutex_lock(&wq->flush_mutex); + mutex_lock(&wq->mutex); /* we might have raced, check again with mutex held */ if (wq->first_flusher != &this_flusher) @@ -2621,8 +2629,8 @@ void flush_workqueue(struct workqueue_struct *wq) wq->first_flusher = NULL; - BUG_ON(!list_empty(&this_flusher.list)); - BUG_ON(wq->flush_color != this_flusher.flush_color); + WARN_ON_ONCE(!list_empty(&this_flusher.list)); + WARN_ON_ONCE(wq->flush_color != this_flusher.flush_color); while (true) { struct wq_flusher *next, *tmp; @@ -2635,8 +2643,8 @@ void flush_workqueue(struct workqueue_struct *wq) complete(&next->done); } - BUG_ON(!list_empty(&wq->flusher_overflow) && - wq->flush_color != work_next_color(wq->work_color)); + WARN_ON_ONCE(!list_empty(&wq->flusher_overflow) && + wq->flush_color != work_next_color(wq->work_color)); /* this flush_color is finished, advance by one */ wq->flush_color = work_next_color(wq->flush_color); @@ -2660,7 +2668,7 @@ void flush_workqueue(struct workqueue_struct *wq) } if (list_empty(&wq->flusher_queue)) { - BUG_ON(wq->flush_color != wq->work_color); + WARN_ON_ONCE(wq->flush_color != wq->work_color); break; } @@ -2668,8 +2676,8 @@ void flush_workqueue(struct workqueue_struct *wq) * Need to flush more colors. Make the next flusher * the new first flusher and arm pwqs. */ - BUG_ON(wq->flush_color == wq->work_color); - BUG_ON(wq->flush_color != next->flush_color); + WARN_ON_ONCE(wq->flush_color == wq->work_color); + WARN_ON_ONCE(wq->flush_color != next->flush_color); list_del_init(&next->list); wq->first_flusher = next; @@ -2685,7 +2693,7 @@ void flush_workqueue(struct workqueue_struct *wq) } out_unlock: - mutex_unlock(&wq->flush_mutex); + mutex_unlock(&wq->mutex); } EXPORT_SYMBOL_GPL(flush_workqueue); @@ -2703,22 +2711,23 @@ EXPORT_SYMBOL_GPL(flush_workqueue); void drain_workqueue(struct workqueue_struct *wq) { unsigned int flush_cnt = 0; - unsigned int cpu; + struct pool_workqueue *pwq; /* * __queue_work() needs to test whether there are drainers, is much * hotter than drain_workqueue() and already looks at @wq->flags. - * Use WQ_DRAINING so that queue doesn't have to check nr_drainers. + * Use __WQ_DRAINING so that queue doesn't have to check nr_drainers. */ - spin_lock(&workqueue_lock); + mutex_lock(&wq->mutex); if (!wq->nr_drainers++) - wq->flags |= WQ_DRAINING; - spin_unlock(&workqueue_lock); + wq->flags |= __WQ_DRAINING; + mutex_unlock(&wq->mutex); reflush: flush_workqueue(wq); - for_each_pwq_cpu(cpu, wq) { - struct pool_workqueue *pwq = get_pwq(cpu, wq); + mutex_lock(&wq->mutex); + + for_each_pwq(pwq, wq) { bool drained; spin_lock_irq(&pwq->pool->lock); @@ -2730,15 +2739,16 @@ reflush: if (++flush_cnt == 10 || (flush_cnt % 100 == 0 && flush_cnt <= 1000)) - pr_warn("workqueue %s: flush on destruction isn't complete after %u tries\n", + pr_warn("workqueue %s: drain_workqueue() isn't complete after %u tries\n", wq->name, flush_cnt); + + mutex_unlock(&wq->mutex); goto reflush; } - spin_lock(&workqueue_lock); if (!--wq->nr_drainers) - wq->flags &= ~WQ_DRAINING; - spin_unlock(&workqueue_lock); + wq->flags &= ~__WQ_DRAINING; + mutex_unlock(&wq->mutex); } EXPORT_SYMBOL_GPL(drain_workqueue); @@ -2749,11 +2759,15 @@ static bool start_flush_work(struct work_struct *work, struct wq_barrier *barr) struct pool_workqueue *pwq; might_sleep(); + + local_irq_disable(); pool = get_work_pool(work); - if (!pool) + if (!pool) { + local_irq_enable(); return false; + } - spin_lock_irq(&pool->lock); + spin_lock(&pool->lock); /* see the comment in try_to_grab_pending() with the same code */ pwq = get_work_pwq(work); if (pwq) { @@ -2775,7 +2789,7 @@ static bool start_flush_work(struct work_struct *work, struct wq_barrier *barr) * flusher is not running on the same workqueue by verifying write * access. */ - if (pwq->wq->saved_max_active == 1 || pwq->wq->flags & WQ_RESCUER) + if (pwq->wq->saved_max_active == 1 || pwq->wq->rescuer) lock_map_acquire(&pwq->wq->lockdep_map); else lock_map_acquire_read(&pwq->wq->lockdep_map); @@ -2932,66 +2946,6 @@ bool cancel_delayed_work_sync(struct delayed_work *dwork) EXPORT_SYMBOL(cancel_delayed_work_sync); /** - * schedule_work_on - put work task on a specific cpu - * @cpu: cpu to put the work task on - * @work: job to be done - * - * This puts a job on a specific cpu - */ -bool schedule_work_on(int cpu, struct work_struct *work) -{ - return queue_work_on(cpu, system_wq, work); -} -EXPORT_SYMBOL(schedule_work_on); - -/** - * schedule_work - put work task in global workqueue - * @work: job to be done - * - * Returns %false if @work was already on the kernel-global workqueue and - * %true otherwise. - * - * This puts a job in the kernel-global workqueue if it was not already - * queued and leaves it in the same position on the kernel-global - * workqueue otherwise. - */ -bool schedule_work(struct work_struct *work) -{ - return queue_work(system_wq, work); -} -EXPORT_SYMBOL(schedule_work); - -/** - * schedule_delayed_work_on - queue work in global workqueue on CPU after delay - * @cpu: cpu to use - * @dwork: job to be done - * @delay: number of jiffies to wait - * - * After waiting for a given time this puts a job in the kernel-global - * workqueue on the specified CPU. - */ -bool schedule_delayed_work_on(int cpu, struct delayed_work *dwork, - unsigned long delay) -{ - return queue_delayed_work_on(cpu, system_wq, dwork, delay); -} -EXPORT_SYMBOL(schedule_delayed_work_on); - -/** - * schedule_delayed_work - put work task in global workqueue after delay - * @dwork: job to be done - * @delay: number of jiffies to wait or 0 for immediate execution - * - * After waiting for a given time this puts a job in the kernel-global - * workqueue. - */ -bool schedule_delayed_work(struct delayed_work *dwork, unsigned long delay) -{ - return queue_delayed_work(system_wq, dwork, delay); -} -EXPORT_SYMBOL(schedule_delayed_work); - -/** * schedule_on_each_cpu - execute a function synchronously on each online CPU * @func: the function to call * @@ -3084,51 +3038,1025 @@ int execute_in_process_context(work_func_t fn, struct execute_work *ew) } EXPORT_SYMBOL_GPL(execute_in_process_context); -int keventd_up(void) +#ifdef CONFIG_SYSFS +/* + * Workqueues with WQ_SYSFS flag set is visible to userland via + * /sys/bus/workqueue/devices/WQ_NAME. All visible workqueues have the + * following attributes. + * + * per_cpu RO bool : whether the workqueue is per-cpu or unbound + * max_active RW int : maximum number of in-flight work items + * + * Unbound workqueues have the following extra attributes. + * + * id RO int : the associated pool ID + * nice RW int : nice value of the workers + * cpumask RW mask : bitmask of allowed CPUs for the workers + */ +struct wq_device { + struct workqueue_struct *wq; + struct device dev; +}; + +static struct workqueue_struct *dev_to_wq(struct device *dev) +{ + struct wq_device *wq_dev = container_of(dev, struct wq_device, dev); + + return wq_dev->wq; +} + +static ssize_t wq_per_cpu_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct workqueue_struct *wq = dev_to_wq(dev); + + return scnprintf(buf, PAGE_SIZE, "%d\n", (bool)!(wq->flags & WQ_UNBOUND)); +} + +static ssize_t wq_max_active_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct workqueue_struct *wq = dev_to_wq(dev); + + return scnprintf(buf, PAGE_SIZE, "%d\n", wq->saved_max_active); +} + +static ssize_t wq_max_active_store(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t count) +{ + struct workqueue_struct *wq = dev_to_wq(dev); + int val; + + if (sscanf(buf, "%d", &val) != 1 || val <= 0) + return -EINVAL; + + workqueue_set_max_active(wq, val); + return count; +} + +static struct device_attribute wq_sysfs_attrs[] = { + __ATTR(per_cpu, 0444, wq_per_cpu_show, NULL), + __ATTR(max_active, 0644, wq_max_active_show, wq_max_active_store), + __ATTR_NULL, +}; + +static ssize_t wq_pool_ids_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct workqueue_struct *wq = dev_to_wq(dev); + const char *delim = ""; + int node, written = 0; + + rcu_read_lock_sched(); + for_each_node(node) { + written += scnprintf(buf + written, PAGE_SIZE - written, + "%s%d:%d", delim, node, + unbound_pwq_by_node(wq, node)->pool->id); + delim = " "; + } + written += scnprintf(buf + written, PAGE_SIZE - written, "\n"); + rcu_read_unlock_sched(); + + return written; +} + +static ssize_t wq_nice_show(struct device *dev, struct device_attribute *attr, + char *buf) +{ + struct workqueue_struct *wq = dev_to_wq(dev); + int written; + + mutex_lock(&wq->mutex); + written = scnprintf(buf, PAGE_SIZE, "%d\n", wq->unbound_attrs->nice); + mutex_unlock(&wq->mutex); + + return written; +} + +/* prepare workqueue_attrs for sysfs store operations */ +static struct workqueue_attrs *wq_sysfs_prep_attrs(struct workqueue_struct *wq) +{ + struct workqueue_attrs *attrs; + + attrs = alloc_workqueue_attrs(GFP_KERNEL); + if (!attrs) + return NULL; + + mutex_lock(&wq->mutex); + copy_workqueue_attrs(attrs, wq->unbound_attrs); + mutex_unlock(&wq->mutex); + return attrs; +} + +static ssize_t wq_nice_store(struct device *dev, struct device_attribute *attr, + const char *buf, size_t count) +{ + struct workqueue_struct *wq = dev_to_wq(dev); + struct workqueue_attrs *attrs; + int ret; + + attrs = wq_sysfs_prep_attrs(wq); + if (!attrs) + return -ENOMEM; + + if (sscanf(buf, "%d", &attrs->nice) == 1 && + attrs->nice >= -20 && attrs->nice <= 19) + ret = apply_workqueue_attrs(wq, attrs); + else + ret = -EINVAL; + + free_workqueue_attrs(attrs); + return ret ?: count; +} + +static ssize_t wq_cpumask_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct workqueue_struct *wq = dev_to_wq(dev); + int written; + + mutex_lock(&wq->mutex); + written = cpumask_scnprintf(buf, PAGE_SIZE, wq->unbound_attrs->cpumask); + mutex_unlock(&wq->mutex); + + written += scnprintf(buf + written, PAGE_SIZE - written, "\n"); + return written; +} + +static ssize_t wq_cpumask_store(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t count) +{ + struct workqueue_struct *wq = dev_to_wq(dev); + struct workqueue_attrs *attrs; + int ret; + + attrs = wq_sysfs_prep_attrs(wq); + if (!attrs) + return -ENOMEM; + + ret = cpumask_parse(buf, attrs->cpumask); + if (!ret) + ret = apply_workqueue_attrs(wq, attrs); + + free_workqueue_attrs(attrs); + return ret ?: count; +} + +static ssize_t wq_numa_show(struct device *dev, struct device_attribute *attr, + char *buf) +{ + struct workqueue_struct *wq = dev_to_wq(dev); + int written; + + mutex_lock(&wq->mutex); + written = scnprintf(buf, PAGE_SIZE, "%d\n", + !wq->unbound_attrs->no_numa); + mutex_unlock(&wq->mutex); + + return written; +} + +static ssize_t wq_numa_store(struct device *dev, struct device_attribute *attr, + const char *buf, size_t count) +{ + struct workqueue_struct *wq = dev_to_wq(dev); + struct workqueue_attrs *attrs; + int v, ret; + + attrs = wq_sysfs_prep_attrs(wq); + if (!attrs) + return -ENOMEM; + + ret = -EINVAL; + if (sscanf(buf, "%d", &v) == 1) { + attrs->no_numa = !v; + ret = apply_workqueue_attrs(wq, attrs); + } + + free_workqueue_attrs(attrs); + return ret ?: count; +} + +static struct device_attribute wq_sysfs_unbound_attrs[] = { + __ATTR(pool_ids, 0444, wq_pool_ids_show, NULL), + __ATTR(nice, 0644, wq_nice_show, wq_nice_store), + __ATTR(cpumask, 0644, wq_cpumask_show, wq_cpumask_store), + __ATTR(numa, 0644, wq_numa_show, wq_numa_store), + __ATTR_NULL, +}; + +static struct bus_type wq_subsys = { + .name = "workqueue", + .dev_attrs = wq_sysfs_attrs, +}; + +static int __init wq_sysfs_init(void) { - return system_wq != NULL; + return subsys_virtual_register(&wq_subsys, NULL); } +core_initcall(wq_sysfs_init); -static int alloc_pwqs(struct workqueue_struct *wq) +static void wq_device_release(struct device *dev) { + struct wq_device *wq_dev = container_of(dev, struct wq_device, dev); + + kfree(wq_dev); +} + +/** + * workqueue_sysfs_register - make a workqueue visible in sysfs + * @wq: the workqueue to register + * + * Expose @wq in sysfs under /sys/bus/workqueue/devices. + * alloc_workqueue*() automatically calls this function if WQ_SYSFS is set + * which is the preferred method. + * + * Workqueue user should use this function directly iff it wants to apply + * workqueue_attrs before making the workqueue visible in sysfs; otherwise, + * apply_workqueue_attrs() may race against userland updating the + * attributes. + * + * Returns 0 on success, -errno on failure. + */ +int workqueue_sysfs_register(struct workqueue_struct *wq) +{ + struct wq_device *wq_dev; + int ret; + /* - * pwqs are forced aligned according to WORK_STRUCT_FLAG_BITS. - * Make sure that the alignment isn't lower than that of - * unsigned long long. + * Adjusting max_active or creating new pwqs by applyting + * attributes breaks ordering guarantee. Disallow exposing ordered + * workqueues. */ - const size_t size = sizeof(struct pool_workqueue); - const size_t align = max_t(size_t, 1 << WORK_STRUCT_FLAG_BITS, - __alignof__(unsigned long long)); + if (WARN_ON(wq->flags & __WQ_ORDERED)) + return -EINVAL; - if (!(wq->flags & WQ_UNBOUND)) - wq->pool_wq.pcpu = __alloc_percpu(size, align); - else { - void *ptr; + wq->wq_dev = wq_dev = kzalloc(sizeof(*wq_dev), GFP_KERNEL); + if (!wq_dev) + return -ENOMEM; + + wq_dev->wq = wq; + wq_dev->dev.bus = &wq_subsys; + wq_dev->dev.init_name = wq->name; + wq_dev->dev.release = wq_device_release; + + /* + * unbound_attrs are created separately. Suppress uevent until + * everything is ready. + */ + dev_set_uevent_suppress(&wq_dev->dev, true); + + ret = device_register(&wq_dev->dev); + if (ret) { + kfree(wq_dev); + wq->wq_dev = NULL; + return ret; + } + + if (wq->flags & WQ_UNBOUND) { + struct device_attribute *attr; + + for (attr = wq_sysfs_unbound_attrs; attr->attr.name; attr++) { + ret = device_create_file(&wq_dev->dev, attr); + if (ret) { + device_unregister(&wq_dev->dev); + wq->wq_dev = NULL; + return ret; + } + } + } + + kobject_uevent(&wq_dev->dev.kobj, KOBJ_ADD); + return 0; +} + +/** + * workqueue_sysfs_unregister - undo workqueue_sysfs_register() + * @wq: the workqueue to unregister + * + * If @wq is registered to sysfs by workqueue_sysfs_register(), unregister. + */ +static void workqueue_sysfs_unregister(struct workqueue_struct *wq) +{ + struct wq_device *wq_dev = wq->wq_dev; + + if (!wq->wq_dev) + return; + + wq->wq_dev = NULL; + device_unregister(&wq_dev->dev); +} +#else /* CONFIG_SYSFS */ +static void workqueue_sysfs_unregister(struct workqueue_struct *wq) { } +#endif /* CONFIG_SYSFS */ + +/** + * free_workqueue_attrs - free a workqueue_attrs + * @attrs: workqueue_attrs to free + * + * Undo alloc_workqueue_attrs(). + */ +void free_workqueue_attrs(struct workqueue_attrs *attrs) +{ + if (attrs) { + free_cpumask_var(attrs->cpumask); + kfree(attrs); + } +} + +/** + * alloc_workqueue_attrs - allocate a workqueue_attrs + * @gfp_mask: allocation mask to use + * + * Allocate a new workqueue_attrs, initialize with default settings and + * return it. Returns NULL on failure. + */ +struct workqueue_attrs *alloc_workqueue_attrs(gfp_t gfp_mask) +{ + struct workqueue_attrs *attrs; + + attrs = kzalloc(sizeof(*attrs), gfp_mask); + if (!attrs) + goto fail; + if (!alloc_cpumask_var(&attrs->cpumask, gfp_mask)) + goto fail; + + cpumask_copy(attrs->cpumask, cpu_possible_mask); + return attrs; +fail: + free_workqueue_attrs(attrs); + return NULL; +} + +static void copy_workqueue_attrs(struct workqueue_attrs *to, + const struct workqueue_attrs *from) +{ + to->nice = from->nice; + cpumask_copy(to->cpumask, from->cpumask); +} + +/* hash value of the content of @attr */ +static u32 wqattrs_hash(const struct workqueue_attrs *attrs) +{ + u32 hash = 0; + + hash = jhash_1word(attrs->nice, hash); + hash = jhash(cpumask_bits(attrs->cpumask), + BITS_TO_LONGS(nr_cpumask_bits) * sizeof(long), hash); + return hash; +} + +/* content equality test */ +static bool wqattrs_equal(const struct workqueue_attrs *a, + const struct workqueue_attrs *b) +{ + if (a->nice != b->nice) + return false; + if (!cpumask_equal(a->cpumask, b->cpumask)) + return false; + return true; +} + +/** + * init_worker_pool - initialize a newly zalloc'd worker_pool + * @pool: worker_pool to initialize + * + * Initiailize a newly zalloc'd @pool. It also allocates @pool->attrs. + * Returns 0 on success, -errno on failure. Even on failure, all fields + * inside @pool proper are initialized and put_unbound_pool() can be called + * on @pool safely to release it. + */ +static int init_worker_pool(struct worker_pool *pool) +{ + spin_lock_init(&pool->lock); + pool->id = -1; + pool->cpu = -1; + pool->node = NUMA_NO_NODE; + pool->flags |= POOL_DISASSOCIATED; + INIT_LIST_HEAD(&pool->worklist); + INIT_LIST_HEAD(&pool->idle_list); + hash_init(pool->busy_hash); + + init_timer_deferrable(&pool->idle_timer); + pool->idle_timer.function = idle_worker_timeout; + pool->idle_timer.data = (unsigned long)pool; + + setup_timer(&pool->mayday_timer, pool_mayday_timeout, + (unsigned long)pool); + + mutex_init(&pool->manager_arb); + mutex_init(&pool->manager_mutex); + idr_init(&pool->worker_idr); + + INIT_HLIST_NODE(&pool->hash_node); + pool->refcnt = 1; + + /* shouldn't fail above this point */ + pool->attrs = alloc_workqueue_attrs(GFP_KERNEL); + if (!pool->attrs) + return -ENOMEM; + return 0; +} + +static void rcu_free_pool(struct rcu_head *rcu) +{ + struct worker_pool *pool = container_of(rcu, struct worker_pool, rcu); + + idr_destroy(&pool->worker_idr); + free_workqueue_attrs(pool->attrs); + kfree(pool); +} + +/** + * put_unbound_pool - put a worker_pool + * @pool: worker_pool to put + * + * Put @pool. If its refcnt reaches zero, it gets destroyed in sched-RCU + * safe manner. get_unbound_pool() calls this function on its failure path + * and this function should be able to release pools which went through, + * successfully or not, init_worker_pool(). + * + * Should be called with wq_pool_mutex held. + */ +static void put_unbound_pool(struct worker_pool *pool) +{ + struct worker *worker; + + lockdep_assert_held(&wq_pool_mutex); + + if (--pool->refcnt) + return; + + /* sanity checks */ + if (WARN_ON(!(pool->flags & POOL_DISASSOCIATED)) || + WARN_ON(!list_empty(&pool->worklist))) + return; + + /* release id and unhash */ + if (pool->id >= 0) + idr_remove(&worker_pool_idr, pool->id); + hash_del(&pool->hash_node); + + /* + * Become the manager and destroy all workers. Grabbing + * manager_arb prevents @pool's workers from blocking on + * manager_mutex. + */ + mutex_lock(&pool->manager_arb); + mutex_lock(&pool->manager_mutex); + spin_lock_irq(&pool->lock); + + while ((worker = first_worker(pool))) + destroy_worker(worker); + WARN_ON(pool->nr_workers || pool->nr_idle); + + spin_unlock_irq(&pool->lock); + mutex_unlock(&pool->manager_mutex); + mutex_unlock(&pool->manager_arb); + + /* shut down the timers */ + del_timer_sync(&pool->idle_timer); + del_timer_sync(&pool->mayday_timer); + + /* sched-RCU protected to allow dereferences from get_work_pool() */ + call_rcu_sched(&pool->rcu, rcu_free_pool); +} + +/** + * get_unbound_pool - get a worker_pool with the specified attributes + * @attrs: the attributes of the worker_pool to get + * + * Obtain a worker_pool which has the same attributes as @attrs, bump the + * reference count and return it. If there already is a matching + * worker_pool, it will be used; otherwise, this function attempts to + * create a new one. On failure, returns NULL. + * + * Should be called with wq_pool_mutex held. + */ +static struct worker_pool *get_unbound_pool(const struct workqueue_attrs *attrs) +{ + u32 hash = wqattrs_hash(attrs); + struct worker_pool *pool; + int node; + + lockdep_assert_held(&wq_pool_mutex); + + /* do we already have a matching pool? */ + hash_for_each_possible(unbound_pool_hash, pool, hash_node, hash) { + if (wqattrs_equal(pool->attrs, attrs)) { + pool->refcnt++; + goto out_unlock; + } + } + + /* nope, create a new one */ + pool = kzalloc(sizeof(*pool), GFP_KERNEL); + if (!pool || init_worker_pool(pool) < 0) + goto fail; + + if (workqueue_freezing) + pool->flags |= POOL_FREEZING; + + lockdep_set_subclass(&pool->lock, 1); /* see put_pwq() */ + copy_workqueue_attrs(pool->attrs, attrs); + + /* if cpumask is contained inside a NUMA node, we belong to that node */ + if (wq_numa_enabled) { + for_each_node(node) { + if (cpumask_subset(pool->attrs->cpumask, + wq_numa_possible_cpumask[node])) { + pool->node = node; + break; + } + } + } + + if (worker_pool_assign_id(pool) < 0) + goto fail; + + /* create and start the initial worker */ + if (create_and_start_worker(pool) < 0) + goto fail; + + /* install */ + hash_add(unbound_pool_hash, &pool->hash_node, hash); +out_unlock: + return pool; +fail: + if (pool) + put_unbound_pool(pool); + return NULL; +} + +static void rcu_free_pwq(struct rcu_head *rcu) +{ + kmem_cache_free(pwq_cache, + container_of(rcu, struct pool_workqueue, rcu)); +} + +/* + * Scheduled on system_wq by put_pwq() when an unbound pwq hits zero refcnt + * and needs to be destroyed. + */ +static void pwq_unbound_release_workfn(struct work_struct *work) +{ + struct pool_workqueue *pwq = container_of(work, struct pool_workqueue, + unbound_release_work); + struct workqueue_struct *wq = pwq->wq; + struct worker_pool *pool = pwq->pool; + bool is_last; + + if (WARN_ON_ONCE(!(wq->flags & WQ_UNBOUND))) + return; + + /* + * Unlink @pwq. Synchronization against wq->mutex isn't strictly + * necessary on release but do it anyway. It's easier to verify + * and consistent with the linking path. + */ + mutex_lock(&wq->mutex); + list_del_rcu(&pwq->pwqs_node); + is_last = list_empty(&wq->pwqs); + mutex_unlock(&wq->mutex); + + mutex_lock(&wq_pool_mutex); + put_unbound_pool(pool); + mutex_unlock(&wq_pool_mutex); + + call_rcu_sched(&pwq->rcu, rcu_free_pwq); + + /* + * If we're the last pwq going away, @wq is already dead and no one + * is gonna access it anymore. Free it. + */ + if (is_last) { + free_workqueue_attrs(wq->unbound_attrs); + kfree(wq); + } +} + +/** + * pwq_adjust_max_active - update a pwq's max_active to the current setting + * @pwq: target pool_workqueue + * + * If @pwq isn't freezing, set @pwq->max_active to the associated + * workqueue's saved_max_active and activate delayed work items + * accordingly. If @pwq is freezing, clear @pwq->max_active to zero. + */ +static void pwq_adjust_max_active(struct pool_workqueue *pwq) +{ + struct workqueue_struct *wq = pwq->wq; + bool freezable = wq->flags & WQ_FREEZABLE; + + /* for @wq->saved_max_active */ + lockdep_assert_held(&wq->mutex); + + /* fast exit for non-freezable wqs */ + if (!freezable && pwq->max_active == wq->saved_max_active) + return; + + spin_lock_irq(&pwq->pool->lock); + + if (!freezable || !(pwq->pool->flags & POOL_FREEZING)) { + pwq->max_active = wq->saved_max_active; + + while (!list_empty(&pwq->delayed_works) && + pwq->nr_active < pwq->max_active) + pwq_activate_first_delayed(pwq); /* - * Allocate enough room to align pwq and put an extra - * pointer at the end pointing back to the originally - * allocated pointer which will be used for free. + * Need to kick a worker after thawed or an unbound wq's + * max_active is bumped. It's a slow path. Do it always. */ - ptr = kzalloc(size + align + sizeof(void *), GFP_KERNEL); - if (ptr) { - wq->pool_wq.single = PTR_ALIGN(ptr, align); - *(void **)(wq->pool_wq.single + 1) = ptr; + wake_up_worker(pwq->pool); + } else { + pwq->max_active = 0; + } + + spin_unlock_irq(&pwq->pool->lock); +} + +/* initialize newly alloced @pwq which is associated with @wq and @pool */ +static void init_pwq(struct pool_workqueue *pwq, struct workqueue_struct *wq, + struct worker_pool *pool) +{ + BUG_ON((unsigned long)pwq & WORK_STRUCT_FLAG_MASK); + + memset(pwq, 0, sizeof(*pwq)); + + pwq->pool = pool; + pwq->wq = wq; + pwq->flush_color = -1; + pwq->refcnt = 1; + INIT_LIST_HEAD(&pwq->delayed_works); + INIT_LIST_HEAD(&pwq->pwqs_node); + INIT_LIST_HEAD(&pwq->mayday_node); + INIT_WORK(&pwq->unbound_release_work, pwq_unbound_release_workfn); +} + +/* sync @pwq with the current state of its associated wq and link it */ +static void link_pwq(struct pool_workqueue *pwq) +{ + struct workqueue_struct *wq = pwq->wq; + + lockdep_assert_held(&wq->mutex); + + /* may be called multiple times, ignore if already linked */ + if (!list_empty(&pwq->pwqs_node)) + return; + + /* + * Set the matching work_color. This is synchronized with + * wq->mutex to avoid confusing flush_workqueue(). + */ + pwq->work_color = wq->work_color; + + /* sync max_active to the current setting */ + pwq_adjust_max_active(pwq); + + /* link in @pwq */ + list_add_rcu(&pwq->pwqs_node, &wq->pwqs); +} + +/* obtain a pool matching @attr and create a pwq associating the pool and @wq */ +static struct pool_workqueue *alloc_unbound_pwq(struct workqueue_struct *wq, + const struct workqueue_attrs *attrs) +{ + struct worker_pool *pool; + struct pool_workqueue *pwq; + + lockdep_assert_held(&wq_pool_mutex); + + pool = get_unbound_pool(attrs); + if (!pool) + return NULL; + + pwq = kmem_cache_alloc_node(pwq_cache, GFP_KERNEL, pool->node); + if (!pwq) { + put_unbound_pool(pool); + return NULL; + } + + init_pwq(pwq, wq, pool); + return pwq; +} + +/* undo alloc_unbound_pwq(), used only in the error path */ +static void free_unbound_pwq(struct pool_workqueue *pwq) +{ + lockdep_assert_held(&wq_pool_mutex); + + if (pwq) { + put_unbound_pool(pwq->pool); + kmem_cache_free(pwq_cache, pwq); + } +} + +/** + * wq_calc_node_mask - calculate a wq_attrs' cpumask for the specified node + * @attrs: the wq_attrs of interest + * @node: the target NUMA node + * @cpu_going_down: if >= 0, the CPU to consider as offline + * @cpumask: outarg, the resulting cpumask + * + * Calculate the cpumask a workqueue with @attrs should use on @node. If + * @cpu_going_down is >= 0, that cpu is considered offline during + * calculation. The result is stored in @cpumask. This function returns + * %true if the resulting @cpumask is different from @attrs->cpumask, + * %false if equal. + * + * If NUMA affinity is not enabled, @attrs->cpumask is always used. If + * enabled and @node has online CPUs requested by @attrs, the returned + * cpumask is the intersection of the possible CPUs of @node and + * @attrs->cpumask. + * + * The caller is responsible for ensuring that the cpumask of @node stays + * stable. + */ +static bool wq_calc_node_cpumask(const struct workqueue_attrs *attrs, int node, + int cpu_going_down, cpumask_t *cpumask) +{ + if (!wq_numa_enabled || attrs->no_numa) + goto use_dfl; + + /* does @node have any online CPUs @attrs wants? */ + cpumask_and(cpumask, cpumask_of_node(node), attrs->cpumask); + if (cpu_going_down >= 0) + cpumask_clear_cpu(cpu_going_down, cpumask); + + if (cpumask_empty(cpumask)) + goto use_dfl; + + /* yeap, return possible CPUs in @node that @attrs wants */ + cpumask_and(cpumask, attrs->cpumask, wq_numa_possible_cpumask[node]); + return !cpumask_equal(cpumask, attrs->cpumask); + +use_dfl: + cpumask_copy(cpumask, attrs->cpumask); + return false; +} + +/* install @pwq into @wq's numa_pwq_tbl[] for @node and return the old pwq */ +static struct pool_workqueue *numa_pwq_tbl_install(struct workqueue_struct *wq, + int node, + struct pool_workqueue *pwq) +{ + struct pool_workqueue *old_pwq; + + lockdep_assert_held(&wq->mutex); + + /* link_pwq() can handle duplicate calls */ + link_pwq(pwq); + + old_pwq = rcu_access_pointer(wq->numa_pwq_tbl[node]); + rcu_assign_pointer(wq->numa_pwq_tbl[node], pwq); + return old_pwq; +} + +/** + * apply_workqueue_attrs - apply new workqueue_attrs to an unbound workqueue + * @wq: the target workqueue + * @attrs: the workqueue_attrs to apply, allocated with alloc_workqueue_attrs() + * + * Apply @attrs to an unbound workqueue @wq. Unless disabled, on NUMA + * machines, this function maps a separate pwq to each NUMA node with + * possibles CPUs in @attrs->cpumask so that work items are affine to the + * NUMA node it was issued on. Older pwqs are released as in-flight work + * items finish. Note that a work item which repeatedly requeues itself + * back-to-back will stay on its current pwq. + * + * Performs GFP_KERNEL allocations. Returns 0 on success and -errno on + * failure. + */ +int apply_workqueue_attrs(struct workqueue_struct *wq, + const struct workqueue_attrs *attrs) +{ + struct workqueue_attrs *new_attrs, *tmp_attrs; + struct pool_workqueue **pwq_tbl, *dfl_pwq; + int node, ret; + + /* only unbound workqueues can change attributes */ + if (WARN_ON(!(wq->flags & WQ_UNBOUND))) + return -EINVAL; + + /* creating multiple pwqs breaks ordering guarantee */ + if (WARN_ON((wq->flags & __WQ_ORDERED) && !list_empty(&wq->pwqs))) + return -EINVAL; + + pwq_tbl = kzalloc(wq_numa_tbl_len * sizeof(pwq_tbl[0]), GFP_KERNEL); + new_attrs = alloc_workqueue_attrs(GFP_KERNEL); + tmp_attrs = alloc_workqueue_attrs(GFP_KERNEL); + if (!pwq_tbl || !new_attrs || !tmp_attrs) + goto enomem; + + /* make a copy of @attrs and sanitize it */ + copy_workqueue_attrs(new_attrs, attrs); + cpumask_and(new_attrs->cpumask, new_attrs->cpumask, cpu_possible_mask); + + /* + * We may create multiple pwqs with differing cpumasks. Make a + * copy of @new_attrs which will be modified and used to obtain + * pools. + */ + copy_workqueue_attrs(tmp_attrs, new_attrs); + + /* + * CPUs should stay stable across pwq creations and installations. + * Pin CPUs, determine the target cpumask for each node and create + * pwqs accordingly. + */ + get_online_cpus(); + + mutex_lock(&wq_pool_mutex); + + /* + * If something goes wrong during CPU up/down, we'll fall back to + * the default pwq covering whole @attrs->cpumask. Always create + * it even if we don't use it immediately. + */ + dfl_pwq = alloc_unbound_pwq(wq, new_attrs); + if (!dfl_pwq) + goto enomem_pwq; + + for_each_node(node) { + if (wq_calc_node_cpumask(attrs, node, -1, tmp_attrs->cpumask)) { + pwq_tbl[node] = alloc_unbound_pwq(wq, tmp_attrs); + if (!pwq_tbl[node]) + goto enomem_pwq; + } else { + dfl_pwq->refcnt++; + pwq_tbl[node] = dfl_pwq; } } - /* just in case, make sure it's actually aligned */ - BUG_ON(!IS_ALIGNED(wq->pool_wq.v, align)); - return wq->pool_wq.v ? 0 : -ENOMEM; + mutex_unlock(&wq_pool_mutex); + + /* all pwqs have been created successfully, let's install'em */ + mutex_lock(&wq->mutex); + + copy_workqueue_attrs(wq->unbound_attrs, new_attrs); + + /* save the previous pwq and install the new one */ + for_each_node(node) + pwq_tbl[node] = numa_pwq_tbl_install(wq, node, pwq_tbl[node]); + + /* @dfl_pwq might not have been used, ensure it's linked */ + link_pwq(dfl_pwq); + swap(wq->dfl_pwq, dfl_pwq); + + mutex_unlock(&wq->mutex); + + /* put the old pwqs */ + for_each_node(node) + put_pwq_unlocked(pwq_tbl[node]); + put_pwq_unlocked(dfl_pwq); + + put_online_cpus(); + ret = 0; + /* fall through */ +out_free: + free_workqueue_attrs(tmp_attrs); + free_workqueue_attrs(new_attrs); + kfree(pwq_tbl); + return ret; + +enomem_pwq: + free_unbound_pwq(dfl_pwq); + for_each_node(node) + if (pwq_tbl && pwq_tbl[node] != dfl_pwq) + free_unbound_pwq(pwq_tbl[node]); + mutex_unlock(&wq_pool_mutex); + put_online_cpus(); +enomem: + ret = -ENOMEM; + goto out_free; } -static void free_pwqs(struct workqueue_struct *wq) +/** + * wq_update_unbound_numa - update NUMA affinity of a wq for CPU hot[un]plug + * @wq: the target workqueue + * @cpu: the CPU coming up or going down + * @online: whether @cpu is coming up or going down + * + * This function is to be called from %CPU_DOWN_PREPARE, %CPU_ONLINE and + * %CPU_DOWN_FAILED. @cpu is being hot[un]plugged, update NUMA affinity of + * @wq accordingly. + * + * If NUMA affinity can't be adjusted due to memory allocation failure, it + * falls back to @wq->dfl_pwq which may not be optimal but is always + * correct. + * + * Note that when the last allowed CPU of a NUMA node goes offline for a + * workqueue with a cpumask spanning multiple nodes, the workers which were + * already executing the work items for the workqueue will lose their CPU + * affinity and may execute on any CPU. This is similar to how per-cpu + * workqueues behave on CPU_DOWN. If a workqueue user wants strict + * affinity, it's the user's responsibility to flush the work item from + * CPU_DOWN_PREPARE. + */ +static void wq_update_unbound_numa(struct workqueue_struct *wq, int cpu, + bool online) { - if (!(wq->flags & WQ_UNBOUND)) - free_percpu(wq->pool_wq.pcpu); - else if (wq->pool_wq.single) { - /* the pointer to free is stored right after the pwq */ - kfree(*(void **)(wq->pool_wq.single + 1)); + int node = cpu_to_node(cpu); + int cpu_off = online ? -1 : cpu; + struct pool_workqueue *old_pwq = NULL, *pwq; + struct workqueue_attrs *target_attrs; + cpumask_t *cpumask; + + lockdep_assert_held(&wq_pool_mutex); + + if (!wq_numa_enabled || !(wq->flags & WQ_UNBOUND)) + return; + + /* + * We don't wanna alloc/free wq_attrs for each wq for each CPU. + * Let's use a preallocated one. The following buf is protected by + * CPU hotplug exclusion. + */ + target_attrs = wq_update_unbound_numa_attrs_buf; + cpumask = target_attrs->cpumask; + + mutex_lock(&wq->mutex); + if (wq->unbound_attrs->no_numa) + goto out_unlock; + + copy_workqueue_attrs(target_attrs, wq->unbound_attrs); + pwq = unbound_pwq_by_node(wq, node); + + /* + * Let's determine what needs to be done. If the target cpumask is + * different from wq's, we need to compare it to @pwq's and create + * a new one if they don't match. If the target cpumask equals + * wq's, the default pwq should be used. If @pwq is already the + * default one, nothing to do; otherwise, install the default one. + */ + if (wq_calc_node_cpumask(wq->unbound_attrs, node, cpu_off, cpumask)) { + if (cpumask_equal(cpumask, pwq->pool->attrs->cpumask)) + goto out_unlock; + } else { + if (pwq == wq->dfl_pwq) + goto out_unlock; + else + goto use_dfl_pwq; + } + + mutex_unlock(&wq->mutex); + + /* create a new pwq */ + pwq = alloc_unbound_pwq(wq, target_attrs); + if (!pwq) { + pr_warning("workqueue: allocation failed while updating NUMA affinity of \"%s\"\n", + wq->name); + goto out_unlock; + } + + /* + * Install the new pwq. As this function is called only from CPU + * hotplug callbacks and applying a new attrs is wrapped with + * get/put_online_cpus(), @wq->unbound_attrs couldn't have changed + * inbetween. + */ + mutex_lock(&wq->mutex); + old_pwq = numa_pwq_tbl_install(wq, node, pwq); + goto out_unlock; + +use_dfl_pwq: + spin_lock_irq(&wq->dfl_pwq->pool->lock); + get_pwq(wq->dfl_pwq); + spin_unlock_irq(&wq->dfl_pwq->pool->lock); + old_pwq = numa_pwq_tbl_install(wq, node, wq->dfl_pwq); +out_unlock: + mutex_unlock(&wq->mutex); + put_pwq_unlocked(old_pwq); +} + +static int alloc_and_link_pwqs(struct workqueue_struct *wq) +{ + bool highpri = wq->flags & WQ_HIGHPRI; + int cpu; + + if (!(wq->flags & WQ_UNBOUND)) { + wq->cpu_pwqs = alloc_percpu(struct pool_workqueue); + if (!wq->cpu_pwqs) + return -ENOMEM; + + for_each_possible_cpu(cpu) { + struct pool_workqueue *pwq = + per_cpu_ptr(wq->cpu_pwqs, cpu); + struct worker_pool *cpu_pools = + per_cpu(cpu_worker_pools, cpu); + + init_pwq(pwq, wq, &cpu_pools[highpri]); + + mutex_lock(&wq->mutex); + link_pwq(pwq); + mutex_unlock(&wq->mutex); + } + return 0; + } else { + return apply_workqueue_attrs(wq, unbound_std_wq_attrs[highpri]); } } @@ -3150,30 +4078,28 @@ struct workqueue_struct *__alloc_workqueue_key(const char *fmt, struct lock_class_key *key, const char *lock_name, ...) { - va_list args, args1; + size_t tbl_size = 0; + va_list args; struct workqueue_struct *wq; - unsigned int cpu; - size_t namelen; + struct pool_workqueue *pwq; - /* determine namelen, allocate wq and format name */ - va_start(args, lock_name); - va_copy(args1, args); - namelen = vsnprintf(NULL, 0, fmt, args) + 1; + /* allocate wq and format name */ + if (flags & WQ_UNBOUND) + tbl_size = wq_numa_tbl_len * sizeof(wq->numa_pwq_tbl[0]); - wq = kzalloc(sizeof(*wq) + namelen, GFP_KERNEL); + wq = kzalloc(sizeof(*wq) + tbl_size, GFP_KERNEL); if (!wq) - goto err; + return NULL; - vsnprintf(wq->name, namelen, fmt, args1); - va_end(args); - va_end(args1); + if (flags & WQ_UNBOUND) { + wq->unbound_attrs = alloc_workqueue_attrs(GFP_KERNEL); + if (!wq->unbound_attrs) + goto err_free_wq; + } - /* - * Workqueues which may be used during memory reclaim should - * have a rescuer to guarantee forward progress. - */ - if (flags & WQ_MEM_RECLAIM) - flags |= WQ_RESCUER; + va_start(args, lock_name); + vsnprintf(wq->name, sizeof(wq->name), fmt, args); + va_end(args); max_active = max_active ?: WQ_DFL_ACTIVE; max_active = wq_clamp_max_active(max_active, flags, wq->name); @@ -3181,71 +4107,70 @@ struct workqueue_struct *__alloc_workqueue_key(const char *fmt, /* init wq */ wq->flags = flags; wq->saved_max_active = max_active; - mutex_init(&wq->flush_mutex); + mutex_init(&wq->mutex); atomic_set(&wq->nr_pwqs_to_flush, 0); + INIT_LIST_HEAD(&wq->pwqs); INIT_LIST_HEAD(&wq->flusher_queue); INIT_LIST_HEAD(&wq->flusher_overflow); + INIT_LIST_HEAD(&wq->maydays); lockdep_init_map(&wq->lockdep_map, lock_name, key, 0); INIT_LIST_HEAD(&wq->list); - if (alloc_pwqs(wq) < 0) - goto err; - - for_each_pwq_cpu(cpu, wq) { - struct pool_workqueue *pwq = get_pwq(cpu, wq); + if (alloc_and_link_pwqs(wq) < 0) + goto err_free_wq; - BUG_ON((unsigned long)pwq & WORK_STRUCT_FLAG_MASK); - pwq->pool = get_std_worker_pool(cpu, flags & WQ_HIGHPRI); - pwq->wq = wq; - pwq->flush_color = -1; - pwq->max_active = max_active; - INIT_LIST_HEAD(&pwq->delayed_works); - } - - if (flags & WQ_RESCUER) { + /* + * Workqueues which may be used during memory reclaim should + * have a rescuer to guarantee forward progress. + */ + if (flags & WQ_MEM_RECLAIM) { struct worker *rescuer; - if (!alloc_mayday_mask(&wq->mayday_mask, GFP_KERNEL)) - goto err; - - wq->rescuer = rescuer = alloc_worker(); + rescuer = alloc_worker(); if (!rescuer) - goto err; + goto err_destroy; rescuer->rescue_wq = wq; rescuer->task = kthread_create(rescuer_thread, rescuer, "%s", wq->name); - if (IS_ERR(rescuer->task)) - goto err; + if (IS_ERR(rescuer->task)) { + kfree(rescuer); + goto err_destroy; + } - rescuer->task->flags |= PF_THREAD_BOUND; + wq->rescuer = rescuer; + rescuer->task->flags |= PF_NO_SETAFFINITY; wake_up_process(rescuer->task); } + if ((wq->flags & WQ_SYSFS) && workqueue_sysfs_register(wq)) + goto err_destroy; + /* - * workqueue_lock protects global freeze state and workqueues - * list. Grab it, set max_active accordingly and add the new - * workqueue to workqueues list. + * wq_pool_mutex protects global freeze state and workqueues list. + * Grab it, adjust max_active and add the new @wq to workqueues + * list. */ - spin_lock(&workqueue_lock); + mutex_lock(&wq_pool_mutex); - if (workqueue_freezing && wq->flags & WQ_FREEZABLE) - for_each_pwq_cpu(cpu, wq) - get_pwq(cpu, wq)->max_active = 0; + mutex_lock(&wq->mutex); + for_each_pwq(pwq, wq) + pwq_adjust_max_active(pwq); + mutex_unlock(&wq->mutex); list_add(&wq->list, &workqueues); - spin_unlock(&workqueue_lock); + mutex_unlock(&wq_pool_mutex); return wq; -err: - if (wq) { - free_pwqs(wq); - free_mayday_mask(wq->mayday_mask); - kfree(wq->rescuer); - kfree(wq); - } + +err_free_wq: + free_workqueue_attrs(wq->unbound_attrs); + kfree(wq); + return NULL; +err_destroy: + destroy_workqueue(wq); return NULL; } EXPORT_SYMBOL_GPL(__alloc_workqueue_key); @@ -3258,60 +4183,78 @@ EXPORT_SYMBOL_GPL(__alloc_workqueue_key); */ void destroy_workqueue(struct workqueue_struct *wq) { - unsigned int cpu; + struct pool_workqueue *pwq; + int node; /* drain it before proceeding with destruction */ drain_workqueue(wq); + /* sanity checks */ + mutex_lock(&wq->mutex); + for_each_pwq(pwq, wq) { + int i; + + for (i = 0; i < WORK_NR_COLORS; i++) { + if (WARN_ON(pwq->nr_in_flight[i])) { + mutex_unlock(&wq->mutex); + return; + } + } + + if (WARN_ON((pwq != wq->dfl_pwq) && (pwq->refcnt > 1)) || + WARN_ON(pwq->nr_active) || + WARN_ON(!list_empty(&pwq->delayed_works))) { + mutex_unlock(&wq->mutex); + return; + } + } + mutex_unlock(&wq->mutex); + /* * wq list is used to freeze wq, remove from list after * flushing is complete in case freeze races us. */ - spin_lock(&workqueue_lock); - list_del(&wq->list); - spin_unlock(&workqueue_lock); + mutex_lock(&wq_pool_mutex); + list_del_init(&wq->list); + mutex_unlock(&wq_pool_mutex); - /* sanity check */ - for_each_pwq_cpu(cpu, wq) { - struct pool_workqueue *pwq = get_pwq(cpu, wq); - int i; + workqueue_sysfs_unregister(wq); - for (i = 0; i < WORK_NR_COLORS; i++) - BUG_ON(pwq->nr_in_flight[i]); - BUG_ON(pwq->nr_active); - BUG_ON(!list_empty(&pwq->delayed_works)); - } - - if (wq->flags & WQ_RESCUER) { + if (wq->rescuer) { kthread_stop(wq->rescuer->task); - free_mayday_mask(wq->mayday_mask); kfree(wq->rescuer); + wq->rescuer = NULL; } - free_pwqs(wq); - kfree(wq); -} -EXPORT_SYMBOL_GPL(destroy_workqueue); - -/** - * pwq_set_max_active - adjust max_active of a pwq - * @pwq: target pool_workqueue - * @max_active: new max_active value. - * - * Set @pwq->max_active to @max_active and activate delayed works if - * increased. - * - * CONTEXT: - * spin_lock_irq(pool->lock). - */ -static void pwq_set_max_active(struct pool_workqueue *pwq, int max_active) -{ - pwq->max_active = max_active; + if (!(wq->flags & WQ_UNBOUND)) { + /* + * The base ref is never dropped on per-cpu pwqs. Directly + * free the pwqs and wq. + */ + free_percpu(wq->cpu_pwqs); + kfree(wq); + } else { + /* + * We're the sole accessor of @wq at this point. Directly + * access numa_pwq_tbl[] and dfl_pwq to put the base refs. + * @wq will be freed when the last pwq is released. + */ + for_each_node(node) { + pwq = rcu_access_pointer(wq->numa_pwq_tbl[node]); + RCU_INIT_POINTER(wq->numa_pwq_tbl[node], NULL); + put_pwq_unlocked(pwq); + } - while (!list_empty(&pwq->delayed_works) && - pwq->nr_active < pwq->max_active) - pwq_activate_first_delayed(pwq); + /* + * Put dfl_pwq. @wq may be freed any time after dfl_pwq is + * put. Don't access it afterwards. + */ + pwq = wq->dfl_pwq; + wq->dfl_pwq = NULL; + put_pwq_unlocked(pwq); + } } +EXPORT_SYMBOL_GPL(destroy_workqueue); /** * workqueue_set_max_active - adjust max_active of a workqueue @@ -3325,30 +4268,37 @@ static void pwq_set_max_active(struct pool_workqueue *pwq, int max_active) */ void workqueue_set_max_active(struct workqueue_struct *wq, int max_active) { - unsigned int cpu; + struct pool_workqueue *pwq; + + /* disallow meddling with max_active for ordered workqueues */ + if (WARN_ON(wq->flags & __WQ_ORDERED)) + return; max_active = wq_clamp_max_active(max_active, wq->flags, wq->name); - spin_lock(&workqueue_lock); + mutex_lock(&wq->mutex); wq->saved_max_active = max_active; - for_each_pwq_cpu(cpu, wq) { - struct pool_workqueue *pwq = get_pwq(cpu, wq); - struct worker_pool *pool = pwq->pool; - - spin_lock_irq(&pool->lock); + for_each_pwq(pwq, wq) + pwq_adjust_max_active(pwq); - if (!(wq->flags & WQ_FREEZABLE) || - !(pool->flags & POOL_FREEZING)) - pwq_set_max_active(pwq, max_active); + mutex_unlock(&wq->mutex); +} +EXPORT_SYMBOL_GPL(workqueue_set_max_active); - spin_unlock_irq(&pool->lock); - } +/** + * current_is_workqueue_rescuer - is %current workqueue rescuer? + * + * Determine whether %current is a workqueue rescuer. Can be used from + * work functions to determine whether it's being run off the rescuer task. + */ +bool current_is_workqueue_rescuer(void) +{ + struct worker *worker = current_wq_worker(); - spin_unlock(&workqueue_lock); + return worker && worker->rescue_wq; } -EXPORT_SYMBOL_GPL(workqueue_set_max_active); /** * workqueue_congested - test whether a workqueue is congested @@ -3362,11 +4312,22 @@ EXPORT_SYMBOL_GPL(workqueue_set_max_active); * RETURNS: * %true if congested, %false otherwise. */ -bool workqueue_congested(unsigned int cpu, struct workqueue_struct *wq) +bool workqueue_congested(int cpu, struct workqueue_struct *wq) { - struct pool_workqueue *pwq = get_pwq(cpu, wq); + struct pool_workqueue *pwq; + bool ret; + + rcu_read_lock_sched(); + + if (!(wq->flags & WQ_UNBOUND)) + pwq = per_cpu_ptr(wq->cpu_pwqs, cpu); + else + pwq = unbound_pwq_by_node(wq, cpu_to_node(cpu)); + + ret = !list_empty(&pwq->delayed_works); + rcu_read_unlock_sched(); - return !list_empty(&pwq->delayed_works); + return ret; } EXPORT_SYMBOL_GPL(workqueue_congested); @@ -3383,19 +4344,22 @@ EXPORT_SYMBOL_GPL(workqueue_congested); */ unsigned int work_busy(struct work_struct *work) { - struct worker_pool *pool = get_work_pool(work); + struct worker_pool *pool; unsigned long flags; unsigned int ret = 0; if (work_pending(work)) ret |= WORK_BUSY_PENDING; + local_irq_save(flags); + pool = get_work_pool(work); if (pool) { - spin_lock_irqsave(&pool->lock, flags); + spin_lock(&pool->lock); if (find_worker_executing_work(pool, work)) ret |= WORK_BUSY_RUNNING; - spin_unlock_irqrestore(&pool->lock, flags); + spin_unlock(&pool->lock); } + local_irq_restore(flags); return ret; } @@ -3421,53 +4385,153 @@ static void wq_unbind_fn(struct work_struct *work) int cpu = smp_processor_id(); struct worker_pool *pool; struct worker *worker; - int i; + int wi; - for_each_std_worker_pool(pool, cpu) { - BUG_ON(cpu != smp_processor_id()); + for_each_cpu_worker_pool(pool, cpu) { + WARN_ON_ONCE(cpu != smp_processor_id()); - mutex_lock(&pool->assoc_mutex); + mutex_lock(&pool->manager_mutex); spin_lock_irq(&pool->lock); /* - * We've claimed all manager positions. Make all workers + * We've blocked all manager operations. Make all workers * unbound and set DISASSOCIATED. Before this, all workers * except for the ones which are still executing works from * before the last CPU down must be on the cpu. After * this, they may become diasporas. */ - list_for_each_entry(worker, &pool->idle_list, entry) - worker->flags |= WORKER_UNBOUND; - - for_each_busy_worker(worker, i, pool) + for_each_pool_worker(worker, wi, pool) worker->flags |= WORKER_UNBOUND; pool->flags |= POOL_DISASSOCIATED; spin_unlock_irq(&pool->lock); - mutex_unlock(&pool->assoc_mutex); + mutex_unlock(&pool->manager_mutex); + + /* + * Call schedule() so that we cross rq->lock and thus can + * guarantee sched callbacks see the %WORKER_UNBOUND flag. + * This is necessary as scheduler callbacks may be invoked + * from other cpus. + */ + schedule(); + + /* + * Sched callbacks are disabled now. Zap nr_running. + * After this, nr_running stays zero and need_more_worker() + * and keep_working() are always true as long as the + * worklist is not empty. This pool now behaves as an + * unbound (in terms of concurrency management) pool which + * are served by workers tied to the pool. + */ + atomic_set(&pool->nr_running, 0); + + /* + * With concurrency management just turned off, a busy + * worker blocking could lead to lengthy stalls. Kick off + * unbound chain execution of currently pending work items. + */ + spin_lock_irq(&pool->lock); + wake_up_worker(pool); + spin_unlock_irq(&pool->lock); } +} - /* - * Call schedule() so that we cross rq->lock and thus can guarantee - * sched callbacks see the %WORKER_UNBOUND flag. This is necessary - * as scheduler callbacks may be invoked from other cpus. - */ - schedule(); +/** + * rebind_workers - rebind all workers of a pool to the associated CPU + * @pool: pool of interest + * + * @pool->cpu is coming online. Rebind all workers to the CPU. + */ +static void rebind_workers(struct worker_pool *pool) +{ + struct worker *worker; + int wi; + + lockdep_assert_held(&pool->manager_mutex); /* - * Sched callbacks are disabled now. Zap nr_running. After this, - * nr_running stays zero and need_more_worker() and keep_working() - * are always true as long as the worklist is not empty. Pools on - * @cpu now behave as unbound (in terms of concurrency management) - * pools which are served by workers tied to the CPU. - * - * On return from this function, the current worker would trigger - * unbound chain execution of pending work items if other workers - * didn't already. + * Restore CPU affinity of all workers. As all idle workers should + * be on the run-queue of the associated CPU before any local + * wake-ups for concurrency management happen, restore CPU affinty + * of all workers first and then clear UNBOUND. As we're called + * from CPU_ONLINE, the following shouldn't fail. */ - for_each_std_worker_pool(pool, cpu) - atomic_set(&pool->nr_running, 0); + for_each_pool_worker(worker, wi, pool) + WARN_ON_ONCE(set_cpus_allowed_ptr(worker->task, + pool->attrs->cpumask) < 0); + + spin_lock_irq(&pool->lock); + + for_each_pool_worker(worker, wi, pool) { + unsigned int worker_flags = worker->flags; + + /* + * A bound idle worker should actually be on the runqueue + * of the associated CPU for local wake-ups targeting it to + * work. Kick all idle workers so that they migrate to the + * associated CPU. Doing this in the same loop as + * replacing UNBOUND with REBOUND is safe as no worker will + * be bound before @pool->lock is released. + */ + if (worker_flags & WORKER_IDLE) + wake_up_process(worker->task); + + /* + * We want to clear UNBOUND but can't directly call + * worker_clr_flags() or adjust nr_running. Atomically + * replace UNBOUND with another NOT_RUNNING flag REBOUND. + * @worker will clear REBOUND using worker_clr_flags() when + * it initiates the next execution cycle thus restoring + * concurrency management. Note that when or whether + * @worker clears REBOUND doesn't affect correctness. + * + * ACCESS_ONCE() is necessary because @worker->flags may be + * tested without holding any lock in + * wq_worker_waking_up(). Without it, NOT_RUNNING test may + * fail incorrectly leading to premature concurrency + * management operations. + */ + WARN_ON_ONCE(!(worker_flags & WORKER_UNBOUND)); + worker_flags |= WORKER_REBOUND; + worker_flags &= ~WORKER_UNBOUND; + ACCESS_ONCE(worker->flags) = worker_flags; + } + + spin_unlock_irq(&pool->lock); +} + +/** + * restore_unbound_workers_cpumask - restore cpumask of unbound workers + * @pool: unbound pool of interest + * @cpu: the CPU which is coming up + * + * An unbound pool may end up with a cpumask which doesn't have any online + * CPUs. When a worker of such pool get scheduled, the scheduler resets + * its cpus_allowed. If @cpu is in @pool's cpumask which didn't have any + * online CPU before, cpus_allowed of all its workers should be restored. + */ +static void restore_unbound_workers_cpumask(struct worker_pool *pool, int cpu) +{ + static cpumask_t cpumask; + struct worker *worker; + int wi; + + lockdep_assert_held(&pool->manager_mutex); + + /* is @cpu allowed for @pool? */ + if (!cpumask_test_cpu(cpu, pool->attrs->cpumask)) + return; + + /* is @cpu the only online CPU? */ + cpumask_and(&cpumask, pool->attrs->cpumask, cpu_online_mask); + if (cpumask_weight(&cpumask) != 1) + return; + + /* as we're called from CPU_ONLINE, the following shouldn't fail */ + for_each_pool_worker(worker, wi, pool) + WARN_ON_ONCE(set_cpus_allowed_ptr(worker->task, + pool->attrs->cpumask) < 0); } /* @@ -3478,39 +4542,46 @@ static int __cpuinit workqueue_cpu_up_callback(struct notifier_block *nfb, unsigned long action, void *hcpu) { - unsigned int cpu = (unsigned long)hcpu; + int cpu = (unsigned long)hcpu; struct worker_pool *pool; + struct workqueue_struct *wq; + int pi; switch (action & ~CPU_TASKS_FROZEN) { case CPU_UP_PREPARE: - for_each_std_worker_pool(pool, cpu) { - struct worker *worker; - + for_each_cpu_worker_pool(pool, cpu) { if (pool->nr_workers) continue; - - worker = create_worker(pool); - if (!worker) + if (create_and_start_worker(pool) < 0) return NOTIFY_BAD; - - spin_lock_irq(&pool->lock); - start_worker(worker); - spin_unlock_irq(&pool->lock); } break; case CPU_DOWN_FAILED: case CPU_ONLINE: - for_each_std_worker_pool(pool, cpu) { - mutex_lock(&pool->assoc_mutex); - spin_lock_irq(&pool->lock); + mutex_lock(&wq_pool_mutex); - pool->flags &= ~POOL_DISASSOCIATED; - rebind_workers(pool); + for_each_pool(pool, pi) { + mutex_lock(&pool->manager_mutex); + + if (pool->cpu == cpu) { + spin_lock_irq(&pool->lock); + pool->flags &= ~POOL_DISASSOCIATED; + spin_unlock_irq(&pool->lock); + + rebind_workers(pool); + } else if (pool->cpu < 0) { + restore_unbound_workers_cpumask(pool, cpu); + } - spin_unlock_irq(&pool->lock); - mutex_unlock(&pool->assoc_mutex); + mutex_unlock(&pool->manager_mutex); } + + /* update NUMA affinity of unbound workqueues */ + list_for_each_entry(wq, &workqueues, list) + wq_update_unbound_numa(wq, cpu, true); + + mutex_unlock(&wq_pool_mutex); break; } return NOTIFY_OK; @@ -3524,14 +4595,23 @@ static int __cpuinit workqueue_cpu_down_callback(struct notifier_block *nfb, unsigned long action, void *hcpu) { - unsigned int cpu = (unsigned long)hcpu; + int cpu = (unsigned long)hcpu; struct work_struct unbind_work; + struct workqueue_struct *wq; switch (action & ~CPU_TASKS_FROZEN) { case CPU_DOWN_PREPARE: - /* unbinding should happen on the local CPU */ + /* unbinding per-cpu workers should happen on the local CPU */ INIT_WORK_ONSTACK(&unbind_work, wq_unbind_fn); queue_work_on(cpu, system_highpri_wq, &unbind_work); + + /* update NUMA affinity of unbound workqueues */ + mutex_lock(&wq_pool_mutex); + list_for_each_entry(wq, &workqueues, list) + wq_update_unbound_numa(wq, cpu, false); + mutex_unlock(&wq_pool_mutex); + + /* wait for per-cpu unbinding to finish */ flush_work(&unbind_work); break; } @@ -3564,7 +4644,7 @@ static void work_for_cpu_fn(struct work_struct *work) * It is up to the caller to ensure that the cpu doesn't go offline. * The caller must not hold any locks which would prevent @fn from completing. */ -long work_on_cpu(unsigned int cpu, long (*fn)(void *), void *arg) +long work_on_cpu(int cpu, long (*fn)(void *), void *arg) { struct work_for_cpu wfc = { .fn = fn, .arg = arg }; @@ -3582,44 +4662,40 @@ EXPORT_SYMBOL_GPL(work_on_cpu); * freeze_workqueues_begin - begin freezing workqueues * * Start freezing workqueues. After this function returns, all freezable - * workqueues will queue new works to their frozen_works list instead of + * workqueues will queue new works to their delayed_works list instead of * pool->worklist. * * CONTEXT: - * Grabs and releases workqueue_lock and pool->lock's. + * Grabs and releases wq_pool_mutex, wq->mutex and pool->lock's. */ void freeze_workqueues_begin(void) { - unsigned int cpu; + struct worker_pool *pool; + struct workqueue_struct *wq; + struct pool_workqueue *pwq; + int pi; - spin_lock(&workqueue_lock); + mutex_lock(&wq_pool_mutex); - BUG_ON(workqueue_freezing); + WARN_ON_ONCE(workqueue_freezing); workqueue_freezing = true; - for_each_wq_cpu(cpu) { - struct worker_pool *pool; - struct workqueue_struct *wq; - - for_each_std_worker_pool(pool, cpu) { - spin_lock_irq(&pool->lock); - - WARN_ON_ONCE(pool->flags & POOL_FREEZING); - pool->flags |= POOL_FREEZING; - - list_for_each_entry(wq, &workqueues, list) { - struct pool_workqueue *pwq = get_pwq(cpu, wq); - - if (pwq && pwq->pool == pool && - (wq->flags & WQ_FREEZABLE)) - pwq->max_active = 0; - } + /* set FREEZING */ + for_each_pool(pool, pi) { + spin_lock_irq(&pool->lock); + WARN_ON_ONCE(pool->flags & POOL_FREEZING); + pool->flags |= POOL_FREEZING; + spin_unlock_irq(&pool->lock); + } - spin_unlock_irq(&pool->lock); - } + list_for_each_entry(wq, &workqueues, list) { + mutex_lock(&wq->mutex); + for_each_pwq(pwq, wq) + pwq_adjust_max_active(pwq); + mutex_unlock(&wq->mutex); } - spin_unlock(&workqueue_lock); + mutex_unlock(&wq_pool_mutex); } /** @@ -3629,7 +4705,7 @@ void freeze_workqueues_begin(void) * between freeze_workqueues_begin() and thaw_workqueues(). * * CONTEXT: - * Grabs and releases workqueue_lock. + * Grabs and releases wq_pool_mutex. * * RETURNS: * %true if some freezable workqueues are still busy. %false if freezing @@ -3637,34 +4713,34 @@ void freeze_workqueues_begin(void) */ bool freeze_workqueues_busy(void) { - unsigned int cpu; bool busy = false; + struct workqueue_struct *wq; + struct pool_workqueue *pwq; - spin_lock(&workqueue_lock); + mutex_lock(&wq_pool_mutex); - BUG_ON(!workqueue_freezing); + WARN_ON_ONCE(!workqueue_freezing); - for_each_wq_cpu(cpu) { - struct workqueue_struct *wq; + list_for_each_entry(wq, &workqueues, list) { + if (!(wq->flags & WQ_FREEZABLE)) + continue; /* * nr_active is monotonically decreasing. It's safe * to peek without lock. */ - list_for_each_entry(wq, &workqueues, list) { - struct pool_workqueue *pwq = get_pwq(cpu, wq); - - if (!pwq || !(wq->flags & WQ_FREEZABLE)) - continue; - - BUG_ON(pwq->nr_active < 0); + rcu_read_lock_sched(); + for_each_pwq(pwq, wq) { + WARN_ON_ONCE(pwq->nr_active < 0); if (pwq->nr_active) { busy = true; + rcu_read_unlock_sched(); goto out_unlock; } } + rcu_read_unlock_sched(); } out_unlock: - spin_unlock(&workqueue_lock); + mutex_unlock(&wq_pool_mutex); return busy; } @@ -3675,104 +4751,141 @@ out_unlock: * frozen works are transferred to their respective pool worklists. * * CONTEXT: - * Grabs and releases workqueue_lock and pool->lock's. + * Grabs and releases wq_pool_mutex, wq->mutex and pool->lock's. */ void thaw_workqueues(void) { - unsigned int cpu; + struct workqueue_struct *wq; + struct pool_workqueue *pwq; + struct worker_pool *pool; + int pi; - spin_lock(&workqueue_lock); + mutex_lock(&wq_pool_mutex); if (!workqueue_freezing) goto out_unlock; - for_each_wq_cpu(cpu) { - struct worker_pool *pool; - struct workqueue_struct *wq; + /* clear FREEZING */ + for_each_pool(pool, pi) { + spin_lock_irq(&pool->lock); + WARN_ON_ONCE(!(pool->flags & POOL_FREEZING)); + pool->flags &= ~POOL_FREEZING; + spin_unlock_irq(&pool->lock); + } - for_each_std_worker_pool(pool, cpu) { - spin_lock_irq(&pool->lock); + /* restore max_active and repopulate worklist */ + list_for_each_entry(wq, &workqueues, list) { + mutex_lock(&wq->mutex); + for_each_pwq(pwq, wq) + pwq_adjust_max_active(pwq); + mutex_unlock(&wq->mutex); + } + + workqueue_freezing = false; +out_unlock: + mutex_unlock(&wq_pool_mutex); +} +#endif /* CONFIG_FREEZER */ - WARN_ON_ONCE(!(pool->flags & POOL_FREEZING)); - pool->flags &= ~POOL_FREEZING; +static void __init wq_numa_init(void) +{ + cpumask_var_t *tbl; + int node, cpu; - list_for_each_entry(wq, &workqueues, list) { - struct pool_workqueue *pwq = get_pwq(cpu, wq); + /* determine NUMA pwq table len - highest node id + 1 */ + for_each_node(node) + wq_numa_tbl_len = max(wq_numa_tbl_len, node + 1); - if (!pwq || pwq->pool != pool || - !(wq->flags & WQ_FREEZABLE)) - continue; + if (num_possible_nodes() <= 1) + return; - /* restore max_active and repopulate worklist */ - pwq_set_max_active(pwq, wq->saved_max_active); - } + if (wq_disable_numa) { + pr_info("workqueue: NUMA affinity support disabled\n"); + return; + } - wake_up_worker(pool); + wq_update_unbound_numa_attrs_buf = alloc_workqueue_attrs(GFP_KERNEL); + BUG_ON(!wq_update_unbound_numa_attrs_buf); - spin_unlock_irq(&pool->lock); + /* + * We want masks of possible CPUs of each node which isn't readily + * available. Build one from cpu_to_node() which should have been + * fully initialized by now. + */ + tbl = kzalloc(wq_numa_tbl_len * sizeof(tbl[0]), GFP_KERNEL); + BUG_ON(!tbl); + + for_each_node(node) + BUG_ON(!alloc_cpumask_var_node(&tbl[node], GFP_KERNEL, node)); + + for_each_possible_cpu(cpu) { + node = cpu_to_node(cpu); + if (WARN_ON(node == NUMA_NO_NODE)) { + pr_warn("workqueue: NUMA node mapping not available for cpu%d, disabling NUMA support\n", cpu); + /* happens iff arch is bonkers, let's just proceed */ + return; } + cpumask_set_cpu(cpu, tbl[node]); } - workqueue_freezing = false; -out_unlock: - spin_unlock(&workqueue_lock); + wq_numa_possible_cpumask = tbl; + wq_numa_enabled = true; } -#endif /* CONFIG_FREEZER */ static int __init init_workqueues(void) { - unsigned int cpu; + int std_nice[NR_STD_WORKER_POOLS] = { 0, HIGHPRI_NICE_LEVEL }; + int i, cpu; /* make sure we have enough bits for OFFQ pool ID */ BUILD_BUG_ON((1LU << (BITS_PER_LONG - WORK_OFFQ_POOL_SHIFT)) < WORK_CPU_END * NR_STD_WORKER_POOLS); + WARN_ON(__alignof__(struct pool_workqueue) < __alignof__(long long)); + + pwq_cache = KMEM_CACHE(pool_workqueue, SLAB_PANIC); + cpu_notifier(workqueue_cpu_up_callback, CPU_PRI_WORKQUEUE_UP); hotcpu_notifier(workqueue_cpu_down_callback, CPU_PRI_WORKQUEUE_DOWN); + wq_numa_init(); + /* initialize CPU pools */ - for_each_wq_cpu(cpu) { + for_each_possible_cpu(cpu) { struct worker_pool *pool; - for_each_std_worker_pool(pool, cpu) { - spin_lock_init(&pool->lock); + i = 0; + for_each_cpu_worker_pool(pool, cpu) { + BUG_ON(init_worker_pool(pool)); pool->cpu = cpu; - pool->flags |= POOL_DISASSOCIATED; - INIT_LIST_HEAD(&pool->worklist); - INIT_LIST_HEAD(&pool->idle_list); - hash_init(pool->busy_hash); - - init_timer_deferrable(&pool->idle_timer); - pool->idle_timer.function = idle_worker_timeout; - pool->idle_timer.data = (unsigned long)pool; - - setup_timer(&pool->mayday_timer, pool_mayday_timeout, - (unsigned long)pool); - - mutex_init(&pool->assoc_mutex); - ida_init(&pool->worker_ida); + cpumask_copy(pool->attrs->cpumask, cpumask_of(cpu)); + pool->attrs->nice = std_nice[i++]; + pool->node = cpu_to_node(cpu); /* alloc pool ID */ + mutex_lock(&wq_pool_mutex); BUG_ON(worker_pool_assign_id(pool)); + mutex_unlock(&wq_pool_mutex); } } /* create the initial worker */ - for_each_online_wq_cpu(cpu) { + for_each_online_cpu(cpu) { struct worker_pool *pool; - for_each_std_worker_pool(pool, cpu) { - struct worker *worker; + for_each_cpu_worker_pool(pool, cpu) { + pool->flags &= ~POOL_DISASSOCIATED; + BUG_ON(create_and_start_worker(pool) < 0); + } + } - if (cpu != WORK_CPU_UNBOUND) - pool->flags &= ~POOL_DISASSOCIATED; + /* create default unbound wq attrs */ + for (i = 0; i < NR_STD_WORKER_POOLS; i++) { + struct workqueue_attrs *attrs; - worker = create_worker(pool); - BUG_ON(!worker); - spin_lock_irq(&pool->lock); - start_worker(worker); - spin_unlock_irq(&pool->lock); - } + BUG_ON(!(attrs = alloc_workqueue_attrs(GFP_KERNEL))); + attrs->nice = std_nice[i]; + unbound_std_wq_attrs[i] = attrs; } system_wq = alloc_workqueue("events", 0, 0); diff --git a/kernel/workqueue_internal.h b/kernel/workqueue_internal.h index 07650264ec15..84ab6e1dc6fb 100644 --- a/kernel/workqueue_internal.h +++ b/kernel/workqueue_internal.h @@ -32,14 +32,12 @@ struct worker { struct list_head scheduled; /* L: scheduled works */ struct task_struct *task; /* I: worker task */ struct worker_pool *pool; /* I: the associated pool */ + /* L: for rescuers */ /* 64 bytes boundary on 64bit, 32 on 32bit */ unsigned long last_active; /* L: last active timestamp */ unsigned int flags; /* X: flags */ int id; /* I: worker id */ - /* for rebinding worker to CPU */ - struct work_struct rebind_work; /* L: for busy worker */ - /* used only by rescuers to point to the target workqueue */ struct workqueue_struct *rescue_wq; /* I: the workqueue to rescue */ }; @@ -58,8 +56,7 @@ static inline struct worker *current_wq_worker(void) * Scheduler hooks for concurrency managed workqueue. Only to be used from * sched.c and workqueue.c. */ -void wq_worker_waking_up(struct task_struct *task, unsigned int cpu); -struct task_struct *wq_worker_sleeping(struct task_struct *task, - unsigned int cpu); +void wq_worker_waking_up(struct task_struct *task, int cpu); +struct task_struct *wq_worker_sleeping(struct task_struct *task, int cpu); #endif /* _KERNEL_WORKQUEUE_INTERNAL_H */ |