diff options
Diffstat (limited to 'kernel')
69 files changed, 2191 insertions, 1548 deletions
diff --git a/kernel/audit.c b/kernel/audit.c index c71bd26631a2..8296aa516c5a 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -407,7 +407,7 @@ static void kauditd_send_skb(struct sk_buff *skb) audit_hold_skb(skb); } else /* drop the extra reference if sent ok */ - kfree_skb(skb); + consume_skb(skb); } static int kauditd_thread(void *dummy) diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 291775021b2e..a8ce09954404 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -1788,6 +1788,29 @@ out: return retval; } +/** + * cgroup_attach_task_current_cg - attach task 'tsk' to current task's cgroup + * @tsk: the task to be attached + */ +int cgroup_attach_task_current_cg(struct task_struct *tsk) +{ + struct cgroupfs_root *root; + struct cgroup *cur_cg; + int retval = 0; + + cgroup_lock(); + for_each_active_root(root) { + cur_cg = task_cgroup_from_root(current, root); + retval = cgroup_attach_task(cur_cg, tsk); + if (retval) + break; + } + cgroup_unlock(); + + return retval; +} +EXPORT_SYMBOL_GPL(cgroup_attach_task_current_cg); + /* * Attach task with pid 'pid' to cgroup 'cgrp'. Call with cgroup_mutex * held. May take task_lock of task @@ -2994,7 +3017,6 @@ static void cgroup_event_remove(struct work_struct *work) remove); struct cgroup *cgrp = event->cgrp; - /* TODO: check return code */ event->cft->unregister_event(cgrp, event->cft, event->eventfd); eventfd_ctx_put(event->eventfd); @@ -4599,7 +4621,7 @@ static int alloc_css_id(struct cgroup_subsys *ss, struct cgroup *parent, parent_css = parent->subsys[subsys_id]; child_css = child->subsys[subsys_id]; parent_id = parent_css->id; - depth = parent_id->depth; + depth = parent_id->depth + 1; child_id = get_new_cssid(ss, depth); if (IS_ERR(child_id)) diff --git a/kernel/cpu.c b/kernel/cpu.c index 124ad9d6be16..97d1b426a4ac 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -20,13 +20,29 @@ /* Serializes the updates to cpu_online_mask, cpu_present_mask */ static DEFINE_MUTEX(cpu_add_remove_lock); -static __cpuinitdata RAW_NOTIFIER_HEAD(cpu_chain); +/* + * The following two API's must be used when attempting + * to serialize the updates to cpu_online_mask, cpu_present_mask. + */ +void cpu_maps_update_begin(void) +{ + mutex_lock(&cpu_add_remove_lock); +} + +void cpu_maps_update_done(void) +{ + mutex_unlock(&cpu_add_remove_lock); +} + +static RAW_NOTIFIER_HEAD(cpu_chain); /* If set, cpu_up and cpu_down will return -EBUSY and do nothing. * Should always be manipulated under cpu_add_remove_lock */ static int cpu_hotplug_disabled; +#ifdef CONFIG_HOTPLUG_CPU + static struct { struct task_struct *active_writer; struct mutex lock; /* Synchronizes accesses to refcount, */ @@ -41,8 +57,6 @@ static struct { .refcount = 0, }; -#ifdef CONFIG_HOTPLUG_CPU - void get_online_cpus(void) { might_sleep(); @@ -67,22 +81,6 @@ void put_online_cpus(void) } EXPORT_SYMBOL_GPL(put_online_cpus); -#endif /* CONFIG_HOTPLUG_CPU */ - -/* - * The following two API's must be used when attempting - * to serialize the updates to cpu_online_mask, cpu_present_mask. - */ -void cpu_maps_update_begin(void) -{ - mutex_lock(&cpu_add_remove_lock); -} - -void cpu_maps_update_done(void) -{ - mutex_unlock(&cpu_add_remove_lock); -} - /* * This ensures that the hotplug operation can begin only when the * refcount goes to zero. @@ -124,6 +122,12 @@ static void cpu_hotplug_done(void) cpu_hotplug.active_writer = NULL; mutex_unlock(&cpu_hotplug.lock); } + +#else /* #if CONFIG_HOTPLUG_CPU */ +static void cpu_hotplug_begin(void) {} +static void cpu_hotplug_done(void) {} +#endif /* #esle #if CONFIG_HOTPLUG_CPU */ + /* Need to know about CPUs going up/down? */ int __ref register_cpu_notifier(struct notifier_block *nb) { @@ -134,8 +138,29 @@ int __ref register_cpu_notifier(struct notifier_block *nb) return ret; } +static int __cpu_notify(unsigned long val, void *v, int nr_to_call, + int *nr_calls) +{ + int ret; + + ret = __raw_notifier_call_chain(&cpu_chain, val, v, nr_to_call, + nr_calls); + + return notifier_to_errno(ret); +} + +static int cpu_notify(unsigned long val, void *v) +{ + return __cpu_notify(val, v, -1, NULL); +} + #ifdef CONFIG_HOTPLUG_CPU +static void cpu_notify_nofail(unsigned long val, void *v) +{ + BUG_ON(cpu_notify(val, v)); +} + EXPORT_SYMBOL(register_cpu_notifier); void __ref unregister_cpu_notifier(struct notifier_block *nb) @@ -181,8 +206,7 @@ static int __ref take_cpu_down(void *_param) if (err < 0) return err; - raw_notifier_call_chain(&cpu_chain, CPU_DYING | param->mod, - param->hcpu); + cpu_notify(CPU_DYING | param->mod, param->hcpu); if (task_cpu(param->caller) == cpu) move_task_off_dead_cpu(cpu, param->caller); @@ -212,17 +236,14 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen) cpu_hotplug_begin(); set_cpu_active(cpu, false); - err = __raw_notifier_call_chain(&cpu_chain, CPU_DOWN_PREPARE | mod, - hcpu, -1, &nr_calls); - if (err == NOTIFY_BAD) { + err = __cpu_notify(CPU_DOWN_PREPARE | mod, hcpu, -1, &nr_calls); + if (err) { set_cpu_active(cpu, true); nr_calls--; - __raw_notifier_call_chain(&cpu_chain, CPU_DOWN_FAILED | mod, - hcpu, nr_calls, NULL); + __cpu_notify(CPU_DOWN_FAILED | mod, hcpu, nr_calls, NULL); printk("%s: attempt to take down CPU %u failed\n", __func__, cpu); - err = -EINVAL; goto out_release; } @@ -230,9 +251,7 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen) if (err) { set_cpu_active(cpu, true); /* CPU didn't die: tell everyone. Can't complain. */ - if (raw_notifier_call_chain(&cpu_chain, CPU_DOWN_FAILED | mod, - hcpu) == NOTIFY_BAD) - BUG(); + cpu_notify_nofail(CPU_DOWN_FAILED | mod, hcpu); goto out_release; } @@ -246,19 +265,14 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen) __cpu_die(cpu); /* CPU is completely dead: tell everyone. Too late to complain. */ - if (raw_notifier_call_chain(&cpu_chain, CPU_DEAD | mod, - hcpu) == NOTIFY_BAD) - BUG(); + cpu_notify_nofail(CPU_DEAD | mod, hcpu); check_for_tasks(cpu); out_release: cpu_hotplug_done(); - if (!err) { - if (raw_notifier_call_chain(&cpu_chain, CPU_POST_DEAD | mod, - hcpu) == NOTIFY_BAD) - BUG(); - } + if (!err) + cpu_notify_nofail(CPU_POST_DEAD | mod, hcpu); return err; } @@ -293,13 +307,11 @@ static int __cpuinit _cpu_up(unsigned int cpu, int tasks_frozen) return -EINVAL; cpu_hotplug_begin(); - ret = __raw_notifier_call_chain(&cpu_chain, CPU_UP_PREPARE | mod, hcpu, - -1, &nr_calls); - if (ret == NOTIFY_BAD) { + ret = __cpu_notify(CPU_UP_PREPARE | mod, hcpu, -1, &nr_calls); + if (ret) { nr_calls--; printk("%s: attempt to bring up CPU %u failed\n", __func__, cpu); - ret = -EINVAL; goto out_notify; } @@ -312,12 +324,11 @@ static int __cpuinit _cpu_up(unsigned int cpu, int tasks_frozen) set_cpu_active(cpu, true); /* Now call notifier in preparation. */ - raw_notifier_call_chain(&cpu_chain, CPU_ONLINE | mod, hcpu); + cpu_notify(CPU_ONLINE | mod, hcpu); out_notify: if (ret != 0) - __raw_notifier_call_chain(&cpu_chain, - CPU_UP_CANCELED | mod, hcpu, nr_calls, NULL); + __cpu_notify(CPU_UP_CANCELED | mod, hcpu, nr_calls, NULL); cpu_hotplug_done(); return ret; @@ -383,7 +394,7 @@ static cpumask_var_t frozen_cpus; int disable_nonboot_cpus(void) { - int cpu, first_cpu, error; + int cpu, first_cpu, error = 0; cpu_maps_update_begin(); first_cpu = cpumask_first(cpu_online_mask); @@ -481,7 +492,7 @@ void __cpuinit notify_cpu_starting(unsigned int cpu) if (frozen_cpus != NULL && cpumask_test_cpu(cpu, frozen_cpus)) val = CPU_STARTING_FROZEN; #endif /* CONFIG_PM_SLEEP_SMP */ - raw_notifier_call_chain(&cpu_chain, val, (void *)(long)cpu); + cpu_notify(val, (void *)(long)cpu); } #endif /* CONFIG_SMP */ diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 61d6af7fa676..02b9611eadde 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -2469,7 +2469,8 @@ void cpuset_unlock(void) } /** - * cpuset_mem_spread_node() - On which node to begin search for a page + * cpuset_mem_spread_node() - On which node to begin search for a file page + * cpuset_slab_spread_node() - On which node to begin search for a slab page * * If a task is marked PF_SPREAD_PAGE or PF_SPREAD_SLAB (as for * tasks in a cpuset with is_spread_page or is_spread_slab set), @@ -2494,16 +2495,27 @@ void cpuset_unlock(void) * See kmem_cache_alloc_node(). */ -int cpuset_mem_spread_node(void) +static int cpuset_spread_node(int *rotor) { int node; - node = next_node(current->cpuset_mem_spread_rotor, current->mems_allowed); + node = next_node(*rotor, current->mems_allowed); if (node == MAX_NUMNODES) node = first_node(current->mems_allowed); - current->cpuset_mem_spread_rotor = node; + *rotor = node; return node; } + +int cpuset_mem_spread_node(void) +{ + return cpuset_spread_node(¤t->cpuset_mem_spread_rotor); +} + +int cpuset_slab_spread_node(void) +{ + return cpuset_spread_node(¤t->cpuset_slab_spread_rotor); +} + EXPORT_SYMBOL_GPL(cpuset_mem_spread_node); /** diff --git a/kernel/cred.c b/kernel/cred.c index 2c24870c55d1..60bc8b1e32e6 100644 --- a/kernel/cred.c +++ b/kernel/cred.c @@ -209,6 +209,31 @@ void exit_creds(struct task_struct *tsk) } } +/** + * get_task_cred - Get another task's objective credentials + * @task: The task to query + * + * Get the objective credentials of a task, pinning them so that they can't go + * away. Accessing a task's credentials directly is not permitted. + * + * The caller must also make sure task doesn't get deleted, either by holding a + * ref on task or by holding tasklist_lock to prevent it from being unlinked. + */ +const struct cred *get_task_cred(struct task_struct *task) +{ + const struct cred *cred; + + rcu_read_lock(); + + do { + cred = __task_cred((task)); + BUG_ON(!cred); + } while (!atomic_inc_not_zero(&((struct cred *)cred)->usage)); + + rcu_read_unlock(); + return cred; +} + /* * Allocate blank credentials, such that the credentials can be filled in at a * later date without risk of ENOMEM. @@ -347,66 +372,6 @@ struct cred *prepare_exec_creds(void) } /* - * prepare new credentials for the usermode helper dispatcher - */ -struct cred *prepare_usermodehelper_creds(void) -{ -#ifdef CONFIG_KEYS - struct thread_group_cred *tgcred = NULL; -#endif - struct cred *new; - -#ifdef CONFIG_KEYS - tgcred = kzalloc(sizeof(*new->tgcred), GFP_ATOMIC); - if (!tgcred) - return NULL; -#endif - - new = kmem_cache_alloc(cred_jar, GFP_ATOMIC); - if (!new) - goto free_tgcred; - - kdebug("prepare_usermodehelper_creds() alloc %p", new); - - memcpy(new, &init_cred, sizeof(struct cred)); - - atomic_set(&new->usage, 1); - set_cred_subscribers(new, 0); - get_group_info(new->group_info); - get_uid(new->user); - -#ifdef CONFIG_KEYS - new->thread_keyring = NULL; - new->request_key_auth = NULL; - new->jit_keyring = KEY_REQKEY_DEFL_DEFAULT; - - atomic_set(&tgcred->usage, 1); - spin_lock_init(&tgcred->lock); - new->tgcred = tgcred; -#endif - -#ifdef CONFIG_SECURITY - new->security = NULL; -#endif - if (security_prepare_creds(new, &init_cred, GFP_ATOMIC) < 0) - goto error; - validate_creds(new); - - BUG_ON(atomic_read(&new->usage) != 1); - return new; - -error: - put_cred(new); - return NULL; - -free_tgcred: -#ifdef CONFIG_KEYS - kfree(tgcred); -#endif - return NULL; -} - -/* * Copy credentials for the new process created by fork() * * We share if we can, but under some circumstances we have to generate a new diff --git a/kernel/debug/debug_core.c b/kernel/debug/debug_core.c index 5cb7cd1de10c..8bc5eeffec8a 100644 --- a/kernel/debug/debug_core.c +++ b/kernel/debug/debug_core.c @@ -605,13 +605,13 @@ cpu_master_loop: if (dbg_kdb_mode) { kgdb_connected = 1; error = kdb_stub(ks); + kgdb_connected = 0; } else { error = gdb_serial_stub(ks); } if (error == DBG_PASS_EVENT) { dbg_kdb_mode = !dbg_kdb_mode; - kgdb_connected = 0; } else if (error == DBG_SWITCH_CPU_EVENT) { dbg_cpu_switch(cpu, dbg_switch_cpu); goto cpu_loop; diff --git a/kernel/debug/gdbstub.c b/kernel/debug/gdbstub.c index 4b17b3269525..e8fd6868682d 100644 --- a/kernel/debug/gdbstub.c +++ b/kernel/debug/gdbstub.c @@ -621,10 +621,8 @@ static void gdb_cmd_query(struct kgdb_state *ks) switch (remcom_in_buffer[1]) { case 's': case 'f': - if (memcmp(remcom_in_buffer + 2, "ThreadInfo", 10)) { - error_packet(remcom_out_buffer, -EINVAL); + if (memcmp(remcom_in_buffer + 2, "ThreadInfo", 10)) break; - } i = 0; remcom_out_buffer[0] = 'm'; @@ -665,10 +663,9 @@ static void gdb_cmd_query(struct kgdb_state *ks) pack_threadid(remcom_out_buffer + 2, thref); break; case 'T': - if (memcmp(remcom_in_buffer + 1, "ThreadExtraInfo,", 16)) { - error_packet(remcom_out_buffer, -EINVAL); + if (memcmp(remcom_in_buffer + 1, "ThreadExtraInfo,", 16)) break; - } + ks->threadid = 0; ptr = remcom_in_buffer + 17; kgdb_hex2long(&ptr, &ks->threadid); diff --git a/kernel/debug/kdb/kdb_main.c b/kernel/debug/kdb/kdb_main.c index b724c791b6d4..ebe4a287419e 100644 --- a/kernel/debug/kdb/kdb_main.c +++ b/kernel/debug/kdb/kdb_main.c @@ -1820,9 +1820,8 @@ static int kdb_sr(int argc, const char **argv) { if (argc != 1) return KDB_ARGCOUNT; - sysrq_toggle_support(1); kdb_trap_printk++; - handle_sysrq(*argv[1], NULL); + __handle_sysrq(*argv[1], NULL, 0); kdb_trap_printk--; return 0; @@ -1857,12 +1856,6 @@ static int kdb_ef(int argc, const char **argv) } #if defined(CONFIG_MODULES) -/* modules using other modules */ -struct module_use { - struct list_head list; - struct module *module_which_uses; -}; - /* * kdb_lsmod - This function implements the 'lsmod' command. Lists * currently loaded kernel modules. @@ -1889,14 +1882,15 @@ static int kdb_lsmod(int argc, const char **argv) kdb_printf(" (Loading)"); else kdb_printf(" (Live)"); + kdb_printf(" 0x%p", mod->module_core); #ifdef CONFIG_MODULE_UNLOAD { struct module_use *use; kdb_printf(" [ "); - list_for_each_entry(use, &mod->modules_which_use_me, - list) - kdb_printf("%s ", use->module_which_uses->name); + list_for_each_entry(use, &mod->source_list, + source_list) + kdb_printf("%s ", use->target->name); kdb_printf("]\n"); } #endif @@ -2297,6 +2291,9 @@ static int kdb_ll(int argc, const char **argv) while (va) { char buf[80]; + if (KDB_FLAG(CMD_INTERRUPT)) + return 0; + sprintf(buf, "%s " kdb_machreg_fmt "\n", command, va); diag = kdb_parse(buf); if (diag) diff --git a/kernel/early_res.c b/kernel/early_res.c index 31aa9332ef3f..7bfae887f211 100644 --- a/kernel/early_res.c +++ b/kernel/early_res.c @@ -7,6 +7,8 @@ #include <linux/bootmem.h> #include <linux/mm.h> #include <linux/early_res.h> +#include <linux/slab.h> +#include <linux/kmemleak.h> /* * Early reserved memory areas. @@ -319,6 +321,8 @@ void __init free_early(u64 start, u64 end) struct early_res *r; int i; + kmemleak_free_part(__va(start), end - start); + i = find_overlapped_early(start, end); r = &early_res[i]; if (i >= max_early_res || r->end != end || r->start != start) @@ -333,6 +337,8 @@ void __init free_early_partial(u64 start, u64 end) struct early_res *r; int i; + kmemleak_free_part(__va(start), end - start); + if (start == end) return; diff --git a/kernel/exec_domain.c b/kernel/exec_domain.c index c35452cadded..dd62f8e714ca 100644 --- a/kernel/exec_domain.c +++ b/kernel/exec_domain.c @@ -27,7 +27,7 @@ static struct exec_domain *exec_domains = &default_exec_domain; static DEFINE_RWLOCK(exec_domains_lock); -static u_long ident_map[32] = { +static unsigned long ident_map[32] = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, @@ -56,10 +56,10 @@ default_handler(int segment, struct pt_regs *regp) } static struct exec_domain * -lookup_exec_domain(u_long personality) +lookup_exec_domain(unsigned int personality) { - struct exec_domain * ep; - u_long pers = personality(personality); + unsigned int pers = personality(personality); + struct exec_domain *ep; read_lock(&exec_domains_lock); for (ep = exec_domains; ep; ep = ep->next) { @@ -70,7 +70,7 @@ lookup_exec_domain(u_long personality) #ifdef CONFIG_MODULES read_unlock(&exec_domains_lock); - request_module("personality-%ld", pers); + request_module("personality-%d", pers); read_lock(&exec_domains_lock); for (ep = exec_domains; ep; ep = ep->next) { @@ -135,7 +135,7 @@ unregister: } int -__set_personality(u_long personality) +__set_personality(unsigned int personality) { struct exec_domain *ep, *oep; @@ -188,9 +188,9 @@ static int __init proc_execdomains_init(void) module_init(proc_execdomains_init); #endif -SYSCALL_DEFINE1(personality, u_long, personality) +SYSCALL_DEFINE1(personality, unsigned int, personality) { - u_long old = current->personality; + unsigned int old = current->personality; if (personality != 0xffffffff) { set_personality(personality); @@ -198,7 +198,7 @@ SYSCALL_DEFINE1(personality, u_long, personality) return -EINVAL; } - return (long)old; + return old; } diff --git a/kernel/exit.c b/kernel/exit.c index 019a2843bf95..ceffc67b564a 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -58,11 +58,11 @@ static void exit_mm(struct task_struct * tsk); -static void __unhash_process(struct task_struct *p) +static void __unhash_process(struct task_struct *p, bool group_dead) { nr_threads--; detach_pid(p, PIDTYPE_PID); - if (thread_group_leader(p)) { + if (group_dead) { detach_pid(p, PIDTYPE_PGID); detach_pid(p, PIDTYPE_SID); @@ -79,10 +79,9 @@ static void __unhash_process(struct task_struct *p) static void __exit_signal(struct task_struct *tsk) { struct signal_struct *sig = tsk->signal; + bool group_dead = thread_group_leader(tsk); struct sighand_struct *sighand; - - BUG_ON(!sig); - BUG_ON(!atomic_read(&sig->count)); + struct tty_struct *uninitialized_var(tty); sighand = rcu_dereference_check(tsk->sighand, rcu_read_lock_held() || @@ -90,14 +89,16 @@ static void __exit_signal(struct task_struct *tsk) spin_lock(&sighand->siglock); posix_cpu_timers_exit(tsk); - if (atomic_dec_and_test(&sig->count)) + if (group_dead) { posix_cpu_timers_exit_group(tsk); - else { + tty = sig->tty; + sig->tty = NULL; + } else { /* * If there is any task waiting for the group exit * then notify it: */ - if (sig->group_exit_task && atomic_read(&sig->count) == sig->notify_count) + if (sig->notify_count > 0 && !--sig->notify_count) wake_up_process(sig->group_exit_task); if (tsk == sig->curr_target) @@ -123,32 +124,24 @@ static void __exit_signal(struct task_struct *tsk) sig->oublock += task_io_get_oublock(tsk); task_io_accounting_add(&sig->ioac, &tsk->ioac); sig->sum_sched_runtime += tsk->se.sum_exec_runtime; - sig = NULL; /* Marker for below. */ } - __unhash_process(tsk); + sig->nr_threads--; + __unhash_process(tsk, group_dead); /* * Do this under ->siglock, we can race with another thread * doing sigqueue_free() if we have SIGQUEUE_PREALLOC signals. */ flush_sigqueue(&tsk->pending); - - tsk->signal = NULL; tsk->sighand = NULL; spin_unlock(&sighand->siglock); __cleanup_sighand(sighand); clear_tsk_thread_flag(tsk,TIF_SIGPENDING); - if (sig) { + if (group_dead) { flush_sigqueue(&sig->shared_pending); - taskstats_tgid_free(sig); - /* - * Make sure ->signal can't go away under rq->lock, - * see account_group_exec_runtime(). - */ - task_rq_unlock_wait(tsk); - __cleanup_signal(sig); + tty_kref_put(tty); } } @@ -856,12 +849,9 @@ static void exit_notify(struct task_struct *tsk, int group_dead) tsk->exit_state = signal == DEATH_REAP ? EXIT_DEAD : EXIT_ZOMBIE; - /* mt-exec, de_thread() is waiting for us */ - if (thread_group_leader(tsk) && - tsk->signal->group_exit_task && - tsk->signal->notify_count < 0) + /* mt-exec, de_thread() is waiting for group leader */ + if (unlikely(tsk->signal->notify_count < 0)) wake_up_process(tsk->signal->group_exit_task); - write_unlock_irq(&tasklist_lock); tracehook_report_death(tsk, signal, cookie, group_dead); diff --git a/kernel/fork.c b/kernel/fork.c index 4d57d9e3a6e9..b6cce14ba047 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -165,6 +165,18 @@ void free_task(struct task_struct *tsk) } EXPORT_SYMBOL(free_task); +static inline void free_signal_struct(struct signal_struct *sig) +{ + taskstats_tgid_free(sig); + kmem_cache_free(signal_cachep, sig); +} + +static inline void put_signal_struct(struct signal_struct *sig) +{ + if (atomic_dec_and_test(&sig->sigcnt)) + free_signal_struct(sig); +} + void __put_task_struct(struct task_struct *tsk) { WARN_ON(!tsk->exit_state); @@ -173,6 +185,7 @@ void __put_task_struct(struct task_struct *tsk) exit_creds(tsk); delayacct_tsk_free(tsk); + put_signal_struct(tsk->signal); if (!profile_handoff_task(tsk)) free_task(tsk); @@ -864,8 +877,9 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk) if (!sig) return -ENOMEM; - atomic_set(&sig->count, 1); + sig->nr_threads = 1; atomic_set(&sig->live, 1); + atomic_set(&sig->sigcnt, 1); init_waitqueue_head(&sig->wait_chldexit); if (clone_flags & CLONE_NEWPID) sig->flags |= SIGNAL_UNKILLABLE; @@ -889,13 +903,6 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk) return 0; } -void __cleanup_signal(struct signal_struct *sig) -{ - thread_group_cputime_free(sig); - tty_kref_put(sig->tty); - kmem_cache_free(signal_cachep, sig); -} - static void copy_flags(unsigned long clone_flags, struct task_struct *p) { unsigned long new_flags = p->flags; @@ -1245,8 +1252,9 @@ static struct task_struct *copy_process(unsigned long clone_flags, } if (clone_flags & CLONE_THREAD) { - atomic_inc(¤t->signal->count); + current->signal->nr_threads++; atomic_inc(¤t->signal->live); + atomic_inc(¤t->signal->sigcnt); p->group_leader = current->group_leader; list_add_tail_rcu(&p->thread_group, &p->group_leader->thread_group); } @@ -1259,7 +1267,6 @@ static struct task_struct *copy_process(unsigned long clone_flags, p->nsproxy->pid_ns->child_reaper = p; p->signal->leader_pid = pid; - tty_kref_put(p->signal->tty); p->signal->tty = tty_kref_get(current->signal->tty); attach_pid(p, PIDTYPE_PGID, task_pgrp(current)); attach_pid(p, PIDTYPE_SID, task_session(current)); @@ -1292,7 +1299,7 @@ bad_fork_cleanup_mm: mmput(p->mm); bad_fork_cleanup_signal: if (!(clone_flags & CLONE_THREAD)) - __cleanup_signal(p->signal); + free_signal_struct(p->signal); bad_fork_cleanup_sighand: __cleanup_sighand(p->sighand); bad_fork_cleanup_fs: @@ -1327,6 +1334,16 @@ noinline struct pt_regs * __cpuinit __attribute__((weak)) idle_regs(struct pt_re return regs; } +static inline void init_idle_pids(struct pid_link *links) +{ + enum pid_type type; + + for (type = PIDTYPE_PID; type < PIDTYPE_MAX; ++type) { + INIT_HLIST_NODE(&links[type].node); /* not really needed */ + links[type].pid = &init_struct_pid; + } +} + struct task_struct * __cpuinit fork_idle(int cpu) { struct task_struct *task; @@ -1334,8 +1351,10 @@ struct task_struct * __cpuinit fork_idle(int cpu) task = copy_process(CLONE_VM, 0, idle_regs(®s), 0, NULL, &init_struct_pid, 0); - if (!IS_ERR(task)) + if (!IS_ERR(task)) { + init_idle_pids(task->pids); init_idle(task, cpu); + } return task; } @@ -1507,14 +1526,6 @@ static void check_unshare_flags(unsigned long *flags_ptr) *flags_ptr |= CLONE_SIGHAND; /* - * If unsharing signal handlers and the task was created - * using CLONE_THREAD, then must unshare the thread - */ - if ((*flags_ptr & CLONE_SIGHAND) && - (atomic_read(¤t->signal->count) > 1)) - *flags_ptr |= CLONE_THREAD; - - /* * If unsharing namespace, must also unshare filesystem information. */ if (*flags_ptr & CLONE_NEWNS) diff --git a/kernel/futex.c b/kernel/futex.c index e7a35f1039e7..6a3a5fa1526d 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -429,20 +429,11 @@ static void free_pi_state(struct futex_pi_state *pi_state) static struct task_struct * futex_find_get_task(pid_t pid) { struct task_struct *p; - const struct cred *cred = current_cred(), *pcred; rcu_read_lock(); p = find_task_by_vpid(pid); - if (!p) { - p = ERR_PTR(-ESRCH); - } else { - pcred = __task_cred(p); - if (cred->euid != pcred->euid && - cred->euid != pcred->uid) - p = ERR_PTR(-ESRCH); - else - get_task_struct(p); - } + if (p) + get_task_struct(p); rcu_read_unlock(); @@ -564,8 +555,8 @@ lookup_pi_state(u32 uval, struct futex_hash_bucket *hb, if (!pid) return -ESRCH; p = futex_find_get_task(pid); - if (IS_ERR(p)) - return PTR_ERR(p); + if (!p) + return -ESRCH; /* * We need to look at the task state flags to figure out, diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c index b9b134b35088..5c69e996bd0f 100644 --- a/kernel/hrtimer.c +++ b/kernel/hrtimer.c @@ -89,7 +89,7 @@ static void hrtimer_get_softirq_time(struct hrtimer_cpu_base *base) do { seq = read_seqbegin(&xtime_lock); - xts = current_kernel_time(); + xts = __current_kernel_time(); tom = wall_to_monotonic; } while (read_seqretry(&xtime_lock, seq)); diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 3164ba7ce151..e1497481fe8a 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -456,6 +456,9 @@ int __irq_set_trigger(struct irq_desc *desc, unsigned int irq, /* note that IRQF_TRIGGER_MASK == IRQ_TYPE_SENSE_MASK */ desc->status &= ~(IRQ_LEVEL | IRQ_TYPE_SENSE_MASK); desc->status |= flags; + + if (chip != desc->chip) + irq_chip_set_defaults(desc->chip); } return ret; diff --git a/kernel/kexec.c b/kernel/kexec.c index 474a84715eac..131b1703936f 100644 --- a/kernel/kexec.c +++ b/kernel/kexec.c @@ -1089,9 +1089,10 @@ void crash_kexec(struct pt_regs *regs) size_t crash_get_memory_size(void) { - size_t size; + size_t size = 0; mutex_lock(&kexec_mutex); - size = crashk_res.end - crashk_res.start + 1; + if (crashk_res.end != crashk_res.start) + size = crashk_res.end - crashk_res.start + 1; mutex_unlock(&kexec_mutex); return size; } @@ -1134,7 +1135,7 @@ int crash_shrink_memory(unsigned long new_size) free_reserved_phys_range(end, crashk_res.end); - if (start == end) + if ((start == end) && (crashk_res.parent != NULL)) release_resource(&crashk_res); crashk_res.end = end - 1; diff --git a/kernel/kmod.c b/kernel/kmod.c index bf0e231d9702..6e9b19667a8d 100644 --- a/kernel/kmod.c +++ b/kernel/kmod.c @@ -116,27 +116,16 @@ int __request_module(bool wait, const char *fmt, ...) trace_module_request(module_name, wait, _RET_IP_); - ret = call_usermodehelper(modprobe_path, argv, envp, - wait ? UMH_WAIT_PROC : UMH_WAIT_EXEC); + ret = call_usermodehelper_fns(modprobe_path, argv, envp, + wait ? UMH_WAIT_PROC : UMH_WAIT_EXEC, + NULL, NULL, NULL); + atomic_dec(&kmod_concurrent); return ret; } EXPORT_SYMBOL(__request_module); #endif /* CONFIG_MODULES */ -struct subprocess_info { - struct work_struct work; - struct completion *complete; - struct cred *cred; - char *path; - char **argv; - char **envp; - enum umh_wait wait; - int retval; - struct file *stdin; - void (*cleanup)(char **argv, char **envp); -}; - /* * This is the task which runs the usermode application */ @@ -145,36 +134,10 @@ static int ____call_usermodehelper(void *data) struct subprocess_info *sub_info = data; int retval; - BUG_ON(atomic_read(&sub_info->cred->usage) != 1); - - /* Unblock all signals */ spin_lock_irq(¤t->sighand->siglock); flush_signal_handlers(current, 1); - sigemptyset(¤t->blocked); - recalc_sigpending(); spin_unlock_irq(¤t->sighand->siglock); - /* Install the credentials */ - commit_creds(sub_info->cred); - sub_info->cred = NULL; - - /* Install input pipe when needed */ - if (sub_info->stdin) { - struct files_struct *f = current->files; - struct fdtable *fdt; - /* no races because files should be private here */ - sys_close(0); - fd_install(0, sub_info->stdin); - spin_lock(&f->file_lock); - fdt = files_fdtable(f); - FD_SET(0, fdt->open_fds); - FD_CLR(0, fdt->close_on_exec); - spin_unlock(&f->file_lock); - - /* and disallow core files too */ - current->signal->rlim[RLIMIT_CORE] = (struct rlimit){0, 0}; - } - /* We can run anywhere, unlike our parent keventd(). */ set_cpus_allowed_ptr(current, cpu_all_mask); @@ -184,9 +147,16 @@ static int ____call_usermodehelper(void *data) */ set_user_nice(current, 0); + if (sub_info->init) { + retval = sub_info->init(sub_info); + if (retval) + goto fail; + } + retval = kernel_execve(sub_info->path, sub_info->argv, sub_info->envp); /* Exec failed? */ +fail: sub_info->retval = retval; do_exit(0); } @@ -194,9 +164,7 @@ static int ____call_usermodehelper(void *data) void call_usermodehelper_freeinfo(struct subprocess_info *info) { if (info->cleanup) - (*info->cleanup)(info->argv, info->envp); - if (info->cred) - put_cred(info->cred); + (*info->cleanup)(info); kfree(info); } EXPORT_SYMBOL(call_usermodehelper_freeinfo); @@ -207,16 +175,16 @@ static int wait_for_helper(void *data) struct subprocess_info *sub_info = data; pid_t pid; - /* Install a handler: if SIGCLD isn't handled sys_wait4 won't - * populate the status, but will return -ECHILD. */ - allow_signal(SIGCHLD); + /* If SIGCLD is ignored sys_wait4 won't populate the status. */ + spin_lock_irq(¤t->sighand->siglock); + current->sighand->action[SIGCHLD-1].sa.sa_handler = SIG_DFL; + spin_unlock_irq(¤t->sighand->siglock); pid = kernel_thread(____call_usermodehelper, sub_info, SIGCHLD); if (pid < 0) { sub_info->retval = pid; } else { - int ret; - + int ret = -ECHILD; /* * Normally it is bogus to call wait4() from in-kernel because * wait4() wants to write the exit code to a userspace address. @@ -237,10 +205,7 @@ static int wait_for_helper(void *data) sub_info->retval = ret; } - if (sub_info->wait == UMH_NO_WAIT) - call_usermodehelper_freeinfo(sub_info); - else - complete(sub_info->complete); + complete(sub_info->complete); return 0; } @@ -249,15 +214,13 @@ static void __call_usermodehelper(struct work_struct *work) { struct subprocess_info *sub_info = container_of(work, struct subprocess_info, work); - pid_t pid; enum umh_wait wait = sub_info->wait; - - BUG_ON(atomic_read(&sub_info->cred->usage) != 1); + pid_t pid; /* CLONE_VFORK: wait until the usermode helper has execve'd * successfully We need the data structures to stay around * until that is done. */ - if (wait == UMH_WAIT_PROC || wait == UMH_NO_WAIT) + if (wait == UMH_WAIT_PROC) pid = kernel_thread(wait_for_helper, sub_info, CLONE_FS | CLONE_FILES | SIGCHLD); else @@ -266,15 +229,16 @@ static void __call_usermodehelper(struct work_struct *work) switch (wait) { case UMH_NO_WAIT: + call_usermodehelper_freeinfo(sub_info); break; case UMH_WAIT_PROC: if (pid > 0) break; - sub_info->retval = pid; /* FALLTHROUGH */ - case UMH_WAIT_EXEC: + if (pid < 0) + sub_info->retval = pid; complete(sub_info->complete); } } @@ -376,80 +340,37 @@ struct subprocess_info *call_usermodehelper_setup(char *path, char **argv, sub_info->path = path; sub_info->argv = argv; sub_info->envp = envp; - sub_info->cred = prepare_usermodehelper_creds(); - if (!sub_info->cred) { - kfree(sub_info); - return NULL; - } - out: return sub_info; } EXPORT_SYMBOL(call_usermodehelper_setup); /** - * call_usermodehelper_setkeys - set the session keys for usermode helper - * @info: a subprocess_info returned by call_usermodehelper_setup - * @session_keyring: the session keyring for the process - */ -void call_usermodehelper_setkeys(struct subprocess_info *info, - struct key *session_keyring) -{ -#ifdef CONFIG_KEYS - struct thread_group_cred *tgcred = info->cred->tgcred; - key_put(tgcred->session_keyring); - tgcred->session_keyring = key_get(session_keyring); -#else - BUG(); -#endif -} -EXPORT_SYMBOL(call_usermodehelper_setkeys); - -/** - * call_usermodehelper_setcleanup - set a cleanup function + * call_usermodehelper_setfns - set a cleanup/init function * @info: a subprocess_info returned by call_usermodehelper_setup * @cleanup: a cleanup function + * @init: an init function + * @data: arbitrary context sensitive data * - * The cleanup function is just befor ethe subprocess_info is about to + * The init function is used to customize the helper process prior to + * exec. A non-zero return code causes the process to error out, exit, + * and return the failure to the calling process + * + * The cleanup function is just before ethe subprocess_info is about to * be freed. This can be used for freeing the argv and envp. The * Function must be runnable in either a process context or the * context in which call_usermodehelper_exec is called. */ -void call_usermodehelper_setcleanup(struct subprocess_info *info, - void (*cleanup)(char **argv, char **envp)) +void call_usermodehelper_setfns(struct subprocess_info *info, + int (*init)(struct subprocess_info *info), + void (*cleanup)(struct subprocess_info *info), + void *data) { info->cleanup = cleanup; + info->init = init; + info->data = data; } -EXPORT_SYMBOL(call_usermodehelper_setcleanup); - -/** - * call_usermodehelper_stdinpipe - set up a pipe to be used for stdin - * @sub_info: a subprocess_info returned by call_usermodehelper_setup - * @filp: set to the write-end of a pipe - * - * This constructs a pipe, and sets the read end to be the stdin of the - * subprocess, and returns the write-end in *@filp. - */ -int call_usermodehelper_stdinpipe(struct subprocess_info *sub_info, - struct file **filp) -{ - struct file *f; - - f = create_write_pipe(0); - if (IS_ERR(f)) - return PTR_ERR(f); - *filp = f; - - f = create_read_pipe(f, 0); - if (IS_ERR(f)) { - free_write_pipe(*filp); - return PTR_ERR(f); - } - sub_info->stdin = f; - - return 0; -} -EXPORT_SYMBOL(call_usermodehelper_stdinpipe); +EXPORT_SYMBOL(call_usermodehelper_setfns); /** * call_usermodehelper_exec - start a usermode application @@ -469,9 +390,6 @@ int call_usermodehelper_exec(struct subprocess_info *sub_info, DECLARE_COMPLETION_ONSTACK(done); int retval = 0; - BUG_ON(atomic_read(&sub_info->cred->usage) != 1); - validate_creds(sub_info->cred); - helper_lock(); if (sub_info->path[0] == '\0') goto out; @@ -498,41 +416,6 @@ unlock: } EXPORT_SYMBOL(call_usermodehelper_exec); -/** - * call_usermodehelper_pipe - call a usermode helper process with a pipe stdin - * @path: path to usermode executable - * @argv: arg vector for process - * @envp: environment for process - * @filp: set to the write-end of a pipe - * - * This is a simple wrapper which executes a usermode-helper function - * with a pipe as stdin. It is implemented entirely in terms of - * lower-level call_usermodehelper_* functions. - */ -int call_usermodehelper_pipe(char *path, char **argv, char **envp, - struct file **filp) -{ - struct subprocess_info *sub_info; - int ret; - - sub_info = call_usermodehelper_setup(path, argv, envp, GFP_KERNEL); - if (sub_info == NULL) - return -ENOMEM; - - ret = call_usermodehelper_stdinpipe(sub_info, filp); - if (ret < 0) { - call_usermodehelper_freeinfo(sub_info); - return ret; - } - - ret = call_usermodehelper_exec(sub_info, UMH_WAIT_EXEC); - if (ret < 0) /* Failed to execute helper, close pipe */ - filp_close(*filp, NULL); - - return ret; -} -EXPORT_SYMBOL(call_usermodehelper_pipe); - void __init usermodehelper_init(void) { khelper_wq = create_singlethread_workqueue("khelper"); diff --git a/kernel/module.c b/kernel/module.c index 333fbcc96978..6c562828c85c 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -72,7 +72,11 @@ /* If this is set, the section belongs in the init part of the module */ #define INIT_OFFSET_MASK (1UL << (BITS_PER_LONG-1)) -/* List of modules, protected by module_mutex or preempt_disable +/* + * Mutex protects: + * 1) List of modules (also safely readable with preempt_disable), + * 2) module_use links, + * 3) module_addr_min/module_addr_max. * (delete uses stop_machine/add uses RCU list operations). */ DEFINE_MUTEX(module_mutex); EXPORT_SYMBOL_GPL(module_mutex); @@ -90,7 +94,8 @@ static DECLARE_WAIT_QUEUE_HEAD(module_wq); static BLOCKING_NOTIFIER_HEAD(module_notify_list); -/* Bounds of module allocation, for speeding __module_address */ +/* Bounds of module allocation, for speeding __module_address. + * Protected by module_mutex. */ static unsigned long module_addr_min = -1UL, module_addr_max = 0; int register_module_notifier(struct notifier_block * nb) @@ -329,7 +334,7 @@ static bool find_symbol_in_section(const struct symsearch *syms, } /* Find a symbol and return it, along with, (optional) crc and - * (optional) module which owns it */ + * (optional) module which owns it. Needs preempt disabled or module_mutex. */ const struct kernel_symbol *find_symbol(const char *name, struct module **owner, const unsigned long **crc, @@ -403,7 +408,7 @@ static unsigned int find_pcpusec(Elf_Ehdr *hdr, Elf_Shdr *sechdrs, const char *secstrings) { - return find_sec(hdr, sechdrs, secstrings, ".data.percpu"); + return find_sec(hdr, sechdrs, secstrings, ".data..percpu"); } static void percpu_modcopy(struct module *mod, @@ -523,7 +528,8 @@ static void module_unload_init(struct module *mod) { int cpu; - INIT_LIST_HEAD(&mod->modules_which_use_me); + INIT_LIST_HEAD(&mod->source_list); + INIT_LIST_HEAD(&mod->target_list); for_each_possible_cpu(cpu) { per_cpu_ptr(mod->refptr, cpu)->incs = 0; per_cpu_ptr(mod->refptr, cpu)->decs = 0; @@ -535,20 +541,13 @@ static void module_unload_init(struct module *mod) mod->waiter = current; } -/* modules using other modules */ -struct module_use -{ - struct list_head list; - struct module *module_which_uses; -}; - /* Does a already use b? */ static int already_uses(struct module *a, struct module *b) { struct module_use *use; - list_for_each_entry(use, &b->modules_which_use_me, list) { - if (use->module_which_uses == a) { + list_for_each_entry(use, &b->source_list, source_list) { + if (use->source == a) { DEBUGP("%s uses %s!\n", a->name, b->name); return 1; } @@ -557,62 +556,68 @@ static int already_uses(struct module *a, struct module *b) return 0; } -/* Module a uses b */ -int use_module(struct module *a, struct module *b) +/* + * Module a uses b + * - we add 'a' as a "source", 'b' as a "target" of module use + * - the module_use is added to the list of 'b' sources (so + * 'b' can walk the list to see who sourced them), and of 'a' + * targets (so 'a' can see what modules it targets). + */ +static int add_module_usage(struct module *a, struct module *b) { struct module_use *use; - int no_warn, err; - if (b == NULL || already_uses(a, b)) return 1; + DEBUGP("Allocating new usage for %s.\n", a->name); + use = kmalloc(sizeof(*use), GFP_ATOMIC); + if (!use) { + printk(KERN_WARNING "%s: out of memory loading\n", a->name); + return -ENOMEM; + } - /* If we're interrupted or time out, we fail. */ - if (wait_event_interruptible_timeout( - module_wq, (err = strong_try_module_get(b)) != -EBUSY, - 30 * HZ) <= 0) { - printk("%s: gave up waiting for init of module %s.\n", - a->name, b->name); + use->source = a; + use->target = b; + list_add(&use->source_list, &b->source_list); + list_add(&use->target_list, &a->target_list); + return 0; +} + +/* Module a uses b: caller needs module_mutex() */ +int ref_module(struct module *a, struct module *b) +{ + int err; + + if (b == NULL || already_uses(a, b)) return 0; - } - /* If strong_try_module_get() returned a different error, we fail. */ + /* If module isn't available, we fail. */ + err = strong_try_module_get(b); if (err) - return 0; + return err; - DEBUGP("Allocating new usage for %s.\n", a->name); - use = kmalloc(sizeof(*use), GFP_ATOMIC); - if (!use) { - printk("%s: out of memory loading\n", a->name); + err = add_module_usage(a, b); + if (err) { module_put(b); - return 0; + return err; } - - use->module_which_uses = a; - list_add(&use->list, &b->modules_which_use_me); - no_warn = sysfs_create_link(b->holders_dir, &a->mkobj.kobj, a->name); - return 1; + return 0; } -EXPORT_SYMBOL_GPL(use_module); +EXPORT_SYMBOL_GPL(ref_module); /* Clear the unload stuff of the module. */ static void module_unload_free(struct module *mod) { - struct module *i; + struct module_use *use, *tmp; - list_for_each_entry(i, &modules, list) { - struct module_use *use; - - list_for_each_entry(use, &i->modules_which_use_me, list) { - if (use->module_which_uses == mod) { - DEBUGP("%s unusing %s\n", mod->name, i->name); - module_put(i); - list_del(&use->list); - kfree(use); - sysfs_remove_link(i->holders_dir, mod->name); - /* There can be at most one match. */ - break; - } - } + mutex_lock(&module_mutex); + list_for_each_entry_safe(use, tmp, &mod->target_list, target_list) { + struct module *i = use->target; + DEBUGP("%s unusing %s\n", mod->name, i->name); + module_put(i); + list_del(&use->source_list); + list_del(&use->target_list); + kfree(use); } + mutex_unlock(&module_mutex); } #ifdef CONFIG_MODULE_FORCE_UNLOAD @@ -735,7 +740,7 @@ SYSCALL_DEFINE2(delete_module, const char __user *, name_user, goto out; } - if (!list_empty(&mod->modules_which_use_me)) { + if (!list_empty(&mod->source_list)) { /* Other modules depend on us: get rid of them first. */ ret = -EWOULDBLOCK; goto out; @@ -779,13 +784,13 @@ SYSCALL_DEFINE2(delete_module, const char __user *, name_user, blocking_notifier_call_chain(&module_notify_list, MODULE_STATE_GOING, mod); async_synchronize_full(); - mutex_lock(&module_mutex); + /* Store the name of the last unloaded module for diagnostic purposes */ strlcpy(last_unloaded_module, mod->name, sizeof(last_unloaded_module)); - ddebug_remove_module(mod->name); - free_module(mod); - out: + free_module(mod); + return 0; +out: mutex_unlock(&module_mutex); return ret; } @@ -799,9 +804,9 @@ static inline void print_unload_info(struct seq_file *m, struct module *mod) /* Always include a trailing , so userspace can differentiate between this and the old multi-field proc format. */ - list_for_each_entry(use, &mod->modules_which_use_me, list) { + list_for_each_entry(use, &mod->source_list, source_list) { printed_something = 1; - seq_printf(m, "%s,", use->module_which_uses->name); + seq_printf(m, "%s,", use->source->name); } if (mod->init != NULL && mod->exit == NULL) { @@ -880,11 +885,11 @@ static inline void module_unload_free(struct module *mod) { } -int use_module(struct module *a, struct module *b) +int ref_module(struct module *a, struct module *b) { - return strong_try_module_get(b) == 0; + return strong_try_module_get(b); } -EXPORT_SYMBOL_GPL(use_module); +EXPORT_SYMBOL_GPL(ref_module); static inline void module_unload_init(struct module *mod) { @@ -1001,6 +1006,8 @@ static inline int check_modstruct_version(Elf_Shdr *sechdrs, { const unsigned long *crc; + /* Since this should be found in kernel (which can't be removed), + * no locking is necessary. */ if (!find_symbol(MODULE_SYMBOL_PREFIX "module_layout", NULL, &crc, true, false)) BUG(); @@ -1043,29 +1050,62 @@ static inline int same_magic(const char *amagic, const char *bmagic, } #endif /* CONFIG_MODVERSIONS */ -/* Resolve a symbol for this module. I.e. if we find one, record usage. - Must be holding module_mutex. */ +/* Resolve a symbol for this module. I.e. if we find one, record usage. */ static const struct kernel_symbol *resolve_symbol(Elf_Shdr *sechdrs, unsigned int versindex, const char *name, - struct module *mod) + struct module *mod, + char ownername[]) { struct module *owner; const struct kernel_symbol *sym; const unsigned long *crc; + int err; + mutex_lock(&module_mutex); sym = find_symbol(name, &owner, &crc, !(mod->taints & (1 << TAINT_PROPRIETARY_MODULE)), true); - /* use_module can fail due to OOM, - or module initialization or unloading */ - if (sym) { - if (!check_version(sechdrs, versindex, name, mod, crc, owner) - || !use_module(mod, owner)) - sym = NULL; + if (!sym) + goto unlock; + + if (!check_version(sechdrs, versindex, name, mod, crc, owner)) { + sym = ERR_PTR(-EINVAL); + goto getname; } + + err = ref_module(mod, owner); + if (err) { + sym = ERR_PTR(err); + goto getname; + } + +getname: + /* We must make copy under the lock if we failed to get ref. */ + strncpy(ownername, module_name(owner), MODULE_NAME_LEN); +unlock: + mutex_unlock(&module_mutex); return sym; } +static const struct kernel_symbol *resolve_symbol_wait(Elf_Shdr *sechdrs, + unsigned int versindex, + const char *name, + struct module *mod) +{ + const struct kernel_symbol *ksym; + char ownername[MODULE_NAME_LEN]; + + if (wait_event_interruptible_timeout(module_wq, + !IS_ERR(ksym = resolve_symbol(sechdrs, versindex, name, + mod, ownername)) || + PTR_ERR(ksym) != -EBUSY, + 30 * HZ) <= 0) { + printk(KERN_WARNING "%s: gave up waiting for init of module %s.\n", + mod->name, ownername); + } + return ksym; +} + /* * /sys/module/foo/sections stuff * J. Corbet <corbet@lwn.net> @@ -1295,7 +1335,34 @@ static inline void remove_notes_attrs(struct module *mod) #endif #ifdef CONFIG_SYSFS -int module_add_modinfo_attrs(struct module *mod) +static void add_usage_links(struct module *mod) +{ +#ifdef CONFIG_MODULE_UNLOAD + struct module_use *use; + int nowarn; + + mutex_lock(&module_mutex); + list_for_each_entry(use, &mod->target_list, target_list) { + nowarn = sysfs_create_link(use->target->holders_dir, + &mod->mkobj.kobj, mod->name); + } + mutex_unlock(&module_mutex); +#endif +} + +static void del_usage_links(struct module *mod) +{ +#ifdef CONFIG_MODULE_UNLOAD + struct module_use *use; + + mutex_lock(&module_mutex); + list_for_each_entry(use, &mod->target_list, target_list) + sysfs_remove_link(use->target->holders_dir, mod->name); + mutex_unlock(&module_mutex); +#endif +} + +static int module_add_modinfo_attrs(struct module *mod) { struct module_attribute *attr; struct module_attribute *temp_attr; @@ -1321,7 +1388,7 @@ int module_add_modinfo_attrs(struct module *mod) return error; } -void module_remove_modinfo_attrs(struct module *mod) +static void module_remove_modinfo_attrs(struct module *mod) { struct module_attribute *attr; int i; @@ -1337,7 +1404,7 @@ void module_remove_modinfo_attrs(struct module *mod) kfree(mod->modinfo_attrs); } -int mod_sysfs_init(struct module *mod) +static int mod_sysfs_init(struct module *mod) { int err; struct kobject *kobj; @@ -1371,12 +1438,16 @@ out: return err; } -int mod_sysfs_setup(struct module *mod, +static int mod_sysfs_setup(struct module *mod, struct kernel_param *kparam, unsigned int num_params) { int err; + err = mod_sysfs_init(mod); + if (err) + goto out; + mod->holders_dir = kobject_create_and_add("holders", &mod->mkobj.kobj); if (!mod->holders_dir) { err = -ENOMEM; @@ -1391,6 +1462,8 @@ int mod_sysfs_setup(struct module *mod, if (err) goto out_unreg_param; + add_usage_links(mod); + kobject_uevent(&mod->mkobj.kobj, KOBJ_ADD); return 0; @@ -1400,6 +1473,7 @@ out_unreg_holders: kobject_put(mod->holders_dir); out_unreg: kobject_put(&mod->mkobj.kobj); +out: return err; } @@ -1410,14 +1484,40 @@ static void mod_sysfs_fini(struct module *mod) #else /* CONFIG_SYSFS */ +static inline int mod_sysfs_init(struct module *mod) +{ + return 0; +} + +static inline int mod_sysfs_setup(struct module *mod, + struct kernel_param *kparam, + unsigned int num_params) +{ + return 0; +} + +static inline int module_add_modinfo_attrs(struct module *mod) +{ + return 0; +} + +static inline void module_remove_modinfo_attrs(struct module *mod) +{ +} + static void mod_sysfs_fini(struct module *mod) { } +static void del_usage_links(struct module *mod) +{ +} + #endif /* CONFIG_SYSFS */ static void mod_kobject_remove(struct module *mod) { + del_usage_links(mod); module_remove_modinfo_attrs(mod); module_param_sysfs_remove(mod); kobject_put(mod->mkobj.drivers_dir); @@ -1436,17 +1536,22 @@ static int __unlink_module(void *_mod) return 0; } -/* Free a module, remove from lists, etc (must hold module_mutex). */ +/* Free a module, remove from lists, etc. */ static void free_module(struct module *mod) { trace_module_free(mod); /* Delete from various lists */ + mutex_lock(&module_mutex); stop_machine(__unlink_module, mod, NULL); + mutex_unlock(&module_mutex); remove_notes_attrs(mod); remove_sect_attrs(mod); mod_kobject_remove(mod); + /* Remove dynamic debug info */ + ddebug_remove_module(mod->name); + /* Arch-specific cleanup. */ module_arch_cleanup(mod); @@ -1493,6 +1598,8 @@ EXPORT_SYMBOL_GPL(__symbol_get); /* * Ensure that an exported symbol [global namespace] does not already exist * in the kernel or in some other module's exported symbol table. + * + * You must hold the module_mutex. */ static int verify_export_symbols(struct module *mod) { @@ -1558,21 +1665,23 @@ static int simplify_symbols(Elf_Shdr *sechdrs, break; case SHN_UNDEF: - ksym = resolve_symbol(sechdrs, versindex, - strtab + sym[i].st_name, mod); + ksym = resolve_symbol_wait(sechdrs, versindex, + strtab + sym[i].st_name, + mod); /* Ok if resolved. */ - if (ksym) { + if (ksym && !IS_ERR(ksym)) { sym[i].st_value = ksym->value; break; } /* Ok if weak. */ - if (ELF_ST_BIND(sym[i].st_info) == STB_WEAK) + if (!ksym && ELF_ST_BIND(sym[i].st_info) == STB_WEAK) break; - printk(KERN_WARNING "%s: Unknown symbol %s\n", - mod->name, strtab + sym[i].st_name); - ret = -ENOENT; + printk(KERN_WARNING "%s: Unknown symbol %s (err %li)\n", + mod->name, strtab + sym[i].st_name, + PTR_ERR(ksym)); + ret = PTR_ERR(ksym) ?: -ENOENT; break; default: @@ -1955,16 +2064,24 @@ static void dynamic_debug_setup(struct _ddebug *debug, unsigned int num) #endif } +static void dynamic_debug_remove(struct _ddebug *debug) +{ + if (debug) + ddebug_remove_module(debug->modname); +} + static void *module_alloc_update_bounds(unsigned long size) { void *ret = module_alloc(size); if (ret) { + mutex_lock(&module_mutex); /* Update module bounds. */ if ((unsigned long)ret < module_addr_min) module_addr_min = (unsigned long)ret; if ((unsigned long)ret + size > module_addr_max) module_addr_max = (unsigned long)ret + size; + mutex_unlock(&module_mutex); } return ret; } @@ -2014,6 +2131,9 @@ static noinline struct module *load_module(void __user *umod, long err = 0; void *ptr = NULL; /* Stops spurious gcc warning */ unsigned long symoffs, stroffs, *strmap; + void __percpu *percpu; + struct _ddebug *debug = NULL; + unsigned int num_debug = 0; mm_segment_t old_fs; @@ -2138,11 +2258,6 @@ static noinline struct module *load_module(void __user *umod, goto free_mod; } - if (find_module(mod->name)) { - err = -EEXIST; - goto free_mod; - } - mod->state = MODULE_STATE_COMING; /* Allow arches to frob section contents and sizes. */ @@ -2158,6 +2273,8 @@ static noinline struct module *load_module(void __user *umod, goto free_mod; sechdrs[pcpuindex].sh_flags &= ~(unsigned long)SHF_ALLOC; } + /* Keep this around for failure path. */ + percpu = mod_percpu(mod); /* Determine total sizes, and put offsets in sh_entsize. For now this is done generically; there doesn't appear to be any @@ -2231,11 +2348,6 @@ static noinline struct module *load_module(void __user *umod, /* Now we've moved module, initialize linked lists, etc. */ module_unload_init(mod); - /* add kobject, so we can reference it. */ - err = mod_sysfs_init(mod); - if (err) - goto free_unload; - /* Set up license info based on the info section */ set_license(mod, get_modinfo(sechdrs, infoindex, "license")); @@ -2360,11 +2472,6 @@ static noinline struct module *load_module(void __user *umod, goto cleanup; } - /* Find duplicate symbols */ - err = verify_export_symbols(mod); - if (err < 0) - goto cleanup; - /* Set up and sort exception table */ mod->extable = section_objs(hdr, sechdrs, secstrings, "__ex_table", sizeof(*mod->extable), &mod->num_exentries); @@ -2379,15 +2486,9 @@ static noinline struct module *load_module(void __user *umod, kfree(strmap); strmap = NULL; - if (!mod->taints) { - struct _ddebug *debug; - unsigned int num_debug; - + if (!mod->taints) debug = section_objs(hdr, sechdrs, secstrings, "__verbose", sizeof(*debug), &num_debug); - if (debug) - dynamic_debug_setup(debug, num_debug); - } err = module_finalize(hdr, sechdrs, mod); if (err < 0) @@ -2423,7 +2524,22 @@ static noinline struct module *load_module(void __user *umod, * function to insert in a way safe to concurrent readers. * The mutex protects against concurrent writers. */ + mutex_lock(&module_mutex); + if (find_module(mod->name)) { + err = -EEXIST; + goto unlock; + } + + if (debug) + dynamic_debug_setup(debug, num_debug); + + /* Find duplicate symbols */ + err = verify_export_symbols(mod); + if (err < 0) + goto ddebug; + list_add_rcu(&mod->list, &modules); + mutex_unlock(&module_mutex); err = parse_args(mod->name, mod->args, mod->kp, mod->num_kp, NULL); if (err < 0) @@ -2432,6 +2548,7 @@ static noinline struct module *load_module(void __user *umod, err = mod_sysfs_setup(mod, mod->kp, mod->num_kp); if (err < 0) goto unlink; + add_sect_attrs(mod, hdr->e_shnum, secstrings, sechdrs); add_notes_attrs(mod, hdr->e_shnum, secstrings, sechdrs); @@ -2444,15 +2561,17 @@ static noinline struct module *load_module(void __user *umod, return mod; unlink: + mutex_lock(&module_mutex); /* Unlink carefully: kallsyms could be walking list. */ list_del_rcu(&mod->list); + ddebug: + dynamic_debug_remove(debug); + unlock: + mutex_unlock(&module_mutex); synchronize_sched(); module_arch_cleanup(mod); cleanup: free_modinfo(mod); - kobject_del(&mod->mkobj.kobj); - kobject_put(&mod->mkobj.kobj); - free_unload: module_unload_free(mod); #if defined(CONFIG_MODULE_UNLOAD) free_percpu(mod->refptr); @@ -2463,7 +2582,7 @@ static noinline struct module *load_module(void __user *umod, module_free(mod, mod->module_core); /* mod will be freed with core. Don't access it beyond this line! */ free_percpu: - percpu_modfree(mod); + free_percpu(percpu); free_mod: kfree(args); kfree(strmap); @@ -2499,19 +2618,10 @@ SYSCALL_DEFINE3(init_module, void __user *, umod, if (!capable(CAP_SYS_MODULE) || modules_disabled) return -EPERM; - /* Only one module load at a time, please */ - if (mutex_lock_interruptible(&module_mutex) != 0) - return -EINTR; - /* Do all the hard work */ mod = load_module(umod, len, uargs); - if (IS_ERR(mod)) { - mutex_unlock(&module_mutex); + if (IS_ERR(mod)) return PTR_ERR(mod); - } - - /* Drop lock so they can recurse */ - mutex_unlock(&module_mutex); blocking_notifier_call_chain(&module_notify_list, MODULE_STATE_COMING, mod); @@ -2528,9 +2638,7 @@ SYSCALL_DEFINE3(init_module, void __user *, umod, module_put(mod); blocking_notifier_call_chain(&module_notify_list, MODULE_STATE_GOING, mod); - mutex_lock(&module_mutex); free_module(mod); - mutex_unlock(&module_mutex); wake_up(&module_wq); return ret; } diff --git a/kernel/mutex.c b/kernel/mutex.c index 632f04c57d82..4c0b7b3e6d2e 100644 --- a/kernel/mutex.c +++ b/kernel/mutex.c @@ -172,6 +172,13 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass, struct thread_info *owner; /* + * If we own the BKL, then don't spin. The owner of + * the mutex might be waiting on us to release the BKL. + */ + if (unlikely(current->lock_depth >= 0)) + break; + + /* * If there's an owner, wait for it to either * release the lock or go to sleep. */ diff --git a/kernel/padata.c b/kernel/padata.c index fd4679266ede..751019415d23 100644 --- a/kernel/padata.c +++ b/kernel/padata.c @@ -864,7 +864,7 @@ static int padata_cpu_callback(struct notifier_block *nfb, err = __padata_add_cpu(pinst, cpu); mutex_unlock(&pinst->lock); if (err) - return NOTIFY_BAD; + return notifier_from_errno(err); break; case CPU_DOWN_PREPARE: @@ -875,7 +875,7 @@ static int padata_cpu_callback(struct notifier_block *nfb, err = __padata_remove_cpu(pinst, cpu); mutex_unlock(&pinst->lock); if (err) - return NOTIFY_BAD; + return notifier_from_errno(err); break; case CPU_UP_CANCELED: diff --git a/kernel/panic.c b/kernel/panic.c index dbe13dbb057a..3b16cd93fa7d 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -87,6 +87,7 @@ NORET_TYPE void panic(const char * fmt, ...) */ preempt_disable(); + console_verbose(); bust_spinlocks(1); va_start(args, fmt); vsnprintf(buf, sizeof(buf), fmt, args); diff --git a/kernel/perf_event.c b/kernel/perf_event.c index a4fa381db3c2..ff86c558af4c 100644 --- a/kernel/perf_event.c +++ b/kernel/perf_event.c @@ -283,14 +283,15 @@ ctx_group_list(struct perf_event *event, struct perf_event_context *ctx) static void list_add_event(struct perf_event *event, struct perf_event_context *ctx) { - struct perf_event *group_leader = event->group_leader; + WARN_ON_ONCE(event->attach_state & PERF_ATTACH_CONTEXT); + event->attach_state |= PERF_ATTACH_CONTEXT; /* - * Depending on whether it is a standalone or sibling event, - * add it straight to the context's event list, or to the group - * leader's sibling list: + * If we're a stand alone event or group leader, we go to the context + * list, group events are kept attached to the group so that + * perf_group_detach can, at all times, locate all siblings. */ - if (group_leader == event) { + if (event->group_leader == event) { struct list_head *list; if (is_software_event(event)) @@ -298,13 +299,6 @@ list_add_event(struct perf_event *event, struct perf_event_context *ctx) list = ctx_group_list(event, ctx); list_add_tail(&event->group_entry, list); - } else { - if (group_leader->group_flags & PERF_GROUP_SOFTWARE && - !is_software_event(event)) - group_leader->group_flags &= ~PERF_GROUP_SOFTWARE; - - list_add_tail(&event->group_entry, &group_leader->sibling_list); - group_leader->nr_siblings++; } list_add_rcu(&event->event_entry, &ctx->event_list); @@ -313,6 +307,24 @@ list_add_event(struct perf_event *event, struct perf_event_context *ctx) ctx->nr_stat++; } +static void perf_group_attach(struct perf_event *event) +{ + struct perf_event *group_leader = event->group_leader; + + WARN_ON_ONCE(event->attach_state & PERF_ATTACH_GROUP); + event->attach_state |= PERF_ATTACH_GROUP; + + if (group_leader == event) + return; + + if (group_leader->group_flags & PERF_GROUP_SOFTWARE && + !is_software_event(event)) + group_leader->group_flags &= ~PERF_GROUP_SOFTWARE; + + list_add_tail(&event->group_entry, &group_leader->sibling_list); + group_leader->nr_siblings++; +} + /* * Remove a event from the lists for its context. * Must be called with ctx->mutex and ctx->lock held. @@ -320,17 +332,22 @@ list_add_event(struct perf_event *event, struct perf_event_context *ctx) static void list_del_event(struct perf_event *event, struct perf_event_context *ctx) { - if (list_empty(&event->group_entry)) + /* + * We can have double detach due to exit/hot-unplug + close. + */ + if (!(event->attach_state & PERF_ATTACH_CONTEXT)) return; + + event->attach_state &= ~PERF_ATTACH_CONTEXT; + ctx->nr_events--; if (event->attr.inherit_stat) ctx->nr_stat--; - list_del_init(&event->group_entry); list_del_rcu(&event->event_entry); - if (event->group_leader != event) - event->group_leader->nr_siblings--; + if (event->group_leader == event) + list_del_init(&event->group_entry); update_group_times(event); @@ -345,21 +362,39 @@ list_del_event(struct perf_event *event, struct perf_event_context *ctx) event->state = PERF_EVENT_STATE_OFF; } -static void -perf_destroy_group(struct perf_event *event, struct perf_event_context *ctx) +static void perf_group_detach(struct perf_event *event) { struct perf_event *sibling, *tmp; + struct list_head *list = NULL; + + /* + * We can have double detach due to exit/hot-unplug + close. + */ + if (!(event->attach_state & PERF_ATTACH_GROUP)) + return; + + event->attach_state &= ~PERF_ATTACH_GROUP; + + /* + * If this is a sibling, remove it from its group. + */ + if (event->group_leader != event) { + list_del_init(&event->group_entry); + event->group_leader->nr_siblings--; + return; + } + + if (!list_empty(&event->group_entry)) + list = &event->group_entry; /* * If this was a group event with sibling events then * upgrade the siblings to singleton events by adding them - * to the context list directly: + * to whatever list we are on. */ list_for_each_entry_safe(sibling, tmp, &event->sibling_list, group_entry) { - struct list_head *list; - - list = ctx_group_list(event, ctx); - list_move_tail(&sibling->group_entry, list); + if (list) + list_move_tail(&sibling->group_entry, list); sibling->group_leader = sibling; /* Inherit group flags from the previous leader */ @@ -652,8 +687,11 @@ group_sched_in(struct perf_event *group_event, if (txn) pmu->start_txn(pmu); - if (event_sched_in(group_event, cpuctx, ctx)) + if (event_sched_in(group_event, cpuctx, ctx)) { + if (txn) + pmu->cancel_txn(pmu); return -EAGAIN; + } /* * Schedule in siblings as one group (if any): @@ -675,9 +713,6 @@ group_sched_in(struct perf_event *group_event, } group_error: - if (txn) - pmu->cancel_txn(pmu); - /* * Groups can be scheduled in as one unit only, so undo any * partial group before returning: @@ -689,6 +724,9 @@ group_error: } event_sched_out(group_event, cpuctx, ctx); + if (txn) + pmu->cancel_txn(pmu); + return -EAGAIN; } @@ -727,6 +765,7 @@ static void add_event_to_ctx(struct perf_event *event, struct perf_event_context *ctx) { list_add_event(event, ctx); + perf_group_attach(event); event->tstamp_enabled = ctx->time; event->tstamp_running = ctx->time; event->tstamp_stopped = ctx->time; @@ -1468,6 +1507,9 @@ do { \ divisor = nsec * frequency; } + if (!divisor) + return dividend; + return div64_u64(dividend, divisor); } @@ -1490,7 +1532,7 @@ static int perf_event_start(struct perf_event *event) static void perf_adjust_period(struct perf_event *event, u64 nsec, u64 count) { struct hw_perf_event *hwc = &event->hw; - u64 period, sample_period; + s64 period, sample_period; s64 delta; period = perf_calculate_period(event, nsec, count); @@ -1841,6 +1883,7 @@ static void free_event_rcu(struct rcu_head *head) } static void perf_pending_sync(struct perf_event *event); +static void perf_mmap_data_put(struct perf_mmap_data *data); static void free_event(struct perf_event *event) { @@ -1856,9 +1899,9 @@ static void free_event(struct perf_event *event) atomic_dec(&nr_task_events); } - if (event->output) { - fput(event->output->filp); - event->output = NULL; + if (event->data) { + perf_mmap_data_put(event->data); + event->data = NULL; } if (event->destroy) @@ -1893,8 +1936,8 @@ int perf_event_release_kernel(struct perf_event *event) */ mutex_lock_nested(&ctx->mutex, SINGLE_DEPTH_NESTING); raw_spin_lock_irq(&ctx->lock); + perf_group_detach(event); list_del_event(event, ctx); - perf_destroy_group(event, ctx); raw_spin_unlock_irq(&ctx->lock); mutex_unlock(&ctx->mutex); @@ -2175,7 +2218,27 @@ unlock: return ret; } -static int perf_event_set_output(struct perf_event *event, int output_fd); +static const struct file_operations perf_fops; + +static struct perf_event *perf_fget_light(int fd, int *fput_needed) +{ + struct file *file; + + file = fget_light(fd, fput_needed); + if (!file) + return ERR_PTR(-EBADF); + + if (file->f_op != &perf_fops) { + fput_light(file, *fput_needed); + *fput_needed = 0; + return ERR_PTR(-EBADF); + } + + return file->private_data; +} + +static int perf_event_set_output(struct perf_event *event, + struct perf_event *output_event); static int perf_event_set_filter(struct perf_event *event, void __user *arg); static long perf_ioctl(struct file *file, unsigned int cmd, unsigned long arg) @@ -2202,7 +2265,23 @@ static long perf_ioctl(struct file *file, unsigned int cmd, unsigned long arg) return perf_event_period(event, (u64 __user *)arg); case PERF_EVENT_IOC_SET_OUTPUT: - return perf_event_set_output(event, arg); + { + struct perf_event *output_event = NULL; + int fput_needed = 0; + int ret; + + if (arg != -1) { + output_event = perf_fget_light(arg, &fput_needed); + if (IS_ERR(output_event)) + return PTR_ERR(output_event); + } + + ret = perf_event_set_output(event, output_event); + if (output_event) + fput_light(output_event->filp, fput_needed); + + return ret; + } case PERF_EVENT_IOC_SET_FILTER: return perf_event_set_filter(event, (void __user *)arg); @@ -2297,11 +2376,6 @@ unlock: rcu_read_unlock(); } -static unsigned long perf_data_size(struct perf_mmap_data *data) -{ - return data->nr_pages << (PAGE_SHIFT + data->data_order); -} - #ifndef CONFIG_PERF_USE_VMALLOC /* @@ -2320,6 +2394,19 @@ perf_mmap_to_page(struct perf_mmap_data *data, unsigned long pgoff) return virt_to_page(data->data_pages[pgoff - 1]); } +static void *perf_mmap_alloc_page(int cpu) +{ + struct page *page; + int node; + + node = (cpu == -1) ? cpu : cpu_to_node(cpu); + page = alloc_pages_node(node, GFP_KERNEL | __GFP_ZERO, 0); + if (!page) + return NULL; + + return page_address(page); +} + static struct perf_mmap_data * perf_mmap_data_alloc(struct perf_event *event, int nr_pages) { @@ -2327,8 +2414,6 @@ perf_mmap_data_alloc(struct perf_event *event, int nr_pages) unsigned long size; int i; - WARN_ON(atomic_read(&event->mmap_count)); - size = sizeof(struct perf_mmap_data); size += nr_pages * sizeof(void *); @@ -2336,17 +2421,16 @@ perf_mmap_data_alloc(struct perf_event *event, int nr_pages) if (!data) goto fail; - data->user_page = (void *)get_zeroed_page(GFP_KERNEL); + data->user_page = perf_mmap_alloc_page(event->cpu); if (!data->user_page) goto fail_user_page; for (i = 0; i < nr_pages; i++) { - data->data_pages[i] = (void *)get_zeroed_page(GFP_KERNEL); + data->data_pages[i] = perf_mmap_alloc_page(event->cpu); if (!data->data_pages[i]) goto fail_data_pages; } - data->data_order = 0; data->nr_pages = nr_pages; return data; @@ -2382,6 +2466,11 @@ static void perf_mmap_data_free(struct perf_mmap_data *data) kfree(data); } +static inline int page_order(struct perf_mmap_data *data) +{ + return 0; +} + #else /* @@ -2390,10 +2479,15 @@ static void perf_mmap_data_free(struct perf_mmap_data *data) * Required for architectures that have d-cache aliasing issues. */ +static inline int page_order(struct perf_mmap_data *data) +{ + return data->page_order; +} + static struct page * perf_mmap_to_page(struct perf_mmap_data *data, unsigned long pgoff) { - if (pgoff > (1UL << data->data_order)) + if (pgoff > (1UL << page_order(data))) return NULL; return vmalloc_to_page((void *)data->user_page + pgoff * PAGE_SIZE); @@ -2413,7 +2507,7 @@ static void perf_mmap_data_free_work(struct work_struct *work) int i, nr; data = container_of(work, struct perf_mmap_data, work); - nr = 1 << data->data_order; + nr = 1 << page_order(data); base = data->user_page; for (i = 0; i < nr + 1; i++) @@ -2435,8 +2529,6 @@ perf_mmap_data_alloc(struct perf_event *event, int nr_pages) unsigned long size; void *all_buf; - WARN_ON(atomic_read(&event->mmap_count)); - size = sizeof(struct perf_mmap_data); size += sizeof(void *); @@ -2452,7 +2544,7 @@ perf_mmap_data_alloc(struct perf_event *event, int nr_pages) data->user_page = all_buf; data->data_pages[0] = all_buf + PAGE_SIZE; - data->data_order = ilog2(nr_pages); + data->page_order = ilog2(nr_pages); data->nr_pages = 1; return data; @@ -2466,6 +2558,11 @@ fail: #endif +static unsigned long perf_data_size(struct perf_mmap_data *data) +{ + return data->nr_pages << (PAGE_SHIFT + page_order(data)); +} + static int perf_mmap_fault(struct vm_area_struct *vma, struct vm_fault *vmf) { struct perf_event *event = vma->vm_file->private_data; @@ -2506,8 +2603,6 @@ perf_mmap_data_init(struct perf_event *event, struct perf_mmap_data *data) { long max_size = perf_data_size(data); - atomic_set(&data->lock, -1); - if (event->attr.watermark) { data->watermark = min_t(long, max_size, event->attr.wakeup_watermark); @@ -2516,7 +2611,7 @@ perf_mmap_data_init(struct perf_event *event, struct perf_mmap_data *data) if (!data->watermark) data->watermark = max_size / 2; - + atomic_set(&data->refcount, 1); rcu_assign_pointer(event->data, data); } @@ -2528,13 +2623,26 @@ static void perf_mmap_data_free_rcu(struct rcu_head *rcu_head) perf_mmap_data_free(data); } -static void perf_mmap_data_release(struct perf_event *event) +static struct perf_mmap_data *perf_mmap_data_get(struct perf_event *event) { - struct perf_mmap_data *data = event->data; + struct perf_mmap_data *data; - WARN_ON(atomic_read(&event->mmap_count)); + rcu_read_lock(); + data = rcu_dereference(event->data); + if (data) { + if (!atomic_inc_not_zero(&data->refcount)) + data = NULL; + } + rcu_read_unlock(); + + return data; +} + +static void perf_mmap_data_put(struct perf_mmap_data *data) +{ + if (!atomic_dec_and_test(&data->refcount)) + return; - rcu_assign_pointer(event->data, NULL); call_rcu(&data->rcu_head, perf_mmap_data_free_rcu); } @@ -2549,15 +2657,18 @@ static void perf_mmap_close(struct vm_area_struct *vma) { struct perf_event *event = vma->vm_file->private_data; - WARN_ON_ONCE(event->ctx->parent_ctx); if (atomic_dec_and_mutex_lock(&event->mmap_count, &event->mmap_mutex)) { unsigned long size = perf_data_size(event->data); - struct user_struct *user = current_user(); + struct user_struct *user = event->mmap_user; + struct perf_mmap_data *data = event->data; atomic_long_sub((size >> PAGE_SHIFT) + 1, &user->locked_vm); - vma->vm_mm->locked_vm -= event->data->nr_locked; - perf_mmap_data_release(event); + vma->vm_mm->locked_vm -= event->mmap_locked; + rcu_assign_pointer(event->data, NULL); mutex_unlock(&event->mmap_mutex); + + perf_mmap_data_put(data); + free_uid(user); } } @@ -2580,6 +2691,14 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma) long user_extra, extra; int ret = 0; + /* + * Don't allow mmap() of inherited per-task counters. This would + * create a performance issue due to all children writing to the + * same buffer. + */ + if (event->cpu == -1 && event->attr.inherit) + return -EINVAL; + if (!(vma->vm_flags & VM_SHARED)) return -EINVAL; @@ -2601,13 +2720,10 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma) WARN_ON_ONCE(event->ctx->parent_ctx); mutex_lock(&event->mmap_mutex); - if (event->output) { - ret = -EINVAL; - goto unlock; - } - - if (atomic_inc_not_zero(&event->mmap_count)) { - if (nr_pages != event->data->nr_pages) + if (event->data) { + if (event->data->nr_pages == nr_pages) + atomic_inc(&event->data->refcount); + else ret = -EINVAL; goto unlock; } @@ -2639,21 +2755,23 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma) WARN_ON(event->data); data = perf_mmap_data_alloc(event, nr_pages); - ret = -ENOMEM; - if (!data) + if (!data) { + ret = -ENOMEM; goto unlock; + } - ret = 0; perf_mmap_data_init(event, data); - - atomic_set(&event->mmap_count, 1); - atomic_long_add(user_extra, &user->locked_vm); - vma->vm_mm->locked_vm += extra; - event->data->nr_locked = extra; if (vma->vm_flags & VM_WRITE) event->data->writable = 1; + atomic_long_add(user_extra, &user->locked_vm); + event->mmap_locked = extra; + event->mmap_user = get_current_user(); + vma->vm_mm->locked_vm += event->mmap_locked; + unlock: + if (!ret) + atomic_inc(&event->mmap_count); mutex_unlock(&event->mmap_mutex); vma->vm_flags |= VM_RESERVED; @@ -2885,127 +3003,87 @@ static void perf_output_wakeup(struct perf_output_handle *handle) } /* - * Curious locking construct. - * * We need to ensure a later event_id doesn't publish a head when a former - * event_id isn't done writing. However since we need to deal with NMIs we + * event isn't done writing. However since we need to deal with NMIs we * cannot fully serialize things. * - * What we do is serialize between CPUs so we only have to deal with NMI - * nesting on a single CPU. - * * We only publish the head (and generate a wakeup) when the outer-most - * event_id completes. + * event completes. */ -static void perf_output_lock(struct perf_output_handle *handle) +static void perf_output_get_handle(struct perf_output_handle *handle) { struct perf_mmap_data *data = handle->data; - int cur, cpu = get_cpu(); - - handle->locked = 0; - for (;;) { - cur = atomic_cmpxchg(&data->lock, -1, cpu); - if (cur == -1) { - handle->locked = 1; - break; - } - if (cur == cpu) - break; - - cpu_relax(); - } + preempt_disable(); + local_inc(&data->nest); + handle->wakeup = local_read(&data->wakeup); } -static void perf_output_unlock(struct perf_output_handle *handle) +static void perf_output_put_handle(struct perf_output_handle *handle) { struct perf_mmap_data *data = handle->data; unsigned long head; - int cpu; - - data->done_head = data->head; - - if (!handle->locked) - goto out; again: - /* - * The xchg implies a full barrier that ensures all writes are done - * before we publish the new head, matched by a rmb() in userspace when - * reading this position. - */ - while ((head = atomic_long_xchg(&data->done_head, 0))) - data->user_page->data_head = head; + head = local_read(&data->head); /* - * NMI can happen here, which means we can miss a done_head update. + * IRQ/NMI can happen here, which means we can miss a head update. */ - cpu = atomic_xchg(&data->lock, -1); - WARN_ON_ONCE(cpu != smp_processor_id()); + if (!local_dec_and_test(&data->nest)) + goto out; /* - * Therefore we have to validate we did not indeed do so. + * Publish the known good head. Rely on the full barrier implied + * by atomic_dec_and_test() order the data->head read and this + * write. */ - if (unlikely(atomic_long_read(&data->done_head))) { - /* - * Since we had it locked, we can lock it again. - */ - while (atomic_cmpxchg(&data->lock, -1, cpu) != -1) - cpu_relax(); + data->user_page->data_head = head; + /* + * Now check if we missed an update, rely on the (compiler) + * barrier in atomic_dec_and_test() to re-read data->head. + */ + if (unlikely(head != local_read(&data->head))) { + local_inc(&data->nest); goto again; } - if (atomic_xchg(&data->wakeup, 0)) + if (handle->wakeup != local_read(&data->wakeup)) perf_output_wakeup(handle); -out: - put_cpu(); + + out: + preempt_enable(); } -void perf_output_copy(struct perf_output_handle *handle, +__always_inline void perf_output_copy(struct perf_output_handle *handle, const void *buf, unsigned int len) { - unsigned int pages_mask; - unsigned long offset; - unsigned int size; - void **pages; - - offset = handle->offset; - pages_mask = handle->data->nr_pages - 1; - pages = handle->data->data_pages; - do { - unsigned long page_offset; - unsigned long page_size; - int nr; + unsigned long size = min_t(unsigned long, handle->size, len); - nr = (offset >> PAGE_SHIFT) & pages_mask; - page_size = 1UL << (handle->data->data_order + PAGE_SHIFT); - page_offset = offset & (page_size - 1); - size = min_t(unsigned int, page_size - page_offset, len); + memcpy(handle->addr, buf, size); - memcpy(pages[nr] + page_offset, buf, size); + len -= size; + handle->addr += size; + buf += size; + handle->size -= size; + if (!handle->size) { + struct perf_mmap_data *data = handle->data; - len -= size; - buf += size; - offset += size; + handle->page++; + handle->page &= data->nr_pages - 1; + handle->addr = data->data_pages[handle->page]; + handle->size = PAGE_SIZE << page_order(data); + } } while (len); - - handle->offset = offset; - - /* - * Check we didn't copy past our reservation window, taking the - * possible unsigned int wrap into account. - */ - WARN_ON_ONCE(((long)(handle->head - handle->offset)) < 0); } int perf_output_begin(struct perf_output_handle *handle, struct perf_event *event, unsigned int size, int nmi, int sample) { - struct perf_event *output_event; struct perf_mmap_data *data; unsigned long tail, offset, head; int have_lost; @@ -3022,10 +3100,6 @@ int perf_output_begin(struct perf_output_handle *handle, if (event->parent) event = event->parent; - output_event = rcu_dereference(event->output); - if (output_event) - event = output_event; - data = rcu_dereference(event->data); if (!data) goto out; @@ -3036,13 +3110,13 @@ int perf_output_begin(struct perf_output_handle *handle, handle->sample = sample; if (!data->nr_pages) - goto fail; + goto out; - have_lost = atomic_read(&data->lost); + have_lost = local_read(&data->lost); if (have_lost) size += sizeof(lost_event); - perf_output_lock(handle); + perf_output_get_handle(handle); do { /* @@ -3052,24 +3126,28 @@ int perf_output_begin(struct perf_output_handle *handle, */ tail = ACCESS_ONCE(data->user_page->data_tail); smp_rmb(); - offset = head = atomic_long_read(&data->head); + offset = head = local_read(&data->head); head += size; if (unlikely(!perf_output_space(data, tail, offset, head))) goto fail; - } while (atomic_long_cmpxchg(&data->head, offset, head) != offset); + } while (local_cmpxchg(&data->head, offset, head) != offset); - handle->offset = offset; - handle->head = head; + if (head - local_read(&data->wakeup) > data->watermark) + local_add(data->watermark, &data->wakeup); - if (head - tail > data->watermark) - atomic_set(&data->wakeup, 1); + handle->page = offset >> (PAGE_SHIFT + page_order(data)); + handle->page &= data->nr_pages - 1; + handle->size = offset & ((PAGE_SIZE << page_order(data)) - 1); + handle->addr = data->data_pages[handle->page]; + handle->addr += handle->size; + handle->size = (PAGE_SIZE << page_order(data)) - handle->size; if (have_lost) { lost_event.header.type = PERF_RECORD_LOST; lost_event.header.misc = 0; lost_event.header.size = sizeof(lost_event); lost_event.id = event->id; - lost_event.lost = atomic_xchg(&data->lost, 0); + lost_event.lost = local_xchg(&data->lost, 0); perf_output_put(handle, lost_event); } @@ -3077,8 +3155,8 @@ int perf_output_begin(struct perf_output_handle *handle, return 0; fail: - atomic_inc(&data->lost); - perf_output_unlock(handle); + local_inc(&data->lost); + perf_output_put_handle(handle); out: rcu_read_unlock(); @@ -3093,14 +3171,14 @@ void perf_output_end(struct perf_output_handle *handle) int wakeup_events = event->attr.wakeup_events; if (handle->sample && wakeup_events) { - int events = atomic_inc_return(&data->events); + int events = local_inc_return(&data->events); if (events >= wakeup_events) { - atomic_sub(wakeup_events, &data->events); - atomic_set(&data->wakeup, 1); + local_sub(wakeup_events, &data->events); + local_inc(&data->wakeup); } } - perf_output_unlock(handle); + perf_output_put_handle(handle); rcu_read_unlock(); } @@ -3436,22 +3514,13 @@ static void perf_event_task_output(struct perf_event *event, { struct perf_output_handle handle; struct task_struct *task = task_event->task; - unsigned long flags; int size, ret; - /* - * If this CPU attempts to acquire an rq lock held by a CPU spinning - * in perf_output_lock() from interrupt context, it's game over. - */ - local_irq_save(flags); - size = task_event->event_id.header.size; ret = perf_output_begin(&handle, event, size, 0, 0); - if (ret) { - local_irq_restore(flags); + if (ret) return; - } task_event->event_id.pid = perf_event_pid(event, task); task_event->event_id.ppid = perf_event_pid(event, current); @@ -3462,7 +3531,6 @@ static void perf_event_task_output(struct perf_event *event, perf_output_put(&handle, task_event->event_id); perf_output_end(&handle); - local_irq_restore(flags); } static int perf_event_task_match(struct perf_event *event) @@ -3990,13 +4058,6 @@ static void perf_swevent_overflow(struct perf_event *event, u64 overflow, } } -static void perf_swevent_unthrottle(struct perf_event *event) -{ - /* - * Nothing to do, we already reset hwc->interrupts. - */ -} - static void perf_swevent_add(struct perf_event *event, u64 nr, int nmi, struct perf_sample_data *data, struct pt_regs *regs) @@ -4020,9 +4081,6 @@ static void perf_swevent_add(struct perf_event *event, u64 nr, perf_swevent_overflow(event, 0, nmi, data, regs); } -static int perf_tp_event_match(struct perf_event *event, - struct perf_sample_data *data); - static int perf_exclude_event(struct perf_event *event, struct pt_regs *regs) { @@ -4052,10 +4110,6 @@ static int perf_swevent_match(struct perf_event *event, if (perf_exclude_event(event, regs)) return 0; - if (event->attr.type == PERF_TYPE_TRACEPOINT && - !perf_tp_event_match(event, data)) - return 0; - return 1; } @@ -4066,19 +4120,46 @@ static inline u64 swevent_hash(u64 type, u32 event_id) return hash_64(val, SWEVENT_HLIST_BITS); } -static struct hlist_head * -find_swevent_head(struct perf_cpu_context *ctx, u64 type, u32 event_id) +static inline struct hlist_head * +__find_swevent_head(struct swevent_hlist *hlist, u64 type, u32 event_id) { - u64 hash; - struct swevent_hlist *hlist; + u64 hash = swevent_hash(type, event_id); - hash = swevent_hash(type, event_id); + return &hlist->heads[hash]; +} + +/* For the read side: events when they trigger */ +static inline struct hlist_head * +find_swevent_head_rcu(struct perf_cpu_context *ctx, u64 type, u32 event_id) +{ + struct swevent_hlist *hlist; hlist = rcu_dereference(ctx->swevent_hlist); if (!hlist) return NULL; - return &hlist->heads[hash]; + return __find_swevent_head(hlist, type, event_id); +} + +/* For the event head insertion and removal in the hlist */ +static inline struct hlist_head * +find_swevent_head(struct perf_cpu_context *ctx, struct perf_event *event) +{ + struct swevent_hlist *hlist; + u32 event_id = event->attr.config; + u64 type = event->attr.type; + + /* + * Event scheduling is always serialized against hlist allocation + * and release. Which makes the protected version suitable here. + * The context lock guarantees that. + */ + hlist = rcu_dereference_protected(ctx->swevent_hlist, + lockdep_is_held(&event->ctx->lock)); + if (!hlist) + return NULL; + + return __find_swevent_head(hlist, type, event_id); } static void do_perf_sw_event(enum perf_type_id type, u32 event_id, @@ -4095,7 +4176,7 @@ static void do_perf_sw_event(enum perf_type_id type, u32 event_id, rcu_read_lock(); - head = find_swevent_head(cpuctx, type, event_id); + head = find_swevent_head_rcu(cpuctx, type, event_id); if (!head) goto end; @@ -4110,7 +4191,7 @@ end: int perf_swevent_get_recursion_context(void) { - struct perf_cpu_context *cpuctx = &get_cpu_var(perf_cpu_context); + struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context); int rctx; if (in_nmi()) @@ -4122,10 +4203,8 @@ int perf_swevent_get_recursion_context(void) else rctx = 0; - if (cpuctx->recursion[rctx]) { - put_cpu_var(perf_cpu_context); + if (cpuctx->recursion[rctx]) return -1; - } cpuctx->recursion[rctx]++; barrier(); @@ -4139,7 +4218,6 @@ void perf_swevent_put_recursion_context(int rctx) struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context); barrier(); cpuctx->recursion[rctx]--; - put_cpu_var(perf_cpu_context); } EXPORT_SYMBOL_GPL(perf_swevent_put_recursion_context); @@ -4150,6 +4228,7 @@ void __perf_sw_event(u32 event_id, u64 nr, int nmi, struct perf_sample_data data; int rctx; + preempt_disable_notrace(); rctx = perf_swevent_get_recursion_context(); if (rctx < 0) return; @@ -4159,6 +4238,7 @@ void __perf_sw_event(u32 event_id, u64 nr, int nmi, do_perf_sw_event(PERF_TYPE_SOFTWARE, event_id, nr, nmi, &data, regs); perf_swevent_put_recursion_context(rctx); + preempt_enable_notrace(); } static void perf_swevent_read(struct perf_event *event) @@ -4178,7 +4258,7 @@ static int perf_swevent_enable(struct perf_event *event) perf_swevent_set_period(event); } - head = find_swevent_head(cpuctx, event->attr.type, event->attr.config); + head = find_swevent_head(cpuctx, event); if (WARN_ON_ONCE(!head)) return -EINVAL; @@ -4192,11 +4272,22 @@ static void perf_swevent_disable(struct perf_event *event) hlist_del_rcu(&event->hlist_entry); } +static void perf_swevent_void(struct perf_event *event) +{ +} + +static int perf_swevent_int(struct perf_event *event) +{ + return 0; +} + static const struct pmu perf_ops_generic = { .enable = perf_swevent_enable, .disable = perf_swevent_disable, + .start = perf_swevent_int, + .stop = perf_swevent_void, .read = perf_swevent_read, - .unthrottle = perf_swevent_unthrottle, + .unthrottle = perf_swevent_void, /* hwc->interrupts already reset */ }; /* @@ -4366,6 +4457,14 @@ static const struct pmu perf_ops_task_clock = { .read = task_clock_perf_event_read, }; +/* Deref the hlist from the update side */ +static inline struct swevent_hlist * +swevent_hlist_deref(struct perf_cpu_context *cpuctx) +{ + return rcu_dereference_protected(cpuctx->swevent_hlist, + lockdep_is_held(&cpuctx->hlist_mutex)); +} + static void swevent_hlist_release_rcu(struct rcu_head *rcu_head) { struct swevent_hlist *hlist; @@ -4376,12 +4475,11 @@ static void swevent_hlist_release_rcu(struct rcu_head *rcu_head) static void swevent_hlist_release(struct perf_cpu_context *cpuctx) { - struct swevent_hlist *hlist; + struct swevent_hlist *hlist = swevent_hlist_deref(cpuctx); - if (!cpuctx->swevent_hlist) + if (!hlist) return; - hlist = cpuctx->swevent_hlist; rcu_assign_pointer(cpuctx->swevent_hlist, NULL); call_rcu(&hlist->rcu_head, swevent_hlist_release_rcu); } @@ -4418,7 +4516,7 @@ static int swevent_hlist_get_cpu(struct perf_event *event, int cpu) mutex_lock(&cpuctx->hlist_mutex); - if (!cpuctx->swevent_hlist && cpu_online(cpu)) { + if (!swevent_hlist_deref(cpuctx) && cpu_online(cpu)) { struct swevent_hlist *hlist; hlist = kzalloc(sizeof(*hlist), GFP_KERNEL); @@ -4467,10 +4565,48 @@ static int swevent_hlist_get(struct perf_event *event) #ifdef CONFIG_EVENT_TRACING -void perf_tp_event(int event_id, u64 addr, u64 count, void *record, - int entry_size, struct pt_regs *regs) +static const struct pmu perf_ops_tracepoint = { + .enable = perf_trace_enable, + .disable = perf_trace_disable, + .start = perf_swevent_int, + .stop = perf_swevent_void, + .read = perf_swevent_read, + .unthrottle = perf_swevent_void, +}; + +static int perf_tp_filter_match(struct perf_event *event, + struct perf_sample_data *data) +{ + void *record = data->raw->data; + + if (likely(!event->filter) || filter_match_preds(event->filter, record)) + return 1; + return 0; +} + +static int perf_tp_event_match(struct perf_event *event, + struct perf_sample_data *data, + struct pt_regs *regs) +{ + /* + * All tracepoints are from kernel-space. + */ + if (event->attr.exclude_kernel) + return 0; + + if (!perf_tp_filter_match(event, data)) + return 0; + + return 1; +} + +void perf_tp_event(u64 addr, u64 count, void *record, int entry_size, + struct pt_regs *regs, struct hlist_head *head) { struct perf_sample_data data; + struct perf_event *event; + struct hlist_node *node; + struct perf_raw_record raw = { .size = entry_size, .data = record, @@ -4479,26 +4615,18 @@ void perf_tp_event(int event_id, u64 addr, u64 count, void *record, perf_sample_data_init(&data, addr); data.raw = &raw; - /* Trace events already protected against recursion */ - do_perf_sw_event(PERF_TYPE_TRACEPOINT, event_id, count, 1, - &data, regs); + rcu_read_lock(); + hlist_for_each_entry_rcu(event, node, head, hlist_entry) { + if (perf_tp_event_match(event, &data, regs)) + perf_swevent_add(event, count, 1, &data, regs); + } + rcu_read_unlock(); } EXPORT_SYMBOL_GPL(perf_tp_event); -static int perf_tp_event_match(struct perf_event *event, - struct perf_sample_data *data) -{ - void *record = data->raw->data; - - if (likely(!event->filter) || filter_match_preds(event->filter, record)) - return 1; - return 0; -} - static void tp_perf_event_destroy(struct perf_event *event) { - perf_trace_disable(event->attr.config); - swevent_hlist_put(event); + perf_trace_destroy(event); } static const struct pmu *tp_perf_event_init(struct perf_event *event) @@ -4514,17 +4642,13 @@ static const struct pmu *tp_perf_event_init(struct perf_event *event) !capable(CAP_SYS_ADMIN)) return ERR_PTR(-EPERM); - if (perf_trace_enable(event->attr.config)) + err = perf_trace_init(event); + if (err) return NULL; event->destroy = tp_perf_event_destroy; - err = swevent_hlist_get(event); - if (err) { - perf_trace_disable(event->attr.config); - return ERR_PTR(err); - } - return &perf_ops_generic; + return &perf_ops_tracepoint; } static int perf_event_set_filter(struct perf_event *event, void __user *arg) @@ -4552,12 +4676,6 @@ static void perf_event_free_filter(struct perf_event *event) #else -static int perf_tp_event_match(struct perf_event *event, - struct perf_sample_data *data) -{ - return 1; -} - static const struct pmu *tp_perf_event_init(struct perf_event *event) { return NULL; @@ -4886,54 +5004,53 @@ err_size: goto out; } -static int perf_event_set_output(struct perf_event *event, int output_fd) +static int +perf_event_set_output(struct perf_event *event, struct perf_event *output_event) { - struct perf_event *output_event = NULL; - struct file *output_file = NULL; - struct perf_event *old_output; - int fput_needed = 0; + struct perf_mmap_data *data = NULL, *old_data = NULL; int ret = -EINVAL; - if (!output_fd) + if (!output_event) goto set; - output_file = fget_light(output_fd, &fput_needed); - if (!output_file) - return -EBADF; - - if (output_file->f_op != &perf_fops) + /* don't allow circular references */ + if (event == output_event) goto out; - output_event = output_file->private_data; - - /* Don't chain output fds */ - if (output_event->output) + /* + * Don't allow cross-cpu buffers + */ + if (output_event->cpu != event->cpu) goto out; - /* Don't set an output fd when we already have an output channel */ - if (event->data) + /* + * If its not a per-cpu buffer, it must be the same task. + */ + if (output_event->cpu == -1 && output_event->ctx != event->ctx) goto out; - atomic_long_inc(&output_file->f_count); - set: mutex_lock(&event->mmap_mutex); - old_output = event->output; - rcu_assign_pointer(event->output, output_event); - mutex_unlock(&event->mmap_mutex); + /* Can't redirect output if we've got an active mmap() */ + if (atomic_read(&event->mmap_count)) + goto unlock; - if (old_output) { - /* - * we need to make sure no existing perf_output_*() - * is still referencing this event. - */ - synchronize_rcu(); - fput(old_output->filp); + if (output_event) { + /* get the buffer we want to redirect to */ + data = perf_mmap_data_get(output_event); + if (!data) + goto unlock; } + old_data = event->data; + rcu_assign_pointer(event->data, data); ret = 0; +unlock: + mutex_unlock(&event->mmap_mutex); + + if (old_data) + perf_mmap_data_put(old_data); out: - fput_light(output_file, fput_needed); return ret; } @@ -4949,13 +5066,13 @@ SYSCALL_DEFINE5(perf_event_open, struct perf_event_attr __user *, attr_uptr, pid_t, pid, int, cpu, int, group_fd, unsigned long, flags) { - struct perf_event *event, *group_leader; + struct perf_event *event, *group_leader = NULL, *output_event = NULL; struct perf_event_attr attr; struct perf_event_context *ctx; struct file *event_file = NULL; struct file *group_file = NULL; + int event_fd; int fput_needed = 0; - int fput_needed2 = 0; int err; /* for future expandability... */ @@ -4976,26 +5093,38 @@ SYSCALL_DEFINE5(perf_event_open, return -EINVAL; } + event_fd = get_unused_fd_flags(O_RDWR); + if (event_fd < 0) + return event_fd; + /* * Get the target context (task or percpu): */ ctx = find_get_context(pid, cpu); - if (IS_ERR(ctx)) - return PTR_ERR(ctx); + if (IS_ERR(ctx)) { + err = PTR_ERR(ctx); + goto err_fd; + } + + if (group_fd != -1) { + group_leader = perf_fget_light(group_fd, &fput_needed); + if (IS_ERR(group_leader)) { + err = PTR_ERR(group_leader); + goto err_put_context; + } + group_file = group_leader->filp; + if (flags & PERF_FLAG_FD_OUTPUT) + output_event = group_leader; + if (flags & PERF_FLAG_FD_NO_GROUP) + group_leader = NULL; + } /* * Look up the group leader (we will attach this event to it): */ - group_leader = NULL; - if (group_fd != -1 && !(flags & PERF_FLAG_FD_NO_GROUP)) { + if (group_leader) { err = -EINVAL; - group_file = fget_light(group_fd, &fput_needed); - if (!group_file) - goto err_put_context; - if (group_file->f_op != &perf_fops) - goto err_put_context; - group_leader = group_file->private_data; /* * Do not allow a recursive hierarchy (this new sibling * becoming part of another group-sibling): @@ -5017,22 +5146,21 @@ SYSCALL_DEFINE5(perf_event_open, event = perf_event_alloc(&attr, cpu, ctx, group_leader, NULL, NULL, GFP_KERNEL); - err = PTR_ERR(event); - if (IS_ERR(event)) + if (IS_ERR(event)) { + err = PTR_ERR(event); goto err_put_context; + } - err = anon_inode_getfd("[perf_event]", &perf_fops, event, O_RDWR); - if (err < 0) - goto err_free_put_context; + if (output_event) { + err = perf_event_set_output(event, output_event); + if (err) + goto err_free_put_context; + } - event_file = fget_light(err, &fput_needed2); - if (!event_file) + event_file = anon_inode_getfile("[perf_event]", &perf_fops, event, O_RDWR); + if (IS_ERR(event_file)) { + err = PTR_ERR(event_file); goto err_free_put_context; - - if (flags & PERF_FLAG_FD_OUTPUT) { - err = perf_event_set_output(event, group_fd); - if (err) - goto err_fput_free_put_context; } event->filp = event_file; @@ -5048,19 +5176,23 @@ SYSCALL_DEFINE5(perf_event_open, list_add_tail(&event->owner_entry, ¤t->perf_event_list); mutex_unlock(¤t->perf_event_mutex); -err_fput_free_put_context: - fput_light(event_file, fput_needed2); + /* + * Drop the reference on the group_event after placing the + * new event on the sibling_list. This ensures destruction + * of the group leader will find the pointer to itself in + * perf_group_detach(). + */ + fput_light(group_file, fput_needed); + fd_install(event_fd, event_file); + return event_fd; err_free_put_context: - if (err < 0) - free_event(event); - + free_event(event); err_put_context: - if (err < 0) - put_ctx(ctx); - fput_light(group_file, fput_needed); - + put_ctx(ctx); +err_fd: + put_unused_fd(event_fd); return err; } @@ -5371,6 +5503,7 @@ static void perf_free_event(struct perf_event *event, fput(parent->filp); + perf_group_detach(event); list_del_event(event, ctx); free_event(event); } diff --git a/kernel/pid.c b/kernel/pid.c index aebb30d9c233..e9fd8c132d26 100644 --- a/kernel/pid.c +++ b/kernel/pid.c @@ -513,6 +513,13 @@ void __init pidhash_init(void) void __init pidmap_init(void) { + /* bump default and minimum pid_max based on number of cpus */ + pid_max = min(pid_max_max, max_t(int, pid_max, + PIDS_PER_CPU_DEFAULT * num_possible_cpus())); + pid_max_min = max_t(int, pid_max_min, + PIDS_PER_CPU_MIN * num_possible_cpus()); + pr_info("pid_max: default: %u minimum: %u\n", pid_max, pid_max_min); + init_pid_ns.pidmap[0].page = kzalloc(PAGE_SIZE, GFP_KERNEL); /* Reserve PID 0. We never call free_pidmap(0) */ set_bit(0, init_pid_ns.pidmap[0].page); diff --git a/kernel/pm_qos_params.c b/kernel/pm_qos_params.c index f42d3f737a33..996a4dec5f96 100644 --- a/kernel/pm_qos_params.c +++ b/kernel/pm_qos_params.c @@ -48,59 +48,49 @@ * or pm_qos_object list and pm_qos_objects need to happen with pm_qos_lock * held, taken with _irqsave. One lock to rule them all */ -struct pm_qos_request_list { - struct list_head list; - union { - s32 value; - s32 usec; - s32 kbps; - }; - int pm_qos_class; +enum pm_qos_type { + PM_QOS_MAX, /* return the largest value */ + PM_QOS_MIN /* return the smallest value */ }; -static s32 max_compare(s32 v1, s32 v2); -static s32 min_compare(s32 v1, s32 v2); - struct pm_qos_object { - struct pm_qos_request_list requests; + struct plist_head requests; struct blocking_notifier_head *notifiers; struct miscdevice pm_qos_power_miscdev; char *name; s32 default_value; - atomic_t target_value; - s32 (*comparitor)(s32, s32); + enum pm_qos_type type; }; +static DEFINE_SPINLOCK(pm_qos_lock); + static struct pm_qos_object null_pm_qos; static BLOCKING_NOTIFIER_HEAD(cpu_dma_lat_notifier); static struct pm_qos_object cpu_dma_pm_qos = { - .requests = {LIST_HEAD_INIT(cpu_dma_pm_qos.requests.list)}, + .requests = PLIST_HEAD_INIT(cpu_dma_pm_qos.requests, pm_qos_lock), .notifiers = &cpu_dma_lat_notifier, .name = "cpu_dma_latency", .default_value = 2000 * USEC_PER_SEC, - .target_value = ATOMIC_INIT(2000 * USEC_PER_SEC), - .comparitor = min_compare + .type = PM_QOS_MIN, }; static BLOCKING_NOTIFIER_HEAD(network_lat_notifier); static struct pm_qos_object network_lat_pm_qos = { - .requests = {LIST_HEAD_INIT(network_lat_pm_qos.requests.list)}, + .requests = PLIST_HEAD_INIT(network_lat_pm_qos.requests, pm_qos_lock), .notifiers = &network_lat_notifier, .name = "network_latency", .default_value = 2000 * USEC_PER_SEC, - .target_value = ATOMIC_INIT(2000 * USEC_PER_SEC), - .comparitor = min_compare + .type = PM_QOS_MIN }; static BLOCKING_NOTIFIER_HEAD(network_throughput_notifier); static struct pm_qos_object network_throughput_pm_qos = { - .requests = {LIST_HEAD_INIT(network_throughput_pm_qos.requests.list)}, + .requests = PLIST_HEAD_INIT(network_throughput_pm_qos.requests, pm_qos_lock), .notifiers = &network_throughput_notifier, .name = "network_throughput", .default_value = 0, - .target_value = ATOMIC_INIT(0), - .comparitor = max_compare + .type = PM_QOS_MAX, }; @@ -111,8 +101,6 @@ static struct pm_qos_object *pm_qos_array[] = { &network_throughput_pm_qos }; -static DEFINE_SPINLOCK(pm_qos_lock); - static ssize_t pm_qos_power_write(struct file *filp, const char __user *buf, size_t count, loff_t *f_pos); static int pm_qos_power_open(struct inode *inode, struct file *filp); @@ -124,46 +112,55 @@ static const struct file_operations pm_qos_power_fops = { .release = pm_qos_power_release, }; -/* static helper functions */ -static s32 max_compare(s32 v1, s32 v2) +/* unlocked internal variant */ +static inline int pm_qos_get_value(struct pm_qos_object *o) { - return max(v1, v2); -} + if (plist_head_empty(&o->requests)) + return o->default_value; -static s32 min_compare(s32 v1, s32 v2) -{ - return min(v1, v2); -} + switch (o->type) { + case PM_QOS_MIN: + return plist_last(&o->requests)->prio; + case PM_QOS_MAX: + return plist_first(&o->requests)->prio; -static void update_target(int pm_qos_class) + default: + /* runtime check for not using enum */ + BUG(); + } +} + +static void update_target(struct pm_qos_object *o, struct plist_node *node, + int del, int value) { - s32 extreme_value; - struct pm_qos_request_list *node; unsigned long flags; - int call_notifier = 0; + int prev_value, curr_value; spin_lock_irqsave(&pm_qos_lock, flags); - extreme_value = pm_qos_array[pm_qos_class]->default_value; - list_for_each_entry(node, - &pm_qos_array[pm_qos_class]->requests.list, list) { - extreme_value = pm_qos_array[pm_qos_class]->comparitor( - extreme_value, node->value); - } - if (atomic_read(&pm_qos_array[pm_qos_class]->target_value) != - extreme_value) { - call_notifier = 1; - atomic_set(&pm_qos_array[pm_qos_class]->target_value, - extreme_value); - pr_debug(KERN_ERR "new target for qos %d is %d\n", pm_qos_class, - atomic_read(&pm_qos_array[pm_qos_class]->target_value)); + prev_value = pm_qos_get_value(o); + /* PM_QOS_DEFAULT_VALUE is a signal that the value is unchanged */ + if (value != PM_QOS_DEFAULT_VALUE) { + /* + * to change the list, we atomically remove, reinit + * with new value and add, then see if the extremal + * changed + */ + plist_del(node, &o->requests); + plist_node_init(node, value); + plist_add(node, &o->requests); + } else if (del) { + plist_del(node, &o->requests); + } else { + plist_add(node, &o->requests); } + curr_value = pm_qos_get_value(o); spin_unlock_irqrestore(&pm_qos_lock, flags); - if (call_notifier) - blocking_notifier_call_chain( - pm_qos_array[pm_qos_class]->notifiers, - (unsigned long) extreme_value, NULL); + if (prev_value != curr_value) + blocking_notifier_call_chain(o->notifiers, + (unsigned long)curr_value, + NULL); } static int register_pm_qos_misc(struct pm_qos_object *qos) @@ -196,10 +193,23 @@ static int find_pm_qos_object_by_minor(int minor) */ int pm_qos_request(int pm_qos_class) { - return atomic_read(&pm_qos_array[pm_qos_class]->target_value); + unsigned long flags; + int value; + + spin_lock_irqsave(&pm_qos_lock, flags); + value = pm_qos_get_value(pm_qos_array[pm_qos_class]); + spin_unlock_irqrestore(&pm_qos_lock, flags); + + return value; } EXPORT_SYMBOL_GPL(pm_qos_request); +int pm_qos_request_active(struct pm_qos_request_list *req) +{ + return req->pm_qos_class != 0; +} +EXPORT_SYMBOL_GPL(pm_qos_request_active); + /** * pm_qos_add_request - inserts new qos request into the list * @pm_qos_class: identifies which list of qos request to us @@ -211,27 +221,23 @@ EXPORT_SYMBOL_GPL(pm_qos_request); * element as a handle for use in updating and removal. Call needs to save * this handle for later use. */ -struct pm_qos_request_list *pm_qos_add_request(int pm_qos_class, s32 value) +void pm_qos_add_request(struct pm_qos_request_list *dep, + int pm_qos_class, s32 value) { - struct pm_qos_request_list *dep; - unsigned long flags; + struct pm_qos_object *o = pm_qos_array[pm_qos_class]; + int new_value; - dep = kzalloc(sizeof(struct pm_qos_request_list), GFP_KERNEL); - if (dep) { - if (value == PM_QOS_DEFAULT_VALUE) - dep->value = pm_qos_array[pm_qos_class]->default_value; - else - dep->value = value; - dep->pm_qos_class = pm_qos_class; - - spin_lock_irqsave(&pm_qos_lock, flags); - list_add(&dep->list, - &pm_qos_array[pm_qos_class]->requests.list); - spin_unlock_irqrestore(&pm_qos_lock, flags); - update_target(pm_qos_class); + if (pm_qos_request_active(dep)) { + WARN(1, KERN_ERR "pm_qos_add_request() called for already added request\n"); + return; } - - return dep; + if (value == PM_QOS_DEFAULT_VALUE) + new_value = o->default_value; + else + new_value = value; + plist_node_init(&dep->list, new_value); + dep->pm_qos_class = pm_qos_class; + update_target(o, &dep->list, 0, PM_QOS_DEFAULT_VALUE); } EXPORT_SYMBOL_GPL(pm_qos_add_request); @@ -246,27 +252,28 @@ EXPORT_SYMBOL_GPL(pm_qos_add_request); * Attempts are made to make this code callable on hot code paths. */ void pm_qos_update_request(struct pm_qos_request_list *pm_qos_req, - s32 new_value) + s32 new_value) { - unsigned long flags; - int pending_update = 0; s32 temp; + struct pm_qos_object *o; + + if (!pm_qos_req) /*guard against callers passing in null */ + return; - if (pm_qos_req) { /*guard against callers passing in null */ - spin_lock_irqsave(&pm_qos_lock, flags); - if (new_value == PM_QOS_DEFAULT_VALUE) - temp = pm_qos_array[pm_qos_req->pm_qos_class]->default_value; - else - temp = new_value; - - if (temp != pm_qos_req->value) { - pending_update = 1; - pm_qos_req->value = temp; - } - spin_unlock_irqrestore(&pm_qos_lock, flags); - if (pending_update) - update_target(pm_qos_req->pm_qos_class); + if (!pm_qos_request_active(pm_qos_req)) { + WARN(1, KERN_ERR "pm_qos_update_request() called for unknown object\n"); + return; } + + o = pm_qos_array[pm_qos_req->pm_qos_class]; + + if (new_value == PM_QOS_DEFAULT_VALUE) + temp = o->default_value; + else + temp = new_value; + + if (temp != pm_qos_req->list.prio) + update_target(o, &pm_qos_req->list, 0, temp); } EXPORT_SYMBOL_GPL(pm_qos_update_request); @@ -280,19 +287,20 @@ EXPORT_SYMBOL_GPL(pm_qos_update_request); */ void pm_qos_remove_request(struct pm_qos_request_list *pm_qos_req) { - unsigned long flags; - int qos_class; + struct pm_qos_object *o; if (pm_qos_req == NULL) return; /* silent return to keep pcm code cleaner */ - qos_class = pm_qos_req->pm_qos_class; - spin_lock_irqsave(&pm_qos_lock, flags); - list_del(&pm_qos_req->list); - kfree(pm_qos_req); - spin_unlock_irqrestore(&pm_qos_lock, flags); - update_target(qos_class); + if (!pm_qos_request_active(pm_qos_req)) { + WARN(1, KERN_ERR "pm_qos_remove_request() called for unknown object\n"); + return; + } + + o = pm_qos_array[pm_qos_req->pm_qos_class]; + update_target(o, &pm_qos_req->list, 1, PM_QOS_DEFAULT_VALUE); + memset(pm_qos_req, 0, sizeof(*pm_qos_req)); } EXPORT_SYMBOL_GPL(pm_qos_remove_request); @@ -340,8 +348,12 @@ static int pm_qos_power_open(struct inode *inode, struct file *filp) pm_qos_class = find_pm_qos_object_by_minor(iminor(inode)); if (pm_qos_class >= 0) { - filp->private_data = (void *) pm_qos_add_request(pm_qos_class, - PM_QOS_DEFAULT_VALUE); + struct pm_qos_request_list *req = kzalloc(GFP_KERNEL, sizeof(*req)); + if (!req) + return -ENOMEM; + + pm_qos_add_request(req, pm_qos_class, PM_QOS_DEFAULT_VALUE); + filp->private_data = req; if (filp->private_data) return 0; @@ -353,8 +365,9 @@ static int pm_qos_power_release(struct inode *inode, struct file *filp) { struct pm_qos_request_list *req; - req = (struct pm_qos_request_list *)filp->private_data; + req = filp->private_data; pm_qos_remove_request(req); + kfree(req); return 0; } diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c index 00bb252f29a2..9829646d399c 100644 --- a/kernel/posix-cpu-timers.c +++ b/kernel/posix-cpu-timers.c @@ -363,7 +363,7 @@ int posix_cpu_clock_get(const clockid_t which_clock, struct timespec *tp) } } else { read_lock(&tasklist_lock); - if (thread_group_leader(p) && p->signal) { + if (thread_group_leader(p) && p->sighand) { error = cpu_clock_sample_group(which_clock, p, &rtn); @@ -439,7 +439,7 @@ int posix_cpu_timer_del(struct k_itimer *timer) if (likely(p != NULL)) { read_lock(&tasklist_lock); - if (unlikely(p->signal == NULL)) { + if (unlikely(p->sighand == NULL)) { /* * We raced with the reaping of the task. * The deletion should have cleared us off the list. @@ -691,10 +691,10 @@ int posix_cpu_timer_set(struct k_itimer *timer, int flags, read_lock(&tasklist_lock); /* * We need the tasklist_lock to protect against reaping that - * clears p->signal. If p has just been reaped, we can no + * clears p->sighand. If p has just been reaped, we can no * longer get any information about it at all. */ - if (unlikely(p->signal == NULL)) { + if (unlikely(p->sighand == NULL)) { read_unlock(&tasklist_lock); put_task_struct(p); timer->it.cpu.task = NULL; @@ -863,7 +863,7 @@ void posix_cpu_timer_get(struct k_itimer *timer, struct itimerspec *itp) clear_dead = p->exit_state; } else { read_lock(&tasklist_lock); - if (unlikely(p->signal == NULL)) { + if (unlikely(p->sighand == NULL)) { /* * The process has been reaped. * We can't even collect a sample any more. @@ -1199,7 +1199,7 @@ void posix_cpu_timer_schedule(struct k_itimer *timer) spin_lock(&p->sighand->siglock); } else { read_lock(&tasklist_lock); - if (unlikely(p->signal == NULL)) { + if (unlikely(p->sighand == NULL)) { /* * The process has been reaped. * We can't even collect a sample any more. diff --git a/kernel/posix-timers.c b/kernel/posix-timers.c index 00d1fda58ab6..ad723420acc3 100644 --- a/kernel/posix-timers.c +++ b/kernel/posix-timers.c @@ -559,14 +559,7 @@ SYSCALL_DEFINE3(timer_create, const clockid_t, which_clock, new_timer->it_id = (timer_t) new_timer_id; new_timer->it_clock = which_clock; new_timer->it_overrun = -1; - error = CLOCK_DISPATCH(which_clock, timer_create, (new_timer)); - if (error) - goto out; - /* - * return the timer_id now. The next step is hard to - * back out if there is an error. - */ if (copy_to_user(created_timer_id, &new_timer_id, sizeof (new_timer_id))) { error = -EFAULT; @@ -597,6 +590,10 @@ SYSCALL_DEFINE3(timer_create, const clockid_t, which_clock, new_timer->sigq->info.si_tid = new_timer->it_id; new_timer->sigq->info.si_code = SI_TIMER; + error = CLOCK_DISPATCH(which_clock, timer_create, (new_timer)); + if (error) + goto out; + spin_lock_irq(¤t->sighand->siglock); new_timer->it_signal = current->signal; list_add(&new_timer->list, ¤t->signal->posix_timers); diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig index 5c36ea9d55d2..ca6066a6952e 100644 --- a/kernel/power/Kconfig +++ b/kernel/power/Kconfig @@ -99,9 +99,13 @@ config PM_SLEEP_ADVANCED_DEBUG depends on PM_ADVANCED_DEBUG default n +config SUSPEND_NVS + bool + config SUSPEND bool "Suspend to RAM and standby" depends on PM && ARCH_SUSPEND_POSSIBLE + select SUSPEND_NVS if HAS_IOMEM default y ---help--- Allow the system to enter sleep states in which main memory is @@ -130,13 +134,10 @@ config SUSPEND_FREEZER Turning OFF this setting is NOT recommended! If in doubt, say Y. -config HIBERNATION_NVS - bool - config HIBERNATION bool "Hibernation (aka 'suspend to disk')" depends on PM && SWAP && ARCH_HIBERNATION_POSSIBLE - select HIBERNATION_NVS if HAS_IOMEM + select SUSPEND_NVS if HAS_IOMEM ---help--- Enable the suspend to disk (STD) functionality, which is usually called "hibernation" in user interfaces. STD checkpoints the diff --git a/kernel/power/Makefile b/kernel/power/Makefile index 524e058dcf06..f9063c6b185d 100644 --- a/kernel/power/Makefile +++ b/kernel/power/Makefile @@ -10,6 +10,6 @@ obj-$(CONFIG_SUSPEND) += suspend.o obj-$(CONFIG_PM_TEST_SUSPEND) += suspend_test.o obj-$(CONFIG_HIBERNATION) += hibernate.o snapshot.o swap.o user.o \ block_io.o -obj-$(CONFIG_HIBERNATION_NVS) += hibernate_nvs.o +obj-$(CONFIG_SUSPEND_NVS) += nvs.o obj-$(CONFIG_MAGIC_SYSRQ) += poweroff.o diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c index aa9e916da4d5..d26f04e92743 100644 --- a/kernel/power/hibernate.c +++ b/kernel/power/hibernate.c @@ -277,7 +277,7 @@ static int create_image(int platform_mode) goto Enable_irqs; } - if (hibernation_test(TEST_CORE)) + if (hibernation_test(TEST_CORE) || !pm_check_wakeup_events()) goto Power_up; in_suspend = 1; @@ -288,8 +288,10 @@ static int create_image(int platform_mode) error); /* Restore control flow magically appears here */ restore_processor_state(); - if (!in_suspend) + if (!in_suspend) { + events_check_enabled = false; platform_leave(platform_mode); + } Power_up: sysdev_resume(); @@ -328,7 +330,7 @@ int hibernation_snapshot(int platform_mode) error = platform_begin(platform_mode); if (error) - return error; + goto Close; /* Preallocate image memory before shutting down devices. */ error = hibernate_preallocate_memory(); @@ -511,18 +513,24 @@ int hibernation_platform_enter(void) local_irq_disable(); sysdev_suspend(PMSG_HIBERNATE); + if (!pm_check_wakeup_events()) { + error = -EAGAIN; + goto Power_up; + } + hibernation_ops->enter(); /* We should never get here */ while (1); - /* - * We don't need to reenable the nonboot CPUs or resume consoles, since - * the system is going to be halted anyway. - */ + Power_up: + sysdev_resume(); + local_irq_enable(); + enable_nonboot_cpus(); + Platform_finish: hibernation_ops->finish(); - dpm_suspend_noirq(PMSG_RESTORE); + dpm_resume_noirq(PMSG_RESTORE); Resume_devices: entering_platform_hibernation = false; diff --git a/kernel/power/main.c b/kernel/power/main.c index b58800b21fc0..62b0bc6e4983 100644 --- a/kernel/power/main.c +++ b/kernel/power/main.c @@ -204,6 +204,60 @@ static ssize_t state_store(struct kobject *kobj, struct kobj_attribute *attr, power_attr(state); +#ifdef CONFIG_PM_SLEEP +/* + * The 'wakeup_count' attribute, along with the functions defined in + * drivers/base/power/wakeup.c, provides a means by which wakeup events can be + * handled in a non-racy way. + * + * If a wakeup event occurs when the system is in a sleep state, it simply is + * woken up. In turn, if an event that would wake the system up from a sleep + * state occurs when it is undergoing a transition to that sleep state, the + * transition should be aborted. Moreover, if such an event occurs when the + * system is in the working state, an attempt to start a transition to the + * given sleep state should fail during certain period after the detection of + * the event. Using the 'state' attribute alone is not sufficient to satisfy + * these requirements, because a wakeup event may occur exactly when 'state' + * is being written to and may be delivered to user space right before it is + * frozen, so the event will remain only partially processed until the system is + * woken up by another event. In particular, it won't cause the transition to + * a sleep state to be aborted. + * + * This difficulty may be overcome if user space uses 'wakeup_count' before + * writing to 'state'. It first should read from 'wakeup_count' and store + * the read value. Then, after carrying out its own preparations for the system + * transition to a sleep state, it should write the stored value to + * 'wakeup_count'. If that fails, at least one wakeup event has occured since + * 'wakeup_count' was read and 'state' should not be written to. Otherwise, it + * is allowed to write to 'state', but the transition will be aborted if there + * are any wakeup events detected after 'wakeup_count' was written to. + */ + +static ssize_t wakeup_count_show(struct kobject *kobj, + struct kobj_attribute *attr, + char *buf) +{ + unsigned long val; + + return pm_get_wakeup_count(&val) ? sprintf(buf, "%lu\n", val) : -EINTR; +} + +static ssize_t wakeup_count_store(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t n) +{ + unsigned long val; + + if (sscanf(buf, "%lu", &val) == 1) { + if (pm_save_wakeup_count(val)) + return n; + } + return -EINVAL; +} + +power_attr(wakeup_count); +#endif /* CONFIG_PM_SLEEP */ + #ifdef CONFIG_PM_TRACE int pm_trace_enabled; @@ -236,6 +290,7 @@ static struct attribute * g[] = { #endif #ifdef CONFIG_PM_SLEEP &pm_async_attr.attr, + &wakeup_count_attr.attr, #ifdef CONFIG_PM_DEBUG &pm_test_attr.attr, #endif diff --git a/kernel/power/hibernate_nvs.c b/kernel/power/nvs.c index fdcad9ed5a7b..1836db60bbb6 100644 --- a/kernel/power/hibernate_nvs.c +++ b/kernel/power/nvs.c @@ -15,7 +15,7 @@ /* * Platforms, like ACPI, may want us to save some memory used by them during - * hibernation and to restore the contents of this memory during the subsequent + * suspend and to restore the contents of this memory during the subsequent * resume. The code below implements a mechanism allowing us to do that. */ @@ -30,7 +30,7 @@ struct nvs_page { static LIST_HEAD(nvs_list); /** - * hibernate_nvs_register - register platform NVS memory region to save + * suspend_nvs_register - register platform NVS memory region to save * @start - physical address of the region * @size - size of the region * @@ -38,7 +38,7 @@ static LIST_HEAD(nvs_list); * things so that the data from page-aligned addresses in this region will * be copied into separate RAM pages. */ -int hibernate_nvs_register(unsigned long start, unsigned long size) +int suspend_nvs_register(unsigned long start, unsigned long size) { struct nvs_page *entry, *next; @@ -68,9 +68,9 @@ int hibernate_nvs_register(unsigned long start, unsigned long size) } /** - * hibernate_nvs_free - free data pages allocated for saving NVS regions + * suspend_nvs_free - free data pages allocated for saving NVS regions */ -void hibernate_nvs_free(void) +void suspend_nvs_free(void) { struct nvs_page *entry; @@ -86,16 +86,16 @@ void hibernate_nvs_free(void) } /** - * hibernate_nvs_alloc - allocate memory necessary for saving NVS regions + * suspend_nvs_alloc - allocate memory necessary for saving NVS regions */ -int hibernate_nvs_alloc(void) +int suspend_nvs_alloc(void) { struct nvs_page *entry; list_for_each_entry(entry, &nvs_list, node) { entry->data = (void *)__get_free_page(GFP_KERNEL); if (!entry->data) { - hibernate_nvs_free(); + suspend_nvs_free(); return -ENOMEM; } } @@ -103,9 +103,9 @@ int hibernate_nvs_alloc(void) } /** - * hibernate_nvs_save - save NVS memory regions + * suspend_nvs_save - save NVS memory regions */ -void hibernate_nvs_save(void) +void suspend_nvs_save(void) { struct nvs_page *entry; @@ -119,12 +119,12 @@ void hibernate_nvs_save(void) } /** - * hibernate_nvs_restore - restore NVS memory regions + * suspend_nvs_restore - restore NVS memory regions * * This function is going to be called with interrupts disabled, so it * cannot iounmap the virtual addresses used to access the NVS region. */ -void hibernate_nvs_restore(void) +void suspend_nvs_restore(void) { struct nvs_page *entry; diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c index 56e7dbb8b996..7335952ee473 100644 --- a/kernel/power/suspend.c +++ b/kernel/power/suspend.c @@ -16,6 +16,12 @@ #include <linux/cpu.h> #include <linux/syscalls.h> #include <linux/gfp.h> +#include <linux/io.h> +#include <linux/kernel.h> +#include <linux/list.h> +#include <linux/mm.h> +#include <linux/slab.h> +#include <linux/suspend.h> #include "power.h" @@ -130,19 +136,19 @@ static int suspend_enter(suspend_state_t state) if (suspend_ops->prepare) { error = suspend_ops->prepare(); if (error) - return error; + goto Platform_finish; } error = dpm_suspend_noirq(PMSG_SUSPEND); if (error) { printk(KERN_ERR "PM: Some devices failed to power down\n"); - goto Platfrom_finish; + goto Platform_finish; } if (suspend_ops->prepare_late) { error = suspend_ops->prepare_late(); if (error) - goto Power_up_devices; + goto Platform_wake; } if (suspend_test(TEST_PLATFORM)) @@ -157,8 +163,10 @@ static int suspend_enter(suspend_state_t state) error = sysdev_suspend(PMSG_SUSPEND); if (!error) { - if (!suspend_test(TEST_CORE)) + if (!suspend_test(TEST_CORE) && pm_check_wakeup_events()) { error = suspend_ops->enter(state); + events_check_enabled = false; + } sysdev_resume(); } @@ -172,10 +180,9 @@ static int suspend_enter(suspend_state_t state) if (suspend_ops->wake) suspend_ops->wake(); - Power_up_devices: dpm_resume_noirq(PMSG_RESUME); - Platfrom_finish: + Platform_finish: if (suspend_ops->finish) suspend_ops->finish(); diff --git a/kernel/power/swap.c b/kernel/power/swap.c index b0bb21778391..7c3ae83e41d7 100644 --- a/kernel/power/swap.c +++ b/kernel/power/swap.c @@ -32,7 +32,7 @@ /* * The swap map is a data structure used for keeping track of each page * written to a swap partition. It consists of many swap_map_page - * structures that contain each an array of MAP_PAGE_SIZE swap entries. + * structures that contain each an array of MAP_PAGE_ENTRIES swap entries. * These structures are stored on the swap and linked together with the * help of the .next_swap member. * @@ -148,7 +148,7 @@ sector_t alloc_swapdev_block(int swap) /** * free_all_swap_pages - free swap pages allocated for saving image data. - * It also frees the extents used to register which swap entres had been + * It also frees the extents used to register which swap entries had been * allocated. */ diff --git a/kernel/profile.c b/kernel/profile.c index dfadc5b729f1..b22a899934cc 100644 --- a/kernel/profile.c +++ b/kernel/profile.c @@ -365,14 +365,14 @@ static int __cpuinit profile_cpu_callback(struct notifier_block *info, switch (action) { case CPU_UP_PREPARE: case CPU_UP_PREPARE_FROZEN: - node = cpu_to_node(cpu); + node = cpu_to_mem(cpu); per_cpu(cpu_profile_flip, cpu) = 0; if (!per_cpu(cpu_profile_hits, cpu)[1]) { page = alloc_pages_exact_node(node, GFP_KERNEL | __GFP_ZERO, 0); if (!page) - return NOTIFY_BAD; + return notifier_from_errno(-ENOMEM); per_cpu(cpu_profile_hits, cpu)[1] = page_address(page); } if (!per_cpu(cpu_profile_hits, cpu)[0]) { @@ -388,7 +388,7 @@ out_free: page = virt_to_page(per_cpu(cpu_profile_hits, cpu)[1]); per_cpu(cpu_profile_hits, cpu)[1] = NULL; __free_page(page); - return NOTIFY_BAD; + return notifier_from_errno(-ENOMEM); case CPU_ONLINE: case CPU_ONLINE_FROZEN: if (prof_cpu_mask != NULL) @@ -567,7 +567,7 @@ static int create_hash_tables(void) int cpu; for_each_online_cpu(cpu) { - int node = cpu_to_node(cpu); + int node = cpu_to_mem(cpu); struct page *page; page = alloc_pages_exact_node(node, diff --git a/kernel/ptrace.c b/kernel/ptrace.c index 6af9cdd558b7..74a3d693c196 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c @@ -594,6 +594,32 @@ int ptrace_request(struct task_struct *child, long request, ret = ptrace_detach(child, data); break; +#ifdef CONFIG_BINFMT_ELF_FDPIC + case PTRACE_GETFDPIC: { + struct mm_struct *mm = get_task_mm(child); + unsigned long tmp = 0; + + ret = -ESRCH; + if (!mm) + break; + + switch (addr) { + case PTRACE_GETFDPIC_EXEC: + tmp = mm->context.exec_fdpic_loadmap; + break; + case PTRACE_GETFDPIC_INTERP: + tmp = mm->context.interp_fdpic_loadmap; + break; + default: + break; + } + mmput(mm); + + ret = put_user(tmp, (unsigned long __user *) data); + break; + } +#endif + #ifdef PTRACE_SINGLESTEP case PTRACE_SINGLESTEP: #endif diff --git a/kernel/relay.c b/kernel/relay.c index 4268287148c1..c7cf397fb929 100644 --- a/kernel/relay.c +++ b/kernel/relay.c @@ -539,7 +539,7 @@ static int __cpuinit relay_hotcpu_callback(struct notifier_block *nb, "relay_hotcpu_callback: cpu %d buffer " "creation failed\n", hotcpu); mutex_unlock(&relay_channels_mutex); - return NOTIFY_BAD; + return notifier_from_errno(-ENOMEM); } } mutex_unlock(&relay_channels_mutex); diff --git a/kernel/sched.c b/kernel/sched.c index 054a6012de99..f52a8801b7a2 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -306,52 +306,6 @@ static int init_task_group_load = INIT_TASK_GROUP_LOAD; */ struct task_group init_task_group; -/* return group to which a task belongs */ -static inline struct task_group *task_group(struct task_struct *p) -{ - struct task_group *tg; - -#ifdef CONFIG_CGROUP_SCHED - tg = container_of(task_subsys_state(p, cpu_cgroup_subsys_id), - struct task_group, css); -#else - tg = &init_task_group; -#endif - return tg; -} - -/* Change a task's cfs_rq and parent entity if it moves across CPUs/groups */ -static inline void set_task_rq(struct task_struct *p, unsigned int cpu) -{ - /* - * Strictly speaking this rcu_read_lock() is not needed since the - * task_group is tied to the cgroup, which in turn can never go away - * as long as there are tasks attached to it. - * - * However since task_group() uses task_subsys_state() which is an - * rcu_dereference() user, this quiets CONFIG_PROVE_RCU. - */ - rcu_read_lock(); -#ifdef CONFIG_FAIR_GROUP_SCHED - p->se.cfs_rq = task_group(p)->cfs_rq[cpu]; - p->se.parent = task_group(p)->se[cpu]; -#endif - -#ifdef CONFIG_RT_GROUP_SCHED - p->rt.rt_rq = task_group(p)->rt_rq[cpu]; - p->rt.parent = task_group(p)->rt_se[cpu]; -#endif - rcu_read_unlock(); -} - -#else - -static inline void set_task_rq(struct task_struct *p, unsigned int cpu) { } -static inline struct task_group *task_group(struct task_struct *p) -{ - return NULL; -} - #endif /* CONFIG_CGROUP_SCHED */ /* CFS-related fields in a runqueue */ @@ -544,6 +498,8 @@ struct rq { struct root_domain *rd; struct sched_domain *sd; + unsigned long cpu_power; + unsigned char idle_at_tick; /* For active balancing */ int post_schedule; @@ -642,6 +598,49 @@ static inline int cpu_of(struct rq *rq) #define cpu_curr(cpu) (cpu_rq(cpu)->curr) #define raw_rq() (&__raw_get_cpu_var(runqueues)) +#ifdef CONFIG_CGROUP_SCHED + +/* + * Return the group to which this tasks belongs. + * + * We use task_subsys_state_check() and extend the RCU verification + * with lockdep_is_held(&task_rq(p)->lock) because cpu_cgroup_attach() + * holds that lock for each task it moves into the cgroup. Therefore + * by holding that lock, we pin the task to the current cgroup. + */ +static inline struct task_group *task_group(struct task_struct *p) +{ + struct cgroup_subsys_state *css; + + css = task_subsys_state_check(p, cpu_cgroup_subsys_id, + lockdep_is_held(&task_rq(p)->lock)); + return container_of(css, struct task_group, css); +} + +/* Change a task's cfs_rq and parent entity if it moves across CPUs/groups */ +static inline void set_task_rq(struct task_struct *p, unsigned int cpu) +{ +#ifdef CONFIG_FAIR_GROUP_SCHED + p->se.cfs_rq = task_group(p)->cfs_rq[cpu]; + p->se.parent = task_group(p)->se[cpu]; +#endif + +#ifdef CONFIG_RT_GROUP_SCHED + p->rt.rt_rq = task_group(p)->rt_rq[cpu]; + p->rt.parent = task_group(p)->rt_se[cpu]; +#endif +} + +#else /* CONFIG_CGROUP_SCHED */ + +static inline void set_task_rq(struct task_struct *p, unsigned int cpu) { } +static inline struct task_group *task_group(struct task_struct *p) +{ + return NULL; +} + +#endif /* CONFIG_CGROUP_SCHED */ + inline void update_rq_clock(struct rq *rq) { if (!rq->skip_clock_update) @@ -969,14 +968,6 @@ static struct rq *task_rq_lock(struct task_struct *p, unsigned long *flags) } } -void task_rq_unlock_wait(struct task_struct *p) -{ - struct rq *rq = task_rq(p); - - smp_mb(); /* spin-unlock-wait is not a full memory barrier */ - raw_spin_unlock_wait(&rq->lock); -} - static void __task_rq_unlock(struct rq *rq) __releases(rq->lock) { @@ -1263,6 +1254,12 @@ static void sched_avg_update(struct rq *rq) s64 period = sched_avg_period(); while ((s64)(rq->clock - rq->age_stamp) > period) { + /* + * Inline assembly required to prevent the compiler + * optimising this loop into a divmod call. + * See __iter_div_u64_rem() for another example of this. + */ + asm("" : "+rm" (rq->age_stamp)); rq->age_stamp += period; rq->rt_avg /= 2; } @@ -1507,24 +1504,9 @@ static unsigned long target_load(int cpu, int type) return max(rq->cpu_load[type-1], total); } -static struct sched_group *group_of(int cpu) -{ - struct sched_domain *sd = rcu_dereference_sched(cpu_rq(cpu)->sd); - - if (!sd) - return NULL; - - return sd->groups; -} - static unsigned long power_of(int cpu) { - struct sched_group *group = group_of(cpu); - - if (!group) - return SCHED_LOAD_SCALE; - - return group->cpu_power; + return cpu_rq(cpu)->cpu_power; } static int task_hot(struct task_struct *p, u64 now, struct sched_domain *sd); @@ -1681,9 +1663,6 @@ static void update_shares(struct sched_domain *sd) static void update_h_load(long cpu) { - if (root_task_group_empty()) - return; - walk_tg_tree(tg_load_down, tg_nop, (void *)cpu); } @@ -1862,8 +1841,8 @@ static void dec_nr_running(struct rq *rq) static void set_load_weight(struct task_struct *p) { if (task_has_rt_policy(p)) { - p->se.load.weight = prio_to_weight[0] * 2; - p->se.load.inv_weight = prio_to_wmult[0] >> 1; + p->se.load.weight = 0; + p->se.load.inv_weight = WMULT_CONST; return; } @@ -2515,7 +2494,16 @@ void sched_fork(struct task_struct *p, int clone_flags) if (p->sched_class->task_fork) p->sched_class->task_fork(p); + /* + * The child is not yet in the pid-hash so no cgroup attach races, + * and the cgroup is pinned to this child due to cgroup_fork() + * is ran before sched_fork(). + * + * Silence PROVE_RCU. + */ + rcu_read_lock(); set_task_cpu(p, cpu); + rcu_read_unlock(); #if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT) if (likely(sched_info_on())) @@ -2885,9 +2873,9 @@ unsigned long nr_iowait(void) return sum; } -unsigned long nr_iowait_cpu(void) +unsigned long nr_iowait_cpu(int cpu) { - struct rq *this = this_rq(); + struct rq *this = cpu_rq(cpu); return atomic_read(&this->nr_iowait); } @@ -4062,6 +4050,23 @@ int __sched wait_for_completion_killable(struct completion *x) EXPORT_SYMBOL(wait_for_completion_killable); /** + * wait_for_completion_killable_timeout: - waits for completion of a task (w/(to,killable)) + * @x: holds the state of this particular completion + * @timeout: timeout value in jiffies + * + * This waits for either a completion of a specific task to be + * signaled or for a specified timeout to expire. It can be + * interrupted by a kill signal. The timeout is in jiffies. + */ +unsigned long __sched +wait_for_completion_killable_timeout(struct completion *x, + unsigned long timeout) +{ + return wait_for_common(x, timeout, TASK_KILLABLE); +} +EXPORT_SYMBOL(wait_for_completion_killable_timeout); + +/** * try_wait_for_completion - try to decrement a completion without blocking * @x: completion structure * @@ -4469,16 +4474,6 @@ recheck: } if (user) { -#ifdef CONFIG_RT_GROUP_SCHED - /* - * Do not allow realtime tasks into groups that have no runtime - * assigned. - */ - if (rt_bandwidth_enabled() && rt_policy(policy) && - task_group(p)->rt_bandwidth.rt_runtime == 0) - return -EPERM; -#endif - retval = security_task_setscheduler(p, policy, param); if (retval) return retval; @@ -4494,6 +4489,22 @@ recheck: * runqueue lock must be held. */ rq = __task_rq_lock(p); + +#ifdef CONFIG_RT_GROUP_SCHED + if (user) { + /* + * Do not allow realtime tasks into groups that have no runtime + * assigned. + */ + if (rt_bandwidth_enabled() && rt_policy(policy) && + task_group(p)->rt_bandwidth.rt_runtime == 0) { + __task_rq_unlock(rq); + raw_spin_unlock_irqrestore(&p->pi_lock, flags); + return -EPERM; + } + } +#endif + /* recheck policy now with rq lock held */ if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) { policy = oldpolicy = -1; @@ -7596,6 +7607,7 @@ void __init sched_init(void) #ifdef CONFIG_SMP rq->sd = NULL; rq->rd = NULL; + rq->cpu_power = SCHED_LOAD_SCALE; rq->post_schedule = 0; rq->active_balance = 0; rq->next_balance = jiffies; diff --git a/kernel/sched_debug.c b/kernel/sched_debug.c index 87a330a7185f..35565395d00d 100644 --- a/kernel/sched_debug.c +++ b/kernel/sched_debug.c @@ -381,15 +381,9 @@ __initcall(init_sched_debug_procfs); void proc_sched_show_task(struct task_struct *p, struct seq_file *m) { unsigned long nr_switches; - unsigned long flags; - int num_threads = 1; - - if (lock_task_sighand(p, &flags)) { - num_threads = atomic_read(&p->signal->count); - unlock_task_sighand(p, &flags); - } - SEQ_printf(m, "%s (%d, #threads: %d)\n", p->comm, p->pid, num_threads); + SEQ_printf(m, "%s (%d, #threads: %d)\n", p->comm, p->pid, + get_nr_threads(p)); SEQ_printf(m, "---------------------------------------------------------\n"); #define __P(F) \ diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 217e4a9393e4..a878b5332daa 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -1225,7 +1225,6 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync) unsigned long this_load, load; int idx, this_cpu, prev_cpu; unsigned long tl_per_task; - unsigned int imbalance; struct task_group *tg; unsigned long weight; int balanced; @@ -1241,6 +1240,7 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync) * effect of the currently running task from the load * of the current CPU: */ + rcu_read_lock(); if (sync) { tg = task_group(current); weight = current->se.load.weight; @@ -1252,8 +1252,6 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync) tg = task_group(p); weight = p->se.load.weight; - imbalance = 100 + (sd->imbalance_pct - 100) / 2; - /* * In low-load situations, where prev_cpu is idle and this_cpu is idle * due to the sync cause above having dropped this_load to 0, we'll @@ -1263,9 +1261,22 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync) * Otherwise check if either cpus are near enough in load to allow this * task to be woken on this_cpu. */ - balanced = !this_load || - 100*(this_load + effective_load(tg, this_cpu, weight, weight)) <= - imbalance*(load + effective_load(tg, prev_cpu, 0, weight)); + if (this_load) { + unsigned long this_eff_load, prev_eff_load; + + this_eff_load = 100; + this_eff_load *= power_of(prev_cpu); + this_eff_load *= this_load + + effective_load(tg, this_cpu, weight, weight); + + prev_eff_load = 100 + (sd->imbalance_pct - 100) / 2; + prev_eff_load *= power_of(this_cpu); + prev_eff_load *= load + effective_load(tg, prev_cpu, 0, weight); + + balanced = this_eff_load <= prev_eff_load; + } else + balanced = true; + rcu_read_unlock(); /* * If the currently running task will sleep within @@ -2298,6 +2309,7 @@ static void update_cpu_power(struct sched_domain *sd, int cpu) if (!power) power = 1; + cpu_rq(cpu)->cpu_power = power; sdg->cpu_power = power; } diff --git a/kernel/signal.c b/kernel/signal.c index 825a3f24ad76..bded65187780 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -637,12 +637,12 @@ static inline bool si_fromuser(const struct siginfo *info) /* * Bad permissions for sending the signal - * - the caller must hold at least the RCU read lock + * - the caller must hold the RCU read lock */ static int check_kill_permission(int sig, struct siginfo *info, struct task_struct *t) { - const struct cred *cred = current_cred(), *tcred; + const struct cred *cred, *tcred; struct pid *sid; int error; @@ -656,8 +656,10 @@ static int check_kill_permission(int sig, struct siginfo *info, if (error) return error; + cred = current_cred(); tcred = __task_cred(t); - if ((cred->euid ^ tcred->suid) && + if (!same_thread_group(current, t) && + (cred->euid ^ tcred->suid) && (cred->euid ^ tcred->uid) && (cred->uid ^ tcred->suid) && (cred->uid ^ tcred->uid) && @@ -1083,23 +1085,24 @@ force_sig_info(int sig, struct siginfo *info, struct task_struct *t) /* * Nuke all other threads in the group. */ -void zap_other_threads(struct task_struct *p) +int zap_other_threads(struct task_struct *p) { - struct task_struct *t; + struct task_struct *t = p; + int count = 0; p->signal->group_stop_count = 0; - for (t = next_thread(p); t != p; t = next_thread(t)) { - /* - * Don't bother with already dead threads - */ + while_each_thread(p, t) { + count++; + + /* Don't bother with already dead threads */ if (t->exit_state) continue; - - /* SIGKILL will be handled before any pending SIGSTOP */ sigaddset(&t->pending.signal, SIGKILL); signal_wake_up(t, 1); } + + return count; } struct sighand_struct *lock_task_sighand(struct task_struct *tsk, unsigned long *flags) @@ -1124,11 +1127,14 @@ struct sighand_struct *lock_task_sighand(struct task_struct *tsk, unsigned long /* * send signal info to all the members of a group - * - the caller must hold the RCU read lock at least */ int group_send_sig_info(int sig, struct siginfo *info, struct task_struct *p) { - int ret = check_kill_permission(sig, info, p); + int ret; + + rcu_read_lock(); + ret = check_kill_permission(sig, info, p); + rcu_read_unlock(); if (!ret && sig) ret = do_send_sig_info(sig, info, p, true); diff --git a/kernel/smp.c b/kernel/smp.c index 3fc697336183..75c970c715d3 100644 --- a/kernel/smp.c +++ b/kernel/smp.c @@ -52,7 +52,7 @@ hotplug_cfd(struct notifier_block *nfb, unsigned long action, void *hcpu) case CPU_UP_PREPARE_FROZEN: if (!zalloc_cpumask_var_node(&cfd->cpumask, GFP_KERNEL, cpu_to_node(cpu))) - return NOTIFY_BAD; + return notifier_from_errno(-ENOMEM); break; #ifdef CONFIG_HOTPLUG_CPU diff --git a/kernel/softirq.c b/kernel/softirq.c index 0db913a5c60f..07b4f1b1a73a 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -808,7 +808,7 @@ static int __cpuinit cpu_callback(struct notifier_block *nfb, p = kthread_create(run_ksoftirqd, hcpu, "ksoftirqd/%d", hotcpu); if (IS_ERR(p)) { printk("ksoftirqd for %i failed\n", hotcpu); - return NOTIFY_BAD; + return notifier_from_errno(PTR_ERR(p)); } kthread_bind(p, hotcpu); per_cpu(ksoftirqd, hotcpu) = p; @@ -850,7 +850,7 @@ static __init int spawn_ksoftirqd(void) void *cpu = (void *)(long)smp_processor_id(); int err = cpu_callback(&cpu_nfb, CPU_UP_PREPARE, cpu); - BUG_ON(err == NOTIFY_BAD); + BUG_ON(err != NOTIFY_OK); cpu_callback(&cpu_nfb, CPU_ONLINE, cpu); register_cpu_notifier(&cpu_nfb); return 0; diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c index b4e7431e7c78..70f8d90331e9 100644 --- a/kernel/stop_machine.c +++ b/kernel/stop_machine.c @@ -321,7 +321,7 @@ static int __cpuinit cpu_stop_cpu_callback(struct notifier_block *nfb, #ifdef CONFIG_HOTPLUG_CPU case CPU_UP_CANCELED: - case CPU_DEAD: + case CPU_POST_DEAD: { struct cpu_stop_work *work; diff --git a/kernel/sys.c b/kernel/sys.c index 0d36d889c74d..e83ddbbaf89d 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -1632,9 +1632,9 @@ SYSCALL_DEFINE3(getcpu, unsigned __user *, cpup, unsigned __user *, nodep, char poweroff_cmd[POWEROFF_CMD_PATH_LEN] = "/sbin/poweroff"; -static void argv_cleanup(char **argv, char **envp) +static void argv_cleanup(struct subprocess_info *info) { - argv_free(argv); + argv_free(info->argv); } /** @@ -1668,7 +1668,7 @@ int orderly_poweroff(bool force) goto out; } - call_usermodehelper_setcleanup(info, argv_cleanup); + call_usermodehelper_setfns(info, NULL, argv_cleanup, NULL); ret = call_usermodehelper_exec(info, UMH_NO_WAIT); diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 997080f00e0b..d24f761f4876 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -1471,12 +1471,12 @@ static struct ctl_table fs_table[] = { }, #endif { - .procname = "pipe-max-pages", - .data = &pipe_max_pages, + .procname = "pipe-max-size", + .data = &pipe_max_size, .maxlen = sizeof(int), .mode = 0644, - .proc_handler = &proc_dointvec_minmax, - .extra1 = &two, + .proc_handler = &pipe_proc_fn, + .extra1 = &pipe_min_size, }, /* * NOTE: do not add new entries to this table unless you have read diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index 1d7b9bc1c034..813993b5fb61 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -154,14 +154,14 @@ static void tick_nohz_update_jiffies(ktime_t now) * Updates the per cpu time idle statistics counters */ static void -update_ts_time_stats(struct tick_sched *ts, ktime_t now, u64 *last_update_time) +update_ts_time_stats(int cpu, struct tick_sched *ts, ktime_t now, u64 *last_update_time) { ktime_t delta; if (ts->idle_active) { delta = ktime_sub(now, ts->idle_entrytime); ts->idle_sleeptime = ktime_add(ts->idle_sleeptime, delta); - if (nr_iowait_cpu() > 0) + if (nr_iowait_cpu(cpu) > 0) ts->iowait_sleeptime = ktime_add(ts->iowait_sleeptime, delta); ts->idle_entrytime = now; } @@ -175,19 +175,19 @@ static void tick_nohz_stop_idle(int cpu, ktime_t now) { struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu); - update_ts_time_stats(ts, now, NULL); + update_ts_time_stats(cpu, ts, now, NULL); ts->idle_active = 0; sched_clock_idle_wakeup_event(0); } -static ktime_t tick_nohz_start_idle(struct tick_sched *ts) +static ktime_t tick_nohz_start_idle(int cpu, struct tick_sched *ts) { ktime_t now; now = ktime_get(); - update_ts_time_stats(ts, now, NULL); + update_ts_time_stats(cpu, ts, now, NULL); ts->idle_entrytime = now; ts->idle_active = 1; @@ -216,7 +216,7 @@ u64 get_cpu_idle_time_us(int cpu, u64 *last_update_time) if (!tick_nohz_enabled) return -1; - update_ts_time_stats(ts, ktime_get(), last_update_time); + update_ts_time_stats(cpu, ts, ktime_get(), last_update_time); return ktime_to_us(ts->idle_sleeptime); } @@ -242,7 +242,7 @@ u64 get_cpu_iowait_time_us(int cpu, u64 *last_update_time) if (!tick_nohz_enabled) return -1; - update_ts_time_stats(ts, ktime_get(), last_update_time); + update_ts_time_stats(cpu, ts, ktime_get(), last_update_time); return ktime_to_us(ts->iowait_sleeptime); } @@ -284,7 +284,7 @@ void tick_nohz_stop_sched_tick(int inidle) */ ts->inidle = 1; - now = tick_nohz_start_idle(ts); + now = tick_nohz_start_idle(cpu, ts); /* * If this cpu is offline and it is the one which updates @@ -315,9 +315,6 @@ void tick_nohz_stop_sched_tick(int inidle) goto end; } - if (nohz_ratelimit(cpu)) - goto end; - ts->idle_calls++; /* Read jiffies and the time when jiffies were updated last */ do { @@ -328,7 +325,7 @@ void tick_nohz_stop_sched_tick(int inidle) } while (read_seqretry(&xtime_lock, seq)); if (rcu_needs_cpu(cpu) || printk_needs_cpu(cpu) || - arch_needs_cpu(cpu)) { + arch_needs_cpu(cpu) || nohz_ratelimit(cpu)) { next_jiffies = last_jiffies + 1; delta_jiffies = 1; } else { diff --git a/kernel/timer.c b/kernel/timer.c index be394af5bc22..efde11e197c4 100644 --- a/kernel/timer.c +++ b/kernel/timer.c @@ -577,6 +577,19 @@ static void __init_timer(struct timer_list *timer, lockdep_init_map(&timer->lockdep_map, name, key, 0); } +void setup_deferrable_timer_on_stack_key(struct timer_list *timer, + const char *name, + struct lock_class_key *key, + void (*function)(unsigned long), + unsigned long data) +{ + timer->function = function; + timer->data = data; + init_timer_on_stack_key(timer, name, key); + timer_set_deferrable(timer); +} +EXPORT_SYMBOL_GPL(setup_deferrable_timer_on_stack_key); + /** * init_timer_key - initialize a timer * @timer: the timer to be initialized @@ -752,11 +765,15 @@ unsigned long apply_slack(struct timer_list *timer, unsigned long expires) expires_limit = expires; - if (timer->slack > -1) + if (timer->slack >= 0) { expires_limit = expires + timer->slack; - else if (time_after(expires, jiffies)) /* auto slack: use 0.4% */ - expires_limit = expires + (expires - jiffies)/256; + } else { + unsigned long now = jiffies; + /* No slack, if already expired else auto slack 0.4% */ + if (time_after(expires, now)) + expires_limit = expires + (expires - now)/256; + } mask = expires ^ expires_limit; if (mask == 0) return expires; @@ -1680,11 +1697,14 @@ static int __cpuinit timer_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu) { long cpu = (long)hcpu; + int err; + switch(action) { case CPU_UP_PREPARE: case CPU_UP_PREPARE_FROZEN: - if (init_timers_cpu(cpu) < 0) - return NOTIFY_BAD; + err = init_timers_cpu(cpu); + if (err < 0) + return notifier_from_errno(err); break; #ifdef CONFIG_HOTPLUG_CPU case CPU_DEAD: @@ -1710,7 +1730,7 @@ void __init init_timers(void) init_timer_stats(); - BUG_ON(err == NOTIFY_BAD); + BUG_ON(err != NOTIFY_OK); register_cpu_notifier(&timers_nb); open_softirq(TIMER_SOFTIRQ, run_timer_softirq); } diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index b3bc91a3f510..638711c17504 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -675,28 +675,33 @@ static void blk_add_trace_rq(struct request_queue *q, struct request *rq, } } -static void blk_add_trace_rq_abort(struct request_queue *q, struct request *rq) +static void blk_add_trace_rq_abort(void *ignore, + struct request_queue *q, struct request *rq) { blk_add_trace_rq(q, rq, BLK_TA_ABORT); } -static void blk_add_trace_rq_insert(struct request_queue *q, struct request *rq) +static void blk_add_trace_rq_insert(void *ignore, + struct request_queue *q, struct request *rq) { blk_add_trace_rq(q, rq, BLK_TA_INSERT); } -static void blk_add_trace_rq_issue(struct request_queue *q, struct request *rq) +static void blk_add_trace_rq_issue(void *ignore, + struct request_queue *q, struct request *rq) { blk_add_trace_rq(q, rq, BLK_TA_ISSUE); } -static void blk_add_trace_rq_requeue(struct request_queue *q, +static void blk_add_trace_rq_requeue(void *ignore, + struct request_queue *q, struct request *rq) { blk_add_trace_rq(q, rq, BLK_TA_REQUEUE); } -static void blk_add_trace_rq_complete(struct request_queue *q, +static void blk_add_trace_rq_complete(void *ignore, + struct request_queue *q, struct request *rq) { blk_add_trace_rq(q, rq, BLK_TA_COMPLETE); @@ -724,34 +729,40 @@ static void blk_add_trace_bio(struct request_queue *q, struct bio *bio, !bio_flagged(bio, BIO_UPTODATE), 0, NULL); } -static void blk_add_trace_bio_bounce(struct request_queue *q, struct bio *bio) +static void blk_add_trace_bio_bounce(void *ignore, + struct request_queue *q, struct bio *bio) { blk_add_trace_bio(q, bio, BLK_TA_BOUNCE); } -static void blk_add_trace_bio_complete(struct request_queue *q, struct bio *bio) +static void blk_add_trace_bio_complete(void *ignore, + struct request_queue *q, struct bio *bio) { blk_add_trace_bio(q, bio, BLK_TA_COMPLETE); } -static void blk_add_trace_bio_backmerge(struct request_queue *q, +static void blk_add_trace_bio_backmerge(void *ignore, + struct request_queue *q, struct bio *bio) { blk_add_trace_bio(q, bio, BLK_TA_BACKMERGE); } -static void blk_add_trace_bio_frontmerge(struct request_queue *q, +static void blk_add_trace_bio_frontmerge(void *ignore, + struct request_queue *q, struct bio *bio) { blk_add_trace_bio(q, bio, BLK_TA_FRONTMERGE); } -static void blk_add_trace_bio_queue(struct request_queue *q, struct bio *bio) +static void blk_add_trace_bio_queue(void *ignore, + struct request_queue *q, struct bio *bio) { blk_add_trace_bio(q, bio, BLK_TA_QUEUE); } -static void blk_add_trace_getrq(struct request_queue *q, +static void blk_add_trace_getrq(void *ignore, + struct request_queue *q, struct bio *bio, int rw) { if (bio) @@ -765,7 +776,8 @@ static void blk_add_trace_getrq(struct request_queue *q, } -static void blk_add_trace_sleeprq(struct request_queue *q, +static void blk_add_trace_sleeprq(void *ignore, + struct request_queue *q, struct bio *bio, int rw) { if (bio) @@ -779,7 +791,7 @@ static void blk_add_trace_sleeprq(struct request_queue *q, } } -static void blk_add_trace_plug(struct request_queue *q) +static void blk_add_trace_plug(void *ignore, struct request_queue *q) { struct blk_trace *bt = q->blk_trace; @@ -787,7 +799,7 @@ static void blk_add_trace_plug(struct request_queue *q) __blk_add_trace(bt, 0, 0, 0, BLK_TA_PLUG, 0, 0, NULL); } -static void blk_add_trace_unplug_io(struct request_queue *q) +static void blk_add_trace_unplug_io(void *ignore, struct request_queue *q) { struct blk_trace *bt = q->blk_trace; @@ -800,7 +812,7 @@ static void blk_add_trace_unplug_io(struct request_queue *q) } } -static void blk_add_trace_unplug_timer(struct request_queue *q) +static void blk_add_trace_unplug_timer(void *ignore, struct request_queue *q) { struct blk_trace *bt = q->blk_trace; @@ -813,7 +825,8 @@ static void blk_add_trace_unplug_timer(struct request_queue *q) } } -static void blk_add_trace_split(struct request_queue *q, struct bio *bio, +static void blk_add_trace_split(void *ignore, + struct request_queue *q, struct bio *bio, unsigned int pdu) { struct blk_trace *bt = q->blk_trace; @@ -829,6 +842,7 @@ static void blk_add_trace_split(struct request_queue *q, struct bio *bio, /** * blk_add_trace_remap - Add a trace for a remap operation + * @ignore: trace callback data parameter (not used) * @q: queue the io is for * @bio: the source bio * @dev: target device @@ -839,8 +853,9 @@ static void blk_add_trace_split(struct request_queue *q, struct bio *bio, * it spans a stripe (or similar). Add a trace for that action. * **/ -static void blk_add_trace_remap(struct request_queue *q, struct bio *bio, - dev_t dev, sector_t from) +static void blk_add_trace_remap(void *ignore, + struct request_queue *q, struct bio *bio, + dev_t dev, sector_t from) { struct blk_trace *bt = q->blk_trace; struct blk_io_trace_remap r; @@ -859,6 +874,7 @@ static void blk_add_trace_remap(struct request_queue *q, struct bio *bio, /** * blk_add_trace_rq_remap - Add a trace for a request-remap operation + * @ignore: trace callback data parameter (not used) * @q: queue the io is for * @rq: the source request * @dev: target device @@ -869,7 +885,8 @@ static void blk_add_trace_remap(struct request_queue *q, struct bio *bio, * Add a trace for that action. * **/ -static void blk_add_trace_rq_remap(struct request_queue *q, +static void blk_add_trace_rq_remap(void *ignore, + struct request_queue *q, struct request *rq, dev_t dev, sector_t from) { @@ -921,64 +938,64 @@ static void blk_register_tracepoints(void) { int ret; - ret = register_trace_block_rq_abort(blk_add_trace_rq_abort); + ret = register_trace_block_rq_abort(blk_add_trace_rq_abort, NULL); WARN_ON(ret); - ret = register_trace_block_rq_insert(blk_add_trace_rq_insert); + ret = register_trace_block_rq_insert(blk_add_trace_rq_insert, NULL); WARN_ON(ret); - ret = register_trace_block_rq_issue(blk_add_trace_rq_issue); + ret = register_trace_block_rq_issue(blk_add_trace_rq_issue, NULL); WARN_ON(ret); - ret = register_trace_block_rq_requeue(blk_add_trace_rq_requeue); + ret = register_trace_block_rq_requeue(blk_add_trace_rq_requeue, NULL); WARN_ON(ret); - ret = register_trace_block_rq_complete(blk_add_trace_rq_complete); + ret = register_trace_block_rq_complete(blk_add_trace_rq_complete, NULL); WARN_ON(ret); - ret = register_trace_block_bio_bounce(blk_add_trace_bio_bounce); + ret = register_trace_block_bio_bounce(blk_add_trace_bio_bounce, NULL); WARN_ON(ret); - ret = register_trace_block_bio_complete(blk_add_trace_bio_complete); + ret = register_trace_block_bio_complete(blk_add_trace_bio_complete, NULL); WARN_ON(ret); - ret = register_trace_block_bio_backmerge(blk_add_trace_bio_backmerge); + ret = register_trace_block_bio_backmerge(blk_add_trace_bio_backmerge, NULL); WARN_ON(ret); - ret = register_trace_block_bio_frontmerge(blk_add_trace_bio_frontmerge); + ret = register_trace_block_bio_frontmerge(blk_add_trace_bio_frontmerge, NULL); WARN_ON(ret); - ret = register_trace_block_bio_queue(blk_add_trace_bio_queue); + ret = register_trace_block_bio_queue(blk_add_trace_bio_queue, NULL); WARN_ON(ret); - ret = register_trace_block_getrq(blk_add_trace_getrq); + ret = register_trace_block_getrq(blk_add_trace_getrq, NULL); WARN_ON(ret); - ret = register_trace_block_sleeprq(blk_add_trace_sleeprq); + ret = register_trace_block_sleeprq(blk_add_trace_sleeprq, NULL); WARN_ON(ret); - ret = register_trace_block_plug(blk_add_trace_plug); + ret = register_trace_block_plug(blk_add_trace_plug, NULL); WARN_ON(ret); - ret = register_trace_block_unplug_timer(blk_add_trace_unplug_timer); + ret = register_trace_block_unplug_timer(blk_add_trace_unplug_timer, NULL); WARN_ON(ret); - ret = register_trace_block_unplug_io(blk_add_trace_unplug_io); + ret = register_trace_block_unplug_io(blk_add_trace_unplug_io, NULL); WARN_ON(ret); - ret = register_trace_block_split(blk_add_trace_split); + ret = register_trace_block_split(blk_add_trace_split, NULL); WARN_ON(ret); - ret = register_trace_block_remap(blk_add_trace_remap); + ret = register_trace_block_remap(blk_add_trace_remap, NULL); WARN_ON(ret); - ret = register_trace_block_rq_remap(blk_add_trace_rq_remap); + ret = register_trace_block_rq_remap(blk_add_trace_rq_remap, NULL); WARN_ON(ret); } static void blk_unregister_tracepoints(void) { - unregister_trace_block_rq_remap(blk_add_trace_rq_remap); - unregister_trace_block_remap(blk_add_trace_remap); - unregister_trace_block_split(blk_add_trace_split); - unregister_trace_block_unplug_io(blk_add_trace_unplug_io); - unregister_trace_block_unplug_timer(blk_add_trace_unplug_timer); - unregister_trace_block_plug(blk_add_trace_plug); - unregister_trace_block_sleeprq(blk_add_trace_sleeprq); - unregister_trace_block_getrq(blk_add_trace_getrq); - unregister_trace_block_bio_queue(blk_add_trace_bio_queue); - unregister_trace_block_bio_frontmerge(blk_add_trace_bio_frontmerge); - unregister_trace_block_bio_backmerge(blk_add_trace_bio_backmerge); - unregister_trace_block_bio_complete(blk_add_trace_bio_complete); - unregister_trace_block_bio_bounce(blk_add_trace_bio_bounce); - unregister_trace_block_rq_complete(blk_add_trace_rq_complete); - unregister_trace_block_rq_requeue(blk_add_trace_rq_requeue); - unregister_trace_block_rq_issue(blk_add_trace_rq_issue); - unregister_trace_block_rq_insert(blk_add_trace_rq_insert); - unregister_trace_block_rq_abort(blk_add_trace_rq_abort); + unregister_trace_block_rq_remap(blk_add_trace_rq_remap, NULL); + unregister_trace_block_remap(blk_add_trace_remap, NULL); + unregister_trace_block_split(blk_add_trace_split, NULL); + unregister_trace_block_unplug_io(blk_add_trace_unplug_io, NULL); + unregister_trace_block_unplug_timer(blk_add_trace_unplug_timer, NULL); + unregister_trace_block_plug(blk_add_trace_plug, NULL); + unregister_trace_block_sleeprq(blk_add_trace_sleeprq, NULL); + unregister_trace_block_getrq(blk_add_trace_getrq, NULL); + unregister_trace_block_bio_queue(blk_add_trace_bio_queue, NULL); + unregister_trace_block_bio_frontmerge(blk_add_trace_bio_frontmerge, NULL); + unregister_trace_block_bio_backmerge(blk_add_trace_bio_backmerge, NULL); + unregister_trace_block_bio_complete(blk_add_trace_bio_complete, NULL); + unregister_trace_block_bio_bounce(blk_add_trace_bio_bounce, NULL); + unregister_trace_block_rq_complete(blk_add_trace_rq_complete, NULL); + unregister_trace_block_rq_requeue(blk_add_trace_rq_requeue, NULL); + unregister_trace_block_rq_issue(blk_add_trace_rq_issue, NULL); + unregister_trace_block_rq_insert(blk_add_trace_rq_insert, NULL); + unregister_trace_block_rq_abort(blk_add_trace_rq_abort, NULL); tracepoint_synchronize_unregister(); } @@ -1321,7 +1338,7 @@ out: } static enum print_line_t blk_trace_event_print(struct trace_iterator *iter, - int flags) + int flags, struct trace_event *event) { return print_one_line(iter, false); } @@ -1343,7 +1360,8 @@ static int blk_trace_synthesize_old_trace(struct trace_iterator *iter) } static enum print_line_t -blk_trace_event_print_binary(struct trace_iterator *iter, int flags) +blk_trace_event_print_binary(struct trace_iterator *iter, int flags, + struct trace_event *event) { return blk_trace_synthesize_old_trace(iter) ? TRACE_TYPE_HANDLED : TRACE_TYPE_PARTIAL_LINE; @@ -1381,12 +1399,16 @@ static struct tracer blk_tracer __read_mostly = { .set_flag = blk_tracer_set_flag, }; -static struct trace_event trace_blk_event = { - .type = TRACE_BLK, +static struct trace_event_functions trace_blk_event_funcs = { .trace = blk_trace_event_print, .binary = blk_trace_event_print_binary, }; +static struct trace_event trace_blk_event = { + .type = TRACE_BLK, + .funcs = &trace_blk_event_funcs, +}; + static int __init init_blk_tracer(void) { if (!register_ftrace_event(&trace_blk_event)) { diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 32837e19e3bd..6d2cb14f9449 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -3234,7 +3234,8 @@ free: } static void -ftrace_graph_probe_sched_switch(struct task_struct *prev, struct task_struct *next) +ftrace_graph_probe_sched_switch(void *ignore, + struct task_struct *prev, struct task_struct *next) { unsigned long long timestamp; int index; @@ -3288,7 +3289,7 @@ static int start_graph_tracing(void) } while (ret == -EAGAIN); if (!ret) { - ret = register_trace_sched_switch(ftrace_graph_probe_sched_switch); + ret = register_trace_sched_switch(ftrace_graph_probe_sched_switch, NULL); if (ret) pr_info("ftrace_graph: Couldn't activate tracepoint" " probe to kernel_sched_switch\n"); @@ -3364,7 +3365,7 @@ void unregister_ftrace_graph(void) ftrace_graph_entry = ftrace_graph_entry_stub; ftrace_shutdown(FTRACE_STOP_FUNC_RET); unregister_pm_notifier(&ftrace_suspend_notifier); - unregister_trace_sched_switch(ftrace_graph_probe_sched_switch); + unregister_trace_sched_switch(ftrace_graph_probe_sched_switch, NULL); out: mutex_unlock(&ftrace_lock); diff --git a/kernel/trace/kmemtrace.c b/kernel/trace/kmemtrace.c index a91da69f153a..bbfc1bb1660b 100644 --- a/kernel/trace/kmemtrace.c +++ b/kernel/trace/kmemtrace.c @@ -95,7 +95,8 @@ static inline void kmemtrace_free(enum kmemtrace_type_id type_id, trace_wake_up(); } -static void kmemtrace_kmalloc(unsigned long call_site, +static void kmemtrace_kmalloc(void *ignore, + unsigned long call_site, const void *ptr, size_t bytes_req, size_t bytes_alloc, @@ -105,7 +106,8 @@ static void kmemtrace_kmalloc(unsigned long call_site, bytes_req, bytes_alloc, gfp_flags, -1); } -static void kmemtrace_kmem_cache_alloc(unsigned long call_site, +static void kmemtrace_kmem_cache_alloc(void *ignore, + unsigned long call_site, const void *ptr, size_t bytes_req, size_t bytes_alloc, @@ -115,7 +117,8 @@ static void kmemtrace_kmem_cache_alloc(unsigned long call_site, bytes_req, bytes_alloc, gfp_flags, -1); } -static void kmemtrace_kmalloc_node(unsigned long call_site, +static void kmemtrace_kmalloc_node(void *ignore, + unsigned long call_site, const void *ptr, size_t bytes_req, size_t bytes_alloc, @@ -126,7 +129,8 @@ static void kmemtrace_kmalloc_node(unsigned long call_site, bytes_req, bytes_alloc, gfp_flags, node); } -static void kmemtrace_kmem_cache_alloc_node(unsigned long call_site, +static void kmemtrace_kmem_cache_alloc_node(void *ignore, + unsigned long call_site, const void *ptr, size_t bytes_req, size_t bytes_alloc, @@ -137,12 +141,14 @@ static void kmemtrace_kmem_cache_alloc_node(unsigned long call_site, bytes_req, bytes_alloc, gfp_flags, node); } -static void kmemtrace_kfree(unsigned long call_site, const void *ptr) +static void +kmemtrace_kfree(void *ignore, unsigned long call_site, const void *ptr) { kmemtrace_free(KMEMTRACE_TYPE_KMALLOC, call_site, ptr); } -static void kmemtrace_kmem_cache_free(unsigned long call_site, const void *ptr) +static void kmemtrace_kmem_cache_free(void *ignore, + unsigned long call_site, const void *ptr) { kmemtrace_free(KMEMTRACE_TYPE_CACHE, call_site, ptr); } @@ -151,34 +157,34 @@ static int kmemtrace_start_probes(void) { int err; - err = register_trace_kmalloc(kmemtrace_kmalloc); + err = register_trace_kmalloc(kmemtrace_kmalloc, NULL); if (err) return err; - err = register_trace_kmem_cache_alloc(kmemtrace_kmem_cache_alloc); + err = register_trace_kmem_cache_alloc(kmemtrace_kmem_cache_alloc, NULL); if (err) return err; - err = register_trace_kmalloc_node(kmemtrace_kmalloc_node); + err = register_trace_kmalloc_node(kmemtrace_kmalloc_node, NULL); if (err) return err; - err = register_trace_kmem_cache_alloc_node(kmemtrace_kmem_cache_alloc_node); + err = register_trace_kmem_cache_alloc_node(kmemtrace_kmem_cache_alloc_node, NULL); if (err) return err; - err = register_trace_kfree(kmemtrace_kfree); + err = register_trace_kfree(kmemtrace_kfree, NULL); if (err) return err; - err = register_trace_kmem_cache_free(kmemtrace_kmem_cache_free); + err = register_trace_kmem_cache_free(kmemtrace_kmem_cache_free, NULL); return err; } static void kmemtrace_stop_probes(void) { - unregister_trace_kmalloc(kmemtrace_kmalloc); - unregister_trace_kmem_cache_alloc(kmemtrace_kmem_cache_alloc); - unregister_trace_kmalloc_node(kmemtrace_kmalloc_node); - unregister_trace_kmem_cache_alloc_node(kmemtrace_kmem_cache_alloc_node); - unregister_trace_kfree(kmemtrace_kfree); - unregister_trace_kmem_cache_free(kmemtrace_kmem_cache_free); + unregister_trace_kmalloc(kmemtrace_kmalloc, NULL); + unregister_trace_kmem_cache_alloc(kmemtrace_kmem_cache_alloc, NULL); + unregister_trace_kmalloc_node(kmemtrace_kmalloc_node, NULL); + unregister_trace_kmem_cache_alloc_node(kmemtrace_kmem_cache_alloc_node, NULL); + unregister_trace_kfree(kmemtrace_kfree, NULL); + unregister_trace_kmem_cache_free(kmemtrace_kmem_cache_free, NULL); } static int kmem_trace_init(struct trace_array *tr) @@ -237,7 +243,8 @@ struct kmemtrace_user_event_alloc { }; static enum print_line_t -kmemtrace_print_alloc(struct trace_iterator *iter, int flags) +kmemtrace_print_alloc(struct trace_iterator *iter, int flags, + struct trace_event *event) { struct trace_seq *s = &iter->seq; struct kmemtrace_alloc_entry *entry; @@ -257,7 +264,8 @@ kmemtrace_print_alloc(struct trace_iterator *iter, int flags) } static enum print_line_t -kmemtrace_print_free(struct trace_iterator *iter, int flags) +kmemtrace_print_free(struct trace_iterator *iter, int flags, + struct trace_event *event) { struct trace_seq *s = &iter->seq; struct kmemtrace_free_entry *entry; @@ -275,7 +283,8 @@ kmemtrace_print_free(struct trace_iterator *iter, int flags) } static enum print_line_t -kmemtrace_print_alloc_user(struct trace_iterator *iter, int flags) +kmemtrace_print_alloc_user(struct trace_iterator *iter, int flags, + struct trace_event *event) { struct trace_seq *s = &iter->seq; struct kmemtrace_alloc_entry *entry; @@ -309,7 +318,8 @@ kmemtrace_print_alloc_user(struct trace_iterator *iter, int flags) } static enum print_line_t -kmemtrace_print_free_user(struct trace_iterator *iter, int flags) +kmemtrace_print_free_user(struct trace_iterator *iter, int flags, + struct trace_event *event) { struct trace_seq *s = &iter->seq; struct kmemtrace_free_entry *entry; @@ -463,18 +473,26 @@ static enum print_line_t kmemtrace_print_line(struct trace_iterator *iter) } } -static struct trace_event kmem_trace_alloc = { - .type = TRACE_KMEM_ALLOC, +static struct trace_event_functions kmem_trace_alloc_funcs = { .trace = kmemtrace_print_alloc, .binary = kmemtrace_print_alloc_user, }; -static struct trace_event kmem_trace_free = { - .type = TRACE_KMEM_FREE, +static struct trace_event kmem_trace_alloc = { + .type = TRACE_KMEM_ALLOC, + .funcs = &kmem_trace_alloc_funcs, +}; + +static struct trace_event_functions kmem_trace_free_funcs = { .trace = kmemtrace_print_free, .binary = kmemtrace_print_free_user, }; +static struct trace_event kmem_trace_free = { + .type = TRACE_KMEM_FREE, + .funcs = &kmem_trace_free_funcs, +}; + static struct tracer kmem_tracer __read_mostly = { .name = "kmemtrace", .init = kmem_trace_init, diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 7f6059c5aa94..1da7b6ea8b85 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -1768,6 +1768,14 @@ rb_reset_tail(struct ring_buffer_per_cpu *cpu_buffer, * must fill the old tail_page with padding. */ if (tail >= BUF_PAGE_SIZE) { + /* + * If the page was filled, then we still need + * to update the real_end. Reset it to zero + * and the reader will ignore it. + */ + if (tail == BUF_PAGE_SIZE) + tail_page->real_end = 0; + local_sub(length, &tail_page->write); return; } @@ -3894,12 +3902,12 @@ int ring_buffer_read_page(struct ring_buffer *buffer, ret = read; cpu_buffer->lost_events = 0; + + commit = local_read(&bpage->commit); /* * Set a flag in the commit field if we lost events */ if (missed_events) { - commit = local_read(&bpage->commit); - /* If there is room at the end of the page to save the * missed events, then record it there. */ @@ -3907,10 +3915,17 @@ int ring_buffer_read_page(struct ring_buffer *buffer, memcpy(&bpage->data[commit], &missed_events, sizeof(missed_events)); local_add(RB_MISSED_STORED, &bpage->commit); + commit += sizeof(missed_events); } local_add(RB_MISSED_EVENTS, &bpage->commit); } + /* + * This page may be off to user land. Zero it out here. + */ + if (commit < BUF_PAGE_SIZE) + memset(&bpage->data[commit], 0, BUF_PAGE_SIZE - commit); + out_unlock: spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 8a76339a9e65..086d36316805 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -1936,7 +1936,7 @@ static enum print_line_t print_trace_fmt(struct trace_iterator *iter) } if (event) - return event->trace(iter, sym_flags); + return event->funcs->trace(iter, sym_flags, event); if (!trace_seq_printf(s, "Unknown type %d\n", entry->type)) goto partial; @@ -1962,7 +1962,7 @@ static enum print_line_t print_raw_fmt(struct trace_iterator *iter) event = ftrace_find_event(entry->type); if (event) - return event->raw(iter, 0); + return event->funcs->raw(iter, 0, event); if (!trace_seq_printf(s, "%d ?\n", entry->type)) goto partial; @@ -1989,7 +1989,7 @@ static enum print_line_t print_hex_fmt(struct trace_iterator *iter) event = ftrace_find_event(entry->type); if (event) { - enum print_line_t ret = event->hex(iter, 0); + enum print_line_t ret = event->funcs->hex(iter, 0, event); if (ret != TRACE_TYPE_HANDLED) return ret; } @@ -2014,7 +2014,8 @@ static enum print_line_t print_bin_fmt(struct trace_iterator *iter) } event = ftrace_find_event(entry->type); - return event ? event->binary(iter, 0) : TRACE_TYPE_HANDLED; + return event ? event->funcs->binary(iter, 0, event) : + TRACE_TYPE_HANDLED; } int trace_empty(struct trace_iterator *iter) @@ -3665,7 +3666,6 @@ tracing_buffers_read(struct file *filp, char __user *ubuf, size_t count, loff_t *ppos) { struct ftrace_buffer_info *info = filp->private_data; - unsigned int pos; ssize_t ret; size_t size; @@ -3692,11 +3692,6 @@ tracing_buffers_read(struct file *filp, char __user *ubuf, if (ret < 0) return 0; - pos = ring_buffer_page_len(info->spare); - - if (pos < PAGE_SIZE) - memset(info->spare + pos, 0, PAGE_SIZE - pos); - read: size = PAGE_SIZE - info->read; if (size > count) diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index d1ce0bec1b3f..2cd96399463f 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -405,12 +405,12 @@ void ftrace_trace_userstack(struct ring_buffer *buffer, unsigned long flags, void __trace_stack(struct trace_array *tr, unsigned long flags, int skip, int pc); #else -static inline void ftrace_trace_stack(struct trace_array *tr, +static inline void ftrace_trace_stack(struct ring_buffer *buffer, unsigned long flags, int skip, int pc) { } -static inline void ftrace_trace_userstack(struct trace_array *tr, +static inline void ftrace_trace_userstack(struct ring_buffer *buffer, unsigned long flags, int pc) { } @@ -778,12 +778,15 @@ extern void print_subsystem_event_filter(struct event_subsystem *system, struct trace_seq *s); extern int filter_assign_type(const char *type); +struct list_head * +trace_get_fields(struct ftrace_event_call *event_call); + static inline int filter_check_discard(struct ftrace_event_call *call, void *rec, struct ring_buffer *buffer, struct ring_buffer_event *event) { - if (unlikely(call->filter_active) && + if (unlikely(call->flags & TRACE_EVENT_FL_FILTERED) && !filter_match_preds(call->filter, rec)) { ring_buffer_discard_commit(buffer, event); return 1; diff --git a/kernel/trace/trace_branch.c b/kernel/trace/trace_branch.c index b9bc4d470177..8d3538b4ea5f 100644 --- a/kernel/trace/trace_branch.c +++ b/kernel/trace/trace_branch.c @@ -143,7 +143,7 @@ static void branch_trace_reset(struct trace_array *tr) } static enum print_line_t trace_branch_print(struct trace_iterator *iter, - int flags) + int flags, struct trace_event *event) { struct trace_branch *field; @@ -167,9 +167,13 @@ static void branch_print_header(struct seq_file *s) " |\n"); } +static struct trace_event_functions trace_branch_funcs = { + .trace = trace_branch_print, +}; + static struct trace_event trace_branch_event = { .type = TRACE_BRANCH, - .trace = trace_branch_print, + .funcs = &trace_branch_funcs, }; static struct tracer branch_trace __read_mostly = diff --git a/kernel/trace/trace_event_perf.c b/kernel/trace/trace_event_perf.c index 0565bb42566f..8a2b73f7c068 100644 --- a/kernel/trace/trace_event_perf.c +++ b/kernel/trace/trace_event_perf.c @@ -9,13 +9,9 @@ #include <linux/kprobes.h> #include "trace.h" -DEFINE_PER_CPU(struct pt_regs, perf_trace_regs); -EXPORT_PER_CPU_SYMBOL_GPL(perf_trace_regs); - EXPORT_SYMBOL_GPL(perf_arch_fetch_caller_regs); -static char *perf_trace_buf; -static char *perf_trace_buf_nmi; +static char *perf_trace_buf[4]; /* * Force it to be aligned to unsigned long to avoid misaligned accesses @@ -27,57 +23,84 @@ typedef typeof(unsigned long [PERF_MAX_TRACE_SIZE / sizeof(unsigned long)]) /* Count the events in use (per event id, not per instance) */ static int total_ref_count; -static int perf_trace_event_enable(struct ftrace_event_call *event) +static int perf_trace_event_init(struct ftrace_event_call *tp_event, + struct perf_event *p_event) { - char *buf; + struct hlist_head *list; int ret = -ENOMEM; + int cpu; - if (event->perf_refcount++ > 0) + p_event->tp_event = tp_event; + if (tp_event->perf_refcount++ > 0) return 0; - if (!total_ref_count) { - buf = (char *)alloc_percpu(perf_trace_t); - if (!buf) - goto fail_buf; + list = alloc_percpu(struct hlist_head); + if (!list) + goto fail; - rcu_assign_pointer(perf_trace_buf, buf); + for_each_possible_cpu(cpu) + INIT_HLIST_HEAD(per_cpu_ptr(list, cpu)); - buf = (char *)alloc_percpu(perf_trace_t); - if (!buf) - goto fail_buf_nmi; + tp_event->perf_events = list; - rcu_assign_pointer(perf_trace_buf_nmi, buf); - } + if (!total_ref_count) { + char *buf; + int i; - ret = event->perf_event_enable(event); - if (!ret) { - total_ref_count++; - return 0; + for (i = 0; i < 4; i++) { + buf = (char *)alloc_percpu(perf_trace_t); + if (!buf) + goto fail; + + perf_trace_buf[i] = buf; + } } -fail_buf_nmi: + if (tp_event->class->reg) + ret = tp_event->class->reg(tp_event, TRACE_REG_PERF_REGISTER); + else + ret = tracepoint_probe_register(tp_event->name, + tp_event->class->perf_probe, + tp_event); + + if (ret) + goto fail; + + total_ref_count++; + return 0; + +fail: if (!total_ref_count) { - free_percpu(perf_trace_buf_nmi); - free_percpu(perf_trace_buf); - perf_trace_buf_nmi = NULL; - perf_trace_buf = NULL; + int i; + + for (i = 0; i < 4; i++) { + free_percpu(perf_trace_buf[i]); + perf_trace_buf[i] = NULL; + } + } + + if (!--tp_event->perf_refcount) { + free_percpu(tp_event->perf_events); + tp_event->perf_events = NULL; } -fail_buf: - event->perf_refcount--; return ret; } -int perf_trace_enable(int event_id) +int perf_trace_init(struct perf_event *p_event) { - struct ftrace_event_call *event; + struct ftrace_event_call *tp_event; + int event_id = p_event->attr.config; int ret = -EINVAL; mutex_lock(&event_mutex); - list_for_each_entry(event, &ftrace_events, list) { - if (event->id == event_id && event->perf_event_enable && - try_module_get(event->mod)) { - ret = perf_trace_event_enable(event); + list_for_each_entry(tp_event, &ftrace_events, list) { + if (tp_event->event.type == event_id && + tp_event->class && + (tp_event->class->perf_probe || + tp_event->class->reg) && + try_module_get(tp_event->mod)) { + ret = perf_trace_event_init(tp_event, p_event); break; } } @@ -86,90 +109,87 @@ int perf_trace_enable(int event_id) return ret; } -static void perf_trace_event_disable(struct ftrace_event_call *event) +int perf_trace_enable(struct perf_event *p_event) { - char *buf, *nmi_buf; + struct ftrace_event_call *tp_event = p_event->tp_event; + struct hlist_head *list; - if (--event->perf_refcount > 0) - return; + list = tp_event->perf_events; + if (WARN_ON_ONCE(!list)) + return -EINVAL; - event->perf_event_disable(event); - - if (!--total_ref_count) { - buf = perf_trace_buf; - rcu_assign_pointer(perf_trace_buf, NULL); + list = this_cpu_ptr(list); + hlist_add_head_rcu(&p_event->hlist_entry, list); - nmi_buf = perf_trace_buf_nmi; - rcu_assign_pointer(perf_trace_buf_nmi, NULL); - - /* - * Ensure every events in profiling have finished before - * releasing the buffers - */ - synchronize_sched(); + return 0; +} - free_percpu(buf); - free_percpu(nmi_buf); - } +void perf_trace_disable(struct perf_event *p_event) +{ + hlist_del_rcu(&p_event->hlist_entry); } -void perf_trace_disable(int event_id) +void perf_trace_destroy(struct perf_event *p_event) { - struct ftrace_event_call *event; + struct ftrace_event_call *tp_event = p_event->tp_event; + int i; mutex_lock(&event_mutex); - list_for_each_entry(event, &ftrace_events, list) { - if (event->id == event_id) { - perf_trace_event_disable(event); - module_put(event->mod); - break; + if (--tp_event->perf_refcount > 0) + goto out; + + if (tp_event->class->reg) + tp_event->class->reg(tp_event, TRACE_REG_PERF_UNREGISTER); + else + tracepoint_probe_unregister(tp_event->name, + tp_event->class->perf_probe, + tp_event); + + /* + * Ensure our callback won't be called anymore. See + * tracepoint_probe_unregister() and __DO_TRACE(). + */ + synchronize_sched(); + + free_percpu(tp_event->perf_events); + tp_event->perf_events = NULL; + + if (!--total_ref_count) { + for (i = 0; i < 4; i++) { + free_percpu(perf_trace_buf[i]); + perf_trace_buf[i] = NULL; } } +out: mutex_unlock(&event_mutex); } __kprobes void *perf_trace_buf_prepare(int size, unsigned short type, - int *rctxp, unsigned long *irq_flags) + struct pt_regs *regs, int *rctxp) { struct trace_entry *entry; - char *trace_buf, *raw_data; - int pc, cpu; + unsigned long flags; + char *raw_data; + int pc; BUILD_BUG_ON(PERF_MAX_TRACE_SIZE % sizeof(unsigned long)); pc = preempt_count(); - /* Protect the per cpu buffer, begin the rcu read side */ - local_irq_save(*irq_flags); - *rctxp = perf_swevent_get_recursion_context(); if (*rctxp < 0) - goto err_recursion; - - cpu = smp_processor_id(); - - if (in_nmi()) - trace_buf = rcu_dereference_sched(perf_trace_buf_nmi); - else - trace_buf = rcu_dereference_sched(perf_trace_buf); - - if (!trace_buf) - goto err; + return NULL; - raw_data = per_cpu_ptr(trace_buf, cpu); + raw_data = this_cpu_ptr(perf_trace_buf[*rctxp]); /* zero the dead bytes from align to not leak stack to user */ memset(&raw_data[size - sizeof(u64)], 0, sizeof(u64)); entry = (struct trace_entry *)raw_data; - tracing_generic_entry_update(entry, *irq_flags, pc); + local_save_flags(flags); + tracing_generic_entry_update(entry, flags, pc); entry->type = type; return raw_data; -err: - perf_swevent_put_recursion_context(*rctxp); -err_recursion: - local_irq_restore(*irq_flags); - return NULL; } EXPORT_SYMBOL_GPL(perf_trace_buf_prepare); diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index c697c7043349..53cffc0b0801 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -29,11 +29,23 @@ DEFINE_MUTEX(event_mutex); LIST_HEAD(ftrace_events); +struct list_head * +trace_get_fields(struct ftrace_event_call *event_call) +{ + if (!event_call->class->get_fields) + return &event_call->class->fields; + return event_call->class->get_fields(event_call); +} + int trace_define_field(struct ftrace_event_call *call, const char *type, const char *name, int offset, int size, int is_signed, int filter_type) { struct ftrace_event_field *field; + struct list_head *head; + + if (WARN_ON(!call->class)) + return 0; field = kzalloc(sizeof(*field), GFP_KERNEL); if (!field) @@ -56,7 +68,8 @@ int trace_define_field(struct ftrace_event_call *call, const char *type, field->size = size; field->is_signed = is_signed; - list_add(&field->link, &call->fields); + head = trace_get_fields(call); + list_add(&field->link, head); return 0; @@ -94,8 +107,10 @@ static int trace_define_common_fields(struct ftrace_event_call *call) void trace_destroy_fields(struct ftrace_event_call *call) { struct ftrace_event_field *field, *next; + struct list_head *head; - list_for_each_entry_safe(field, next, &call->fields, link) { + head = trace_get_fields(call); + list_for_each_entry_safe(field, next, head, link) { list_del(&field->link); kfree(field->type); kfree(field->name); @@ -107,11 +122,9 @@ int trace_event_raw_init(struct ftrace_event_call *call) { int id; - id = register_ftrace_event(call->event); + id = register_ftrace_event(&call->event); if (!id) return -ENODEV; - call->id = id; - INIT_LIST_HEAD(&call->fields); return 0; } @@ -124,23 +137,33 @@ static int ftrace_event_enable_disable(struct ftrace_event_call *call, switch (enable) { case 0: - if (call->enabled) { - call->enabled = 0; + if (call->flags & TRACE_EVENT_FL_ENABLED) { + call->flags &= ~TRACE_EVENT_FL_ENABLED; tracing_stop_cmdline_record(); - call->unregfunc(call); + if (call->class->reg) + call->class->reg(call, TRACE_REG_UNREGISTER); + else + tracepoint_probe_unregister(call->name, + call->class->probe, + call); } break; case 1: - if (!call->enabled) { + if (!(call->flags & TRACE_EVENT_FL_ENABLED)) { tracing_start_cmdline_record(); - ret = call->regfunc(call); + if (call->class->reg) + ret = call->class->reg(call, TRACE_REG_REGISTER); + else + ret = tracepoint_probe_register(call->name, + call->class->probe, + call); if (ret) { tracing_stop_cmdline_record(); pr_info("event trace: Could not enable event " "%s\n", call->name); break; } - call->enabled = 1; + call->flags |= TRACE_EVENT_FL_ENABLED; } break; } @@ -171,15 +194,16 @@ static int __ftrace_set_clr_event(const char *match, const char *sub, mutex_lock(&event_mutex); list_for_each_entry(call, &ftrace_events, list) { - if (!call->name || !call->regfunc) + if (!call->name || !call->class || + (!call->class->probe && !call->class->reg)) continue; if (match && strcmp(match, call->name) != 0 && - strcmp(match, call->system) != 0) + strcmp(match, call->class->system) != 0) continue; - if (sub && strcmp(sub, call->system) != 0) + if (sub && strcmp(sub, call->class->system) != 0) continue; if (event && strcmp(event, call->name) != 0) @@ -297,7 +321,7 @@ t_next(struct seq_file *m, void *v, loff_t *pos) * The ftrace subsystem is for showing formats only. * They can not be enabled or disabled via the event files. */ - if (call->regfunc) + if (call->class && (call->class->probe || call->class->reg)) return call; } @@ -328,7 +352,7 @@ s_next(struct seq_file *m, void *v, loff_t *pos) (*pos)++; list_for_each_entry_continue(call, &ftrace_events, list) { - if (call->enabled) + if (call->flags & TRACE_EVENT_FL_ENABLED) return call; } @@ -355,8 +379,8 @@ static int t_show(struct seq_file *m, void *v) { struct ftrace_event_call *call = v; - if (strcmp(call->system, TRACE_SYSTEM) != 0) - seq_printf(m, "%s:", call->system); + if (strcmp(call->class->system, TRACE_SYSTEM) != 0) + seq_printf(m, "%s:", call->class->system); seq_printf(m, "%s\n", call->name); return 0; @@ -387,7 +411,7 @@ event_enable_read(struct file *filp, char __user *ubuf, size_t cnt, struct ftrace_event_call *call = filp->private_data; char *buf; - if (call->enabled) + if (call->flags & TRACE_EVENT_FL_ENABLED) buf = "1\n"; else buf = "0\n"; @@ -450,10 +474,11 @@ system_enable_read(struct file *filp, char __user *ubuf, size_t cnt, mutex_lock(&event_mutex); list_for_each_entry(call, &ftrace_events, list) { - if (!call->name || !call->regfunc) + if (!call->name || !call->class || + (!call->class->probe && !call->class->reg)) continue; - if (system && strcmp(call->system, system) != 0) + if (system && strcmp(call->class->system, system) != 0) continue; /* @@ -461,7 +486,7 @@ system_enable_read(struct file *filp, char __user *ubuf, size_t cnt, * or if all events or cleared, or if we have * a mixture. */ - set |= (1 << !!call->enabled); + set |= (1 << !!(call->flags & TRACE_EVENT_FL_ENABLED)); /* * If we have a mixture, no need to look further. @@ -525,6 +550,7 @@ event_format_read(struct file *filp, char __user *ubuf, size_t cnt, { struct ftrace_event_call *call = filp->private_data; struct ftrace_event_field *field; + struct list_head *head; struct trace_seq *s; int common_field_count = 5; char *buf; @@ -540,10 +566,11 @@ event_format_read(struct file *filp, char __user *ubuf, size_t cnt, trace_seq_init(s); trace_seq_printf(s, "name: %s\n", call->name); - trace_seq_printf(s, "ID: %d\n", call->id); + trace_seq_printf(s, "ID: %d\n", call->event.type); trace_seq_printf(s, "format:\n"); - list_for_each_entry_reverse(field, &call->fields, link) { + head = trace_get_fields(call); + list_for_each_entry_reverse(field, head, link) { /* * Smartly shows the array type(except dynamic array). * Normal: @@ -613,7 +640,7 @@ event_id_read(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos) return -ENOMEM; trace_seq_init(s); - trace_seq_printf(s, "%d\n", call->id); + trace_seq_printf(s, "%d\n", call->event.type); r = simple_read_from_buffer(ubuf, cnt, ppos, s->buffer, s->len); @@ -919,14 +946,15 @@ event_create_dir(struct ftrace_event_call *call, struct dentry *d_events, const struct file_operations *filter, const struct file_operations *format) { + struct list_head *head; int ret; /* * If the trace point header did not define TRACE_SYSTEM * then the system would be called "TRACE_SYSTEM". */ - if (strcmp(call->system, TRACE_SYSTEM) != 0) - d_events = event_subsystem_dir(call->system, d_events); + if (strcmp(call->class->system, TRACE_SYSTEM) != 0) + d_events = event_subsystem_dir(call->class->system, d_events); call->dir = debugfs_create_dir(call->name, d_events); if (!call->dir) { @@ -935,22 +963,31 @@ event_create_dir(struct ftrace_event_call *call, struct dentry *d_events, return -1; } - if (call->regfunc) + if (call->class->probe || call->class->reg) trace_create_file("enable", 0644, call->dir, call, enable); - if (call->id && call->perf_event_enable) +#ifdef CONFIG_PERF_EVENTS + if (call->event.type && (call->class->perf_probe || call->class->reg)) trace_create_file("id", 0444, call->dir, call, id); +#endif - if (call->define_fields) { - ret = trace_define_common_fields(call); - if (!ret) - ret = call->define_fields(call); - if (ret < 0) { - pr_warning("Could not initialize trace point" - " events/%s\n", call->name); - return ret; + if (call->class->define_fields) { + /* + * Other events may have the same class. Only update + * the fields if they are not already defined. + */ + head = trace_get_fields(call); + if (list_empty(head)) { + ret = trace_define_common_fields(call); + if (!ret) + ret = call->class->define_fields(call); + if (ret < 0) { + pr_warning("Could not initialize trace point" + " events/%s\n", call->name); + return ret; + } } trace_create_file("filter", 0644, call->dir, call, filter); @@ -970,8 +1007,8 @@ static int __trace_add_event_call(struct ftrace_event_call *call) if (!call->name) return -EINVAL; - if (call->raw_init) { - ret = call->raw_init(call); + if (call->class->raw_init) { + ret = call->class->raw_init(call); if (ret < 0) { if (ret != -ENOSYS) pr_warning("Could not initialize trace " @@ -1035,13 +1072,13 @@ static void remove_subsystem_dir(const char *name) static void __trace_remove_event_call(struct ftrace_event_call *call) { ftrace_event_enable_disable(call, 0); - if (call->event) - __unregister_ftrace_event(call->event); + if (call->event.funcs) + __unregister_ftrace_event(&call->event); debugfs_remove_recursive(call->dir); list_del(&call->list); trace_destroy_fields(call); destroy_preds(call); - remove_subsystem_dir(call->system); + remove_subsystem_dir(call->class->system); } /* Remove an event_call */ @@ -1132,8 +1169,8 @@ static void trace_module_add_events(struct module *mod) /* The linker may leave blanks */ if (!call->name) continue; - if (call->raw_init) { - ret = call->raw_init(call); + if (call->class->raw_init) { + ret = call->class->raw_init(call); if (ret < 0) { if (ret != -ENOSYS) pr_warning("Could not initialize trace " @@ -1286,8 +1323,8 @@ static __init int event_trace_init(void) /* The linker may leave blanks */ if (!call->name) continue; - if (call->raw_init) { - ret = call->raw_init(call); + if (call->class->raw_init) { + ret = call->class->raw_init(call); if (ret < 0) { if (ret != -ENOSYS) pr_warning("Could not initialize trace " @@ -1388,8 +1425,8 @@ static __init void event_trace_self_tests(void) list_for_each_entry(call, &ftrace_events, list) { - /* Only test those that have a regfunc */ - if (!call->regfunc) + /* Only test those that have a probe */ + if (!call->class || !call->class->probe) continue; /* @@ -1399,8 +1436,8 @@ static __init void event_trace_self_tests(void) * syscalls as we test. */ #ifndef CONFIG_EVENT_TRACE_TEST_SYSCALLS - if (call->system && - strcmp(call->system, "syscalls") == 0) + if (call->class->system && + strcmp(call->class->system, "syscalls") == 0) continue; #endif @@ -1410,7 +1447,7 @@ static __init void event_trace_self_tests(void) * If an event is already enabled, someone is using * it and the self test should not be on. */ - if (call->enabled) { + if (call->flags & TRACE_EVENT_FL_ENABLED) { pr_warning("Enabled event during self test!\n"); WARN_ON_ONCE(1); continue; diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c index 58092d844a1f..57bb1bb32999 100644 --- a/kernel/trace/trace_events_filter.c +++ b/kernel/trace/trace_events_filter.c @@ -500,8 +500,10 @@ static struct ftrace_event_field * find_event_field(struct ftrace_event_call *call, char *name) { struct ftrace_event_field *field; + struct list_head *head; - list_for_each_entry(field, &call->fields, link) { + head = trace_get_fields(call); + list_for_each_entry(field, head, link) { if (!strcmp(field->name, name)) return field; } @@ -545,7 +547,7 @@ static void filter_disable_preds(struct ftrace_event_call *call) struct event_filter *filter = call->filter; int i; - call->filter_active = 0; + call->flags &= ~TRACE_EVENT_FL_FILTERED; filter->n_preds = 0; for (i = 0; i < MAX_FILTER_PRED; i++) @@ -572,7 +574,7 @@ void destroy_preds(struct ftrace_event_call *call) { __free_preds(call->filter); call->filter = NULL; - call->filter_active = 0; + call->flags &= ~TRACE_EVENT_FL_FILTERED; } static struct event_filter *__alloc_preds(void) @@ -611,7 +613,7 @@ static int init_preds(struct ftrace_event_call *call) if (call->filter) return 0; - call->filter_active = 0; + call->flags &= ~TRACE_EVENT_FL_FILTERED; call->filter = __alloc_preds(); if (IS_ERR(call->filter)) return PTR_ERR(call->filter); @@ -625,10 +627,10 @@ static int init_subsystem_preds(struct event_subsystem *system) int err; list_for_each_entry(call, &ftrace_events, list) { - if (!call->define_fields) + if (!call->class || !call->class->define_fields) continue; - if (strcmp(call->system, system->name) != 0) + if (strcmp(call->class->system, system->name) != 0) continue; err = init_preds(call); @@ -644,10 +646,10 @@ static void filter_free_subsystem_preds(struct event_subsystem *system) struct ftrace_event_call *call; list_for_each_entry(call, &ftrace_events, list) { - if (!call->define_fields) + if (!call->class || !call->class->define_fields) continue; - if (strcmp(call->system, system->name) != 0) + if (strcmp(call->class->system, system->name) != 0) continue; filter_disable_preds(call); @@ -1249,10 +1251,10 @@ static int replace_system_preds(struct event_subsystem *system, list_for_each_entry(call, &ftrace_events, list) { struct event_filter *filter = call->filter; - if (!call->define_fields) + if (!call->class || !call->class->define_fields) continue; - if (strcmp(call->system, system->name) != 0) + if (strcmp(call->class->system, system->name) != 0) continue; /* try to see if the filter can be applied */ @@ -1266,7 +1268,7 @@ static int replace_system_preds(struct event_subsystem *system, if (err) filter_disable_preds(call); else { - call->filter_active = 1; + call->flags |= TRACE_EVENT_FL_FILTERED; replace_filter_string(filter, filter_string); } fail = false; @@ -1315,7 +1317,7 @@ int apply_event_filter(struct ftrace_event_call *call, char *filter_string) if (err) append_filter_err(ps, call->filter); else - call->filter_active = 1; + call->flags |= TRACE_EVENT_FL_FILTERED; out: filter_opstack_clear(ps); postfix_clear(ps); @@ -1393,7 +1395,7 @@ int ftrace_profile_set_filter(struct perf_event *event, int event_id, mutex_lock(&event_mutex); list_for_each_entry(call, &ftrace_events, list) { - if (call->id == event_id) + if (call->event.type == event_id) break; } diff --git a/kernel/trace/trace_export.c b/kernel/trace/trace_export.c index e091f64ba6ce..8536e2a65969 100644 --- a/kernel/trace/trace_export.c +++ b/kernel/trace/trace_export.c @@ -127,7 +127,7 @@ ftrace_define_fields_##name(struct ftrace_event_call *event_call) \ static int ftrace_raw_init_event(struct ftrace_event_call *call) { - INIT_LIST_HEAD(&call->fields); + INIT_LIST_HEAD(&call->class->fields); return 0; } @@ -153,17 +153,21 @@ static int ftrace_raw_init_event(struct ftrace_event_call *call) #define F_printk(fmt, args...) #fmt ", " __stringify(args) #undef FTRACE_ENTRY -#define FTRACE_ENTRY(call, struct_name, type, tstruct, print) \ +#define FTRACE_ENTRY(call, struct_name, etype, tstruct, print) \ + \ +struct ftrace_event_class event_class_ftrace_##call = { \ + .system = __stringify(TRACE_SYSTEM), \ + .define_fields = ftrace_define_fields_##call, \ + .raw_init = ftrace_raw_init_event, \ +}; \ \ struct ftrace_event_call __used \ __attribute__((__aligned__(4))) \ __attribute__((section("_ftrace_events"))) event_##call = { \ .name = #call, \ - .id = type, \ - .system = __stringify(TRACE_SYSTEM), \ - .raw_init = ftrace_raw_init_event, \ + .event.type = etype, \ + .class = &event_class_ftrace_##call, \ .print_fmt = print, \ - .define_fields = ftrace_define_fields_##call, \ }; \ #include "trace_entries.h" diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c index dd11c830eb84..79f4bac99a94 100644 --- a/kernel/trace/trace_functions_graph.c +++ b/kernel/trace/trace_functions_graph.c @@ -1025,7 +1025,7 @@ print_graph_comment(struct trace_seq *s, struct trace_entry *ent, if (!event) return TRACE_TYPE_UNHANDLED; - ret = event->trace(iter, sym_flags); + ret = event->funcs->trace(iter, sym_flags, event); if (ret != TRACE_TYPE_HANDLED) return ret; } @@ -1112,7 +1112,8 @@ print_graph_function(struct trace_iterator *iter) } static enum print_line_t -print_graph_function_event(struct trace_iterator *iter, int flags) +print_graph_function_event(struct trace_iterator *iter, int flags, + struct trace_event *event) { return print_graph_function(iter); } @@ -1225,14 +1226,18 @@ void graph_trace_close(struct trace_iterator *iter) } } +static struct trace_event_functions graph_functions = { + .trace = print_graph_function_event, +}; + static struct trace_event graph_trace_entry_event = { .type = TRACE_GRAPH_ENT, - .trace = print_graph_function_event, + .funcs = &graph_functions, }; static struct trace_event graph_trace_ret_event = { .type = TRACE_GRAPH_RET, - .trace = print_graph_function_event, + .funcs = &graph_functions }; static struct tracer graph_trace __read_mostly = { diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index a7514326052b..f52b5f50299d 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -324,8 +324,8 @@ struct trace_probe { unsigned long nhit; unsigned int flags; /* For TP_FLAG_* */ const char *symbol; /* symbol name */ + struct ftrace_event_class class; struct ftrace_event_call call; - struct trace_event event; ssize_t size; /* trace entry size */ unsigned int nr_args; struct probe_arg args[]; @@ -404,6 +404,7 @@ static struct trace_probe *alloc_trace_probe(const char *group, goto error; } + tp->call.class = &tp->class; tp->call.name = kstrdup(event, GFP_KERNEL); if (!tp->call.name) goto error; @@ -413,8 +414,8 @@ static struct trace_probe *alloc_trace_probe(const char *group, goto error; } - tp->call.system = kstrdup(group, GFP_KERNEL); - if (!tp->call.system) + tp->class.system = kstrdup(group, GFP_KERNEL); + if (!tp->class.system) goto error; INIT_LIST_HEAD(&tp->list); @@ -443,7 +444,7 @@ static void free_trace_probe(struct trace_probe *tp) for (i = 0; i < tp->nr_args; i++) free_probe_arg(&tp->args[i]); - kfree(tp->call.system); + kfree(tp->call.class->system); kfree(tp->call.name); kfree(tp->symbol); kfree(tp); @@ -456,7 +457,7 @@ static struct trace_probe *find_probe_event(const char *event, list_for_each_entry(tp, &probe_list, list) if (strcmp(tp->call.name, event) == 0 && - strcmp(tp->call.system, group) == 0) + strcmp(tp->call.class->system, group) == 0) return tp; return NULL; } @@ -481,7 +482,7 @@ static int register_trace_probe(struct trace_probe *tp) mutex_lock(&probe_lock); /* register as an event */ - old_tp = find_probe_event(tp->call.name, tp->call.system); + old_tp = find_probe_event(tp->call.name, tp->call.class->system); if (old_tp) { /* delete old event */ unregister_trace_probe(old_tp); @@ -904,7 +905,7 @@ static int probes_seq_show(struct seq_file *m, void *v) int i; seq_printf(m, "%c", probe_is_return(tp) ? 'r' : 'p'); - seq_printf(m, ":%s/%s", tp->call.system, tp->call.name); + seq_printf(m, ":%s/%s", tp->call.class->system, tp->call.name); if (!tp->symbol) seq_printf(m, " 0x%p", tp->rp.kp.addr); @@ -1061,8 +1062,8 @@ static __kprobes void kprobe_trace_func(struct kprobe *kp, struct pt_regs *regs) size = sizeof(*entry) + tp->size; - event = trace_current_buffer_lock_reserve(&buffer, call->id, size, - irq_flags, pc); + event = trace_current_buffer_lock_reserve(&buffer, call->event.type, + size, irq_flags, pc); if (!event) return; @@ -1094,8 +1095,8 @@ static __kprobes void kretprobe_trace_func(struct kretprobe_instance *ri, size = sizeof(*entry) + tp->size; - event = trace_current_buffer_lock_reserve(&buffer, call->id, size, - irq_flags, pc); + event = trace_current_buffer_lock_reserve(&buffer, call->event.type, + size, irq_flags, pc); if (!event) return; @@ -1112,18 +1113,17 @@ static __kprobes void kretprobe_trace_func(struct kretprobe_instance *ri, /* Event entry printers */ enum print_line_t -print_kprobe_event(struct trace_iterator *iter, int flags) +print_kprobe_event(struct trace_iterator *iter, int flags, + struct trace_event *event) { struct kprobe_trace_entry_head *field; struct trace_seq *s = &iter->seq; - struct trace_event *event; struct trace_probe *tp; u8 *data; int i; field = (struct kprobe_trace_entry_head *)iter->ent; - event = ftrace_find_event(field->ent.type); - tp = container_of(event, struct trace_probe, event); + tp = container_of(event, struct trace_probe, call.event); if (!trace_seq_printf(s, "%s: (", tp->call.name)) goto partial; @@ -1149,18 +1149,17 @@ partial: } enum print_line_t -print_kretprobe_event(struct trace_iterator *iter, int flags) +print_kretprobe_event(struct trace_iterator *iter, int flags, + struct trace_event *event) { struct kretprobe_trace_entry_head *field; struct trace_seq *s = &iter->seq; - struct trace_event *event; struct trace_probe *tp; u8 *data; int i; field = (struct kretprobe_trace_entry_head *)iter->ent; - event = ftrace_find_event(field->ent.type); - tp = container_of(event, struct trace_probe, event); + tp = container_of(event, struct trace_probe, call.event); if (!trace_seq_printf(s, "%s: (", tp->call.name)) goto partial; @@ -1217,8 +1216,6 @@ static void probe_event_disable(struct ftrace_event_call *call) static int probe_event_raw_init(struct ftrace_event_call *event_call) { - INIT_LIST_HEAD(&event_call->fields); - return 0; } @@ -1341,9 +1338,9 @@ static __kprobes void kprobe_perf_func(struct kprobe *kp, struct trace_probe *tp = container_of(kp, struct trace_probe, rp.kp); struct ftrace_event_call *call = &tp->call; struct kprobe_trace_entry_head *entry; + struct hlist_head *head; u8 *data; int size, __size, i; - unsigned long irq_flags; int rctx; __size = sizeof(*entry) + tp->size; @@ -1353,7 +1350,7 @@ static __kprobes void kprobe_perf_func(struct kprobe *kp, "profile buffer not large enough")) return; - entry = perf_trace_buf_prepare(size, call->id, &rctx, &irq_flags); + entry = perf_trace_buf_prepare(size, call->event.type, regs, &rctx); if (!entry) return; @@ -1362,7 +1359,8 @@ static __kprobes void kprobe_perf_func(struct kprobe *kp, for (i = 0; i < tp->nr_args; i++) call_fetch(&tp->args[i].fetch, regs, data + tp->args[i].offset); - perf_trace_buf_submit(entry, size, rctx, entry->ip, 1, irq_flags, regs); + head = this_cpu_ptr(call->perf_events); + perf_trace_buf_submit(entry, size, rctx, entry->ip, 1, regs, head); } /* Kretprobe profile handler */ @@ -1372,9 +1370,9 @@ static __kprobes void kretprobe_perf_func(struct kretprobe_instance *ri, struct trace_probe *tp = container_of(ri->rp, struct trace_probe, rp); struct ftrace_event_call *call = &tp->call; struct kretprobe_trace_entry_head *entry; + struct hlist_head *head; u8 *data; int size, __size, i; - unsigned long irq_flags; int rctx; __size = sizeof(*entry) + tp->size; @@ -1384,7 +1382,7 @@ static __kprobes void kretprobe_perf_func(struct kretprobe_instance *ri, "profile buffer not large enough")) return; - entry = perf_trace_buf_prepare(size, call->id, &rctx, &irq_flags); + entry = perf_trace_buf_prepare(size, call->event.type, regs, &rctx); if (!entry) return; @@ -1394,8 +1392,8 @@ static __kprobes void kretprobe_perf_func(struct kretprobe_instance *ri, for (i = 0; i < tp->nr_args; i++) call_fetch(&tp->args[i].fetch, regs, data + tp->args[i].offset); - perf_trace_buf_submit(entry, size, rctx, entry->ret_ip, 1, - irq_flags, regs); + head = this_cpu_ptr(call->perf_events); + perf_trace_buf_submit(entry, size, rctx, entry->ret_ip, 1, regs, head); } static int probe_perf_enable(struct ftrace_event_call *call) @@ -1425,6 +1423,26 @@ static void probe_perf_disable(struct ftrace_event_call *call) } #endif /* CONFIG_PERF_EVENTS */ +static __kprobes +int kprobe_register(struct ftrace_event_call *event, enum trace_reg type) +{ + switch (type) { + case TRACE_REG_REGISTER: + return probe_event_enable(event); + case TRACE_REG_UNREGISTER: + probe_event_disable(event); + return 0; + +#ifdef CONFIG_PERF_EVENTS + case TRACE_REG_PERF_REGISTER: + return probe_perf_enable(event); + case TRACE_REG_PERF_UNREGISTER: + probe_perf_disable(event); + return 0; +#endif + } + return 0; +} static __kprobes int kprobe_dispatcher(struct kprobe *kp, struct pt_regs *regs) @@ -1454,6 +1472,14 @@ int kretprobe_dispatcher(struct kretprobe_instance *ri, struct pt_regs *regs) return 0; /* We don't tweek kernel, so just return 0 */ } +static struct trace_event_functions kretprobe_funcs = { + .trace = print_kretprobe_event +}; + +static struct trace_event_functions kprobe_funcs = { + .trace = print_kprobe_event +}; + static int register_probe_event(struct trace_probe *tp) { struct ftrace_event_call *call = &tp->call; @@ -1461,36 +1487,31 @@ static int register_probe_event(struct trace_probe *tp) /* Initialize ftrace_event_call */ if (probe_is_return(tp)) { - tp->event.trace = print_kretprobe_event; - call->raw_init = probe_event_raw_init; - call->define_fields = kretprobe_event_define_fields; + INIT_LIST_HEAD(&call->class->fields); + call->event.funcs = &kretprobe_funcs; + call->class->raw_init = probe_event_raw_init; + call->class->define_fields = kretprobe_event_define_fields; } else { - tp->event.trace = print_kprobe_event; - call->raw_init = probe_event_raw_init; - call->define_fields = kprobe_event_define_fields; + INIT_LIST_HEAD(&call->class->fields); + call->event.funcs = &kprobe_funcs; + call->class->raw_init = probe_event_raw_init; + call->class->define_fields = kprobe_event_define_fields; } if (set_print_fmt(tp) < 0) return -ENOMEM; - call->event = &tp->event; - call->id = register_ftrace_event(&tp->event); - if (!call->id) { + ret = register_ftrace_event(&call->event); + if (!ret) { kfree(call->print_fmt); return -ENODEV; } - call->enabled = 0; - call->regfunc = probe_event_enable; - call->unregfunc = probe_event_disable; - -#ifdef CONFIG_PERF_EVENTS - call->perf_event_enable = probe_perf_enable; - call->perf_event_disable = probe_perf_disable; -#endif + call->flags = 0; + call->class->reg = kprobe_register; call->data = tp; ret = trace_add_event_call(call); if (ret) { pr_info("Failed to register kprobe event: %s\n", call->name); kfree(call->print_fmt); - unregister_ftrace_event(&tp->event); + unregister_ftrace_event(&call->event); } return ret; } diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c index ab13d7008061..57c1b4596470 100644 --- a/kernel/trace/trace_output.c +++ b/kernel/trace/trace_output.c @@ -742,6 +742,9 @@ int register_ftrace_event(struct trace_event *event) if (WARN_ON(!event)) goto out; + if (WARN_ON(!event->funcs)) + goto out; + INIT_LIST_HEAD(&event->list); if (!event->type) { @@ -774,14 +777,14 @@ int register_ftrace_event(struct trace_event *event) goto out; } - if (event->trace == NULL) - event->trace = trace_nop_print; - if (event->raw == NULL) - event->raw = trace_nop_print; - if (event->hex == NULL) - event->hex = trace_nop_print; - if (event->binary == NULL) - event->binary = trace_nop_print; + if (event->funcs->trace == NULL) + event->funcs->trace = trace_nop_print; + if (event->funcs->raw == NULL) + event->funcs->raw = trace_nop_print; + if (event->funcs->hex == NULL) + event->funcs->hex = trace_nop_print; + if (event->funcs->binary == NULL) + event->funcs->binary = trace_nop_print; key = event->type & (EVENT_HASHSIZE - 1); @@ -823,13 +826,15 @@ EXPORT_SYMBOL_GPL(unregister_ftrace_event); * Standard events */ -enum print_line_t trace_nop_print(struct trace_iterator *iter, int flags) +enum print_line_t trace_nop_print(struct trace_iterator *iter, int flags, + struct trace_event *event) { return TRACE_TYPE_HANDLED; } /* TRACE_FN */ -static enum print_line_t trace_fn_trace(struct trace_iterator *iter, int flags) +static enum print_line_t trace_fn_trace(struct trace_iterator *iter, int flags, + struct trace_event *event) { struct ftrace_entry *field; struct trace_seq *s = &iter->seq; @@ -856,7 +861,8 @@ static enum print_line_t trace_fn_trace(struct trace_iterator *iter, int flags) return TRACE_TYPE_PARTIAL_LINE; } -static enum print_line_t trace_fn_raw(struct trace_iterator *iter, int flags) +static enum print_line_t trace_fn_raw(struct trace_iterator *iter, int flags, + struct trace_event *event) { struct ftrace_entry *field; @@ -870,7 +876,8 @@ static enum print_line_t trace_fn_raw(struct trace_iterator *iter, int flags) return TRACE_TYPE_HANDLED; } -static enum print_line_t trace_fn_hex(struct trace_iterator *iter, int flags) +static enum print_line_t trace_fn_hex(struct trace_iterator *iter, int flags, + struct trace_event *event) { struct ftrace_entry *field; struct trace_seq *s = &iter->seq; @@ -883,7 +890,8 @@ static enum print_line_t trace_fn_hex(struct trace_iterator *iter, int flags) return TRACE_TYPE_HANDLED; } -static enum print_line_t trace_fn_bin(struct trace_iterator *iter, int flags) +static enum print_line_t trace_fn_bin(struct trace_iterator *iter, int flags, + struct trace_event *event) { struct ftrace_entry *field; struct trace_seq *s = &iter->seq; @@ -896,14 +904,18 @@ static enum print_line_t trace_fn_bin(struct trace_iterator *iter, int flags) return TRACE_TYPE_HANDLED; } -static struct trace_event trace_fn_event = { - .type = TRACE_FN, +static struct trace_event_functions trace_fn_funcs = { .trace = trace_fn_trace, .raw = trace_fn_raw, .hex = trace_fn_hex, .binary = trace_fn_bin, }; +static struct trace_event trace_fn_event = { + .type = TRACE_FN, + .funcs = &trace_fn_funcs, +}; + /* TRACE_CTX an TRACE_WAKE */ static enum print_line_t trace_ctxwake_print(struct trace_iterator *iter, char *delim) @@ -932,13 +944,14 @@ static enum print_line_t trace_ctxwake_print(struct trace_iterator *iter, return TRACE_TYPE_HANDLED; } -static enum print_line_t trace_ctx_print(struct trace_iterator *iter, int flags) +static enum print_line_t trace_ctx_print(struct trace_iterator *iter, int flags, + struct trace_event *event) { return trace_ctxwake_print(iter, "==>"); } static enum print_line_t trace_wake_print(struct trace_iterator *iter, - int flags) + int flags, struct trace_event *event) { return trace_ctxwake_print(iter, " +"); } @@ -966,12 +979,14 @@ static int trace_ctxwake_raw(struct trace_iterator *iter, char S) return TRACE_TYPE_HANDLED; } -static enum print_line_t trace_ctx_raw(struct trace_iterator *iter, int flags) +static enum print_line_t trace_ctx_raw(struct trace_iterator *iter, int flags, + struct trace_event *event) { return trace_ctxwake_raw(iter, 0); } -static enum print_line_t trace_wake_raw(struct trace_iterator *iter, int flags) +static enum print_line_t trace_wake_raw(struct trace_iterator *iter, int flags, + struct trace_event *event) { return trace_ctxwake_raw(iter, '+'); } @@ -1000,18 +1015,20 @@ static int trace_ctxwake_hex(struct trace_iterator *iter, char S) return TRACE_TYPE_HANDLED; } -static enum print_line_t trace_ctx_hex(struct trace_iterator *iter, int flags) +static enum print_line_t trace_ctx_hex(struct trace_iterator *iter, int flags, + struct trace_event *event) { return trace_ctxwake_hex(iter, 0); } -static enum print_line_t trace_wake_hex(struct trace_iterator *iter, int flags) +static enum print_line_t trace_wake_hex(struct trace_iterator *iter, int flags, + struct trace_event *event) { return trace_ctxwake_hex(iter, '+'); } static enum print_line_t trace_ctxwake_bin(struct trace_iterator *iter, - int flags) + int flags, struct trace_event *event) { struct ctx_switch_entry *field; struct trace_seq *s = &iter->seq; @@ -1028,25 +1045,33 @@ static enum print_line_t trace_ctxwake_bin(struct trace_iterator *iter, return TRACE_TYPE_HANDLED; } -static struct trace_event trace_ctx_event = { - .type = TRACE_CTX, +static struct trace_event_functions trace_ctx_funcs = { .trace = trace_ctx_print, .raw = trace_ctx_raw, .hex = trace_ctx_hex, .binary = trace_ctxwake_bin, }; -static struct trace_event trace_wake_event = { - .type = TRACE_WAKE, +static struct trace_event trace_ctx_event = { + .type = TRACE_CTX, + .funcs = &trace_ctx_funcs, +}; + +static struct trace_event_functions trace_wake_funcs = { .trace = trace_wake_print, .raw = trace_wake_raw, .hex = trace_wake_hex, .binary = trace_ctxwake_bin, }; +static struct trace_event trace_wake_event = { + .type = TRACE_WAKE, + .funcs = &trace_wake_funcs, +}; + /* TRACE_SPECIAL */ static enum print_line_t trace_special_print(struct trace_iterator *iter, - int flags) + int flags, struct trace_event *event) { struct special_entry *field; @@ -1062,7 +1087,7 @@ static enum print_line_t trace_special_print(struct trace_iterator *iter, } static enum print_line_t trace_special_hex(struct trace_iterator *iter, - int flags) + int flags, struct trace_event *event) { struct special_entry *field; struct trace_seq *s = &iter->seq; @@ -1077,7 +1102,7 @@ static enum print_line_t trace_special_hex(struct trace_iterator *iter, } static enum print_line_t trace_special_bin(struct trace_iterator *iter, - int flags) + int flags, struct trace_event *event) { struct special_entry *field; struct trace_seq *s = &iter->seq; @@ -1091,18 +1116,22 @@ static enum print_line_t trace_special_bin(struct trace_iterator *iter, return TRACE_TYPE_HANDLED; } -static struct trace_event trace_special_event = { - .type = TRACE_SPECIAL, +static struct trace_event_functions trace_special_funcs = { .trace = trace_special_print, .raw = trace_special_print, .hex = trace_special_hex, .binary = trace_special_bin, }; +static struct trace_event trace_special_event = { + .type = TRACE_SPECIAL, + .funcs = &trace_special_funcs, +}; + /* TRACE_STACK */ static enum print_line_t trace_stack_print(struct trace_iterator *iter, - int flags) + int flags, struct trace_event *event) { struct stack_entry *field; struct trace_seq *s = &iter->seq; @@ -1130,17 +1159,21 @@ static enum print_line_t trace_stack_print(struct trace_iterator *iter, return TRACE_TYPE_PARTIAL_LINE; } -static struct trace_event trace_stack_event = { - .type = TRACE_STACK, +static struct trace_event_functions trace_stack_funcs = { .trace = trace_stack_print, .raw = trace_special_print, .hex = trace_special_hex, .binary = trace_special_bin, }; +static struct trace_event trace_stack_event = { + .type = TRACE_STACK, + .funcs = &trace_stack_funcs, +}; + /* TRACE_USER_STACK */ static enum print_line_t trace_user_stack_print(struct trace_iterator *iter, - int flags) + int flags, struct trace_event *event) { struct userstack_entry *field; struct trace_seq *s = &iter->seq; @@ -1159,17 +1192,22 @@ static enum print_line_t trace_user_stack_print(struct trace_iterator *iter, return TRACE_TYPE_PARTIAL_LINE; } -static struct trace_event trace_user_stack_event = { - .type = TRACE_USER_STACK, +static struct trace_event_functions trace_user_stack_funcs = { .trace = trace_user_stack_print, .raw = trace_special_print, .hex = trace_special_hex, .binary = trace_special_bin, }; +static struct trace_event trace_user_stack_event = { + .type = TRACE_USER_STACK, + .funcs = &trace_user_stack_funcs, +}; + /* TRACE_BPRINT */ static enum print_line_t -trace_bprint_print(struct trace_iterator *iter, int flags) +trace_bprint_print(struct trace_iterator *iter, int flags, + struct trace_event *event) { struct trace_entry *entry = iter->ent; struct trace_seq *s = &iter->seq; @@ -1194,7 +1232,8 @@ trace_bprint_print(struct trace_iterator *iter, int flags) static enum print_line_t -trace_bprint_raw(struct trace_iterator *iter, int flags) +trace_bprint_raw(struct trace_iterator *iter, int flags, + struct trace_event *event) { struct bprint_entry *field; struct trace_seq *s = &iter->seq; @@ -1213,16 +1252,19 @@ trace_bprint_raw(struct trace_iterator *iter, int flags) return TRACE_TYPE_PARTIAL_LINE; } +static struct trace_event_functions trace_bprint_funcs = { + .trace = trace_bprint_print, + .raw = trace_bprint_raw, +}; static struct trace_event trace_bprint_event = { .type = TRACE_BPRINT, - .trace = trace_bprint_print, - .raw = trace_bprint_raw, + .funcs = &trace_bprint_funcs, }; /* TRACE_PRINT */ static enum print_line_t trace_print_print(struct trace_iterator *iter, - int flags) + int flags, struct trace_event *event) { struct print_entry *field; struct trace_seq *s = &iter->seq; @@ -1241,7 +1283,8 @@ static enum print_line_t trace_print_print(struct trace_iterator *iter, return TRACE_TYPE_PARTIAL_LINE; } -static enum print_line_t trace_print_raw(struct trace_iterator *iter, int flags) +static enum print_line_t trace_print_raw(struct trace_iterator *iter, int flags, + struct trace_event *event) { struct print_entry *field; @@ -1256,12 +1299,16 @@ static enum print_line_t trace_print_raw(struct trace_iterator *iter, int flags) return TRACE_TYPE_PARTIAL_LINE; } -static struct trace_event trace_print_event = { - .type = TRACE_PRINT, +static struct trace_event_functions trace_print_funcs = { .trace = trace_print_print, .raw = trace_print_raw, }; +static struct trace_event trace_print_event = { + .type = TRACE_PRINT, + .funcs = &trace_print_funcs, +}; + static struct trace_event *events[] __initdata = { &trace_fn_event, diff --git a/kernel/trace/trace_output.h b/kernel/trace/trace_output.h index 9d91c72ba38b..c038eba0492b 100644 --- a/kernel/trace/trace_output.h +++ b/kernel/trace/trace_output.h @@ -25,7 +25,7 @@ extern void trace_event_read_unlock(void); extern struct trace_event *ftrace_find_event(int type); extern enum print_line_t trace_nop_print(struct trace_iterator *iter, - int flags); + int flags, struct trace_event *event); extern int trace_print_lat_fmt(struct trace_seq *s, struct trace_entry *entry); diff --git a/kernel/trace/trace_sched_switch.c b/kernel/trace/trace_sched_switch.c index a55fccfede5d..8f758d070c43 100644 --- a/kernel/trace/trace_sched_switch.c +++ b/kernel/trace/trace_sched_switch.c @@ -50,7 +50,7 @@ tracing_sched_switch_trace(struct trace_array *tr, } static void -probe_sched_switch(struct task_struct *prev, struct task_struct *next) +probe_sched_switch(void *ignore, struct task_struct *prev, struct task_struct *next) { struct trace_array_cpu *data; unsigned long flags; @@ -108,7 +108,7 @@ tracing_sched_wakeup_trace(struct trace_array *tr, } static void -probe_sched_wakeup(struct task_struct *wakee, int success) +probe_sched_wakeup(void *ignore, struct task_struct *wakee, int success) { struct trace_array_cpu *data; unsigned long flags; @@ -138,21 +138,21 @@ static int tracing_sched_register(void) { int ret; - ret = register_trace_sched_wakeup(probe_sched_wakeup); + ret = register_trace_sched_wakeup(probe_sched_wakeup, NULL); if (ret) { pr_info("wakeup trace: Couldn't activate tracepoint" " probe to kernel_sched_wakeup\n"); return ret; } - ret = register_trace_sched_wakeup_new(probe_sched_wakeup); + ret = register_trace_sched_wakeup_new(probe_sched_wakeup, NULL); if (ret) { pr_info("wakeup trace: Couldn't activate tracepoint" " probe to kernel_sched_wakeup_new\n"); goto fail_deprobe; } - ret = register_trace_sched_switch(probe_sched_switch); + ret = register_trace_sched_switch(probe_sched_switch, NULL); if (ret) { pr_info("sched trace: Couldn't activate tracepoint" " probe to kernel_sched_switch\n"); @@ -161,17 +161,17 @@ static int tracing_sched_register(void) return ret; fail_deprobe_wake_new: - unregister_trace_sched_wakeup_new(probe_sched_wakeup); + unregister_trace_sched_wakeup_new(probe_sched_wakeup, NULL); fail_deprobe: - unregister_trace_sched_wakeup(probe_sched_wakeup); + unregister_trace_sched_wakeup(probe_sched_wakeup, NULL); return ret; } static void tracing_sched_unregister(void) { - unregister_trace_sched_switch(probe_sched_switch); - unregister_trace_sched_wakeup_new(probe_sched_wakeup); - unregister_trace_sched_wakeup(probe_sched_wakeup); + unregister_trace_sched_switch(probe_sched_switch, NULL); + unregister_trace_sched_wakeup_new(probe_sched_wakeup, NULL); + unregister_trace_sched_wakeup(probe_sched_wakeup, NULL); } static void tracing_start_sched_switch(void) diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c index 8052446ceeaa..0e73bc2ef8c5 100644 --- a/kernel/trace/trace_sched_wakeup.c +++ b/kernel/trace/trace_sched_wakeup.c @@ -98,7 +98,8 @@ static int report_latency(cycle_t delta) return 1; } -static void probe_wakeup_migrate_task(struct task_struct *task, int cpu) +static void +probe_wakeup_migrate_task(void *ignore, struct task_struct *task, int cpu) { if (task != wakeup_task) return; @@ -107,7 +108,8 @@ static void probe_wakeup_migrate_task(struct task_struct *task, int cpu) } static void notrace -probe_wakeup_sched_switch(struct task_struct *prev, struct task_struct *next) +probe_wakeup_sched_switch(void *ignore, + struct task_struct *prev, struct task_struct *next) { struct trace_array_cpu *data; cycle_t T0, T1, delta; @@ -199,7 +201,7 @@ static void wakeup_reset(struct trace_array *tr) } static void -probe_wakeup(struct task_struct *p, int success) +probe_wakeup(void *ignore, struct task_struct *p, int success) { struct trace_array_cpu *data; int cpu = smp_processor_id(); @@ -263,28 +265,28 @@ static void start_wakeup_tracer(struct trace_array *tr) { int ret; - ret = register_trace_sched_wakeup(probe_wakeup); + ret = register_trace_sched_wakeup(probe_wakeup, NULL); if (ret) { pr_info("wakeup trace: Couldn't activate tracepoint" " probe to kernel_sched_wakeup\n"); return; } - ret = register_trace_sched_wakeup_new(probe_wakeup); + ret = register_trace_sched_wakeup_new(probe_wakeup, NULL); if (ret) { pr_info("wakeup trace: Couldn't activate tracepoint" " probe to kernel_sched_wakeup_new\n"); goto fail_deprobe; } - ret = register_trace_sched_switch(probe_wakeup_sched_switch); + ret = register_trace_sched_switch(probe_wakeup_sched_switch, NULL); if (ret) { pr_info("sched trace: Couldn't activate tracepoint" " probe to kernel_sched_switch\n"); goto fail_deprobe_wake_new; } - ret = register_trace_sched_migrate_task(probe_wakeup_migrate_task); + ret = register_trace_sched_migrate_task(probe_wakeup_migrate_task, NULL); if (ret) { pr_info("wakeup trace: Couldn't activate tracepoint" " probe to kernel_sched_migrate_task\n"); @@ -311,19 +313,19 @@ static void start_wakeup_tracer(struct trace_array *tr) return; fail_deprobe_wake_new: - unregister_trace_sched_wakeup_new(probe_wakeup); + unregister_trace_sched_wakeup_new(probe_wakeup, NULL); fail_deprobe: - unregister_trace_sched_wakeup(probe_wakeup); + unregister_trace_sched_wakeup(probe_wakeup, NULL); } static void stop_wakeup_tracer(struct trace_array *tr) { tracer_enabled = 0; unregister_ftrace_function(&trace_ops); - unregister_trace_sched_switch(probe_wakeup_sched_switch); - unregister_trace_sched_wakeup_new(probe_wakeup); - unregister_trace_sched_wakeup(probe_wakeup); - unregister_trace_sched_migrate_task(probe_wakeup_migrate_task); + unregister_trace_sched_switch(probe_wakeup_sched_switch, NULL); + unregister_trace_sched_wakeup_new(probe_wakeup, NULL); + unregister_trace_sched_wakeup(probe_wakeup, NULL); + unregister_trace_sched_migrate_task(probe_wakeup_migrate_task, NULL); } static int __wakeup_tracer_init(struct trace_array *tr) diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c index 4d6d711717f2..34e35804304b 100644 --- a/kernel/trace/trace_syscalls.c +++ b/kernel/trace/trace_syscalls.c @@ -15,6 +15,54 @@ static int sys_refcount_exit; static DECLARE_BITMAP(enabled_enter_syscalls, NR_syscalls); static DECLARE_BITMAP(enabled_exit_syscalls, NR_syscalls); +static int syscall_enter_register(struct ftrace_event_call *event, + enum trace_reg type); +static int syscall_exit_register(struct ftrace_event_call *event, + enum trace_reg type); + +static int syscall_enter_define_fields(struct ftrace_event_call *call); +static int syscall_exit_define_fields(struct ftrace_event_call *call); + +static struct list_head * +syscall_get_enter_fields(struct ftrace_event_call *call) +{ + struct syscall_metadata *entry = call->data; + + return &entry->enter_fields; +} + +static struct list_head * +syscall_get_exit_fields(struct ftrace_event_call *call) +{ + struct syscall_metadata *entry = call->data; + + return &entry->exit_fields; +} + +struct trace_event_functions enter_syscall_print_funcs = { + .trace = print_syscall_enter, +}; + +struct trace_event_functions exit_syscall_print_funcs = { + .trace = print_syscall_exit, +}; + +struct ftrace_event_class event_class_syscall_enter = { + .system = "syscalls", + .reg = syscall_enter_register, + .define_fields = syscall_enter_define_fields, + .get_fields = syscall_get_enter_fields, + .raw_init = init_syscall_trace, +}; + +struct ftrace_event_class event_class_syscall_exit = { + .system = "syscalls", + .reg = syscall_exit_register, + .define_fields = syscall_exit_define_fields, + .get_fields = syscall_get_exit_fields, + .raw_init = init_syscall_trace, +}; + extern unsigned long __start_syscalls_metadata[]; extern unsigned long __stop_syscalls_metadata[]; @@ -53,7 +101,8 @@ static struct syscall_metadata *syscall_nr_to_meta(int nr) } enum print_line_t -print_syscall_enter(struct trace_iterator *iter, int flags) +print_syscall_enter(struct trace_iterator *iter, int flags, + struct trace_event *event) { struct trace_seq *s = &iter->seq; struct trace_entry *ent = iter->ent; @@ -68,7 +117,7 @@ print_syscall_enter(struct trace_iterator *iter, int flags) if (!entry) goto end; - if (entry->enter_event->id != ent->type) { + if (entry->enter_event->event.type != ent->type) { WARN_ON_ONCE(1); goto end; } @@ -105,7 +154,8 @@ end: } enum print_line_t -print_syscall_exit(struct trace_iterator *iter, int flags) +print_syscall_exit(struct trace_iterator *iter, int flags, + struct trace_event *event) { struct trace_seq *s = &iter->seq; struct trace_entry *ent = iter->ent; @@ -123,7 +173,7 @@ print_syscall_exit(struct trace_iterator *iter, int flags) return TRACE_TYPE_HANDLED; } - if (entry->exit_event->id != ent->type) { + if (entry->exit_event->event.type != ent->type) { WARN_ON_ONCE(1); return TRACE_TYPE_UNHANDLED; } @@ -205,7 +255,7 @@ static void free_syscall_print_fmt(struct ftrace_event_call *call) kfree(call->print_fmt); } -int syscall_enter_define_fields(struct ftrace_event_call *call) +static int syscall_enter_define_fields(struct ftrace_event_call *call) { struct syscall_trace_enter trace; struct syscall_metadata *meta = call->data; @@ -228,7 +278,7 @@ int syscall_enter_define_fields(struct ftrace_event_call *call) return ret; } -int syscall_exit_define_fields(struct ftrace_event_call *call) +static int syscall_exit_define_fields(struct ftrace_event_call *call) { struct syscall_trace_exit trace; int ret; @@ -243,7 +293,7 @@ int syscall_exit_define_fields(struct ftrace_event_call *call) return ret; } -void ftrace_syscall_enter(struct pt_regs *regs, long id) +void ftrace_syscall_enter(void *ignore, struct pt_regs *regs, long id) { struct syscall_trace_enter *entry; struct syscall_metadata *sys_data; @@ -265,7 +315,7 @@ void ftrace_syscall_enter(struct pt_regs *regs, long id) size = sizeof(*entry) + sizeof(unsigned long) * sys_data->nb_args; event = trace_current_buffer_lock_reserve(&buffer, - sys_data->enter_event->id, size, 0, 0); + sys_data->enter_event->event.type, size, 0, 0); if (!event) return; @@ -278,7 +328,7 @@ void ftrace_syscall_enter(struct pt_regs *regs, long id) trace_current_buffer_unlock_commit(buffer, event, 0, 0); } -void ftrace_syscall_exit(struct pt_regs *regs, long ret) +void ftrace_syscall_exit(void *ignore, struct pt_regs *regs, long ret) { struct syscall_trace_exit *entry; struct syscall_metadata *sys_data; @@ -297,7 +347,7 @@ void ftrace_syscall_exit(struct pt_regs *regs, long ret) return; event = trace_current_buffer_lock_reserve(&buffer, - sys_data->exit_event->id, sizeof(*entry), 0, 0); + sys_data->exit_event->event.type, sizeof(*entry), 0, 0); if (!event) return; @@ -320,7 +370,7 @@ int reg_event_syscall_enter(struct ftrace_event_call *call) return -ENOSYS; mutex_lock(&syscall_trace_lock); if (!sys_refcount_enter) - ret = register_trace_sys_enter(ftrace_syscall_enter); + ret = register_trace_sys_enter(ftrace_syscall_enter, NULL); if (!ret) { set_bit(num, enabled_enter_syscalls); sys_refcount_enter++; @@ -340,7 +390,7 @@ void unreg_event_syscall_enter(struct ftrace_event_call *call) sys_refcount_enter--; clear_bit(num, enabled_enter_syscalls); if (!sys_refcount_enter) - unregister_trace_sys_enter(ftrace_syscall_enter); + unregister_trace_sys_enter(ftrace_syscall_enter, NULL); mutex_unlock(&syscall_trace_lock); } @@ -354,7 +404,7 @@ int reg_event_syscall_exit(struct ftrace_event_call *call) return -ENOSYS; mutex_lock(&syscall_trace_lock); if (!sys_refcount_exit) - ret = register_trace_sys_exit(ftrace_syscall_exit); + ret = register_trace_sys_exit(ftrace_syscall_exit, NULL); if (!ret) { set_bit(num, enabled_exit_syscalls); sys_refcount_exit++; @@ -374,7 +424,7 @@ void unreg_event_syscall_exit(struct ftrace_event_call *call) sys_refcount_exit--; clear_bit(num, enabled_exit_syscalls); if (!sys_refcount_exit) - unregister_trace_sys_exit(ftrace_syscall_exit); + unregister_trace_sys_exit(ftrace_syscall_exit, NULL); mutex_unlock(&syscall_trace_lock); } @@ -434,11 +484,11 @@ static DECLARE_BITMAP(enabled_perf_exit_syscalls, NR_syscalls); static int sys_perf_refcount_enter; static int sys_perf_refcount_exit; -static void perf_syscall_enter(struct pt_regs *regs, long id) +static void perf_syscall_enter(void *ignore, struct pt_regs *regs, long id) { struct syscall_metadata *sys_data; struct syscall_trace_enter *rec; - unsigned long flags; + struct hlist_head *head; int syscall_nr; int rctx; int size; @@ -461,14 +511,16 @@ static void perf_syscall_enter(struct pt_regs *regs, long id) return; rec = (struct syscall_trace_enter *)perf_trace_buf_prepare(size, - sys_data->enter_event->id, &rctx, &flags); + sys_data->enter_event->event.type, regs, &rctx); if (!rec) return; rec->nr = syscall_nr; syscall_get_arguments(current, regs, 0, sys_data->nb_args, (unsigned long *)&rec->args); - perf_trace_buf_submit(rec, size, rctx, 0, 1, flags, regs); + + head = this_cpu_ptr(sys_data->enter_event->perf_events); + perf_trace_buf_submit(rec, size, rctx, 0, 1, regs, head); } int perf_sysenter_enable(struct ftrace_event_call *call) @@ -480,7 +532,7 @@ int perf_sysenter_enable(struct ftrace_event_call *call) mutex_lock(&syscall_trace_lock); if (!sys_perf_refcount_enter) - ret = register_trace_sys_enter(perf_syscall_enter); + ret = register_trace_sys_enter(perf_syscall_enter, NULL); if (ret) { pr_info("event trace: Could not activate" "syscall entry trace point"); @@ -502,15 +554,15 @@ void perf_sysenter_disable(struct ftrace_event_call *call) sys_perf_refcount_enter--; clear_bit(num, enabled_perf_enter_syscalls); if (!sys_perf_refcount_enter) - unregister_trace_sys_enter(perf_syscall_enter); + unregister_trace_sys_enter(perf_syscall_enter, NULL); mutex_unlock(&syscall_trace_lock); } -static void perf_syscall_exit(struct pt_regs *regs, long ret) +static void perf_syscall_exit(void *ignore, struct pt_regs *regs, long ret) { struct syscall_metadata *sys_data; struct syscall_trace_exit *rec; - unsigned long flags; + struct hlist_head *head; int syscall_nr; int rctx; int size; @@ -536,14 +588,15 @@ static void perf_syscall_exit(struct pt_regs *regs, long ret) return; rec = (struct syscall_trace_exit *)perf_trace_buf_prepare(size, - sys_data->exit_event->id, &rctx, &flags); + sys_data->exit_event->event.type, regs, &rctx); if (!rec) return; rec->nr = syscall_nr; rec->ret = syscall_get_return_value(current, regs); - perf_trace_buf_submit(rec, size, rctx, 0, 1, flags, regs); + head = this_cpu_ptr(sys_data->exit_event->perf_events); + perf_trace_buf_submit(rec, size, rctx, 0, 1, regs, head); } int perf_sysexit_enable(struct ftrace_event_call *call) @@ -555,7 +608,7 @@ int perf_sysexit_enable(struct ftrace_event_call *call) mutex_lock(&syscall_trace_lock); if (!sys_perf_refcount_exit) - ret = register_trace_sys_exit(perf_syscall_exit); + ret = register_trace_sys_exit(perf_syscall_exit, NULL); if (ret) { pr_info("event trace: Could not activate" "syscall exit trace point"); @@ -577,9 +630,50 @@ void perf_sysexit_disable(struct ftrace_event_call *call) sys_perf_refcount_exit--; clear_bit(num, enabled_perf_exit_syscalls); if (!sys_perf_refcount_exit) - unregister_trace_sys_exit(perf_syscall_exit); + unregister_trace_sys_exit(perf_syscall_exit, NULL); mutex_unlock(&syscall_trace_lock); } #endif /* CONFIG_PERF_EVENTS */ +static int syscall_enter_register(struct ftrace_event_call *event, + enum trace_reg type) +{ + switch (type) { + case TRACE_REG_REGISTER: + return reg_event_syscall_enter(event); + case TRACE_REG_UNREGISTER: + unreg_event_syscall_enter(event); + return 0; + +#ifdef CONFIG_PERF_EVENTS + case TRACE_REG_PERF_REGISTER: + return perf_sysenter_enable(event); + case TRACE_REG_PERF_UNREGISTER: + perf_sysenter_disable(event); + return 0; +#endif + } + return 0; +} + +static int syscall_exit_register(struct ftrace_event_call *event, + enum trace_reg type) +{ + switch (type) { + case TRACE_REG_REGISTER: + return reg_event_syscall_exit(event); + case TRACE_REG_UNREGISTER: + unreg_event_syscall_exit(event); + return 0; + +#ifdef CONFIG_PERF_EVENTS + case TRACE_REG_PERF_REGISTER: + return perf_sysexit_enable(event); + case TRACE_REG_PERF_UNREGISTER: + perf_sysexit_disable(event); + return 0; +#endif + } + return 0; +} diff --git a/kernel/trace/trace_workqueue.c b/kernel/trace/trace_workqueue.c index cc2d2faa7d9e..a7cc3793baf6 100644 --- a/kernel/trace/trace_workqueue.c +++ b/kernel/trace/trace_workqueue.c @@ -49,7 +49,8 @@ static void cpu_workqueue_stat_free(struct kref *kref) /* Insertion of a work */ static void -probe_workqueue_insertion(struct task_struct *wq_thread, +probe_workqueue_insertion(void *ignore, + struct task_struct *wq_thread, struct work_struct *work) { int cpu = cpumask_first(&wq_thread->cpus_allowed); @@ -70,7 +71,8 @@ found: /* Execution of a work */ static void -probe_workqueue_execution(struct task_struct *wq_thread, +probe_workqueue_execution(void *ignore, + struct task_struct *wq_thread, struct work_struct *work) { int cpu = cpumask_first(&wq_thread->cpus_allowed); @@ -90,7 +92,8 @@ found: } /* Creation of a cpu workqueue thread */ -static void probe_workqueue_creation(struct task_struct *wq_thread, int cpu) +static void probe_workqueue_creation(void *ignore, + struct task_struct *wq_thread, int cpu) { struct cpu_workqueue_stats *cws; unsigned long flags; @@ -114,7 +117,8 @@ static void probe_workqueue_creation(struct task_struct *wq_thread, int cpu) } /* Destruction of a cpu workqueue thread */ -static void probe_workqueue_destruction(struct task_struct *wq_thread) +static void +probe_workqueue_destruction(void *ignore, struct task_struct *wq_thread) { /* Workqueue only execute on one cpu */ int cpu = cpumask_first(&wq_thread->cpus_allowed); @@ -259,19 +263,19 @@ int __init trace_workqueue_early_init(void) { int ret, cpu; - ret = register_trace_workqueue_insertion(probe_workqueue_insertion); + ret = register_trace_workqueue_insertion(probe_workqueue_insertion, NULL); if (ret) goto out; - ret = register_trace_workqueue_execution(probe_workqueue_execution); + ret = register_trace_workqueue_execution(probe_workqueue_execution, NULL); if (ret) goto no_insertion; - ret = register_trace_workqueue_creation(probe_workqueue_creation); + ret = register_trace_workqueue_creation(probe_workqueue_creation, NULL); if (ret) goto no_execution; - ret = register_trace_workqueue_destruction(probe_workqueue_destruction); + ret = register_trace_workqueue_destruction(probe_workqueue_destruction, NULL); if (ret) goto no_creation; @@ -283,11 +287,11 @@ int __init trace_workqueue_early_init(void) return 0; no_creation: - unregister_trace_workqueue_creation(probe_workqueue_creation); + unregister_trace_workqueue_creation(probe_workqueue_creation, NULL); no_execution: - unregister_trace_workqueue_execution(probe_workqueue_execution); + unregister_trace_workqueue_execution(probe_workqueue_execution, NULL); no_insertion: - unregister_trace_workqueue_insertion(probe_workqueue_insertion); + unregister_trace_workqueue_insertion(probe_workqueue_insertion, NULL); out: pr_warning("trace_workqueue: unable to trace workqueues\n"); diff --git a/kernel/tracepoint.c b/kernel/tracepoint.c index cc89be5bc0f8..c77f3eceea25 100644 --- a/kernel/tracepoint.c +++ b/kernel/tracepoint.c @@ -54,7 +54,7 @@ static struct hlist_head tracepoint_table[TRACEPOINT_TABLE_SIZE]; */ struct tracepoint_entry { struct hlist_node hlist; - void **funcs; + struct tracepoint_func *funcs; int refcount; /* Number of times armed. 0 if disarmed. */ char name[0]; }; @@ -64,12 +64,12 @@ struct tp_probes { struct rcu_head rcu; struct list_head list; } u; - void *probes[0]; + struct tracepoint_func probes[0]; }; static inline void *allocate_probes(int count) { - struct tp_probes *p = kmalloc(count * sizeof(void *) + struct tp_probes *p = kmalloc(count * sizeof(struct tracepoint_func) + sizeof(struct tp_probes), GFP_KERNEL); return p == NULL ? NULL : p->probes; } @@ -79,7 +79,7 @@ static void rcu_free_old_probes(struct rcu_head *head) kfree(container_of(head, struct tp_probes, u.rcu)); } -static inline void release_probes(void *old) +static inline void release_probes(struct tracepoint_func *old) { if (old) { struct tp_probes *tp_probes = container_of(old, @@ -95,15 +95,16 @@ static void debug_print_probes(struct tracepoint_entry *entry) if (!tracepoint_debug || !entry->funcs) return; - for (i = 0; entry->funcs[i]; i++) - printk(KERN_DEBUG "Probe %d : %p\n", i, entry->funcs[i]); + for (i = 0; entry->funcs[i].func; i++) + printk(KERN_DEBUG "Probe %d : %p\n", i, entry->funcs[i].func); } -static void * -tracepoint_entry_add_probe(struct tracepoint_entry *entry, void *probe) +static struct tracepoint_func * +tracepoint_entry_add_probe(struct tracepoint_entry *entry, + void *probe, void *data) { int nr_probes = 0; - void **old, **new; + struct tracepoint_func *old, *new; WARN_ON(!probe); @@ -111,8 +112,9 @@ tracepoint_entry_add_probe(struct tracepoint_entry *entry, void *probe) old = entry->funcs; if (old) { /* (N -> N+1), (N != 0, 1) probes */ - for (nr_probes = 0; old[nr_probes]; nr_probes++) - if (old[nr_probes] == probe) + for (nr_probes = 0; old[nr_probes].func; nr_probes++) + if (old[nr_probes].func == probe && + old[nr_probes].data == data) return ERR_PTR(-EEXIST); } /* + 2 : one for new probe, one for NULL func */ @@ -120,9 +122,10 @@ tracepoint_entry_add_probe(struct tracepoint_entry *entry, void *probe) if (new == NULL) return ERR_PTR(-ENOMEM); if (old) - memcpy(new, old, nr_probes * sizeof(void *)); - new[nr_probes] = probe; - new[nr_probes + 1] = NULL; + memcpy(new, old, nr_probes * sizeof(struct tracepoint_func)); + new[nr_probes].func = probe; + new[nr_probes].data = data; + new[nr_probes + 1].func = NULL; entry->refcount = nr_probes + 1; entry->funcs = new; debug_print_probes(entry); @@ -130,10 +133,11 @@ tracepoint_entry_add_probe(struct tracepoint_entry *entry, void *probe) } static void * -tracepoint_entry_remove_probe(struct tracepoint_entry *entry, void *probe) +tracepoint_entry_remove_probe(struct tracepoint_entry *entry, + void *probe, void *data) { int nr_probes = 0, nr_del = 0, i; - void **old, **new; + struct tracepoint_func *old, *new; old = entry->funcs; @@ -142,8 +146,10 @@ tracepoint_entry_remove_probe(struct tracepoint_entry *entry, void *probe) debug_print_probes(entry); /* (N -> M), (N > 1, M >= 0) probes */ - for (nr_probes = 0; old[nr_probes]; nr_probes++) { - if ((!probe || old[nr_probes] == probe)) + for (nr_probes = 0; old[nr_probes].func; nr_probes++) { + if (!probe || + (old[nr_probes].func == probe && + old[nr_probes].data == data)) nr_del++; } @@ -160,10 +166,11 @@ tracepoint_entry_remove_probe(struct tracepoint_entry *entry, void *probe) new = allocate_probes(nr_probes - nr_del + 1); if (new == NULL) return ERR_PTR(-ENOMEM); - for (i = 0; old[i]; i++) - if ((probe && old[i] != probe)) + for (i = 0; old[i].func; i++) + if (probe && + (old[i].func != probe || old[i].data != data)) new[j++] = old[i]; - new[nr_probes - nr_del] = NULL; + new[nr_probes - nr_del].func = NULL; entry->refcount = nr_probes - nr_del; entry->funcs = new; } @@ -315,18 +322,19 @@ static void tracepoint_update_probes(void) module_update_tracepoints(); } -static void *tracepoint_add_probe(const char *name, void *probe) +static struct tracepoint_func * +tracepoint_add_probe(const char *name, void *probe, void *data) { struct tracepoint_entry *entry; - void *old; + struct tracepoint_func *old; entry = get_tracepoint(name); if (!entry) { entry = add_tracepoint(name); if (IS_ERR(entry)) - return entry; + return (struct tracepoint_func *)entry; } - old = tracepoint_entry_add_probe(entry, probe); + old = tracepoint_entry_add_probe(entry, probe, data); if (IS_ERR(old) && !entry->refcount) remove_tracepoint(entry); return old; @@ -340,12 +348,12 @@ static void *tracepoint_add_probe(const char *name, void *probe) * Returns 0 if ok, error value on error. * The probe address must at least be aligned on the architecture pointer size. */ -int tracepoint_probe_register(const char *name, void *probe) +int tracepoint_probe_register(const char *name, void *probe, void *data) { - void *old; + struct tracepoint_func *old; mutex_lock(&tracepoints_mutex); - old = tracepoint_add_probe(name, probe); + old = tracepoint_add_probe(name, probe, data); mutex_unlock(&tracepoints_mutex); if (IS_ERR(old)) return PTR_ERR(old); @@ -356,15 +364,16 @@ int tracepoint_probe_register(const char *name, void *probe) } EXPORT_SYMBOL_GPL(tracepoint_probe_register); -static void *tracepoint_remove_probe(const char *name, void *probe) +static struct tracepoint_func * +tracepoint_remove_probe(const char *name, void *probe, void *data) { struct tracepoint_entry *entry; - void *old; + struct tracepoint_func *old; entry = get_tracepoint(name); if (!entry) return ERR_PTR(-ENOENT); - old = tracepoint_entry_remove_probe(entry, probe); + old = tracepoint_entry_remove_probe(entry, probe, data); if (IS_ERR(old)) return old; if (!entry->refcount) @@ -382,12 +391,12 @@ static void *tracepoint_remove_probe(const char *name, void *probe) * itself uses stop_machine(), which insures that every preempt disabled section * have finished. */ -int tracepoint_probe_unregister(const char *name, void *probe) +int tracepoint_probe_unregister(const char *name, void *probe, void *data) { - void *old; + struct tracepoint_func *old; mutex_lock(&tracepoints_mutex); - old = tracepoint_remove_probe(name, probe); + old = tracepoint_remove_probe(name, probe, data); mutex_unlock(&tracepoints_mutex); if (IS_ERR(old)) return PTR_ERR(old); @@ -418,12 +427,13 @@ static void tracepoint_add_old_probes(void *old) * * caller must call tracepoint_probe_update_all() */ -int tracepoint_probe_register_noupdate(const char *name, void *probe) +int tracepoint_probe_register_noupdate(const char *name, void *probe, + void *data) { - void *old; + struct tracepoint_func *old; mutex_lock(&tracepoints_mutex); - old = tracepoint_add_probe(name, probe); + old = tracepoint_add_probe(name, probe, data); if (IS_ERR(old)) { mutex_unlock(&tracepoints_mutex); return PTR_ERR(old); @@ -441,12 +451,13 @@ EXPORT_SYMBOL_GPL(tracepoint_probe_register_noupdate); * * caller must call tracepoint_probe_update_all() */ -int tracepoint_probe_unregister_noupdate(const char *name, void *probe) +int tracepoint_probe_unregister_noupdate(const char *name, void *probe, + void *data) { - void *old; + struct tracepoint_func *old; mutex_lock(&tracepoints_mutex); - old = tracepoint_remove_probe(name, probe); + old = tracepoint_remove_probe(name, probe, data); if (IS_ERR(old)) { mutex_unlock(&tracepoints_mutex); return PTR_ERR(old); diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c index b2d70d38dff4..25915832291a 100644 --- a/kernel/user_namespace.c +++ b/kernel/user_namespace.c @@ -9,6 +9,7 @@ #include <linux/nsproxy.h> #include <linux/slab.h> #include <linux/user_namespace.h> +#include <linux/highuid.h> #include <linux/cred.h> /* @@ -82,3 +83,46 @@ void free_user_ns(struct kref *kref) schedule_work(&ns->destroyer); } EXPORT_SYMBOL(free_user_ns); + +uid_t user_ns_map_uid(struct user_namespace *to, const struct cred *cred, uid_t uid) +{ + struct user_namespace *tmp; + + if (likely(to == cred->user->user_ns)) + return uid; + + + /* Is cred->user the creator of the target user_ns + * or the creator of one of it's parents? + */ + for ( tmp = to; tmp != &init_user_ns; + tmp = tmp->creator->user_ns ) { + if (cred->user == tmp->creator) { + return (uid_t)0; + } + } + + /* No useful relationship so no mapping */ + return overflowuid; +} + +gid_t user_ns_map_gid(struct user_namespace *to, const struct cred *cred, gid_t gid) +{ + struct user_namespace *tmp; + + if (likely(to == cred->user->user_ns)) + return gid; + + /* Is cred->user the creator of the target user_ns + * or the creator of one of it's parents? + */ + for ( tmp = to; tmp != &init_user_ns; + tmp = tmp->creator->user_ns ) { + if (cred->user == tmp->creator) { + return (gid_t)0; + } + } + + /* No useful relationship so no mapping */ + return overflowgid; +} diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 77dabbf64b8f..327d2deb4451 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -1110,7 +1110,7 @@ static int __devinit workqueue_cpu_callback(struct notifier_block *nfb, unsigned int cpu = (unsigned long)hcpu; struct cpu_workqueue_struct *cwq; struct workqueue_struct *wq; - int ret = NOTIFY_OK; + int err = 0; action &= ~CPU_TASKS_FROZEN; @@ -1124,12 +1124,13 @@ undo: switch (action) { case CPU_UP_PREPARE: - if (!create_workqueue_thread(cwq, cpu)) + err = create_workqueue_thread(cwq, cpu); + if (!err) break; printk(KERN_ERR "workqueue [%s] for %i failed\n", wq->name, cpu); action = CPU_UP_CANCELED; - ret = NOTIFY_BAD; + err = -ENOMEM; goto undo; case CPU_ONLINE: @@ -1150,7 +1151,7 @@ undo: cpumask_clear_cpu(cpu, cpu_populated_map); } - return ret; + return notifier_from_errno(err); } #ifdef CONFIG_SMP |