diff options
Diffstat (limited to 'kernel')
-rw-r--r-- | kernel/Makefile | 2 | ||||
-rw-r--r-- | kernel/acct.c | 4 | ||||
-rw-r--r-- | kernel/cpu.c | 75 | ||||
-rw-r--r-- | kernel/cpuset.c | 24 | ||||
-rw-r--r-- | kernel/delayacct.c | 178 | ||||
-rw-r--r-- | kernel/exit.c | 10 | ||||
-rw-r--r-- | kernel/fork.c | 6 | ||||
-rw-r--r-- | kernel/futex.c | 121 | ||||
-rw-r--r-- | kernel/futex_compat.c | 34 | ||||
-rw-r--r-- | kernel/kallsyms.c | 4 | ||||
-rw-r--r-- | kernel/kthread.c | 24 | ||||
-rw-r--r-- | kernel/module.c | 11 | ||||
-rw-r--r-- | kernel/rtmutex-tester.c | 1 | ||||
-rw-r--r-- | kernel/sched.c | 84 | ||||
-rw-r--r-- | kernel/softirq.c | 2 | ||||
-rw-r--r-- | kernel/taskstats.c | 568 | ||||
-rw-r--r-- | kernel/timer.c | 20 |
17 files changed, 1021 insertions, 147 deletions
diff --git a/kernel/Makefile b/kernel/Makefile index 47dbcd570cd8..d62ec66c1af2 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -48,6 +48,8 @@ obj-$(CONFIG_GENERIC_HARDIRQS) += irq/ obj-$(CONFIG_SECCOMP) += seccomp.o obj-$(CONFIG_RCU_TORTURE_TEST) += rcutorture.o obj-$(CONFIG_RELAY) += relay.o +obj-$(CONFIG_TASK_DELAY_ACCT) += delayacct.o +obj-$(CONFIG_TASKSTATS) += taskstats.o ifneq ($(CONFIG_SCHED_NO_NO_OMIT_FRAME_POINTER),y) # According to Alan Modra <alan@linuxcare.com.au>, the -fno-omit-frame-pointer is diff --git a/kernel/acct.c b/kernel/acct.c index f18e0b8df3e1..2a7c933651c7 100644 --- a/kernel/acct.c +++ b/kernel/acct.c @@ -488,7 +488,7 @@ static void do_acct_process(struct file *file) old_encode_dev(tty_devnum(current->signal->tty)) : 0; read_unlock(&tasklist_lock); - spin_lock(¤t->sighand->siglock); + spin_lock_irq(¤t->sighand->siglock); ac.ac_utime = encode_comp_t(jiffies_to_AHZ(cputime_to_jiffies(pacct->ac_utime))); ac.ac_stime = encode_comp_t(jiffies_to_AHZ(cputime_to_jiffies(pacct->ac_stime))); ac.ac_flag = pacct->ac_flag; @@ -496,7 +496,7 @@ static void do_acct_process(struct file *file) ac.ac_minflt = encode_comp_t(pacct->ac_minflt); ac.ac_majflt = encode_comp_t(pacct->ac_majflt); ac.ac_exitcode = pacct->ac_exitcode; - spin_unlock(¤t->sighand->siglock); + spin_unlock_irq(¤t->sighand->siglock); ac.ac_io = encode_comp_t(0 /* current->io_usage */); /* %% */ ac.ac_rw = encode_comp_t(ac.ac_io / 1024); ac.ac_swaps = encode_comp_t(0); diff --git a/kernel/cpu.c b/kernel/cpu.c index 70fbf2e83766..f230f9ae01c2 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -16,56 +16,48 @@ #include <linux/mutex.h> /* This protects CPUs going up and down... */ -static DEFINE_MUTEX(cpucontrol); +static DEFINE_MUTEX(cpu_add_remove_lock); +static DEFINE_MUTEX(cpu_bitmask_lock); static __cpuinitdata BLOCKING_NOTIFIER_HEAD(cpu_chain); #ifdef CONFIG_HOTPLUG_CPU -static struct task_struct *lock_cpu_hotplug_owner; -static int lock_cpu_hotplug_depth; -static int __lock_cpu_hotplug(int interruptible) -{ - int ret = 0; - - if (lock_cpu_hotplug_owner != current) { - if (interruptible) - ret = mutex_lock_interruptible(&cpucontrol); - else - mutex_lock(&cpucontrol); - } - - /* - * Set only if we succeed in locking - */ - if (!ret) { - lock_cpu_hotplug_depth++; - lock_cpu_hotplug_owner = current; - } - - return ret; -} +/* Crappy recursive lock-takers in cpufreq! Complain loudly about idiots */ +static struct task_struct *recursive; +static int recursive_depth; void lock_cpu_hotplug(void) { - __lock_cpu_hotplug(0); + struct task_struct *tsk = current; + + if (tsk == recursive) { + static int warnings = 10; + if (warnings) { + printk(KERN_ERR "Lukewarm IQ detected in hotplug locking\n"); + WARN_ON(1); + warnings--; + } + recursive_depth++; + return; + } + mutex_lock(&cpu_bitmask_lock); + recursive = tsk; } EXPORT_SYMBOL_GPL(lock_cpu_hotplug); void unlock_cpu_hotplug(void) { - if (--lock_cpu_hotplug_depth == 0) { - lock_cpu_hotplug_owner = NULL; - mutex_unlock(&cpucontrol); + WARN_ON(recursive != current); + if (recursive_depth) { + recursive_depth--; + return; } + mutex_unlock(&cpu_bitmask_lock); + recursive = NULL; } EXPORT_SYMBOL_GPL(unlock_cpu_hotplug); -int lock_cpu_hotplug_interruptible(void) -{ - return __lock_cpu_hotplug(1); -} -EXPORT_SYMBOL_GPL(lock_cpu_hotplug_interruptible); #endif /* CONFIG_HOTPLUG_CPU */ /* Need to know about CPUs going up/down? */ @@ -122,9 +114,7 @@ int cpu_down(unsigned int cpu) struct task_struct *p; cpumask_t old_allowed, tmp; - if ((err = lock_cpu_hotplug_interruptible()) != 0) - return err; - + mutex_lock(&cpu_add_remove_lock); if (num_online_cpus() == 1) { err = -EBUSY; goto out; @@ -150,7 +140,10 @@ int cpu_down(unsigned int cpu) cpu_clear(cpu, tmp); set_cpus_allowed(current, tmp); + mutex_lock(&cpu_bitmask_lock); p = __stop_machine_run(take_cpu_down, NULL, cpu); + mutex_unlock(&cpu_bitmask_lock); + if (IS_ERR(p)) { /* CPU didn't die: tell everyone. Can't complain. */ if (blocking_notifier_call_chain(&cpu_chain, CPU_DOWN_FAILED, @@ -187,7 +180,7 @@ out_thread: out_allowed: set_cpus_allowed(current, old_allowed); out: - unlock_cpu_hotplug(); + mutex_unlock(&cpu_add_remove_lock); return err; } #endif /*CONFIG_HOTPLUG_CPU*/ @@ -197,9 +190,7 @@ int __devinit cpu_up(unsigned int cpu) int ret; void *hcpu = (void *)(long)cpu; - if ((ret = lock_cpu_hotplug_interruptible()) != 0) - return ret; - + mutex_lock(&cpu_add_remove_lock); if (cpu_online(cpu) || !cpu_present(cpu)) { ret = -EINVAL; goto out; @@ -214,7 +205,9 @@ int __devinit cpu_up(unsigned int cpu) } /* Arch-specific enabling code. */ + mutex_lock(&cpu_bitmask_lock); ret = __cpu_up(cpu); + mutex_unlock(&cpu_bitmask_lock); if (ret != 0) goto out_notify; BUG_ON(!cpu_online(cpu)); @@ -227,6 +220,6 @@ out_notify: blocking_notifier_call_chain(&cpu_chain, CPU_UP_CANCELED, hcpu); out: - unlock_cpu_hotplug(); + mutex_unlock(&cpu_add_remove_lock); return ret; } diff --git a/kernel/cpuset.c b/kernel/cpuset.c index c232dc077438..1a649f2bb9bb 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -762,6 +762,8 @@ static int validate_change(const struct cpuset *cur, const struct cpuset *trial) * * Call with manage_mutex held. May nest a call to the * lock_cpu_hotplug()/unlock_cpu_hotplug() pair. + * Must not be called holding callback_mutex, because we must + * not call lock_cpu_hotplug() while holding callback_mutex. */ static void update_cpu_domains(struct cpuset *cur) @@ -781,7 +783,7 @@ static void update_cpu_domains(struct cpuset *cur) if (is_cpu_exclusive(c)) cpus_andnot(pspan, pspan, c->cpus_allowed); } - if (is_removed(cur) || !is_cpu_exclusive(cur)) { + if (!is_cpu_exclusive(cur)) { cpus_or(pspan, pspan, cur->cpus_allowed); if (cpus_equal(pspan, cur->cpus_allowed)) return; @@ -1917,6 +1919,17 @@ static int cpuset_mkdir(struct inode *dir, struct dentry *dentry, int mode) return cpuset_create(c_parent, dentry->d_name.name, mode | S_IFDIR); } +/* + * Locking note on the strange update_flag() call below: + * + * If the cpuset being removed is marked cpu_exclusive, then simulate + * turning cpu_exclusive off, which will call update_cpu_domains(). + * The lock_cpu_hotplug() call in update_cpu_domains() must not be + * made while holding callback_mutex. Elsewhere the kernel nests + * callback_mutex inside lock_cpu_hotplug() calls. So the reverse + * nesting would risk an ABBA deadlock. + */ + static int cpuset_rmdir(struct inode *unused_dir, struct dentry *dentry) { struct cpuset *cs = dentry->d_fsdata; @@ -1936,11 +1949,16 @@ static int cpuset_rmdir(struct inode *unused_dir, struct dentry *dentry) mutex_unlock(&manage_mutex); return -EBUSY; } + if (is_cpu_exclusive(cs)) { + int retval = update_flag(CS_CPU_EXCLUSIVE, cs, "0"); + if (retval < 0) { + mutex_unlock(&manage_mutex); + return retval; + } + } parent = cs->parent; mutex_lock(&callback_mutex); set_bit(CS_REMOVED, &cs->flags); - if (is_cpu_exclusive(cs)) - update_cpu_domains(cs); list_del(&cs->sibling); /* delete my sibling from parent->children */ spin_lock(&cs->dentry->d_lock); d = dget(cs->dentry); diff --git a/kernel/delayacct.c b/kernel/delayacct.c new file mode 100644 index 000000000000..f05392d64267 --- /dev/null +++ b/kernel/delayacct.c @@ -0,0 +1,178 @@ +/* delayacct.c - per-task delay accounting + * + * Copyright (C) Shailabh Nagar, IBM Corp. 2006 + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it would be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See + * the GNU General Public License for more details. + */ + +#include <linux/sched.h> +#include <linux/slab.h> +#include <linux/time.h> +#include <linux/sysctl.h> +#include <linux/delayacct.h> + +int delayacct_on __read_mostly; /* Delay accounting turned on/off */ +kmem_cache_t *delayacct_cache; + +static int __init delayacct_setup_enable(char *str) +{ + delayacct_on = 1; + return 1; +} +__setup("delayacct", delayacct_setup_enable); + +void delayacct_init(void) +{ + delayacct_cache = kmem_cache_create("delayacct_cache", + sizeof(struct task_delay_info), + 0, + SLAB_PANIC, + NULL, NULL); + delayacct_tsk_init(&init_task); +} + +void __delayacct_tsk_init(struct task_struct *tsk) +{ + spin_lock_init(&tsk->delays_lock); + /* No need to acquire tsk->delays_lock for allocation here unless + __delayacct_tsk_init called after tsk is attached to tasklist + */ + tsk->delays = kmem_cache_zalloc(delayacct_cache, SLAB_KERNEL); + if (tsk->delays) + spin_lock_init(&tsk->delays->lock); +} + +void __delayacct_tsk_exit(struct task_struct *tsk) +{ + struct task_delay_info *delays = tsk->delays; + spin_lock(&tsk->delays_lock); + tsk->delays = NULL; + spin_unlock(&tsk->delays_lock); + kmem_cache_free(delayacct_cache, delays); +} + +/* + * Start accounting for a delay statistic using + * its starting timestamp (@start) + */ + +static inline void delayacct_start(struct timespec *start) +{ + do_posix_clock_monotonic_gettime(start); +} + +/* + * Finish delay accounting for a statistic using + * its timestamps (@start, @end), accumalator (@total) and @count + */ + +static void delayacct_end(struct timespec *start, struct timespec *end, + u64 *total, u32 *count) +{ + struct timespec ts; + s64 ns; + + do_posix_clock_monotonic_gettime(end); + ts = timespec_sub(*end, *start); + ns = timespec_to_ns(&ts); + if (ns < 0) + return; + + spin_lock(¤t->delays->lock); + *total += ns; + (*count)++; + spin_unlock(¤t->delays->lock); +} + +void __delayacct_blkio_start(void) +{ + delayacct_start(¤t->delays->blkio_start); +} + +void __delayacct_blkio_end(void) +{ + if (current->delays->flags & DELAYACCT_PF_SWAPIN) + /* Swapin block I/O */ + delayacct_end(¤t->delays->blkio_start, + ¤t->delays->blkio_end, + ¤t->delays->swapin_delay, + ¤t->delays->swapin_count); + else /* Other block I/O */ + delayacct_end(¤t->delays->blkio_start, + ¤t->delays->blkio_end, + ¤t->delays->blkio_delay, + ¤t->delays->blkio_count); +} + +int __delayacct_add_tsk(struct taskstats *d, struct task_struct *tsk) +{ + s64 tmp; + struct timespec ts; + unsigned long t1,t2,t3; + + spin_lock(&tsk->delays_lock); + + /* Though tsk->delays accessed later, early exit avoids + * unnecessary returning of other data + */ + if (!tsk->delays) + goto done; + + tmp = (s64)d->cpu_run_real_total; + cputime_to_timespec(tsk->utime + tsk->stime, &ts); + tmp += timespec_to_ns(&ts); + d->cpu_run_real_total = (tmp < (s64)d->cpu_run_real_total) ? 0 : tmp; + + /* + * No locking available for sched_info (and too expensive to add one) + * Mitigate by taking snapshot of values + */ + t1 = tsk->sched_info.pcnt; + t2 = tsk->sched_info.run_delay; + t3 = tsk->sched_info.cpu_time; + + d->cpu_count += t1; + + jiffies_to_timespec(t2, &ts); + tmp = (s64)d->cpu_delay_total + timespec_to_ns(&ts); + d->cpu_delay_total = (tmp < (s64)d->cpu_delay_total) ? 0 : tmp; + + tmp = (s64)d->cpu_run_virtual_total + (s64)jiffies_to_usecs(t3) * 1000; + d->cpu_run_virtual_total = + (tmp < (s64)d->cpu_run_virtual_total) ? 0 : tmp; + + /* zero XXX_total, non-zero XXX_count implies XXX stat overflowed */ + + spin_lock(&tsk->delays->lock); + tmp = d->blkio_delay_total + tsk->delays->blkio_delay; + d->blkio_delay_total = (tmp < d->blkio_delay_total) ? 0 : tmp; + tmp = d->swapin_delay_total + tsk->delays->swapin_delay; + d->swapin_delay_total = (tmp < d->swapin_delay_total) ? 0 : tmp; + d->blkio_count += tsk->delays->blkio_count; + d->swapin_count += tsk->delays->swapin_count; + spin_unlock(&tsk->delays->lock); + +done: + spin_unlock(&tsk->delays_lock); + return 0; +} + +__u64 __delayacct_blkio_ticks(struct task_struct *tsk) +{ + __u64 ret; + + spin_lock(&tsk->delays->lock); + ret = nsec_to_clock_t(tsk->delays->blkio_delay + + tsk->delays->swapin_delay); + spin_unlock(&tsk->delays->lock); + return ret; +} + diff --git a/kernel/exit.c b/kernel/exit.c index 6664c084783d..dba194a8d416 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -25,6 +25,8 @@ #include <linux/mount.h> #include <linux/proc_fs.h> #include <linux/mempolicy.h> +#include <linux/taskstats_kern.h> +#include <linux/delayacct.h> #include <linux/cpuset.h> #include <linux/syscalls.h> #include <linux/signal.h> @@ -843,7 +845,9 @@ static void exit_notify(struct task_struct *tsk) fastcall NORET_TYPE void do_exit(long code) { struct task_struct *tsk = current; + struct taskstats *tidstats; int group_dead; + unsigned int mycpu; profile_task_exit(tsk); @@ -881,6 +885,8 @@ fastcall NORET_TYPE void do_exit(long code) current->comm, current->pid, preempt_count()); + taskstats_exit_alloc(&tidstats, &mycpu); + acct_update_integrals(tsk); if (tsk->mm) { update_hiwater_rss(tsk->mm); @@ -900,6 +906,10 @@ fastcall NORET_TYPE void do_exit(long code) #endif if (unlikely(tsk->audit_context)) audit_free(tsk); + taskstats_exit_send(tsk, tidstats, group_dead, mycpu); + taskstats_exit_free(tidstats); + delayacct_tsk_exit(tsk); + exit_mm(tsk); if (group_dead) diff --git a/kernel/fork.c b/kernel/fork.c index 926e5a68ea9e..1b0f7b1e0881 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -43,6 +43,8 @@ #include <linux/rmap.h> #include <linux/acct.h> #include <linux/cn_proc.h> +#include <linux/delayacct.h> +#include <linux/taskstats_kern.h> #include <asm/pgtable.h> #include <asm/pgalloc.h> @@ -818,6 +820,7 @@ static inline int copy_signal(unsigned long clone_flags, struct task_struct * ts if (clone_flags & CLONE_THREAD) { atomic_inc(¤t->signal->count); atomic_inc(¤t->signal->live); + taskstats_tgid_alloc(current->signal); return 0; } sig = kmem_cache_alloc(signal_cachep, GFP_KERNEL); @@ -862,6 +865,7 @@ static inline int copy_signal(unsigned long clone_flags, struct task_struct * ts INIT_LIST_HEAD(&sig->cpu_timers[0]); INIT_LIST_HEAD(&sig->cpu_timers[1]); INIT_LIST_HEAD(&sig->cpu_timers[2]); + taskstats_tgid_init(sig); task_lock(current->group_leader); memcpy(sig->rlim, current->signal->rlim, sizeof sig->rlim); @@ -883,6 +887,7 @@ static inline int copy_signal(unsigned long clone_flags, struct task_struct * ts void __cleanup_signal(struct signal_struct *sig) { exit_thread_group_keys(sig); + taskstats_tgid_free(sig); kmem_cache_free(signal_cachep, sig); } @@ -1000,6 +1005,7 @@ static struct task_struct *copy_process(unsigned long clone_flags, goto bad_fork_cleanup_put_domain; p->did_exec = 0; + delayacct_tsk_init(p); /* Must remain after dup_task_struct() */ copy_flags(clone_flags, p); p->pid = pid; retval = -EFAULT; diff --git a/kernel/futex.c b/kernel/futex.c index cf0c8e21d1ab..dda2049692a2 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -415,15 +415,15 @@ out_unlock: */ void exit_pi_state_list(struct task_struct *curr) { - struct futex_hash_bucket *hb; struct list_head *next, *head = &curr->pi_state_list; struct futex_pi_state *pi_state; + struct futex_hash_bucket *hb; union futex_key key; /* * We are a ZOMBIE and nobody can enqueue itself on * pi_state_list anymore, but we have to be careful - * versus waiters unqueueing themselfs + * versus waiters unqueueing themselves: */ spin_lock_irq(&curr->pi_lock); while (!list_empty(head)) { @@ -431,21 +431,24 @@ void exit_pi_state_list(struct task_struct *curr) next = head->next; pi_state = list_entry(next, struct futex_pi_state, list); key = pi_state->key; + hb = hash_futex(&key); spin_unlock_irq(&curr->pi_lock); - hb = hash_futex(&key); spin_lock(&hb->lock); spin_lock_irq(&curr->pi_lock); + /* + * We dropped the pi-lock, so re-check whether this + * task still owns the PI-state: + */ if (head->next != next) { spin_unlock(&hb->lock); continue; } - list_del_init(&pi_state->list); - WARN_ON(pi_state->owner != curr); - + WARN_ON(list_empty(&pi_state->list)); + list_del_init(&pi_state->list); pi_state->owner = NULL; spin_unlock_irq(&curr->pi_lock); @@ -470,7 +473,7 @@ lookup_pi_state(u32 uval, struct futex_hash_bucket *hb, struct futex_q *me) head = &hb->chain; list_for_each_entry_safe(this, next, head, list) { - if (match_futex (&this->key, &me->key)) { + if (match_futex(&this->key, &me->key)) { /* * Another waiter already exists - bump up * the refcount and return its pi_state: @@ -482,6 +485,8 @@ lookup_pi_state(u32 uval, struct futex_hash_bucket *hb, struct futex_q *me) if (unlikely(!pi_state)) return -EINVAL; + WARN_ON(!atomic_read(&pi_state->refcount)); + atomic_inc(&pi_state->refcount); me->pi_state = pi_state; @@ -490,10 +495,13 @@ lookup_pi_state(u32 uval, struct futex_hash_bucket *hb, struct futex_q *me) } /* - * We are the first waiter - try to look up the real owner and - * attach the new pi_state to it: + * We are the first waiter - try to look up the real owner and attach + * the new pi_state to it, but bail out when the owner died bit is set + * and TID = 0: */ pid = uval & FUTEX_TID_MASK; + if (!pid && (uval & FUTEX_OWNER_DIED)) + return -ESRCH; p = futex_find_get_task(pid); if (!p) return -ESRCH; @@ -510,6 +518,7 @@ lookup_pi_state(u32 uval, struct futex_hash_bucket *hb, struct futex_q *me) pi_state->key = me->key; spin_lock_irq(&p->pi_lock); + WARN_ON(!list_empty(&pi_state->list)); list_add(&pi_state->list, &p->pi_state_list); pi_state->owner = p; spin_unlock_irq(&p->pi_lock); @@ -573,20 +582,29 @@ static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this) * kept enabled while there is PI state around. We must also * preserve the owner died bit.) */ - newval = (uval & FUTEX_OWNER_DIED) | FUTEX_WAITERS | new_owner->pid; + if (!(uval & FUTEX_OWNER_DIED)) { + newval = FUTEX_WAITERS | new_owner->pid; - inc_preempt_count(); - curval = futex_atomic_cmpxchg_inatomic(uaddr, uval, newval); - dec_preempt_count(); + inc_preempt_count(); + curval = futex_atomic_cmpxchg_inatomic(uaddr, uval, newval); + dec_preempt_count(); + if (curval == -EFAULT) + return -EFAULT; + if (curval != uval) + return -EINVAL; + } - if (curval == -EFAULT) - return -EFAULT; - if (curval != uval) - return -EINVAL; + spin_lock_irq(&pi_state->owner->pi_lock); + WARN_ON(list_empty(&pi_state->list)); + list_del_init(&pi_state->list); + spin_unlock_irq(&pi_state->owner->pi_lock); - list_del_init(&pi_state->owner->pi_state_list); + spin_lock_irq(&new_owner->pi_lock); + WARN_ON(!list_empty(&pi_state->list)); list_add(&pi_state->list, &new_owner->pi_state_list); pi_state->owner = new_owner; + spin_unlock_irq(&new_owner->pi_lock); + rt_mutex_unlock(&pi_state->pi_mutex); return 0; @@ -1236,6 +1254,7 @@ static int do_futex_lock_pi(u32 __user *uaddr, int detect, int trylock, /* Owner died? */ if (q.pi_state->owner != NULL) { spin_lock_irq(&q.pi_state->owner->pi_lock); + WARN_ON(list_empty(&q.pi_state->list)); list_del_init(&q.pi_state->list); spin_unlock_irq(&q.pi_state->owner->pi_lock); } else @@ -1244,6 +1263,7 @@ static int do_futex_lock_pi(u32 __user *uaddr, int detect, int trylock, q.pi_state->owner = current; spin_lock_irq(¤t->pi_lock); + WARN_ON(!list_empty(&q.pi_state->list)); list_add(&q.pi_state->list, ¤t->pi_state_list); spin_unlock_irq(¤t->pi_lock); @@ -1427,9 +1447,11 @@ retry_locked: * again. If it succeeds then we can return without waking * anyone else up: */ - inc_preempt_count(); - uval = futex_atomic_cmpxchg_inatomic(uaddr, current->pid, 0); - dec_preempt_count(); + if (!(uval & FUTEX_OWNER_DIED)) { + inc_preempt_count(); + uval = futex_atomic_cmpxchg_inatomic(uaddr, current->pid, 0); + dec_preempt_count(); + } if (unlikely(uval == -EFAULT)) goto pi_faulted; @@ -1462,9 +1484,11 @@ retry_locked: /* * No waiters - kernel unlocks the futex: */ - ret = unlock_futex_pi(uaddr, uval); - if (ret == -EFAULT) - goto pi_faulted; + if (!(uval & FUTEX_OWNER_DIED)) { + ret = unlock_futex_pi(uaddr, uval); + if (ret == -EFAULT) + goto pi_faulted; + } out_unlock: spin_unlock(&hb->lock); @@ -1683,9 +1707,9 @@ err_unlock: * Process a futex-list entry, check whether it's owned by the * dying task, and do notification if so: */ -int handle_futex_death(u32 __user *uaddr, struct task_struct *curr) +int handle_futex_death(u32 __user *uaddr, struct task_struct *curr, int pi) { - u32 uval, nval; + u32 uval, nval, mval; retry: if (get_user(uval, uaddr)) @@ -1702,21 +1726,45 @@ retry: * thread-death.) The rest of the cleanup is done in * userspace. */ - nval = futex_atomic_cmpxchg_inatomic(uaddr, uval, - uval | FUTEX_OWNER_DIED); + mval = (uval & FUTEX_WAITERS) | FUTEX_OWNER_DIED; + nval = futex_atomic_cmpxchg_inatomic(uaddr, uval, mval); + if (nval == -EFAULT) return -1; if (nval != uval) goto retry; - if (uval & FUTEX_WAITERS) - futex_wake(uaddr, 1); + /* + * Wake robust non-PI futexes here. The wakeup of + * PI futexes happens in exit_pi_state(): + */ + if (!pi) { + if (uval & FUTEX_WAITERS) + futex_wake(uaddr, 1); + } } return 0; } /* + * Fetch a robust-list pointer. Bit 0 signals PI futexes: + */ +static inline int fetch_robust_entry(struct robust_list __user **entry, + struct robust_list __user **head, int *pi) +{ + unsigned long uentry; + + if (get_user(uentry, (unsigned long *)head)) + return -EFAULT; + + *entry = (void *)(uentry & ~1UL); + *pi = uentry & 1; + + return 0; +} + +/* * Walk curr->robust_list (very carefully, it's a userspace list!) * and mark any locks found there dead, and notify any waiters. * @@ -1726,14 +1774,14 @@ void exit_robust_list(struct task_struct *curr) { struct robust_list_head __user *head = curr->robust_list; struct robust_list __user *entry, *pending; - unsigned int limit = ROBUST_LIST_LIMIT; + unsigned int limit = ROBUST_LIST_LIMIT, pi, pip; unsigned long futex_offset; /* * Fetch the list head (which was registered earlier, via * sys_set_robust_list()): */ - if (get_user(entry, &head->list.next)) + if (fetch_robust_entry(&entry, &head->list.next, &pi)) return; /* * Fetch the relative futex offset: @@ -1744,10 +1792,11 @@ void exit_robust_list(struct task_struct *curr) * Fetch any possibly pending lock-add first, and handle it * if it exists: */ - if (get_user(pending, &head->list_op_pending)) + if (fetch_robust_entry(&pending, &head->list_op_pending, &pip)) return; + if (pending) - handle_futex_death((void *)pending + futex_offset, curr); + handle_futex_death((void *)pending + futex_offset, curr, pip); while (entry != &head->list) { /* @@ -1756,12 +1805,12 @@ void exit_robust_list(struct task_struct *curr) */ if (entry != pending) if (handle_futex_death((void *)entry + futex_offset, - curr)) + curr, pi)) return; /* * Fetch the next entry in the list: */ - if (get_user(entry, &entry->next)) + if (fetch_robust_entry(&entry, &entry->next, &pi)) return; /* * Avoid excessively long or circular lists: diff --git a/kernel/futex_compat.c b/kernel/futex_compat.c index d1d92b441fb7..d1aab1a452cc 100644 --- a/kernel/futex_compat.c +++ b/kernel/futex_compat.c @@ -12,6 +12,23 @@ #include <asm/uaccess.h> + +/* + * Fetch a robust-list pointer. Bit 0 signals PI futexes: + */ +static inline int +fetch_robust_entry(compat_uptr_t *uentry, struct robust_list __user **entry, + compat_uptr_t *head, int *pi) +{ + if (get_user(*uentry, head)) + return -EFAULT; + + *entry = compat_ptr((*uentry) & ~1); + *pi = (unsigned int)(*uentry) & 1; + + return 0; +} + /* * Walk curr->robust_list (very carefully, it's a userspace list!) * and mark any locks found there dead, and notify any waiters. @@ -22,17 +39,16 @@ void compat_exit_robust_list(struct task_struct *curr) { struct compat_robust_list_head __user *head = curr->compat_robust_list; struct robust_list __user *entry, *pending; + unsigned int limit = ROBUST_LIST_LIMIT, pi; compat_uptr_t uentry, upending; - unsigned int limit = ROBUST_LIST_LIMIT; compat_long_t futex_offset; /* * Fetch the list head (which was registered earlier, via * sys_set_robust_list()): */ - if (get_user(uentry, &head->list.next)) + if (fetch_robust_entry(&uentry, &entry, &head->list.next, &pi)) return; - entry = compat_ptr(uentry); /* * Fetch the relative futex offset: */ @@ -42,11 +58,11 @@ void compat_exit_robust_list(struct task_struct *curr) * Fetch any possibly pending lock-add first, and handle it * if it exists: */ - if (get_user(upending, &head->list_op_pending)) + if (fetch_robust_entry(&upending, &pending, + &head->list_op_pending, &pi)) return; - pending = compat_ptr(upending); if (upending) - handle_futex_death((void *)pending + futex_offset, curr); + handle_futex_death((void *)pending + futex_offset, curr, pi); while (compat_ptr(uentry) != &head->list) { /* @@ -55,15 +71,15 @@ void compat_exit_robust_list(struct task_struct *curr) */ if (entry != pending) if (handle_futex_death((void *)entry + futex_offset, - curr)) + curr, pi)) return; /* * Fetch the next entry in the list: */ - if (get_user(uentry, (compat_uptr_t *)&entry->next)) + if (fetch_robust_entry(&uentry, &entry, + (compat_uptr_t *)&entry->next, &pi)) return; - entry = compat_ptr(uentry); /* * Avoid excessively long or circular lists: */ diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c index 39277dd6bf90..ab16a5a4cfe9 100644 --- a/kernel/kallsyms.c +++ b/kernel/kallsyms.c @@ -275,8 +275,8 @@ static void upcase_if_global(struct kallsym_iter *iter) static int get_ksymbol_mod(struct kallsym_iter *iter) { iter->owner = module_get_kallsym(iter->pos - kallsyms_num_syms, - &iter->value, - &iter->type, iter->name); + &iter->value, &iter->type, + iter->name, sizeof(iter->name)); if (iter->owner == NULL) return 0; diff --git a/kernel/kthread.c b/kernel/kthread.c index 24be714b04c7..4f9c60ef95e8 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c @@ -216,23 +216,6 @@ EXPORT_SYMBOL(kthread_bind); */ int kthread_stop(struct task_struct *k) { - return kthread_stop_sem(k, NULL); -} -EXPORT_SYMBOL(kthread_stop); - -/** - * kthread_stop_sem - stop a thread created by kthread_create(). - * @k: thread created by kthread_create(). - * @s: semaphore that @k waits on while idle. - * - * Does essentially the same thing as kthread_stop() above, but wakes - * @k by calling up(@s). - * - * Returns the result of threadfn(), or %-EINTR if wake_up_process() - * was never called. - */ -int kthread_stop_sem(struct task_struct *k, struct semaphore *s) -{ int ret; mutex_lock(&kthread_stop_lock); @@ -246,10 +229,7 @@ int kthread_stop_sem(struct task_struct *k, struct semaphore *s) /* Now set kthread_should_stop() to true, and wake it up. */ kthread_stop_info.k = k; - if (s) - up(s); - else - wake_up_process(k); + wake_up_process(k); put_task_struct(k); /* Once it dies, reset stop ptr, gather result and we're done. */ @@ -260,7 +240,7 @@ int kthread_stop_sem(struct task_struct *k, struct semaphore *s) return ret; } -EXPORT_SYMBOL(kthread_stop_sem); +EXPORT_SYMBOL(kthread_stop); static __init int helper_init(void) { diff --git a/kernel/module.c b/kernel/module.c index 35e1b1f859d7..2a19cd47c046 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -2019,10 +2019,8 @@ const char *module_address_lookup(unsigned long addr, return NULL; } -struct module *module_get_kallsym(unsigned int symnum, - unsigned long *value, - char *type, - char namebuf[128]) +struct module *module_get_kallsym(unsigned int symnum, unsigned long *value, + char *type, char *name, size_t namelen) { struct module *mod; @@ -2031,9 +2029,8 @@ struct module *module_get_kallsym(unsigned int symnum, if (symnum < mod->num_symtab) { *value = mod->symtab[symnum].st_value; *type = mod->symtab[symnum].st_info; - strncpy(namebuf, - mod->strtab + mod->symtab[symnum].st_name, - 127); + strlcpy(name, mod->strtab + mod->symtab[symnum].st_name, + namelen); mutex_unlock(&module_mutex); return mod; } diff --git a/kernel/rtmutex-tester.c b/kernel/rtmutex-tester.c index 494dac872a13..948bd8f643e2 100644 --- a/kernel/rtmutex-tester.c +++ b/kernel/rtmutex-tester.c @@ -275,6 +275,7 @@ static int test_func(void *data) /* Wait for the next command to be executed */ schedule(); + try_to_freeze(); if (signal_pending(current)) flush_signals(current); diff --git a/kernel/sched.c b/kernel/sched.c index d714611f1691..b44b9a43b0fc 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -51,6 +51,7 @@ #include <linux/times.h> #include <linux/acct.h> #include <linux/kprobes.h> +#include <linux/delayacct.h> #include <asm/tlb.h> #include <asm/unistd.h> @@ -501,9 +502,36 @@ struct file_operations proc_schedstat_operations = { .release = single_release, }; +/* + * Expects runqueue lock to be held for atomicity of update + */ +static inline void +rq_sched_info_arrive(struct rq *rq, unsigned long delta_jiffies) +{ + if (rq) { + rq->rq_sched_info.run_delay += delta_jiffies; + rq->rq_sched_info.pcnt++; + } +} + +/* + * Expects runqueue lock to be held for atomicity of update + */ +static inline void +rq_sched_info_depart(struct rq *rq, unsigned long delta_jiffies) +{ + if (rq) + rq->rq_sched_info.cpu_time += delta_jiffies; +} # define schedstat_inc(rq, field) do { (rq)->field++; } while (0) # define schedstat_add(rq, field, amt) do { (rq)->field += (amt); } while (0) #else /* !CONFIG_SCHEDSTATS */ +static inline void +rq_sched_info_arrive(struct rq *rq, unsigned long delta_jiffies) +{} +static inline void +rq_sched_info_depart(struct rq *rq, unsigned long delta_jiffies) +{} # define schedstat_inc(rq, field) do { } while (0) # define schedstat_add(rq, field, amt) do { } while (0) #endif @@ -523,7 +551,7 @@ static inline struct rq *this_rq_lock(void) return rq; } -#ifdef CONFIG_SCHEDSTATS +#if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT) /* * Called when a process is dequeued from the active array and given * the cpu. We should note that with the exception of interactive @@ -551,21 +579,16 @@ static inline void sched_info_dequeued(struct task_struct *t) */ static void sched_info_arrive(struct task_struct *t) { - unsigned long now = jiffies, diff = 0; - struct rq *rq = task_rq(t); + unsigned long now = jiffies, delta_jiffies = 0; if (t->sched_info.last_queued) - diff = now - t->sched_info.last_queued; + delta_jiffies = now - t->sched_info.last_queued; sched_info_dequeued(t); - t->sched_info.run_delay += diff; + t->sched_info.run_delay += delta_jiffies; t->sched_info.last_arrival = now; t->sched_info.pcnt++; - if (!rq) - return; - - rq->rq_sched_info.run_delay += diff; - rq->rq_sched_info.pcnt++; + rq_sched_info_arrive(task_rq(t), delta_jiffies); } /* @@ -585,8 +608,9 @@ static void sched_info_arrive(struct task_struct *t) */ static inline void sched_info_queued(struct task_struct *t) { - if (!t->sched_info.last_queued) - t->sched_info.last_queued = jiffies; + if (unlikely(sched_info_on())) + if (!t->sched_info.last_queued) + t->sched_info.last_queued = jiffies; } /* @@ -595,13 +619,10 @@ static inline void sched_info_queued(struct task_struct *t) */ static inline void sched_info_depart(struct task_struct *t) { - struct rq *rq = task_rq(t); - unsigned long diff = jiffies - t->sched_info.last_arrival; - - t->sched_info.cpu_time += diff; + unsigned long delta_jiffies = jiffies - t->sched_info.last_arrival; - if (rq) - rq->rq_sched_info.cpu_time += diff; + t->sched_info.cpu_time += delta_jiffies; + rq_sched_info_depart(task_rq(t), delta_jiffies); } /* @@ -610,7 +631,7 @@ static inline void sched_info_depart(struct task_struct *t) * the idle task.) We are only called when prev != next. */ static inline void -sched_info_switch(struct task_struct *prev, struct task_struct *next) +__sched_info_switch(struct task_struct *prev, struct task_struct *next) { struct rq *rq = task_rq(prev); @@ -625,10 +646,16 @@ sched_info_switch(struct task_struct *prev, struct task_struct *next) if (next != rq->idle) sched_info_arrive(next); } +static inline void +sched_info_switch(struct task_struct *prev, struct task_struct *next) +{ + if (unlikely(sched_info_on())) + __sched_info_switch(prev, next); +} #else #define sched_info_queued(t) do { } while (0) #define sched_info_switch(t, next) do { } while (0) -#endif /* CONFIG_SCHEDSTATS */ +#endif /* CONFIG_SCHEDSTATS || CONFIG_TASK_DELAY_ACCT */ /* * Adding/removing a task to/from a priority array: @@ -1530,8 +1557,9 @@ void fastcall sched_fork(struct task_struct *p, int clone_flags) INIT_LIST_HEAD(&p->run_list); p->array = NULL; -#ifdef CONFIG_SCHEDSTATS - memset(&p->sched_info, 0, sizeof(p->sched_info)); +#if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT) + if (unlikely(sched_info_on())) + memset(&p->sched_info, 0, sizeof(p->sched_info)); #endif #if defined(CONFIG_SMP) && defined(__ARCH_WANT_UNLOCKED_CTXSW) p->oncpu = 0; @@ -1788,7 +1816,15 @@ context_switch(struct rq *rq, struct task_struct *prev, WARN_ON(rq->prev_mm); rq->prev_mm = oldmm; } + /* + * Since the runqueue lock will be released by the next + * task (which is an invalid locking op but in the case + * of the scheduler it's an obvious special-case), so we + * do an early lockdep release here: + */ +#ifndef __ARCH_WANT_UNLOCKED_CTXSW spin_release(&rq->lock.dep_map, 1, _THIS_IP_); +#endif /* Here we just switch the register state and the stack. */ switch_to(prev, next, prev); @@ -4526,9 +4562,11 @@ void __sched io_schedule(void) { struct rq *rq = &__raw_get_cpu_var(runqueues); + delayacct_blkio_start(); atomic_inc(&rq->nr_iowait); schedule(); atomic_dec(&rq->nr_iowait); + delayacct_blkio_end(); } EXPORT_SYMBOL(io_schedule); @@ -4537,9 +4575,11 @@ long __sched io_schedule_timeout(long timeout) struct rq *rq = &__raw_get_cpu_var(runqueues); long ret; + delayacct_blkio_start(); atomic_inc(&rq->nr_iowait); ret = schedule_timeout(timeout); atomic_dec(&rq->nr_iowait); + delayacct_blkio_end(); return ret; } diff --git a/kernel/softirq.c b/kernel/softirq.c index fd12f2556f0d..0f08a84ae307 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -311,8 +311,6 @@ void open_softirq(int nr, void (*action)(struct softirq_action*), void *data) softirq_vec[nr].action = action; } -EXPORT_UNUSED_SYMBOL(open_softirq); /* June 2006 */ - /* Tasklets */ struct tasklet_head { diff --git a/kernel/taskstats.c b/kernel/taskstats.c new file mode 100644 index 000000000000..f45179ce028e --- /dev/null +++ b/kernel/taskstats.c @@ -0,0 +1,568 @@ +/* + * taskstats.c - Export per-task statistics to userland + * + * Copyright (C) Shailabh Nagar, IBM Corp. 2006 + * (C) Balbir Singh, IBM Corp. 2006 + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + */ + +#include <linux/kernel.h> +#include <linux/taskstats_kern.h> +#include <linux/delayacct.h> +#include <linux/cpumask.h> +#include <linux/percpu.h> +#include <net/genetlink.h> +#include <asm/atomic.h> + +/* + * Maximum length of a cpumask that can be specified in + * the TASKSTATS_CMD_ATTR_REGISTER/DEREGISTER_CPUMASK attribute + */ +#define TASKSTATS_CPUMASK_MAXLEN (100+6*NR_CPUS) + +static DEFINE_PER_CPU(__u32, taskstats_seqnum) = { 0 }; +static int family_registered; +kmem_cache_t *taskstats_cache; + +static struct genl_family family = { + .id = GENL_ID_GENERATE, + .name = TASKSTATS_GENL_NAME, + .version = TASKSTATS_GENL_VERSION, + .maxattr = TASKSTATS_CMD_ATTR_MAX, +}; + +static struct nla_policy taskstats_cmd_get_policy[TASKSTATS_CMD_ATTR_MAX+1] +__read_mostly = { + [TASKSTATS_CMD_ATTR_PID] = { .type = NLA_U32 }, + [TASKSTATS_CMD_ATTR_TGID] = { .type = NLA_U32 }, + [TASKSTATS_CMD_ATTR_REGISTER_CPUMASK] = { .type = NLA_STRING }, + [TASKSTATS_CMD_ATTR_DEREGISTER_CPUMASK] = { .type = NLA_STRING },}; + +struct listener { + struct list_head list; + pid_t pid; + char valid; +}; + +struct listener_list { + struct rw_semaphore sem; + struct list_head list; +}; +static DEFINE_PER_CPU(struct listener_list, listener_array); + +enum actions { + REGISTER, + DEREGISTER, + CPU_DONT_CARE +}; + +static int prepare_reply(struct genl_info *info, u8 cmd, struct sk_buff **skbp, + void **replyp, size_t size) +{ + struct sk_buff *skb; + void *reply; + + /* + * If new attributes are added, please revisit this allocation + */ + skb = nlmsg_new(size); + if (!skb) + return -ENOMEM; + + if (!info) { + int seq = get_cpu_var(taskstats_seqnum)++; + put_cpu_var(taskstats_seqnum); + + reply = genlmsg_put(skb, 0, seq, + family.id, 0, 0, + cmd, family.version); + } else + reply = genlmsg_put(skb, info->snd_pid, info->snd_seq, + family.id, 0, 0, + cmd, family.version); + if (reply == NULL) { + nlmsg_free(skb); + return -EINVAL; + } + + *skbp = skb; + *replyp = reply; + return 0; +} + +/* + * Send taskstats data in @skb to listener with nl_pid @pid + */ +static int send_reply(struct sk_buff *skb, pid_t pid) +{ + struct genlmsghdr *genlhdr = nlmsg_data((struct nlmsghdr *)skb->data); + void *reply = genlmsg_data(genlhdr); + int rc; + + rc = genlmsg_end(skb, reply); + if (rc < 0) { + nlmsg_free(skb); + return rc; + } + + return genlmsg_unicast(skb, pid); +} + +/* + * Send taskstats data in @skb to listeners registered for @cpu's exit data + */ +static int send_cpu_listeners(struct sk_buff *skb, unsigned int cpu) +{ + struct genlmsghdr *genlhdr = nlmsg_data((struct nlmsghdr *)skb->data); + struct listener_list *listeners; + struct listener *s, *tmp; + struct sk_buff *skb_next, *skb_cur = skb; + void *reply = genlmsg_data(genlhdr); + int rc, ret, delcount = 0; + + rc = genlmsg_end(skb, reply); + if (rc < 0) { + nlmsg_free(skb); + return rc; + } + + rc = 0; + listeners = &per_cpu(listener_array, cpu); + down_read(&listeners->sem); + list_for_each_entry_safe(s, tmp, &listeners->list, list) { + skb_next = NULL; + if (!list_is_last(&s->list, &listeners->list)) { + skb_next = skb_clone(skb_cur, GFP_KERNEL); + if (!skb_next) { + nlmsg_free(skb_cur); + rc = -ENOMEM; + break; + } + } + ret = genlmsg_unicast(skb_cur, s->pid); + if (ret == -ECONNREFUSED) { + s->valid = 0; + delcount++; + rc = ret; + } + skb_cur = skb_next; + } + up_read(&listeners->sem); + + if (!delcount) + return rc; + + /* Delete invalidated entries */ + down_write(&listeners->sem); + list_for_each_entry_safe(s, tmp, &listeners->list, list) { + if (!s->valid) { + list_del(&s->list); + kfree(s); + } + } + up_write(&listeners->sem); + return rc; +} + +static int fill_pid(pid_t pid, struct task_struct *pidtsk, + struct taskstats *stats) +{ + int rc; + struct task_struct *tsk = pidtsk; + + if (!pidtsk) { + read_lock(&tasklist_lock); + tsk = find_task_by_pid(pid); + if (!tsk) { + read_unlock(&tasklist_lock); + return -ESRCH; + } + get_task_struct(tsk); + read_unlock(&tasklist_lock); + } else + get_task_struct(tsk); + + /* + * Each accounting subsystem adds calls to its functions to + * fill in relevant parts of struct taskstsats as follows + * + * rc = per-task-foo(stats, tsk); + * if (rc) + * goto err; + */ + + rc = delayacct_add_tsk(stats, tsk); + stats->version = TASKSTATS_VERSION; + + /* Define err: label here if needed */ + put_task_struct(tsk); + return rc; + +} + +static int fill_tgid(pid_t tgid, struct task_struct *tgidtsk, + struct taskstats *stats) +{ + struct task_struct *tsk, *first; + unsigned long flags; + + /* + * Add additional stats from live tasks except zombie thread group + * leaders who are already counted with the dead tasks + */ + first = tgidtsk; + if (!first) { + read_lock(&tasklist_lock); + first = find_task_by_pid(tgid); + if (!first) { + read_unlock(&tasklist_lock); + return -ESRCH; + } + get_task_struct(first); + read_unlock(&tasklist_lock); + } else + get_task_struct(first); + + /* Start with stats from dead tasks */ + spin_lock_irqsave(&first->signal->stats_lock, flags); + if (first->signal->stats) + memcpy(stats, first->signal->stats, sizeof(*stats)); + spin_unlock_irqrestore(&first->signal->stats_lock, flags); + + tsk = first; + read_lock(&tasklist_lock); + do { + if (tsk->exit_state == EXIT_ZOMBIE && thread_group_leader(tsk)) + continue; + /* + * Accounting subsystem can call its functions here to + * fill in relevant parts of struct taskstsats as follows + * + * per-task-foo(stats, tsk); + */ + delayacct_add_tsk(stats, tsk); + + } while_each_thread(first, tsk); + read_unlock(&tasklist_lock); + stats->version = TASKSTATS_VERSION; + + /* + * Accounting subsytems can also add calls here to modify + * fields of taskstats. + */ + + return 0; +} + + +static void fill_tgid_exit(struct task_struct *tsk) +{ + unsigned long flags; + + spin_lock_irqsave(&tsk->signal->stats_lock, flags); + if (!tsk->signal->stats) + goto ret; + + /* + * Each accounting subsystem calls its functions here to + * accumalate its per-task stats for tsk, into the per-tgid structure + * + * per-task-foo(tsk->signal->stats, tsk); + */ + delayacct_add_tsk(tsk->signal->stats, tsk); +ret: + spin_unlock_irqrestore(&tsk->signal->stats_lock, flags); + return; +} + +static int add_del_listener(pid_t pid, cpumask_t *maskp, int isadd) +{ + struct listener_list *listeners; + struct listener *s, *tmp; + unsigned int cpu; + cpumask_t mask = *maskp; + + if (!cpus_subset(mask, cpu_possible_map)) + return -EINVAL; + + if (isadd == REGISTER) { + for_each_cpu_mask(cpu, mask) { + s = kmalloc_node(sizeof(struct listener), GFP_KERNEL, + cpu_to_node(cpu)); + if (!s) + goto cleanup; + s->pid = pid; + INIT_LIST_HEAD(&s->list); + s->valid = 1; + + listeners = &per_cpu(listener_array, cpu); + down_write(&listeners->sem); + list_add(&s->list, &listeners->list); + up_write(&listeners->sem); + } + return 0; + } + + /* Deregister or cleanup */ +cleanup: + for_each_cpu_mask(cpu, mask) { + listeners = &per_cpu(listener_array, cpu); + down_write(&listeners->sem); + list_for_each_entry_safe(s, tmp, &listeners->list, list) { + if (s->pid == pid) { + list_del(&s->list); + kfree(s); + break; + } + } + up_write(&listeners->sem); + } + return 0; +} + +static int parse(struct nlattr *na, cpumask_t *mask) +{ + char *data; + int len; + int ret; + + if (na == NULL) + return 1; + len = nla_len(na); + if (len > TASKSTATS_CPUMASK_MAXLEN) + return -E2BIG; + if (len < 1) + return -EINVAL; + data = kmalloc(len, GFP_KERNEL); + if (!data) + return -ENOMEM; + nla_strlcpy(data, na, len); + ret = cpulist_parse(data, *mask); + kfree(data); + return ret; +} + +static int taskstats_user_cmd(struct sk_buff *skb, struct genl_info *info) +{ + int rc = 0; + struct sk_buff *rep_skb; + struct taskstats stats; + void *reply; + size_t size; + struct nlattr *na; + cpumask_t mask; + + rc = parse(info->attrs[TASKSTATS_CMD_ATTR_REGISTER_CPUMASK], &mask); + if (rc < 0) + return rc; + if (rc == 0) + return add_del_listener(info->snd_pid, &mask, REGISTER); + + rc = parse(info->attrs[TASKSTATS_CMD_ATTR_DEREGISTER_CPUMASK], &mask); + if (rc < 0) + return rc; + if (rc == 0) + return add_del_listener(info->snd_pid, &mask, DEREGISTER); + + /* + * Size includes space for nested attributes + */ + size = nla_total_size(sizeof(u32)) + + nla_total_size(sizeof(struct taskstats)) + nla_total_size(0); + + memset(&stats, 0, sizeof(stats)); + rc = prepare_reply(info, TASKSTATS_CMD_NEW, &rep_skb, &reply, size); + if (rc < 0) + return rc; + + if (info->attrs[TASKSTATS_CMD_ATTR_PID]) { + u32 pid = nla_get_u32(info->attrs[TASKSTATS_CMD_ATTR_PID]); + rc = fill_pid(pid, NULL, &stats); + if (rc < 0) + goto err; + + na = nla_nest_start(rep_skb, TASKSTATS_TYPE_AGGR_PID); + NLA_PUT_U32(rep_skb, TASKSTATS_TYPE_PID, pid); + NLA_PUT_TYPE(rep_skb, struct taskstats, TASKSTATS_TYPE_STATS, + stats); + } else if (info->attrs[TASKSTATS_CMD_ATTR_TGID]) { + u32 tgid = nla_get_u32(info->attrs[TASKSTATS_CMD_ATTR_TGID]); + rc = fill_tgid(tgid, NULL, &stats); + if (rc < 0) + goto err; + + na = nla_nest_start(rep_skb, TASKSTATS_TYPE_AGGR_TGID); + NLA_PUT_U32(rep_skb, TASKSTATS_TYPE_TGID, tgid); + NLA_PUT_TYPE(rep_skb, struct taskstats, TASKSTATS_TYPE_STATS, + stats); + } else { + rc = -EINVAL; + goto err; + } + + nla_nest_end(rep_skb, na); + + return send_reply(rep_skb, info->snd_pid); + +nla_put_failure: + return genlmsg_cancel(rep_skb, reply); +err: + nlmsg_free(rep_skb); + return rc; +} + +void taskstats_exit_alloc(struct taskstats **ptidstats, unsigned int *mycpu) +{ + struct listener_list *listeners; + struct taskstats *tmp; + /* + * This is the cpu on which the task is exiting currently and will + * be the one for which the exit event is sent, even if the cpu + * on which this function is running changes later. + */ + *mycpu = raw_smp_processor_id(); + + *ptidstats = NULL; + tmp = kmem_cache_zalloc(taskstats_cache, SLAB_KERNEL); + if (!tmp) + return; + + listeners = &per_cpu(listener_array, *mycpu); + down_read(&listeners->sem); + if (!list_empty(&listeners->list)) { + *ptidstats = tmp; + tmp = NULL; + } + up_read(&listeners->sem); + kfree(tmp); +} + +/* Send pid data out on exit */ +void taskstats_exit_send(struct task_struct *tsk, struct taskstats *tidstats, + int group_dead, unsigned int mycpu) +{ + int rc; + struct sk_buff *rep_skb; + void *reply; + size_t size; + int is_thread_group; + struct nlattr *na; + unsigned long flags; + + if (!family_registered || !tidstats) + return; + + spin_lock_irqsave(&tsk->signal->stats_lock, flags); + is_thread_group = tsk->signal->stats ? 1 : 0; + spin_unlock_irqrestore(&tsk->signal->stats_lock, flags); + + rc = 0; + /* + * Size includes space for nested attributes + */ + size = nla_total_size(sizeof(u32)) + + nla_total_size(sizeof(struct taskstats)) + nla_total_size(0); + + if (is_thread_group) + size = 2 * size; /* PID + STATS + TGID + STATS */ + + rc = prepare_reply(NULL, TASKSTATS_CMD_NEW, &rep_skb, &reply, size); + if (rc < 0) + goto ret; + + rc = fill_pid(tsk->pid, tsk, tidstats); + if (rc < 0) + goto err_skb; + + na = nla_nest_start(rep_skb, TASKSTATS_TYPE_AGGR_PID); + NLA_PUT_U32(rep_skb, TASKSTATS_TYPE_PID, (u32)tsk->pid); + NLA_PUT_TYPE(rep_skb, struct taskstats, TASKSTATS_TYPE_STATS, + *tidstats); + nla_nest_end(rep_skb, na); + + if (!is_thread_group) + goto send; + + /* + * tsk has/had a thread group so fill the tsk->signal->stats structure + * Doesn't matter if tsk is the leader or the last group member leaving + */ + + fill_tgid_exit(tsk); + if (!group_dead) + goto send; + + na = nla_nest_start(rep_skb, TASKSTATS_TYPE_AGGR_TGID); + NLA_PUT_U32(rep_skb, TASKSTATS_TYPE_TGID, (u32)tsk->tgid); + /* No locking needed for tsk->signal->stats since group is dead */ + NLA_PUT_TYPE(rep_skb, struct taskstats, TASKSTATS_TYPE_STATS, + *tsk->signal->stats); + nla_nest_end(rep_skb, na); + +send: + send_cpu_listeners(rep_skb, mycpu); + return; + +nla_put_failure: + genlmsg_cancel(rep_skb, reply); + goto ret; +err_skb: + nlmsg_free(rep_skb); +ret: + return; +} + +static struct genl_ops taskstats_ops = { + .cmd = TASKSTATS_CMD_GET, + .doit = taskstats_user_cmd, + .policy = taskstats_cmd_get_policy, +}; + +/* Needed early in initialization */ +void __init taskstats_init_early(void) +{ + unsigned int i; + + taskstats_cache = kmem_cache_create("taskstats_cache", + sizeof(struct taskstats), + 0, SLAB_PANIC, NULL, NULL); + for_each_possible_cpu(i) { + INIT_LIST_HEAD(&(per_cpu(listener_array, i).list)); + init_rwsem(&(per_cpu(listener_array, i).sem)); + } +} + +static int __init taskstats_init(void) +{ + int rc; + + rc = genl_register_family(&family); + if (rc) + return rc; + + rc = genl_register_ops(&family, &taskstats_ops); + if (rc < 0) + goto err; + + family_registered = 1; + return 0; +err: + genl_unregister_family(&family); + return rc; +} + +/* + * late initcall ensures initialization of statistics collection + * mechanisms precedes initialization of the taskstats interface + */ +late_initcall(taskstats_init); diff --git a/kernel/timer.c b/kernel/timer.c index 2a87430a58d4..05809c2e2fd6 100644 --- a/kernel/timer.c +++ b/kernel/timer.c @@ -374,6 +374,7 @@ int del_timer_sync(struct timer_list *timer) int ret = try_to_del_timer_sync(timer); if (ret >= 0) return ret; + cpu_relax(); } } @@ -968,6 +969,7 @@ void __init timekeeping_init(void) } +static int timekeeping_suspended; /* * timekeeping_resume - Resumes the generic timekeeping subsystem. * @dev: unused @@ -983,6 +985,18 @@ static int timekeeping_resume(struct sys_device *dev) write_seqlock_irqsave(&xtime_lock, flags); /* restart the last cycle value */ clock->cycle_last = clocksource_read(clock); + clock->error = 0; + timekeeping_suspended = 0; + write_sequnlock_irqrestore(&xtime_lock, flags); + return 0; +} + +static int timekeeping_suspend(struct sys_device *dev, pm_message_t state) +{ + unsigned long flags; + + write_seqlock_irqsave(&xtime_lock, flags); + timekeeping_suspended = 1; write_sequnlock_irqrestore(&xtime_lock, flags); return 0; } @@ -990,6 +1004,7 @@ static int timekeeping_resume(struct sys_device *dev) /* sysfs resume/suspend bits for timekeeping */ static struct sysdev_class timekeeping_sysclass = { .resume = timekeeping_resume, + .suspend = timekeeping_suspend, set_kset_name("timekeeping"), }; @@ -1100,13 +1115,16 @@ static void update_wall_time(void) { cycle_t offset; - clock->xtime_nsec += (s64)xtime.tv_nsec << clock->shift; + /* Make sure we're fully resumed: */ + if (unlikely(timekeeping_suspended)) + return; #ifdef CONFIG_GENERIC_TIME offset = (clocksource_read(clock) - clock->cycle_last) & clock->mask; #else offset = clock->cycle_interval; #endif + clock->xtime_nsec += (s64)xtime.tv_nsec << clock->shift; /* normally this loop will run just once, however in the * case of lost or late ticks, it will accumulate correctly. |