17 files changed, 1021 insertions, 147 deletions
diff --git a/kernel/Makefile b/kernel/Makefile
index 47dbcd570cd8..d62ec66c1af2 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -48,6 +48,8 @@ obj-$(CONFIG_GENERIC_HARDIRQS) += irq/
 obj-$(CONFIG_SECCOMP) += seccomp.o
 obj-$(CONFIG_RCU_TORTURE_TEST) += rcutorture.o
 obj-$(CONFIG_RELAY) += relay.o
+obj-$(CONFIG_TASK_DELAY_ACCT) += delayacct.o
+obj-$(CONFIG_TASKSTATS) += taskstats.o
 
 ifneq ($(CONFIG_SCHED_NO_NO_OMIT_FRAME_POINTER),y)
 # According to Alan Modra <alan@linuxcare.com.au>, the -fno-omit-frame-pointer is
diff --git a/kernel/acct.c b/kernel/acct.c
index f18e0b8df3e1..2a7c933651c7 100644
--- a/kernel/acct.c
+++ b/kernel/acct.c
@@ -488,7 +488,7 @@ static void do_acct_process(struct file *file)
 		old_encode_dev(tty_devnum(current->signal->tty)) : 0;
 	read_unlock(&tasklist_lock);
 
-	spin_lock(&current->sighand->siglock);
+	spin_lock_irq(&current->sighand->siglock);
 	ac.ac_utime = encode_comp_t(jiffies_to_AHZ(cputime_to_jiffies(pacct->ac_utime)));
 	ac.ac_stime = encode_comp_t(jiffies_to_AHZ(cputime_to_jiffies(pacct->ac_stime)));
 	ac.ac_flag = pacct->ac_flag;
@@ -496,7 +496,7 @@ static void do_acct_process(struct file *file)
 	ac.ac_minflt = encode_comp_t(pacct->ac_minflt);
 	ac.ac_majflt = encode_comp_t(pacct->ac_majflt);
 	ac.ac_exitcode = pacct->ac_exitcode;
-	spin_unlock(&current->sighand->siglock);
+	spin_unlock_irq(&current->sighand->siglock);
 	ac.ac_io = encode_comp_t(0 /* current->io_usage */);	/* %% */
 	ac.ac_rw = encode_comp_t(ac.ac_io / 1024);
 	ac.ac_swaps = encode_comp_t(0);
diff --git a/kernel/cpu.c b/kernel/cpu.c
index 70fbf2e83766..f230f9ae01c2 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -16,56 +16,48 @@
 #include <linux/mutex.h>
 
 /* This protects CPUs going up and down... */
-static DEFINE_MUTEX(cpucontrol);
+static DEFINE_MUTEX(cpu_add_remove_lock);
+static DEFINE_MUTEX(cpu_bitmask_lock);
 
 static __cpuinitdata BLOCKING_NOTIFIER_HEAD(cpu_chain);
 
 #ifdef CONFIG_HOTPLUG_CPU
-static struct task_struct *lock_cpu_hotplug_owner;
-static int lock_cpu_hotplug_depth;
 
-static int __lock_cpu_hotplug(int interruptible)
-{
-	int ret = 0;
-
-	if (lock_cpu_hotplug_owner != current) {
-		if (interruptible)
-			ret = mutex_lock_interruptible(&cpucontrol);
-		else
-			mutex_lock(&cpucontrol);
-	}
-
-	/*
-	 * Set only if we succeed in locking
-	 */
-	if (!ret) {
-		lock_cpu_hotplug_depth++;
-		lock_cpu_hotplug_owner = current;
-	}
-
-	return ret;
-}
+/* Crappy recursive lock-takers in cpufreq! Complain loudly about idiots */
+static struct task_struct *recursive;
+static int recursive_depth;
 
 void lock_cpu_hotplug(void)
 {
-	__lock_cpu_hotplug(0);
+	struct task_struct *tsk = current;
+
+	if (tsk == recursive) {
+		static int warnings = 10;
+		if (warnings) {
+			printk(KERN_ERR "Lukewarm IQ detected in hotplug locking\n");
+			WARN_ON(1);
+			warnings--;
+		}
+		recursive_depth++;
+		return;
+	}
+	mutex_lock(&cpu_bitmask_lock);
+	recursive = tsk;
 }
 EXPORT_SYMBOL_GPL(lock_cpu_hotplug);
 
 void unlock_cpu_hotplug(void)
 {
-	if (--lock_cpu_hotplug_depth == 0) {
-		lock_cpu_hotplug_owner = NULL;
-		mutex_unlock(&cpucontrol);
+	WARN_ON(recursive != current);
+	if (recursive_depth) {
+		recursive_depth--;
+		return;
 	}
+	recursive = NULL; /* clear owner before the mutex is released */
+	mutex_unlock(&cpu_bitmask_lock);
 }
 EXPORT_SYMBOL_GPL(unlock_cpu_hotplug);
 
-int lock_cpu_hotplug_interruptible(void)
-{
-	return __lock_cpu_hotplug(1);
-}
-EXPORT_SYMBOL_GPL(lock_cpu_hotplug_interruptible);
 #endif	/* CONFIG_HOTPLUG_CPU */
 
 /* Need to know about CPUs going up/down? */
@@ -122,9 +114,7 @@ int cpu_down(unsigned int cpu)
 	struct task_struct *p;
 	cpumask_t old_allowed, tmp;
 
-	if ((err = lock_cpu_hotplug_interruptible()) != 0)
-		return err;
-
+	mutex_lock(&cpu_add_remove_lock);
 	if (num_online_cpus() == 1) {
 		err = -EBUSY;
 		goto out;
@@ -150,7 +140,10 @@ int cpu_down(unsigned int cpu)
 	cpu_clear(cpu, tmp);
 	set_cpus_allowed(current, tmp);
 
+	mutex_lock(&cpu_bitmask_lock);
 	p = __stop_machine_run(take_cpu_down, NULL, cpu);
+	mutex_unlock(&cpu_bitmask_lock);
+
 	if (IS_ERR(p)) {
 		/* CPU didn't die: tell everyone.  Can't complain. */
 		if (blocking_notifier_call_chain(&cpu_chain, CPU_DOWN_FAILED,
@@ -187,7 +180,7 @@ out_thread:
 out_allowed:
 	set_cpus_allowed(current, old_allowed);
 out:
-	unlock_cpu_hotplug();
+	mutex_unlock(&cpu_add_remove_lock);
 	return err;
 }
 #endif /*CONFIG_HOTPLUG_CPU*/
@@ -197,9 +190,7 @@ int __devinit cpu_up(unsigned int cpu)
 	int ret;
 	void *hcpu = (void *)(long)cpu;
 
-	if ((ret = lock_cpu_hotplug_interruptible()) != 0)
-		return ret;
-
+	mutex_lock(&cpu_add_remove_lock);
 	if (cpu_online(cpu) || !cpu_present(cpu)) {
 		ret = -EINVAL;
 		goto out;
@@ -214,7 +205,9 @@ int __devinit cpu_up(unsigned int cpu)
 	}
 
 	/* Arch-specific enabling code. */
+	mutex_lock(&cpu_bitmask_lock);
 	ret = __cpu_up(cpu);
+	mutex_unlock(&cpu_bitmask_lock);
 	if (ret != 0)
 		goto out_notify;
 	BUG_ON(!cpu_online(cpu));
@@ -227,6 +220,6 @@ out_notify:
 		blocking_notifier_call_chain(&cpu_chain,
 				CPU_UP_CANCELED, hcpu);
 out:
-	unlock_cpu_hotplug();
+	mutex_unlock(&cpu_add_remove_lock);
 	return ret;
 }
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index c232dc077438..1a649f2bb9bb 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -762,6 +762,8 @@ static int validate_change(const struct cpuset *cur, const struct cpuset *trial)
  *
  * Call with manage_mutex held.  May nest a call to the
  * lock_cpu_hotplug()/unlock_cpu_hotplug() pair.
+ * Must not be called holding callback_mutex, because we must
+ * not call lock_cpu_hotplug() while holding callback_mutex.
  */
 
 static void update_cpu_domains(struct cpuset *cur)
@@ -781,7 +783,7 @@ static void update_cpu_domains(struct cpuset *cur)
 		if (is_cpu_exclusive(c))
 			cpus_andnot(pspan, pspan, c->cpus_allowed);
 	}
-	if (is_removed(cur) || !is_cpu_exclusive(cur)) {
+	if (!is_cpu_exclusive(cur)) {
 		cpus_or(pspan, pspan, cur->cpus_allowed);
 		if (cpus_equal(pspan, cur->cpus_allowed))
 			return;
@@ -1917,6 +1919,17 @@ static int cpuset_mkdir(struct inode *dir, struct dentry *dentry, int mode)
 	return cpuset_create(c_parent, dentry->d_name.name, mode | S_IFDIR);
 }
 
+/*
+ * Locking note on the strange update_flag() call below:
+ *
+ * If the cpuset being removed is marked cpu_exclusive, then simulate
+ * turning cpu_exclusive off, which will call update_cpu_domains().
+ * The lock_cpu_hotplug() call in update_cpu_domains() must not be
+ * made while holding callback_mutex.  Elsewhere the kernel nests
+ * callback_mutex inside lock_cpu_hotplug() calls.  So the reverse
+ * nesting would risk an ABBA deadlock.
+ */
+
 static int cpuset_rmdir(struct inode *unused_dir, struct dentry *dentry)
 {
 	struct cpuset *cs = dentry->d_fsdata;
@@ -1936,11 +1949,16 @@ static int cpuset_rmdir(struct inode *unused_dir, struct dentry *dentry)
 		mutex_unlock(&manage_mutex);
 		return -EBUSY;
 	}
+	if (is_cpu_exclusive(cs)) {
+		int retval = update_flag(CS_CPU_EXCLUSIVE, cs, "0");
+		if (retval < 0) {
+			mutex_unlock(&manage_mutex);
+			return retval;
+		}
+	}
 	parent = cs->parent;
 	mutex_lock(&callback_mutex);
 	set_bit(CS_REMOVED, &cs->flags);
-	if (is_cpu_exclusive(cs))
-		update_cpu_domains(cs);
 	list_del(&cs->sibling);	/* delete my sibling from parent->children */
 	spin_lock(&cs->dentry->d_lock);
 	d = dget(cs->dentry);
diff --git a/kernel/delayacct.c b/kernel/delayacct.c
new file mode 100644
index 000000000000..f05392d64267
--- /dev/null
+++ b/kernel/delayacct.c
@@ -0,0 +1,178 @@
+/* delayacct.c - per-task delay accounting
+ *
+ * Copyright (C) Shailabh Nagar, IBM Corp. 2006
+ *
+ * This program is free software;  you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it would be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See
+ * the GNU General Public License for more details.
+ */
+
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/time.h>
+#include <linux/sysctl.h>
+#include <linux/delayacct.h>
+
+int delayacct_on __read_mostly;	/* Delay accounting turned on/off */
+kmem_cache_t *delayacct_cache;
+
+static int __init delayacct_setup_enable(char *str)
+{
+	delayacct_on = 1;
+	return 1;
+}
+__setup("delayacct", delayacct_setup_enable);
+
+void delayacct_init(void)
+{
+	delayacct_cache = kmem_cache_create("delayacct_cache",
+					sizeof(struct task_delay_info),
+					0,
+					SLAB_PANIC,
+					NULL, NULL);
+	delayacct_tsk_init(&init_task);
+}
+
+void __delayacct_tsk_init(struct task_struct *tsk)
+{
+	spin_lock_init(&tsk->delays_lock);
+	/* No need to acquire tsk->delays_lock for allocation here unless
+	   __delayacct_tsk_init is called after tsk is attached to tasklist
+	*/
+	tsk->delays = kmem_cache_zalloc(delayacct_cache, SLAB_KERNEL);
+	if (tsk->delays)
+		spin_lock_init(&tsk->delays->lock);
+}
+
+void __delayacct_tsk_exit(struct task_struct *tsk)
+{
+	struct task_delay_info *delays = tsk->delays;
+	spin_lock(&tsk->delays_lock);
+	tsk->delays = NULL;
+	spin_unlock(&tsk->delays_lock);
+	kmem_cache_free(delayacct_cache, delays);
+}
+
+/*
+ * Start accounting for a delay statistic using
+ * its starting timestamp (@start)
+ */
+
+static inline void delayacct_start(struct timespec *start)
+{
+	do_posix_clock_monotonic_gettime(start);
+}
+
+/*
+ * Finish delay accounting for a statistic using
+ * its timestamps (@start, @end), accumulator (@total) and @count
+ */
+
+static void delayacct_end(struct timespec *start, struct timespec *end,
+				u64 *total, u32 *count)
+{
+	struct timespec ts;
+	s64 ns;
+
+	do_posix_clock_monotonic_gettime(end);
+	ts = timespec_sub(*end, *start);
+	ns = timespec_to_ns(&ts);
+	if (ns < 0)
+		return;
+
+	spin_lock(&current->delays->lock);
+	*total += ns;
+	(*count)++;
+	spin_unlock(&current->delays->lock);
+}
+
+void __delayacct_blkio_start(void)
+{
+	delayacct_start(&current->delays->blkio_start);
+}
+
+void __delayacct_blkio_end(void)
+{
+	if (current->delays->flags & DELAYACCT_PF_SWAPIN)
+		/* Swapin block I/O */
+		delayacct_end(&current->delays->blkio_start,
+			&current->delays->blkio_end,
+			&current->delays->swapin_delay,
+			&current->delays->swapin_count);
+	else	/* Other block I/O */
+		delayacct_end(&current->delays->blkio_start,
+			&current->delays->blkio_end,
+			&current->delays->blkio_delay,
+			&current->delays->blkio_count);
+}
+
+int __delayacct_add_tsk(struct taskstats *d, struct task_struct *tsk)
+{
+	s64 tmp;
+	struct timespec ts;
+	unsigned long t1,t2,t3;
+
+	spin_lock(&tsk->delays_lock);
+
+	/* Though tsk->delays is only accessed later, exiting early here
+	 * avoids needlessly returning the other data
+	 */
+	if (!tsk->delays)
+		goto done;
+
+	tmp = (s64)d->cpu_run_real_total;
+	cputime_to_timespec(tsk->utime + tsk->stime, &ts);
+	tmp += timespec_to_ns(&ts);
+	d->cpu_run_real_total = (tmp < (s64)d->cpu_run_real_total) ? 0 : tmp;
+
+	/*
+	 * No locking available for sched_info (and too expensive to add one)
+	 * Mitigate by taking snapshot of values
+	 */
+	t1 = tsk->sched_info.pcnt;
+	t2 = tsk->sched_info.run_delay;
+	t3 = tsk->sched_info.cpu_time;
+
+	d->cpu_count += t1;
+
+	jiffies_to_timespec(t2, &ts);
+	tmp = (s64)d->cpu_delay_total + timespec_to_ns(&ts);
+	d->cpu_delay_total = (tmp < (s64)d->cpu_delay_total) ? 0 : tmp;
+
+	tmp = (s64)d->cpu_run_virtual_total + (s64)jiffies_to_usecs(t3) * 1000;
+	d->cpu_run_virtual_total =
+		(tmp < (s64)d->cpu_run_virtual_total) ?	0 : tmp;
+
+	/* zero XXX_total, non-zero XXX_count implies XXX stat overflowed */
+
+	spin_lock(&tsk->delays->lock);
+	tmp = d->blkio_delay_total + tsk->delays->blkio_delay;
+	d->blkio_delay_total = (tmp < d->blkio_delay_total) ? 0 : tmp;
+	tmp = d->swapin_delay_total + tsk->delays->swapin_delay;
+	d->swapin_delay_total = (tmp < d->swapin_delay_total) ? 0 : tmp;
+	d->blkio_count += tsk->delays->blkio_count;
+	d->swapin_count += tsk->delays->swapin_count;
+	spin_unlock(&tsk->delays->lock);
+
+done:
+	spin_unlock(&tsk->delays_lock);
+	return 0;
+}
+
+__u64 __delayacct_blkio_ticks(struct task_struct *tsk)
+{
+	__u64 ret;
+
+	spin_lock(&tsk->delays->lock);
+	ret = nsec_to_clock_t(tsk->delays->blkio_delay +
+				tsk->delays->swapin_delay);
+	spin_unlock(&tsk->delays->lock);
+	return ret;
+}
+
diff --git a/kernel/exit.c b/kernel/exit.c
index 6664c084783d..dba194a8d416 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -25,6 +25,8 @@
 #include <linux/mount.h>
 #include <linux/proc_fs.h>
 #include <linux/mempolicy.h>
+#include <linux/taskstats_kern.h>
+#include <linux/delayacct.h>
 #include <linux/cpuset.h>
 #include <linux/syscalls.h>
 #include <linux/signal.h>
@@ -843,7 +845,9 @@ static void exit_notify(struct task_struct *tsk)
 fastcall NORET_TYPE void do_exit(long code)
 {
 	struct task_struct *tsk = current;
+	struct taskstats *tidstats;
 	int group_dead;
+	unsigned int mycpu;
 
 	profile_task_exit(tsk);
 
@@ -881,6 +885,8 @@ fastcall NORET_TYPE void do_exit(long code)
 				current->comm, current->pid,
 				preempt_count());
 
+	taskstats_exit_alloc(&tidstats, &mycpu);
+
 	acct_update_integrals(tsk);
 	if (tsk->mm) {
 		update_hiwater_rss(tsk->mm);
@@ -900,6 +906,10 @@ fastcall NORET_TYPE void do_exit(long code)
 #endif
 	if (unlikely(tsk->audit_context))
 		audit_free(tsk);
+	taskstats_exit_send(tsk, tidstats, group_dead, mycpu);
+	taskstats_exit_free(tidstats);
+	delayacct_tsk_exit(tsk);
+
 	exit_mm(tsk);
 
 	if (group_dead)
diff --git a/kernel/fork.c b/kernel/fork.c
index 926e5a68ea9e..1b0f7b1e0881 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -43,6 +43,8 @@
 #include <linux/rmap.h>
 #include <linux/acct.h>
 #include <linux/cn_proc.h>
+#include <linux/delayacct.h>
+#include <linux/taskstats_kern.h>
 
 #include <asm/pgtable.h>
 #include <asm/pgalloc.h>
@@ -818,6 +820,7 @@ static inline int copy_signal(unsigned long clone_flags, struct task_struct * ts
 	if (clone_flags & CLONE_THREAD) {
 		atomic_inc(&current->signal->count);
 		atomic_inc(&current->signal->live);
+		taskstats_tgid_alloc(current->signal);
 		return 0;
 	}
 	sig = kmem_cache_alloc(signal_cachep, GFP_KERNEL);
@@ -862,6 +865,7 @@ static inline int copy_signal(unsigned long clone_flags, struct task_struct * ts
 	INIT_LIST_HEAD(&sig->cpu_timers[0]);
 	INIT_LIST_HEAD(&sig->cpu_timers[1]);
 	INIT_LIST_HEAD(&sig->cpu_timers[2]);
+	taskstats_tgid_init(sig);
 
 	task_lock(current->group_leader);
 	memcpy(sig->rlim, current->signal->rlim, sizeof sig->rlim);
@@ -883,6 +887,7 @@ static inline int copy_signal(unsigned long clone_flags, struct task_struct * ts
 void __cleanup_signal(struct signal_struct *sig)
 {
 	exit_thread_group_keys(sig);
+	taskstats_tgid_free(sig);
 	kmem_cache_free(signal_cachep, sig);
 }
 
@@ -1000,6 +1005,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
 		goto bad_fork_cleanup_put_domain;
 
 	p->did_exec = 0;
+	delayacct_tsk_init(p);	/* Must remain after dup_task_struct() */
 	copy_flags(clone_flags, p);
 	p->pid = pid;
 	retval = -EFAULT;
diff --git a/kernel/futex.c b/kernel/futex.c
index cf0c8e21d1ab..dda2049692a2 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -415,15 +415,15 @@ out_unlock:
  */
 void exit_pi_state_list(struct task_struct *curr)
 {
-	struct futex_hash_bucket *hb;
 	struct list_head *next, *head = &curr->pi_state_list;
 	struct futex_pi_state *pi_state;
+	struct futex_hash_bucket *hb;
 	union futex_key key;
 
 	/*
 	 * We are a ZOMBIE and nobody can enqueue itself on
 	 * pi_state_list anymore, but we have to be careful
-	 * versus waiters unqueueing themselfs
+	 * versus waiters unqueueing themselves:
 	 */
 	spin_lock_irq(&curr->pi_lock);
 	while (!list_empty(head)) {
@@ -431,21 +431,24 @@ void exit_pi_state_list(struct task_struct *curr)
 		next = head->next;
 		pi_state = list_entry(next, struct futex_pi_state, list);
 		key = pi_state->key;
+		hb = hash_futex(&key);
 		spin_unlock_irq(&curr->pi_lock);
 
-		hb = hash_futex(&key);
 		spin_lock(&hb->lock);
 
 		spin_lock_irq(&curr->pi_lock);
+		/*
+		 * We dropped the pi-lock, so re-check whether this
+		 * task still owns the PI-state:
+		 */
 		if (head->next != next) {
 			spin_unlock(&hb->lock);
 			continue;
 		}
 
-		list_del_init(&pi_state->list);
-
 		WARN_ON(pi_state->owner != curr);
-
+		WARN_ON(list_empty(&pi_state->list));
+		list_del_init(&pi_state->list);
 		pi_state->owner = NULL;
 		spin_unlock_irq(&curr->pi_lock);
 
@@ -470,7 +473,7 @@ lookup_pi_state(u32 uval, struct futex_hash_bucket *hb, struct futex_q *me)
 	head = &hb->chain;
 
 	list_for_each_entry_safe(this, next, head, list) {
-		if (match_futex (&this->key, &me->key)) {
+		if (match_futex(&this->key, &me->key)) {
 			/*
 			 * Another waiter already exists - bump up
 			 * the refcount and return its pi_state:
@@ -482,6 +485,8 @@ lookup_pi_state(u32 uval, struct futex_hash_bucket *hb, struct futex_q *me)
 			if (unlikely(!pi_state))
 				return -EINVAL;
 
+			WARN_ON(!atomic_read(&pi_state->refcount));
+
 			atomic_inc(&pi_state->refcount);
 			me->pi_state = pi_state;
 
@@ -490,10 +495,13 @@ lookup_pi_state(u32 uval, struct futex_hash_bucket *hb, struct futex_q *me)
 	}
 
 	/*
-	 * We are the first waiter - try to look up the real owner and
-	 * attach the new pi_state to it:
+	 * We are the first waiter - try to look up the real owner and attach
+	 * the new pi_state to it, but bail out when the owner died bit is set
+	 * and TID = 0:
 	 */
 	pid = uval & FUTEX_TID_MASK;
+	if (!pid && (uval & FUTEX_OWNER_DIED))
+		return -ESRCH;
 	p = futex_find_get_task(pid);
 	if (!p)
 		return -ESRCH;
@@ -510,6 +518,7 @@ lookup_pi_state(u32 uval, struct futex_hash_bucket *hb, struct futex_q *me)
 	pi_state->key = me->key;
 
 	spin_lock_irq(&p->pi_lock);
+	WARN_ON(!list_empty(&pi_state->list));
 	list_add(&pi_state->list, &p->pi_state_list);
 	pi_state->owner = p;
 	spin_unlock_irq(&p->pi_lock);
@@ -573,20 +582,29 @@ static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this)
 	 * kept enabled while there is PI state around. We must also
 	 * preserve the owner died bit.)
 	 */
-	newval = (uval & FUTEX_OWNER_DIED) | FUTEX_WAITERS | new_owner->pid;
+	if (!(uval & FUTEX_OWNER_DIED)) {
+		newval = FUTEX_WAITERS | new_owner->pid;
 
-	inc_preempt_count();
-	curval = futex_atomic_cmpxchg_inatomic(uaddr, uval, newval);
-	dec_preempt_count();
+		inc_preempt_count();
+		curval = futex_atomic_cmpxchg_inatomic(uaddr, uval, newval);
+		dec_preempt_count();
+		if (curval == -EFAULT)
+			return -EFAULT;
+		if (curval != uval)
+			return -EINVAL;
+	}
 
-	if (curval == -EFAULT)
-		return -EFAULT;
-	if (curval != uval)
-		return -EINVAL;
+	spin_lock_irq(&pi_state->owner->pi_lock);
+	WARN_ON(list_empty(&pi_state->list));
+	list_del_init(&pi_state->list);
+	spin_unlock_irq(&pi_state->owner->pi_lock);
 
-	list_del_init(&pi_state->owner->pi_state_list);
+	spin_lock_irq(&new_owner->pi_lock);
+	WARN_ON(!list_empty(&pi_state->list));
 	list_add(&pi_state->list, &new_owner->pi_state_list);
 	pi_state->owner = new_owner;
+	spin_unlock_irq(&new_owner->pi_lock);
+
 	rt_mutex_unlock(&pi_state->pi_mutex);
 
 	return 0;
@@ -1236,6 +1254,7 @@ static int do_futex_lock_pi(u32 __user *uaddr, int detect, int trylock,
 		/* Owner died? */
 		if (q.pi_state->owner != NULL) {
 			spin_lock_irq(&q.pi_state->owner->pi_lock);
+			WARN_ON(list_empty(&q.pi_state->list));
 			list_del_init(&q.pi_state->list);
 			spin_unlock_irq(&q.pi_state->owner->pi_lock);
 		} else
@@ -1244,6 +1263,7 @@ static int do_futex_lock_pi(u32 __user *uaddr, int detect, int trylock,
 		q.pi_state->owner = current;
 
 		spin_lock_irq(&current->pi_lock);
+		WARN_ON(!list_empty(&q.pi_state->list));
 		list_add(&q.pi_state->list, &current->pi_state_list);
 		spin_unlock_irq(&current->pi_lock);
 
@@ -1427,9 +1447,11 @@ retry_locked:
 	 * again. If it succeeds then we can return without waking
 	 * anyone else up:
 	 */
-	inc_preempt_count();
-	uval = futex_atomic_cmpxchg_inatomic(uaddr, current->pid, 0);
-	dec_preempt_count();
+	if (!(uval & FUTEX_OWNER_DIED)) {
+		inc_preempt_count();
+		uval = futex_atomic_cmpxchg_inatomic(uaddr, current->pid, 0);
+		dec_preempt_count();
+	}
 
 	if (unlikely(uval == -EFAULT))
 		goto pi_faulted;
@@ -1462,9 +1484,11 @@ retry_locked:
 	/*
 	 * No waiters - kernel unlocks the futex:
 	 */
-	ret = unlock_futex_pi(uaddr, uval);
-	if (ret == -EFAULT)
-		goto pi_faulted;
+	if (!(uval & FUTEX_OWNER_DIED)) {
+		ret = unlock_futex_pi(uaddr, uval);
+		if (ret == -EFAULT)
+			goto pi_faulted;
+	}
 
 out_unlock:
 	spin_unlock(&hb->lock);
@@ -1683,9 +1707,9 @@ err_unlock:
  * Process a futex-list entry, check whether it's owned by the
  * dying task, and do notification if so:
  */
-int handle_futex_death(u32 __user *uaddr, struct task_struct *curr)
+int handle_futex_death(u32 __user *uaddr, struct task_struct *curr, int pi)
 {
-	u32 uval, nval;
+	u32 uval, nval, mval;
 
 retry:
 	if (get_user(uval, uaddr))
@@ -1702,21 +1726,45 @@ retry:
 		 * thread-death.) The rest of the cleanup is done in
 		 * userspace.
 		 */
-		nval = futex_atomic_cmpxchg_inatomic(uaddr, uval,
-						     uval | FUTEX_OWNER_DIED);
+		mval = (uval & FUTEX_WAITERS) | FUTEX_OWNER_DIED;
+		nval = futex_atomic_cmpxchg_inatomic(uaddr, uval, mval);
+
 		if (nval == -EFAULT)
 			return -1;
 
 		if (nval != uval)
 			goto retry;
 
-		if (uval & FUTEX_WAITERS)
-			futex_wake(uaddr, 1);
+		/*
+		 * Wake robust non-PI futexes here. The wakeup of
+		 * PI futexes happens in exit_pi_state_list():
+		 */
+		if (!pi) {
+			if (uval & FUTEX_WAITERS)
+				futex_wake(uaddr, 1);
+		}
 	}
 	return 0;
 }
 
 /*
+ * Fetch a robust-list pointer. Bit 0 signals PI futexes:
+ */
+static inline int fetch_robust_entry(struct robust_list __user **entry,
+				     struct robust_list __user **head, int *pi)
+{
+	unsigned long uentry;
+
+	if (get_user(uentry, (unsigned long *)head))
+		return -EFAULT;
+
+	*entry = (void *)(uentry & ~1UL);
+	*pi = uentry & 1;
+
+	return 0;
+}
+
+/*
  * Walk curr->robust_list (very carefully, it's a userspace list!)
  * and mark any locks found there dead, and notify any waiters.
  *
@@ -1726,14 +1774,14 @@ void exit_robust_list(struct task_struct *curr)
 {
 	struct robust_list_head __user *head = curr->robust_list;
 	struct robust_list __user *entry, *pending;
-	unsigned int limit = ROBUST_LIST_LIMIT;
+	unsigned int limit = ROBUST_LIST_LIMIT, pi, pip;
 	unsigned long futex_offset;
 
 	/*
 	 * Fetch the list head (which was registered earlier, via
 	 * sys_set_robust_list()):
 	 */
-	if (get_user(entry, &head->list.next))
+	if (fetch_robust_entry(&entry, &head->list.next, &pi))
 		return;
 	/*
 	 * Fetch the relative futex offset:
@@ -1744,10 +1792,11 @@ void exit_robust_list(struct task_struct *curr)
 	 * Fetch any possibly pending lock-add first, and handle it
 	 * if it exists:
 	 */
-	if (get_user(pending, &head->list_op_pending))
+	if (fetch_robust_entry(&pending, &head->list_op_pending, &pip))
 		return;
+
 	if (pending)
-		handle_futex_death((void *)pending + futex_offset, curr);
+		handle_futex_death((void *)pending + futex_offset, curr, pip);
 
 	while (entry != &head->list) {
 		/*
@@ -1756,12 +1805,12 @@ void exit_robust_list(struct task_struct *curr)
 		 */
 		if (entry != pending)
 			if (handle_futex_death((void *)entry + futex_offset,
-						curr))
+						curr, pi))
 				return;
 		/*
 		 * Fetch the next entry in the list:
 		 */
-		if (get_user(entry, &entry->next))
+		if (fetch_robust_entry(&entry, &entry->next, &pi))
 			return;
 		/*
 		 * Avoid excessively long or circular lists:
diff --git a/kernel/futex_compat.c b/kernel/futex_compat.c
index d1d92b441fb7..d1aab1a452cc 100644
--- a/kernel/futex_compat.c
+++ b/kernel/futex_compat.c
@@ -12,6 +12,23 @@
 
 #include <asm/uaccess.h>
 
+
+/*
+ * Fetch a robust-list pointer. Bit 0 signals PI futexes:
+ */
+static inline int
+fetch_robust_entry(compat_uptr_t *uentry, struct robust_list __user **entry,
+		   compat_uptr_t *head, int *pi)
+{
+	if (get_user(*uentry, head))
+		return -EFAULT;
+
+	*entry = compat_ptr((*uentry) & ~1);
+	*pi = (unsigned int)(*uentry) & 1;
+
+	return 0;
+}
+
 /*
  * Walk curr->robust_list (very carefully, it's a userspace list!)
  * and mark any locks found there dead, and notify any waiters.
@@ -22,17 +39,16 @@ void compat_exit_robust_list(struct task_struct *curr)
 {
 	struct compat_robust_list_head __user *head = curr->compat_robust_list;
 	struct robust_list __user *entry, *pending;
+	unsigned int limit = ROBUST_LIST_LIMIT, pi, pip; /* pip: pending PI */
 	compat_uptr_t uentry, upending;
-	unsigned int limit = ROBUST_LIST_LIMIT;
 	compat_long_t futex_offset;
 
 	/*
 	 * Fetch the list head (which was registered earlier, via
 	 * sys_set_robust_list()):
 	 */
-	if (get_user(uentry, &head->list.next))
+	if (fetch_robust_entry(&uentry, &entry, &head->list.next, &pi))
 		return;
-	entry = compat_ptr(uentry);
 	/*
 	 * Fetch the relative futex offset:
 	 */
@@ -42,11 +58,11 @@ void compat_exit_robust_list(struct task_struct *curr)
 	 * Fetch any possibly pending lock-add first, and handle it
 	 * if it exists:
 	 */
-	if (get_user(upending, &head->list_op_pending))
+	if (fetch_robust_entry(&upending, &pending,
+			       &head->list_op_pending, &pip))
 		return;
-	pending = compat_ptr(upending);
 	if (upending)
-		handle_futex_death((void *)pending + futex_offset, curr);
+		handle_futex_death((void *)pending + futex_offset, curr, pip);
 
 	while (compat_ptr(uentry) != &head->list) {
 		/*
@@ -55,15 +71,15 @@ void compat_exit_robust_list(struct task_struct *curr)
 		 */
 		if (entry != pending)
 			if (handle_futex_death((void *)entry + futex_offset,
-						curr))
+						curr, pi))
 				return;
 
 		/*
 		 * Fetch the next entry in the list:
 		 */
-		if (get_user(uentry, (compat_uptr_t *)&entry->next))
+		if (fetch_robust_entry(&uentry, &entry,
+				       (compat_uptr_t *)&entry->next, &pi))
 			return;
-		entry = compat_ptr(uentry);
 		/*
 		 * Avoid excessively long or circular lists:
 		 */
diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c
index 39277dd6bf90..ab16a5a4cfe9 100644
--- a/kernel/kallsyms.c
+++ b/kernel/kallsyms.c
@@ -275,8 +275,8 @@ static void upcase_if_global(struct kallsym_iter *iter)
 static int get_ksymbol_mod(struct kallsym_iter *iter)
 {
 	iter->owner = module_get_kallsym(iter->pos - kallsyms_num_syms,
-					 &iter->value,
-					 &iter->type, iter->name);
+					 &iter->value, &iter->type,
+					 iter->name, sizeof(iter->name));
 	if (iter->owner == NULL)
 		return 0;
 
diff --git a/kernel/kthread.c b/kernel/kthread.c
index 24be714b04c7..4f9c60ef95e8 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -216,23 +216,6 @@ EXPORT_SYMBOL(kthread_bind);
  */
 int kthread_stop(struct task_struct *k)
 {
-	return kthread_stop_sem(k, NULL);
-}
-EXPORT_SYMBOL(kthread_stop);
-
-/**
- * kthread_stop_sem - stop a thread created by kthread_create().
- * @k: thread created by kthread_create().
- * @s: semaphore that @k waits on while idle.
- *
- * Does essentially the same thing as kthread_stop() above, but wakes
- * @k by calling up(@s).
- *
- * Returns the result of threadfn(), or %-EINTR if wake_up_process()
- * was never called.
- */
-int kthread_stop_sem(struct task_struct *k, struct semaphore *s)
-{
 	int ret;
 
 	mutex_lock(&kthread_stop_lock);
@@ -246,10 +229,7 @@ int kthread_stop_sem(struct task_struct *k, struct semaphore *s)
 
 	/* Now set kthread_should_stop() to true, and wake it up. */
 	kthread_stop_info.k = k;
-	if (s)
-		up(s);
-	else
-		wake_up_process(k);
+	wake_up_process(k);
 	put_task_struct(k);
 
 	/* Once it dies, reset stop ptr, gather result and we're done. */
@@ -260,7 +240,7 @@ int kthread_stop_sem(struct task_struct *k, struct semaphore *s)
 
 	return ret;
 }
-EXPORT_SYMBOL(kthread_stop_sem);
+EXPORT_SYMBOL(kthread_stop);
 
 static __init int helper_init(void)
 {
diff --git a/kernel/module.c b/kernel/module.c
index 35e1b1f859d7..2a19cd47c046 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -2019,10 +2019,8 @@ const char *module_address_lookup(unsigned long addr,
 	return NULL;
 }
 
-struct module *module_get_kallsym(unsigned int symnum,
-				  unsigned long *value,
-				  char *type,
-				  char namebuf[128])
+struct module *module_get_kallsym(unsigned int symnum, unsigned long *value,
+				char *type, char *name, size_t namelen)
 {
 	struct module *mod;
 
@@ -2031,9 +2029,8 @@ struct module *module_get_kallsym(unsigned int symnum,
 		if (symnum < mod->num_symtab) {
 			*value = mod->symtab[symnum].st_value;
 			*type = mod->symtab[symnum].st_info;
-			strncpy(namebuf,
-				mod->strtab + mod->symtab[symnum].st_name,
-				127);
+			strlcpy(name, mod->strtab + mod->symtab[symnum].st_name,
+				namelen);
 			mutex_unlock(&module_mutex);
 			return mod;
 		}
diff --git a/kernel/rtmutex-tester.c b/kernel/rtmutex-tester.c
index 494dac872a13..948bd8f643e2 100644
--- a/kernel/rtmutex-tester.c
+++ b/kernel/rtmutex-tester.c
@@ -275,6 +275,7 @@ static int test_func(void *data)
 
 		/* Wait for the next command to be executed */
 		schedule();
+		try_to_freeze();
 
 		if (signal_pending(current))
 			flush_signals(current);
diff --git a/kernel/sched.c b/kernel/sched.c
index d714611f1691..b44b9a43b0fc 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -51,6 +51,7 @@
 #include <linux/times.h>
 #include <linux/acct.h>
 #include <linux/kprobes.h>
+#include <linux/delayacct.h>
 #include <asm/tlb.h>
 
 #include <asm/unistd.h>
@@ -501,9 +502,36 @@ struct file_operations proc_schedstat_operations = {
 	.release = single_release,
 };
 
+/*
+ * Expects runqueue lock to be held for atomicity of update
+ */
+static inline void
+rq_sched_info_arrive(struct rq *rq, unsigned long delta_jiffies)
+{
+	if (rq) {
+		rq->rq_sched_info.run_delay += delta_jiffies;
+		rq->rq_sched_info.pcnt++;
+	}
+}
+
+/*
+ * Expects runqueue lock to be held for atomicity of update
+ */
+static inline void
+rq_sched_info_depart(struct rq *rq, unsigned long delta_jiffies)
+{
+	if (rq)
+		rq->rq_sched_info.cpu_time += delta_jiffies;
+}
 # define schedstat_inc(rq, field)	do { (rq)->field++; } while (0)
 # define schedstat_add(rq, field, amt)	do { (rq)->field += (amt); } while (0)
 #else /* !CONFIG_SCHEDSTATS */
+static inline void
+rq_sched_info_arrive(struct rq *rq, unsigned long delta_jiffies)
+{}
+static inline void
+rq_sched_info_depart(struct rq *rq, unsigned long delta_jiffies)
+{}
 # define schedstat_inc(rq, field)	do { } while (0)
 # define schedstat_add(rq, field, amt)	do { } while (0)
 #endif
@@ -523,7 +551,7 @@ static inline struct rq *this_rq_lock(void)
 	return rq;
 }
 
-#ifdef CONFIG_SCHEDSTATS
+#if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT)
 /*
  * Called when a process is dequeued from the active array and given
  * the cpu.  We should note that with the exception of interactive
@@ -551,21 +579,16 @@ static inline void sched_info_dequeued(struct task_struct *t)
  */
 static void sched_info_arrive(struct task_struct *t)
 {
-	unsigned long now = jiffies, diff = 0;
-	struct rq *rq = task_rq(t);
+	unsigned long now = jiffies, delta_jiffies = 0;
 
 	if (t->sched_info.last_queued)
-		diff = now - t->sched_info.last_queued;
+		delta_jiffies = now - t->sched_info.last_queued;
 	sched_info_dequeued(t);
-	t->sched_info.run_delay += diff;
+	t->sched_info.run_delay += delta_jiffies;
 	t->sched_info.last_arrival = now;
 	t->sched_info.pcnt++;
 
-	if (!rq)
-		return;
-
-	rq->rq_sched_info.run_delay += diff;
-	rq->rq_sched_info.pcnt++;
+	rq_sched_info_arrive(task_rq(t), delta_jiffies);
 }
 
 /*
@@ -585,8 +608,9 @@ static void sched_info_arrive(struct task_struct *t)
  */
 static inline void sched_info_queued(struct task_struct *t)
 {
-	if (!t->sched_info.last_queued)
-		t->sched_info.last_queued = jiffies;
+	if (unlikely(sched_info_on()))
+		if (!t->sched_info.last_queued)
+			t->sched_info.last_queued = jiffies;
 }
 
 /*
@@ -595,13 +619,10 @@ static inline void sched_info_queued(struct task_struct *t)
  */
 static inline void sched_info_depart(struct task_struct *t)
 {
-	struct rq *rq = task_rq(t);
-	unsigned long diff = jiffies - t->sched_info.last_arrival;
-
-	t->sched_info.cpu_time += diff;
+	unsigned long delta_jiffies = jiffies - t->sched_info.last_arrival;
 
-	if (rq)
-		rq->rq_sched_info.cpu_time += diff;
+	t->sched_info.cpu_time += delta_jiffies;
+	rq_sched_info_depart(task_rq(t), delta_jiffies);
 }
 
 /*
@@ -610,7 +631,7 @@ static inline void sched_info_depart(struct task_struct *t)
  * the idle task.)  We are only called when prev != next.
  */
 static inline void
-sched_info_switch(struct task_struct *prev, struct task_struct *next)
+__sched_info_switch(struct task_struct *prev, struct task_struct *next)
 {
 	struct rq *rq = task_rq(prev);
 
@@ -625,10 +646,16 @@ sched_info_switch(struct task_struct *prev, struct task_struct *next)
 	if (next != rq->idle)
 		sched_info_arrive(next);
 }
+static inline void
+sched_info_switch(struct task_struct *prev, struct task_struct *next)
+{
+	if (unlikely(sched_info_on()))
+		__sched_info_switch(prev, next);
+}
 #else
 #define sched_info_queued(t)		do { } while (0)
 #define sched_info_switch(t, next)	do { } while (0)
-#endif /* CONFIG_SCHEDSTATS */
+#endif /* CONFIG_SCHEDSTATS || CONFIG_TASK_DELAY_ACCT */
 
 /*
  * Adding/removing a task to/from a priority array:
@@ -1530,8 +1557,9 @@ void fastcall sched_fork(struct task_struct *p, int clone_flags)
 
 	INIT_LIST_HEAD(&p->run_list);
 	p->array = NULL;
-#ifdef CONFIG_SCHEDSTATS
-	memset(&p->sched_info, 0, sizeof(p->sched_info));
+#if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT)
+	if (unlikely(sched_info_on()))
+		memset(&p->sched_info, 0, sizeof(p->sched_info));
 #endif
 #if defined(CONFIG_SMP) && defined(__ARCH_WANT_UNLOCKED_CTXSW)
 	p->oncpu = 0;
@@ -1788,7 +1816,15 @@ context_switch(struct rq *rq, struct task_struct *prev,
 		WARN_ON(rq->prev_mm);
 		rq->prev_mm = oldmm;
 	}
+	/*
+	 * Since the runqueue lock will be released by the next
+	 * task (which is an invalid locking op but in the case
+	 * of the scheduler it's an obvious special-case), so we
+	 * do an early lockdep release here:
+	 */
+#ifndef __ARCH_WANT_UNLOCKED_CTXSW
 	spin_release(&rq->lock.dep_map, 1, _THIS_IP_);
+#endif
 
 	/* Here we just switch the register state and the stack. */
 	switch_to(prev, next, prev);
@@ -4526,9 +4562,11 @@ void __sched io_schedule(void)
 {
 	struct rq *rq = &__raw_get_cpu_var(runqueues);
 
+	delayacct_blkio_start();
 	atomic_inc(&rq->nr_iowait);
 	schedule();
 	atomic_dec(&rq->nr_iowait);
+	delayacct_blkio_end();
 }
 EXPORT_SYMBOL(io_schedule);
 
@@ -4537,9 +4575,11 @@ long __sched io_schedule_timeout(long timeout)
 	struct rq *rq = &__raw_get_cpu_var(runqueues);
 	long ret;
 
+	delayacct_blkio_start();
 	atomic_inc(&rq->nr_iowait);
 	ret = schedule_timeout(timeout);
 	atomic_dec(&rq->nr_iowait);
+	delayacct_blkio_end();
 	return ret;
 }
 
diff --git a/kernel/softirq.c b/kernel/softirq.c
index fd12f2556f0d..0f08a84ae307 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -311,8 +311,6 @@ void open_softirq(int nr, void (*action)(struct softirq_action*), void *data)
 	softirq_vec[nr].action = action;
 }
 
-EXPORT_UNUSED_SYMBOL(open_softirq);  /*  June 2006  */
-
 /* Tasklets */
 struct tasklet_head
 {
diff --git a/kernel/taskstats.c b/kernel/taskstats.c
new file mode 100644
index 000000000000..f45179ce028e
--- /dev/null
+++ b/kernel/taskstats.c
@@ -0,0 +1,568 @@
+/*
+ * taskstats.c - Export per-task statistics to userland
+ *
+ * Copyright (C) Shailabh Nagar, IBM Corp. 2006
+ *           (C) Balbir Singh,   IBM Corp. 2006
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ */
+
+#include <linux/kernel.h>
+#include <linux/taskstats_kern.h>
+#include <linux/delayacct.h>
+#include <linux/cpumask.h>
+#include <linux/percpu.h>
+#include <net/genetlink.h>
+#include <asm/atomic.h>
+
+/*
+ * Maximum length of a cpumask that can be specified in
+ * the TASKSTATS_CMD_ATTR_REGISTER/DEREGISTER_CPUMASK attribute
+ */
+#define TASKSTATS_CPUMASK_MAXLEN	(100+6*NR_CPUS)
+
+static DEFINE_PER_CPU(__u32, taskstats_seqnum) = { 0 };
+static int family_registered;
+kmem_cache_t *taskstats_cache;
+
+static struct genl_family family = {
+	.id		= GENL_ID_GENERATE,
+	.name		= TASKSTATS_GENL_NAME,
+	.version	= TASKSTATS_GENL_VERSION,
+	.maxattr	= TASKSTATS_CMD_ATTR_MAX,
+};
+
+static struct nla_policy taskstats_cmd_get_policy[TASKSTATS_CMD_ATTR_MAX+1]
+__read_mostly = {
+	[TASKSTATS_CMD_ATTR_PID]  = { .type = NLA_U32 },
+	[TASKSTATS_CMD_ATTR_TGID] = { .type = NLA_U32 },
+	[TASKSTATS_CMD_ATTR_REGISTER_CPUMASK] = { .type = NLA_STRING },
+	[TASKSTATS_CMD_ATTR_DEREGISTER_CPUMASK] = { .type = NLA_STRING },};
+
+struct listener {
+	struct list_head list;	/* entry in a per-cpu listener_list */
+	pid_t pid;		/* destination passed to genlmsg_unicast() */
+	char valid;		/* zeroed on -ECONNREFUSED; entry is then freed */
+};
+
+struct listener_list {
+	struct rw_semaphore sem;	/* read to walk list, write to add/delete */
+	struct list_head list;		/* struct listener entries for one cpu */
+};
+static DEFINE_PER_CPU(struct listener_list, listener_array);
+
+enum actions {
+	REGISTER,
+	DEREGISTER,
+	CPU_DONT_CARE
+};
+
+/*
+ * Allocate a message with nlmsg_new(@size) and start its genetlink header for
+ * @cmd, addressed from @info or, if NULL, pid 0 and this cpu's taskstats_seqnum.
+ */
+static int prepare_reply(struct genl_info *info, u8 cmd, struct sk_buff **skbp,
+			void **replyp, size_t size)
+{
+	struct sk_buff *msg;
+	void *hdr;
+	u32 pid = 0, seq;
+
+	/* If new attributes are added, please revisit this allocation */
+	msg = nlmsg_new(size);
+	if (!msg)
+		return -ENOMEM;
+
+	if (info) {
+		pid = info->snd_pid;
+		seq = info->snd_seq;
+	} else {
+		seq = get_cpu_var(taskstats_seqnum)++;
+		put_cpu_var(taskstats_seqnum);
+	}
+	hdr = genlmsg_put(msg, pid, seq, family.id, 0, 0, cmd, family.version);
+	if (!hdr) {
+		nlmsg_free(msg);
+		return -EINVAL;
+	}
+
+	*skbp = msg;
+	*replyp = hdr;
+	return 0;
+}
+
+/*
+ * Finalize the genetlink message in @skb and unicast it to nl_pid @pid.
+ * If finalizing fails, @skb is freed here and that error is returned;
+ * otherwise the result of genlmsg_unicast() is returned.
+ */
+static int send_reply(struct sk_buff *skb, pid_t pid)
+{
+	struct nlmsghdr *nlh = (struct nlmsghdr *)skb->data;
+	struct genlmsghdr *ghdr = nlmsg_data(nlh);
+	int err = genlmsg_end(skb, genlmsg_data(ghdr));
+
+	if (err >= 0)
+		return genlmsg_unicast(skb, pid);
+
+	nlmsg_free(skb);
+	return err;
+}
+
+/*
+ * Send taskstats data in @skb to listeners registered for @cpu's exit data.
+ * @skb is never handed back: it is passed to genlmsg_unicast() or freed here.
+ */
+static int send_cpu_listeners(struct sk_buff *skb, unsigned int cpu)
+{
+	struct genlmsghdr *genlhdr = nlmsg_data((struct nlmsghdr *)skb->data);
+	struct listener_list *listeners;
+	struct listener *s, *tmp;
+	struct sk_buff *skb_next, *skb_cur = skb;
+	void *reply = genlmsg_data(genlhdr);
+	int rc = 0, ret, delcount = 0;
+
+	ret = genlmsg_end(skb, reply);
+	if (ret < 0) {
+		nlmsg_free(skb);
+		return ret;
+	}
+
+	listeners = &per_cpu(listener_array, cpu);
+	down_read(&listeners->sem);
+	list_for_each_entry_safe(s, tmp, &listeners->list, list) {
+		skb_next = NULL;
+		if (!list_is_last(&s->list, &listeners->list)) {
+			skb_next = skb_clone(skb_cur, GFP_KERNEL);
+			if (!skb_next) {
+				rc = -ENOMEM;
+				break;
+			}
+		}
+		ret = genlmsg_unicast(skb_cur, s->pid);
+		if (ret == -ECONNREFUSED) {
+			s->valid = 0;
+			delcount++;
+			rc = ret;
+		}
+		skb_cur = skb_next;
+	}
+	up_read(&listeners->sem);
+
+	if (skb_cur)		/* empty list or clone failure */
+		nlmsg_free(skb_cur);
+	if (!delcount)
+		return rc;
+	/* Delete invalidated entries */
+	down_write(&listeners->sem);
+	list_for_each_entry_safe(s, tmp, &listeners->list, list) {
+		if (!s->valid) {
+			list_del(&s->list);
+			kfree(s);
+		}
+	}
+	up_write(&listeners->sem);
+	return rc;
+}
+
+/*
+ * Fill @stats for one task: @pidtsk, or when that is NULL the task found by
+ * @pid. Returns -ESRCH if no task is found, else delayacct_add_tsk()'s result.
+ */
+static int fill_pid(pid_t pid, struct task_struct *pidtsk,
+		struct taskstats *stats)
+{
+	struct task_struct *tsk = pidtsk;
+	int rc;
+
+	if (tsk) {
+		get_task_struct(tsk);
+	} else {
+		read_lock(&tasklist_lock);
+		tsk = find_task_by_pid(pid);
+		if (tsk)
+			get_task_struct(tsk);
+		read_unlock(&tasklist_lock);
+		if (!tsk)
+			return -ESRCH;
+	}
+
+	/*
+	 * Each accounting subsystem adds calls to its functions to
+	 * fill in relevant parts of struct taskstats as follows
+	 *	rc = per-task-foo(stats, tsk);
+	 *	if (rc)
+	 *		goto err;
+	 */
+	rc = delayacct_add_tsk(stats, tsk);
+	stats->version = TASKSTATS_VERSION;
+	/* Define err: label here if needed */
+	put_task_struct(tsk);
+	return rc;
+}
+
+/*
+ * Fill @stats for the thread group of @tgidtsk or, when that is NULL, of the
+ * task found by @tgid: the group's dead-task totals plus each live thread.
+ * Returns 0, or -ESRCH if no task is found.
+ */
+static int fill_tgid(pid_t tgid, struct task_struct *tgidtsk,
+		struct taskstats *stats)
+{
+	struct task_struct *tsk, *first;
+	unsigned long flags;
+
+	first = tgidtsk;
+	if (!first) {
+		read_lock(&tasklist_lock);
+		first = find_task_by_pid(tgid);
+		if (!first) {
+			read_unlock(&tasklist_lock);
+			return -ESRCH;
+		}
+		get_task_struct(first);
+		read_unlock(&tasklist_lock);
+	} else
+		get_task_struct(first);
+
+	/* Start with stats from dead tasks */
+	spin_lock_irqsave(&first->signal->stats_lock, flags);
+	if (first->signal->stats)
+		memcpy(stats, first->signal->stats, sizeof(*stats));
+	spin_unlock_irqrestore(&first->signal->stats_lock, flags);
+
+	/*
+	 * Add additional stats from live tasks except zombie thread group
+	 * leaders who are already counted with the dead tasks
+	 */
+	tsk = first;
+	read_lock(&tasklist_lock);
+	do {
+		if (tsk->exit_state == EXIT_ZOMBIE && thread_group_leader(tsk))
+			continue;
+		/*
+		 * Accounting subsystem can call its functions here to
+		 * fill in relevant parts of struct taskstats as follows
+		 *	per-task-foo(stats, tsk);
+		 */
+		delayacct_add_tsk(stats, tsk);
+	} while_each_thread(first, tsk);
+	read_unlock(&tasklist_lock);
+	stats->version = TASKSTATS_VERSION;
+	/* Accounting subsystems can also add calls here to modify taskstats */
+
+	put_task_struct(first);		/* drop the reference taken above */
+	return 0;
+}
+
+
+/*
+ * Add @tsk's stats to its thread group's signal->stats, if the group
+ * has one, while holding signal->stats_lock.
+ */
+static void fill_tgid_exit(struct task_struct *tsk)
+{
+	unsigned long flags;
+
+	spin_lock_irqsave(&tsk->signal->stats_lock, flags);
+	/*
+	 * Each accounting subsystem calls its functions here to
+	 * accumulate its per-task stats for tsk, into the per-tgid structure
+	 *
+	 *	per-task-foo(tsk->signal->stats, tsk);
+	 */
+	if (tsk->signal->stats)
+		delayacct_add_tsk(tsk->signal->stats, tsk);
+	spin_unlock_irqrestore(&tsk->signal->stats_lock, flags);
+}
+
+/*
+ * Add (@isadd == REGISTER) or remove listener @pid on each cpu in @maskp.
+ * Returns 0, -EINVAL for cpus outside cpu_possible_map, or -ENOMEM.
+ */
+static int add_del_listener(pid_t pid, cpumask_t *maskp, int isadd)
+{
+	struct listener_list *listeners;
+	struct listener *s, *tmp;
+	unsigned int cpu;
+
+	if (!cpus_subset(*maskp, cpu_possible_map))
+		return -EINVAL;
+	if (isadd == REGISTER) {
+		for_each_cpu_mask(cpu, *maskp) {
+			s = kmalloc_node(sizeof(struct listener), GFP_KERNEL,
+					 cpu_to_node(cpu));
+			if (!s)
+				goto cleanup;
+			s->pid = pid;
+			INIT_LIST_HEAD(&s->list);
+			s->valid = 1;
+			listeners = &per_cpu(listener_array, cpu);
+			down_write(&listeners->sem);
+			list_add(&s->list, &listeners->list);
+			up_write(&listeners->sem);
+		}
+		return 0;
+	}
+cleanup:	/* deregister, or undo a REGISTER that ran out of memory */
+	for_each_cpu_mask(cpu, *maskp) {
+		listeners = &per_cpu(listener_array, cpu);
+		down_write(&listeners->sem);
+		list_for_each_entry_safe(s, tmp, &listeners->list, list) {
+			if (s->pid == pid) {
+				list_del(&s->list);
+				kfree(s);
+				break;
+			}
+		}
+		up_write(&listeners->sem);
+	}
+	/* A REGISTER only gets here after a failed allocation */
+	return isadd == REGISTER ? -ENOMEM : 0;
+}
+
+/*
+ * Parse the cpulist string in @na into @mask; returns 1 if @na is absent.
+ */
+static int parse(struct nlattr *na, cpumask_t *mask)
+{
+	char *buf;
+	int len, ret;
+
+	if (!na)
+		return 1;
+	len = nla_len(na);
+	if (len < 1 || len > TASKSTATS_CPUMASK_MAXLEN)
+		return len < 1 ? -EINVAL : -E2BIG;
+	buf = kmalloc(len, GFP_KERNEL);
+	if (!buf)
+		return -ENOMEM;
+	nla_strlcpy(buf, na, len);
+	ret = cpulist_parse(buf, *mask);
+	kfree(buf);
+	return ret;
+}
+
+/*
+ * TASKSTATS_CMD_GET handler: (de)register the sender as a listener for a
+ * cpumask, or reply to it with the stats of the requested pid or tgid.
+ */
+static int taskstats_user_cmd(struct sk_buff *skb, struct genl_info *info)
+{
+	int rc = 0;
+	struct sk_buff *rep_skb;
+	struct taskstats stats;
+	void *reply;
+	size_t size;
+	struct nlattr *na;
+	cpumask_t mask;
+
+	rc = parse(info->attrs[TASKSTATS_CMD_ATTR_REGISTER_CPUMASK], &mask);
+	if (rc < 0)
+		return rc;
+	if (rc == 0)
+		return add_del_listener(info->snd_pid, &mask, REGISTER);
+
+	rc = parse(info->attrs[TASKSTATS_CMD_ATTR_DEREGISTER_CPUMASK], &mask);
+	if (rc < 0)
+		return rc;
+	if (rc == 0)
+		return add_del_listener(info->snd_pid, &mask, DEREGISTER);
+
+	/* Size includes space for nested attributes */
+	size = nla_total_size(sizeof(u32)) +
+		nla_total_size(sizeof(struct taskstats)) + nla_total_size(0);
+
+	memset(&stats, 0, sizeof(stats));
+	rc = prepare_reply(info, TASKSTATS_CMD_NEW, &rep_skb, &reply, size);
+	if (rc < 0)
+		return rc;
+
+	if (info->attrs[TASKSTATS_CMD_ATTR_PID]) {
+		u32 pid = nla_get_u32(info->attrs[TASKSTATS_CMD_ATTR_PID]);
+		rc = fill_pid(pid, NULL, &stats);
+		if (rc < 0)
+			goto err;
+		na = nla_nest_start(rep_skb, TASKSTATS_TYPE_AGGR_PID);
+		NLA_PUT_U32(rep_skb, TASKSTATS_TYPE_PID, pid);
+		NLA_PUT_TYPE(rep_skb, struct taskstats, TASKSTATS_TYPE_STATS,
+				stats);
+	} else if (info->attrs[TASKSTATS_CMD_ATTR_TGID]) {
+		u32 tgid = nla_get_u32(info->attrs[TASKSTATS_CMD_ATTR_TGID]);
+		rc = fill_tgid(tgid, NULL, &stats);
+		if (rc < 0)
+			goto err;
+		na = nla_nest_start(rep_skb, TASKSTATS_TYPE_AGGR_TGID);
+		NLA_PUT_U32(rep_skb, TASKSTATS_TYPE_TGID, tgid);
+		NLA_PUT_TYPE(rep_skb, struct taskstats, TASKSTATS_TYPE_STATS,
+				stats);
+	} else {
+		rc = -EINVAL;
+		goto err;
+	}
+
+	nla_nest_end(rep_skb, na);
+	return send_reply(rep_skb, info->snd_pid);
+
+nla_put_failure:
+	/* Fall through so the cancelled reply skb is freed, not leaked */
+	rc = genlmsg_cancel(rep_skb, reply);
+err:
+	nlmsg_free(rep_skb);
+	return rc;
+}
+
+/*
+ * Set *@mycpu to the cpu the task is exiting on; the exit event is sent for
+ * that cpu even if this code later runs elsewhere. *@ptidstats gets a zeroed
+ * taskstats if that cpu has listeners, NULL if not or if allocation fails.
+ */
+void taskstats_exit_alloc(struct taskstats **ptidstats, unsigned int *mycpu)
+{
+	struct listener_list *listeners;
+	struct taskstats *tmp;
+
+	*mycpu = raw_smp_processor_id();
+	*ptidstats = NULL;
+	tmp = kmem_cache_zalloc(taskstats_cache, SLAB_KERNEL);
+	if (!tmp)
+		return;
+
+	listeners = &per_cpu(listener_array, *mycpu);
+	down_read(&listeners->sem);
+	if (!list_empty(&listeners->list))
+		*ptidstats = tmp;
+	up_read(&listeners->sem);
+	/* Unused: give it back to its own cache rather than kfree() it */
+	if (!*ptidstats)
+		kmem_cache_free(taskstats_cache, tmp);
+}
+
+/*
+ * Send pid data out on exit: @tsk's stats go to the listeners of @mycpu, plus
+ * the group totals if the group has stats and @group_dead. No-op without
+ * @tidstats or before the genetlink family is registered.
+ */
+void taskstats_exit_send(struct task_struct *tsk, struct taskstats *tidstats,
+			int group_dead, unsigned int mycpu)
+{
+	int rc;
+	struct sk_buff *rep_skb;
+	void *reply;
+	size_t size;
+	int is_thread_group;
+	struct nlattr *na;
+	unsigned long flags;
+
+	if (!family_registered || !tidstats)
+		return;
+
+	spin_lock_irqsave(&tsk->signal->stats_lock, flags);
+	is_thread_group = tsk->signal->stats ? 1 : 0;
+	spin_unlock_irqrestore(&tsk->signal->stats_lock, flags);
+
+	/* Size includes space for nested attributes */
+	size = nla_total_size(sizeof(u32)) +
+		nla_total_size(sizeof(struct taskstats)) + nla_total_size(0);
+
+	if (is_thread_group)
+		size = 2 * size;	/* PID + STATS + TGID + STATS */
+
+	rc = prepare_reply(NULL, TASKSTATS_CMD_NEW, &rep_skb, &reply, size);
+	if (rc < 0)
+		goto ret;
+
+	rc = fill_pid(tsk->pid, tsk, tidstats);
+	if (rc < 0)
+		goto err_skb;
+
+	na = nla_nest_start(rep_skb, TASKSTATS_TYPE_AGGR_PID);
+	NLA_PUT_U32(rep_skb, TASKSTATS_TYPE_PID, (u32)tsk->pid);
+	NLA_PUT_TYPE(rep_skb, struct taskstats, TASKSTATS_TYPE_STATS,
+			*tidstats);
+	nla_nest_end(rep_skb, na);
+
+	if (!is_thread_group)
+		goto send;
+
+	/*
+	 * tsk has/had a thread group so fill the tsk->signal->stats structure
+	 * Doesn't matter if tsk is the leader or the last group member leaving
+	 */
+	fill_tgid_exit(tsk);
+	if (!group_dead)
+		goto send;
+
+	na = nla_nest_start(rep_skb, TASKSTATS_TYPE_AGGR_TGID);
+	NLA_PUT_U32(rep_skb, TASKSTATS_TYPE_TGID, (u32)tsk->tgid);
+	/* No locking needed for tsk->signal->stats since group is dead */
+	NLA_PUT_TYPE(rep_skb, struct taskstats, TASKSTATS_TYPE_STATS,
+			*tsk->signal->stats);
+	nla_nest_end(rep_skb, na);
+
+send:
+	send_cpu_listeners(rep_skb, mycpu);
+	return;
+
+nla_put_failure:
+	genlmsg_cancel(rep_skb, reply);
+	/* fall through: the cancelled skb must still be freed */
+err_skb:
+	nlmsg_free(rep_skb);
+ret:
+	return;
+}
+
+static struct genl_ops taskstats_ops = {
+	.cmd		= TASKSTATS_CMD_GET,
+	.doit		= taskstats_user_cmd,
+	.policy		= taskstats_cmd_get_policy,
+};
+
+/* Needed early in initialization: slab cache and per-cpu listener lists */
+void __init taskstats_init_early(void)
+{
+	unsigned int i;
+
+	taskstats_cache = kmem_cache_create("taskstats_cache",
+						sizeof(struct taskstats),
+						0, SLAB_PANIC, NULL, NULL);
+	for_each_possible_cpu(i) {	/* every cpu starts with no listeners */
+		INIT_LIST_HEAD(&(per_cpu(listener_array, i).list));
+		init_rwsem(&(per_cpu(listener_array, i).sem));
+	}
+}
+
+/*
+ * Register the taskstats genetlink family and its single op; the family is
+ * unregistered again if the op cannot be added.
+ */
+static int __init taskstats_init(void)
+{
+	int err = genl_register_family(&family);
+
+	if (err)
+		return err;
+	err = genl_register_ops(&family, &taskstats_ops);
+	if (err < 0) {
+		genl_unregister_family(&family);
+		return err;
+	}
+	family_registered = 1;
+	return 0;
+}
+
+/*
+ * late initcall ensures initialization of statistics collection
+ * mechanisms precedes initialization of the taskstats interface
+ */
+late_initcall(taskstats_init);
diff --git a/kernel/timer.c b/kernel/timer.c
index 2a87430a58d4..05809c2e2fd6 100644
--- a/kernel/timer.c
+++ b/kernel/timer.c
@@ -374,6 +374,7 @@ int del_timer_sync(struct timer_list *timer)
 		int ret = try_to_del_timer_sync(timer);
 		if (ret >= 0)
 			return ret;
+		cpu_relax();
 	}
 }
 
@@ -968,6 +969,7 @@ void __init timekeeping_init(void)
 }
 
 
+static int timekeeping_suspended;
 /*
  * timekeeping_resume - Resumes the generic timekeeping subsystem.
  * @dev:	unused
@@ -983,6 +985,18 @@ static int timekeeping_resume(struct sys_device *dev)
 	write_seqlock_irqsave(&xtime_lock, flags);
 	/* restart the last cycle value */
 	clock->cycle_last = clocksource_read(clock);
+	clock->error = 0;
+	timekeeping_suspended = 0;
+	write_sequnlock_irqrestore(&xtime_lock, flags);
+	return 0;
+}
+
+static int timekeeping_suspend(struct sys_device *dev, pm_message_t state)
+{
+	unsigned long flags;
+
+	write_seqlock_irqsave(&xtime_lock, flags);
+	timekeeping_suspended = 1;
 	write_sequnlock_irqrestore(&xtime_lock, flags);
 	return 0;
 }
@@ -990,6 +1004,7 @@ static int timekeeping_resume(struct sys_device *dev)
 /* sysfs resume/suspend bits for timekeeping */
 static struct sysdev_class timekeeping_sysclass = {
 	.resume		= timekeeping_resume,
+	.suspend	= timekeeping_suspend,
 	set_kset_name("timekeeping"),
 };
 
@@ -1100,13 +1115,16 @@ static void update_wall_time(void)
 {
 	cycle_t offset;
 
-	clock->xtime_nsec += (s64)xtime.tv_nsec << clock->shift;
+	/* Make sure we're fully resumed: */
+	if (unlikely(timekeeping_suspended))
+		return;
 
 #ifdef CONFIG_GENERIC_TIME
 	offset = (clocksource_read(clock) - clock->cycle_last) & clock->mask;
 #else
 	offset = clock->cycle_interval;
 #endif
+	clock->xtime_nsec += (s64)xtime.tv_nsec << clock->shift;
 
 	/* normally this loop will run just once, however in the
 	 * case of lost or late ticks, it will accumulate correctly.