From e864c499d9e57805ae1f9e7ea404dd223759cd53 Mon Sep 17 00:00:00 2001
From: Gregory Haskins <ghaskins@novell.com>
Date: Mon, 29 Dec 2008 09:39:49 -0500
Subject: sched: track the next-highest priority on each runqueue

We will use this later in the series to reduce the amount of rq-lock
contention during a pull operation

Signed-off-by: Gregory Haskins <ghaskins@novell.com>
---
 kernel/sched.c | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

(limited to 'kernel/sched.c')

diff --git a/kernel/sched.c b/kernel/sched.c
index 756d981d91a4..7729f9a45a8b 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -463,7 +463,10 @@ struct rt_rq {
 	struct rt_prio_array active;
 	unsigned long rt_nr_running;
 #if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED
-	int highest_prio; /* highest queued rt task prio */
+	struct {
+		int curr; /* highest queued rt task prio */
+		int next; /* next highest */
+	} highest_prio;
 #endif
 #ifdef CONFIG_SMP
 	unsigned long rt_nr_migratory;
@@ -8169,7 +8172,8 @@ static void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq)
 	__set_bit(MAX_RT_PRIO, array->bitmap);
 
 #if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED
-	rt_rq->highest_prio = MAX_RT_PRIO;
+	rt_rq->highest_prio.curr = MAX_RT_PRIO;
+	rt_rq->highest_prio.next = MAX_RT_PRIO;
 #endif
 #ifdef CONFIG_SMP
 	rt_rq->rt_nr_migratory = 0;
-- 
cgit v1.2.3


From 7e96fa5875d4a9be18d74d3ca7b90518d05bc426 Mon Sep 17 00:00:00 2001
From: Gregory Haskins <ghaskins@novell.com>
Date: Mon, 29 Dec 2008 09:39:50 -0500
Subject: sched: pull only one task during NEWIDLE balancing to limit critical
 section

git-id c4acb2c0669c5c5c9b28e9d02a34b5c67edf7092 attempted to limit
newidle critical section length by stopping after at least one task
was moved.  Further investigation has shown that there are other
paths nested further inside the algorithm which still remain that allow
long latencies to occur with newidle balancing.  This patch applies
the same technique inside balance_tasks() to limit the duration of
this optional balancing operation.

Signed-off-by: Gregory Haskins <ghaskins@novell.com>
CC: Nick Piggin <npiggin@suse.de>
---
 kernel/sched.c | 18 +++++++++++++++++-
 1 file changed, 17 insertions(+), 1 deletion(-)

(limited to 'kernel/sched.c')

diff --git a/kernel/sched.c b/kernel/sched.c
index 7729f9a45a8b..94d9a6c5ff94 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -2984,6 +2984,16 @@ next:
 	pulled++;
 	rem_load_move -= p->se.load.weight;
 
+#ifdef CONFIG_PREEMPT
+	/*
+	 * NEWIDLE balancing is a source of latency, so preemptible kernels
+	 * will stop after the first task is pulled to minimize the critical
+	 * section.
+	 */
+	if (idle == CPU_NEWLY_IDLE)
+		goto out;
+#endif
+
 	/*
 	 * We only want to steal up to the prescribed amount of weighted load.
 	 */
@@ -3030,9 +3040,15 @@ static int move_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
 				sd, idle, all_pinned, &this_best_prio);
 		class = class->next;
 
+#ifdef CONFIG_PREEMPT
+		/*
+		 * NEWIDLE balancing is a source of latency, so preemptible
+		 * kernels will stop after the first task is pulled to minimize
+		 * the critical section.
+		 */
 		if (idle == CPU_NEWLY_IDLE && this_rq->nr_running)
 			break;
-
+#endif
 	} while (class && max_load_move > total_load_moved);
 
 	return total_load_moved > 0;
-- 
cgit v1.2.3


From 8f45e2b516201d1bf681e6026fa5276385def565 Mon Sep 17 00:00:00 2001
From: Gregory Haskins <ghaskins@novell.com>
Date: Mon, 29 Dec 2008 09:39:51 -0500
Subject: sched: make double-lock-balance fair

double_lock balance() currently favors logically lower cpus since they
often do not have to release their own lock to acquire a second lock.
The result is that logically higher cpus can get starved when there is
a lot of pressure on the RQs.  This can result in higher latencies on
higher cpu-ids.

This patch makes the algorithm more fair by forcing all paths to have
to release both locks before acquiring them again.  Since callsites to
double_lock_balance already consider it a potential preemption/reschedule
point, they have the proper logic to recheck for atomicity violations.

Signed-off-by: Gregory Haskins <ghaskins@novell.com>
---
 kernel/sched.c | 51 ++++++++++++++++++++++++++++++++++++++++++++-------
 1 file changed, 44 insertions(+), 7 deletions(-)

(limited to 'kernel/sched.c')

diff --git a/kernel/sched.c b/kernel/sched.c
index 94d9a6c5ff94..8fca364f3593 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -1608,21 +1608,42 @@ static inline void update_shares_locked(struct rq *rq, struct sched_domain *sd)
 
 #endif
 
+#ifdef CONFIG_PREEMPT
+
 /*
- * double_lock_balance - lock the busiest runqueue, this_rq is locked already.
+ * fair double_lock_balance: Safely acquires both rq->locks in a fair
+ * way at the expense of forcing extra atomic operations in all
+ * invocations.  This assures that the double_lock is acquired using the
+ * same underlying policy as the spinlock_t on this architecture, which
+ * reduces latency compared to the unfair variant below.  However, it
+ * also adds more overhead and therefore may reduce throughput.
  */
-static int double_lock_balance(struct rq *this_rq, struct rq *busiest)
+static inline int _double_lock_balance(struct rq *this_rq, struct rq *busiest)
+	__releases(this_rq->lock)
+	__acquires(busiest->lock)
+	__acquires(this_rq->lock)
+{
+	spin_unlock(&this_rq->lock);
+	double_rq_lock(this_rq, busiest);
+
+	return 1;
+}
+
+#else
+/*
+ * Unfair double_lock_balance: Optimizes throughput at the expense of
+ * latency by eliminating extra atomic operations when the locks are
+ * already in proper order on entry.  This favors lower cpu-ids and will
+ * grant the double lock to lower cpus over higher ids under contention,
+ * regardless of entry order into the function.
+ */
+static int _double_lock_balance(struct rq *this_rq, struct rq *busiest)
 	__releases(this_rq->lock)
 	__acquires(busiest->lock)
 	__acquires(this_rq->lock)
 {
 	int ret = 0;
 
-	if (unlikely(!irqs_disabled())) {
-		/* printk() doesn't work good under rq->lock */
-		spin_unlock(&this_rq->lock);
-		BUG_ON(1);
-	}
 	if (unlikely(!spin_trylock(&busiest->lock))) {
 		if (busiest < this_rq) {
 			spin_unlock(&this_rq->lock);
@@ -1635,6 +1656,22 @@ static int double_lock_balance(struct rq *this_rq, struct rq *busiest)
 	return ret;
 }
 
+#endif /* CONFIG_PREEMPT */
+
+/*
+ * double_lock_balance - lock the busiest runqueue, this_rq is locked already.
+ */
+static int double_lock_balance(struct rq *this_rq, struct rq *busiest)
+{
+	if (unlikely(!irqs_disabled())) {
+		/* printk() doesn't work good under rq->lock */
+		spin_unlock(&this_rq->lock);
+		BUG_ON(1);
+	}
+
+	return _double_lock_balance(this_rq, busiest);
+}
+
 static inline void double_unlock_balance(struct rq *this_rq, struct rq *busiest)
 	__releases(busiest->lock)
 {
-- 
cgit v1.2.3


From 967fc04671feea4dbf780c9e55a0bc8fcf68a14e Mon Sep 17 00:00:00 2001
From: Gregory Haskins <ghaskins@novell.com>
Date: Mon, 29 Dec 2008 09:39:52 -0500
Subject: sched: add sched_class->needs_post_schedule() member

We currently run class->post_schedule() outside of the rq->lock, which
means that we need to test for the need to post_schedule outside of
the lock to avoid a forced reacquistion.  This is currently not a problem
as we only look at rq->rt.overloaded.  However, we want to enhance this
going forward to look at more state to reduce the need to post_schedule to
a bare minimum set.  Therefore, we introduce a new member-func called
needs_post_schedule() which tests for the post_schedule condtion without
actually performing the work.  Therefore it is safe to call this
function before the rq->lock is released, because we are guaranteed not
to drop the lock at an intermediate point (such as what post_schedule()
may do).

We will use this later in the series

[ rostedt: removed paranoid BUG_ON ]

Signed-off-by: Gregory Haskins <ghaskins@novell.com>
---
 include/linux/sched.h |  1 +
 kernel/sched.c        |  8 +++++++-
 kernel/sched_rt.c     | 24 ++++++++++++++----------
 3 files changed, 22 insertions(+), 11 deletions(-)

(limited to 'kernel/sched.c')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index e5f928a079e8..836a86c32a65 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1012,6 +1012,7 @@ struct sched_class {
 			      struct rq *busiest, struct sched_domain *sd,
 			      enum cpu_idle_type idle);
 	void (*pre_schedule) (struct rq *this_rq, struct task_struct *task);
+	int (*needs_post_schedule) (struct rq *this_rq);
 	void (*post_schedule) (struct rq *this_rq);
 	void (*task_wake_up) (struct rq *this_rq, struct task_struct *task);
 
diff --git a/kernel/sched.c b/kernel/sched.c
index 8fca364f3593..3acbad8991a2 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -2621,6 +2621,12 @@ static void finish_task_switch(struct rq *rq, struct task_struct *prev)
 {
 	struct mm_struct *mm = rq->prev_mm;
 	long prev_state;
+#ifdef CONFIG_SMP
+	int post_schedule = 0;
+
+	if (current->sched_class->needs_post_schedule)
+		post_schedule = current->sched_class->needs_post_schedule(rq);
+#endif
 
 	rq->prev_mm = NULL;
 
@@ -2639,7 +2645,7 @@ static void finish_task_switch(struct rq *rq, struct task_struct *prev)
 	finish_arch_switch(prev);
 	finish_lock_switch(rq, prev);
 #ifdef CONFIG_SMP
-	if (current->sched_class->post_schedule)
+	if (post_schedule)
 		current->sched_class->post_schedule(rq);
 #endif
 
diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c
index 8d33843cb2c4..b0b6ea4ed674 100644
--- a/kernel/sched_rt.c
+++ b/kernel/sched_rt.c
@@ -1290,20 +1290,23 @@ static void pre_schedule_rt(struct rq *rq, struct task_struct *prev)
 		pull_rt_task(rq);
 }
 
+/*
+ * assumes rq->lock is held
+ */
+static int needs_post_schedule_rt(struct rq *rq)
+{
+	return rq->rt.overloaded ? 1 : 0;
+}
+
 static void post_schedule_rt(struct rq *rq)
 {
 	/*
-	 * If we have more than one rt_task queued, then
-	 * see if we can push the other rt_tasks off to other CPUS.
-	 * Note we may release the rq lock, and since
-	 * the lock was owned by prev, we need to release it
-	 * first via finish_lock_switch and then reaquire it here.
+	 * This is only called if needs_post_schedule_rt() indicates that
+	 * we need to push tasks away
 	 */
-	if (unlikely(rq->rt.overloaded)) {
-		spin_lock_irq(&rq->lock);
-		push_rt_tasks(rq);
-		spin_unlock_irq(&rq->lock);
-	}
+	spin_lock_irq(&rq->lock);
+	push_rt_tasks(rq);
+	spin_unlock_irq(&rq->lock);
 }
 
 /*
@@ -1557,6 +1560,7 @@ static const struct sched_class rt_sched_class = {
 	.rq_online              = rq_online_rt,
 	.rq_offline             = rq_offline_rt,
 	.pre_schedule		= pre_schedule_rt,
+	.needs_post_schedule	= needs_post_schedule_rt,
 	.post_schedule		= post_schedule_rt,
 	.task_wake_up		= task_wake_up_rt,
 	.switched_from		= switched_from_rt,
-- 
cgit v1.2.3


From 917b627d4d981dc614519d7b34ea31a976b14e12 Mon Sep 17 00:00:00 2001
From: Gregory Haskins <ghaskins@novell.com>
Date: Mon, 29 Dec 2008 09:39:53 -0500
Subject: sched: create "pushable_tasks" list to limit pushing to one attempt

The RT scheduler employs a "push/pull" design to actively balance tasks
within the system (on a per disjoint cpuset basis).  When a task is
awoken, it is immediately determined if there are any lower priority
cpus which should be preempted.  This is opposed to the way normal
SCHED_OTHER tasks behave, which will wait for a periodic rebalancing
operation to occur before spreading out load.

When a particular RQ has more than 1 active RT task, it is said to
be in an "overloaded" state.  Once this occurs, the system enters
the active balancing mode, where it will try to push the task away,
or persuade a different cpu to pull it over.  The system will stay
in this state until the system falls back below the <= 1 queued RT
task per RQ.

However, the current implementation suffers from a limitation in the
push logic.  Once overloaded, all tasks (other than current) on the
RQ are analyzed on every push operation, even if it was previously
unpushable (due to affinity, etc).  Whats more, the operation stops
at the first task that is unpushable and will not look at items
lower in the queue.  This causes two problems:

1) We can have the same tasks analyzed over and over again during each
   push, which extends out the fast path in the scheduler for no
   gain.  Consider a RQ that has dozens of tasks that are bound to a
   core.  Each one of those tasks will be encountered and skipped
   for each push operation while they are queued.

2) There may be lower-priority tasks under the unpushable task that
   could have been successfully pushed, but will never be considered
   until either the unpushable task is cleared, or a pull operation
   succeeds.  The net result is a potential latency source for mid
   priority tasks.

This patch aims to rectify these two conditions by introducing a new
priority sorted list: "pushable_tasks".  A task is added to the list
each time a task is activated or preempted.  It is removed from the
list any time it is deactivated, made current, or fails to push.

This works because a task only needs to be attempted to push once.
After an initial failure to push, the other cpus will eventually try to
pull the task when the conditions are proper.  This also solves the
problem that we don't completely analyze all tasks due to encountering
an unpushable tasks.  Now every task will have a push attempted (when
appropriate).

This reduces latency both by shorting the critical section of the
rq->lock for certain workloads, and by making sure the algorithm
considers all eligible tasks in the system.

[ rostedt: added a couple more BUG_ONs ]

Signed-off-by: Gregory Haskins <ghaskins@novell.com>
Acked-by: Steven Rostedt <srostedt@redhat.com>
---
 include/linux/init_task.h |   1 +
 include/linux/sched.h     |   1 +
 kernel/sched.c            |   4 ++
 kernel/sched_rt.c         | 119 +++++++++++++++++++++++++++++++++++++++-------
 4 files changed, 107 insertions(+), 18 deletions(-)

(limited to 'kernel/sched.c')

diff --git a/include/linux/init_task.h b/include/linux/init_task.h
index 23fd8909b9e5..6851225f44a7 100644
--- a/include/linux/init_task.h
+++ b/include/linux/init_task.h
@@ -140,6 +140,7 @@ extern struct group_info init_groups;
 		.nr_cpus_allowed = NR_CPUS,				\
 	},								\
 	.tasks		= LIST_HEAD_INIT(tsk.tasks),			\
+	.pushable_tasks = PLIST_NODE_INIT(tsk.pushable_tasks, MAX_PRIO), \
 	.ptraced	= LIST_HEAD_INIT(tsk.ptraced),			\
 	.ptrace_entry	= LIST_HEAD_INIT(tsk.ptrace_entry),		\
 	.real_parent	= &tsk,						\
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 836a86c32a65..440cabb2d432 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1179,6 +1179,7 @@ struct task_struct {
 #endif
 
 	struct list_head tasks;
+	struct plist_node pushable_tasks;
 
 	struct mm_struct *mm, *active_mm;
 
diff --git a/kernel/sched.c b/kernel/sched.c
index 3acbad8991a2..24ab80c28765 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -471,6 +471,7 @@ struct rt_rq {
 #ifdef CONFIG_SMP
 	unsigned long rt_nr_migratory;
 	int overloaded;
+	struct plist_head pushable_tasks;
 #endif
 	int rt_throttled;
 	u64 rt_time;
@@ -2481,6 +2482,8 @@ void sched_fork(struct task_struct *p, int clone_flags)
 	/* Want to start with kernel preemption disabled. */
 	task_thread_info(p)->preempt_count = 1;
 #endif
+	plist_node_init(&p->pushable_tasks, MAX_PRIO);
+
 	put_cpu();
 }
 
@@ -8237,6 +8240,7 @@ static void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq)
 #ifdef CONFIG_SMP
 	rt_rq->rt_nr_migratory = 0;
 	rt_rq->overloaded = 0;
+	plist_head_init(&rq->rt.pushable_tasks, &rq->lock);
 #endif
 
 	rt_rq->rt_time = 0;
diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c
index b0b6ea4ed674..fe9da6084c87 100644
--- a/kernel/sched_rt.c
+++ b/kernel/sched_rt.c
@@ -49,6 +49,24 @@ static void update_rt_migration(struct rq *rq)
 		rq->rt.overloaded = 0;
 	}
 }
+
+static void enqueue_pushable_task(struct rq *rq, struct task_struct *p)
+{
+	plist_del(&p->pushable_tasks, &rq->rt.pushable_tasks);
+	plist_node_init(&p->pushable_tasks, p->prio);
+	plist_add(&p->pushable_tasks, &rq->rt.pushable_tasks);
+}
+
+static void dequeue_pushable_task(struct rq *rq, struct task_struct *p)
+{
+	plist_del(&p->pushable_tasks, &rq->rt.pushable_tasks);
+}
+
+#else
+
+#define enqueue_pushable_task(rq, p) do { } while (0)
+#define dequeue_pushable_task(rq, p) do { } while (0)
+
 #endif /* CONFIG_SMP */
 
 static inline struct task_struct *rt_task_of(struct sched_rt_entity *rt_se)
@@ -751,6 +769,9 @@ static void enqueue_task_rt(struct rq *rq, struct task_struct *p, int wakeup)
 
 	enqueue_rt_entity(rt_se);
 
+	if (!task_current(rq, p) && p->rt.nr_cpus_allowed > 1)
+		enqueue_pushable_task(rq, p);
+
 	inc_cpu_load(rq, p->se.load.weight);
 }
 
@@ -761,6 +782,8 @@ static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int sleep)
 	update_curr_rt(rq);
 	dequeue_rt_entity(rt_se);
 
+	dequeue_pushable_task(rq, p);
+
 	dec_cpu_load(rq, p->se.load.weight);
 }
 
@@ -911,7 +934,7 @@ static struct sched_rt_entity *pick_next_rt_entity(struct rq *rq,
 	return next;
 }
 
-static struct task_struct *pick_next_task_rt(struct rq *rq)
+static struct task_struct *_pick_next_task_rt(struct rq *rq)
 {
 	struct sched_rt_entity *rt_se;
 	struct task_struct *p;
@@ -933,6 +956,18 @@ static struct task_struct *pick_next_task_rt(struct rq *rq)
 
 	p = rt_task_of(rt_se);
 	p->se.exec_start = rq->clock;
+
+	return p;
+}
+
+static struct task_struct *pick_next_task_rt(struct rq *rq)
+{
+	struct task_struct *p = _pick_next_task_rt(rq);
+
+	/* The running task is never eligible for pushing */
+	if (p)
+		dequeue_pushable_task(rq, p);
+
 	return p;
 }
 
@@ -940,6 +975,13 @@ static void put_prev_task_rt(struct rq *rq, struct task_struct *p)
 {
 	update_curr_rt(rq);
 	p->se.exec_start = 0;
+
+	/*
+	 * The previous task needs to be made eligible for pushing
+	 * if it is still active
+	 */
+	if (p->se.on_rq && p->rt.nr_cpus_allowed > 1)
+		enqueue_pushable_task(rq, p);
 }
 
 #ifdef CONFIG_SMP
@@ -1116,6 +1158,31 @@ static struct rq *find_lock_lowest_rq(struct task_struct *task, struct rq *rq)
 	return lowest_rq;
 }
 
+static inline int has_pushable_tasks(struct rq *rq)
+{
+	return !plist_head_empty(&rq->rt.pushable_tasks);
+}
+
+static struct task_struct *pick_next_pushable_task(struct rq *rq)
+{
+	struct task_struct *p;
+
+	if (!has_pushable_tasks(rq))
+		return NULL;
+
+	p = plist_first_entry(&rq->rt.pushable_tasks,
+			      struct task_struct, pushable_tasks);
+
+	BUG_ON(rq->cpu != task_cpu(p));
+	BUG_ON(task_current(rq, p));
+	BUG_ON(p->rt.nr_cpus_allowed <= 1);
+
+	BUG_ON(!p->se.on_rq);
+	BUG_ON(!rt_task(p));
+
+	return p;
+}
+
 /*
  * If the current CPU has more than one RT task, see if the non
  * running task can migrate over to a CPU that is running a task
@@ -1125,13 +1192,12 @@ static int push_rt_task(struct rq *rq)
 {
 	struct task_struct *next_task;
 	struct rq *lowest_rq;
-	int ret = 0;
 	int paranoid = RT_MAX_TRIES;
 
 	if (!rq->rt.overloaded)
 		return 0;
 
-	next_task = pick_next_highest_task_rt(rq, -1);
+	next_task = pick_next_pushable_task(rq);
 	if (!next_task)
 		return 0;
 
@@ -1163,12 +1229,19 @@ static int push_rt_task(struct rq *rq)
 		 * so it is possible that next_task has changed.
 		 * If it has, then try again.
 		 */
-		task = pick_next_highest_task_rt(rq, -1);
+		task = pick_next_pushable_task(rq);
 		if (unlikely(task != next_task) && task && paranoid--) {
 			put_task_struct(next_task);
 			next_task = task;
 			goto retry;
 		}
+
+		/*
+		 * Once we have failed to push this task, we will not
+		 * try again, since the other cpus will pull from us
+		 * when they are ready
+		 */
+		dequeue_pushable_task(rq, next_task);
 		goto out;
 	}
 
@@ -1180,23 +1253,12 @@ static int push_rt_task(struct rq *rq)
 
 	double_unlock_balance(rq, lowest_rq);
 
-	ret = 1;
 out:
 	put_task_struct(next_task);
 
-	return ret;
+	return 1;
 }
 
-/*
- * TODO: Currently we just use the second highest prio task on
- *       the queue, and stop when it can't migrate (or there's
- *       no more RT tasks).  There may be a case where a lower
- *       priority RT task has a different affinity than the
- *       higher RT task. In this case the lower RT task could
- *       possibly be able to migrate where as the higher priority
- *       RT task could not.  We currently ignore this issue.
- *       Enhancements are welcome!
- */
 static void push_rt_tasks(struct rq *rq)
 {
 	/* push_rt_task will return true if it moved an RT */
@@ -1295,7 +1357,7 @@ static void pre_schedule_rt(struct rq *rq, struct task_struct *prev)
  */
 static int needs_post_schedule_rt(struct rq *rq)
 {
-	return rq->rt.overloaded ? 1 : 0;
+	return has_pushable_tasks(rq);
 }
 
 static void post_schedule_rt(struct rq *rq)
@@ -1317,7 +1379,7 @@ static void task_wake_up_rt(struct rq *rq, struct task_struct *p)
 {
 	if (!task_running(rq, p) &&
 	    !test_tsk_need_resched(rq->curr) &&
-	    rq->rt.overloaded &&
+	    has_pushable_tasks(rq) &&
 	    p->rt.nr_cpus_allowed > 1)
 		push_rt_tasks(rq);
 }
@@ -1354,6 +1416,24 @@ static void set_cpus_allowed_rt(struct task_struct *p,
 	if (p->se.on_rq && (weight != p->rt.nr_cpus_allowed)) {
 		struct rq *rq = task_rq(p);
 
+		if (!task_current(rq, p)) {
+			/*
+			 * Make sure we dequeue this task from the pushable list
+			 * before going further.  It will either remain off of
+			 * the list because we are no longer pushable, or it
+			 * will be requeued.
+			 */
+			if (p->rt.nr_cpus_allowed > 1)
+				dequeue_pushable_task(rq, p);
+
+			/*
+			 * Requeue if our weight is changing and still > 1
+			 */
+			if (weight > 1)
+				enqueue_pushable_task(rq, p);
+
+		}
+
 		if ((p->rt.nr_cpus_allowed <= 1) && (weight > 1)) {
 			rq->rt.rt_nr_migratory++;
 		} else if ((p->rt.nr_cpus_allowed > 1) && (weight <= 1)) {
@@ -1538,6 +1618,9 @@ static void set_curr_task_rt(struct rq *rq)
 	struct task_struct *p = rq->curr;
 
 	p->se.exec_start = rq->clock;
+
+	/* The running task is never eligible for pushing */
+	dequeue_pushable_task(rq, p);
 }
 
 static const struct sched_class rt_sched_class = {
-- 
cgit v1.2.3


From 01e3eb82278bf45221fc38b391bc5ee0f6a314d6 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Mon, 12 Jan 2009 13:00:50 +0100
Subject: Revert "sched: improve preempt debugging"

This reverts commit 7317d7b87edb41a9135e30be1ec3f7ef817c53dd.

This has been reported (and bisected) by Alexey Zaytsev and
Kamalesh Babulal to produce annoying warnings during bootup
on both x86 and powerpc.

kernel_locked() is not a valid test in IRQ context (we update the
BKL's ->lock_depth and the preempt count separately and non-atomicalyy),
so we cannot put it into the generic preempt debugging checks which
can run in IRQ contexts too.

Reported-and-bisected-by: Alexey Zaytsev <alexey.zaytsev@gmail.com>
Reported-and-bisected-by: Kamalesh Babulal <kamalesh@linux.vnet.ibm.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel/sched.c')

diff --git a/kernel/sched.c b/kernel/sched.c
index 8be2c13b50d0..3b630d882660 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -4440,7 +4440,7 @@ void __kprobes sub_preempt_count(int val)
 	/*
 	 * Underflow?
 	 */
-       if (DEBUG_LOCKS_WARN_ON(val > preempt_count() - (!!kernel_locked())))
+	if (DEBUG_LOCKS_WARN_ON(val > preempt_count()))
 		return;
 	/*
 	 * Is the spinlock portion underflowing?
-- 
cgit v1.2.3


From 5add95d4f7cf08f6f62510f19576992912387501 Mon Sep 17 00:00:00 2001
From: Heiko Carstens <heiko.carstens@de.ibm.com>
Date: Wed, 14 Jan 2009 14:14:08 +0100
Subject: [CVE-2009-0029] System call wrappers part 06

Signed-off-by: Heiko Carstens <heiko.carstens@de.ibm.com>
---
 kernel/sched.c | 26 +++++++++++++-------------
 1 file changed, 13 insertions(+), 13 deletions(-)

(limited to 'kernel/sched.c')

diff --git a/kernel/sched.c b/kernel/sched.c
index 8be2c13b50d0..1a0fdfa5ddf9 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -5126,7 +5126,7 @@ int can_nice(const struct task_struct *p, const int nice)
  * sys_setpriority is a more generic, but much slower function that
  * does similar things.
  */
-asmlinkage long sys_nice(int increment)
+SYSCALL_DEFINE1(nice, int, increment)
 {
 	long nice, retval;
 
@@ -5433,8 +5433,8 @@ do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param)
  * @policy: new policy.
  * @param: structure containing the new RT priority.
  */
-asmlinkage long
-sys_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param)
+SYSCALL_DEFINE3(sched_setscheduler, pid_t, pid, int, policy,
+		struct sched_param __user *, param)
 {
 	/* negative values for policy are not valid */
 	if (policy < 0)
@@ -5448,7 +5448,7 @@ sys_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param)
  * @pid: the pid in question.
  * @param: structure containing the new RT priority.
  */
-asmlinkage long sys_sched_setparam(pid_t pid, struct sched_param __user *param)
+SYSCALL_DEFINE2(sched_setparam, pid_t, pid, struct sched_param __user *, param)
 {
 	return do_sched_setscheduler(pid, -1, param);
 }
@@ -5457,7 +5457,7 @@ asmlinkage long sys_sched_setparam(pid_t pid, struct sched_param __user *param)
  * sys_sched_getscheduler - get the policy (scheduling class) of a thread
  * @pid: the pid in question.
  */
-asmlinkage long sys_sched_getscheduler(pid_t pid)
+SYSCALL_DEFINE1(sched_getscheduler, pid_t, pid)
 {
 	struct task_struct *p;
 	int retval;
@@ -5482,7 +5482,7 @@ asmlinkage long sys_sched_getscheduler(pid_t pid)
  * @pid: the pid in question.
  * @param: structure containing the RT priority.
  */
-asmlinkage long sys_sched_getparam(pid_t pid, struct sched_param __user *param)
+SYSCALL_DEFINE2(sched_getparam, pid_t, pid, struct sched_param __user *, param)
 {
 	struct sched_param lp;
 	struct task_struct *p;
@@ -5600,8 +5600,8 @@ static int get_user_cpu_mask(unsigned long __user *user_mask_ptr, unsigned len,
  * @len: length in bytes of the bitmask pointed to by user_mask_ptr
  * @user_mask_ptr: user-space pointer to the new cpu mask
  */
-asmlinkage long sys_sched_setaffinity(pid_t pid, unsigned int len,
-				      unsigned long __user *user_mask_ptr)
+SYSCALL_DEFINE3(sched_setaffinity, pid_t, pid, unsigned int, len,
+		unsigned long __user *, user_mask_ptr)
 {
 	cpumask_var_t new_mask;
 	int retval;
@@ -5648,8 +5648,8 @@ out_unlock:
  * @len: length in bytes of the bitmask pointed to by user_mask_ptr
  * @user_mask_ptr: user-space pointer to hold the current cpu mask
  */
-asmlinkage long sys_sched_getaffinity(pid_t pid, unsigned int len,
-				      unsigned long __user *user_mask_ptr)
+SYSCALL_DEFINE3(sched_getaffinity, pid_t, pid, unsigned int, len,
+		unsigned long __user *, user_mask_ptr)
 {
 	int ret;
 	cpumask_var_t mask;
@@ -5678,7 +5678,7 @@ asmlinkage long sys_sched_getaffinity(pid_t pid, unsigned int len,
  * This function yields the current CPU to other tasks. If there are no
  * other threads running on this CPU then this function will return.
  */
-asmlinkage long sys_sched_yield(void)
+SYSCALL_DEFINE0(sched_yield)
 {
 	struct rq *rq = this_rq_lock();
 
@@ -5819,7 +5819,7 @@ long __sched io_schedule_timeout(long timeout)
  * this syscall returns the maximum rt_priority that can be used
  * by a given scheduling class.
  */
-asmlinkage long sys_sched_get_priority_max(int policy)
+SYSCALL_DEFINE1(sched_get_priority_max, int, policy)
 {
 	int ret = -EINVAL;
 
@@ -5844,7 +5844,7 @@ asmlinkage long sys_sched_get_priority_max(int policy)
  * this syscall returns the minimum rt_priority that can be used
  * by a given scheduling class.
  */
-asmlinkage long sys_sched_get_priority_min(int policy)
+SYSCALL_DEFINE1(sched_get_priority_min, int, policy)
 {
 	int ret = -EINVAL;
 
-- 
cgit v1.2.3


From 754fe8d297bfae7b77f7ce866e2fb0c5fb186506 Mon Sep 17 00:00:00 2001
From: Heiko Carstens <heiko.carstens@de.ibm.com>
Date: Wed, 14 Jan 2009 14:14:09 +0100
Subject: [CVE-2009-0029] System call wrappers part 07

Signed-off-by: Heiko Carstens <heiko.carstens@de.ibm.com>
---
 kernel/exit.c   | 8 ++++----
 kernel/kexec.c  | 5 ++---
 kernel/sched.c  | 4 ++--
 kernel/signal.c | 2 +-
 kernel/sys.c    | 7 ++++---
 net/socket.c    | 2 +-
 6 files changed, 14 insertions(+), 14 deletions(-)

(limited to 'kernel/sched.c')

diff --git a/kernel/exit.c b/kernel/exit.c
index fac9b040af2c..08895df0eab3 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -1141,7 +1141,7 @@ NORET_TYPE void complete_and_exit(struct completion *comp, long code)
 
 EXPORT_SYMBOL(complete_and_exit);
 
-asmlinkage long sys_exit(int error_code)
+SYSCALL_DEFINE1(exit, int, error_code)
 {
 	do_exit((error_code&0xff)<<8);
 }
@@ -1182,7 +1182,7 @@ do_group_exit(int exit_code)
  * wait4()-ing process will get the correct exit code - even if this
  * thread is not the thread group leader.
  */
-asmlinkage long sys_exit_group(int error_code)
+SYSCALL_DEFINE1(exit_group, int, error_code)
 {
 	do_group_exit((error_code & 0xff) << 8);
 	/* NOTREACHED */
@@ -1795,8 +1795,8 @@ asmlinkage long sys_waitid(int which, pid_t upid,
 	return ret;
 }
 
-asmlinkage long sys_wait4(pid_t upid, int __user *stat_addr,
-			  int options, struct rusage __user *ru)
+SYSCALL_DEFINE4(wait4, pid_t, upid, int __user *, stat_addr,
+		int, options, struct rusage __user *, ru)
 {
 	struct pid *pid = NULL;
 	enum pid_type type;
diff --git a/kernel/kexec.c b/kernel/kexec.c
index 3fb855ad6aa0..8a6d7b08864e 100644
--- a/kernel/kexec.c
+++ b/kernel/kexec.c
@@ -934,9 +934,8 @@ struct kimage *kexec_crash_image;
 
 static DEFINE_MUTEX(kexec_mutex);
 
-asmlinkage long sys_kexec_load(unsigned long entry, unsigned long nr_segments,
-				struct kexec_segment __user *segments,
-				unsigned long flags)
+SYSCALL_DEFINE4(kexec_load, unsigned long, entry, unsigned long, nr_segments,
+		struct kexec_segment __user *, segments, unsigned long, flags)
 {
 	struct kimage **dest_image, *image;
 	int result;
diff --git a/kernel/sched.c b/kernel/sched.c
index 1a0fdfa5ddf9..65c02037b052 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -5869,8 +5869,8 @@ SYSCALL_DEFINE1(sched_get_priority_min, int, policy)
  * this syscall writes the default timeslice value of a given process
  * into the user-space timespec buffer. A value of '0' means infinity.
  */
-asmlinkage
-long sys_sched_rr_get_interval(pid_t pid, struct timespec __user *interval)
+SYSCALL_DEFINE4(sched_rr_get_interval, pid_t, pid,
+		struct timespec __user *, interval)
 {
 	struct task_struct *p;
 	unsigned int time_slice;
diff --git a/kernel/signal.c b/kernel/signal.c
index 3fe08eaa5dea..41f32e08615e 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -1961,7 +1961,7 @@ EXPORT_SYMBOL(unblock_all_signals);
  * System call entry points.
  */
 
-asmlinkage long sys_restart_syscall(void)
+SYSCALL_DEFINE0(restart_syscall)
 {
 	struct restart_block *restart = &current_thread_info()->restart_block;
 	return restart->fn(restart);
diff --git a/kernel/sys.c b/kernel/sys.c
index cbe4502c28a1..39b192b40034 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -143,7 +143,7 @@ out:
 	return error;
 }
 
-asmlinkage long sys_setpriority(int which, int who, int niceval)
+SYSCALL_DEFINE3(setpriority, int, which, int, who, int, niceval)
 {
 	struct task_struct *g, *p;
 	struct user_struct *user;
@@ -208,7 +208,7 @@ out:
  * has been offset by 20 (ie it returns 40..1 instead of -20..19)
  * to stay compatible.
  */
-asmlinkage long sys_getpriority(int which, int who)
+SYSCALL_DEFINE2(getpriority, int, which, int, who)
 {
 	struct task_struct *g, *p;
 	struct user_struct *user;
@@ -355,7 +355,8 @@ EXPORT_SYMBOL_GPL(kernel_power_off);
  *
  * reboot doesn't sync: do that yourself before calling this.
  */
-asmlinkage long sys_reboot(int magic1, int magic2, unsigned int cmd, void __user * arg)
+SYSCALL_DEFINE4(reboot, int, magic1, int, magic2, unsigned int, cmd,
+		void __user *, arg)
 {
 	char buffer[256];
 
diff --git a/net/socket.c b/net/socket.c
index 06603d73c411..cc9b666e58f6 100644
--- a/net/socket.c
+++ b/net/socket.c
@@ -1789,7 +1789,7 @@ out_put:
  *	Shutdown a socket.
  */
 
-asmlinkage long sys_shutdown(int fd, int how)
+SYSCALL_DEFINE2(shutdown, int, fd, int, how)
 {
 	int err, fput_needed;
 	struct socket *sock;
-- 
cgit v1.2.3


From 17da2bd90abf428523de0fb98f7075e00e3ed42e Mon Sep 17 00:00:00 2001
From: Heiko Carstens <heiko.carstens@de.ibm.com>
Date: Wed, 14 Jan 2009 14:14:10 +0100
Subject: [CVE-2009-0029] System call wrappers part 08

Signed-off-by: Heiko Carstens <heiko.carstens@de.ibm.com>
---
 kernel/exit.c   |  7 +++----
 kernel/fork.c   |  2 +-
 kernel/futex.c  |  6 +++---
 kernel/module.c | 10 ++++------
 kernel/sched.c  |  2 +-
 kernel/signal.c | 18 +++++++-----------
 6 files changed, 19 insertions(+), 26 deletions(-)

(limited to 'kernel/sched.c')

diff --git a/kernel/exit.c b/kernel/exit.c
index 08895df0eab3..f80dec3f1875 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -1754,9 +1754,8 @@ end:
 	return retval;
 }
 
-asmlinkage long sys_waitid(int which, pid_t upid,
-			   struct siginfo __user *infop, int options,
-			   struct rusage __user *ru)
+SYSCALL_DEFINE5(waitid, int, which, pid_t, upid, struct siginfo __user *,
+		infop, int, options, struct rusage __user *, ru)
 {
 	struct pid *pid = NULL;
 	enum pid_type type;
@@ -1833,7 +1832,7 @@ SYSCALL_DEFINE4(wait4, pid_t, upid, int __user *, stat_addr,
  * sys_waitpid() remains for compatibility. waitpid() should be
  * implemented by calling sys_wait4() from libc.a.
  */
-asmlinkage long sys_waitpid(pid_t pid, int __user *stat_addr, int options)
+SYSCALL_DEFINE3(waitpid, pid_t, pid, int __user *, stat_addr, int, options)
 {
 	return sys_wait4(pid, stat_addr, options, NULL);
 }
diff --git a/kernel/fork.c b/kernel/fork.c
index 1d68f1255dd8..8eb37d38c6a4 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -901,7 +901,7 @@ static void copy_flags(unsigned long clone_flags, struct task_struct *p)
 	clear_freeze_flag(p);
 }
 
-asmlinkage long sys_set_tid_address(int __user *tidptr)
+SYSCALL_DEFINE1(set_tid_address, int __user *, tidptr)
 {
 	current->clear_child_tid = tidptr;
 
diff --git a/kernel/futex.c b/kernel/futex.c
index 002aa189eb09..e86931d8d4e9 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -1978,9 +1978,9 @@ long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout,
 }
 
 
-asmlinkage long sys_futex(u32 __user *uaddr, int op, u32 val,
-			  struct timespec __user *utime, u32 __user *uaddr2,
-			  u32 val3)
+SYSCALL_DEFINE6(futex, u32 __user *, uaddr, int, op, u32, val,
+		struct timespec __user *, utime, u32 __user *, uaddr2,
+		u32, val3)
 {
 	struct timespec ts;
 	ktime_t t, *tp = NULL;
diff --git a/kernel/module.c b/kernel/module.c
index c9332c90d5a0..e8b51d41dd72 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -743,8 +743,8 @@ static void wait_for_zero_refcount(struct module *mod)
 	mutex_lock(&module_mutex);
 }
 
-asmlinkage long
-sys_delete_module(const char __user *name_user, unsigned int flags)
+SYSCALL_DEFINE2(delete_module, const char __user *, name_user,
+		unsigned int, flags)
 {
 	struct module *mod;
 	char name[MODULE_NAME_LEN];
@@ -2296,10 +2296,8 @@ static noinline struct module *load_module(void __user *umod,
 }
 
 /* This is where the real work happens */
-asmlinkage long
-sys_init_module(void __user *umod,
-		unsigned long len,
-		const char __user *uargs)
+SYSCALL_DEFINE3(init_module, void __user *, umod,
+		unsigned long, len, const char __user *, uargs)
 {
 	struct module *mod;
 	int ret = 0;
diff --git a/kernel/sched.c b/kernel/sched.c
index 65c02037b052..eb1931eef587 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -5869,7 +5869,7 @@ SYSCALL_DEFINE1(sched_get_priority_min, int, policy)
  * this syscall writes the default timeslice value of a given process
  * into the user-space timespec buffer. A value of '0' means infinity.
  */
-SYSCALL_DEFINE4(sched_rr_get_interval, pid_t, pid,
+SYSCALL_DEFINE2(sched_rr_get_interval, pid_t, pid,
 		struct timespec __user *, interval)
 {
 	struct task_struct *p;
diff --git a/kernel/signal.c b/kernel/signal.c
index 41f32e08615e..278cc8737f17 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -2014,8 +2014,8 @@ int sigprocmask(int how, sigset_t *set, sigset_t *oldset)
 	return error;
 }
 
-asmlinkage long
-sys_rt_sigprocmask(int how, sigset_t __user *set, sigset_t __user *oset, size_t sigsetsize)
+SYSCALL_DEFINE4(rt_sigprocmask, int, how, sigset_t __user *, set,
+		sigset_t __user *, oset, size_t, sigsetsize)
 {
 	int error = -EINVAL;
 	sigset_t old_set, new_set;
@@ -2074,8 +2074,7 @@ out:
 	return error;
 }	
 
-asmlinkage long
-sys_rt_sigpending(sigset_t __user *set, size_t sigsetsize)
+SYSCALL_DEFINE2(rt_sigpending, sigset_t __user *, set, size_t, sigsetsize)
 {
 	return do_sigpending(set, sigsetsize);
 }
@@ -2146,11 +2145,9 @@ int copy_siginfo_to_user(siginfo_t __user *to, siginfo_t *from)
 
 #endif
 
-asmlinkage long
-sys_rt_sigtimedwait(const sigset_t __user *uthese,
-		    siginfo_t __user *uinfo,
-		    const struct timespec __user *uts,
-		    size_t sigsetsize)
+SYSCALL_DEFINE4(rt_sigtimedwait, const sigset_t __user *, uthese,
+		siginfo_t __user *, uinfo, const struct timespec __user *, uts,
+		size_t, sigsetsize)
 {
 	int ret, sig;
 	sigset_t these;
@@ -2223,8 +2220,7 @@ sys_rt_sigtimedwait(const sigset_t __user *uthese,
 	return ret;
 }
 
-asmlinkage long
-sys_kill(pid_t pid, int sig)
+SYSCALL_DEFINE2(kill, pid_t, pid, int, sig)
 {
 	struct siginfo info;
 
-- 
cgit v1.2.3


From 398a153b16b09a68739928d4502455db9725ac86 Mon Sep 17 00:00:00 2001
From: Gregory Haskins <ghaskins@novell.com>
Date: Wed, 14 Jan 2009 09:10:04 -0500
Subject: sched: fix build error in kernel/sched_rt.c when RT_GROUP_SCHED &&
 !SMP

Ingo found a build error in the scheduler when RT_GROUP_SCHED was
enabled, but SMP was not.  This patch rearranges the code such
that it is a little more streamlined and compiles under all permutations
of SMP, UP and RT_GROUP_SCHED.  It was boot tested on my 4-way x86_64
and it still passes preempt-test.

Signed-off-by: Gregory Haskins <ghaskins@novell.com>
---
 kernel/sched.c    |   4 +
 kernel/sched_rt.c | 264 +++++++++++++++++++++++++++++++++++-------------------
 2 files changed, 174 insertions(+), 94 deletions(-)

(limited to 'kernel/sched.c')

diff --git a/kernel/sched.c b/kernel/sched.c
index dd1a1466c1e6..2b703f1fac3a 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -466,7 +466,9 @@ struct rt_rq {
 #if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED
 	struct {
 		int curr; /* highest queued rt task prio */
+#ifdef CONFIG_SMP
 		int next; /* next highest */
+#endif
 	} highest_prio;
 #endif
 #ifdef CONFIG_SMP
@@ -8267,8 +8269,10 @@ static void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq)
 
 #if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED
 	rt_rq->highest_prio.curr = MAX_RT_PRIO;
+#ifdef CONFIG_SMP
 	rt_rq->highest_prio.next = MAX_RT_PRIO;
 #endif
+#endif
 #ifdef CONFIG_SMP
 	rt_rq->rt_nr_migratory = 0;
 	rt_rq->overloaded = 0;
diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c
index 4eda5f795f04..4230b15fe90e 100644
--- a/kernel/sched_rt.c
+++ b/kernel/sched_rt.c
@@ -3,6 +3,40 @@
  * policies)
  */
 
+static inline struct task_struct *rt_task_of(struct sched_rt_entity *rt_se)
+{
+	return container_of(rt_se, struct task_struct, rt);
+}
+
+#ifdef CONFIG_RT_GROUP_SCHED
+
+static inline struct rq *rq_of_rt_rq(struct rt_rq *rt_rq)
+{
+	return rt_rq->rq;
+}
+
+static inline struct rt_rq *rt_rq_of_se(struct sched_rt_entity *rt_se)
+{
+	return rt_se->rt_rq;
+}
+
+#else /* CONFIG_RT_GROUP_SCHED */
+
+static inline struct rq *rq_of_rt_rq(struct rt_rq *rt_rq)
+{
+	return container_of(rt_rq, struct rq, rt);
+}
+
+static inline struct rt_rq *rt_rq_of_se(struct sched_rt_entity *rt_se)
+{
+	struct task_struct *p = rt_task_of(rt_se);
+	struct rq *rq = task_rq(p);
+
+	return &rq->rt;
+}
+
+#endif /* CONFIG_RT_GROUP_SCHED */
+
 #ifdef CONFIG_SMP
 
 static inline int rt_overloaded(struct rq *rq)
@@ -37,19 +71,35 @@ static inline void rt_clear_overload(struct rq *rq)
 	cpumask_clear_cpu(rq->cpu, rq->rd->rto_mask);
 }
 
-static void update_rt_migration(struct rq *rq)
+static void update_rt_migration(struct rt_rq *rt_rq)
 {
-	if (rq->rt.rt_nr_migratory && (rq->rt.rt_nr_running > 1)) {
-		if (!rq->rt.overloaded) {
-			rt_set_overload(rq);
-			rq->rt.overloaded = 1;
+	if (rt_rq->rt_nr_migratory && (rt_rq->rt_nr_running > 1)) {
+		if (!rt_rq->overloaded) {
+			rt_set_overload(rq_of_rt_rq(rt_rq));
+			rt_rq->overloaded = 1;
 		}
-	} else if (rq->rt.overloaded) {
-		rt_clear_overload(rq);
-		rq->rt.overloaded = 0;
+	} else if (rt_rq->overloaded) {
+		rt_clear_overload(rq_of_rt_rq(rt_rq));
+		rt_rq->overloaded = 0;
 	}
 }
 
+static void inc_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
+{
+	if (rt_se->nr_cpus_allowed > 1)
+		rt_rq->rt_nr_migratory++;
+
+	update_rt_migration(rt_rq);
+}
+
+static void dec_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
+{
+	if (rt_se->nr_cpus_allowed > 1)
+		rt_rq->rt_nr_migratory--;
+
+	update_rt_migration(rt_rq);
+}
+
 static void enqueue_pushable_task(struct rq *rq, struct task_struct *p)
 {
 	plist_del(&p->pushable_tasks, &rq->rt.pushable_tasks);
@@ -68,14 +118,13 @@ static inline
 void enqueue_pushable_task(struct rq *rq, struct task_struct *p) {}
 static inline
 void dequeue_pushable_task(struct rq *rq, struct task_struct *p) {}
+static inline
+void inc_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) {}
+static inline
+void dec_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) {}
 
 #endif /* CONFIG_SMP */
 
-static inline struct task_struct *rt_task_of(struct sched_rt_entity *rt_se)
-{
-	return container_of(rt_se, struct task_struct, rt);
-}
-
 static inline int on_rt_rq(struct sched_rt_entity *rt_se)
 {
 	return !list_empty(&rt_se->run_list);
@@ -99,16 +148,6 @@ static inline u64 sched_rt_period(struct rt_rq *rt_rq)
 #define for_each_leaf_rt_rq(rt_rq, rq) \
 	list_for_each_entry_rcu(rt_rq, &rq->leaf_rt_rq_list, leaf_rt_rq_list)
 
-static inline struct rq *rq_of_rt_rq(struct rt_rq *rt_rq)
-{
-	return rt_rq->rq;
-}
-
-static inline struct rt_rq *rt_rq_of_se(struct sched_rt_entity *rt_se)
-{
-	return rt_se->rt_rq;
-}
-
 #define for_each_sched_rt_entity(rt_se) \
 	for (; rt_se; rt_se = rt_se->parent)
 
@@ -196,19 +235,6 @@ static inline u64 sched_rt_period(struct rt_rq *rt_rq)
 #define for_each_leaf_rt_rq(rt_rq, rq) \
 	for (rt_rq = &rq->rt; rt_rq; rt_rq = NULL)
 
-static inline struct rq *rq_of_rt_rq(struct rt_rq *rt_rq)
-{
-	return container_of(rt_rq, struct rq, rt);
-}
-
-static inline struct rt_rq *rt_rq_of_se(struct sched_rt_entity *rt_se)
-{
-	struct task_struct *p = rt_task_of(rt_se);
-	struct rq *rq = task_rq(p);
-
-	return &rq->rt;
-}
-
 #define for_each_sched_rt_entity(rt_se) \
 	for (; rt_se; rt_se = NULL)
 
@@ -567,7 +593,7 @@ static void update_curr_rt(struct rq *rq)
 	}
 }
 
-#if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED
+#if defined CONFIG_SMP
 
 static struct task_struct *pick_next_highest_task_rt(struct rq *rq, int cpu);
 
@@ -580,33 +606,24 @@ static inline int next_prio(struct rq *rq)
 	else
 		return MAX_RT_PRIO;
 }
-#endif
 
-static inline
-void inc_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
+static void
+inc_rt_prio_smp(struct rt_rq *rt_rq, int prio, int prev_prio)
 {
-	int prio = rt_se_prio(rt_se);
-#ifdef CONFIG_SMP
 	struct rq *rq = rq_of_rt_rq(rt_rq);
-#endif
 
-	WARN_ON(!rt_prio(prio));
-	rt_rq->rt_nr_running++;
-#if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED
-	if (prio < rt_rq->highest_prio.curr) {
+	if (prio < prev_prio) {
 
 		/*
 		 * If the new task is higher in priority than anything on the
-		 * run-queue, we have a new high that must be published to
-		 * the world.  We also know that the previous high becomes
-		 * our next-highest.
+		 * run-queue, we know that the previous high becomes our
+		 * next-highest.
 		 */
-		rt_rq->highest_prio.next = rt_rq->highest_prio.curr;
-		rt_rq->highest_prio.curr = prio;
-#ifdef CONFIG_SMP
+		rt_rq->highest_prio.next = prev_prio;
+
 		if (rq->online)
 			cpupri_set(&rq->rd->cpupri, rq->cpu, prio);
-#endif
+
 	} else if (prio == rt_rq->highest_prio.curr)
 		/*
 		 * If the next task is equal in priority to the highest on
@@ -619,72 +636,131 @@ void inc_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
 		 * Otherwise, we need to recompute next-highest
 		 */
 		rt_rq->highest_prio.next = next_prio(rq);
-#endif
-#ifdef CONFIG_SMP
-	if (rt_se->nr_cpus_allowed > 1)
-		rq->rt.rt_nr_migratory++;
+}
 
-	update_rt_migration(rq);
-#endif
-#ifdef CONFIG_RT_GROUP_SCHED
-	if (rt_se_boosted(rt_se))
-		rt_rq->rt_nr_boosted++;
+static void
+dec_rt_prio_smp(struct rt_rq *rt_rq, int prio, int prev_prio)
+{
+	struct rq *rq = rq_of_rt_rq(rt_rq);
 
-	if (rt_rq->tg)
-		start_rt_bandwidth(&rt_rq->tg->rt_bandwidth);
-#else
-	start_rt_bandwidth(&def_rt_bandwidth);
-#endif
+	if (rt_rq->rt_nr_running && (prio <= rt_rq->highest_prio.next))
+		rt_rq->highest_prio.next = next_prio(rq);
+
+	if (rq->online && rt_rq->highest_prio.curr != prev_prio)
+		cpupri_set(&rq->rd->cpupri, rq->cpu, rt_rq->highest_prio.curr);
 }
 
+#else /* CONFIG_SMP */
+
 static inline
-void dec_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
-{
-#ifdef CONFIG_SMP
-	struct rq *rq = rq_of_rt_rq(rt_rq);
-	int highest_prio = rt_rq->highest_prio.curr;
-#endif
+void inc_rt_prio_smp(struct rt_rq *rt_rq, int prio, int prev_prio) {}
+static inline
+void dec_rt_prio_smp(struct rt_rq *rt_rq, int prio, int prev_prio) {}
+
+#endif /* CONFIG_SMP */
 
-	WARN_ON(!rt_prio(rt_se_prio(rt_se)));
-	WARN_ON(!rt_rq->rt_nr_running);
-	rt_rq->rt_nr_running--;
 #if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED
+static void
+inc_rt_prio(struct rt_rq *rt_rq, int prio)
+{
+	int prev_prio = rt_rq->highest_prio.curr;
+
+	if (prio < prev_prio)
+		rt_rq->highest_prio.curr = prio;
+
+	inc_rt_prio_smp(rt_rq, prio, prev_prio);
+}
+
+static void
+dec_rt_prio(struct rt_rq *rt_rq, int prio)
+{
+	int prev_prio = rt_rq->highest_prio.curr;
+
 	if (rt_rq->rt_nr_running) {
-		int prio = rt_se_prio(rt_se);
 
-		WARN_ON(prio < rt_rq->highest_prio.curr);
+		WARN_ON(prio < prev_prio);
 
 		/*
-		 * This may have been our highest or next-highest priority
-		 * task and therefore we may have some recomputation to do
+		 * This may have been our highest task, and therefore
+		 * we may have some recomputation to do
 		 */
-		if (prio == rt_rq->highest_prio.curr) {
+		if (prio == prev_prio) {
 			struct rt_prio_array *array = &rt_rq->active;
 
 			rt_rq->highest_prio.curr =
 				sched_find_first_bit(array->bitmap);
 		}
 
-		if (prio <= rt_rq->highest_prio.next)
-			rt_rq->highest_prio.next = next_prio(rq);
 	} else
 		rt_rq->highest_prio.curr = MAX_RT_PRIO;
-#endif
-#ifdef CONFIG_SMP
-	if (rt_se->nr_cpus_allowed > 1)
-		rq->rt.rt_nr_migratory--;
 
-	if (rq->online && rt_rq->highest_prio.curr != highest_prio)
-		cpupri_set(&rq->rd->cpupri, rq->cpu, rt_rq->highest_prio.curr);
+	dec_rt_prio_smp(rt_rq, prio, prev_prio);
+}
+
+#else
+
+static inline void inc_rt_prio(struct rt_rq *rt_rq, int prio) {}
+static inline void dec_rt_prio(struct rt_rq *rt_rq, int prio) {}
+
+#endif /* CONFIG_SMP || CONFIG_RT_GROUP_SCHED */
 
-	update_rt_migration(rq);
-#endif /* CONFIG_SMP */
 #ifdef CONFIG_RT_GROUP_SCHED
+
+static void
+inc_rt_group(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
+{
+	if (rt_se_boosted(rt_se))
+		rt_rq->rt_nr_boosted++;
+
+	if (rt_rq->tg)
+		start_rt_bandwidth(&rt_rq->tg->rt_bandwidth);
+}
+
+static void
+dec_rt_group(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
+{
 	if (rt_se_boosted(rt_se))
 		rt_rq->rt_nr_boosted--;
 
 	WARN_ON(!rt_rq->rt_nr_running && rt_rq->rt_nr_boosted);
-#endif
+}
+
+#else /* CONFIG_RT_GROUP_SCHED */
+
+static void
+inc_rt_group(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
+{
+	start_rt_bandwidth(&def_rt_bandwidth);
+}
+
+static inline
+void dec_rt_group(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) {}
+
+#endif /* CONFIG_RT_GROUP_SCHED */
+
+static inline
+void inc_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
+{
+	int prio = rt_se_prio(rt_se);
+
+	WARN_ON(!rt_prio(prio));
+	rt_rq->rt_nr_running++;
+
+	inc_rt_prio(rt_rq, prio);
+	inc_rt_migration(rt_se, rt_rq);
+	inc_rt_group(rt_se, rt_rq);
+}
+
+static inline
+void dec_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
+{
+	WARN_ON(!rt_prio(rt_se_prio(rt_se)));
+	WARN_ON(!rt_rq->rt_nr_running);
+	rt_rq->rt_nr_running--;
+
+	dec_rt_prio(rt_rq, rt_se_prio(rt_se));
+	dec_rt_migration(rt_se, rt_rq);
+	dec_rt_group(rt_se, rt_rq);
 }
 
 static void __enqueue_rt_entity(struct sched_rt_entity *rt_se)
@@ -1453,7 +1529,7 @@ static void set_cpus_allowed_rt(struct task_struct *p,
 			rq->rt.rt_nr_migratory--;
 		}
 
-		update_rt_migration(rq);
+		update_rt_migration(&rq->rt);
 	}
 
 	cpumask_copy(&p->cpus_allowed, new_mask);
-- 
cgit v1.2.3


From 98a4826b99bc4bcc34c604b2fc4fcf4d771600ec Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Wed, 14 Jan 2009 10:56:32 +0100
Subject: sched: fix bandwidth validation for UID grouping

Impact: make rt-limit tunables work again

Mark Glines reported:

> I've got an issue on x86-64 where I can't configure the system to allow
> RT tasks for a non-root user.
>
> In 2.6.26.5, I was able to do the following to set things up nicely:
> echo 450000 >/sys/kernel/uids/0/cpu_rt_runtime
> echo 450000 >/sys/kernel/uids/1000/cpu_rt_runtime
>
> Seems like every value I try to echo into the /sys files returns EINVAL.

For UID grouping we initialize the root group with infinite bandwidth
which by default is actually more than the global limit, therefore the
bandwidth check always fails.

Because the root group is a phantom group (for UID grouping) we cannot
runtime adjust it, therefore we let it reflect the global bandwidth
settings.

Reported-by: Mark Glines <mark@glines.org>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched.c | 7 +++++++
 1 file changed, 7 insertions(+)

(limited to 'kernel/sched.c')

diff --git a/kernel/sched.c b/kernel/sched.c
index 3b630d882660..ed62d1cee05c 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -9050,6 +9050,13 @@ static int tg_schedulable(struct task_group *tg, void *data)
 		runtime = d->rt_runtime;
 	}
 
+#ifdef CONFIG_USER_SCHED
+	if (tg == &root_task_group) {
+		period = global_rt_period();
+		runtime = global_rt_runtime();
+	}
+#endif
+
 	/*
 	 * Cannot have more runtime than the period.
 	 */
-- 
cgit v1.2.3


From cce7ade803699463ecc62a065ca522004f7ccb3d Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Thu, 15 Jan 2009 14:53:37 +0100
Subject: sched: SCHED_IDLE weight change

Increase the SCHED_IDLE weight from 2 to 3, this gives much more stable
vruntime numbers.

time advanced in 100ms:

 weight=2

 64765.988352
 67012.881408
 88501.412352

 weight=3

 35496.181411
 34130.971298
 35497.411573

Signed-off-by: Mike Galbraith <efault@gmx.de>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'kernel/sched.c')

diff --git a/kernel/sched.c b/kernel/sched.c
index ed62d1cee05c..6acfb3c2398b 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -1323,8 +1323,8 @@ static inline void update_load_sub(struct load_weight *lw, unsigned long dec)
  * slice expiry etc.
  */
 
-#define WEIGHT_IDLEPRIO		2
-#define WMULT_IDLEPRIO		(1 << 31)
+#define WEIGHT_IDLEPRIO                3
+#define WMULT_IDLEPRIO         1431655765
 
 /*
  * Nice levels are multiplicative, with a gentle 10% change for every
-- 
cgit v1.2.3


From d942fb6c7d391baba3dddb566eb735fbf3df8528 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Mon, 26 Jan 2009 17:56:17 +0100
Subject: sched: fix sync wakeups

Pawel Dziekonski reported that the openssl benchmark and his
quantum chemistry application both show slowdowns due to the
scheduler under-parallelizing execution.

The reason are pipe wakeups still doing 'sync' wakeups which
overrides the normal buddy wakeup logic - even if waker and
wakee are loosely coupled.

Fix an inversion of logic in the buddy wakeup code.

Reported-by: Pawel Dziekonski <dzieko@gmail.com>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched.c      |  4 ++++
 kernel/sched_fair.c | 11 ++---------
 2 files changed, 6 insertions(+), 9 deletions(-)

(limited to 'kernel/sched.c')

diff --git a/kernel/sched.c b/kernel/sched.c
index 52bbf1c842a8..770b1f9ebe14 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -2266,6 +2266,10 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync)
 	if (!sched_feat(SYNC_WAKEUPS))
 		sync = 0;
 
+	if (!sync && (current->se.avg_overlap < sysctl_sched_migration_cost &&
+			    p->se.avg_overlap < sysctl_sched_migration_cost))
+		sync = 1;
+
 #ifdef CONFIG_SMP
 	if (sched_feat(LB_WAKEUP_UPDATE)) {
 		struct sched_domain *sd;
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index 5cc1c162044f..fdc417504681 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -1179,20 +1179,15 @@ wake_affine(struct sched_domain *this_sd, struct rq *this_rq,
 	    int idx, unsigned long load, unsigned long this_load,
 	    unsigned int imbalance)
 {
-	struct task_struct *curr = this_rq->curr;
-	struct task_group *tg;
 	unsigned long tl = this_load;
 	unsigned long tl_per_task;
+	struct task_group *tg;
 	unsigned long weight;
 	int balanced;
 
 	if (!(this_sd->flags & SD_WAKE_AFFINE) || !sched_feat(AFFINE_WAKEUPS))
 		return 0;
 
-	if (sync && (curr->se.avg_overlap > sysctl_sched_migration_cost ||
-			p->se.avg_overlap > sysctl_sched_migration_cost))
-		sync = 0;
-
 	/*
 	 * If sync wakeup then subtract the (maximum possible)
 	 * effect of the currently running task from the load
@@ -1419,9 +1414,7 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int sync)
 	if (!sched_feat(WAKEUP_PREEMPT))
 		return;
 
-	if (sched_feat(WAKEUP_OVERLAP) && (sync ||
-			(se->avg_overlap < sysctl_sched_migration_cost &&
-			 pse->avg_overlap < sysctl_sched_migration_cost))) {
+	if (sched_feat(WAKEUP_OVERLAP) && sync) {
 		resched_task(curr);
 		return;
 	}
-- 
cgit v1.2.3


From 1596e29773eadd96b0a5fc6e736afa52394cafda Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Wed, 28 Jan 2009 14:51:38 +0100
Subject: sched: symmetric sync vs avg_overlap

Reinstate the weakening of the sync hint if set. This yields a more
symmetric usage of avg_overlap.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched.c | 12 +++++++++---
 1 file changed, 9 insertions(+), 3 deletions(-)

(limited to 'kernel/sched.c')

diff --git a/kernel/sched.c b/kernel/sched.c
index 770b1f9ebe14..242d0d47a70d 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -2266,9 +2266,15 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync)
 	if (!sched_feat(SYNC_WAKEUPS))
 		sync = 0;
 
-	if (!sync && (current->se.avg_overlap < sysctl_sched_migration_cost &&
-			    p->se.avg_overlap < sysctl_sched_migration_cost))
-		sync = 1;
+	if (!sync) {
+		if (current->se.avg_overlap < sysctl_sched_migration_cost &&
+			  p->se.avg_overlap < sysctl_sched_migration_cost)
+			sync = 1;
+	} else {
+		if (current->se.avg_overlap >= sysctl_sched_migration_cost ||
+			  p->se.avg_overlap >= sysctl_sched_migration_cost)
+			sync = 0;
+	}
 
 #ifdef CONFIG_SMP
 	if (sched_feat(LB_WAKEUP_UPDATE)) {
-- 
cgit v1.2.3


From 483b4ee60edbefdfbff0dd538fb81f368d9e7c0d Mon Sep 17 00:00:00 2001
From: Suresh Siddha <suresh.b.siddha@intel.com>
Date: Wed, 4 Feb 2009 11:59:44 -0800
Subject: sched: fix nohz load balancer on cpu offline

Christian Borntraeger reports:

> After a logical cpu offline, even on a complete idle system, there
> is one cpu with full ticks. It turns out that nohz.cpu_mask has the
> the offlined cpu still set.
>
> In select_nohz_load_balancer() we check if the system is completely
> idle to turn of load balancing. We compare cpu_online_map with
> nohz.cpu_mask.  Since cpu_online_map is updated on cpu unplug,
> but nohz.cpu_mask is not, the check fails and the scheduler believes
> that we need an "idle load balancer" even on a fully idle system.
> Since the ilb cpu does not deactivate the timer tick this breaks NOHZ.

Fix the select_nohz_load_balancer() to not set the nohz.cpu_mask
while a cpu is going offline.

Reported-by: Christian Borntraeger <borntraeger@de.ibm.com>
Signed-off-by: Suresh Siddha <suresh.b.siddha@intel.com>
Tested-by: Christian Borntraeger <borntraeger@de.ibm.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched.c | 17 +++++++++++------
 1 file changed, 11 insertions(+), 6 deletions(-)

(limited to 'kernel/sched.c')

diff --git a/kernel/sched.c b/kernel/sched.c
index 242d0d47a70d..e1fc67d0674c 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -3890,19 +3890,24 @@ int select_nohz_load_balancer(int stop_tick)
 	int cpu = smp_processor_id();
 
 	if (stop_tick) {
-		cpumask_set_cpu(cpu, nohz.cpu_mask);
 		cpu_rq(cpu)->in_nohz_recently = 1;
 
-		/*
-		 * If we are going offline and still the leader, give up!
-		 */
-		if (!cpu_active(cpu) &&
-		    atomic_read(&nohz.load_balancer) == cpu) {
+		if (!cpu_active(cpu)) {
+			if (atomic_read(&nohz.load_balancer) != cpu)
+				return 0;
+
+			/*
+			 * If we are going offline and still the leader,
+			 * give up!
+			 */
 			if (atomic_cmpxchg(&nohz.load_balancer, cpu, -1) != cpu)
 				BUG();
+
 			return 0;
 		}
 
+		cpumask_set_cpu(cpu, nohz.cpu_mask);
+
 		/* time for ilb owner also to sleep */
 		if (cpumask_weight(nohz.cpu_mask) == num_online_cpus()) {
 			if (atomic_read(&nohz.load_balancer) == cpu)
-- 
cgit v1.2.3