From 63f01241176d7cbc976385aec32f0a209b0bc36a Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 6 Dec 2010 14:48:10 -0500 Subject: sched: Remove unlikely() from rt_policy() in sched.c The rt_policy() function has an unlikely() annotation asserting that the policy it is checking is of RT priority (SCHED_FIFO or SCHED_RR). According to the annotate branch profiler it is incorrect most of the time: correct incorrect % Function File Line ------- --------- - -------- ---- ---- 36667 654674 94 rt_policy sched.c 126 This makes sense because rt_policy() is used by sched_setscheduler() and nice(). Although users may use sys_nice() a bit, all RT users use sched_setscheduler() to set their RT priority, including kernel threads. The above numbers were from a normal desktop computer that was running firefox, evolution and xchat, and also served as part of a distcc compile farm. Cc: Peter Zijlstra Signed-off-by: Steven Rostedt --- kernel/sched.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel/sched.c') diff --git a/kernel/sched.c b/kernel/sched.c index dc91a4d09ac3..269a0450281c 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -123,7 +123,7 @@ static inline int rt_policy(int policy) { - if (unlikely(policy == SCHED_FIFO || policy == SCHED_RR)) + if (policy == SCHED_FIFO || policy == SCHED_RR) return 1; return 0; } -- cgit v1.2.3 From e69c634190dc724ef2d845ace8d783031d3e492e Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 6 Dec 2010 17:10:31 -0500 Subject: sched: Remove unlikely() from ttwu_post_activation The unlikely() used in ttwu_post_activation() tests if rq->idle_stamp is set. But since this is for a wakeup, wakeups happen when tasks block on IO, and blocking on IO may put the system into idle, this can actually be a common occurrence. Running the annotated branch profiler on an average desktop running firefox, evolution, xchat and distcc, the report shows: correct incorrect % Function File Line ------- --------- - -------- ---- ---- 34884862 146110926 80 ttwu_post_activation sched.c 2309 80% of the time, this unlikely() is incorrect. It is best not to assume what the result will be, so just remove the branch annotation. Cc: Peter Zijlstra Signed-off-by: Steven Rostedt --- kernel/sched.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel/sched.c') diff --git a/kernel/sched.c b/kernel/sched.c index 269a0450281c..6d24b2e8d82d 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -2458,7 +2458,7 @@ static inline void ttwu_post_activation(struct task_struct *p, struct rq *rq, if (p->sched_class->task_woken) p->sched_class->task_woken(rq, p); - if (unlikely(rq->idle_stamp)) { + if (rq->idle_stamp) { u64 delta = rq->clock - rq->idle_stamp; u64 max = 2*sysctl_sched_migration_cost; -- cgit v1.2.3 From 2da8c8bc44b572cbf623629ff736608dc7968436 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Tue, 7 Jun 2011 22:53:39 +0200 Subject: sched: Remove pointless in_atomic() definition check in_atomic() is really supposed to be defined here. If it is not, then we actually want the build to break so that we know about it, rather than keeping the breakage silent.
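For readers unfamiliar with the annotation the first two patches remove: in the kernel, likely() and unlikely() normally expand to GCC's __builtin_expect(), and the branch profiler quoted above simply counts how often the hinted outcome matched reality. The short userspace sketch below is illustrative only (the caller mix in main() is invented; the policy constants mirror the UAPI values) and shows the hinted and unhinted forms side by side.

/* branch_hint_demo.c -- build with: gcc -O2 -o demo branch_hint_demo.c */
#include <stdio.h>

/* Same definitions the kernel uses (ignoring the branch-profiler variants). */
#define likely(x)	__builtin_expect(!!(x), 1)
#define unlikely(x)	__builtin_expect(!!(x), 0)

#define SCHED_NORMAL	0
#define SCHED_FIFO	1
#define SCHED_RR	2

/* Hinted variant: tells the compiler an RT policy is the rare case. */
static int rt_policy_hinted(int policy)
{
	if (unlikely(policy == SCHED_FIFO || policy == SCHED_RR))
		return 1;
	return 0;
}

/* Unhinted variant, as after the patch: let the CPU's branch predictor decide. */
static int rt_policy(int policy)
{
	if (policy == SCHED_FIFO || policy == SCHED_RR)
		return 1;
	return 0;
}

int main(void)
{
	/* A caller mix like sched_setscheduler()'s: mostly RT requests, i.e.
	 * exactly the case where the hint is wrong ~94% of the time above. */
	int policies[] = { SCHED_FIFO, SCHED_RR, SCHED_FIFO, SCHED_NORMAL };
	unsigned int i;

	for (i = 0; i < sizeof(policies) / sizeof(policies[0]); i++)
		printf("policy %d: rt=%d hinted=%d\n", policies[i],
		       rt_policy(policies[i]), rt_policy_hinted(policies[i]));
	return 0;
}

Compiled with optimization, the two variants differ only in which branch the compiler lays out as the straight-line path, which is why a hint that is wrong most of the time can easily hurt rather than help.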
Signed-off-by: Frederic Weisbecker Cc: Ingo Molnar Cc: Peter Zijlstra --- kernel/sched.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'kernel/sched.c') diff --git a/kernel/sched.c b/kernel/sched.c index fd18f395a1bf..01d9536aaa8e 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -8028,7 +8028,6 @@ static inline int preempt_count_equals(int preempt_offset) void __might_sleep(const char *file, int line, int preempt_offset) { -#ifdef in_atomic static unsigned long prev_jiffy; /* ratelimiting */ if ((preempt_count_equals(preempt_offset) && !irqs_disabled()) || @@ -8050,7 +8049,6 @@ void __might_sleep(const char *file, int line, int preempt_offset) if (irqs_disabled()) print_irqtrace_events(current); dump_stack(); -#endif } EXPORT_SYMBOL(__might_sleep); #endif -- cgit v1.2.3 From bdd4e85dc36cdbcfc1608a5b2a17c80a9db8986a Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Wed, 8 Jun 2011 01:13:27 +0200 Subject: sched: Isolate preempt counting in its own config option Create a new CONFIG_PREEMPT_COUNT that handles the inc/dec of the preempt count offset independently, so that the offset can be updated by preempt_disable() and preempt_enable() even without CONFIG_PREEMPT being set. This prepares for making CONFIG_DEBUG_SPINLOCK_SLEEP work with !CONFIG_PREEMPT, where it currently doesn't detect code that sleeps inside explicit preemption-disabled sections. Signed-off-by: Frederic Weisbecker Acked-by: Paul E. McKenney Cc: Ingo Molnar Cc: Peter Zijlstra --- include/linux/bit_spinlock.h | 2 +- include/linux/hardirq.h | 4 ++-- include/linux/pagemap.h | 4 ++-- include/linux/preempt.h | 26 +++++++++++++++++--------- include/linux/rcupdate.h | 12 ++++++------ include/linux/sched.h | 2 +- kernel/Kconfig.preempt | 3 +++ kernel/sched.c | 2 +- 8 files changed, 33 insertions(+), 22 deletions(-) (limited to 'kernel/sched.c') diff --git a/include/linux/bit_spinlock.h b/include/linux/bit_spinlock.h index b4326bfa684f..564d997e2168 100644 --- a/include/linux/bit_spinlock.h +++ b/include/linux/bit_spinlock.h @@ -88,7 +88,7 @@ static inline int bit_spin_is_locked(int bitnum, unsigned long *addr) { #if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK) return test_bit(bitnum, addr); -#elif defined CONFIG_PREEMPT +#elif defined CONFIG_PREEMPT_COUNT return preempt_count(); #else return 1; diff --git a/include/linux/hardirq.h b/include/linux/hardirq.h index ba362171e8ae..f743883f769e 100644 --- a/include/linux/hardirq.h +++ b/include/linux/hardirq.h @@ -93,7 +93,7 @@ */ #define in_nmi() (preempt_count() & NMI_MASK) -#if defined(CONFIG_PREEMPT) +#if defined(CONFIG_PREEMPT_COUNT) # define PREEMPT_CHECK_OFFSET 1 #else # define PREEMPT_CHECK_OFFSET 0 @@ -115,7 +115,7 @@ #define in_atomic_preempt_off() \ ((preempt_count() & ~PREEMPT_ACTIVE) != PREEMPT_CHECK_OFFSET) -#ifdef CONFIG_PREEMPT +#ifdef CONFIG_PREEMPT_COUNT # define preemptible() (preempt_count() == 0 && !irqs_disabled()) # define IRQ_EXIT_OFFSET (HARDIRQ_OFFSET-1) #else diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index 716875e53520..8e38d4c140ff 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -134,7 +134,7 @@ static inline int page_cache_get_speculative(struct page *page) VM_BUG_ON(in_interrupt()); #if !defined(CONFIG_SMP) && defined(CONFIG_TREE_RCU) -# ifdef CONFIG_PREEMPT +# ifdef CONFIG_PREEMPT_COUNT VM_BUG_ON(!in_atomic()); # endif /* @@ -172,7 +172,7 @@ static inline int page_cache_add_speculative(struct page *page, int count) VM_BUG_ON(in_interrupt()); #if !defined(CONFIG_SMP) &&
defined(CONFIG_TREE_RCU) -# ifdef CONFIG_PREEMPT +# ifdef CONFIG_PREEMPT_COUNT VM_BUG_ON(!in_atomic()); # endif VM_BUG_ON(page_count(page) == 0); diff --git a/include/linux/preempt.h b/include/linux/preempt.h index 2e681d9555bd..58969b2a8a82 100644 --- a/include/linux/preempt.h +++ b/include/linux/preempt.h @@ -27,6 +27,21 @@ asmlinkage void preempt_schedule(void); +#define preempt_check_resched() \ +do { \ + if (unlikely(test_thread_flag(TIF_NEED_RESCHED))) \ + preempt_schedule(); \ +} while (0) + +#else /* !CONFIG_PREEMPT */ + +#define preempt_check_resched() do { } while (0) + +#endif /* CONFIG_PREEMPT */ + + +#ifdef CONFIG_PREEMPT_COUNT + #define preempt_disable() \ do { \ inc_preempt_count(); \ @@ -39,12 +54,6 @@ do { \ dec_preempt_count(); \ } while (0) -#define preempt_check_resched() \ -do { \ - if (unlikely(test_thread_flag(TIF_NEED_RESCHED))) \ - preempt_schedule(); \ -} while (0) - #define preempt_enable() \ do { \ preempt_enable_no_resched(); \ @@ -80,18 +89,17 @@ do { \ preempt_check_resched(); \ } while (0) -#else +#else /* !CONFIG_PREEMPT_COUNT */ #define preempt_disable() do { } while (0) #define preempt_enable_no_resched() do { } while (0) #define preempt_enable() do { } while (0) -#define preempt_check_resched() do { } while (0) #define preempt_disable_notrace() do { } while (0) #define preempt_enable_no_resched_notrace() do { } while (0) #define preempt_enable_notrace() do { } while (0) -#endif +#endif /* CONFIG_PREEMPT_COUNT */ #ifdef CONFIG_PREEMPT_NOTIFIERS diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h index 99f9aa7c2804..8f4f881a0ad8 100644 --- a/include/linux/rcupdate.h +++ b/include/linux/rcupdate.h @@ -239,7 +239,7 @@ extern int rcu_read_lock_bh_held(void); * Check debug_lockdep_rcu_enabled() to prevent false positives during boot * and while lockdep is disabled. 
*/ -#ifdef CONFIG_PREEMPT +#ifdef CONFIG_PREEMPT_COUNT static inline int rcu_read_lock_sched_held(void) { int lockdep_opinion = 0; @@ -250,12 +250,12 @@ static inline int rcu_read_lock_sched_held(void) lockdep_opinion = lock_is_held(&rcu_sched_lock_map); return lockdep_opinion || preempt_count() != 0 || irqs_disabled(); } -#else /* #ifdef CONFIG_PREEMPT */ +#else /* #ifdef CONFIG_PREEMPT_COUNT */ static inline int rcu_read_lock_sched_held(void) { return 1; } -#endif /* #else #ifdef CONFIG_PREEMPT */ +#endif /* #else #ifdef CONFIG_PREEMPT_COUNT */ #else /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */ @@ -276,17 +276,17 @@ static inline int rcu_read_lock_bh_held(void) return 1; } -#ifdef CONFIG_PREEMPT +#ifdef CONFIG_PREEMPT_COUNT static inline int rcu_read_lock_sched_held(void) { return preempt_count() != 0 || irqs_disabled(); } -#else /* #ifdef CONFIG_PREEMPT */ +#else /* #ifdef CONFIG_PREEMPT_COUNT */ static inline int rcu_read_lock_sched_held(void) { return 1; } -#endif /* #else #ifdef CONFIG_PREEMPT */ +#endif /* #else #ifdef CONFIG_PREEMPT_COUNT */ #endif /* #else #ifdef CONFIG_DEBUG_LOCK_ALLOC */ diff --git a/include/linux/sched.h b/include/linux/sched.h index 483c1ed5bc4d..4ecd5cbe7e24 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -2502,7 +2502,7 @@ extern int _cond_resched(void); extern int __cond_resched_lock(spinlock_t *lock); -#ifdef CONFIG_PREEMPT +#ifdef CONFIG_PREEMPT_COUNT #define PREEMPT_LOCK_OFFSET PREEMPT_OFFSET #else #define PREEMPT_LOCK_OFFSET 0 diff --git a/kernel/Kconfig.preempt b/kernel/Kconfig.preempt index bf987b95b356..24e7cb0ba26a 100644 --- a/kernel/Kconfig.preempt +++ b/kernel/Kconfig.preempt @@ -35,6 +35,7 @@ config PREEMPT_VOLUNTARY config PREEMPT bool "Preemptible Kernel (Low-Latency Desktop)" + select PREEMPT_COUNT help This option reduces the latency of the kernel by making all kernel code (that is not executing in a critical section) @@ -52,3 +53,5 @@ config PREEMPT endchoice +config PREEMPT_COUNT + bool \ No newline at end of file diff --git a/kernel/sched.c b/kernel/sched.c index 01d9536aaa8e..90ad7cf2b290 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -2843,7 +2843,7 @@ void sched_fork(struct task_struct *p) #if defined(CONFIG_SMP) p->on_cpu = 0; #endif -#ifdef CONFIG_PREEMPT +#ifdef CONFIG_PREEMPT_COUNT /* Want to start with kernel preemption disabled. */ task_thread_info(p)->preempt_count = 1; #endif -- cgit v1.2.3 From d902db1eb60387040fe541573083e47469db50ac Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Wed, 8 Jun 2011 19:31:56 +0200 Subject: sched: Generalize sleep inside spinlock detection The sleeping inside spinlock detection is actually used for more general sleeping inside atomic sections debugging: preemption disabled, rcu read side critical sections, interrupts, interrupt disabled, etc... Change the name of the config and its help section to reflect its more general role. Signed-off-by: Frederic Weisbecker Acked-by: Paul E. 
McKenney Acked-by: Randy Dunlap Cc: Peter Zijlstra Cc: Ingo Molnar --- Documentation/DocBook/kernel-hacking.tmpl | 2 +- Documentation/SubmitChecklist | 2 +- Documentation/development-process/4.Coding | 2 +- Documentation/ja_JP/SubmitChecklist | 2 +- Documentation/zh_CN/SubmitChecklist | 2 +- include/linux/kernel.h | 2 +- kernel/sched.c | 2 +- lib/Kconfig.debug | 8 +++++--- 8 files changed, 12 insertions(+), 10 deletions(-) (limited to 'kernel/sched.c') diff --git a/Documentation/DocBook/kernel-hacking.tmpl b/Documentation/DocBook/kernel-hacking.tmpl index 7b3f49363413..07a9c48de5a2 100644 --- a/Documentation/DocBook/kernel-hacking.tmpl +++ b/Documentation/DocBook/kernel-hacking.tmpl @@ -409,7 +409,7 @@ cond_resched(); /* Will sleep */ You should always compile your kernel - CONFIG_DEBUG_SPINLOCK_SLEEP on, and it will warn + CONFIG_DEBUG_ATOMIC_SLEEP on, and it will warn you if you break these rules. If you do break the rules, you will eventually lock up your box. diff --git a/Documentation/SubmitChecklist b/Documentation/SubmitChecklist index da0382daa395..7b13be41c085 100644 --- a/Documentation/SubmitChecklist +++ b/Documentation/SubmitChecklist @@ -53,7 +53,7 @@ kernel patches. 12: Has been tested with CONFIG_PREEMPT, CONFIG_DEBUG_PREEMPT, CONFIG_DEBUG_SLAB, CONFIG_DEBUG_PAGEALLOC, CONFIG_DEBUG_MUTEXES, - CONFIG_DEBUG_SPINLOCK, CONFIG_DEBUG_SPINLOCK_SLEEP all simultaneously + CONFIG_DEBUG_SPINLOCK, CONFIG_DEBUG_ATOMIC_SLEEP all simultaneously enabled. 13: Has been build- and runtime tested with and without CONFIG_SMP and diff --git a/Documentation/development-process/4.Coding b/Documentation/development-process/4.Coding index f3f1a469443c..83f5f5b365a3 100644 --- a/Documentation/development-process/4.Coding +++ b/Documentation/development-process/4.Coding @@ -244,7 +244,7 @@ testing purposes. In particular, you should turn on: - DEBUG_SLAB can find a variety of memory allocation and use errors; it should be used on most development kernels. - - DEBUG_SPINLOCK, DEBUG_SPINLOCK_SLEEP, and DEBUG_MUTEXES will find a + - DEBUG_SPINLOCK, DEBUG_ATOMIC_SLEEP, and DEBUG_MUTEXES will find a number of common locking errors. 
There are quite a few other debugging options, some of which will be diff --git a/Documentation/ja_JP/SubmitChecklist b/Documentation/ja_JP/SubmitChecklist index 2df4576f1173..cb5507b1ac81 100644 --- a/Documentation/ja_JP/SubmitChecklist +++ b/Documentation/ja_JP/SubmitChecklist @@ -68,7 +68,7 @@ Linux カーネルパッチ投稿者向けチェックリスト 12: CONFIG_PREEMPT, CONFIG_DEBUG_PREEMPT, CONFIG_DEBUG_SLAB, CONFIG_DEBUG_PAGEALLOC, CONFIG_DEBUG_MUTEXES, CONFIG_DEBUG_SPINLOCK, - CONFIG_DEBUG_SPINLOCK_SLEEP これら全てを同時に有効にして動作確認を + CONFIG_DEBUG_ATOMIC_SLEEP これら全てを同時に有効にして動作確認を 行ってください。 13: CONFIG_SMP, CONFIG_PREEMPT を有効にした場合と無効にした場合の両方で diff --git a/Documentation/zh_CN/SubmitChecklist b/Documentation/zh_CN/SubmitChecklist index 951415bbab0c..4c741d6bc048 100644 --- a/Documentation/zh_CN/SubmitChecklist +++ b/Documentation/zh_CN/SubmitChecklist @@ -67,7 +67,7 @@ Linux 12ѾͨCONFIG_PREEMPT, CONFIG_DEBUG_PREEMPT, CONFIG_DEBUG_SLAB, CONFIG_DEBUG_PAGEALLOC, CONFIG_DEBUG_MUTEXES, - CONFIG_DEBUG_SPINLOCK, CONFIG_DEBUG_SPINLOCK_SLEEPԣͬʱ + CONFIG_DEBUG_SPINLOCK, CONFIG_DEBUG_ATOMIC_SLEEPԣͬʱ ʹܡ 13Ѿʹû߲ʹ CONFIG_SMP CONFIG_PREEMPTִʱ䡣 diff --git a/include/linux/kernel.h b/include/linux/kernel.h index fb0e7329fee1..24b489f66592 100644 --- a/include/linux/kernel.h +++ b/include/linux/kernel.h @@ -121,7 +121,7 @@ extern int _cond_resched(void); # define might_resched() do { } while (0) #endif -#ifdef CONFIG_DEBUG_SPINLOCK_SLEEP +#ifdef CONFIG_DEBUG_ATOMIC_SLEEP void __might_sleep(const char *file, int line, int preempt_offset); /** * might_sleep - annotation for functions that can sleep diff --git a/kernel/sched.c b/kernel/sched.c index 90ad7cf2b290..a5f318b8d659 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -8018,7 +8018,7 @@ void __init sched_init(void) scheduler_running = 1; } -#ifdef CONFIG_DEBUG_SPINLOCK_SLEEP +#ifdef CONFIG_DEBUG_ATOMIC_SLEEP static inline int preempt_count_equals(int preempt_offset) { int nested = (preempt_count() & ~PREEMPT_ACTIVE) + rcu_preempt_depth(); diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index a7dd7b547fea..81a4f3302bc8 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -648,13 +648,15 @@ config TRACE_IRQFLAGS Enables hooks to interrupt enabling and disabling for either tracing or lock debugging. -config DEBUG_SPINLOCK_SLEEP - bool "Spinlock debugging: sleep-inside-spinlock checking" +config DEBUG_ATOMIC_SLEEP + bool "Sleep inside atomic section checking" select PREEMPT_COUNT depends on DEBUG_KERNEL help If you say Y here, various routines which may sleep will become very - noisy if they are called with a spinlock held. + noisy if they are called inside atomic sections: when a spinlock is + held, inside an rcu read side critical section, inside preempt disabled + sections, inside an interrupt, etc... config DEBUG_LOCKING_API_SELFTESTS bool "Locking API boot-time self-tests" -- cgit v1.2.3 From 307bf9803f25a8a3f53c1012110fb74e2f893eb0 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 10 Jun 2011 15:08:55 +0200 Subject: sched: Simplify mutex_spin_on_owner() It does not make sense to rcu_read_lock/unlock() in every loop iteration while spinning on the mutex. Move the rcu protection outside the loop. Also simplify the return path to always check for lock->owner == NULL which meets the requirements of both owner changed and need_resched() caused loop exits. 
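The resulting kernel code appears in the hunk just below. As a reading aid, here is a toy, single-threaded userspace model of the simplified exit logic; the struct and function names are invented for the sketch and only stand in for the real scheduler state, and none of the RCU or barrier() subtleties of the real function are modelled.

/* spin_on_owner_model.c -- toy model of the simplified exit logic */
#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>

struct toy_task { int on_cpu; };
struct toy_mutex { struct toy_task *owner; };

static bool fake_need_resched(void) { return false; }	/* stub */

static bool owner_running(struct toy_mutex *lock, struct toy_task *owner)
{
	if (lock->owner != owner)	/* released, or taken over by someone else */
		return false;
	return owner->on_cpu;		/* owner still on a CPU: keep spinning */
}

static int spin_on_owner(struct toy_mutex *lock, struct toy_task *owner)
{
	/* In the patched kernel code a single rcu_read_lock() spans this whole
	 * loop instead of one lock/unlock pair per iteration. */
	while (owner_running(lock, owner)) {
		if (fake_need_resched())
			break;
		/* arch_mutex_cpu_relax() in the real code */
	}
	/*
	 * We got here because (a) the owner released the lock, (b) the owner
	 * changed, or (c) need_resched() fired.  Only (a) leaves owner == NULL,
	 * so this single test covers every exit path.
	 */
	return lock->owner == NULL;
}

int main(void)
{
	struct toy_task a = { .on_cpu = 0 };	/* an "owner" that has gone to sleep */
	struct toy_mutex released = { .owner = NULL };
	struct toy_mutex contended = { .owner = &a };

	printf("released lock:  keep trying = %d\n", spin_on_owner(&released, &a));
	printf("contended lock: keep trying = %d\n", spin_on_owner(&contended, &a));
	return 0;
}

Of the three ways out of the loop (lock released, owner changed, need_resched()), only a released lock leaves owner at NULL, so the single return expression distinguishes "worth trying to grab the lock now" from "heavy contention, give up".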
Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra Cc: Linus Torvalds Cc: Andrew Morton Link: http://lkml.kernel.org/r/alpine.LFD.2.02.1106101458350.11814@ionos Signed-off-by: Ingo Molnar --- kernel/sched.c | 25 +++++++++---------------- 1 file changed, 9 insertions(+), 16 deletions(-) (limited to 'kernel/sched.c') diff --git a/kernel/sched.c b/kernel/sched.c index 59252754fbe0..e355ee72e83f 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -4306,11 +4306,8 @@ EXPORT_SYMBOL(schedule); static inline bool owner_running(struct mutex *lock, struct task_struct *owner) { - bool ret = false; - - rcu_read_lock(); if (lock->owner != owner) - goto fail; + return false; /* * Ensure we emit the owner->on_cpu, dereference _after_ checking @@ -4320,11 +4317,7 @@ static inline bool owner_running(struct mutex *lock, struct task_struct *owner) */ barrier(); - ret = owner->on_cpu; -fail: - rcu_read_unlock(); - - return ret; + return owner->on_cpu; } /* @@ -4336,21 +4329,21 @@ int mutex_spin_on_owner(struct mutex *lock, struct task_struct *owner) if (!sched_feat(OWNER_SPIN)) return 0; + rcu_read_lock(); while (owner_running(lock, owner)) { if (need_resched()) - return 0; + break; arch_mutex_cpu_relax(); } + rcu_read_unlock(); /* - * If the owner changed to another task there is likely - * heavy contention, stop spinning. + * We break out the loop above on need_resched() and when the + * owner changed, which is a sign for heavy contention. Return + * success only when lock->owner is NULL. */ - if (lock->owner) - return 0; - - return 1; + return lock->owner == NULL; } #endif -- cgit v1.2.3 From 9763b67fb9f3050c6da739105888327587c30c4d Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 13 Jul 2011 13:09:25 +0200 Subject: sched, cgroup: Optimize load_balance_fair() Use for_each_leaf_cfs_rq() instead of list_for_each_entry_rcu(). This ensures that load_balance_fair() only iterates those task_groups that actually have tasks on busiest, and that we iterate bottom-up, trying to move light groups before the heavier ones. No idea if it will actually work out to be beneficial in practice; does anybody have a cgroup workload that might show a difference one way or the other? [ Also move update_h_load to sched_fair.c, losing #ifdef-ery ] Signed-off-by: Peter Zijlstra Reviewed-by: Paul Turner Link: http://lkml.kernel.org/r/1310557009.2586.28.camel@twins Signed-off-by: Ingo Molnar --- kernel/sched.c | 32 -------------------------------- kernel/sched_fair.c | 40 +++++++++++++++++++++++++++++++++++----- 2 files changed, 35 insertions(+), 37 deletions(-) (limited to 'kernel/sched.c') diff --git a/kernel/sched.c b/kernel/sched.c index b0e7ad796d3b..474f341d6f91 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -1568,38 +1568,6 @@ static unsigned long cpu_avg_load_per_task(int cpu) return rq->avg_load_per_task; } -#ifdef CONFIG_FAIR_GROUP_SCHED - -/* - * Compute the cpu's hierarchical load factor for each task group. - * This needs to be done in a top-down fashion because the load of a child - * group is a fraction of its parents load.
- */ -static int tg_load_down(struct task_group *tg, void *data) -{ - unsigned long load; - long cpu = (long)data; - - if (!tg->parent) { - load = cpu_rq(cpu)->load.weight; - } else { - load = tg->parent->cfs_rq[cpu]->h_load; - load *= tg->se[cpu]->load.weight; - load /= tg->parent->cfs_rq[cpu]->load.weight + 1; - } - - tg->cfs_rq[cpu]->h_load = load; - - return 0; -} - -static void update_h_load(long cpu) -{ - walk_tg_tree(tg_load_down, tg_nop, (void *)cpu); -} - -#endif - #ifdef CONFIG_PREEMPT static void double_rq_lock(struct rq *rq1, struct rq *rq2); diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 6cdff849fc19..180bcf1efa79 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -2232,11 +2232,43 @@ static void update_shares(int cpu) struct rq *rq = cpu_rq(cpu); rcu_read_lock(); + /* + * Iterates the task_group tree in a bottom up fashion, see + * list_add_leaf_cfs_rq() for details. + */ for_each_leaf_cfs_rq(rq, cfs_rq) update_shares_cpu(cfs_rq->tg, cpu); rcu_read_unlock(); } +/* + * Compute the cpu's hierarchical load factor for each task group. + * This needs to be done in a top-down fashion because the load of a child + * group is a fraction of its parents load. + */ +static int tg_load_down(struct task_group *tg, void *data) +{ + unsigned long load; + long cpu = (long)data; + + if (!tg->parent) { + load = cpu_rq(cpu)->load.weight; + } else { + load = tg->parent->cfs_rq[cpu]->h_load; + load *= tg->se[cpu]->load.weight; + load /= tg->parent->cfs_rq[cpu]->load.weight + 1; + } + + tg->cfs_rq[cpu]->h_load = load; + + return 0; +} + +static void update_h_load(long cpu) +{ + walk_tg_tree(tg_load_down, tg_nop, (void *)cpu); +} + static unsigned long load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest, unsigned long max_load_move, @@ -2244,14 +2276,12 @@ load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest, int *all_pinned) { long rem_load_move = max_load_move; - int busiest_cpu = cpu_of(busiest); - struct task_group *tg; + struct cfs_rq *busiest_cfs_rq; rcu_read_lock(); - update_h_load(busiest_cpu); + update_h_load(cpu_of(busiest)); - list_for_each_entry_rcu(tg, &task_groups, list) { - struct cfs_rq *busiest_cfs_rq = tg->cfs_rq[busiest_cpu]; + for_each_leaf_cfs_rq(busiest, busiest_cfs_rq) { unsigned long busiest_h_load = busiest_cfs_rq->h_load; unsigned long busiest_weight = busiest_cfs_rq->load.weight; u64 rem_load, moved_load; -- cgit v1.2.3 From 5f817d676b7b7ac4a29f5ed93063ae7a24550c12 Mon Sep 17 00:00:00 2001 From: Jan Schoenherr Date: Wed, 13 Jul 2011 20:13:31 +0200 Subject: sched: Fix (harmless) typo 'CONFG_FAIR_GROUP_SCHED' This patch fixes a typo located in a comment. 
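The hierarchical load factor that tg_load_down() computes in the patch above is easier to follow with concrete numbers. The toy userspace rendition below uses an invented two-level group tree and invented weights, but the arithmetic matches the hunk above: a child's h_load is its parent's h_load scaled by the child's share of the parent's runqueue weight, with the "+ 1" guarding against division by zero.

/* h_load_model.c -- toy model of the tg_load_down() arithmetic */
#include <stdio.h>

struct toy_tg {
	const char *name;
	struct toy_tg *parent;
	unsigned long se_weight;	/* this group's entity weight in its parent */
	unsigned long cfs_weight;	/* total weight queued inside this group    */
	unsigned long h_load;		/* computed hierarchical load factor        */
};

static void tg_load_down(struct toy_tg *tg, unsigned long rq_weight)
{
	if (!tg->parent)
		tg->h_load = rq_weight;		/* root group: the whole rq load */
	else
		tg->h_load = tg->parent->h_load * tg->se_weight /
			     (tg->parent->cfs_weight + 1);
}

int main(void)
{
	/* root holds two 1024-weight tasks plus group A's entity (1024) = 3072 */
	struct toy_tg root = { "root", NULL,  0,    3072, 0 };
	/* A holds one task plus group A/1's entity, 1024 each = 2048 */
	struct toy_tg a    = { "A",    &root, 1024, 2048, 0 };
	/* A/1 holds a single 1024-weight task */
	struct toy_tg a1   = { "A/1",  &a,    1024, 1024, 0 };
	struct toy_tg *walk[] = { &root, &a, &a1 };	/* top-down, as walk_tg_tree() visits them */
	unsigned int i;

	for (i = 0; i < 3; i++) {
		tg_load_down(walk[i], 3072);
		printf("%-4s h_load = %lu\n", walk[i]->name, walk[i]->h_load);
	}
	return 0;
}

This prints 3072, 1023 and 511: group A owns roughly a third of the root runqueue weight and A/1 owns half of A, which is exactly the top-down "fraction of its parent's load" behaviour described in the comment being moved.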
Signed-off-by: Jan Schoenherr Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1310580816-10861-2-git-send-email-schnhrr@cs.tu-berlin.de Signed-off-by: Ingo Molnar --- kernel/sched.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel/sched.c') diff --git a/kernel/sched.c b/kernel/sched.c index 474f341d6f91..3b3826ebe793 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -8362,7 +8362,7 @@ static inline void unregister_fair_sched_group(struct task_group *tg, int cpu) list_del_leaf_cfs_rq(tg->cfs_rq[cpu]); raw_spin_unlock_irqrestore(&rq->lock, flags); } -#else /* !CONFG_FAIR_GROUP_SCHED */ +#else /* !CONFIG_FAIR_GROUP_SCHED */ static inline void free_fair_sched_group(struct task_group *tg) { } -- cgit v1.2.3 From 99bc52429f11d1f4f81495ac8237085aaeb6bccf Mon Sep 17 00:00:00 2001 From: Bianca Lutz Date: Wed, 13 Jul 2011 20:13:36 +0200 Subject: sched: Do not attempt to destroy uninitialized rt_bandwidth If a task group is to be created and alloc_fair_sched_group() fails, then the rt_bandwidth of the corresponding task group is not yet initialized. The caller, sched_create_group(), starts a clean up procedure which calls free_rt_sched_group() which unconditionally destroys the not yet initialized rt_bandwidth. This crashes or hangs the system in lock_hrtimer_base(): UP systems dereference a NULL pointer, while SMP systems loop endlessly on a condition that cannot become true. This patch simply avoids the destruction of rt_bandwidth when the initialization code path was not reached. (This was discovered by accident with a custom kernel modification.) Signed-off-by: Bianca Lutz Signed-off-by: Jan Schoenherr Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1310580816-10861-7-git-send-email-schnhrr@cs.tu-berlin.de Signed-off-by: Ingo Molnar --- kernel/sched.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel/sched.c') diff --git a/kernel/sched.c b/kernel/sched.c index 3b3826ebe793..f107204db53f 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -8383,7 +8383,8 @@ static void free_rt_sched_group(struct task_group *tg) { int i; - destroy_rt_bandwidth(&tg->rt_bandwidth); + if (tg->rt_se) + destroy_rt_bandwidth(&tg->rt_bandwidth); for_each_possible_cpu(i) { if (tg->rt_rq) -- cgit v1.2.3 From 26a148eb9c790149750f7e77da0d96029443d400 Mon Sep 17 00:00:00 2001 From: Richard Kennedy Date: Fri, 15 Jul 2011 11:41:31 +0100 Subject: sched: Reorder root_domain to remove 64 bit alignment padding Reorder root_domain to remove 8 bytes of alignment padding on 64 bit builds, this shrinks the size from 1736 to 1728 bytes, therefore using one fewer cachelines. Signed-off-by: Richard Kennedy Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1310726492.1977.5.camel@castor.rsk Signed-off-by: Ingo Molnar --- kernel/sched.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel/sched.c') diff --git a/kernel/sched.c b/kernel/sched.c index f107204db53f..e3f0bac05270 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -422,6 +422,7 @@ struct rt_rq { */ struct root_domain { atomic_t refcount; + atomic_t rto_count; struct rcu_head rcu; cpumask_var_t span; cpumask_var_t online; @@ -431,7 +432,6 @@ struct root_domain { * one runnable RT task. */ cpumask_var_t rto_mask; - atomic_t rto_count; struct cpupri cpupri; }; -- cgit v1.2.3 From acb5a9ba3bd7cd8b3264f67a3789a9587d3b935b Mon Sep 17 00:00:00 2001 From: "Jan H. 
Schönherr" Date: Thu, 14 Jul 2011 18:32:43 +0200 Subject: sched: Separate group-scheduling code more clearly MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Clean up cfs/rt runqueue initialization by moving group scheduling related code into the corresponding functions. Also, keep group scheduling as an add-on, so that things are only done additionally, i. e. remove the init_*_rq() calls from init_tg_*_entry(). (This removes a redundant initalization during sched_init()). In case of group scheduling rt_rq->highest_prio.curr is now initialized twice, but adding another #ifdef seems not worth it. Signed-off-by: Jan H. Schönherr Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1310661163-16606-1-git-send-email-schnhrr@cs.tu-berlin.de Signed-off-by: Ingo Molnar --- kernel/sched.c | 43 +++++++++++++++++++------------------------ 1 file changed, 19 insertions(+), 24 deletions(-) (limited to 'kernel/sched.c') diff --git a/kernel/sched.c b/kernel/sched.c index e3f0bac05270..6fdf7ffbebc6 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -7859,17 +7859,10 @@ int in_sched_functions(unsigned long addr) && addr < (unsigned long)__sched_text_end); } -static void init_cfs_rq(struct cfs_rq *cfs_rq, struct rq *rq) +static void init_cfs_rq(struct cfs_rq *cfs_rq) { cfs_rq->tasks_timeline = RB_ROOT; INIT_LIST_HEAD(&cfs_rq->tasks); -#ifdef CONFIG_FAIR_GROUP_SCHED - cfs_rq->rq = rq; - /* allow initial update_cfs_load() to truncate */ -#ifdef CONFIG_SMP - cfs_rq->load_stamp = 1; -#endif -#endif cfs_rq->min_vruntime = (u64)(-(1LL << 20)); #ifndef CONFIG_64BIT cfs_rq->min_vruntime_copy = cfs_rq->min_vruntime; @@ -7889,13 +7882,9 @@ static void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq) /* delimiter for bitsearch: */ __set_bit(MAX_RT_PRIO, array->bitmap); -#if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED +#if defined CONFIG_SMP rt_rq->highest_prio.curr = MAX_RT_PRIO; -#ifdef CONFIG_SMP rt_rq->highest_prio.next = MAX_RT_PRIO; -#endif -#endif -#ifdef CONFIG_SMP rt_rq->rt_nr_migratory = 0; rt_rq->overloaded = 0; plist_head_init_raw(&rt_rq->pushable_tasks, &rq->lock); @@ -7905,11 +7894,6 @@ static void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq) rt_rq->rt_throttled = 0; rt_rq->rt_runtime = 0; raw_spin_lock_init(&rt_rq->rt_runtime_lock); - -#ifdef CONFIG_RT_GROUP_SCHED - rt_rq->rt_nr_boosted = 0; - rt_rq->rq = rq; -#endif } #ifdef CONFIG_FAIR_GROUP_SCHED @@ -7918,11 +7902,17 @@ static void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq, struct sched_entity *parent) { struct rq *rq = cpu_rq(cpu); - tg->cfs_rq[cpu] = cfs_rq; - init_cfs_rq(cfs_rq, rq); + cfs_rq->tg = tg; + cfs_rq->rq = rq; +#ifdef CONFIG_SMP + /* allow initial update_cfs_load() to truncate */ + cfs_rq->load_stamp = 1; +#endif + tg->cfs_rq[cpu] = cfs_rq; tg->se[cpu] = se; + /* se could be NULL for root_task_group */ if (!se) return; @@ -7945,12 +7935,14 @@ static void init_tg_rt_entry(struct task_group *tg, struct rt_rq *rt_rq, { struct rq *rq = cpu_rq(cpu); - tg->rt_rq[cpu] = rt_rq; - init_rt_rq(rt_rq, rq); + rt_rq->highest_prio.curr = MAX_RT_PRIO; + rt_rq->rt_nr_boosted = 0; + rt_rq->rq = rq; rt_rq->tg = tg; - rt_rq->rt_runtime = tg->rt_bandwidth.rt_runtime; + tg->rt_rq[cpu] = rt_rq; tg->rt_se[cpu] = rt_se; + if (!rt_se) return; @@ -8032,7 +8024,7 @@ void __init sched_init(void) rq->nr_running = 0; rq->calc_load_active = 0; rq->calc_load_update = jiffies + LOAD_FREQ; - init_cfs_rq(&rq->cfs, rq); + init_cfs_rq(&rq->cfs); init_rt_rq(&rq->rt, rq); #ifdef 
CONFIG_FAIR_GROUP_SCHED root_task_group.shares = root_task_group_load; @@ -8335,6 +8327,7 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent) if (!se) goto err_free_rq; + init_cfs_rq(cfs_rq); init_tg_cfs_entry(tg, cfs_rq, se, i, parent->se[i]); } @@ -8425,6 +8418,8 @@ int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent) if (!rt_se) goto err_free_rq; + init_rt_rq(rt_rq, cpu_rq(i)); + rt_rq->rt_runtime = tg->rt_bandwidth.rt_runtime; init_tg_rt_entry(tg, rt_rq, rt_se, i, parent->rt_se[i]); } -- cgit v1.2.3
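To close, the root_domain reordering patch earlier in this series relies on a plain C alignment effect that is easy to reproduce in userspace. The structs below are simplified stand-ins, not the real root_domain (cpumask_var_t, struct rcu_head and struct cpupri are replaced by plain pointers), but they show the same mechanism: on an LP64 target, pairing the two ints removes 8 bytes of padding.

/* padding_demo.c -- same members, different order, different sizeof */
#include <stdio.h>

struct padded {
	int refcount;		/* 4 bytes + 4 bytes of padding before the pointer */
	void *rcu;		/* 8 bytes */
	void *span;		/* 8 bytes */
	void *online;		/* 8 bytes */
	void *rto_mask;		/* 8 bytes */
	int rto_count;		/* 4 bytes + 4 bytes of tail padding */
};				/* typically 48 bytes on LP64 */

struct reordered {
	int refcount;		/* 4 bytes */
	int rto_count;		/* 4 bytes: shares the slot, no hole */
	void *rcu;
	void *span;
	void *online;
	void *rto_mask;
};				/* typically 40 bytes on LP64 */

int main(void)
{
	printf("padded:    %zu bytes\n", sizeof(struct padded));
	printf("reordered: %zu bytes\n", sizeof(struct reordered));
	return 0;
}

The exact figures depend on the ABI, which is why the 1736 -> 1728 byte number in that commit message is specific to a 64-bit build; on real kernel objects the pahole tool reports such holes directly.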