From 63f01241176d7cbc976385aec32f0a209b0bc36a Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 6 Dec 2010 14:48:10 -0500 Subject: sched: Remove unlikely() from rt_policy() in sched.c The rt_policy() function has an unlikely() annotation asserting that the policy it is checking is of RT priority (SCHED_FIFO or SCHED_RR). According to the annotate branch profiler it is incorrect most of the time: correct incorrect % Function File Line ------- --------- - -------- ---- ---- 36667 654674 94 rt_policy sched.c 126 This makes sense because rt_policy() is used by sched_setscheduler() and nice(). Although users may use sys_nice() a bit, all RT users use sched_setscheduler() to set their RT priority, including kernel threads. The above numbers were from a normal desktop computer that was running firefox, evolution and xchat, and also served as part of a distcc compile farm. Cc: Peter Zijlstra Signed-off-by: Steven Rostedt --- kernel/sched.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel/sched.c') diff --git a/kernel/sched.c b/kernel/sched.c index dc91a4d09ac3..269a0450281c 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -123,7 +123,7 @@ static inline int rt_policy(int policy) { - if (unlikely(policy == SCHED_FIFO || policy == SCHED_RR)) + if (policy == SCHED_FIFO || policy == SCHED_RR) return 1; return 0; } -- cgit v1.2.3 From e69c634190dc724ef2d845ace8d783031d3e492e Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 6 Dec 2010 17:10:31 -0500 Subject: sched: Remove unlikely() from ttwu_post_activation The unlikely() used in ttwu_post_activation() tests if rq->idle_stamp is set. But since this is for a wakeup, wakeups happen when tasks block on IO, and blocking on IO may put the system into idle, this can actually be a common occurrence. Running the annotated branch profiler on an average desktop running firefox, evolution, xchat and distcc, the report shows: correct incorrect % Function File Line ------- --------- - -------- ---- ---- 34884862 146110926 80 ttwu_post_activation sched.c 2309 80% of the time, this unlikely() is incorrect. It is best not to assume what the result will be, so just remove the branch annotation. Cc: Peter Zijlstra Signed-off-by: Steven Rostedt --- kernel/sched.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel/sched.c') diff --git a/kernel/sched.c b/kernel/sched.c index 269a0450281c..6d24b2e8d82d 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -2458,7 +2458,7 @@ static inline void ttwu_post_activation(struct task_struct *p, struct rq *rq, if (p->sched_class->task_woken) p->sched_class->task_woken(rq, p); - if (unlikely(rq->idle_stamp)) { + if (rq->idle_stamp) { u64 delta = rq->clock - rq->idle_stamp; u64 max = 2*sysctl_sched_migration_cost; -- cgit v1.2.3 From 2da8c8bc44b572cbf623629ff736608dc7968436 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Tue, 7 Jun 2011 22:53:39 +0200 Subject: sched: Remove pointless in_atomic() definition check in_atomic() is really supposed to be defined here. If it is not, then we actually want the build to break so that we know about it, rather than keeping the breakage silent.
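For readers unfamiliar with the annotation the first two patches remove: in the kernel, likely() and unlikely() normally expand to GCC's __builtin_expect(), and the branch profiler quoted above simply counts how often the hinted outcome matched reality. The short userspace sketch below is illustrative only (the caller mix in main() is invented; the policy constants mirror the UAPI values) and shows the hinted and unhinted forms side by side.

/* branch_hint_demo.c -- build with: gcc -O2 -o demo branch_hint_demo.c */
#include <stdio.h>

/* Same definitions the kernel uses (ignoring the branch-profiler variants). */
#define likely(x)	__builtin_expect(!!(x), 1)
#define unlikely(x)	__builtin_expect(!!(x), 0)

#define SCHED_NORMAL	0
#define SCHED_FIFO	1
#define SCHED_RR	2

/* Hinted variant: tells the compiler an RT policy is the rare case. */
static int rt_policy_hinted(int policy)
{
	if (unlikely(policy == SCHED_FIFO || policy == SCHED_RR))
		return 1;
	return 0;
}

/* Unhinted variant, as after the patch: let the CPU's branch predictor decide. */
static int rt_policy(int policy)
{
	if (policy == SCHED_FIFO || policy == SCHED_RR)
		return 1;
	return 0;
}

int main(void)
{
	/* A caller mix like sched_setscheduler()'s: mostly RT requests, i.e.
	 * exactly the case where the hint is wrong ~94% of the time above. */
	int policies[] = { SCHED_FIFO, SCHED_RR, SCHED_FIFO, SCHED_NORMAL };
	unsigned int i;

	for (i = 0; i < sizeof(policies) / sizeof(policies[0]); i++)
		printf("policy %d: rt=%d hinted=%d\n", policies[i],
		       rt_policy(policies[i]), rt_policy_hinted(policies[i]));
	return 0;
}

Compiled with optimization, the two variants differ only in which branch the compiler lays out as the straight-line path, which is why a hint that is wrong most of the time can easily hurt rather than help.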
Signed-off-by: Frederic Weisbecker Cc: Ingo Molnar Cc: Peter Zijlstra --- kernel/sched.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'kernel/sched.c') diff --git a/kernel/sched.c b/kernel/sched.c index fd18f395a1bf..01d9536aaa8e 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -8028,7 +8028,6 @@ static inline int preempt_count_equals(int preempt_offset) void __might_sleep(const char *file, int line, int preempt_offset) { -#ifdef in_atomic static unsigned long prev_jiffy; /* ratelimiting */ if ((preempt_count_equals(preempt_offset) && !irqs_disabled()) || @@ -8050,7 +8049,6 @@ void __might_sleep(const char *file, int line, int preempt_offset) if (irqs_disabled()) print_irqtrace_events(current); dump_stack(); -#endif } EXPORT_SYMBOL(__might_sleep); #endif -- cgit v1.2.3 From bdd4e85dc36cdbcfc1608a5b2a17c80a9db8986a Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Wed, 8 Jun 2011 01:13:27 +0200 Subject: sched: Isolate preempt counting in its own config option Create a new CONFIG_PREEMPT_COUNT that handles the inc/dec of the preempt count offset independently, so that the offset can be updated by preempt_disable() and preempt_enable() even without CONFIG_PREEMPT being set. This prepares for making CONFIG_DEBUG_SPINLOCK_SLEEP work with !CONFIG_PREEMPT, where it currently doesn't detect code that sleeps inside explicit preemption-disabled sections. Signed-off-by: Frederic Weisbecker Acked-by: Paul E. McKenney Cc: Ingo Molnar Cc: Peter Zijlstra --- include/linux/bit_spinlock.h | 2 +- include/linux/hardirq.h | 4 ++-- include/linux/pagemap.h | 4 ++-- include/linux/preempt.h | 26 +++++++++++++++++--------- include/linux/rcupdate.h | 12 ++++++------ include/linux/sched.h | 2 +- kernel/Kconfig.preempt | 3 +++ kernel/sched.c | 2 +- 8 files changed, 33 insertions(+), 22 deletions(-) (limited to 'kernel/sched.c') diff --git a/include/linux/bit_spinlock.h b/include/linux/bit_spinlock.h index b4326bfa684f..564d997e2168 100644 --- a/include/linux/bit_spinlock.h +++ b/include/linux/bit_spinlock.h @@ -88,7 +88,7 @@ static inline int bit_spin_is_locked(int bitnum, unsigned long *addr) { #if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK) return test_bit(bitnum, addr); -#elif defined CONFIG_PREEMPT +#elif defined CONFIG_PREEMPT_COUNT return preempt_count(); #else return 1; diff --git a/include/linux/hardirq.h b/include/linux/hardirq.h index ba362171e8ae..f743883f769e 100644 --- a/include/linux/hardirq.h +++ b/include/linux/hardirq.h @@ -93,7 +93,7 @@ */ #define in_nmi() (preempt_count() & NMI_MASK) -#if defined(CONFIG_PREEMPT) +#if defined(CONFIG_PREEMPT_COUNT) # define PREEMPT_CHECK_OFFSET 1 #else # define PREEMPT_CHECK_OFFSET 0 @@ -115,7 +115,7 @@ #define in_atomic_preempt_off() \ ((preempt_count() & ~PREEMPT_ACTIVE) != PREEMPT_CHECK_OFFSET) -#ifdef CONFIG_PREEMPT +#ifdef CONFIG_PREEMPT_COUNT # define preemptible() (preempt_count() == 0 && !irqs_disabled()) # define IRQ_EXIT_OFFSET (HARDIRQ_OFFSET-1) #else diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index 716875e53520..8e38d4c140ff 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -134,7 +134,7 @@ static inline int page_cache_get_speculative(struct page *page) VM_BUG_ON(in_interrupt()); #if !defined(CONFIG_SMP) && defined(CONFIG_TREE_RCU) -# ifdef CONFIG_PREEMPT +# ifdef CONFIG_PREEMPT_COUNT VM_BUG_ON(!in_atomic()); # endif /* @@ -172,7 +172,7 @@ static inline int page_cache_add_speculative(struct page *page, int count) VM_BUG_ON(in_interrupt()); #if !defined(CONFIG_SMP) &&
defined(CONFIG_TREE_RCU) -# ifdef CONFIG_PREEMPT +# ifdef CONFIG_PREEMPT_COUNT VM_BUG_ON(!in_atomic()); # endif VM_BUG_ON(page_count(page) == 0); diff --git a/include/linux/preempt.h b/include/linux/preempt.h index 2e681d9555bd..58969b2a8a82 100644 --- a/include/linux/preempt.h +++ b/include/linux/preempt.h @@ -27,6 +27,21 @@ asmlinkage void preempt_schedule(void); +#define preempt_check_resched() \ +do { \ + if (unlikely(test_thread_flag(TIF_NEED_RESCHED))) \ + preempt_schedule(); \ +} while (0) + +#else /* !CONFIG_PREEMPT */ + +#define preempt_check_resched() do { } while (0) + +#endif /* CONFIG_PREEMPT */ + + +#ifdef CONFIG_PREEMPT_COUNT + #define preempt_disable() \ do { \ inc_preempt_count(); \ @@ -39,12 +54,6 @@ do { \ dec_preempt_count(); \ } while (0) -#define preempt_check_resched() \ -do { \ - if (unlikely(test_thread_flag(TIF_NEED_RESCHED))) \ - preempt_schedule(); \ -} while (0) - #define preempt_enable() \ do { \ preempt_enable_no_resched(); \ @@ -80,18 +89,17 @@ do { \ preempt_check_resched(); \ } while (0) -#else +#else /* !CONFIG_PREEMPT_COUNT */ #define preempt_disable() do { } while (0) #define preempt_enable_no_resched() do { } while (0) #define preempt_enable() do { } while (0) -#define preempt_check_resched() do { } while (0) #define preempt_disable_notrace() do { } while (0) #define preempt_enable_no_resched_notrace() do { } while (0) #define preempt_enable_notrace() do { } while (0) -#endif +#endif /* CONFIG_PREEMPT_COUNT */ #ifdef CONFIG_PREEMPT_NOTIFIERS diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h index 99f9aa7c2804..8f4f881a0ad8 100644 --- a/include/linux/rcupdate.h +++ b/include/linux/rcupdate.h @@ -239,7 +239,7 @@ extern int rcu_read_lock_bh_held(void); * Check debug_lockdep_rcu_enabled() to prevent false positives during boot * and while lockdep is disabled. 
*/ -#ifdef CONFIG_PREEMPT +#ifdef CONFIG_PREEMPT_COUNT static inline int rcu_read_lock_sched_held(void) { int lockdep_opinion = 0; @@ -250,12 +250,12 @@ static inline int rcu_read_lock_sched_held(void) lockdep_opinion = lock_is_held(&rcu_sched_lock_map); return lockdep_opinion || preempt_count() != 0 || irqs_disabled(); } -#else /* #ifdef CONFIG_PREEMPT */ +#else /* #ifdef CONFIG_PREEMPT_COUNT */ static inline int rcu_read_lock_sched_held(void) { return 1; } -#endif /* #else #ifdef CONFIG_PREEMPT */ +#endif /* #else #ifdef CONFIG_PREEMPT_COUNT */ #else /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */ @@ -276,17 +276,17 @@ static inline int rcu_read_lock_bh_held(void) return 1; } -#ifdef CONFIG_PREEMPT +#ifdef CONFIG_PREEMPT_COUNT static inline int rcu_read_lock_sched_held(void) { return preempt_count() != 0 || irqs_disabled(); } -#else /* #ifdef CONFIG_PREEMPT */ +#else /* #ifdef CONFIG_PREEMPT_COUNT */ static inline int rcu_read_lock_sched_held(void) { return 1; } -#endif /* #else #ifdef CONFIG_PREEMPT */ +#endif /* #else #ifdef CONFIG_PREEMPT_COUNT */ #endif /* #else #ifdef CONFIG_DEBUG_LOCK_ALLOC */ diff --git a/include/linux/sched.h b/include/linux/sched.h index 483c1ed5bc4d..4ecd5cbe7e24 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -2502,7 +2502,7 @@ extern int _cond_resched(void); extern int __cond_resched_lock(spinlock_t *lock); -#ifdef CONFIG_PREEMPT +#ifdef CONFIG_PREEMPT_COUNT #define PREEMPT_LOCK_OFFSET PREEMPT_OFFSET #else #define PREEMPT_LOCK_OFFSET 0 diff --git a/kernel/Kconfig.preempt b/kernel/Kconfig.preempt index bf987b95b356..24e7cb0ba26a 100644 --- a/kernel/Kconfig.preempt +++ b/kernel/Kconfig.preempt @@ -35,6 +35,7 @@ config PREEMPT_VOLUNTARY config PREEMPT bool "Preemptible Kernel (Low-Latency Desktop)" + select PREEMPT_COUNT help This option reduces the latency of the kernel by making all kernel code (that is not executing in a critical section) @@ -52,3 +53,5 @@ config PREEMPT endchoice +config PREEMPT_COUNT + bool \ No newline at end of file diff --git a/kernel/sched.c b/kernel/sched.c index 01d9536aaa8e..90ad7cf2b290 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -2843,7 +2843,7 @@ void sched_fork(struct task_struct *p) #if defined(CONFIG_SMP) p->on_cpu = 0; #endif -#ifdef CONFIG_PREEMPT +#ifdef CONFIG_PREEMPT_COUNT /* Want to start with kernel preemption disabled. */ task_thread_info(p)->preempt_count = 1; #endif -- cgit v1.2.3 From d902db1eb60387040fe541573083e47469db50ac Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Wed, 8 Jun 2011 19:31:56 +0200 Subject: sched: Generalize sleep inside spinlock detection The sleeping inside spinlock detection is actually used for more general sleeping inside atomic sections debugging: preemption disabled, rcu read side critical sections, interrupts, interrupt disabled, etc... Change the name of the config and its help section to reflect its more general role. Signed-off-by: Frederic Weisbecker Acked-by: Paul E. 
McKenney Acked-by: Randy Dunlap Cc: Peter Zijlstra Cc: Ingo Molnar --- Documentation/DocBook/kernel-hacking.tmpl | 2 +- Documentation/SubmitChecklist | 2 +- Documentation/development-process/4.Coding | 2 +- Documentation/ja_JP/SubmitChecklist | 2 +- Documentation/zh_CN/SubmitChecklist | 2 +- include/linux/kernel.h | 2 +- kernel/sched.c | 2 +- lib/Kconfig.debug | 8 +++++--- 8 files changed, 12 insertions(+), 10 deletions(-) (limited to 'kernel/sched.c') diff --git a/Documentation/DocBook/kernel-hacking.tmpl b/Documentation/DocBook/kernel-hacking.tmpl index 7b3f49363413..07a9c48de5a2 100644 --- a/Documentation/DocBook/kernel-hacking.tmpl +++ b/Documentation/DocBook/kernel-hacking.tmpl @@ -409,7 +409,7 @@ cond_resched(); /* Will sleep */ You should always compile your kernel - CONFIG_DEBUG_SPINLOCK_SLEEP on, and it will warn + CONFIG_DEBUG_ATOMIC_SLEEP on, and it will warn you if you break these rules. If you do break the rules, you will eventually lock up your box. diff --git a/Documentation/SubmitChecklist b/Documentation/SubmitChecklist index da0382daa395..7b13be41c085 100644 --- a/Documentation/SubmitChecklist +++ b/Documentation/SubmitChecklist @@ -53,7 +53,7 @@ kernel patches. 12: Has been tested with CONFIG_PREEMPT, CONFIG_DEBUG_PREEMPT, CONFIG_DEBUG_SLAB, CONFIG_DEBUG_PAGEALLOC, CONFIG_DEBUG_MUTEXES, - CONFIG_DEBUG_SPINLOCK, CONFIG_DEBUG_SPINLOCK_SLEEP all simultaneously + CONFIG_DEBUG_SPINLOCK, CONFIG_DEBUG_ATOMIC_SLEEP all simultaneously enabled. 13: Has been build- and runtime tested with and without CONFIG_SMP and diff --git a/Documentation/development-process/4.Coding b/Documentation/development-process/4.Coding index f3f1a469443c..83f5f5b365a3 100644 --- a/Documentation/development-process/4.Coding +++ b/Documentation/development-process/4.Coding @@ -244,7 +244,7 @@ testing purposes. In particular, you should turn on: - DEBUG_SLAB can find a variety of memory allocation and use errors; it should be used on most development kernels. - - DEBUG_SPINLOCK, DEBUG_SPINLOCK_SLEEP, and DEBUG_MUTEXES will find a + - DEBUG_SPINLOCK, DEBUG_ATOMIC_SLEEP, and DEBUG_MUTEXES will find a number of common locking errors. 
There are quite a few other debugging options, some of which will be diff --git a/Documentation/ja_JP/SubmitChecklist b/Documentation/ja_JP/SubmitChecklist index 2df4576f1173..cb5507b1ac81 100644 --- a/Documentation/ja_JP/SubmitChecklist +++ b/Documentation/ja_JP/SubmitChecklist @@ -68,7 +68,7 @@ Linux カーネルパッチ投稿者向けチェックリスト 12: CONFIG_PREEMPT, CONFIG_DEBUG_PREEMPT, CONFIG_DEBUG_SLAB, CONFIG_DEBUG_PAGEALLOC, CONFIG_DEBUG_MUTEXES, CONFIG_DEBUG_SPINLOCK, - CONFIG_DEBUG_SPINLOCK_SLEEP これら全てを同時に有効にして動作確認を + CONFIG_DEBUG_ATOMIC_SLEEP これら全てを同時に有効にして動作確認を 行ってください。 13: CONFIG_SMP, CONFIG_PREEMPT を有効にした場合と無効にした場合の両方で diff --git a/Documentation/zh_CN/SubmitChecklist b/Documentation/zh_CN/SubmitChecklist index 951415bbab0c..4c741d6bc048 100644 --- a/Documentation/zh_CN/SubmitChecklist +++ b/Documentation/zh_CN/SubmitChecklist @@ -67,7 +67,7 @@ Linux 12ѾͨCONFIG_PREEMPT, CONFIG_DEBUG_PREEMPT, CONFIG_DEBUG_SLAB, CONFIG_DEBUG_PAGEALLOC, CONFIG_DEBUG_MUTEXES, - CONFIG_DEBUG_SPINLOCK, CONFIG_DEBUG_SPINLOCK_SLEEPԣͬʱ + CONFIG_DEBUG_SPINLOCK, CONFIG_DEBUG_ATOMIC_SLEEPԣͬʱ ʹܡ 13Ѿʹû߲ʹ CONFIG_SMP CONFIG_PREEMPTִʱ䡣 diff --git a/include/linux/kernel.h b/include/linux/kernel.h index fb0e7329fee1..24b489f66592 100644 --- a/include/linux/kernel.h +++ b/include/linux/kernel.h @@ -121,7 +121,7 @@ extern int _cond_resched(void); # define might_resched() do { } while (0) #endif -#ifdef CONFIG_DEBUG_SPINLOCK_SLEEP +#ifdef CONFIG_DEBUG_ATOMIC_SLEEP void __might_sleep(const char *file, int line, int preempt_offset); /** * might_sleep - annotation for functions that can sleep diff --git a/kernel/sched.c b/kernel/sched.c index 90ad7cf2b290..a5f318b8d659 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -8018,7 +8018,7 @@ void __init sched_init(void) scheduler_running = 1; } -#ifdef CONFIG_DEBUG_SPINLOCK_SLEEP +#ifdef CONFIG_DEBUG_ATOMIC_SLEEP static inline int preempt_count_equals(int preempt_offset) { int nested = (preempt_count() & ~PREEMPT_ACTIVE) + rcu_preempt_depth(); diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index a7dd7b547fea..81a4f3302bc8 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -648,13 +648,15 @@ config TRACE_IRQFLAGS Enables hooks to interrupt enabling and disabling for either tracing or lock debugging. -config DEBUG_SPINLOCK_SLEEP - bool "Spinlock debugging: sleep-inside-spinlock checking" +config DEBUG_ATOMIC_SLEEP + bool "Sleep inside atomic section checking" select PREEMPT_COUNT depends on DEBUG_KERNEL help If you say Y here, various routines which may sleep will become very - noisy if they are called with a spinlock held. + noisy if they are called inside atomic sections: when a spinlock is + held, inside an rcu read side critical section, inside preempt disabled + sections, inside an interrupt, etc... config DEBUG_LOCKING_API_SELFTESTS bool "Locking API boot-time self-tests" -- cgit v1.2.3 From 307bf9803f25a8a3f53c1012110fb74e2f893eb0 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 10 Jun 2011 15:08:55 +0200 Subject: sched: Simplify mutex_spin_on_owner() It does not make sense to rcu_read_lock/unlock() in every loop iteration while spinning on the mutex. Move the rcu protection outside the loop. Also simplify the return path to always check for lock->owner == NULL which meets the requirements of both owner changed and need_resched() caused loop exits. 
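The resulting kernel code appears in the hunk just below. As a reading aid, here is a toy, single-threaded userspace model of the simplified exit logic; the struct and function names are invented for the sketch and only stand in for the real scheduler state, and none of the RCU or barrier() subtleties of the real function are modelled.

/* spin_on_owner_model.c -- toy model of the simplified exit logic */
#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>

struct toy_task { int on_cpu; };
struct toy_mutex { struct toy_task *owner; };

static bool fake_need_resched(void) { return false; }	/* stub */

static bool owner_running(struct toy_mutex *lock, struct toy_task *owner)
{
	if (lock->owner != owner)	/* released, or taken over by someone else */
		return false;
	return owner->on_cpu;		/* owner still on a CPU: keep spinning */
}

static int spin_on_owner(struct toy_mutex *lock, struct toy_task *owner)
{
	/* In the patched kernel code a single rcu_read_lock() spans this whole
	 * loop instead of one lock/unlock pair per iteration. */
	while (owner_running(lock, owner)) {
		if (fake_need_resched())
			break;
		/* arch_mutex_cpu_relax() in the real code */
	}
	/*
	 * We got here because (a) the owner released the lock, (b) the owner
	 * changed, or (c) need_resched() fired.  Only (a) leaves owner == NULL,
	 * so this single test covers every exit path.
	 */
	return lock->owner == NULL;
}

int main(void)
{
	struct toy_task a = { .on_cpu = 0 };	/* an "owner" that has gone to sleep */
	struct toy_mutex released = { .owner = NULL };
	struct toy_mutex contended = { .owner = &a };

	printf("released lock:  keep trying = %d\n", spin_on_owner(&released, &a));
	printf("contended lock: keep trying = %d\n", spin_on_owner(&contended, &a));
	return 0;
}

Of the three ways out of the loop (lock released, owner changed, need_resched()), only a released lock leaves owner at NULL, so the single return expression distinguishes "worth trying to grab the lock now" from "heavy contention, give up".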
Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra Cc: Linus Torvalds Cc: Andrew Morton Link: http://lkml.kernel.org/r/alpine.LFD.2.02.1106101458350.11814@ionos Signed-off-by: Ingo Molnar --- kernel/sched.c | 25 +++++++++---------------- 1 file changed, 9 insertions(+), 16 deletions(-) (limited to 'kernel/sched.c') diff --git a/kernel/sched.c b/kernel/sched.c index 59252754fbe0..e355ee72e83f 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -4306,11 +4306,8 @@ EXPORT_SYMBOL(schedule); static inline bool owner_running(struct mutex *lock, struct task_struct *owner) { - bool ret = false; - - rcu_read_lock(); if (lock->owner != owner) - goto fail; + return false; /* * Ensure we emit the owner->on_cpu, dereference _after_ checking @@ -4320,11 +4317,7 @@ static inline bool owner_running(struct mutex *lock, struct task_struct *owner) */ barrier(); - ret = owner->on_cpu; -fail: - rcu_read_unlock(); - - return ret; + return owner->on_cpu; } /* @@ -4336,21 +4329,21 @@ int mutex_spin_on_owner(struct mutex *lock, struct task_struct *owner) if (!sched_feat(OWNER_SPIN)) return 0; + rcu_read_lock(); while (owner_running(lock, owner)) { if (need_resched()) - return 0; + break; arch_mutex_cpu_relax(); } + rcu_read_unlock(); /* - * If the owner changed to another task there is likely - * heavy contention, stop spinning. + * We break out the loop above on need_resched() and when the + * owner changed, which is a sign for heavy contention. Return + * success only when lock->owner is NULL. */ - if (lock->owner) - return 0; - - return 1; + return lock->owner == NULL; } #endif -- cgit v1.2.3 From 9763b67fb9f3050c6da739105888327587c30c4d Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 13 Jul 2011 13:09:25 +0200 Subject: sched, cgroup: Optimize load_balance_fair() Use for_each_leaf_cfs_rq() instead of list_for_each_entry_rcu(). This ensures that load_balance_fair() only iterates those task_groups that actually have tasks on busiest, and that we iterate bottom-up, trying to move light groups before the heavier ones. No idea if it will actually work out to be beneficial in practice; does anybody have a cgroup workload that might show a difference one way or the other? [ Also move update_h_load to sched_fair.c, losing #ifdef-ery ] Signed-off-by: Peter Zijlstra Reviewed-by: Paul Turner Link: http://lkml.kernel.org/r/1310557009.2586.28.camel@twins Signed-off-by: Ingo Molnar --- kernel/sched.c | 32 -------------------------------- kernel/sched_fair.c | 40 +++++++++++++++++++++++++++++++++++----- 2 files changed, 35 insertions(+), 37 deletions(-) (limited to 'kernel/sched.c') diff --git a/kernel/sched.c b/kernel/sched.c index b0e7ad796d3b..474f341d6f91 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -1568,38 +1568,6 @@ static unsigned long cpu_avg_load_per_task(int cpu) return rq->avg_load_per_task; } -#ifdef CONFIG_FAIR_GROUP_SCHED - -/* - * Compute the cpu's hierarchical load factor for each task group. - * This needs to be done in a top-down fashion because the load of a child - * group is a fraction of its parents load.
- */ -static int tg_load_down(struct task_group *tg, void *data) -{ - unsigned long load; - long cpu = (long)data; - - if (!tg->parent) { - load = cpu_rq(cpu)->load.weight; - } else { - load = tg->parent->cfs_rq[cpu]->h_load; - load *= tg->se[cpu]->load.weight; - load /= tg->parent->cfs_rq[cpu]->load.weight + 1; - } - - tg->cfs_rq[cpu]->h_load = load; - - return 0; -} - -static void update_h_load(long cpu) -{ - walk_tg_tree(tg_load_down, tg_nop, (void *)cpu); -} - -#endif - #ifdef CONFIG_PREEMPT static void double_rq_lock(struct rq *rq1, struct rq *rq2); diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 6cdff849fc19..180bcf1efa79 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -2232,11 +2232,43 @@ static void update_shares(int cpu) struct rq *rq = cpu_rq(cpu); rcu_read_lock(); + /* + * Iterates the task_group tree in a bottom up fashion, see + * list_add_leaf_cfs_rq() for details. + */ for_each_leaf_cfs_rq(rq, cfs_rq) update_shares_cpu(cfs_rq->tg, cpu); rcu_read_unlock(); } +/* + * Compute the cpu's hierarchical load factor for each task group. + * This needs to be done in a top-down fashion because the load of a child + * group is a fraction of its parents load. + */ +static int tg_load_down(struct task_group *tg, void *data) +{ + unsigned long load; + long cpu = (long)data; + + if (!tg->parent) { + load = cpu_rq(cpu)->load.weight; + } else { + load = tg->parent->cfs_rq[cpu]->h_load; + load *= tg->se[cpu]->load.weight; + load /= tg->parent->cfs_rq[cpu]->load.weight + 1; + } + + tg->cfs_rq[cpu]->h_load = load; + + return 0; +} + +static void update_h_load(long cpu) +{ + walk_tg_tree(tg_load_down, tg_nop, (void *)cpu); +} + static unsigned long load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest, unsigned long max_load_move, @@ -2244,14 +2276,12 @@ load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest, int *all_pinned) { long rem_load_move = max_load_move; - int busiest_cpu = cpu_of(busiest); - struct task_group *tg; + struct cfs_rq *busiest_cfs_rq; rcu_read_lock(); - update_h_load(busiest_cpu); + update_h_load(cpu_of(busiest)); - list_for_each_entry_rcu(tg, &task_groups, list) { - struct cfs_rq *busiest_cfs_rq = tg->cfs_rq[busiest_cpu]; + for_each_leaf_cfs_rq(busiest, busiest_cfs_rq) { unsigned long busiest_h_load = busiest_cfs_rq->h_load; unsigned long busiest_weight = busiest_cfs_rq->load.weight; u64 rem_load, moved_load; -- cgit v1.2.3 From 5f817d676b7b7ac4a29f5ed93063ae7a24550c12 Mon Sep 17 00:00:00 2001 From: Jan Schoenherr Date: Wed, 13 Jul 2011 20:13:31 +0200 Subject: sched: Fix (harmless) typo 'CONFG_FAIR_GROUP_SCHED' This patch fixes a typo located in a comment. 
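The hierarchical load factor that tg_load_down() computes in the patch above is easier to follow with concrete numbers. The toy userspace rendition below uses an invented two-level group tree and invented weights, but the arithmetic matches the hunk above: a child's h_load is its parent's h_load scaled by the child's share of the parent's runqueue weight, with the "+ 1" guarding against division by zero.

/* h_load_model.c -- toy model of the tg_load_down() arithmetic */
#include <stdio.h>

struct toy_tg {
	const char *name;
	struct toy_tg *parent;
	unsigned long se_weight;	/* this group's entity weight in its parent */
	unsigned long cfs_weight;	/* total weight queued inside this group    */
	unsigned long h_load;		/* computed hierarchical load factor        */
};

static void tg_load_down(struct toy_tg *tg, unsigned long rq_weight)
{
	if (!tg->parent)
		tg->h_load = rq_weight;		/* root group: the whole rq load */
	else
		tg->h_load = tg->parent->h_load * tg->se_weight /
			     (tg->parent->cfs_weight + 1);
}

int main(void)
{
	/* root holds two 1024-weight tasks plus group A's entity (1024) = 3072 */
	struct toy_tg root = { "root", NULL,  0,    3072, 0 };
	/* A holds one task plus group A/1's entity, 1024 each = 2048 */
	struct toy_tg a    = { "A",    &root, 1024, 2048, 0 };
	/* A/1 holds a single 1024-weight task */
	struct toy_tg a1   = { "A/1",  &a,    1024, 1024, 0 };
	struct toy_tg *walk[] = { &root, &a, &a1 };	/* top-down, as walk_tg_tree() visits them */
	unsigned int i;

	for (i = 0; i < 3; i++) {
		tg_load_down(walk[i], 3072);
		printf("%-4s h_load = %lu\n", walk[i]->name, walk[i]->h_load);
	}
	return 0;
}

This prints 3072, 1023 and 511: group A owns roughly a third of the root runqueue weight and A/1 owns half of A, which is exactly the top-down "fraction of its parent's load" behaviour described in the comment being moved.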
Signed-off-by: Jan Schoenherr Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1310580816-10861-2-git-send-email-schnhrr@cs.tu-berlin.de Signed-off-by: Ingo Molnar --- kernel/sched.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel/sched.c') diff --git a/kernel/sched.c b/kernel/sched.c index 474f341d6f91..3b3826ebe793 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -8362,7 +8362,7 @@ static inline void unregister_fair_sched_group(struct task_group *tg, int cpu) list_del_leaf_cfs_rq(tg->cfs_rq[cpu]); raw_spin_unlock_irqrestore(&rq->lock, flags); } -#else /* !CONFG_FAIR_GROUP_SCHED */ +#else /* !CONFIG_FAIR_GROUP_SCHED */ static inline void free_fair_sched_group(struct task_group *tg) { } -- cgit v1.2.3 From 99bc52429f11d1f4f81495ac8237085aaeb6bccf Mon Sep 17 00:00:00 2001 From: Bianca Lutz Date: Wed, 13 Jul 2011 20:13:36 +0200 Subject: sched: Do not attempt to destroy uninitialized rt_bandwidth If a task group is to be created and alloc_fair_sched_group() fails, then the rt_bandwidth of the corresponding task group is not yet initialized. The caller, sched_create_group(), starts a clean up procedure which calls free_rt_sched_group() which unconditionally destroys the not yet initialized rt_bandwidth. This crashes or hangs the system in lock_hrtimer_base(): UP systems dereference a NULL pointer, while SMP systems loop endlessly on a condition that cannot become true. This patch simply avoids the destruction of rt_bandwidth when the initialization code path was not reached. (This was discovered by accident with a custom kernel modification.) Signed-off-by: Bianca Lutz Signed-off-by: Jan Schoenherr Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1310580816-10861-7-git-send-email-schnhrr@cs.tu-berlin.de Signed-off-by: Ingo Molnar --- kernel/sched.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel/sched.c') diff --git a/kernel/sched.c b/kernel/sched.c index 3b3826ebe793..f107204db53f 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -8383,7 +8383,8 @@ static void free_rt_sched_group(struct task_group *tg) { int i; - destroy_rt_bandwidth(&tg->rt_bandwidth); + if (tg->rt_se) + destroy_rt_bandwidth(&tg->rt_bandwidth); for_each_possible_cpu(i) { if (tg->rt_rq) -- cgit v1.2.3 From 26a148eb9c790149750f7e77da0d96029443d400 Mon Sep 17 00:00:00 2001 From: Richard Kennedy Date: Fri, 15 Jul 2011 11:41:31 +0100 Subject: sched: Reorder root_domain to remove 64 bit alignment padding Reorder root_domain to remove 8 bytes of alignment padding on 64 bit builds, this shrinks the size from 1736 to 1728 bytes, therefore using one fewer cachelines. Signed-off-by: Richard Kennedy Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1310726492.1977.5.camel@castor.rsk Signed-off-by: Ingo Molnar --- kernel/sched.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel/sched.c') diff --git a/kernel/sched.c b/kernel/sched.c index f107204db53f..e3f0bac05270 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -422,6 +422,7 @@ struct rt_rq { */ struct root_domain { atomic_t refcount; + atomic_t rto_count; struct rcu_head rcu; cpumask_var_t span; cpumask_var_t online; @@ -431,7 +432,6 @@ struct root_domain { * one runnable RT task. */ cpumask_var_t rto_mask; - atomic_t rto_count; struct cpupri cpupri; }; -- cgit v1.2.3 From acb5a9ba3bd7cd8b3264f67a3789a9587d3b935b Mon Sep 17 00:00:00 2001 From: "Jan H. 
Schönherr" Date: Thu, 14 Jul 2011 18:32:43 +0200 Subject: sched: Separate group-scheduling code more clearly MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Clean up cfs/rt runqueue initialization by moving group scheduling related code into the corresponding functions. Also, keep group scheduling as an add-on, so that things are only done additionally, i. e. remove the init_*_rq() calls from init_tg_*_entry(). (This removes a redundant initalization during sched_init()). In case of group scheduling rt_rq->highest_prio.curr is now initialized twice, but adding another #ifdef seems not worth it. Signed-off-by: Jan H. Schönherr Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1310661163-16606-1-git-send-email-schnhrr@cs.tu-berlin.de Signed-off-by: Ingo Molnar --- kernel/sched.c | 43 +++++++++++++++++++------------------------ 1 file changed, 19 insertions(+), 24 deletions(-) (limited to 'kernel/sched.c') diff --git a/kernel/sched.c b/kernel/sched.c index e3f0bac05270..6fdf7ffbebc6 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -7859,17 +7859,10 @@ int in_sched_functions(unsigned long addr) && addr < (unsigned long)__sched_text_end); } -static void init_cfs_rq(struct cfs_rq *cfs_rq, struct rq *rq) +static void init_cfs_rq(struct cfs_rq *cfs_rq) { cfs_rq->tasks_timeline = RB_ROOT; INIT_LIST_HEAD(&cfs_rq->tasks); -#ifdef CONFIG_FAIR_GROUP_SCHED - cfs_rq->rq = rq; - /* allow initial update_cfs_load() to truncate */ -#ifdef CONFIG_SMP - cfs_rq->load_stamp = 1; -#endif -#endif cfs_rq->min_vruntime = (u64)(-(1LL << 20)); #ifndef CONFIG_64BIT cfs_rq->min_vruntime_copy = cfs_rq->min_vruntime; @@ -7889,13 +7882,9 @@ static void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq) /* delimiter for bitsearch: */ __set_bit(MAX_RT_PRIO, array->bitmap); -#if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED +#if defined CONFIG_SMP rt_rq->highest_prio.curr = MAX_RT_PRIO; -#ifdef CONFIG_SMP rt_rq->highest_prio.next = MAX_RT_PRIO; -#endif -#endif -#ifdef CONFIG_SMP rt_rq->rt_nr_migratory = 0; rt_rq->overloaded = 0; plist_head_init_raw(&rt_rq->pushable_tasks, &rq->lock); @@ -7905,11 +7894,6 @@ static void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq) rt_rq->rt_throttled = 0; rt_rq->rt_runtime = 0; raw_spin_lock_init(&rt_rq->rt_runtime_lock); - -#ifdef CONFIG_RT_GROUP_SCHED - rt_rq->rt_nr_boosted = 0; - rt_rq->rq = rq; -#endif } #ifdef CONFIG_FAIR_GROUP_SCHED @@ -7918,11 +7902,17 @@ static void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq, struct sched_entity *parent) { struct rq *rq = cpu_rq(cpu); - tg->cfs_rq[cpu] = cfs_rq; - init_cfs_rq(cfs_rq, rq); + cfs_rq->tg = tg; + cfs_rq->rq = rq; +#ifdef CONFIG_SMP + /* allow initial update_cfs_load() to truncate */ + cfs_rq->load_stamp = 1; +#endif + tg->cfs_rq[cpu] = cfs_rq; tg->se[cpu] = se; + /* se could be NULL for root_task_group */ if (!se) return; @@ -7945,12 +7935,14 @@ static void init_tg_rt_entry(struct task_group *tg, struct rt_rq *rt_rq, { struct rq *rq = cpu_rq(cpu); - tg->rt_rq[cpu] = rt_rq; - init_rt_rq(rt_rq, rq); + rt_rq->highest_prio.curr = MAX_RT_PRIO; + rt_rq->rt_nr_boosted = 0; + rt_rq->rq = rq; rt_rq->tg = tg; - rt_rq->rt_runtime = tg->rt_bandwidth.rt_runtime; + tg->rt_rq[cpu] = rt_rq; tg->rt_se[cpu] = rt_se; + if (!rt_se) return; @@ -8032,7 +8024,7 @@ void __init sched_init(void) rq->nr_running = 0; rq->calc_load_active = 0; rq->calc_load_update = jiffies + LOAD_FREQ; - init_cfs_rq(&rq->cfs, rq); + init_cfs_rq(&rq->cfs); init_rt_rq(&rq->rt, rq); #ifdef 
CONFIG_FAIR_GROUP_SCHED root_task_group.shares = root_task_group_load; @@ -8335,6 +8327,7 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent) if (!se) goto err_free_rq; + init_cfs_rq(cfs_rq); init_tg_cfs_entry(tg, cfs_rq, se, i, parent->se[i]); } @@ -8425,6 +8418,8 @@ int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent) if (!rt_se) goto err_free_rq; + init_rt_rq(rt_rq, cpu_rq(i)); + rt_rq->rt_runtime = tg->rt_bandwidth.rt_runtime; init_tg_rt_entry(tg, rt_rq, rt_se, i, parent->rt_se[i]); } -- cgit v1.2.3
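To close, the root_domain reordering patch earlier in this series relies on a plain C alignment effect that is easy to reproduce in userspace. The structs below are simplified stand-ins, not the real root_domain (cpumask_var_t, struct rcu_head and struct cpupri are replaced by plain pointers), but they show the same mechanism: on an LP64 target, pairing the two ints removes 8 bytes of padding.

/* padding_demo.c -- same members, different order, different sizeof */
#include <stdio.h>

struct padded {
	int refcount;		/* 4 bytes + 4 bytes of padding before the pointer */
	void *rcu;		/* 8 bytes */
	void *span;		/* 8 bytes */
	void *online;		/* 8 bytes */
	void *rto_mask;		/* 8 bytes */
	int rto_count;		/* 4 bytes + 4 bytes of tail padding */
};				/* typically 48 bytes on LP64 */

struct reordered {
	int refcount;		/* 4 bytes */
	int rto_count;		/* 4 bytes: shares the slot, no hole */
	void *rcu;
	void *span;
	void *online;
	void *rto_mask;
};				/* typically 40 bytes on LP64 */

int main(void)
{
	printf("padded:    %zu bytes\n", sizeof(struct padded));
	printf("reordered: %zu bytes\n", sizeof(struct reordered));
	return 0;
}

The exact figures depend on the ABI, which is why the 1736 -> 1728 byte number in that commit message is specific to a 64-bit build; on real kernel objects the pahole tool reports such holes directly.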