From 14a40ffccd6163bbcd1d6f32b28a88ffe6149fc6 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 19 Mar 2013 13:45:20 -0700 Subject: sched: replace PF_THREAD_BOUND with PF_NO_SETAFFINITY PF_THREAD_BOUND was originally used to mark kernel threads which were bound to a specific CPU using kthread_bind() and a task with the flag set allows cpus_allowed modifications only to itself. Workqueue is currently abusing it to prevent userland from meddling with cpus_allowed of workqueue workers. What we need is a flag to prevent userland from messing with cpus_allowed of certain kernel tasks. In kernel, anyone can (incorrectly) squash the flag, and, for worker-type usages, restricting cpus_allowed modification to the task itself doesn't provide meaningful extra proection as other tasks can inject work items to the task anyway. This patch replaces PF_THREAD_BOUND with PF_NO_SETAFFINITY. sched_setaffinity() checks the flag and return -EINVAL if set. set_cpus_allowed_ptr() is no longer affected by the flag. This will allow simplifying workqueue worker CPU affinity management. Signed-off-by: Tejun Heo Acked-by: Ingo Molnar Reviewed-by: Lai Jiangshan Cc: Peter Zijlstra Cc: Thomas Gleixner --- kernel/sched/core.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) (limited to 'kernel/sched') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 7f12624a393c..23606ee961b5 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -4126,6 +4126,10 @@ long sched_setaffinity(pid_t pid, const struct cpumask *in_mask) get_task_struct(p); rcu_read_unlock(); + if (p->flags & PF_NO_SETAFFINITY) { + retval = -EINVAL; + goto out_put_task; + } if (!alloc_cpumask_var(&cpus_allowed, GFP_KERNEL)) { retval = -ENOMEM; goto out_put_task; @@ -4773,11 +4777,6 @@ int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask) goto out; } - if (unlikely((p->flags & PF_THREAD_BOUND) && p != current)) { - ret = -EINVAL; - goto out; - } - do_set_cpus_allowed(p, new_mask); /* Can the task run on the task's current CPU? If so, we're done */ -- cgit v1.2.3 From 383efcd00053ec40023010ce5034bd702e7ab373 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 18 Mar 2013 12:22:34 -0700 Subject: sched: Convert BUG_ON()s in try_to_wake_up_local() to WARN_ON_ONCE()s try_to_wake_up_local() should only be invoked to wake up another task in the same runqueue and BUG_ON()s are used to enforce the rule. Missing try_to_wake_up_local() can stall workqueue execution but such stalls are likely to be finite either by another work item being queued or the one blocked getting unblocked. There's no reason to trigger BUG while holding rq lock crashing the whole system. Convert BUG_ON()s in try_to_wake_up_local() to WARN_ON_ONCE()s. Signed-off-by: Tejun Heo Acked-by: Steven Rostedt Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/20130318192234.GD3042@htj.dyndns.org Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'kernel/sched') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index b7b03cd2d4cd..306943f531a3 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -1488,8 +1488,10 @@ static void try_to_wake_up_local(struct task_struct *p) { struct rq *rq = task_rq(p); - BUG_ON(rq != this_rq()); - BUG_ON(p == current); + if (WARN_ON_ONCE(rq != this_rq()) || + WARN_ON_ONCE(p == current)) + return; + lockdep_assert_held(&rq->lock); if (!raw_spin_trylock(&p->pi_lock)) { -- cgit v1.2.3 From a1cbcaa9ea87b87a96b9fc465951dcf36e459ca2 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sat, 6 Apr 2013 10:10:27 +0200 Subject: sched_clock: Prevent 64bit inatomicity on 32bit systems The sched_clock_remote() implementation has the following inatomicity problem on 32bit systems when accessing the remote scd->clock, which is a 64bit value. CPU0 CPU1 sched_clock_local() sched_clock_remote(CPU0) ... remote_clock = scd[CPU0]->clock read_low32bit(scd[CPU0]->clock) cmpxchg64(scd->clock,...) read_high32bit(scd[CPU0]->clock) While the update of scd->clock is using an atomic64 mechanism, the readout on the remote cpu is not, which can cause completely bogus readouts. It is a quite rare problem, because it requires the update to hit the narrow race window between the low/high readout and the update must go across the 32bit boundary. The resulting misbehaviour is, that CPU1 will see the sched_clock on CPU1 ~4 seconds ahead of it's own and update CPU1s sched_clock value to this bogus timestamp. This stays that way due to the clamping implementation for about 4 seconds until the synchronization with CLOCK_MONOTONIC undoes the problem. The issue is hard to observe, because it might only result in a less accurate SCHED_OTHER timeslicing behaviour. To create observable damage on realtime scheduling classes, it is necessary that the bogus update of CPU1 sched_clock happens in the context of an realtime thread, which then gets charged 4 seconds of RT runtime, which results in the RT throttler mechanism to trigger and prevent scheduling of RT tasks for a little less than 4 seconds. So this is quite unlikely as well. The issue was quite hard to decode as the reproduction time is between 2 days and 3 weeks and intrusive tracing makes it less likely, but the following trace recorded with trace_clock=global, which uses sched_clock_local(), gave the final hint: -0 0d..30 400269.477150: hrtimer_cancel: hrtimer=0xf7061e80 -0 0d..30 400269.477151: hrtimer_start: hrtimer=0xf7061e80 ... irq/20-S-587 1d..32 400273.772118: sched_wakeup: comm= ... target_cpu=0 -0 0dN.30 400273.772118: hrtimer_cancel: hrtimer=0xf7061e80 What happens is that CPU0 goes idle and invokes sched_clock_idle_sleep_event() which invokes sched_clock_local() and CPU1 runs a remote wakeup for CPU0 at the same time, which invokes sched_remote_clock(). The time jump gets propagated to CPU0 via sched_remote_clock() and stays stale on both cores for ~4 seconds. There are only two other possibilities, which could cause a stale sched clock: 1) ktime_get() which reads out CLOCK_MONOTONIC returns a sporadic wrong value. 2) sched_clock() which reads the TSC returns a sporadic wrong value. #1 can be excluded because sched_clock would continue to increase for one jiffy and then go stale. #2 can be excluded because it would not make the clock jump forward. It would just result in a stale sched_clock for one jiffy. After quite some brain twisting and finding the same pattern on other traces, sched_clock_remote() remained the only place which could cause such a problem and as explained above it's indeed racy on 32bit systems. So while on 64bit systems the readout is atomic, we need to verify the remote readout on 32bit machines. We need to protect the local->clock readout in sched_clock_remote() on 32bit as well because an NMI could hit between the low and the high readout, call sched_clock_local() and modify local->clock. Thanks to Siegfried Wulsch for bearing with my debug requests and going through the tedious tasks of running a bunch of reproducer systems to generate the debug information which let me decode the issue. Reported-by: Siegfried Wulsch Acked-by: Peter Zijlstra Cc: Steven Rostedt Link: http://lkml.kernel.org/r/alpine.LFD.2.02.1304051544160.21884@ionos Signed-off-by: Thomas Gleixner Cc: stable@vger.kernel.org --- kernel/sched/clock.c | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) (limited to 'kernel/sched') diff --git a/kernel/sched/clock.c b/kernel/sched/clock.c index c685e31492df..c3ae1446461c 100644 --- a/kernel/sched/clock.c +++ b/kernel/sched/clock.c @@ -176,10 +176,36 @@ static u64 sched_clock_remote(struct sched_clock_data *scd) u64 this_clock, remote_clock; u64 *ptr, old_val, val; +#if BITS_PER_LONG != 64 +again: + /* + * Careful here: The local and the remote clock values need to + * be read out atomic as we need to compare the values and + * then update either the local or the remote side. So the + * cmpxchg64 below only protects one readout. + * + * We must reread via sched_clock_local() in the retry case on + * 32bit as an NMI could use sched_clock_local() via the + * tracer and hit between the readout of + * the low32bit and the high 32bit portion. + */ + this_clock = sched_clock_local(my_scd); + /* + * We must enforce atomic readout on 32bit, otherwise the + * update on the remote cpu can hit inbetween the readout of + * the low32bit and the high 32bit portion. + */ + remote_clock = cmpxchg64(&scd->clock, 0, 0); +#else + /* + * On 64bit the read of [my]scd->clock is atomic versus the + * update, so we can avoid the above 32bit dance. + */ sched_clock_local(my_scd); again: this_clock = my_scd->clock; remote_clock = scd->clock; +#endif /* * Use the opportunity that we have both locks -- cgit v1.2.3 From fd9b86d37a600488dbd80fe60cca46b822bff1cd Mon Sep 17 00:00:00 2001 From: libin Date: Mon, 8 Apr 2013 14:39:12 +0800 Subject: sched/debug: Fix sd->*_idx limit range avoiding overflow Commit 201c373e8e ("sched/debug: Limit sd->*_idx range on sysctl") was an incomplete bug fix. This patch fixes sd->*_idx limit range to [0 ~ CPU_LOAD_IDX_MAX-1] avoiding array overflow caused by setting sd->*_idx to CPU_LOAD_IDX_MAX on sysctl. Signed-off-by: Libin Cc: Cc: Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/51626610.2040607@huawei.com Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel/sched') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 306943f531a3..fa077929e315 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -4933,7 +4933,7 @@ static void sd_free_ctl_entry(struct ctl_table **tablep) } static int min_load_idx = 0; -static int max_load_idx = CPU_LOAD_IDX_MAX; +static int max_load_idx = CPU_LOAD_IDX_MAX-1; static void set_table_entry(struct ctl_table *entry, -- cgit v1.2.3 From 28b4a521f618d9722bc780ea38b44718ce0fe283 Mon Sep 17 00:00:00 2001 From: Viresh Kumar Date: Fri, 5 Apr 2013 16:26:46 +0530 Subject: sched: Fix typo inside comment Fix typo: sched_domains_nume_distance -> sched_domains_numa_distance Signed-off-by: Viresh Kumar Cc: linaro-kernel@lists.linaro.org Cc: patches@linaro.org Cc: robin.randhawa@arm.com Cc: Steve.Bannister@arm.com Cc: Liviu.Dudau@arm.com Cc: charles.garcia-tobin@arm.com Cc: arvind.chauhan@arm.com Cc: peterz@infradead.org Link: http://lkml.kernel.org/r/cd8084746ac932106d6fa6be388b8f2d6aa9617c.1365159023.git.viresh.kumar@linaro.org Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel/sched') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 849deb96e61e..f5e1aa5b2684 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -6252,7 +6252,7 @@ static void sched_init_numa(void) * 'level' contains the number of unique distances, excluding the * identity distance node_distance(i,i). * - * The sched_domains_nume_distance[] array includes the actual distance + * The sched_domains_numa_distance[] array includes the actual distance * numbers. */ -- cgit v1.2.3 From ee761f629d598579594d7e1eb8c552f3c5f71e4d Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 21 Mar 2013 22:49:32 +0100 Subject: arch: Consolidate tsk_is_polling() Move it to a common place. Preparatory patch for implementing set/clear for the idle need_resched poll implementation. Signed-off-by: Thomas Gleixner Cc: Linus Torvalds Cc: Rusty Russell Cc: Paul McKenney Cc: Peter Zijlstra Reviewed-by: Cc: Srivatsa S. Bhat Cc: Magnus Damm Link: http://lkml.kernel.org/r/20130321215233.446034505@linutronix.de Signed-off-by: Thomas Gleixner --- arch/alpha/include/asm/thread_info.h | 2 -- arch/ia64/include/asm/thread_info.h | 2 -- arch/metag/include/asm/thread_info.h | 2 -- arch/microblaze/include/asm/thread_info.h | 1 - arch/mn10300/include/asm/thread_info.h | 2 -- arch/openrisc/include/asm/thread_info.h | 2 -- arch/parisc/include/asm/thread_info.h | 2 -- arch/powerpc/include/asm/thread_info.h | 2 -- arch/sh/include/asm/thread_info.h | 2 -- arch/sparc/include/asm/thread_info_32.h | 2 -- arch/sparc/include/asm/thread_info_64.h | 2 -- arch/tile/include/asm/thread_info.h | 2 -- arch/x86/include/asm/thread_info.h | 2 -- include/linux/sched.h | 20 ++++++++++++++++++++ kernel/sched/core.c | 5 ----- 15 files changed, 20 insertions(+), 30 deletions(-) (limited to 'kernel/sched') diff --git a/arch/alpha/include/asm/thread_info.h b/arch/alpha/include/asm/thread_info.h index 1f8c72959fb6..52cd2a4a3ff4 100644 --- a/arch/alpha/include/asm/thread_info.h +++ b/arch/alpha/include/asm/thread_info.h @@ -95,8 +95,6 @@ register struct thread_info *__current_thread_info __asm__("$8"); #define TS_POLLING 0x0010 /* idle task polling need_resched, skip sending interrupt */ -#define tsk_is_polling(t) (task_thread_info(t)->status & TS_POLLING) - #ifndef __ASSEMBLY__ #define HAVE_SET_RESTORE_SIGMASK 1 static inline void set_restore_sigmask(void) diff --git a/arch/ia64/include/asm/thread_info.h b/arch/ia64/include/asm/thread_info.h index 020d655ed082..cade13dd0299 100644 --- a/arch/ia64/include/asm/thread_info.h +++ b/arch/ia64/include/asm/thread_info.h @@ -131,8 +131,6 @@ struct thread_info { #define TS_POLLING 1 /* true if in idle loop and not sleeping */ #define TS_RESTORE_SIGMASK 2 /* restore signal mask in do_signal() */ -#define tsk_is_polling(t) (task_thread_info(t)->status & TS_POLLING) - #ifndef __ASSEMBLY__ #define HAVE_SET_RESTORE_SIGMASK 1 static inline void set_restore_sigmask(void) diff --git a/arch/metag/include/asm/thread_info.h b/arch/metag/include/asm/thread_info.h index 0ecd34d8b5f6..7c4a33006142 100644 --- a/arch/metag/include/asm/thread_info.h +++ b/arch/metag/include/asm/thread_info.h @@ -150,6 +150,4 @@ static inline int kstack_end(void *addr) #define _TIF_WORK_MASK (_TIF_ALLWORK_MASK & ~(_TIF_SYSCALL_TRACE | \ _TIF_SYSCALL_AUDIT | _TIF_SINGLESTEP)) -#define tsk_is_polling(t) test_tsk_thread_flag(t, TIF_POLLING_NRFLAG) - #endif /* _ASM_THREAD_INFO_H */ diff --git a/arch/microblaze/include/asm/thread_info.h b/arch/microblaze/include/asm/thread_info.h index 008f30433d22..de26ea6373de 100644 --- a/arch/microblaze/include/asm/thread_info.h +++ b/arch/microblaze/include/asm/thread_info.h @@ -182,7 +182,6 @@ static inline bool test_and_clear_restore_sigmask(void) ti->status &= ~TS_RESTORE_SIGMASK; return true; } -#define tsk_is_polling(t) test_tsk_thread_flag(t, TIF_POLLING_NRFLAG) #endif #endif /* __KERNEL__ */ diff --git a/arch/mn10300/include/asm/thread_info.h b/arch/mn10300/include/asm/thread_info.h index f90062b0622d..224b4262486d 100644 --- a/arch/mn10300/include/asm/thread_info.h +++ b/arch/mn10300/include/asm/thread_info.h @@ -165,8 +165,6 @@ void arch_release_thread_info(struct thread_info *ti); #define _TIF_WORK_MASK 0x0000FFFE /* work to do on interrupt/exception return */ #define _TIF_ALLWORK_MASK 0x0000FFFF /* work to do on any return to u-space */ -#define tsk_is_polling(t) test_tsk_thread_flag(t, TIF_POLLING_NRFLAG) - #endif /* __KERNEL__ */ #endif /* _ASM_THREAD_INFO_H */ diff --git a/arch/openrisc/include/asm/thread_info.h b/arch/openrisc/include/asm/thread_info.h index 07f3212422ad..d797acc901e4 100644 --- a/arch/openrisc/include/asm/thread_info.h +++ b/arch/openrisc/include/asm/thread_info.h @@ -128,8 +128,6 @@ register struct thread_info *current_thread_info_reg asm("r10"); /* For OpenRISC, this is anything in the LSW other than syscall trace */ #define _TIF_WORK_MASK (0xff & ~(_TIF_SYSCALL_TRACE|_TIF_SINGLESTEP)) -#define tsk_is_polling(t) test_tsk_thread_flag(t, TIF_POLLING_NRFLAG) - #endif /* __KERNEL__ */ #endif /* _ASM_THREAD_INFO_H */ diff --git a/arch/parisc/include/asm/thread_info.h b/arch/parisc/include/asm/thread_info.h index d1fb79a36f3d..6182832e5b6c 100644 --- a/arch/parisc/include/asm/thread_info.h +++ b/arch/parisc/include/asm/thread_info.h @@ -77,8 +77,6 @@ struct thread_info { #define _TIF_SYSCALL_TRACE_MASK (_TIF_SYSCALL_TRACE | _TIF_SINGLESTEP | \ _TIF_BLOCKSTEP) -#define tsk_is_polling(t) test_tsk_thread_flag(t, TIF_POLLING_NRFLAG) - #endif /* __KERNEL__ */ #endif /* _ASM_PARISC_THREAD_INFO_H */ diff --git a/arch/powerpc/include/asm/thread_info.h b/arch/powerpc/include/asm/thread_info.h index 406b7b9a1341..8ceea14d6fe4 100644 --- a/arch/powerpc/include/asm/thread_info.h +++ b/arch/powerpc/include/asm/thread_info.h @@ -182,8 +182,6 @@ static inline bool test_thread_local_flags(unsigned int flags) #define is_32bit_task() (1) #endif -#define tsk_is_polling(t) test_tsk_thread_flag(t, TIF_POLLING_NRFLAG) - #endif /* !__ASSEMBLY__ */ #endif /* __KERNEL__ */ diff --git a/arch/sh/include/asm/thread_info.h b/arch/sh/include/asm/thread_info.h index 7d5ac4e48485..45a93669289d 100644 --- a/arch/sh/include/asm/thread_info.h +++ b/arch/sh/include/asm/thread_info.h @@ -207,8 +207,6 @@ static inline bool test_and_clear_restore_sigmask(void) return true; } -#define tsk_is_polling(t) test_tsk_thread_flag(t, TIF_POLLING_NRFLAG) - #endif /* !__ASSEMBLY__ */ #endif /* __KERNEL__ */ diff --git a/arch/sparc/include/asm/thread_info_32.h b/arch/sparc/include/asm/thread_info_32.h index 25849ae3e900..dd3807599bb9 100644 --- a/arch/sparc/include/asm/thread_info_32.h +++ b/arch/sparc/include/asm/thread_info_32.h @@ -132,8 +132,6 @@ register struct thread_info *current_thread_info_reg asm("g6"); #define _TIF_DO_NOTIFY_RESUME_MASK (_TIF_NOTIFY_RESUME | \ _TIF_SIGPENDING) -#define tsk_is_polling(t) test_tsk_thread_flag(t, TIF_POLLING_NRFLAG) - #endif /* __KERNEL__ */ #endif /* _ASM_THREAD_INFO_H */ diff --git a/arch/sparc/include/asm/thread_info_64.h b/arch/sparc/include/asm/thread_info_64.h index 269bd92313df..d5e504251079 100644 --- a/arch/sparc/include/asm/thread_info_64.h +++ b/arch/sparc/include/asm/thread_info_64.h @@ -256,8 +256,6 @@ static inline bool test_and_clear_restore_sigmask(void) return true; } -#define tsk_is_polling(t) test_tsk_thread_flag(t, TIF_POLLING_NRFLAG) - #define thread32_stack_is_64bit(__SP) (((__SP) & 0x1) != 0) #define test_thread_64bit_stack(__SP) \ ((test_thread_flag(TIF_32BIT) && !thread32_stack_is_64bit(__SP)) ? \ diff --git a/arch/tile/include/asm/thread_info.h b/arch/tile/include/asm/thread_info.h index e9c670d7a7fe..ccc8ef37235c 100644 --- a/arch/tile/include/asm/thread_info.h +++ b/arch/tile/include/asm/thread_info.h @@ -153,8 +153,6 @@ extern void _cpu_idle(void); #define TS_POLLING 0x0004 /* in idle loop but not sleeping */ #define TS_RESTORE_SIGMASK 0x0008 /* restore signal mask in do_signal */ -#define tsk_is_polling(t) (task_thread_info(t)->status & TS_POLLING) - #ifndef __ASSEMBLY__ #define HAVE_SET_RESTORE_SIGMASK 1 static inline void set_restore_sigmask(void) diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h index 2cd056e3ada3..a1df6e84691f 100644 --- a/arch/x86/include/asm/thread_info.h +++ b/arch/x86/include/asm/thread_info.h @@ -241,8 +241,6 @@ static inline struct thread_info *current_thread_info(void) skip sending interrupt */ #define TS_RESTORE_SIGMASK 0x0008 /* restore signal mask in do_signal() */ -#define tsk_is_polling(t) (task_thread_info(t)->status & TS_POLLING) - #ifndef __ASSEMBLY__ #define HAVE_SET_RESTORE_SIGMASK 1 static inline void set_restore_sigmask(void) diff --git a/include/linux/sched.h b/include/linux/sched.h index d35d2b6ddbfb..6709a5813f27 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -2621,6 +2621,26 @@ static inline int spin_needbreak(spinlock_t *lock) #endif } +/* + * Idle thread specific functions to determine the need_resched + * polling state. We have two versions, one based on TS_POLLING in + * thread_info.status and one based on TIF_POLLING_NRFLAG in + * thread_info.flags + */ +#ifdef TS_POLLING +static inline int tsk_is_polling(struct task_struct *p) +{ + return task_thread_info(p)->status & TS_POLLING; +} +#elif defined(TIF_POLLING_NRFLAG) +static inline int tsk_is_polling(struct task_struct *p) +{ + return test_tsk_thread_flag(p, TIF_POLLING_NRFLAG); +} +#else +static inline int tsk_is_polling(struct task_struct *p) { return 0; } +#endif + /* * Thread group CPU time accounting. */ diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 7f12624a393c..243a20c5cf91 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -512,11 +512,6 @@ static inline void init_hrtick(void) * the target CPU. */ #ifdef CONFIG_SMP - -#ifndef tsk_is_polling -#define tsk_is_polling(t) 0 -#endif - void resched_task(struct task_struct *p) { int cpu; -- cgit v1.2.3 From e614b3332a4f3f264a26da28e5a1f4cc3aea3974 Mon Sep 17 00:00:00 2001 From: Stanislaw Gruszka Date: Thu, 4 Apr 2013 10:57:48 +0200 Subject: sched/cputime: Fix accounting on multi-threaded processes Recent commit 6fac4829 ("cputime: Use accessors to read task cputime stats") introduced a bug, where we account many times the cputime of the first thread, instead of cputimes of all the different threads. Signed-off-by: Stanislaw Gruszka Acked-by: Frederic Weisbecker Cc: Oleg Nesterov Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/20130404085740.GA2495@redhat.com Signed-off-by: Ingo Molnar --- kernel/sched/cputime.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel/sched') diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c index ed12cbb135f4..e93cca92f38b 100644 --- a/kernel/sched/cputime.c +++ b/kernel/sched/cputime.c @@ -310,7 +310,7 @@ void thread_group_cputime(struct task_struct *tsk, struct task_cputime *times) t = tsk; do { - task_cputime(tsk, &utime, &stime); + task_cputime(t, &utime, &stime); times->utime += utime; times->stime += stime; times->sum_exec_runtime += task_sched_runtime(t); -- cgit v1.2.3 From 4e2dcb73aecbde9fe4e3137c9ea35cb6aa6cb286 Mon Sep 17 00:00:00 2001 From: Zhang Hang Date: Wed, 10 Apr 2013 14:04:55 +0800 Subject: sched: Simplify can_migrate_task() At this point tsk_cache_hot is always true, so no need to check it. Signed-off-by: Zhang Hang Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/51650107.9040606@huawei.com [ Also remove unnecessary schedstat #ifdefs. ] Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) (limited to 'kernel/sched') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 539760ef00c4..bf8ab4f5e603 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -3921,20 +3921,17 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env) tsk_cache_hot = task_hot(p, env->src_rq->clock_task, env->sd); if (!tsk_cache_hot || env->sd->nr_balance_failed > env->sd->cache_nice_tries) { -#ifdef CONFIG_SCHEDSTATS + if (tsk_cache_hot) { schedstat_inc(env->sd, lb_hot_gained[env->idle]); schedstat_inc(p, se.statistics.nr_forced_migrations); } -#endif + return 1; } - if (tsk_cache_hot) { - schedstat_inc(p, se.statistics.nr_failed_migrations_hot); - return 0; - } - return 1; + schedstat_inc(p, se.statistics.nr_failed_migrations_hot); + return 0; } /* -- cgit v1.2.3 From b9b0853a4b377f84a5e6ed091816a9a2d6b10918 Mon Sep 17 00:00:00 2001 From: Libin Date: Mon, 1 Apr 2013 19:14:01 +0800 Subject: sched: Fix comment in rebalance_domains() A comment in function rebalance_domains() mentions arch_init_sched_domains(), but that function does not exist anymore. The proper function is init_sched_domains(). Signed-off-by: Libin Cc: Link: http://lkml.kernel.org/r/1364814841-49156-1-git-send-email-huawei.libin@huawei.com Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel/sched') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index bf8ab4f5e603..155783b4e4bf 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -5466,7 +5466,7 @@ void update_max_interval(void) * It checks each scheduling domain to see if it is due to be balanced, * and initiates a balancing operation if so. * - * Balancing parameters are set up in arch_init_sched_domains. + * Balancing parameters are set up in init_sched_domains. */ static void rebalance_domains(int cpu, enum cpu_idle_type idle) { -- cgit v1.2.3 From 2e76c24d72372db35f226a49c2b99d0fd8cfd400 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Fri, 29 Mar 2013 14:36:31 +0800 Subject: sched: Split cpuacct code out of core.c Signed-off-by: Li Zefan Acked-by: Peter Zijlstra Link: http://lkml.kernel.org/r/5155366F.5060404@huawei.com Signed-off-by: Ingo Molnar --- kernel/sched/Makefile | 1 + kernel/sched/core.c | 220 ----------------------------------------------- kernel/sched/cpuacct.c | 227 +++++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 228 insertions(+), 220 deletions(-) create mode 100644 kernel/sched/cpuacct.c (limited to 'kernel/sched') diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile index f06d249e103b..deaf90e4a1de 100644 --- a/kernel/sched/Makefile +++ b/kernel/sched/Makefile @@ -16,3 +16,4 @@ obj-$(CONFIG_SMP) += cpupri.o obj-$(CONFIG_SCHED_AUTOGROUP) += auto_group.o obj-$(CONFIG_SCHEDSTATS) += stats.o obj-$(CONFIG_SCHED_DEBUG) += debug.o +obj-$(CONFIG_CGROUP_CPUACCT) += cpuacct.o diff --git a/kernel/sched/core.c b/kernel/sched/core.c index f5e1aa5b2684..c28222f72c80 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -8043,226 +8043,6 @@ struct cgroup_subsys cpu_cgroup_subsys = { #endif /* CONFIG_CGROUP_SCHED */ -#ifdef CONFIG_CGROUP_CPUACCT - -/* - * CPU accounting code for task groups. - * - * Based on the work by Paul Menage (menage@google.com) and Balbir Singh - * (balbir@in.ibm.com). - */ - -struct cpuacct root_cpuacct; - -/* create a new cpu accounting group */ -static struct cgroup_subsys_state *cpuacct_css_alloc(struct cgroup *cgrp) -{ - struct cpuacct *ca; - - if (!cgrp->parent) - return &root_cpuacct.css; - - ca = kzalloc(sizeof(*ca), GFP_KERNEL); - if (!ca) - goto out; - - ca->cpuusage = alloc_percpu(u64); - if (!ca->cpuusage) - goto out_free_ca; - - ca->cpustat = alloc_percpu(struct kernel_cpustat); - if (!ca->cpustat) - goto out_free_cpuusage; - - return &ca->css; - -out_free_cpuusage: - free_percpu(ca->cpuusage); -out_free_ca: - kfree(ca); -out: - return ERR_PTR(-ENOMEM); -} - -/* destroy an existing cpu accounting group */ -static void cpuacct_css_free(struct cgroup *cgrp) -{ - struct cpuacct *ca = cgroup_ca(cgrp); - - free_percpu(ca->cpustat); - free_percpu(ca->cpuusage); - kfree(ca); -} - -static u64 cpuacct_cpuusage_read(struct cpuacct *ca, int cpu) -{ - u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu); - u64 data; - -#ifndef CONFIG_64BIT - /* - * Take rq->lock to make 64-bit read safe on 32-bit platforms. - */ - raw_spin_lock_irq(&cpu_rq(cpu)->lock); - data = *cpuusage; - raw_spin_unlock_irq(&cpu_rq(cpu)->lock); -#else - data = *cpuusage; -#endif - - return data; -} - -static void cpuacct_cpuusage_write(struct cpuacct *ca, int cpu, u64 val) -{ - u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu); - -#ifndef CONFIG_64BIT - /* - * Take rq->lock to make 64-bit write safe on 32-bit platforms. - */ - raw_spin_lock_irq(&cpu_rq(cpu)->lock); - *cpuusage = val; - raw_spin_unlock_irq(&cpu_rq(cpu)->lock); -#else - *cpuusage = val; -#endif -} - -/* return total cpu usage (in nanoseconds) of a group */ -static u64 cpuusage_read(struct cgroup *cgrp, struct cftype *cft) -{ - struct cpuacct *ca = cgroup_ca(cgrp); - u64 totalcpuusage = 0; - int i; - - for_each_present_cpu(i) - totalcpuusage += cpuacct_cpuusage_read(ca, i); - - return totalcpuusage; -} - -static int cpuusage_write(struct cgroup *cgrp, struct cftype *cftype, - u64 reset) -{ - struct cpuacct *ca = cgroup_ca(cgrp); - int err = 0; - int i; - - if (reset) { - err = -EINVAL; - goto out; - } - - for_each_present_cpu(i) - cpuacct_cpuusage_write(ca, i, 0); - -out: - return err; -} - -static int cpuacct_percpu_seq_read(struct cgroup *cgroup, struct cftype *cft, - struct seq_file *m) -{ - struct cpuacct *ca = cgroup_ca(cgroup); - u64 percpu; - int i; - - for_each_present_cpu(i) { - percpu = cpuacct_cpuusage_read(ca, i); - seq_printf(m, "%llu ", (unsigned long long) percpu); - } - seq_printf(m, "\n"); - return 0; -} - -static const char *cpuacct_stat_desc[] = { - [CPUACCT_STAT_USER] = "user", - [CPUACCT_STAT_SYSTEM] = "system", -}; - -static int cpuacct_stats_show(struct cgroup *cgrp, struct cftype *cft, - struct cgroup_map_cb *cb) -{ - struct cpuacct *ca = cgroup_ca(cgrp); - int cpu; - s64 val = 0; - - for_each_online_cpu(cpu) { - struct kernel_cpustat *kcpustat = per_cpu_ptr(ca->cpustat, cpu); - val += kcpustat->cpustat[CPUTIME_USER]; - val += kcpustat->cpustat[CPUTIME_NICE]; - } - val = cputime64_to_clock_t(val); - cb->fill(cb, cpuacct_stat_desc[CPUACCT_STAT_USER], val); - - val = 0; - for_each_online_cpu(cpu) { - struct kernel_cpustat *kcpustat = per_cpu_ptr(ca->cpustat, cpu); - val += kcpustat->cpustat[CPUTIME_SYSTEM]; - val += kcpustat->cpustat[CPUTIME_IRQ]; - val += kcpustat->cpustat[CPUTIME_SOFTIRQ]; - } - - val = cputime64_to_clock_t(val); - cb->fill(cb, cpuacct_stat_desc[CPUACCT_STAT_SYSTEM], val); - - return 0; -} - -static struct cftype files[] = { - { - .name = "usage", - .read_u64 = cpuusage_read, - .write_u64 = cpuusage_write, - }, - { - .name = "usage_percpu", - .read_seq_string = cpuacct_percpu_seq_read, - }, - { - .name = "stat", - .read_map = cpuacct_stats_show, - }, - { } /* terminate */ -}; - -/* - * charge this task's execution time to its accounting group. - * - * called with rq->lock held. - */ -void cpuacct_charge(struct task_struct *tsk, u64 cputime) -{ - struct cpuacct *ca; - int cpu; - - if (unlikely(!cpuacct_subsys.active)) - return; - - cpu = task_cpu(tsk); - - rcu_read_lock(); - - ca = task_ca(tsk); - - for (; ca; ca = parent_ca(ca)) { - u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu); - *cpuusage += cputime; - } - - rcu_read_unlock(); -} - -struct cgroup_subsys cpuacct_subsys = { - .name = "cpuacct", - .css_alloc = cpuacct_css_alloc, - .css_free = cpuacct_css_free, - .subsys_id = cpuacct_subsys_id, - .base_cftypes = files, -}; -#endif /* CONFIG_CGROUP_CPUACCT */ - void dump_cpu_task(int cpu) { pr_info("Task dump for CPU %d:\n", cpu); diff --git a/kernel/sched/cpuacct.c b/kernel/sched/cpuacct.c new file mode 100644 index 000000000000..50ec24b6193d --- /dev/null +++ b/kernel/sched/cpuacct.c @@ -0,0 +1,227 @@ +#include +#include +#include +#include +#include +#include +#include +#include + +#include "sched.h" + +/* + * CPU accounting code for task groups. + * + * Based on the work by Paul Menage (menage@google.com) and Balbir Singh + * (balbir@in.ibm.com). + */ + +struct cpuacct root_cpuacct; + +/* create a new cpu accounting group */ +static struct cgroup_subsys_state *cpuacct_css_alloc(struct cgroup *cgrp) +{ + struct cpuacct *ca; + + if (!cgrp->parent) + return &root_cpuacct.css; + + ca = kzalloc(sizeof(*ca), GFP_KERNEL); + if (!ca) + goto out; + + ca->cpuusage = alloc_percpu(u64); + if (!ca->cpuusage) + goto out_free_ca; + + ca->cpustat = alloc_percpu(struct kernel_cpustat); + if (!ca->cpustat) + goto out_free_cpuusage; + + return &ca->css; + +out_free_cpuusage: + free_percpu(ca->cpuusage); +out_free_ca: + kfree(ca); +out: + return ERR_PTR(-ENOMEM); +} + +/* destroy an existing cpu accounting group */ +static void cpuacct_css_free(struct cgroup *cgrp) +{ + struct cpuacct *ca = cgroup_ca(cgrp); + + free_percpu(ca->cpustat); + free_percpu(ca->cpuusage); + kfree(ca); +} + +static u64 cpuacct_cpuusage_read(struct cpuacct *ca, int cpu) +{ + u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu); + u64 data; + +#ifndef CONFIG_64BIT + /* + * Take rq->lock to make 64-bit read safe on 32-bit platforms. + */ + raw_spin_lock_irq(&cpu_rq(cpu)->lock); + data = *cpuusage; + raw_spin_unlock_irq(&cpu_rq(cpu)->lock); +#else + data = *cpuusage; +#endif + + return data; +} + +static void cpuacct_cpuusage_write(struct cpuacct *ca, int cpu, u64 val) +{ + u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu); + +#ifndef CONFIG_64BIT + /* + * Take rq->lock to make 64-bit write safe on 32-bit platforms. + */ + raw_spin_lock_irq(&cpu_rq(cpu)->lock); + *cpuusage = val; + raw_spin_unlock_irq(&cpu_rq(cpu)->lock); +#else + *cpuusage = val; +#endif +} + +/* return total cpu usage (in nanoseconds) of a group */ +static u64 cpuusage_read(struct cgroup *cgrp, struct cftype *cft) +{ + struct cpuacct *ca = cgroup_ca(cgrp); + u64 totalcpuusage = 0; + int i; + + for_each_present_cpu(i) + totalcpuusage += cpuacct_cpuusage_read(ca, i); + + return totalcpuusage; +} + +static int cpuusage_write(struct cgroup *cgrp, struct cftype *cftype, + u64 reset) +{ + struct cpuacct *ca = cgroup_ca(cgrp); + int err = 0; + int i; + + if (reset) { + err = -EINVAL; + goto out; + } + + for_each_present_cpu(i) + cpuacct_cpuusage_write(ca, i, 0); + +out: + return err; +} + +static int cpuacct_percpu_seq_read(struct cgroup *cgroup, struct cftype *cft, + struct seq_file *m) +{ + struct cpuacct *ca = cgroup_ca(cgroup); + u64 percpu; + int i; + + for_each_present_cpu(i) { + percpu = cpuacct_cpuusage_read(ca, i); + seq_printf(m, "%llu ", (unsigned long long) percpu); + } + seq_printf(m, "\n"); + return 0; +} + +static const char * const cpuacct_stat_desc[] = { + [CPUACCT_STAT_USER] = "user", + [CPUACCT_STAT_SYSTEM] = "system", +}; + +static int cpuacct_stats_show(struct cgroup *cgrp, struct cftype *cft, + struct cgroup_map_cb *cb) +{ + struct cpuacct *ca = cgroup_ca(cgrp); + int cpu; + s64 val = 0; + + for_each_online_cpu(cpu) { + struct kernel_cpustat *kcpustat = per_cpu_ptr(ca->cpustat, cpu); + val += kcpustat->cpustat[CPUTIME_USER]; + val += kcpustat->cpustat[CPUTIME_NICE]; + } + val = cputime64_to_clock_t(val); + cb->fill(cb, cpuacct_stat_desc[CPUACCT_STAT_USER], val); + + val = 0; + for_each_online_cpu(cpu) { + struct kernel_cpustat *kcpustat = per_cpu_ptr(ca->cpustat, cpu); + val += kcpustat->cpustat[CPUTIME_SYSTEM]; + val += kcpustat->cpustat[CPUTIME_IRQ]; + val += kcpustat->cpustat[CPUTIME_SOFTIRQ]; + } + + val = cputime64_to_clock_t(val); + cb->fill(cb, cpuacct_stat_desc[CPUACCT_STAT_SYSTEM], val); + + return 0; +} + +static struct cftype files[] = { + { + .name = "usage", + .read_u64 = cpuusage_read, + .write_u64 = cpuusage_write, + }, + { + .name = "usage_percpu", + .read_seq_string = cpuacct_percpu_seq_read, + }, + { + .name = "stat", + .read_map = cpuacct_stats_show, + }, + { } /* terminate */ +}; + +/* + * charge this task's execution time to its accounting group. + * + * called with rq->lock held. + */ +void cpuacct_charge(struct task_struct *tsk, u64 cputime) +{ + struct cpuacct *ca; + int cpu; + + if (unlikely(!cpuacct_subsys.active)) + return; + + cpu = task_cpu(tsk); + + rcu_read_lock(); + + ca = task_ca(tsk); + + for (; ca; ca = parent_ca(ca)) { + u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu); + *cpuusage += cputime; + } + + rcu_read_unlock(); +} + +struct cgroup_subsys cpuacct_subsys = { + .name = "cpuacct", + .css_alloc = cpuacct_css_alloc, + .css_free = cpuacct_css_free, + .subsys_id = cpuacct_subsys_id, + .base_cftypes = files, +}; -- cgit v1.2.3 From 60fed7891d4115be0ed7ff9ce6851eda80533c64 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Fri, 29 Mar 2013 14:36:43 +0800 Subject: sched: Split cpuacct code out of sched.h Add cpuacct.h and let sched.h include it. Signed-off-by: Li Zefan Acked-by: Peter Zijlstra Link: http://lkml.kernel.org/r/5155367B.2060506@huawei.com Signed-off-by: Ingo Molnar --- kernel/sched/cpuacct.h | 52 ++++++++++++++++++++++++++++++++++++++++++++++++++ kernel/sched/sched.h | 48 +--------------------------------------------- 2 files changed, 53 insertions(+), 47 deletions(-) create mode 100644 kernel/sched/cpuacct.h (limited to 'kernel/sched') diff --git a/kernel/sched/cpuacct.h b/kernel/sched/cpuacct.h new file mode 100644 index 000000000000..a7f3d4a8f535 --- /dev/null +++ b/kernel/sched/cpuacct.h @@ -0,0 +1,52 @@ +/* Time spent by the tasks of the cpu accounting group executing in ... */ +enum cpuacct_stat_index { + CPUACCT_STAT_USER, /* ... user mode */ + CPUACCT_STAT_SYSTEM, /* ... kernel mode */ + + CPUACCT_STAT_NSTATS, +}; + +#ifdef CONFIG_CGROUP_CPUACCT + +#include +/* track cpu usage of a group of tasks and its child groups */ +struct cpuacct { + struct cgroup_subsys_state css; + /* cpuusage holds pointer to a u64-type object on every cpu */ + u64 __percpu *cpuusage; + struct kernel_cpustat __percpu *cpustat; +}; + +extern struct cgroup_subsys cpuacct_subsys; +extern struct cpuacct root_cpuacct; + +/* return cpu accounting group corresponding to this container */ +static inline struct cpuacct *cgroup_ca(struct cgroup *cgrp) +{ + return container_of(cgroup_subsys_state(cgrp, cpuacct_subsys_id), + struct cpuacct, css); +} + +/* return cpu accounting group to which this task belongs */ +static inline struct cpuacct *task_ca(struct task_struct *tsk) +{ + return container_of(task_subsys_state(tsk, cpuacct_subsys_id), + struct cpuacct, css); +} + +static inline struct cpuacct *parent_ca(struct cpuacct *ca) +{ + if (!ca || !ca->css.cgroup->parent) + return NULL; + return cgroup_ca(ca->css.cgroup->parent); +} + +extern void cpuacct_charge(struct task_struct *tsk, u64 cputime); + +#else + +static inline void cpuacct_charge(struct task_struct *tsk, u64 cputime) +{ +} + +#endif diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 3bd15a43eebc..8116cf8e350f 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -7,6 +7,7 @@ #include #include "cpupri.h" +#include "cpuacct.h" extern __read_mostly int scheduler_running; @@ -950,14 +951,6 @@ static const u32 prio_to_wmult[40] = { /* 15 */ 119304647, 148102320, 186737708, 238609294, 286331153, }; -/* Time spent by the tasks of the cpu accounting group executing in ... */ -enum cpuacct_stat_index { - CPUACCT_STAT_USER, /* ... user mode */ - CPUACCT_STAT_SYSTEM, /* ... kernel mode */ - - CPUACCT_STAT_NSTATS, -}; - #define ENQUEUE_WAKEUP 1 #define ENQUEUE_HEAD 2 #ifdef CONFIG_SMP @@ -1054,45 +1047,6 @@ extern void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime extern void update_idle_cpu_load(struct rq *this_rq); -#ifdef CONFIG_CGROUP_CPUACCT -#include -/* track cpu usage of a group of tasks and its child groups */ -struct cpuacct { - struct cgroup_subsys_state css; - /* cpuusage holds pointer to a u64-type object on every cpu */ - u64 __percpu *cpuusage; - struct kernel_cpustat __percpu *cpustat; -}; - -extern struct cgroup_subsys cpuacct_subsys; -extern struct cpuacct root_cpuacct; - -/* return cpu accounting group corresponding to this container */ -static inline struct cpuacct *cgroup_ca(struct cgroup *cgrp) -{ - return container_of(cgroup_subsys_state(cgrp, cpuacct_subsys_id), - struct cpuacct, css); -} - -/* return cpu accounting group to which this task belongs */ -static inline struct cpuacct *task_ca(struct task_struct *tsk) -{ - return container_of(task_subsys_state(tsk, cpuacct_subsys_id), - struct cpuacct, css); -} - -static inline struct cpuacct *parent_ca(struct cpuacct *ca) -{ - if (!ca || !ca->css.cgroup->parent) - return NULL; - return cgroup_ca(ca->css.cgroup->parent); -} - -extern void cpuacct_charge(struct task_struct *tsk, u64 cputime); -#else -static inline void cpuacct_charge(struct task_struct *tsk, u64 cputime) {} -#endif - #ifdef CONFIG_PARAVIRT static inline u64 steal_ticks(u64 steal) { -- cgit v1.2.3 From dbe4b41f9800223949ce72e4289814697e0ea91a Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Fri, 29 Mar 2013 14:36:55 +0800 Subject: sched/cpuacct: Add cpuacct_init() So we don't open-coded initialization of cpuacct in core.c. Signed-off-by: Li Zefan Acked-by: Peter Zijlstra Link: http://lkml.kernel.org/r/51553687.1060906@huawei.com Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 8 ++------ kernel/sched/cpuacct.c | 7 +++++++ kernel/sched/cpuacct.h | 5 +++++ 3 files changed, 14 insertions(+), 6 deletions(-) (limited to 'kernel/sched') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index c28222f72c80..92930a89529d 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -6936,12 +6936,8 @@ void __init sched_init(void) #endif /* CONFIG_CGROUP_SCHED */ -#ifdef CONFIG_CGROUP_CPUACCT - root_cpuacct.cpustat = &kernel_cpustat; - root_cpuacct.cpuusage = alloc_percpu(u64); - /* Too early, not expected to fail */ - BUG_ON(!root_cpuacct.cpuusage); -#endif + cpuacct_init(); + for_each_possible_cpu(i) { struct rq *rq; diff --git a/kernel/sched/cpuacct.c b/kernel/sched/cpuacct.c index 50ec24b6193d..48b5e9184dcc 100644 --- a/kernel/sched/cpuacct.c +++ b/kernel/sched/cpuacct.c @@ -218,6 +218,13 @@ void cpuacct_charge(struct task_struct *tsk, u64 cputime) rcu_read_unlock(); } +void __init cpuacct_init(void) +{ + root_cpuacct.cpustat = &kernel_cpustat; + root_cpuacct.cpuusage = alloc_percpu(u64); + BUG_ON(!root_cpuacct.cpuusage); /* Too early, not expected to fail */ +} + struct cgroup_subsys cpuacct_subsys = { .name = "cpuacct", .css_alloc = cpuacct_css_alloc, diff --git a/kernel/sched/cpuacct.h b/kernel/sched/cpuacct.h index a7f3d4a8f535..551acd729562 100644 --- a/kernel/sched/cpuacct.h +++ b/kernel/sched/cpuacct.h @@ -41,10 +41,15 @@ static inline struct cpuacct *parent_ca(struct cpuacct *ca) return cgroup_ca(ca->css.cgroup->parent); } +extern void cpuacct_init(void); extern void cpuacct_charge(struct task_struct *tsk, u64 cputime); #else +static inline void cpuacct_init(void) +{ +} + static inline void cpuacct_charge(struct task_struct *tsk, u64 cputime) { } -- cgit v1.2.3 From 1966aaf7d54608e8ddb7ac454b461840e763409a Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Fri, 29 Mar 2013 14:37:06 +0800 Subject: sched/cpuacct: Add cpuacct_acount_field() So we can remove open-coded cpuacct code in cputime.c. Signed-off-by: Li Zefan Acked-by: Peter Zijlstra Link: http://lkml.kernel.org/r/51553692.9060008@huawei.com Signed-off-by: Ingo Molnar --- kernel/sched/cpuacct.c | 23 +++++++++++++++++++++++ kernel/sched/cpuacct.h | 6 ++++++ kernel/sched/cputime.c | 18 +----------------- 3 files changed, 30 insertions(+), 17 deletions(-) (limited to 'kernel/sched') diff --git a/kernel/sched/cpuacct.c b/kernel/sched/cpuacct.c index 48b5e9184dcc..72bd971ea377 100644 --- a/kernel/sched/cpuacct.c +++ b/kernel/sched/cpuacct.c @@ -218,6 +218,29 @@ void cpuacct_charge(struct task_struct *tsk, u64 cputime) rcu_read_unlock(); } +/* + * Add user/system time to cpuacct. + * + * Note: it's the caller that updates the account of the root cgroup. + */ +void cpuacct_account_field(struct task_struct *p, int index, u64 val) +{ + struct kernel_cpustat *kcpustat; + struct cpuacct *ca; + + if (unlikely(!cpuacct_subsys.active)) + return; + + rcu_read_lock(); + ca = task_ca(p); + while (ca && (ca != &root_cpuacct)) { + kcpustat = this_cpu_ptr(ca->cpustat); + kcpustat->cpustat[index] += val; + ca = parent_ca(ca); + } + rcu_read_unlock(); +} + void __init cpuacct_init(void) { root_cpuacct.cpustat = &kernel_cpustat; diff --git a/kernel/sched/cpuacct.h b/kernel/sched/cpuacct.h index 551acd729562..bd0409b85525 100644 --- a/kernel/sched/cpuacct.h +++ b/kernel/sched/cpuacct.h @@ -43,6 +43,7 @@ static inline struct cpuacct *parent_ca(struct cpuacct *ca) extern void cpuacct_init(void); extern void cpuacct_charge(struct task_struct *tsk, u64 cputime); +extern void cpuacct_account_field(struct task_struct *p, int index, u64 val); #else @@ -54,4 +55,9 @@ static inline void cpuacct_charge(struct task_struct *tsk, u64 cputime) { } +static inline void +cpuacct_account_field(struct task_struct *p, int index, u64 val) +{ +} + #endif diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c index 699d59756ece..33508dc78d0c 100644 --- a/kernel/sched/cputime.c +++ b/kernel/sched/cputime.c @@ -115,10 +115,6 @@ static int irqtime_account_si_update(void) static inline void task_group_account_field(struct task_struct *p, int index, u64 tmp) { -#ifdef CONFIG_CGROUP_CPUACCT - struct kernel_cpustat *kcpustat; - struct cpuacct *ca; -#endif /* * Since all updates are sure to touch the root cgroup, we * get ourselves ahead and touch it first. If the root cgroup @@ -127,19 +123,7 @@ static inline void task_group_account_field(struct task_struct *p, int index, */ __get_cpu_var(kernel_cpustat).cpustat[index] += tmp; -#ifdef CONFIG_CGROUP_CPUACCT - if (unlikely(!cpuacct_subsys.active)) - return; - - rcu_read_lock(); - ca = task_ca(p); - while (ca && (ca != &root_cpuacct)) { - kcpustat = this_cpu_ptr(ca->cpustat); - kcpustat->cpustat[index] += tmp; - ca = parent_ca(ca); - } - rcu_read_unlock(); -#endif + cpuacct_account_field(p, index, tmp); } /* -- cgit v1.2.3 From 543bc0e76e6bb84300eaf9833edc5a481f788678 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Fri, 29 Mar 2013 14:37:29 +0800 Subject: sched/cpuacct: Remove redundant NULL checks in cpuacct_charge() This is a micro optimization for the hot path. - We don't need to check if @ca is NULL in parent_ca(). - We don't need to check if @ca is NULL in the beginning of the for loop. Signed-off-by: Li Zefan Acked-by: Peter Zijlstra Link: http://lkml.kernel.org/r/515536A9.5000700@huawei.com Signed-off-by: Ingo Molnar --- kernel/sched/cpuacct.c | 6 +++++- kernel/sched/cpuacct.h | 2 +- 2 files changed, 6 insertions(+), 2 deletions(-) (limited to 'kernel/sched') diff --git a/kernel/sched/cpuacct.c b/kernel/sched/cpuacct.c index 72bd971ea377..b2aaaba16d46 100644 --- a/kernel/sched/cpuacct.c +++ b/kernel/sched/cpuacct.c @@ -210,9 +210,13 @@ void cpuacct_charge(struct task_struct *tsk, u64 cputime) ca = task_ca(tsk); - for (; ca; ca = parent_ca(ca)) { + while (true) { u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu); *cpuusage += cputime; + + ca = parent_ca(ca); + if (!ca) + break; } rcu_read_unlock(); diff --git a/kernel/sched/cpuacct.h b/kernel/sched/cpuacct.h index bd0409b85525..45c168207fcc 100644 --- a/kernel/sched/cpuacct.h +++ b/kernel/sched/cpuacct.h @@ -36,7 +36,7 @@ static inline struct cpuacct *task_ca(struct task_struct *tsk) static inline struct cpuacct *parent_ca(struct cpuacct *ca) { - if (!ca || !ca->css.cgroup->parent) + if (!ca->css.cgroup->parent) return NULL; return cgroup_ca(ca->css.cgroup->parent); } -- cgit v1.2.3 From 5f40d804325e925409907e29f46ecb012090b6c2 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Fri, 29 Mar 2013 14:37:43 +0800 Subject: sched/cpuacct: Remove redundant NULL checks in cpuacct_acount_field() This is a micro optimazation for a hot path. - We don't need to check if @ca returned from task_ca() is NULL. - We don't need to check if @ca returned from parent_ca() is NULL. Signed-off-by: Li Zefan Acked-by: Peter Zijlstra Link: http://lkml.kernel.org/r/515536B7.6060602@huawei.com Signed-off-by: Ingo Molnar --- kernel/sched/cpuacct.c | 4 ++-- kernel/sched/cpuacct.h | 5 +++++ 2 files changed, 7 insertions(+), 2 deletions(-) (limited to 'kernel/sched') diff --git a/kernel/sched/cpuacct.c b/kernel/sched/cpuacct.c index b2aaaba16d46..071ae8d08181 100644 --- a/kernel/sched/cpuacct.c +++ b/kernel/sched/cpuacct.c @@ -237,10 +237,10 @@ void cpuacct_account_field(struct task_struct *p, int index, u64 val) rcu_read_lock(); ca = task_ca(p); - while (ca && (ca != &root_cpuacct)) { + while (ca != &root_cpuacct) { kcpustat = this_cpu_ptr(ca->cpustat); kcpustat->cpustat[index] += val; - ca = parent_ca(ca); + ca = __parent_ca(ca); } rcu_read_unlock(); } diff --git a/kernel/sched/cpuacct.h b/kernel/sched/cpuacct.h index 45c168207fcc..b2f79ad1e524 100644 --- a/kernel/sched/cpuacct.h +++ b/kernel/sched/cpuacct.h @@ -34,6 +34,11 @@ static inline struct cpuacct *task_ca(struct task_struct *tsk) struct cpuacct, css); } +static inline struct cpuacct *__parent_ca(struct cpuacct *ca) +{ + return cgroup_ca(ca->css.cgroup->parent); +} + static inline struct cpuacct *parent_ca(struct cpuacct *ca) { if (!ca->css.cgroup->parent) -- cgit v1.2.3 From d1712796a880bea0a44739941116001923f3275b Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Fri, 29 Mar 2013 14:38:13 +0800 Subject: sched/cpuacct: Clean up cpuacct.h Now most of the code in cpuacct.h can be moved to cpuacct.c Signed-off-by: Li Zefan Acked-by: Peter Zijlstra Link: http://lkml.kernel.org/r/515536D5.2080401@huawei.com Signed-off-by: Ingo Molnar --- kernel/sched/cpuacct.c | 44 +++++++++++++++++++++++++++++++++++++++++++- kernel/sched/cpuacct.h | 46 ---------------------------------------------- 2 files changed, 43 insertions(+), 47 deletions(-) (limited to 'kernel/sched') diff --git a/kernel/sched/cpuacct.c b/kernel/sched/cpuacct.c index 071ae8d08181..9305fd2f8cf9 100644 --- a/kernel/sched/cpuacct.c +++ b/kernel/sched/cpuacct.c @@ -16,7 +16,49 @@ * (balbir@in.ibm.com). */ -struct cpuacct root_cpuacct; +/* Time spent by the tasks of the cpu accounting group executing in ... */ +enum cpuacct_stat_index { + CPUACCT_STAT_USER, /* ... user mode */ + CPUACCT_STAT_SYSTEM, /* ... kernel mode */ + + CPUACCT_STAT_NSTATS, +}; + +/* track cpu usage of a group of tasks and its child groups */ +struct cpuacct { + struct cgroup_subsys_state css; + /* cpuusage holds pointer to a u64-type object on every cpu */ + u64 __percpu *cpuusage; + struct kernel_cpustat __percpu *cpustat; +}; + +/* return cpu accounting group corresponding to this container */ +static inline struct cpuacct *cgroup_ca(struct cgroup *cgrp) +{ + return container_of(cgroup_subsys_state(cgrp, cpuacct_subsys_id), + struct cpuacct, css); +} + +/* return cpu accounting group to which this task belongs */ +static inline struct cpuacct *task_ca(struct task_struct *tsk) +{ + return container_of(task_subsys_state(tsk, cpuacct_subsys_id), + struct cpuacct, css); +} + +static inline struct cpuacct *__parent_ca(struct cpuacct *ca) +{ + return cgroup_ca(ca->css.cgroup->parent); +} + +static inline struct cpuacct *parent_ca(struct cpuacct *ca) +{ + if (!ca->css.cgroup->parent) + return NULL; + return cgroup_ca(ca->css.cgroup->parent); +} + +static struct cpuacct root_cpuacct; /* create a new cpu accounting group */ static struct cgroup_subsys_state *cpuacct_css_alloc(struct cgroup *cgrp) diff --git a/kernel/sched/cpuacct.h b/kernel/sched/cpuacct.h index b2f79ad1e524..51cd76eb4f0f 100644 --- a/kernel/sched/cpuacct.h +++ b/kernel/sched/cpuacct.h @@ -1,51 +1,5 @@ -/* Time spent by the tasks of the cpu accounting group executing in ... */ -enum cpuacct_stat_index { - CPUACCT_STAT_USER, /* ... user mode */ - CPUACCT_STAT_SYSTEM, /* ... kernel mode */ - - CPUACCT_STAT_NSTATS, -}; - #ifdef CONFIG_CGROUP_CPUACCT -#include -/* track cpu usage of a group of tasks and its child groups */ -struct cpuacct { - struct cgroup_subsys_state css; - /* cpuusage holds pointer to a u64-type object on every cpu */ - u64 __percpu *cpuusage; - struct kernel_cpustat __percpu *cpustat; -}; - -extern struct cgroup_subsys cpuacct_subsys; -extern struct cpuacct root_cpuacct; - -/* return cpu accounting group corresponding to this container */ -static inline struct cpuacct *cgroup_ca(struct cgroup *cgrp) -{ - return container_of(cgroup_subsys_state(cgrp, cpuacct_subsys_id), - struct cpuacct, css); -} - -/* return cpu accounting group to which this task belongs */ -static inline struct cpuacct *task_ca(struct task_struct *tsk) -{ - return container_of(task_subsys_state(tsk, cpuacct_subsys_id), - struct cpuacct, css); -} - -static inline struct cpuacct *__parent_ca(struct cpuacct *ca) -{ - return cgroup_ca(ca->css.cgroup->parent); -} - -static inline struct cpuacct *parent_ca(struct cpuacct *ca) -{ - if (!ca->css.cgroup->parent) - return NULL; - return cgroup_ca(ca->css.cgroup->parent); -} - extern void cpuacct_init(void); extern void cpuacct_charge(struct task_struct *tsk, u64 cputime); extern void cpuacct_account_field(struct task_struct *p, int index, u64 val); -- cgit v1.2.3 From 7943e15a3e91db78a7a3fbc84e45cf9d1c7c7d23 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Fri, 29 Mar 2013 14:43:46 +0800 Subject: sched/cpuacct: Allocate per_cpu cpuusage for root cpuacct statically This is a preparation, so later we can initialize cpuacct earlier. Signed-off-by: Li Zefan Cc: Tejun Heo Acked-by: Peter Zijlstra Link: http://lkml.kernel.org/r/51553822.5000403@huawei.com Signed-off-by: Ingo Molnar --- kernel/sched/cpuacct.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel/sched') diff --git a/kernel/sched/cpuacct.c b/kernel/sched/cpuacct.c index 9305fd2f8cf9..a691c4dd65be 100644 --- a/kernel/sched/cpuacct.c +++ b/kernel/sched/cpuacct.c @@ -58,6 +58,7 @@ static inline struct cpuacct *parent_ca(struct cpuacct *ca) return cgroup_ca(ca->css.cgroup->parent); } +static DEFINE_PER_CPU(u64, root_cpuacct_cpuusage); static struct cpuacct root_cpuacct; /* create a new cpu accounting group */ @@ -290,8 +291,7 @@ void cpuacct_account_field(struct task_struct *p, int index, u64 val) void __init cpuacct_init(void) { root_cpuacct.cpustat = &kernel_cpustat; - root_cpuacct.cpuusage = alloc_percpu(u64); - BUG_ON(!root_cpuacct.cpuusage); /* Too early, not expected to fail */ + root_cpuacct.cpuusage = &root_cpuacct_cpuusage; } struct cgroup_subsys cpuacct_subsys = { -- cgit v1.2.3 From 14c6d3c8a47ced185b6375c4940b5b393f1a294e Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Fri, 29 Mar 2013 14:44:04 +0800 Subject: sched/cpuacct: Initialize root cpuacct earlier Now we don't need cpuacct_init(), and instead we just initialize root_cpuacct when it's defined. Signed-off-by: Li Zefan Cc: Tejun Heo Acked-by: Peter Zijlstra Link: http://lkml.kernel.org/r/51553834.9090701@huawei.com Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 2 -- kernel/sched/cpuacct.c | 11 ++++------- kernel/sched/cpuacct.h | 5 ----- 3 files changed, 4 insertions(+), 14 deletions(-) (limited to 'kernel/sched') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 92930a89529d..ee8c1bd703fe 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -6936,8 +6936,6 @@ void __init sched_init(void) #endif /* CONFIG_CGROUP_SCHED */ - cpuacct_init(); - for_each_possible_cpu(i) { struct rq *rq; diff --git a/kernel/sched/cpuacct.c b/kernel/sched/cpuacct.c index a691c4dd65be..04255814a0ed 100644 --- a/kernel/sched/cpuacct.c +++ b/kernel/sched/cpuacct.c @@ -59,7 +59,10 @@ static inline struct cpuacct *parent_ca(struct cpuacct *ca) } static DEFINE_PER_CPU(u64, root_cpuacct_cpuusage); -static struct cpuacct root_cpuacct; +static struct cpuacct root_cpuacct = { + .cpustat = &kernel_cpustat, + .cpuusage = &root_cpuacct_cpuusage, +}; /* create a new cpu accounting group */ static struct cgroup_subsys_state *cpuacct_css_alloc(struct cgroup *cgrp) @@ -288,12 +291,6 @@ void cpuacct_account_field(struct task_struct *p, int index, u64 val) rcu_read_unlock(); } -void __init cpuacct_init(void) -{ - root_cpuacct.cpustat = &kernel_cpustat; - root_cpuacct.cpuusage = &root_cpuacct_cpuusage; -} - struct cgroup_subsys cpuacct_subsys = { .name = "cpuacct", .css_alloc = cpuacct_css_alloc, diff --git a/kernel/sched/cpuacct.h b/kernel/sched/cpuacct.h index 51cd76eb4f0f..ed605624a5e7 100644 --- a/kernel/sched/cpuacct.h +++ b/kernel/sched/cpuacct.h @@ -1,15 +1,10 @@ #ifdef CONFIG_CGROUP_CPUACCT -extern void cpuacct_init(void); extern void cpuacct_charge(struct task_struct *tsk, u64 cputime); extern void cpuacct_account_field(struct task_struct *p, int index, u64 val); #else -static inline void cpuacct_init(void) -{ -} - static inline void cpuacct_charge(struct task_struct *tsk, u64 cputime) { } -- cgit v1.2.3 From 621e2de02403a6f776852c564b79c38bf3cc9032 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Fri, 29 Mar 2013 14:44:15 +0800 Subject: sched/cpuacct: Initialize cpuacct subsystem earlier Initialize cpuacct before the scheduler is functioning, so when cpuacct_charge() and cpuacct_account_field() are called, task_ca() won't return NULL. Signed-off-by: Li Zefan Cc: Tejun Heo Acked-by: Peter Zijlstra Link: http://lkml.kernel.org/r/5155383F.8000005@huawei.com Signed-off-by: Ingo Molnar --- kernel/sched/cpuacct.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) (limited to 'kernel/sched') diff --git a/kernel/sched/cpuacct.c b/kernel/sched/cpuacct.c index 04255814a0ed..75e46d291f9c 100644 --- a/kernel/sched/cpuacct.c +++ b/kernel/sched/cpuacct.c @@ -292,9 +292,10 @@ void cpuacct_account_field(struct task_struct *p, int index, u64 val) } struct cgroup_subsys cpuacct_subsys = { - .name = "cpuacct", - .css_alloc = cpuacct_css_alloc, - .css_free = cpuacct_css_free, - .subsys_id = cpuacct_subsys_id, - .base_cftypes = files, + .name = "cpuacct", + .css_alloc = cpuacct_css_alloc, + .css_free = cpuacct_css_free, + .subsys_id = cpuacct_subsys_id, + .base_cftypes = files, + .early_init = 1, }; -- cgit v1.2.3 From a2b0ae25fc8bfeeb4022b8e847ab811b3c8368d1 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Fri, 29 Mar 2013 14:44:28 +0800 Subject: sched/cpuacct: No need to check subsys active state Now we're guaranteed when cpuacct_charge() and cpuacct_account_field() are called, cpuacct has already been properly initialized, so we no longer need those checks. Signed-off-by: Li Zefan Cc: Tejun Heo Acked-by: Peter Zijlstra Link: http://lkml.kernel.org/r/5155384C.7000508@huawei.com Signed-off-by: Ingo Molnar --- kernel/sched/cpuacct.c | 6 ------ 1 file changed, 6 deletions(-) (limited to 'kernel/sched') diff --git a/kernel/sched/cpuacct.c b/kernel/sched/cpuacct.c index 75e46d291f9c..ef57ab658722 100644 --- a/kernel/sched/cpuacct.c +++ b/kernel/sched/cpuacct.c @@ -247,9 +247,6 @@ void cpuacct_charge(struct task_struct *tsk, u64 cputime) struct cpuacct *ca; int cpu; - if (unlikely(!cpuacct_subsys.active)) - return; - cpu = task_cpu(tsk); rcu_read_lock(); @@ -278,9 +275,6 @@ void cpuacct_account_field(struct task_struct *p, int index, u64 val) struct kernel_cpustat *kcpustat; struct cpuacct *ca; - if (unlikely(!cpuacct_subsys.active)) - return; - rcu_read_lock(); ca = task_ca(p); while (ca != &root_cpuacct) { -- cgit v1.2.3 From b329fd5b018ffd64cfef6a2551bb2ca4bbfbacf2 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 10 Apr 2013 15:10:50 +0200 Subject: sched/cpuacct/UML: Fix header file dependency bug on the UML build The cpuacct split caused this build failure on UML: kernel/sched/cpuacct.c:94:2: error: implicit declaration of function 'ERR_PTR' Cc: Li Zefan Cc: Peter Zijlstra Signed-off-by: Ingo Molnar --- kernel/sched/cpuacct.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel/sched') diff --git a/kernel/sched/cpuacct.c b/kernel/sched/cpuacct.c index ef57ab658722..dbb7e2cd95eb 100644 --- a/kernel/sched/cpuacct.c +++ b/kernel/sched/cpuacct.c @@ -6,6 +6,7 @@ #include #include #include +#include #include "sched.h" -- cgit v1.2.3 From 41fcb9f230bf773656d1768b73000ef720bf00c3 Mon Sep 17 00:00:00 2001 From: Waiman Long Date: Wed, 17 Apr 2013 15:23:11 -0400 Subject: mutex: Move mutex spinning code from sched/core.c back to mutex.c As mentioned by Ingo, the SCHED_FEAT_OWNER_SPIN scheduler feature bit was really just an early hack to make with/without mutex-spinning testable. So it is no longer necessary. This patch removes the SCHED_FEAT_OWNER_SPIN feature bit and move the mutex spinning code from kernel/sched/core.c back to kernel/mutex.c which is where they should belong. Signed-off-by: Waiman Long Cc: Linus Torvalds Cc: Andrew Morton Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Chandramouleeswaran Aswin Cc: Davidlohr Bueso Cc: Norton Scott J Cc: Rik van Riel Cc: Paul E. McKenney Cc: David Howells Cc: Dave Jones Cc: Clark Williams Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/1366226594-5506-2-git-send-email-Waiman.Long@hp.com Signed-off-by: Ingo Molnar --- include/linux/sched.h | 1 - kernel/mutex.c | 46 ++++++++++++++++++++++++++++++++++++++++++++++ kernel/sched/core.c | 45 --------------------------------------------- kernel/sched/features.h | 7 ------- 4 files changed, 46 insertions(+), 53 deletions(-) (limited to 'kernel/sched') diff --git a/include/linux/sched.h b/include/linux/sched.h index d35d2b6ddbfb..aefe45d79f53 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -320,7 +320,6 @@ extern signed long schedule_timeout_killable(signed long timeout); extern signed long schedule_timeout_uninterruptible(signed long timeout); asmlinkage void schedule(void); extern void schedule_preempt_disabled(void); -extern int mutex_spin_on_owner(struct mutex *lock, struct task_struct *owner); struct nsproxy; struct user_namespace; diff --git a/kernel/mutex.c b/kernel/mutex.c index 52f23011b6e0..262d7177adad 100644 --- a/kernel/mutex.c +++ b/kernel/mutex.c @@ -95,6 +95,52 @@ void __sched mutex_lock(struct mutex *lock) EXPORT_SYMBOL(mutex_lock); #endif +#ifdef CONFIG_MUTEX_SPIN_ON_OWNER +/* + * Mutex spinning code migrated from kernel/sched/core.c + */ + +static inline bool owner_running(struct mutex *lock, struct task_struct *owner) +{ + if (lock->owner != owner) + return false; + + /* + * Ensure we emit the owner->on_cpu, dereference _after_ checking + * lock->owner still matches owner, if that fails, owner might + * point to free()d memory, if it still matches, the rcu_read_lock() + * ensures the memory stays valid. + */ + barrier(); + + return owner->on_cpu; +} + +/* + * Look out! "owner" is an entirely speculative pointer + * access and not reliable. + */ +static noinline +int mutex_spin_on_owner(struct mutex *lock, struct task_struct *owner) +{ + rcu_read_lock(); + while (owner_running(lock, owner)) { + if (need_resched()) + break; + + arch_mutex_cpu_relax(); + } + rcu_read_unlock(); + + /* + * We break out the loop above on need_resched() and when the + * owner changed, which is a sign for heavy contention. Return + * success only when lock->owner is NULL. + */ + return lock->owner == NULL; +} +#endif + static __used noinline void __sched __mutex_unlock_slowpath(atomic_t *lock_count); /** diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 7f12624a393c..b37a22b99e0e 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -2997,51 +2997,6 @@ void __sched schedule_preempt_disabled(void) preempt_disable(); } -#ifdef CONFIG_MUTEX_SPIN_ON_OWNER - -static inline bool owner_running(struct mutex *lock, struct task_struct *owner) -{ - if (lock->owner != owner) - return false; - - /* - * Ensure we emit the owner->on_cpu, dereference _after_ checking - * lock->owner still matches owner, if that fails, owner might - * point to free()d memory, if it still matches, the rcu_read_lock() - * ensures the memory stays valid. - */ - barrier(); - - return owner->on_cpu; -} - -/* - * Look out! "owner" is an entirely speculative pointer - * access and not reliable. - */ -int mutex_spin_on_owner(struct mutex *lock, struct task_struct *owner) -{ - if (!sched_feat(OWNER_SPIN)) - return 0; - - rcu_read_lock(); - while (owner_running(lock, owner)) { - if (need_resched()) - break; - - arch_mutex_cpu_relax(); - } - rcu_read_unlock(); - - /* - * We break out the loop above on need_resched() and when the - * owner changed, which is a sign for heavy contention. Return - * success only when lock->owner is NULL. - */ - return lock->owner == NULL; -} -#endif - #ifdef CONFIG_PREEMPT /* * this is the entry point to schedule() from in-kernel preemption diff --git a/kernel/sched/features.h b/kernel/sched/features.h index 1ad1d2b5395f..99399f8e4799 100644 --- a/kernel/sched/features.h +++ b/kernel/sched/features.h @@ -45,13 +45,6 @@ SCHED_FEAT(HRTICK, false) SCHED_FEAT(DOUBLE_TICK, false) SCHED_FEAT(LB_BIAS, true) -/* - * Spin-wait on mutex acquisition when the mutex owner is running on - * another cpu -- assumes that when the owner is running, it will soon - * release the lock. Decreases scheduling overhead. - */ -SCHED_FEAT(OWNER_SPIN, true) - /* * Decrement CPU power based on time not spent running tasks */ -- cgit v1.2.3 From 642dbc39ab1ea00f47e0fee1b8e8a27da036d940 Mon Sep 17 00:00:00 2001 From: Vincent Guittot Date: Thu, 18 Apr 2013 18:34:26 +0200 Subject: sched: Fix wrong rq's runnable_avg update with rt tasks The current update of the rq's load can be erroneous when RT tasks are involved. The update of the load of a rq that becomes idle, is done only if the avg_idle is less than sysctl_sched_migration_cost. If RT tasks and short idle duration alternate, the runnable_avg will not be updated correctly and the time will be accounted as idle time when a CFS task wakes up. A new idle_enter function is called when the next task is the idle function so the elapsed time will be accounted as run time in the load of the rq, whatever the average idle time is. The function update_rq_runnable_avg is removed from idle_balance. When a RT task is scheduled on an idle CPU, the update of the rq's load is not done when the rq exit idle state because CFS's functions are not called. Then, the idle_balance, which is called just before entering the idle function, updates the rq's load and makes the assumption that the elapsed time since the last update, was only running time. As a consequence, the rq's load of a CPU that only runs a periodic RT task, is close to LOAD_AVG_MAX whatever the running duration of the RT task is. A new idle_exit function is called when the prev task is the idle function so the elapsed time will be accounted as idle time in the rq's load. Signed-off-by: Vincent Guittot Acked-by: Peter Zijlstra Acked-by: Steven Rostedt Cc: linaro-kernel@lists.linaro.org Cc: peterz@infradead.org Cc: pjt@google.com Cc: fweisbec@gmail.com Cc: efault@gmx.de Link: http://lkml.kernel.org/r/1366302867-5055-1-git-send-email-vincent.guittot@linaro.org Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 23 +++++++++++++++++++++-- kernel/sched/idle_task.c | 16 ++++++++++++++++ kernel/sched/sched.h | 12 ++++++++++++ 3 files changed, 49 insertions(+), 2 deletions(-) (limited to 'kernel/sched') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 155783b4e4bf..1c977350e322 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -1563,6 +1563,27 @@ static inline void dequeue_entity_load_avg(struct cfs_rq *cfs_rq, se->avg.decay_count = atomic64_read(&cfs_rq->decay_counter); } /* migrations, e.g. sleep=0 leave decay_count == 0 */ } + +/* + * Update the rq's load with the elapsed running time before entering + * idle. if the last scheduled task is not a CFS task, idle_enter will + * be the only way to update the runnable statistic. + */ +void idle_enter_fair(struct rq *this_rq) +{ + update_rq_runnable_avg(this_rq, 1); +} + +/* + * Update the rq's load with the elapsed idle time before a task is + * scheduled. if the newly scheduled task is not a CFS task, idle_exit will + * be the only way to update the runnable statistic. + */ +void idle_exit_fair(struct rq *this_rq) +{ + update_rq_runnable_avg(this_rq, 0); +} + #else static inline void update_entity_load_avg(struct sched_entity *se, int update_cfs_rq) {} @@ -5217,8 +5238,6 @@ void idle_balance(int this_cpu, struct rq *this_rq) if (this_rq->avg_idle < sysctl_sched_migration_cost) return; - update_rq_runnable_avg(this_rq, 1); - /* * Drop the rq->lock, but keep IRQ/preempt disabled. */ diff --git a/kernel/sched/idle_task.c b/kernel/sched/idle_task.c index b6baf370cae9..b8ce77328341 100644 --- a/kernel/sched/idle_task.c +++ b/kernel/sched/idle_task.c @@ -13,6 +13,16 @@ select_task_rq_idle(struct task_struct *p, int sd_flag, int flags) { return task_cpu(p); /* IDLE tasks as never migrated */ } + +static void pre_schedule_idle(struct rq *rq, struct task_struct *prev) +{ + idle_exit_fair(rq); +} + +static void post_schedule_idle(struct rq *rq) +{ + idle_enter_fair(rq); +} #endif /* CONFIG_SMP */ /* * Idle tasks are unconditionally rescheduled: @@ -25,6 +35,10 @@ static void check_preempt_curr_idle(struct rq *rq, struct task_struct *p, int fl static struct task_struct *pick_next_task_idle(struct rq *rq) { schedstat_inc(rq, sched_goidle); +#ifdef CONFIG_SMP + /* Trigger the post schedule to do an idle_enter for CFS */ + rq->post_schedule = 1; +#endif return rq->idle; } @@ -86,6 +100,8 @@ const struct sched_class idle_sched_class = { #ifdef CONFIG_SMP .select_task_rq = select_task_rq_idle, + .pre_schedule = pre_schedule_idle, + .post_schedule = post_schedule_idle, #endif .set_curr_task = set_curr_task_idle, diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 8116cf8e350f..605426a63588 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -1024,6 +1024,18 @@ extern void update_group_power(struct sched_domain *sd, int cpu); extern void trigger_load_balance(struct rq *rq, int cpu); extern void idle_balance(int this_cpu, struct rq *this_rq); +/* + * Only depends on SMP, FAIR_GROUP_SCHED may be removed when runnable_avg + * becomes useful in lb + */ +#if defined(CONFIG_FAIR_GROUP_SCHED) +extern void idle_enter_fair(struct rq *this_rq); +extern void idle_exit_fair(struct rq *this_rq); +#else +static inline void idle_enter_fair(struct rq *this_rq) {} +static inline void idle_exit_fair(struct rq *this_rq) {} +#endif + #else /* CONFIG_SMP */ static inline void idle_balance(int cpu, struct rq *rq) -- cgit v1.2.3 From f1cd0858100c67273f2c74344e0c464344c4a982 Mon Sep 17 00:00:00 2001 From: Joonsoo Kim Date: Tue, 23 Apr 2013 17:27:37 +0900 Subject: sched: Change position of resched_cpu() in load_balance() cur_ld_moved is reset if env.flags hit LBF_NEED_BREAK. So, there is possibility that we miss doing resched_cpu(). Correct it as changing position of resched_cpu() before checking LBF_NEED_BREAK. Signed-off-by: Joonsoo Kim Tested-by: Jason Low Acked-by: Peter Zijlstra Cc: Srivatsa Vaddagiri Cc: Davidlohr Bueso Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/1366705662-3587-2-git-send-email-iamjoonsoo.kim@lge.com Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'kernel/sched') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 1c977350e322..25aaf93281de 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -5080,17 +5080,17 @@ more_balance: double_rq_unlock(env.dst_rq, busiest); local_irq_restore(flags); - if (env.flags & LBF_NEED_BREAK) { - env.flags &= ~LBF_NEED_BREAK; - goto more_balance; - } - /* * some other cpu did the load balance for us. */ if (cur_ld_moved && env.dst_cpu != smp_processor_id()) resched_cpu(env.dst_cpu); + if (env.flags & LBF_NEED_BREAK) { + env.flags &= ~LBF_NEED_BREAK; + goto more_balance; + } + /* * Revisit (affine) tasks on src_cpu that couldn't be moved to * us and move them to an alternate dst_cpu in our sched_group -- cgit v1.2.3 From de5eb2dd7f171ee8a45d23cd41aa2efe9ab922b3 Mon Sep 17 00:00:00 2001 From: Joonsoo Kim Date: Tue, 23 Apr 2013 17:27:38 +0900 Subject: sched: Explicitly cpu_idle_type checking in rebalance_domains() After commit 88b8dac0, dst-cpu can be changed in load_balance(), then we can't know cpu_idle_type of dst-cpu when load_balance() return positive. So, add explicit cpu_idle_type checking. Signed-off-by: Joonsoo Kim Tested-by: Jason Low Acked-by: Peter Zijlstra Cc: Srivatsa Vaddagiri Cc: Davidlohr Bueso Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/1366705662-3587-3-git-send-email-iamjoonsoo.kim@lge.com Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'kernel/sched') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 25aaf93281de..726e12905725 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -5523,10 +5523,11 @@ static void rebalance_domains(int cpu, enum cpu_idle_type idle) if (time_after_eq(jiffies, sd->last_balance + interval)) { if (load_balance(cpu, rq, sd, idle, &balance)) { /* - * We've pulled tasks over so either we're no - * longer idle. + * The LBF_SOME_PINNED logic could have changed + * env->dst_cpu, so we can't know our idle + * state even if we migrated tasks. Update it. */ - idle = CPU_NOT_IDLE; + idle = idle_cpu(cpu) ? CPU_IDLE : CPU_NOT_IDLE; } sd->last_balance = jiffies; } -- cgit v1.2.3 From cfc03118047172f5bdc58d63c607d16d33ce5305 Mon Sep 17 00:00:00 2001 From: Joonsoo Kim Date: Tue, 23 Apr 2013 17:27:39 +0900 Subject: sched: Don't consider other cpus in our group in case of NEWLY_IDLE Commit 88b8dac0 makes load_balance() consider other cpus in its group, regardless of idle type. When we do NEWLY_IDLE balancing, we should not consider it, because a motivation of NEWLY_IDLE balancing is to turn this cpu to non idle state if needed. This is not the case of other cpus. So, change code not to consider other cpus for NEWLY_IDLE balancing. With this patch, assign 'if (pulled_task) this_rq->idle_stamp = 0' in idle_balance() is corrected, because NEWLY_IDLE balancing doesn't consider other cpus. Assigning to 'this_rq->idle_stamp' is now valid. Signed-off-by: Joonsoo Kim Tested-by: Jason Low Acked-by: Peter Zijlstra Cc: Srivatsa Vaddagiri Cc: Davidlohr Bueso Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/1366705662-3587-4-git-send-email-iamjoonsoo.kim@lge.com Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) (limited to 'kernel/sched') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 726e12905725..dfa92b7b3dec 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -5026,8 +5026,21 @@ static int load_balance(int this_cpu, struct rq *this_rq, .cpus = cpus, }; + /* + * For NEWLY_IDLE load_balancing, we don't need to consider + * other cpus in our group + */ + if (idle == CPU_NEWLY_IDLE) { + env.dst_grpmask = NULL; + /* + * we don't care max_lb_iterations in this case, + * in following patch, this will be removed + */ + max_lb_iterations = 0; + } else + max_lb_iterations = cpumask_weight(env.dst_grpmask); + cpumask_copy(cpus, cpu_active_mask); - max_lb_iterations = cpumask_weight(env.dst_grpmask); schedstat_inc(sd, lb_count[idle]); -- cgit v1.2.3 From d31980846f9688db3ee3e5863525c6ff8ace4c7c Mon Sep 17 00:00:00 2001 From: Joonsoo Kim Date: Tue, 23 Apr 2013 17:27:40 +0900 Subject: sched: Move up affinity check to mitigate useless redoing overhead Currently, LBF_ALL_PINNED is cleared after affinity check is passed. So, if task migration is skipped by small load value or small imbalance value in move_tasks(), we don't clear LBF_ALL_PINNED. At last, we trigger 'redo' in load_balance(). Imbalance value is often so small that any tasks cannot be moved to other cpus and, of course, this situation may be continued after we change the target cpu. So this patch move up affinity check code and clear LBF_ALL_PINNED before evaluating load value in order to mitigate useless redoing overhead. In addition, re-order some comments correctly. Signed-off-by: Joonsoo Kim Acked-by: Peter Zijlstra Tested-by: Jason Low Cc: Srivatsa Vaddagiri Cc: Davidlohr Bueso Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/1366705662-3587-5-git-send-email-iamjoonsoo.kim@lge.com Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 16 +++++++--------- 1 file changed, 7 insertions(+), 9 deletions(-) (limited to 'kernel/sched') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index dfa92b7b3dec..b8ef321641df 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -3896,10 +3896,14 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env) int tsk_cache_hot = 0; /* * We do not migrate tasks that are: - * 1) running (obviously), or + * 1) throttled_lb_pair, or * 2) cannot be migrated to this CPU due to cpus_allowed, or - * 3) are cache-hot on their current CPU. + * 3) running (obviously), or + * 4) are cache-hot on their current CPU. */ + if (throttled_lb_pair(task_group(p), env->src_cpu, env->dst_cpu)) + return 0; + if (!cpumask_test_cpu(env->dst_cpu, tsk_cpus_allowed(p))) { int new_dst_cpu; @@ -3967,9 +3971,6 @@ static int move_one_task(struct lb_env *env) struct task_struct *p, *n; list_for_each_entry_safe(p, n, &env->src_rq->cfs_tasks, se.group_node) { - if (throttled_lb_pair(task_group(p), env->src_rq->cpu, env->dst_cpu)) - continue; - if (!can_migrate_task(p, env)) continue; @@ -4021,7 +4022,7 @@ static int move_tasks(struct lb_env *env) break; } - if (throttled_lb_pair(task_group(p), env->src_cpu, env->dst_cpu)) + if (!can_migrate_task(p, env)) goto next; load = task_h_load(p); @@ -4032,9 +4033,6 @@ static int move_tasks(struct lb_env *env) if ((load / 2) > env->imbalance) goto next; - if (!can_migrate_task(p, env)) - goto next; - move_task(p, env); pulled++; env->imbalance -= load; -- cgit v1.2.3 From e6252c3ef4b9cd251b53f7b68035f395d20b044e Mon Sep 17 00:00:00 2001 From: Joonsoo Kim Date: Tue, 23 Apr 2013 17:27:41 +0900 Subject: sched: Rename load_balance_tmpmask to load_balance_mask This name doesn't represent specific meaning. So rename it to imply it's purpose. Signed-off-by: Joonsoo Kim Acked-by: Peter Zijlstra Tested-by: Jason Low Cc: Srivatsa Vaddagiri Cc: Davidlohr Bueso Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/1366705662-3587-6-git-send-email-iamjoonsoo.kim@lge.com Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 4 ++-- kernel/sched/fair.c | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) (limited to 'kernel/sched') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index ee8c1bd703fe..cb49b2ab0e16 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -6873,7 +6873,7 @@ struct task_group root_task_group; LIST_HEAD(task_groups); #endif -DECLARE_PER_CPU(cpumask_var_t, load_balance_tmpmask); +DECLARE_PER_CPU(cpumask_var_t, load_balance_mask); void __init sched_init(void) { @@ -6910,7 +6910,7 @@ void __init sched_init(void) #endif /* CONFIG_RT_GROUP_SCHED */ #ifdef CONFIG_CPUMASK_OFFSTACK for_each_possible_cpu(i) { - per_cpu(load_balance_tmpmask, i) = (void *)ptr; + per_cpu(load_balance_mask, i) = (void *)ptr; ptr += cpumask_size(); } #endif /* CONFIG_CPUMASK_OFFSTACK */ diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index b8ef321641df..5b1e96687b49 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -4977,7 +4977,7 @@ static struct rq *find_busiest_queue(struct lb_env *env, #define MAX_PINNED_INTERVAL 512 /* Working cpumask for load_balance and load_balance_newidle. */ -DEFINE_PER_CPU(cpumask_var_t, load_balance_tmpmask); +DEFINE_PER_CPU(cpumask_var_t, load_balance_mask); static int need_active_balance(struct lb_env *env) { @@ -5012,7 +5012,7 @@ static int load_balance(int this_cpu, struct rq *this_rq, struct sched_group *group; struct rq *busiest; unsigned long flags; - struct cpumask *cpus = __get_cpu_var(load_balance_tmpmask); + struct cpumask *cpus = __get_cpu_var(load_balance_mask); struct lb_env env = { .sd = sd, -- cgit v1.2.3 From e02e60c109ca70935bad1131976bdbf5160cf576 Mon Sep 17 00:00:00 2001 From: Joonsoo Kim Date: Tue, 23 Apr 2013 17:27:42 +0900 Subject: sched: Prevent to re-select dst-cpu in load_balance() Commit 88b8dac0 makes load_balance() consider other cpus in its group. But, in that, there is no code for preventing to re-select dst-cpu. So, same dst-cpu can be selected over and over. This patch add functionality to load_balance() in order to exclude cpu which is selected once. We prevent to re-select dst_cpu via env's cpus, so now, env's cpus is a candidate not only for src_cpus, but also dst_cpus. With this patch, we can remove lb_iterations and max_lb_iterations, because we decide whether we can go ahead or not via env's cpus. Signed-off-by: Joonsoo Kim Acked-by: Peter Zijlstra Tested-by: Jason Low Cc: Srivatsa Vaddagiri Cc: Davidlohr Bueso Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/1366705662-3587-7-git-send-email-iamjoonsoo.kim@lge.com Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 33 +++++++++++++++------------------ 1 file changed, 15 insertions(+), 18 deletions(-) (limited to 'kernel/sched') diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 5b1e96687b49..acaf567a03d2 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -3905,7 +3905,7 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env) return 0; if (!cpumask_test_cpu(env->dst_cpu, tsk_cpus_allowed(p))) { - int new_dst_cpu; + int cpu; schedstat_inc(p, se.statistics.nr_failed_migrations_affine); @@ -3920,12 +3920,15 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env) if (!env->dst_grpmask || (env->flags & LBF_SOME_PINNED)) return 0; - new_dst_cpu = cpumask_first_and(env->dst_grpmask, - tsk_cpus_allowed(p)); - if (new_dst_cpu < nr_cpu_ids) { - env->flags |= LBF_SOME_PINNED; - env->new_dst_cpu = new_dst_cpu; + /* Prevent to re-select dst_cpu via env's cpus */ + for_each_cpu_and(cpu, env->dst_grpmask, env->cpus) { + if (cpumask_test_cpu(cpu, tsk_cpus_allowed(p))) { + env->flags |= LBF_SOME_PINNED; + env->new_dst_cpu = cpu; + break; + } } + return 0; } @@ -5008,7 +5011,6 @@ static int load_balance(int this_cpu, struct rq *this_rq, int *balance) { int ld_moved, cur_ld_moved, active_balance = 0; - int lb_iterations, max_lb_iterations; struct sched_group *group; struct rq *busiest; unsigned long flags; @@ -5028,15 +5030,8 @@ static int load_balance(int this_cpu, struct rq *this_rq, * For NEWLY_IDLE load_balancing, we don't need to consider * other cpus in our group */ - if (idle == CPU_NEWLY_IDLE) { + if (idle == CPU_NEWLY_IDLE) env.dst_grpmask = NULL; - /* - * we don't care max_lb_iterations in this case, - * in following patch, this will be removed - */ - max_lb_iterations = 0; - } else - max_lb_iterations = cpumask_weight(env.dst_grpmask); cpumask_copy(cpus, cpu_active_mask); @@ -5064,7 +5059,6 @@ redo: schedstat_add(sd, lb_imbalance[idle], env.imbalance); ld_moved = 0; - lb_iterations = 1; if (busiest->nr_running > 1) { /* * Attempt to move tasks. If find_busiest_group has found @@ -5121,14 +5115,17 @@ more_balance: * moreover subsequent load balance cycles should correct the * excess load moved. */ - if ((env.flags & LBF_SOME_PINNED) && env.imbalance > 0 && - lb_iterations++ < max_lb_iterations) { + if ((env.flags & LBF_SOME_PINNED) && env.imbalance > 0) { env.dst_rq = cpu_rq(env.new_dst_cpu); env.dst_cpu = env.new_dst_cpu; env.flags &= ~LBF_SOME_PINNED; env.loop = 0; env.loop_break = sched_nr_migrate_break; + + /* Prevent to re-select dst_cpu via env's cpus */ + cpumask_clear_cpu(env.dst_cpu, env.cpus); + /* * Go back to "more_balance" rather than "redo" since we * need to continue with same src_cpu. -- cgit v1.2.3 From 25f55d9d01ad7a7ad248fd5af1d22675ffd202c5 Mon Sep 17 00:00:00 2001 From: Vincent Guittot Date: Tue, 23 Apr 2013 16:59:02 +0200 Subject: sched: Fix init NOHZ_IDLE flag On my SMP platform which is made of 5 cores in 2 clusters, I have the nr_busy_cpu field of sched_group_power struct that is not null when the platform is fully idle - which makes the scheduler unhappy. The root cause is: During the boot sequence, some CPUs reach the idle loop and set their NOHZ_IDLE flag while waiting for others CPUs to boot. But the nr_busy_cpus field is initialized later with the assumption that all CPUs are in the busy state whereas some CPUs have already set their NOHZ_IDLE flag. More generally, the NOHZ_IDLE flag must be initialized when new sched_domains are created in order to ensure that NOHZ_IDLE and nr_busy_cpus are aligned. This condition can be ensured by adding a synchronize_rcu() between the destruction of old sched_domains and the creation of new ones so the NOHZ_IDLE flag will not be updated with old sched_domain once it has been initialized. But this solution introduces a additionnal latency in the rebuild sequence that is called during cpu hotplug. As suggested by Frederic Weisbecker, another solution is to have the same rcu lifecycle for both NOHZ_IDLE and sched_domain struct. A new nohz_idle field is added to sched_domain so both status and sched_domain will share the same RCU lifecycle and will be always synchronized. In addition, there is no more need to protect nohz_idle against concurrent access as it is only modified by 2 exclusive functions called by local cpu. This solution has been prefered to the creation of a new struct with an extra pointer indirection for sched_domain. The synchronization is done at the cost of : - An additional indirection and a rcu_dereference for accessing nohz_idle. - We use only the nohz_idle field of the top sched_domain. Signed-off-by: Vincent Guittot Acked-by: Peter Zijlstra Cc: linaro-kernel@lists.linaro.org Cc: peterz@infradead.org Cc: fweisbec@gmail.com Cc: pjt@google.com Cc: rostedt@goodmis.org Cc: efault@gmx.de Link: http://lkml.kernel.org/r/1366729142-14662-1-git-send-email-vincent.guittot@linaro.org [ Fixed !NO_HZ build bug. ] Signed-off-by: Ingo Molnar --- include/linux/sched.h | 2 ++ kernel/sched/fair.c | 26 ++++++++++++++++---------- kernel/sched/sched.h | 1 - 3 files changed, 18 insertions(+), 11 deletions(-) (limited to 'kernel/sched') diff --git a/include/linux/sched.h b/include/linux/sched.h index 6bdaa73ede13..a25168f4ab86 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -808,6 +808,8 @@ struct sched_domain { unsigned int wake_idx; unsigned int forkexec_idx; unsigned int smt_gain; + + int nohz_idle; /* NOHZ IDLE status */ int flags; /* See SD_* */ int level; diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index acaf567a03d2..8bf7081b1ec5 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -5420,13 +5420,16 @@ static inline void set_cpu_sd_state_busy(void) struct sched_domain *sd; int cpu = smp_processor_id(); - if (!test_bit(NOHZ_IDLE, nohz_flags(cpu))) - return; - clear_bit(NOHZ_IDLE, nohz_flags(cpu)); - rcu_read_lock(); - for_each_domain(cpu, sd) + sd = rcu_dereference_check_sched_domain(cpu_rq(cpu)->sd); + + if (!sd || !sd->nohz_idle) + goto unlock; + sd->nohz_idle = 0; + + for (; sd; sd = sd->parent) atomic_inc(&sd->groups->sgp->nr_busy_cpus); +unlock: rcu_read_unlock(); } @@ -5435,13 +5438,16 @@ void set_cpu_sd_state_idle(void) struct sched_domain *sd; int cpu = smp_processor_id(); - if (test_bit(NOHZ_IDLE, nohz_flags(cpu))) - return; - set_bit(NOHZ_IDLE, nohz_flags(cpu)); - rcu_read_lock(); - for_each_domain(cpu, sd) + sd = rcu_dereference_check_sched_domain(cpu_rq(cpu)->sd); + + if (!sd || sd->nohz_idle) + goto unlock; + sd->nohz_idle = 1; + + for (; sd; sd = sd->parent) atomic_dec(&sd->groups->sgp->nr_busy_cpus); +unlock: rcu_read_unlock(); } diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 605426a63588..4c225c4c7111 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -1303,7 +1303,6 @@ extern void account_cfs_bandwidth_used(int enabled, int was_enabled); enum rq_nohz_flag_bits { NOHZ_TICK_STOPPED, NOHZ_BALANCE_KICK, - NOHZ_IDLE, }; #define nohz_flags(cpu) (&cpu_rq(cpu)->nohz_flags) -- cgit v1.2.3