From 60fed7891d4115be0ed7ff9ce6851eda80533c64 Mon Sep 17 00:00:00 2001
From: Li Zefan <lizefan@huawei.com>
Date: Fri, 29 Mar 2013 14:36:43 +0800
Subject: sched: Split cpuacct code out of sched.h

Add cpuacct.h and let sched.h include it.

Signed-off-by: Li Zefan <lizefan@huawei.com>
Acked-by: Peter Zijlstra <peterz@infradead.org>
Link: http://lkml.kernel.org/r/5155367B.2060506@huawei.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/sched/sched.h | 48 +-----------------------------------------------
 1 file changed, 1 insertion(+), 47 deletions(-)

(limited to 'kernel/sched/sched.h')

diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 3bd15a43eebc..8116cf8e350f 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -7,6 +7,7 @@
 #include <linux/stop_machine.h>
 
 #include "cpupri.h"
+#include "cpuacct.h"
 
 extern __read_mostly int scheduler_running;
 
@@ -950,14 +951,6 @@ static const u32 prio_to_wmult[40] = {
  /*  15 */ 119304647, 148102320, 186737708, 238609294, 286331153,
 };
 
-/* Time spent by the tasks of the cpu accounting group executing in ... */
-enum cpuacct_stat_index {
-	CPUACCT_STAT_USER,	/* ... user mode */
-	CPUACCT_STAT_SYSTEM,	/* ... kernel mode */
-
-	CPUACCT_STAT_NSTATS,
-};
-
 #define ENQUEUE_WAKEUP		1
 #define ENQUEUE_HEAD		2
 #ifdef CONFIG_SMP
@@ -1054,45 +1047,6 @@ extern void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime
 
 extern void update_idle_cpu_load(struct rq *this_rq);
 
-#ifdef CONFIG_CGROUP_CPUACCT
-#include <linux/cgroup.h>
-/* track cpu usage of a group of tasks and its child groups */
-struct cpuacct {
-	struct cgroup_subsys_state css;
-	/* cpuusage holds pointer to a u64-type object on every cpu */
-	u64 __percpu *cpuusage;
-	struct kernel_cpustat __percpu *cpustat;
-};
-
-extern struct cgroup_subsys cpuacct_subsys;
-extern struct cpuacct root_cpuacct;
-
-/* return cpu accounting group corresponding to this container */
-static inline struct cpuacct *cgroup_ca(struct cgroup *cgrp)
-{
-	return container_of(cgroup_subsys_state(cgrp, cpuacct_subsys_id),
-			    struct cpuacct, css);
-}
-
-/* return cpu accounting group to which this task belongs */
-static inline struct cpuacct *task_ca(struct task_struct *tsk)
-{
-	return container_of(task_subsys_state(tsk, cpuacct_subsys_id),
-			    struct cpuacct, css);
-}
-
-static inline struct cpuacct *parent_ca(struct cpuacct *ca)
-{
-	if (!ca || !ca->css.cgroup->parent)
-		return NULL;
-	return cgroup_ca(ca->css.cgroup->parent);
-}
-
-extern void cpuacct_charge(struct task_struct *tsk, u64 cputime);
-#else
-static inline void cpuacct_charge(struct task_struct *tsk, u64 cputime) {}
-#endif
-
 #ifdef CONFIG_PARAVIRT
 static inline u64 steal_ticks(u64 steal)
 {
-- 
cgit v1.2.3


From 642dbc39ab1ea00f47e0fee1b8e8a27da036d940 Mon Sep 17 00:00:00 2001
From: Vincent Guittot <vincent.guittot@linaro.org>
Date: Thu, 18 Apr 2013 18:34:26 +0200
Subject: sched: Fix wrong rq's runnable_avg update with rt tasks

The current update of the rq's load can be erroneous when RT
tasks are involved.

The update of the load of a rq that becomes idle, is done only
if the avg_idle is less than sysctl_sched_migration_cost. If RT
tasks and short idle duration alternate, the runnable_avg will
not be updated correctly and the time will be accounted as idle
time when a CFS task wakes up.

A new idle_enter function is called when the next task is the
idle function so the elapsed time will be accounted as run time
in the load of the rq, whatever the average idle time is. The
function update_rq_runnable_avg is removed from idle_balance.

When a RT task is scheduled on an idle CPU, the update of the
rq's load is not done when the rq exit idle state because CFS's
functions are not called. Then, the idle_balance, which is
called just before entering the idle function, updates the rq's
load and makes the assumption that the elapsed time since the
last update, was only running time.

As a consequence, the rq's load of a CPU that only runs a
periodic RT task, is close to LOAD_AVG_MAX whatever the running
duration of the RT task is.

A new idle_exit function is called when the prev task is the
idle function so the elapsed time will be accounted as idle time
in the rq's load.

Signed-off-by: Vincent Guittot <vincent.guittot@linaro.org>
Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Acked-by: Steven Rostedt <rostedt@goodmis.org>
Cc: linaro-kernel@lists.linaro.org
Cc: peterz@infradead.org
Cc: pjt@google.com
Cc: fweisbec@gmail.com
Cc: efault@gmx.de
Link: http://lkml.kernel.org/r/1366302867-5055-1-git-send-email-vincent.guittot@linaro.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 kernel/sched/fair.c      | 23 +++++++++++++++++++++--
 kernel/sched/idle_task.c | 16 ++++++++++++++++
 kernel/sched/sched.h     | 12 ++++++++++++
 3 files changed, 49 insertions(+), 2 deletions(-)

(limited to 'kernel/sched/sched.h')

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 155783b4e4bf..1c977350e322 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -1563,6 +1563,27 @@ static inline void dequeue_entity_load_avg(struct cfs_rq *cfs_rq,
 		se->avg.decay_count = atomic64_read(&cfs_rq->decay_counter);
 	} /* migrations, e.g. sleep=0 leave decay_count == 0 */
 }
+
+/*
+ * Update the rq's load with the elapsed running time before entering
+ * idle. if the last scheduled task is not a CFS task, idle_enter will
+ * be the only way to update the runnable statistic.
+ */
+void idle_enter_fair(struct rq *this_rq)
+{
+	update_rq_runnable_avg(this_rq, 1);
+}
+
+/*
+ * Update the rq's load with the elapsed idle time before a task is
+ * scheduled. if the newly scheduled task is not a CFS task, idle_exit will
+ * be the only way to update the runnable statistic.
+ */
+void idle_exit_fair(struct rq *this_rq)
+{
+	update_rq_runnable_avg(this_rq, 0);
+}
+
 #else
 static inline void update_entity_load_avg(struct sched_entity *se,
 					  int update_cfs_rq) {}
@@ -5217,8 +5238,6 @@ void idle_balance(int this_cpu, struct rq *this_rq)
 	if (this_rq->avg_idle < sysctl_sched_migration_cost)
 		return;
 
-	update_rq_runnable_avg(this_rq, 1);
-
 	/*
 	 * Drop the rq->lock, but keep IRQ/preempt disabled.
 	 */
diff --git a/kernel/sched/idle_task.c b/kernel/sched/idle_task.c
index b6baf370cae9..b8ce77328341 100644
--- a/kernel/sched/idle_task.c
+++ b/kernel/sched/idle_task.c
@@ -13,6 +13,16 @@ select_task_rq_idle(struct task_struct *p, int sd_flag, int flags)
 {
 	return task_cpu(p); /* IDLE tasks as never migrated */
 }
+
+static void pre_schedule_idle(struct rq *rq, struct task_struct *prev)
+{
+	idle_exit_fair(rq);
+}
+
+static void post_schedule_idle(struct rq *rq)
+{
+	idle_enter_fair(rq);
+}
 #endif /* CONFIG_SMP */
 /*
  * Idle tasks are unconditionally rescheduled:
@@ -25,6 +35,10 @@ static void check_preempt_curr_idle(struct rq *rq, struct task_struct *p, int fl
 static struct task_struct *pick_next_task_idle(struct rq *rq)
 {
 	schedstat_inc(rq, sched_goidle);
+#ifdef CONFIG_SMP
+	/* Trigger the post schedule to do an idle_enter for CFS */
+	rq->post_schedule = 1;
+#endif
 	return rq->idle;
 }
 
@@ -86,6 +100,8 @@ const struct sched_class idle_sched_class = {
 
 #ifdef CONFIG_SMP
 	.select_task_rq		= select_task_rq_idle,
+	.pre_schedule		= pre_schedule_idle,
+	.post_schedule		= post_schedule_idle,
 #endif
 
 	.set_curr_task          = set_curr_task_idle,
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 8116cf8e350f..605426a63588 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -1024,6 +1024,18 @@ extern void update_group_power(struct sched_domain *sd, int cpu);
 extern void trigger_load_balance(struct rq *rq, int cpu);
 extern void idle_balance(int this_cpu, struct rq *this_rq);
 
+/*
+ * Only depends on SMP, FAIR_GROUP_SCHED may be removed when runnable_avg
+ * becomes useful in lb
+ */
+#if defined(CONFIG_FAIR_GROUP_SCHED)
+extern void idle_enter_fair(struct rq *this_rq);
+extern void idle_exit_fair(struct rq *this_rq);
+#else
+static inline void idle_enter_fair(struct rq *this_rq) {}
+static inline void idle_exit_fair(struct rq *this_rq) {}
+#endif
+
 #else	/* CONFIG_SMP */
 
 static inline void idle_balance(int cpu, struct rq *rq)
-- 
cgit v1.2.3


From 25f55d9d01ad7a7ad248fd5af1d22675ffd202c5 Mon Sep 17 00:00:00 2001
From: Vincent Guittot <vincent.guittot@linaro.org>
Date: Tue, 23 Apr 2013 16:59:02 +0200
Subject: sched: Fix init NOHZ_IDLE flag

On my SMP platform which is made of 5 cores in 2 clusters, I
have the nr_busy_cpu field of sched_group_power struct that is
not null when the platform is fully idle - which makes the
scheduler unhappy.

The root cause is:

During the boot sequence, some CPUs reach the idle loop and set
their NOHZ_IDLE flag while waiting for others CPUs to boot. But
the nr_busy_cpus field is initialized later with the assumption
that all CPUs are in the busy state whereas some CPUs have
already set their NOHZ_IDLE flag.

More generally, the NOHZ_IDLE flag must be initialized when new
sched_domains are created in order to ensure that NOHZ_IDLE and
nr_busy_cpus are aligned.

This condition can be ensured by adding a synchronize_rcu()
between the destruction of old sched_domains and the creation of
new ones so the NOHZ_IDLE flag will not be updated with old
sched_domain once it has been initialized. But this solution
introduces a additionnal latency in the rebuild sequence that is
called during cpu hotplug.

As suggested by Frederic Weisbecker, another solution is to have
the same rcu lifecycle for both NOHZ_IDLE and sched_domain
struct. A new nohz_idle field is added to sched_domain so both
status and sched_domain will share the same RCU lifecycle and
will be always synchronized. In addition, there is no more need
to protect nohz_idle against concurrent access as it is only
modified by 2 exclusive functions called by local cpu.

This solution has been prefered to the creation of a new struct
with an extra pointer indirection for sched_domain.

The synchronization is done at the cost of :

 - An additional indirection and a rcu_dereference for accessing nohz_idle.
 - We use only the nohz_idle field of the top sched_domain.

Signed-off-by: Vincent Guittot <vincent.guittot@linaro.org>
Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: linaro-kernel@lists.linaro.org
Cc: peterz@infradead.org
Cc: fweisbec@gmail.com
Cc: pjt@google.com
Cc: rostedt@goodmis.org
Cc: efault@gmx.de
Link: http://lkml.kernel.org/r/1366729142-14662-1-git-send-email-vincent.guittot@linaro.org
[ Fixed !NO_HZ build bug. ]
Signed-off-by: Ingo Molnar <mingo@kernel.org>
---
 include/linux/sched.h |  2 ++
 kernel/sched/fair.c   | 26 ++++++++++++++++----------
 kernel/sched/sched.h  |  1 -
 3 files changed, 18 insertions(+), 11 deletions(-)

(limited to 'kernel/sched/sched.h')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 6bdaa73ede13..a25168f4ab86 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -808,6 +808,8 @@ struct sched_domain {
 	unsigned int wake_idx;
 	unsigned int forkexec_idx;
 	unsigned int smt_gain;
+
+	int nohz_idle;			/* NOHZ IDLE status */
 	int flags;			/* See SD_* */
 	int level;
 
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index acaf567a03d2..8bf7081b1ec5 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -5420,13 +5420,16 @@ static inline void set_cpu_sd_state_busy(void)
 	struct sched_domain *sd;
 	int cpu = smp_processor_id();
 
-	if (!test_bit(NOHZ_IDLE, nohz_flags(cpu)))
-		return;
-	clear_bit(NOHZ_IDLE, nohz_flags(cpu));
-
 	rcu_read_lock();
-	for_each_domain(cpu, sd)
+	sd = rcu_dereference_check_sched_domain(cpu_rq(cpu)->sd);
+
+	if (!sd || !sd->nohz_idle)
+		goto unlock;
+	sd->nohz_idle = 0;
+
+	for (; sd; sd = sd->parent)
 		atomic_inc(&sd->groups->sgp->nr_busy_cpus);
+unlock:
 	rcu_read_unlock();
 }
 
@@ -5435,13 +5438,16 @@ void set_cpu_sd_state_idle(void)
 	struct sched_domain *sd;
 	int cpu = smp_processor_id();
 
-	if (test_bit(NOHZ_IDLE, nohz_flags(cpu)))
-		return;
-	set_bit(NOHZ_IDLE, nohz_flags(cpu));
-
 	rcu_read_lock();
-	for_each_domain(cpu, sd)
+	sd = rcu_dereference_check_sched_domain(cpu_rq(cpu)->sd);
+
+	if (!sd || sd->nohz_idle)
+		goto unlock;
+	sd->nohz_idle = 1;
+
+	for (; sd; sd = sd->parent)
 		atomic_dec(&sd->groups->sgp->nr_busy_cpus);
+unlock:
 	rcu_read_unlock();
 }
 
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 605426a63588..4c225c4c7111 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -1303,7 +1303,6 @@ extern void account_cfs_bandwidth_used(int enabled, int was_enabled);
 enum rq_nohz_flag_bits {
 	NOHZ_TICK_STOPPED,
 	NOHZ_BALANCE_KICK,
-	NOHZ_IDLE,
 };
 
 #define nohz_flags(cpu)	(&cpu_rq(cpu)->nohz_flags)
-- 
cgit v1.2.3