Merge branch 'sched-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip

Pull scheduler updates from Ingo Molnar:
 "The biggest change affects group scheduling: we now track the runnable
  average on a per-task entity basis, allowing a smoother, exponential
  decay average based load/weight estimation instead of the previous
  binary on-the-runqueue/off-the-runqueue load weight method.

  This will inevitably disturb workloads that were in some sort of
  borderline balancing state or unstable equilibrium, so an eye has to
  be kept on regressions.

  For that reason the new load average is only limited to group
  scheduling (shares distribution) at the moment (which was also hurting
  the most from the prior, crude weight calculation and whose scheduling
  quality wins most from this change) - but we plan to extend this to
  regular SMP balancing as well in the future, which will simplify and
  speed up things a bit.

  Other changes involve ongoing preparatory work to extend NOHZ to the
  scheduler as well, eventually allowing completely irq-free user-space
  execution."

* 'sched-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: (33 commits)
  Revert "sched/autogroup: Fix crash on reboot when autogroup is disabled"
  cputime: Comment cputime's adjusting code
  cputime: Consolidate cputime adjustment code
  cputime: Rename thread_group_times to thread_group_cputime_adjusted
  cputime: Move thread_group_cputime() to sched code
  vtime: Warn if irqs aren't disabled on system time accounting APIs
  vtime: No need to disable irqs on vtime_account()
  vtime: Consolidate a bit the ctx switch code
  vtime: Explicitly account pending user time on process tick
  vtime: Remove the underscore prefix invasion
  sched/autogroup: Fix crash on reboot when autogroup is disabled
  cputime: Separate irqtime accounting from generic vtime
  cputime: Specialize irq vtime hooks
  kvm: Directly account vtime to system on guest switch
  vtime: Make vtime_account_system() irqsafe
  vtime: Gather vtime declarations to their own header file
  sched: Describe CFS load-balancer
  sched: Introduce temporary FAIR_GROUP_SCHED dependency for load-tracking
  sched: Make __update_entity_runnable_avg() fast
  sched: Update_cfs_shares at period edge
  ...
author: Linus Torvalds <torvalds@linux-foundation.org> 2012-12-12 06:21:38 +0400
committer: Linus Torvalds <torvalds@linux-foundation.org> 2012-12-12 06:21:38 +0400
commit: f57d54bab696133fae569c5f01352249c36fc74f (patch)
tree: 8ebe3c6deaf95c424c86843c3d290fbf2a9e80d2 /include
parent: da830e589a45f0c42eef6f3cbd07275f8893f181 (diff)
parent: c1ad41f1f7270c1956da13fa8fd59d8d5929d56e (diff)
download: linux-f57d54bab696133fae569c5f01352249c36fc74f.tar.xz
5 files changed, 113 insertions, 28 deletions
diff --git a/include/linux/hardirq.h b/include/linux/hardirq.h
index cab3da3d0949..624ef3f45c8e 100644
--- a/include/linux/hardirq.h
+++ b/include/linux/hardirq.h
@@ -4,6 +4,7 @@
 #include <linux/preempt.h>
 #include <linux/lockdep.h>
 #include <linux/ftrace_irq.h>
+#include <linux/vtime.h>
 #include <asm/hardirq.h>
 
 /*
@@ -129,16 +130,6 @@ extern void synchronize_irq(unsigned int irq);
 # define synchronize_irq(irq)	barrier()
 #endif
 
-struct task_struct;
-
-#if !defined(CONFIG_VIRT_CPU_ACCOUNTING) && !defined(CONFIG_IRQ_TIME_ACCOUNTING)
-static inline void vtime_account(struct task_struct *tsk)
-{
-}
-#else
-extern void vtime_account(struct task_struct *tsk);
-#endif
-
 #if defined(CONFIG_TINY_RCU) || defined(CONFIG_TINY_PREEMPT_RCU)
 
 static inline void rcu_nmi_enter(void)
@@ -162,7 +153,7 @@ extern void rcu_nmi_exit(void);
  */
 #define __irq_enter()					\
 	do {						\
-		vtime_account(current);		\
+		vtime_account_irq_enter(current);	\
 		add_preempt_count(HARDIRQ_OFFSET);	\
 		trace_hardirq_enter();			\
 	} while (0)
@@ -178,7 +169,7 @@ extern void irq_enter(void);
 #define __irq_exit()					\
 	do {						\
 		trace_hardirq_exit();			\
-		vtime_account(current);		\
+		vtime_account_irq_exit(current);	\
 		sub_preempt_count(HARDIRQ_OFFSET);	\
 	} while (0)
 
diff --git a/include/linux/kernel_stat.h b/include/linux/kernel_stat.h
index 36d12f0884c3..66b70780e910 100644
--- a/include/linux/kernel_stat.h
+++ b/include/linux/kernel_stat.h
@@ -7,6 +7,7 @@
 #include <linux/cpumask.h>
 #include <linux/interrupt.h>
 #include <linux/sched.h>
+#include <linux/vtime.h>
 #include <asm/irq.h>
 #include <asm/cputime.h>
 
@@ -126,16 +127,16 @@ extern void account_system_time(struct task_struct *, int, cputime_t, cputime_t)
 extern void account_steal_time(cputime_t);
 extern void account_idle_time(cputime_t);
 
-extern void account_process_tick(struct task_struct *, int user);
-extern void account_steal_ticks(unsigned long ticks);
-extern void account_idle_ticks(unsigned long ticks);
-
 #ifdef CONFIG_VIRT_CPU_ACCOUNTING
-extern void vtime_task_switch(struct task_struct *prev);
-extern void vtime_account_system(struct task_struct *tsk);
-extern void vtime_account_idle(struct task_struct *tsk);
+static inline void account_process_tick(struct task_struct *tsk, int user)
+{
+	vtime_account_user(tsk);
+}
 #else
-static inline void vtime_task_switch(struct task_struct *prev) { }
+extern void account_process_tick(struct task_struct *, int user);
 #endif
 
+extern void account_steal_ticks(unsigned long ticks);
+extern void account_idle_ticks(unsigned long ticks);
+
 #endif /* _LINUX_KERNEL_STAT_H */
diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h
index ecc554374e44..d5cddd8dcc5c 100644
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -726,7 +726,11 @@ static inline int kvm_deassign_device(struct kvm *kvm,
 static inline void kvm_guest_enter(void)
 {
 	BUG_ON(preemptible());
-	vtime_account(current);
+	/*
+	 * This is running in ioctl context so we can avoid
+	 * the call to vtime_account() with its unnecessary idle check.
+	 */
+	vtime_account_system_irqsafe(current);
 	current->flags |= PF_VCPU;
 	/* KVM does not hold any references to rcu protected data when it
 	 * switches CPU into a guest mode. In fact switching to a guest mode
@@ -740,7 +744,11 @@ static inline void kvm_guest_enter(void)
 
 static inline void kvm_guest_exit(void)
 {
-	vtime_account(current);
+	/*
+	 * This is running in ioctl context so we can avoid
+	 * the call to vtime_account() with its unnecessary idle check.
+	 */
+	vtime_account_system_irqsafe(current);
 	current->flags &= ~PF_VCPU;
 }
 
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 29116b853ece..b96ff1e43ada 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -436,13 +436,28 @@ struct cpu_itimer {
 };
 
 /**
+ * struct cputime - snapshot of system and user cputime
+ * @utime: time spent in user mode
+ * @stime: time spent in system mode
+ *
+ * Gathers a generic snapshot of user and system time.
+ */
+struct cputime {
+	cputime_t utime;
+	cputime_t stime;
+};
+
+/**
  * struct task_cputime - collected CPU time counts
  * @utime:		time spent in user mode, in &cputime_t units
  * @stime:		time spent in kernel mode, in &cputime_t units
  * @sum_exec_runtime:	total time spent on the CPU, in nanoseconds
  *
- * This structure groups together three kinds of CPU time that are
- * tracked for threads and thread groups.  Most things considering
+ * This is an extension of struct cputime that includes the total runtime
+ * spent by the task from the scheduler point of view.
+ *
+ * As a result, this structure groups together three kinds of CPU time
+ * that are tracked for threads and thread groups.  Most things considering
  * CPU time want to group these counts together and treat all three
  * of them in parallel.
  */
@@ -583,7 +598,7 @@ struct signal_struct {
 	cputime_t gtime;
 	cputime_t cgtime;
 #ifndef CONFIG_VIRT_CPU_ACCOUNTING
-	cputime_t prev_utime, prev_stime;
+	struct cputime prev_cputime;
 #endif
 	unsigned long nvcsw, nivcsw, cnvcsw, cnivcsw;
 	unsigned long min_flt, maj_flt, cmin_flt, cmaj_flt;
@@ -1064,6 +1079,7 @@ struct sched_class {
 
 #ifdef CONFIG_SMP
 	int  (*select_task_rq)(struct task_struct *p, int sd_flag, int flags);
+	void (*migrate_task_rq)(struct task_struct *p, int next_cpu);
 
 	void (*pre_schedule) (struct rq *this_rq, struct task_struct *task);
 	void (*post_schedule) (struct rq *this_rq);
@@ -1098,6 +1114,18 @@ struct load_weight {
 	unsigned long weight, inv_weight;
 };
 
+struct sched_avg {
+	/*
+	 * These sums represent an infinite geometric series and so are bound
+	 * above by 1024/(1-y).  Thus we only need a u32 to store them for all
+	 * choices of y < 1-2^(-32)*1024.
+	 */
+	u32 runnable_avg_sum, runnable_avg_period;
+	u64 last_runnable_update;
+	s64 decay_count;
+	unsigned long load_avg_contrib;
+};
+
 #ifdef CONFIG_SCHEDSTATS
 struct sched_statistics {
 	u64			wait_start;
@@ -1158,6 +1186,15 @@ struct sched_entity {
 	/* rq "owned" by this entity/group: */
 	struct cfs_rq		*my_q;
 #endif
+/*
+ * Load-tracking only depends on SMP, FAIR_GROUP_SCHED dependency below may be
+ * removed when useful for applications beyond shares distribution (e.g.
+ * load-balance).
+ */
+#if defined(CONFIG_SMP) && defined(CONFIG_FAIR_GROUP_SCHED)
+	/* Per-entity load-tracking */
+	struct sched_avg	avg;
+#endif
 };
 
 struct sched_rt_entity {
@@ -1321,7 +1358,7 @@ struct task_struct {
 	cputime_t utime, stime, utimescaled, stimescaled;
 	cputime_t gtime;
 #ifndef CONFIG_VIRT_CPU_ACCOUNTING
-	cputime_t prev_utime, prev_stime;
+	struct cputime prev_cputime;
 #endif
 	unsigned long nvcsw, nivcsw; /* context switch counts */
 	struct timespec start_time; 		/* monotonic time */
@@ -1732,8 +1769,8 @@ static inline void put_task_struct(struct task_struct *t)
 		__put_task_struct(t);
 }
 
-extern void task_times(struct task_struct *p, cputime_t *ut, cputime_t *st);
-extern void thread_group_times(struct task_struct *p, cputime_t *ut, cputime_t *st);
+extern void task_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st);
+extern void thread_group_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st);
 
 /*
  * Per process flags
diff --git a/include/linux/vtime.h b/include/linux/vtime.h
new file mode 100644
index 000000000000..ae30ab58431a
--- /dev/null
+++ b/include/linux/vtime.h
@@ -0,0 +1,48 @@
+#ifndef _LINUX_KERNEL_VTIME_H
+#define _LINUX_KERNEL_VTIME_H
+
+struct task_struct;
+
+#ifdef CONFIG_VIRT_CPU_ACCOUNTING
+extern void vtime_task_switch(struct task_struct *prev);
+extern void vtime_account_system(struct task_struct *tsk);
+extern void vtime_account_system_irqsafe(struct task_struct *tsk);
+extern void vtime_account_idle(struct task_struct *tsk);
+extern void vtime_account_user(struct task_struct *tsk);
+extern void vtime_account(struct task_struct *tsk);
+#else
+static inline void vtime_task_switch(struct task_struct *prev) { }
+static inline void vtime_account_system(struct task_struct *tsk) { }
+static inline void vtime_account_system_irqsafe(struct task_struct *tsk) { }
+static inline void vtime_account(struct task_struct *tsk) { }
+#endif
+
+#ifdef CONFIG_IRQ_TIME_ACCOUNTING
+extern void irqtime_account_irq(struct task_struct *tsk);
+#else
+static inline void irqtime_account_irq(struct task_struct *tsk) { }
+#endif
+
+static inline void vtime_account_irq_enter(struct task_struct *tsk)
+{
+	/*
+	 * Hardirq can interrupt idle task anytime. So we need vtime_account()
+	 * that performs the idle check in CONFIG_VIRT_CPU_ACCOUNTING.
+	 * Softirq can also interrupt idle task directly if it calls
+	 * local_bh_enable(). Such a case probably doesn't exist, but we never know.
+	 * Ksoftirqd is not concerned because idle time is flushed on context
+	 * switch. Softirqs in the end of hardirqs are also not a problem because
+	 * the idle time is flushed on hardirq time already.
+	 */
+	vtime_account(tsk);
+	irqtime_account_irq(tsk);
+}
+
+static inline void vtime_account_irq_exit(struct task_struct *tsk)
+{
+	/* On hard|softirq exit we always account to hard|softirq cputime */
+	vtime_account_system(tsk);
+	irqtime_account_irq(tsk);
+}
+
+#endif /* _LINUX_KERNEL_VTIME_H */
author	Linus Torvalds <torvalds@linux-foundation.org>	2012-12-12 06:21:38 +0400
committer	Linus Torvalds <torvalds@linux-foundation.org>	2012-12-12 06:21:38 +0400
commit	f57d54bab696133fae569c5f01352249c36fc74f (patch)
tree	8ebe3c6deaf95c424c86843c3d290fbf2a9e80d2 /include
parent	da830e589a45f0c42eef6f3cbd07275f8893f181 (diff)
parent	c1ad41f1f7270c1956da13fa8fd59d8d5929d56e (diff)
download	linux-f57d54bab696133fae569c5f01352249c36fc74f.tar.xz