diff options
author | Linus Torvalds <torvalds@linux-foundation.org> | 2012-12-12 06:21:38 +0400 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2012-12-12 06:21:38 +0400 |
commit | f57d54bab696133fae569c5f01352249c36fc74f (patch) | |
tree | 8ebe3c6deaf95c424c86843c3d290fbf2a9e80d2 /include | |
parent | da830e589a45f0c42eef6f3cbd07275f8893f181 (diff) | |
parent | c1ad41f1f7270c1956da13fa8fd59d8d5929d56e (diff) | |
download | linux-f57d54bab696133fae569c5f01352249c36fc74f.tar.xz |
Merge branch 'sched-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip
Pull scheduler updates from Ingo Molnar:
"The biggest change affects group scheduling: we now track the runnable
average on a per-task entity basis, allowing a smoother, exponential
decay average based load/weight estimation instead of the previous
binary on-the-runqueue/off-the-runqueue load weight method.
This will inevitably disturb workloads that were in some sort of
borderline balancing state or unstable equilibrium, so an eye has to
be kept on regressions.
For that reason the new load average is only limited to group
scheduling (shares distribution) at the moment (which was also hurting
the most from the prior, crude weight calculation and whose scheduling
quality wins most from this change) - but we plan to extend this to
regular SMP balancing as well in the future, which will simplify and
speed up things a bit.
Other changes involve ongoing preparatory work to extend NOHZ to the
scheduler as well, eventually allowing completely irq-free user-space
execution."
* 'sched-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: (33 commits)
Revert "sched/autogroup: Fix crash on reboot when autogroup is disabled"
cputime: Comment cputime's adjusting code
cputime: Consolidate cputime adjustment code
cputime: Rename thread_group_times to thread_group_cputime_adjusted
cputime: Move thread_group_cputime() to sched code
vtime: Warn if irqs aren't disabled on system time accounting APIs
vtime: No need to disable irqs on vtime_account()
vtime: Consolidate a bit the ctx switch code
vtime: Explicitly account pending user time on process tick
vtime: Remove the underscore prefix invasion
sched/autogroup: Fix crash on reboot when autogroup is disabled
cputime: Separate irqtime accounting from generic vtime
cputime: Specialize irq vtime hooks
kvm: Directly account vtime to system on guest switch
vtime: Make vtime_account_system() irqsafe
vtime: Gather vtime declarations to their own header file
sched: Describe CFS load-balancer
sched: Introduce temporary FAIR_GROUP_SCHED dependency for load-tracking
sched: Make __update_entity_runnable_avg() fast
sched: Update_cfs_shares at period edge
...
Diffstat (limited to 'include')
-rw-r--r-- | include/linux/hardirq.h | 15 | ||||
-rw-r--r-- | include/linux/kernel_stat.h | 17 | ||||
-rw-r--r-- | include/linux/kvm_host.h | 12 | ||||
-rw-r--r-- | include/linux/sched.h | 49 | ||||
-rw-r--r-- | include/linux/vtime.h | 48 |
5 files changed, 113 insertions, 28 deletions
diff --git a/include/linux/hardirq.h b/include/linux/hardirq.h index cab3da3d0949..624ef3f45c8e 100644 --- a/include/linux/hardirq.h +++ b/include/linux/hardirq.h @@ -4,6 +4,7 @@ #include <linux/preempt.h> #include <linux/lockdep.h> #include <linux/ftrace_irq.h> +#include <linux/vtime.h> #include <asm/hardirq.h> /* @@ -129,16 +130,6 @@ extern void synchronize_irq(unsigned int irq); # define synchronize_irq(irq) barrier() #endif -struct task_struct; - -#if !defined(CONFIG_VIRT_CPU_ACCOUNTING) && !defined(CONFIG_IRQ_TIME_ACCOUNTING) -static inline void vtime_account(struct task_struct *tsk) -{ -} -#else -extern void vtime_account(struct task_struct *tsk); -#endif - #if defined(CONFIG_TINY_RCU) || defined(CONFIG_TINY_PREEMPT_RCU) static inline void rcu_nmi_enter(void) @@ -162,7 +153,7 @@ extern void rcu_nmi_exit(void); */ #define __irq_enter() \ do { \ - vtime_account(current); \ + vtime_account_irq_enter(current); \ add_preempt_count(HARDIRQ_OFFSET); \ trace_hardirq_enter(); \ } while (0) @@ -178,7 +169,7 @@ extern void irq_enter(void); #define __irq_exit() \ do { \ trace_hardirq_exit(); \ - vtime_account(current); \ + vtime_account_irq_exit(current); \ sub_preempt_count(HARDIRQ_OFFSET); \ } while (0) diff --git a/include/linux/kernel_stat.h b/include/linux/kernel_stat.h index 36d12f0884c3..66b70780e910 100644 --- a/include/linux/kernel_stat.h +++ b/include/linux/kernel_stat.h @@ -7,6 +7,7 @@ #include <linux/cpumask.h> #include <linux/interrupt.h> #include <linux/sched.h> +#include <linux/vtime.h> #include <asm/irq.h> #include <asm/cputime.h> @@ -126,16 +127,16 @@ extern void account_system_time(struct task_struct *, int, cputime_t, cputime_t) extern void account_steal_time(cputime_t); extern void account_idle_time(cputime_t); -extern void account_process_tick(struct task_struct *, int user); -extern void account_steal_ticks(unsigned long ticks); -extern void account_idle_ticks(unsigned long ticks); - #ifdef CONFIG_VIRT_CPU_ACCOUNTING -extern void vtime_task_switch(struct task_struct *prev); -extern void vtime_account_system(struct task_struct *tsk); -extern void vtime_account_idle(struct task_struct *tsk); +static inline void account_process_tick(struct task_struct *tsk, int user) +{ + vtime_account_user(tsk); +} #else -static inline void vtime_task_switch(struct task_struct *prev) { } +extern void account_process_tick(struct task_struct *, int user); #endif +extern void account_steal_ticks(unsigned long ticks); +extern void account_idle_ticks(unsigned long ticks); + #endif /* _LINUX_KERNEL_STAT_H */ diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index ecc554374e44..d5cddd8dcc5c 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -726,7 +726,11 @@ static inline int kvm_deassign_device(struct kvm *kvm, static inline void kvm_guest_enter(void) { BUG_ON(preemptible()); - vtime_account(current); + /* + * This is running in ioctl context so we can avoid + * the call to vtime_account() with its unnecessary idle check. + */ + vtime_account_system_irqsafe(current); current->flags |= PF_VCPU; /* KVM does not hold any references to rcu protected data when it * switches CPU into a guest mode. In fact switching to a guest mode @@ -740,7 +744,11 @@ static inline void kvm_guest_enter(void) static inline void kvm_guest_exit(void) { - vtime_account(current); + /* + * This is running in ioctl context so we can avoid + * the call to vtime_account() with its unnecessary idle check. + */ + vtime_account_system_irqsafe(current); current->flags &= ~PF_VCPU; } diff --git a/include/linux/sched.h b/include/linux/sched.h index 29116b853ece..b96ff1e43ada 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -436,13 +436,28 @@ struct cpu_itimer { }; /** + * struct cputime - snaphsot of system and user cputime + * @utime: time spent in user mode + * @stime: time spent in system mode + * + * Gathers a generic snapshot of user and system time. + */ +struct cputime { + cputime_t utime; + cputime_t stime; +}; + +/** * struct task_cputime - collected CPU time counts * @utime: time spent in user mode, in &cputime_t units * @stime: time spent in kernel mode, in &cputime_t units * @sum_exec_runtime: total time spent on the CPU, in nanoseconds * - * This structure groups together three kinds of CPU time that are - * tracked for threads and thread groups. Most things considering + * This is an extension of struct cputime that includes the total runtime + * spent by the task from the scheduler point of view. + * + * As a result, this structure groups together three kinds of CPU time + * that are tracked for threads and thread groups. Most things considering * CPU time want to group these counts together and treat all three * of them in parallel. */ @@ -583,7 +598,7 @@ struct signal_struct { cputime_t gtime; cputime_t cgtime; #ifndef CONFIG_VIRT_CPU_ACCOUNTING - cputime_t prev_utime, prev_stime; + struct cputime prev_cputime; #endif unsigned long nvcsw, nivcsw, cnvcsw, cnivcsw; unsigned long min_flt, maj_flt, cmin_flt, cmaj_flt; @@ -1064,6 +1079,7 @@ struct sched_class { #ifdef CONFIG_SMP int (*select_task_rq)(struct task_struct *p, int sd_flag, int flags); + void (*migrate_task_rq)(struct task_struct *p, int next_cpu); void (*pre_schedule) (struct rq *this_rq, struct task_struct *task); void (*post_schedule) (struct rq *this_rq); @@ -1098,6 +1114,18 @@ struct load_weight { unsigned long weight, inv_weight; }; +struct sched_avg { + /* + * These sums represent an infinite geometric series and so are bound + * above by 1024/(1-y). Thus we only need a u32 to store them for for all + * choices of y < 1-2^(-32)*1024. + */ + u32 runnable_avg_sum, runnable_avg_period; + u64 last_runnable_update; + s64 decay_count; + unsigned long load_avg_contrib; +}; + #ifdef CONFIG_SCHEDSTATS struct sched_statistics { u64 wait_start; @@ -1158,6 +1186,15 @@ struct sched_entity { /* rq "owned" by this entity/group: */ struct cfs_rq *my_q; #endif +/* + * Load-tracking only depends on SMP, FAIR_GROUP_SCHED dependency below may be + * removed when useful for applications beyond shares distribution (e.g. + * load-balance). + */ +#if defined(CONFIG_SMP) && defined(CONFIG_FAIR_GROUP_SCHED) + /* Per-entity load-tracking */ + struct sched_avg avg; +#endif }; struct sched_rt_entity { @@ -1321,7 +1358,7 @@ struct task_struct { cputime_t utime, stime, utimescaled, stimescaled; cputime_t gtime; #ifndef CONFIG_VIRT_CPU_ACCOUNTING - cputime_t prev_utime, prev_stime; + struct cputime prev_cputime; #endif unsigned long nvcsw, nivcsw; /* context switch counts */ struct timespec start_time; /* monotonic time */ @@ -1732,8 +1769,8 @@ static inline void put_task_struct(struct task_struct *t) __put_task_struct(t); } -extern void task_times(struct task_struct *p, cputime_t *ut, cputime_t *st); -extern void thread_group_times(struct task_struct *p, cputime_t *ut, cputime_t *st); +extern void task_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st); +extern void thread_group_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st); /* * Per process flags diff --git a/include/linux/vtime.h b/include/linux/vtime.h new file mode 100644 index 000000000000..ae30ab58431a --- /dev/null +++ b/include/linux/vtime.h @@ -0,0 +1,48 @@ +#ifndef _LINUX_KERNEL_VTIME_H +#define _LINUX_KERNEL_VTIME_H + +struct task_struct; + +#ifdef CONFIG_VIRT_CPU_ACCOUNTING +extern void vtime_task_switch(struct task_struct *prev); +extern void vtime_account_system(struct task_struct *tsk); +extern void vtime_account_system_irqsafe(struct task_struct *tsk); +extern void vtime_account_idle(struct task_struct *tsk); +extern void vtime_account_user(struct task_struct *tsk); +extern void vtime_account(struct task_struct *tsk); +#else +static inline void vtime_task_switch(struct task_struct *prev) { } +static inline void vtime_account_system(struct task_struct *tsk) { } +static inline void vtime_account_system_irqsafe(struct task_struct *tsk) { } +static inline void vtime_account(struct task_struct *tsk) { } +#endif + +#ifdef CONFIG_IRQ_TIME_ACCOUNTING +extern void irqtime_account_irq(struct task_struct *tsk); +#else +static inline void irqtime_account_irq(struct task_struct *tsk) { } +#endif + +static inline void vtime_account_irq_enter(struct task_struct *tsk) +{ + /* + * Hardirq can interrupt idle task anytime. So we need vtime_account() + * that performs the idle check in CONFIG_VIRT_CPU_ACCOUNTING. + * Softirq can also interrupt idle task directly if it calls + * local_bh_enable(). Such case probably don't exist but we never know. + * Ksoftirqd is not concerned because idle time is flushed on context + * switch. Softirqs in the end of hardirqs are also not a problem because + * the idle time is flushed on hardirq time already. + */ + vtime_account(tsk); + irqtime_account_irq(tsk); +} + +static inline void vtime_account_irq_exit(struct task_struct *tsk) +{ + /* On hard|softirq exit we always account to hard|softirq cputime */ + vtime_account_system(tsk); + irqtime_account_irq(tsk); +} + +#endif /* _LINUX_KERNEL_VTIME_H */ |