diff options
author | Linus Torvalds <torvalds@linux-foundation.org> | 2018-06-05 03:45:38 +0300 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2018-06-05 03:45:38 +0300 |
commit | f7f4e7fc6c517708738d1d1984b170e9475a130f (patch) | |
tree | 9744eba2f74f1f19818d8a4ab8b8d65f865ddec8 | |
parent | d9b446e294f21a9616d36a786087466da64afe0a (diff) | |
parent | 2539fc82aa9b07d968cf9ba1ffeec3e0416ac721 (diff) | |
download | linux-f7f4e7fc6c517708738d1d1984b170e9475a130f.tar.xz |
Merge branch 'sched-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip
Pull scheduler updates from Ingo Molnar:
- power-aware scheduling improvements (Patrick Bellasi)
- NUMA balancing improvements (Mel Gorman)
- vCPU scheduling fixes (Rohit Jain)
* 'sched-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip:
sched/fair: Update util_est before updating schedutil
sched/cpufreq: Modify aggregate utilization to always include blocked FAIR utilization
sched/deadline/Documentation: Add overrun signal and GRUB-PA documentation
sched/core: Distinguish between idle_cpu() calls based on desired effect, introduce available_idle_cpu()
sched/wait: Include <linux/wait.h> in <linux/swait.h>
sched/numa: Stagger NUMA balancing scan periods for new threads
sched/core: Don't schedule threads on pre-empted vCPUs
sched/fair: Avoid calling sync_entity_load_avg() unnecessarily
sched/fair: Rearrange select_task_rq_fair() to optimize it
-rw-r--r-- | Documentation/scheduler/sched-deadline.txt | 25 | ||||
-rw-r--r-- | include/linux/sched.h | 1 | ||||
-rw-r--r-- | include/linux/swait.h | 1 | ||||
-rw-r--r-- | kernel/sched/core.c | 39 | ||||
-rw-r--r-- | kernel/sched/cpufreq_schedutil.c | 17 | ||||
-rw-r--r-- | kernel/sched/fair.c | 117 | ||||
-rw-r--r-- | kernel/sched/sched.h | 6 |
7 files changed, 137 insertions, 69 deletions
diff --git a/Documentation/scheduler/sched-deadline.txt b/Documentation/scheduler/sched-deadline.txt index 8ce78f82ae23..b14e03ff3528 100644 --- a/Documentation/scheduler/sched-deadline.txt +++ b/Documentation/scheduler/sched-deadline.txt @@ -49,7 +49,7 @@ CONTENTS 2.1 Main algorithm ------------------ - SCHED_DEADLINE uses three parameters, named "runtime", "period", and + SCHED_DEADLINE [18] uses three parameters, named "runtime", "period", and "deadline", to schedule tasks. A SCHED_DEADLINE task should receive "runtime" microseconds of execution time every "period" microseconds, and these "runtime" microseconds are available within "deadline" microseconds @@ -117,6 +117,10 @@ CONTENTS scheduling deadline = scheduling deadline + period remaining runtime = remaining runtime + runtime + The SCHED_FLAG_DL_OVERRUN flag in sched_attr's sched_flags field allows a task + to get informed about runtime overruns through the delivery of SIGXCPU + signals. + 2.2 Bandwidth reclaiming ------------------------ @@ -279,6 +283,19 @@ CONTENTS running_bw is incremented. +2.3 Energy-aware scheduling +------------------------ + + When cpufreq's schedutil governor is selected, SCHED_DEADLINE implements the + GRUB-PA [19] algorithm, reducing the CPU operating frequency to the minimum + value that still allows to meet the deadlines. This behavior is currently + implemented only for ARM architectures. + + A particular care must be taken in case the time needed for changing frequency + is of the same order of magnitude of the reservation period. In such cases, + setting a fixed CPU frequency results in a lower amount of deadline misses. + + 3. Scheduling Real-Time Tasks ============================= @@ -505,6 +522,12 @@ CONTENTS 17 - L. Abeni, G. Lipari, A. Parri, Y. Sun, Multicore CPU reclaiming: parallel or sequential?. In Proceedings of the 31st Annual ACM Symposium on Applied Computing, 2016. + 18 - J. Lelli, C. Scordino, L. Abeni, D. Faggioli, Deadline scheduling in the + Linux kernel, Software: Practice and Experience, 46(6): 821-839, June + 2016. + 19 - C. Scordino, L. Abeni, J. Lelli, Energy-Aware Real-Time Scheduling in + the Linux Kernel, 33rd ACM/SIGAPP Symposium On Applied Computing (SAC + 2018), Pau, France, April 2018. 4. Bandwidth management diff --git a/include/linux/sched.h b/include/linux/sched.h index 5a0c10b45273..14e4f9c12337 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1512,6 +1512,7 @@ static inline int task_nice(const struct task_struct *p) extern int can_nice(const struct task_struct *p, const int nice); extern int task_curr(const struct task_struct *p); extern int idle_cpu(int cpu); +extern int available_idle_cpu(int cpu); extern int sched_setscheduler(struct task_struct *, int, const struct sched_param *); extern int sched_setscheduler_nocheck(struct task_struct *, int, const struct sched_param *); extern int sched_setattr(struct task_struct *, const struct sched_attr *); diff --git a/include/linux/swait.h b/include/linux/swait.h index 13eac825819d..bf8cb0dee23c 100644 --- a/include/linux/swait.h +++ b/include/linux/swait.h @@ -5,6 +5,7 @@ #include <linux/list.h> #include <linux/stddef.h> #include <linux/spinlock.h> +#include <linux/wait.h> #include <asm/current.h> /* diff --git a/kernel/sched/core.c b/kernel/sched/core.c index e27034bd954e..e9866f86f304 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -2194,27 +2194,7 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p) INIT_HLIST_HEAD(&p->preempt_notifiers); #endif -#ifdef CONFIG_NUMA_BALANCING - if (p->mm && atomic_read(&p->mm->mm_users) == 1) { - p->mm->numa_next_scan = jiffies + msecs_to_jiffies(sysctl_numa_balancing_scan_delay); - p->mm->numa_scan_seq = 0; - } - - if (clone_flags & CLONE_VM) - p->numa_preferred_nid = current->numa_preferred_nid; - else - p->numa_preferred_nid = -1; - - p->node_stamp = 0ULL; - p->numa_scan_seq = p->mm ? p->mm->numa_scan_seq : 0; - p->numa_scan_period = sysctl_numa_balancing_scan_delay; - p->numa_work.next = &p->numa_work; - p->numa_faults = NULL; - p->last_task_numa_placement = 0; - p->last_sum_exec_runtime = 0; - - p->numa_group = NULL; -#endif /* CONFIG_NUMA_BALANCING */ + init_numa_balancing(clone_flags, p); } DEFINE_STATIC_KEY_FALSE(sched_numa_balancing); @@ -4050,6 +4030,23 @@ int idle_cpu(int cpu) } /** + * available_idle_cpu - is a given CPU idle for enqueuing work. + * @cpu: the CPU in question. + * + * Return: 1 if the CPU is currently idle. 0 otherwise. + */ +int available_idle_cpu(int cpu) +{ + if (!idle_cpu(cpu)) + return 0; + + if (vcpu_is_preempted(cpu)) + return 0; + + return 1; +} + +/** * idle_task - return the idle task for a given CPU. * @cpu: the processor in question. * diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c index e13df951aca7..28592b62b1d5 100644 --- a/kernel/sched/cpufreq_schedutil.c +++ b/kernel/sched/cpufreq_schedutil.c @@ -183,22 +183,21 @@ static void sugov_get_util(struct sugov_cpu *sg_cpu) static unsigned long sugov_aggregate_util(struct sugov_cpu *sg_cpu) { struct rq *rq = cpu_rq(sg_cpu->cpu); - unsigned long util; - if (rq->rt.rt_nr_running) { - util = sg_cpu->max; - } else { - util = sg_cpu->util_dl; - if (rq->cfs.h_nr_running) - util += sg_cpu->util_cfs; - } + if (rq->rt.rt_nr_running) + return sg_cpu->max; /* + * Utilization required by DEADLINE must always be granted while, for + * FAIR, we use blocked utilization of IDLE CPUs as a mechanism to + * gracefully reduce the frequency when no tasks show up for longer + * periods of time. + * * Ideally we would like to set util_dl as min/guaranteed freq and * util_cfs + util_dl as requested freq. However, cpufreq is not yet * ready for such an interface. So, we only do the latter for now. */ - return min(util, sg_cpu->max); + return min(sg_cpu->max, (sg_cpu->util_dl + sg_cpu->util_cfs)); } static void sugov_set_iowait_boost(struct sugov_cpu *sg_cpu, u64 time, unsigned int flags) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 79f574dba096..e497c05aab7f 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -1139,6 +1139,47 @@ static unsigned int task_scan_max(struct task_struct *p) return max(smin, smax); } +void init_numa_balancing(unsigned long clone_flags, struct task_struct *p) +{ + int mm_users = 0; + struct mm_struct *mm = p->mm; + + if (mm) { + mm_users = atomic_read(&mm->mm_users); + if (mm_users == 1) { + mm->numa_next_scan = jiffies + msecs_to_jiffies(sysctl_numa_balancing_scan_delay); + mm->numa_scan_seq = 0; + } + } + p->node_stamp = 0; + p->numa_scan_seq = mm ? mm->numa_scan_seq : 0; + p->numa_scan_period = sysctl_numa_balancing_scan_delay; + p->numa_work.next = &p->numa_work; + p->numa_faults = NULL; + p->numa_group = NULL; + p->last_task_numa_placement = 0; + p->last_sum_exec_runtime = 0; + + /* New address space, reset the preferred nid */ + if (!(clone_flags & CLONE_VM)) { + p->numa_preferred_nid = -1; + return; + } + + /* + * New thread, keep existing numa_preferred_nid which should be copied + * already by arch_dup_task_struct but stagger when scans start. + */ + if (mm) { + unsigned int delay; + + delay = min_t(unsigned int, task_scan_max(current), + current->numa_scan_period * mm_users * NSEC_PER_MSEC); + delay += 2 * TICK_NSEC; + p->node_stamp = delay; + } +} + static void account_numa_enqueue(struct rq *rq, struct task_struct *p) { rq->nr_numa_running += (p->numa_preferred_nid != -1); @@ -5345,6 +5386,14 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) struct sched_entity *se = &p->se; /* + * The code below (indirectly) updates schedutil which looks at + * the cfs_rq utilization to select a frequency. + * Let's add the task's estimated utilization to the cfs_rq's + * estimated utilization, before we update schedutil. + */ + util_est_enqueue(&rq->cfs, p); + + /* * If in_iowait is set, the code below may not trigger any cpufreq * utilization updates, so do it here explicitly with the IOWAIT flag * passed. @@ -5385,7 +5434,6 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) if (!se) add_nr_running(rq, 1); - util_est_enqueue(&rq->cfs, p); hrtick_update(rq); } @@ -5858,8 +5906,8 @@ wake_affine_idle(int this_cpu, int prev_cpu, int sync) * a cpufreq perspective, it's better to have higher utilisation * on one CPU. */ - if (idle_cpu(this_cpu) && cpus_share_cache(this_cpu, prev_cpu)) - return idle_cpu(prev_cpu) ? prev_cpu : this_cpu; + if (available_idle_cpu(this_cpu) && cpus_share_cache(this_cpu, prev_cpu)) + return available_idle_cpu(prev_cpu) ? prev_cpu : this_cpu; if (sync && cpu_rq(this_cpu)->nr_running == 1) return this_cpu; @@ -6102,7 +6150,7 @@ find_idlest_group_cpu(struct sched_group *group, struct task_struct *p, int this /* Traverse only the allowed CPUs */ for_each_cpu_and(i, sched_group_span(group), &p->cpus_allowed) { - if (idle_cpu(i)) { + if (available_idle_cpu(i)) { struct rq *rq = cpu_rq(i); struct cpuidle_state *idle = idle_get_state(rq); if (idle && idle->exit_latency < min_exit_latency) { @@ -6144,6 +6192,13 @@ static inline int find_idlest_cpu(struct sched_domain *sd, struct task_struct *p if (!cpumask_intersects(sched_domain_span(sd), &p->cpus_allowed)) return prev_cpu; + /* + * We need task's util for capacity_spare_wake, sync it up to prev_cpu's + * last_update_time. + */ + if (!(sd_flag & SD_BALANCE_FORK)) + sync_entity_load_avg(&p->se); + while (sd) { struct sched_group *group; struct sched_domain *tmp; @@ -6224,7 +6279,7 @@ void __update_idle_core(struct rq *rq) if (cpu == core) continue; - if (!idle_cpu(cpu)) + if (!available_idle_cpu(cpu)) goto unlock; } @@ -6256,7 +6311,7 @@ static int select_idle_core(struct task_struct *p, struct sched_domain *sd, int for_each_cpu(cpu, cpu_smt_mask(core)) { cpumask_clear_cpu(cpu, cpus); - if (!idle_cpu(cpu)) + if (!available_idle_cpu(cpu)) idle = false; } @@ -6285,7 +6340,7 @@ static int select_idle_smt(struct task_struct *p, struct sched_domain *sd, int t for_each_cpu(cpu, cpu_smt_mask(target)) { if (!cpumask_test_cpu(cpu, &p->cpus_allowed)) continue; - if (idle_cpu(cpu)) + if (available_idle_cpu(cpu)) return cpu; } @@ -6348,7 +6403,7 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, int t return -1; if (!cpumask_test_cpu(cpu, &p->cpus_allowed)) continue; - if (idle_cpu(cpu)) + if (available_idle_cpu(cpu)) break; } @@ -6368,13 +6423,13 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target) struct sched_domain *sd; int i, recent_used_cpu; - if (idle_cpu(target)) + if (available_idle_cpu(target)) return target; /* * If the previous CPU is cache affine and idle, don't be stupid: */ - if (prev != target && cpus_share_cache(prev, target) && idle_cpu(prev)) + if (prev != target && cpus_share_cache(prev, target) && available_idle_cpu(prev)) return prev; /* Check a recently used CPU as a potential idle candidate: */ @@ -6382,7 +6437,7 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target) if (recent_used_cpu != prev && recent_used_cpu != target && cpus_share_cache(recent_used_cpu, target) && - idle_cpu(recent_used_cpu) && + available_idle_cpu(recent_used_cpu) && cpumask_test_cpu(p->recent_used_cpu, &p->cpus_allowed)) { /* * Replace recent_used_cpu with prev as it is a potential @@ -6558,7 +6613,7 @@ static int wake_cap(struct task_struct *p, int cpu, int prev_cpu) static int select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_flags) { - struct sched_domain *tmp, *affine_sd = NULL, *sd = NULL; + struct sched_domain *tmp, *sd = NULL; int cpu = smp_processor_id(); int new_cpu = prev_cpu; int want_affine = 0; @@ -6581,7 +6636,10 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f */ if (want_affine && (tmp->flags & SD_WAKE_AFFINE) && cpumask_test_cpu(prev_cpu, sched_domain_span(tmp))) { - affine_sd = tmp; + if (cpu != prev_cpu) + new_cpu = wake_affine(tmp, p, cpu, prev_cpu, sync); + + sd = NULL; /* Prefer wake_affine over balance flags */ break; } @@ -6591,33 +6649,16 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f break; } - if (affine_sd) { - sd = NULL; /* Prefer wake_affine over balance flags */ - if (cpu == prev_cpu) - goto pick_cpu; - - new_cpu = wake_affine(affine_sd, p, cpu, prev_cpu, sync); - } - - if (sd && !(sd_flag & SD_BALANCE_FORK)) { - /* - * We're going to need the task's util for capacity_spare_wake - * in find_idlest_group. Sync it up to prev_cpu's - * last_update_time. - */ - sync_entity_load_avg(&p->se); - } + if (unlikely(sd)) { + /* Slow path */ + new_cpu = find_idlest_cpu(sd, p, cpu, prev_cpu, sd_flag); + } else if (sd_flag & SD_BALANCE_WAKE) { /* XXX always ? */ + /* Fast path */ - if (!sd) { -pick_cpu: - if (sd_flag & SD_BALANCE_WAKE) { /* XXX always ? */ - new_cpu = select_idle_sibling(p, prev_cpu, new_cpu); + new_cpu = select_idle_sibling(p, prev_cpu, new_cpu); - if (want_affine) - current->recent_used_cpu = cpu; - } - } else { - new_cpu = find_idlest_cpu(sd, p, cpu, prev_cpu, sd_flag); + if (want_affine) + current->recent_used_cpu = cpu; } rcu_read_unlock(); diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index cb467c221b15..6601baf2361c 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -1069,6 +1069,12 @@ enum numa_faults_stats { extern void sched_setnuma(struct task_struct *p, int node); extern int migrate_task_to(struct task_struct *p, int cpu); extern int migrate_swap(struct task_struct *, struct task_struct *); +extern void init_numa_balancing(unsigned long clone_flags, struct task_struct *p); +#else +static inline void +init_numa_balancing(unsigned long clone_flags, struct task_struct *p) +{ +} #endif /* CONFIG_NUMA_BALANCING */ #ifdef CONFIG_SMP |