diff options
Diffstat (limited to 'kernel/sched/fair.c')
-rw-r--r-- | kernel/sched/fair.c | 322 |
1 files changed, 158 insertions, 164 deletions
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 04a3ce20da67..8a8bd7b13634 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -531,12 +531,15 @@ static inline u64 min_vruntime(u64 min_vruntime, u64 vruntime) return min_vruntime; } -static inline int entity_before(struct sched_entity *a, +static inline bool entity_before(struct sched_entity *a, struct sched_entity *b) { return (s64)(a->vruntime - b->vruntime) < 0; } +#define __node_2_se(node) \ + rb_entry((node), struct sched_entity, run_node) + static void update_min_vruntime(struct cfs_rq *cfs_rq) { struct sched_entity *curr = cfs_rq->curr; @@ -552,8 +555,7 @@ static void update_min_vruntime(struct cfs_rq *cfs_rq) } if (leftmost) { /* non-empty tree */ - struct sched_entity *se; - se = rb_entry(leftmost, struct sched_entity, run_node); + struct sched_entity *se = __node_2_se(leftmost); if (!curr) vruntime = se->vruntime; @@ -569,37 +571,17 @@ static void update_min_vruntime(struct cfs_rq *cfs_rq) #endif } +static inline bool __entity_less(struct rb_node *a, const struct rb_node *b) +{ + return entity_before(__node_2_se(a), __node_2_se(b)); +} + /* * Enqueue an entity into the rb-tree: */ static void __enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) { - struct rb_node **link = &cfs_rq->tasks_timeline.rb_root.rb_node; - struct rb_node *parent = NULL; - struct sched_entity *entry; - bool leftmost = true; - - /* - * Find the right place in the rbtree: - */ - while (*link) { - parent = *link; - entry = rb_entry(parent, struct sched_entity, run_node); - /* - * We dont care about collisions. Nodes with - * the same key stay together. - */ - if (entity_before(se, entry)) { - link = &parent->rb_left; - } else { - link = &parent->rb_right; - leftmost = false; - } - } - - rb_link_node(&se->run_node, parent, link); - rb_insert_color_cached(&se->run_node, - &cfs_rq->tasks_timeline, leftmost); + rb_add_cached(&se->run_node, &cfs_rq->tasks_timeline, __entity_less); } static void __dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) @@ -614,7 +596,7 @@ struct sched_entity *__pick_first_entity(struct cfs_rq *cfs_rq) if (!left) return NULL; - return rb_entry(left, struct sched_entity, run_node); + return __node_2_se(left); } static struct sched_entity *__pick_next_entity(struct sched_entity *se) @@ -624,7 +606,7 @@ static struct sched_entity *__pick_next_entity(struct sched_entity *se) if (!next) return NULL; - return rb_entry(next, struct sched_entity, run_node); + return __node_2_se(next); } #ifdef CONFIG_SCHED_DEBUG @@ -635,7 +617,7 @@ struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq) if (!last) return NULL; - return rb_entry(last, struct sched_entity, run_node); + return __node_2_se(last); } /************************************************************** @@ -3943,6 +3925,22 @@ static inline void util_est_enqueue(struct cfs_rq *cfs_rq, trace_sched_util_est_cfs_tp(cfs_rq); } +static inline void util_est_dequeue(struct cfs_rq *cfs_rq, + struct task_struct *p) +{ + unsigned int enqueued; + + if (!sched_feat(UTIL_EST)) + return; + + /* Update root cfs_rq's estimated utilization */ + enqueued = cfs_rq->avg.util_est.enqueued; + enqueued -= min_t(unsigned int, enqueued, _task_util_est(p)); + WRITE_ONCE(cfs_rq->avg.util_est.enqueued, enqueued); + + trace_sched_util_est_cfs_tp(cfs_rq); +} + /* * Check if a (signed) value is within a specified (unsigned) margin, * based on the observation that: @@ -3956,23 +3954,16 @@ static inline bool within_margin(int value, int margin) return ((unsigned int)(value + margin - 1) < (2 * margin - 1)); } -static void -util_est_dequeue(struct cfs_rq *cfs_rq, struct task_struct *p, bool task_sleep) +static inline void util_est_update(struct cfs_rq *cfs_rq, + struct task_struct *p, + bool task_sleep) { long last_ewma_diff; struct util_est ue; - int cpu; if (!sched_feat(UTIL_EST)) return; - /* Update root cfs_rq's estimated utilization */ - ue.enqueued = cfs_rq->avg.util_est.enqueued; - ue.enqueued -= min_t(unsigned int, ue.enqueued, _task_util_est(p)); - WRITE_ONCE(cfs_rq->avg.util_est.enqueued, ue.enqueued); - - trace_sched_util_est_cfs_tp(cfs_rq); - /* * Skip update of task's estimated utilization when the task has not * yet completed an activation, e.g. being migrated. @@ -4012,8 +4003,7 @@ util_est_dequeue(struct cfs_rq *cfs_rq, struct task_struct *p, bool task_sleep) * To avoid overestimation of actual task utilization, skip updates if * we cannot grant there is idle time in this CPU. */ - cpu = cpu_of(rq_of(cfs_rq)); - if (task_util(p) > capacity_orig_of(cpu)) + if (task_util(p) > capacity_orig_of(cpu_of(rq_of(cfs_rq)))) return; /* @@ -4052,7 +4042,7 @@ static inline void update_misfit_status(struct task_struct *p, struct rq *rq) if (!static_branch_unlikely(&sched_asym_cpucapacity)) return; - if (!p) { + if (!p || p->nr_cpus_allowed == 1) { rq->misfit_task_load = 0; return; } @@ -4096,8 +4086,11 @@ static inline void util_est_enqueue(struct cfs_rq *cfs_rq, struct task_struct *p) {} static inline void -util_est_dequeue(struct cfs_rq *cfs_rq, struct task_struct *p, - bool task_sleep) {} +util_est_dequeue(struct cfs_rq *cfs_rq, struct task_struct *p) {} + +static inline void +util_est_update(struct cfs_rq *cfs_rq, struct task_struct *p, + bool task_sleep) {} static inline void update_misfit_status(struct task_struct *p, struct rq *rq) {} #endif /* CONFIG_SMP */ @@ -5419,7 +5412,7 @@ static void hrtick_start_fair(struct rq *rq, struct task_struct *p) s64 delta = slice - ran; if (delta < 0) { - if (rq->curr == p) + if (task_current(rq, p)) resched_curr(rq); return; } @@ -5436,7 +5429,7 @@ static void hrtick_update(struct rq *rq) { struct task_struct *curr = rq->curr; - if (!hrtick_enabled(rq) || curr->sched_class != &fair_sched_class) + if (!hrtick_enabled_fair(rq) || curr->sched_class != &fair_sched_class) return; if (cfs_rq_of(&curr->se)->nr_running < sched_nr_latency) @@ -5609,6 +5602,8 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) int idle_h_nr_running = task_has_idle_policy(p); bool was_sched_idle = sched_idle_rq(rq); + util_est_dequeue(&rq->cfs, p); + for_each_sched_entity(se) { cfs_rq = cfs_rq_of(se); dequeue_entity(cfs_rq, se, flags); @@ -5659,7 +5654,7 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) rq->next_balance = jiffies; dequeue_throttle: - util_est_dequeue(&rq->cfs, p, task_sleep); + util_est_update(&rq->cfs, p, task_sleep); hrtick_update(rq); } @@ -6006,6 +6001,14 @@ static inline int find_idlest_cpu(struct sched_domain *sd, struct task_struct *p return new_cpu; } +static inline int __select_idle_cpu(int cpu) +{ + if (available_idle_cpu(cpu) || sched_idle_cpu(cpu)) + return cpu; + + return -1; +} + #ifdef CONFIG_SCHED_SMT DEFINE_STATIC_KEY_FALSE(sched_smt_present); EXPORT_SYMBOL_GPL(sched_smt_present); @@ -6064,74 +6067,51 @@ unlock: * there are no idle cores left in the system; tracked through * sd_llc->shared->has_idle_cores and enabled through update_idle_core() above. */ -static int select_idle_core(struct task_struct *p, struct sched_domain *sd, int target) +static int select_idle_core(struct task_struct *p, int core, struct cpumask *cpus, int *idle_cpu) { - struct cpumask *cpus = this_cpu_cpumask_var_ptr(select_idle_mask); - int core, cpu; + bool idle = true; + int cpu; if (!static_branch_likely(&sched_smt_present)) - return -1; + return __select_idle_cpu(core); - if (!test_idle_cores(target, false)) - return -1; - - cpumask_and(cpus, sched_domain_span(sd), p->cpus_ptr); - - for_each_cpu_wrap(core, cpus, target) { - bool idle = true; - - for_each_cpu(cpu, cpu_smt_mask(core)) { - if (!available_idle_cpu(cpu)) { - idle = false; - break; + for_each_cpu(cpu, cpu_smt_mask(core)) { + if (!available_idle_cpu(cpu)) { + idle = false; + if (*idle_cpu == -1) { + if (sched_idle_cpu(cpu) && cpumask_test_cpu(cpu, p->cpus_ptr)) { + *idle_cpu = cpu; + break; + } + continue; } + break; } - - if (idle) - return core; - - cpumask_andnot(cpus, cpus, cpu_smt_mask(core)); + if (*idle_cpu == -1 && cpumask_test_cpu(cpu, p->cpus_ptr)) + *idle_cpu = cpu; } - /* - * Failed to find an idle core; stop looking for one. - */ - set_idle_cores(target, 0); + if (idle) + return core; + cpumask_andnot(cpus, cpus, cpu_smt_mask(core)); return -1; } -/* - * Scan the local SMT mask for idle CPUs. - */ -static int select_idle_smt(struct task_struct *p, struct sched_domain *sd, int target) -{ - int cpu; - - if (!static_branch_likely(&sched_smt_present)) - return -1; - - for_each_cpu(cpu, cpu_smt_mask(target)) { - if (!cpumask_test_cpu(cpu, p->cpus_ptr) || - !cpumask_test_cpu(cpu, sched_domain_span(sd))) - continue; - if (available_idle_cpu(cpu) || sched_idle_cpu(cpu)) - return cpu; - } +#else /* CONFIG_SCHED_SMT */ - return -1; +static inline void set_idle_cores(int cpu, int val) +{ } -#else /* CONFIG_SCHED_SMT */ - -static inline int select_idle_core(struct task_struct *p, struct sched_domain *sd, int target) +static inline bool test_idle_cores(int cpu, bool def) { - return -1; + return def; } -static inline int select_idle_smt(struct task_struct *p, struct sched_domain *sd, int target) +static inline int select_idle_core(struct task_struct *p, int core, struct cpumask *cpus, int *idle_cpu) { - return -1; + return __select_idle_cpu(core); } #endif /* CONFIG_SCHED_SMT */ @@ -6144,49 +6124,61 @@ static inline int select_idle_smt(struct task_struct *p, struct sched_domain *sd static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, int target) { struct cpumask *cpus = this_cpu_cpumask_var_ptr(select_idle_mask); + int i, cpu, idle_cpu = -1, nr = INT_MAX; + bool smt = test_idle_cores(target, false); + int this = smp_processor_id(); struct sched_domain *this_sd; - u64 avg_cost, avg_idle; u64 time; - int this = smp_processor_id(); - int cpu, nr = INT_MAX; this_sd = rcu_dereference(*this_cpu_ptr(&sd_llc)); if (!this_sd) return -1; - /* - * Due to large variance we need a large fuzz factor; hackbench in - * particularly is sensitive here. - */ - avg_idle = this_rq()->avg_idle / 512; - avg_cost = this_sd->avg_scan_cost + 1; + cpumask_and(cpus, sched_domain_span(sd), p->cpus_ptr); - if (sched_feat(SIS_AVG_CPU) && avg_idle < avg_cost) - return -1; + if (sched_feat(SIS_PROP) && !smt) { + u64 avg_cost, avg_idle, span_avg; + + /* + * Due to large variance we need a large fuzz factor; + * hackbench in particularly is sensitive here. + */ + avg_idle = this_rq()->avg_idle / 512; + avg_cost = this_sd->avg_scan_cost + 1; - if (sched_feat(SIS_PROP)) { - u64 span_avg = sd->span_weight * avg_idle; + span_avg = sd->span_weight * avg_idle; if (span_avg > 4*avg_cost) nr = div_u64(span_avg, avg_cost); else nr = 4; - } - - time = cpu_clock(this); - cpumask_and(cpus, sched_domain_span(sd), p->cpus_ptr); + time = cpu_clock(this); + } for_each_cpu_wrap(cpu, cpus, target) { - if (!--nr) - return -1; - if (available_idle_cpu(cpu) || sched_idle_cpu(cpu)) - break; + if (smt) { + i = select_idle_core(p, cpu, cpus, &idle_cpu); + if ((unsigned int)i < nr_cpumask_bits) + return i; + + } else { + if (!--nr) + return -1; + idle_cpu = __select_idle_cpu(cpu); + if ((unsigned int)idle_cpu < nr_cpumask_bits) + break; + } } - time = cpu_clock(this) - time; - update_avg(&this_sd->avg_scan_cost, time); + if (smt) + set_idle_cores(this, false); - return cpu; + if (sched_feat(SIS_PROP) && !smt) { + time = cpu_clock(this) - time; + update_avg(&this_sd->avg_scan_cost, time); + } + + return idle_cpu; } /* @@ -6315,18 +6307,10 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target) if (!sd) return target; - i = select_idle_core(p, sd, target); - if ((unsigned)i < nr_cpumask_bits) - return i; - i = select_idle_cpu(p, sd, target); if ((unsigned)i < nr_cpumask_bits) return i; - i = select_idle_smt(p, sd, target); - if ((unsigned)i < nr_cpumask_bits) - return i; - return target; } @@ -6543,7 +6527,7 @@ compute_energy(struct task_struct *p, int dst_cpu, struct perf_domain *pd) * is already enough to scale the EM reported power * consumption at the (eventually clamped) cpu_capacity. */ - sum_util += schedutil_cpu_util(cpu, util_cfs, cpu_cap, + sum_util += effective_cpu_util(cpu, util_cfs, cpu_cap, ENERGY_UTIL, NULL); /* @@ -6553,7 +6537,7 @@ compute_energy(struct task_struct *p, int dst_cpu, struct perf_domain *pd) * NOTE: in case RT tasks are running, by default the * FREQUENCY_UTIL's utilization can be max OPP. */ - cpu_util = schedutil_cpu_util(cpu, util_cfs, cpu_cap, + cpu_util = effective_cpu_util(cpu, util_cfs, cpu_cap, FREQUENCY_UTIL, tsk); max_util = max(max_util, cpu_util); } @@ -6651,7 +6635,7 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu) * IOW, placing the task there would make the CPU * overutilized. Take uclamp into account to see how * much capacity we can get out of the CPU; this is - * aligned with schedutil_cpu_util(). + * aligned with sched_cpu_util(). */ util = uclamp_rq_util_with(cpu_rq(cpu), util, p); if (!fits_capacity(util, cpu_cap)) @@ -7132,7 +7116,7 @@ done: __maybe_unused; list_move(&p->se.group_node, &rq->cfs_tasks); #endif - if (hrtick_enabled(rq)) + if (hrtick_enabled_fair(rq)) hrtick_start_fair(rq, p); update_misfit_status(p, rq); @@ -9389,8 +9373,11 @@ static struct rq *find_busiest_queue(struct lb_env *env, if (rt > env->fbq_type) continue; - capacity = capacity_of(i); nr_running = rq->cfs.h_nr_running; + if (!nr_running) + continue; + + capacity = capacity_of(i); /* * For ASYM_CPUCAPACITY domains, don't pick a CPU that could @@ -9496,13 +9483,32 @@ asym_active_balance(struct lb_env *env) } static inline bool -voluntary_active_balance(struct lb_env *env) +imbalanced_active_balance(struct lb_env *env) +{ + struct sched_domain *sd = env->sd; + + /* + * The imbalanced case includes the case of pinned tasks preventing a fair + * distribution of the load on the system but also the even distribution of the + * threads on a system with spare capacity + */ + if ((env->migration_type == migrate_task) && + (sd->nr_balance_failed > sd->cache_nice_tries+2)) + return 1; + + return 0; +} + +static int need_active_balance(struct lb_env *env) { struct sched_domain *sd = env->sd; if (asym_active_balance(env)) return 1; + if (imbalanced_active_balance(env)) + return 1; + /* * The dst_cpu is idle and the src_cpu CPU has only 1 CFS task. * It's worth migrating the task if the src_cpu's capacity is reduced @@ -9522,16 +9528,6 @@ voluntary_active_balance(struct lb_env *env) return 0; } -static int need_active_balance(struct lb_env *env) -{ - struct sched_domain *sd = env->sd; - - if (voluntary_active_balance(env)) - return 1; - - return unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2); -} - static int active_load_balance_cpu_stop(void *data); static int should_we_balance(struct lb_env *env) @@ -9623,6 +9619,8 @@ redo: env.src_rq = busiest; ld_moved = 0; + /* Clear this flag as soon as we find a pullable task */ + env.flags |= LBF_ALL_PINNED; if (busiest->nr_running > 1) { /* * Attempt to move tasks. If find_busiest_group has found @@ -9630,7 +9628,6 @@ redo: * still unbalanced. ld_moved simply stays zero, so it is * correctly treated as an imbalance. */ - env.flags |= LBF_ALL_PINNED; env.loop_max = min(sysctl_sched_nr_migrate, busiest->nr_running); more_balance: @@ -9756,10 +9753,12 @@ more_balance: if (!cpumask_test_cpu(this_cpu, busiest->curr->cpus_ptr)) { raw_spin_unlock_irqrestore(&busiest->lock, flags); - env.flags |= LBF_ALL_PINNED; goto out_one_pinned; } + /* Record that we found at least one task that could run on this_cpu */ + env.flags &= ~LBF_ALL_PINNED; + /* * ->active_balance synchronizes accesses to * ->active_balance_work. Once set, it's cleared @@ -9781,21 +9780,13 @@ more_balance: /* We've kicked active balancing, force task migration. */ sd->nr_balance_failed = sd->cache_nice_tries+1; } - } else + } else { sd->nr_balance_failed = 0; + } - if (likely(!active_balance) || voluntary_active_balance(&env)) { + if (likely(!active_balance) || need_active_balance(&env)) { /* We were unbalanced, so reset the balancing interval */ sd->balance_interval = sd->min_interval; - } else { - /* - * If we've begun active balancing, start to back off. This - * case may not be covered by the all_pinned logic if there - * is only 1 task on the busy runqueue (because we don't call - * detach_tasks). - */ - if (sd->balance_interval < sd->max_interval) - sd->balance_interval *= 2; } goto out; @@ -10700,8 +10691,11 @@ static __latent_entropy void run_rebalance_domains(struct softirq_action *h) */ void trigger_load_balance(struct rq *rq) { - /* Don't need to rebalance while attached to NULL domain */ - if (unlikely(on_null_domain(rq))) + /* + * Don't need to rebalance while attached to NULL domain or + * runqueue CPU is not active + */ + if (unlikely(on_null_domain(rq) || !cpu_active(cpu_of(rq)))) return; if (time_after_eq(jiffies, rq->next_balance)) @@ -10806,7 +10800,7 @@ prio_changed_fair(struct rq *rq, struct task_struct *p, int oldprio) * our priority decreased, or if we are not currently running on * this runqueue and our priority is higher than the current's */ - if (rq->curr == p) { + if (task_current(rq, p)) { if (p->prio > oldprio) resched_curr(rq); } else @@ -10939,7 +10933,7 @@ static void switched_to_fair(struct rq *rq, struct task_struct *p) * kick off the schedule if running, otherwise just see * if we can still preempt the current task. */ - if (rq->curr == p) + if (task_current(rq, p)) resched_curr(rq); else check_preempt_curr(rq, p, 0); |