Diffstat (limited to 'kernel/sched/fair.c')
-rw-r--r-- | kernel/sched/fair.c | 194
1 file changed, 130 insertions, 64 deletions
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 290f9e38378c..04a3ce20da67 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -906,6 +906,15 @@ update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se)
 	if (!schedstat_enabled())
 		return;
 
+	/*
+	 * When sched_schedstat changes from 0 to 1, some sched entities
+	 * may already be in the runqueue with se->statistics.wait_start
+	 * still 0, which would make the delta wrong. We need to avoid
+	 * this scenario.
+	 */
+	if (unlikely(!schedstat_val(se->statistics.wait_start)))
+		return;
+
 	delta = rq_clock(rq_of(cfs_rq)) - schedstat_val(se->statistics.wait_start);
 
 	if (entity_is_task(se)) {
@@ -1550,7 +1559,8 @@ struct task_numa_env {
 static unsigned long cpu_load(struct rq *rq);
 static unsigned long cpu_runnable(struct rq *rq);
 static unsigned long cpu_util(int cpu);
-static inline long adjust_numa_imbalance(int imbalance, int nr_running);
+static inline long adjust_numa_imbalance(int imbalance,
+					int dst_running, int dst_weight);
 
 static inline enum
 numa_type numa_classify(unsigned int imbalance_pct,
@@ -1930,7 +1940,8 @@ static void task_numa_find_cpu(struct task_numa_env *env,
 		src_running = env->src_stats.nr_running - 1;
 		dst_running = env->dst_stats.nr_running + 1;
 		imbalance = max(0, dst_running - src_running);
-		imbalance = adjust_numa_imbalance(imbalance, dst_running);
+		imbalance = adjust_numa_imbalance(imbalance, dst_running,
+						env->dst_stats.weight);
 
 		/* Use idle CPU if there is no imbalance */
 		if (!imbalance) {
@@ -4779,25 +4790,37 @@ static bool throttle_cfs_rq(struct cfs_rq *cfs_rq)
 		struct cfs_rq *qcfs_rq = cfs_rq_of(se);
 		/* throttled entity or throttle-on-deactivate */
 		if (!se->on_rq)
-			break;
+			goto done;
 
-		if (dequeue) {
-			dequeue_entity(qcfs_rq, se, DEQUEUE_SLEEP);
-		} else {
-			update_load_avg(qcfs_rq, se, 0);
-			se_update_runnable(se);
-		}
+		dequeue_entity(qcfs_rq, se, DEQUEUE_SLEEP);
 
 		qcfs_rq->h_nr_running -= task_delta;
 		qcfs_rq->idle_h_nr_running -= idle_task_delta;
 
-		if (qcfs_rq->load.weight)
-			dequeue = 0;
+		if (qcfs_rq->load.weight) {
+			/* Avoid re-evaluating load for this entity: */
+			se = parent_entity(se);
+			break;
+		}
 	}
 
-	if (!se)
-		sub_nr_running(rq, task_delta);
+	for_each_sched_entity(se) {
+		struct cfs_rq *qcfs_rq = cfs_rq_of(se);
+		/* throttled entity or throttle-on-deactivate */
+		if (!se->on_rq)
+			goto done;
+
+		update_load_avg(qcfs_rq, se, 0);
+		se_update_runnable(se);
+
+		qcfs_rq->h_nr_running -= task_delta;
+		qcfs_rq->idle_h_nr_running -= idle_task_delta;
+	}
+	/* At this point se is NULL and we are at root level */
+	sub_nr_running(rq, task_delta);
 
+done:
 	/*
 	 * Note: distribution will already see us throttled via the
 	 * throttled-list.  rq->lock protects completion.
@@ -5105,9 +5128,6 @@ static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b)
 		return;
 
 	distribute_cfs_runtime(cfs_b);
-
-	raw_spin_lock_irqsave(&cfs_b->lock, flags);
-	raw_spin_unlock_irqrestore(&cfs_b->lock, flags);
 }
 
 /*
@@ -5477,6 +5497,7 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
 	struct cfs_rq *cfs_rq;
 	struct sched_entity *se = &p->se;
 	int idle_h_nr_running = task_has_idle_policy(p);
+	int task_new = !(flags & ENQUEUE_WAKEUP);
 
 	/*
 	 * The code below (indirectly) updates schedutil which looks at
@@ -5549,7 +5570,7 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
 	 * into account, but that is not straightforward to implement,
 	 * and the following generally works well enough in practice.
 	 */
-	if (flags & ENQUEUE_WAKEUP)
+	if (!task_new)
 		update_overutilized_status(rq);
 
 enqueue_throttle:
@@ -5804,6 +5825,9 @@ wake_affine_idle(int this_cpu, int prev_cpu, int sync)
 	if (sync && cpu_rq(this_cpu)->nr_running == 1)
 		return this_cpu;
 
+	if (available_idle_cpu(prev_cpu))
+		return prev_cpu;
+
 	return nr_cpumask_bits;
 }
 
@@ -6062,10 +6086,11 @@ static int select_idle_core(struct task_struct *p, struct sched_domain *sd, int
 				break;
 			}
 		}
-		cpumask_andnot(cpus, cpus, cpu_smt_mask(core));
 
 		if (idle)
 			return core;
+
+		cpumask_andnot(cpus, cpus, cpu_smt_mask(core));
 	}
 
 	/*
@@ -6172,21 +6197,21 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, int t
 static int
 select_idle_capacity(struct task_struct *p, struct sched_domain *sd, int target)
 {
-	unsigned long best_cap = 0;
+	unsigned long task_util, best_cap = 0;
 	int cpu, best_cpu = -1;
 	struct cpumask *cpus;
 
-	sync_entity_load_avg(&p->se);
-
 	cpus = this_cpu_cpumask_var_ptr(select_idle_mask);
 	cpumask_and(cpus, sched_domain_span(sd), p->cpus_ptr);
 
+	task_util = uclamp_task_util(p);
+
 	for_each_cpu_wrap(cpu, cpus, target) {
 		unsigned long cpu_cap = capacity_of(cpu);
 
 		if (!available_idle_cpu(cpu) && !sched_idle_cpu(cpu))
 			continue;
-		if (task_fits_capacity(p, cpu_cap))
+		if (fits_capacity(task_util, cpu_cap))
 			return cpu;
 
 		if (cpu_cap > best_cap) {
@@ -6198,44 +6223,42 @@ select_idle_capacity(struct task_struct *p, struct sched_domain *sd, int target)
 	return best_cpu;
 }
 
+static inline bool asym_fits_capacity(int task_util, int cpu)
+{
+	if (static_branch_unlikely(&sched_asym_cpucapacity))
+		return fits_capacity(task_util, capacity_of(cpu));
+
+	return true;
+}
+
 /*
  * Try and locate an idle core/thread in the LLC cache domain.
  */
 static int select_idle_sibling(struct task_struct *p, int prev, int target)
 {
 	struct sched_domain *sd;
+	unsigned long task_util;
 	int i, recent_used_cpu;
 
 	/*
-	 * For asymmetric CPU capacity systems, our domain of interest is
-	 * sd_asym_cpucapacity rather than sd_llc.
+	 * On asymmetric systems, update the task utilization because we
+	 * will check that the task fits the CPU's capacity.
 	 */
 	if (static_branch_unlikely(&sched_asym_cpucapacity)) {
-		sd = rcu_dereference(per_cpu(sd_asym_cpucapacity, target));
-		/*
-		 * On an asymmetric CPU capacity system where an exclusive
-		 * cpuset defines a symmetric island (i.e. one unique
-		 * capacity_orig value through the cpuset), the key will be set
-		 * but the CPUs within that cpuset will not have a domain with
-		 * SD_ASYM_CPUCAPACITY. These should follow the usual symmetric
-		 * capacity path.
-		 */
-		if (!sd)
-			goto symmetric;
-
-		i = select_idle_capacity(p, sd, target);
-		return ((unsigned)i < nr_cpumask_bits) ? i : target;
+		sync_entity_load_avg(&p->se);
+		task_util = uclamp_task_util(p);
 	}
 
-symmetric:
-	if (available_idle_cpu(target) || sched_idle_cpu(target))
+	if ((available_idle_cpu(target) || sched_idle_cpu(target)) &&
+	    asym_fits_capacity(task_util, target))
 		return target;
 
 	/*
 	 * If the previous CPU is cache affine and idle, don't be stupid:
 	 */
 	if (prev != target && cpus_share_cache(prev, target) &&
-	    (available_idle_cpu(prev) || sched_idle_cpu(prev)))
+	    (available_idle_cpu(prev) || sched_idle_cpu(prev)) &&
+	    asym_fits_capacity(task_util, prev))
 		return prev;
 
 	/*
@@ -6258,7 +6281,8 @@ symmetric:
 	    recent_used_cpu != target &&
 	    cpus_share_cache(recent_used_cpu, target) &&
 	    (available_idle_cpu(recent_used_cpu) || sched_idle_cpu(recent_used_cpu)) &&
-	    cpumask_test_cpu(p->recent_used_cpu, p->cpus_ptr)) {
+	    cpumask_test_cpu(p->recent_used_cpu, p->cpus_ptr) &&
+	    asym_fits_capacity(task_util, recent_used_cpu)) {
 		/*
 		 * Replace recent_used_cpu with prev as it is a potential
 		 * candidate for the next wake:
@@ -6267,6 +6291,26 @@ symmetric:
 		return recent_used_cpu;
 	}
 
+	/*
+	 * For asymmetric CPU capacity systems, our domain of interest is
+	 * sd_asym_cpucapacity rather than sd_llc.
+	 */
+	if (static_branch_unlikely(&sched_asym_cpucapacity)) {
+		sd = rcu_dereference(per_cpu(sd_asym_cpucapacity, target));
+		/*
+		 * On an asymmetric CPU capacity system where an exclusive
+		 * cpuset defines a symmetric island (i.e. one unique
+		 * capacity_orig value through the cpuset), the key will be set
+		 * but the CPUs within that cpuset will not have a domain with
+		 * SD_ASYM_CPUCAPACITY. These should follow the usual symmetric
+		 * capacity path.
+		 */
+		if (sd) {
+			i = select_idle_capacity(p, sd, target);
+			return ((unsigned)i < nr_cpumask_bits) ? i : target;
+		}
+	}
+
 	sd = rcu_dereference(per_cpu(sd_llc, target));
 	if (!sd)
 		return target;
@@ -6287,7 +6331,7 @@ symmetric:
 }
 
 /**
- * Amount of capacity of a CPU that is (estimated to be) used by CFS tasks
+ * cpu_util - Estimates the amount of capacity of a CPU used by CFS tasks.
  * @cpu: the CPU to get the utilization of
  *
  * The unit of the return value must be the one of capacity so we can compare
@@ -6663,7 +6707,7 @@ fail:
 
 /*
  * select_task_rq_fair: Select target runqueue for the waking task in domains
- * that have the 'sd_flag' flag set. In practice, this is SD_BALANCE_WAKE,
+ * that have the relevant SD flag set. In practice, this is SD_BALANCE_WAKE,
  * SD_BALANCE_FORK, or SD_BALANCE_EXEC.
 
@@ -6674,15 +6718,17 @@ fail:
  * preempt must be disabled.
  */
 static int
-select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_flags)
+select_task_rq_fair(struct task_struct *p, int prev_cpu, int wake_flags)
 {
+	int sync = (wake_flags & WF_SYNC) && !(current->flags & PF_EXITING);
 	struct sched_domain *tmp, *sd = NULL;
 	int cpu = smp_processor_id();
 	int new_cpu = prev_cpu;
 	int want_affine = 0;
-	int sync = (wake_flags & WF_SYNC) && !(current->flags & PF_EXITING);
+	/* SD_flags and WF_flags share the first nibble */
+	int sd_flag = wake_flags & 0xF;
 
-	if (sd_flag & SD_BALANCE_WAKE) {
+	if (wake_flags & WF_TTWU) {
 		record_wakee(p);
 
 		if (sched_energy_enabled()) {
@@ -6719,9 +6765,8 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f
 	if (unlikely(sd)) {
 		/* Slow path */
 		new_cpu = find_idlest_cpu(sd, p, cpu, prev_cpu, sd_flag);
-	} else if (sd_flag & SD_BALANCE_WAKE) { /* XXX always ? */
+	} else if (wake_flags & WF_TTWU) { /* XXX always ? */
 		/* Fast path */
-
 		new_cpu = select_idle_sibling(p, prev_cpu, new_cpu);
 
 		if (want_affine)
@@ -8738,6 +8783,16 @@ static bool update_pick_idlest(struct sched_group *idlest,
 }
 
 /*
+ * Allow a NUMA imbalance if the busy CPUs are less than 25% of the
+ * domain. This is an approximation as the number of running tasks may
+ * not be related to the number of busy CPUs due to sched_setaffinity.
+ */
+static inline bool allow_numa_imbalance(int dst_running, int dst_weight)
+{
+	return (dst_running < (dst_weight >> 2));
+}
+
+/*
  * find_idlest_group() finds and returns the least busy CPU group within the
  * domain.
 
@@ -8755,9 +8810,6 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu)
 		.group_type = group_overloaded,
 	};
 
-	imbalance = scale_load_down(NICE_0_LOAD) *
-				(sd->imbalance_pct-100) / 100;
-
 	do {
 		int local_group;
 
@@ -8811,6 +8863,11 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu)
 	switch (local_sgs.group_type) {
 	case group_overloaded:
 	case group_fully_busy:
+
+		/* Calculate allowed imbalance based on load */
+		imbalance = scale_load_down(NICE_0_LOAD) *
+				(sd->imbalance_pct-100) / 100;
+
 		/*
 		 * When comparing groups across NUMA domains, it's possible for
 		 * the local domain to be very lightly loaded relative to the
@@ -8867,7 +8924,7 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu)
 		 * a real need of migration, periodic load balance will
 		 * take care of it.
 		 */
-		if (local_sgs.idle_cpus)
+		if (allow_numa_imbalance(local_sgs.sum_nr_running, sd->span_weight))
 			return NULL;
 	}
 
@@ -8969,16 +9026,19 @@ next_group:
 	}
 }
 
-static inline long adjust_numa_imbalance(int imbalance, int nr_running)
+#define NUMA_IMBALANCE_MIN 2
+
+static inline long adjust_numa_imbalance(int imbalance,
+				int dst_running, int dst_weight)
 {
-	unsigned int imbalance_min;
+	if (!allow_numa_imbalance(dst_running, dst_weight))
+		return imbalance;
 
 	/*
 	 * Allow a small imbalance based on a simple pair of communicating
-	 * tasks that remain local when the source domain is almost idle.
+	 * tasks that remain local when the destination is lightly loaded.
 	 */
-	imbalance_min = 2;
-	if (nr_running <= imbalance_min)
+	if (imbalance <= NUMA_IMBALANCE_MIN)
 		return 0;
 
 	return imbalance;
@@ -9031,7 +9091,8 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s
 	 * emptying busiest.
 	 */
 	if (local->group_type == group_has_spare) {
-		if (busiest->group_type > group_fully_busy) {
+		if ((busiest->group_type > group_fully_busy) &&
+		    !(env->sd->flags & SD_SHARE_PKG_RESOURCES)) {
 			/*
 			 * If busiest is overloaded, try to fill spare
 			 * capacity. This might end up creating spare capacity
@@ -9080,9 +9141,10 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s
 		}
 
 		/* Consider allowing a small imbalance between NUMA groups */
-		if (env->sd->flags & SD_NUMA)
+		if (env->sd->flags & SD_NUMA) {
 			env->imbalance = adjust_numa_imbalance(env->imbalance,
-				busiest->sum_nr_running);
+				busiest->sum_nr_running, busiest->group_weight);
+		}
 
 		return;
 	}
@@ -10047,6 +10109,10 @@ static inline int find_new_ilb(void)
 
 	for_each_cpu_and(ilb, nohz.idle_cpus_mask,
 			      housekeeping_cpumask(HK_FLAG_MISC)) {
+
+		if (ilb == smp_processor_id())
+			continue;
+
 		if (idle_cpu(ilb))
 			return ilb;
 	}
@@ -10484,7 +10550,7 @@ static inline void nohz_newidle_balance(struct rq *this_rq) { }
 #endif /* CONFIG_NO_HZ_COMMON */
 
 /*
- * idle_balance is called by schedule() if this_cpu is about to become
+ * newidle_balance is called by schedule() if this_cpu is about to become
  * idle. Attempts to pull tasks from other CPUs.
 
  * Returns:
@@ -11158,8 +11224,8 @@ static unsigned int get_rr_interval_fair(struct rq *rq, struct task_struct *task
 /*
  * All the scheduling class methods:
  */
-const struct sched_class fair_sched_class
-	__section("__fair_sched_class") = {
+DEFINE_SCHED_CLASS(fair) = {
+
 	.enqueue_task		= enqueue_task_fair,
 	.dequeue_task		= dequeue_task_fair,
 	.yield_task		= yield_task_fair,
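
Editor's note: the new NUMA-imbalance policy introduced by the hunks above can be read in isolation. The sketch below lifts allow_numa_imbalance() and adjust_numa_imbalance() out of the diff unchanged (the 25% busy-CPU threshold and the NUMA_IMBALANCE_MIN value of 2 come straight from the hunks); the userspace main() harness and its example numbers are only illustrative and are not part of the kernel change.

#include <stdio.h>

#define NUMA_IMBALANCE_MIN 2

/* Allow an imbalance while busy CPUs are under 25% of the destination domain. */
static inline int allow_numa_imbalance(int dst_running, int dst_weight)
{
        return dst_running < (dst_weight >> 2);
}

/* Ignore a small (<= 2 task) imbalance while the destination is lightly loaded. */
static inline long adjust_numa_imbalance(int imbalance,
                                         int dst_running, int dst_weight)
{
        if (!allow_numa_imbalance(dst_running, dst_weight))
                return imbalance;

        if (imbalance <= NUMA_IMBALANCE_MIN)
                return 0;

        return imbalance;
}

int main(void)
{
        /* 2 extra tasks on a mostly idle 64-CPU node: tolerated (prints 0). */
        printf("%ld\n", adjust_numa_imbalance(2, 4, 64));
        /* Same imbalance once 20 of the 64 CPUs are busy: reported (prints 2). */
        printf("%ld\n", adjust_numa_imbalance(2, 20, 64));
        return 0;
}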
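
Editor's note: the asym_fits_capacity() gate that the diff threads through select_idle_sibling() can also be modelled on its own. The sketch below is a simplified userspace rendering, not the kernel implementation: fits_capacity() reproduces the kernel's 1280/1024 (roughly 80%) margin check, the boolean sched_asym_cpucapacity stands in for the kernel's static key, the helper takes a CPU capacity directly instead of calling capacity_of(cpu), and the utilization/capacity numbers are made up for illustration.

#include <stdbool.h>
#include <stdio.h>

/* Stand-in for the kernel's sched_asym_cpucapacity static key. */
static bool sched_asym_cpucapacity = true;

/* Same 1280/1024 (~80%) margin as the kernel's fits_capacity() macro. */
static bool fits_capacity(unsigned long util, unsigned long cap)
{
        return util * 1280 < cap * 1024;
}

/*
 * Mirrors the asym_fits_capacity() helper added in the diff, except that
 * it takes the CPU's capacity directly instead of calling capacity_of().
 */
static bool asym_fits_capacity(unsigned long task_util, unsigned long cpu_cap)
{
        if (sched_asym_cpucapacity)
                return fits_capacity(task_util, cpu_cap);

        return true;
}

int main(void)
{
        unsigned long task_util = 300;                  /* illustrative utilization */
        unsigned long little_cap = 256, big_cap = 1024; /* illustrative capacities  */

        /* An idle little CPU is now skipped when the task does not fit (prints 0)... */
        printf("little: %d\n", asym_fits_capacity(task_util, little_cap));
        /* ...while an idle big CPU remains an acceptable wakeup target (prints 1). */
        printf("big: %d\n", asym_fits_capacity(task_util, big_cap));
        return 0;
}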