summaryrefslogtreecommitdiff
path: root/kernel/sched/fair.c
diff options
context:
space:
mode:
Diffstat (limited to 'kernel/sched/fair.c')
-rw-r--r--kernel/sched/fair.c446
1 files changed, 351 insertions, 95 deletions
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index ac855b2f4774..6483834f1278 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -38,7 +38,7 @@
* (default: 6ms * (1 + ilog(ncpus)), units: nanoseconds)
*/
unsigned int sysctl_sched_latency = 6000000ULL;
-unsigned int normalized_sysctl_sched_latency = 6000000ULL;
+static unsigned int normalized_sysctl_sched_latency = 6000000ULL;
/*
* The initial- and re-scaling of tunables is configurable
@@ -58,8 +58,8 @@ enum sched_tunable_scaling sysctl_sched_tunable_scaling = SCHED_TUNABLESCALING_L
*
* (default: 0.75 msec * (1 + ilog(ncpus)), units: nanoseconds)
*/
-unsigned int sysctl_sched_min_granularity = 750000ULL;
-unsigned int normalized_sysctl_sched_min_granularity = 750000ULL;
+unsigned int sysctl_sched_min_granularity = 750000ULL;
+static unsigned int normalized_sysctl_sched_min_granularity = 750000ULL;
/*
* This value is kept at sysctl_sched_latency/sysctl_sched_min_granularity
@@ -81,8 +81,8 @@ unsigned int sysctl_sched_child_runs_first __read_mostly;
*
* (default: 1 msec * (1 + ilog(ncpus)), units: nanoseconds)
*/
-unsigned int sysctl_sched_wakeup_granularity = 1000000UL;
-unsigned int normalized_sysctl_sched_wakeup_granularity = 1000000UL;
+unsigned int sysctl_sched_wakeup_granularity = 1000000UL;
+static unsigned int normalized_sysctl_sched_wakeup_granularity = 1000000UL;
const_debug unsigned int sysctl_sched_migration_cost = 500000UL;
@@ -94,6 +94,14 @@ int __weak arch_asym_cpu_priority(int cpu)
{
return -cpu;
}
+
+/*
+ * The margin used when comparing utilization with CPU capacity:
+ * util * margin < capacity * 1024
+ *
+ * (default: ~20%)
+ */
+static unsigned int capacity_margin = 1280;
#endif
#ifdef CONFIG_CFS_BANDWIDTH
@@ -110,14 +118,6 @@ int __weak arch_asym_cpu_priority(int cpu)
unsigned int sysctl_sched_cfs_bandwidth_slice = 5000UL;
#endif
-/*
- * The margin used when comparing utilization with CPU capacity:
- * util * margin < capacity * 1024
- *
- * (default: ~20%)
- */
-unsigned int capacity_margin = 1280;
-
static inline void update_load_add(struct load_weight *lw, unsigned long inc)
{
lw->weight += inc;
@@ -352,10 +352,9 @@ static inline void list_del_leaf_cfs_rq(struct cfs_rq *cfs_rq)
}
}
-/* Iterate thr' all leaf cfs_rq's on a runqueue */
-#define for_each_leaf_cfs_rq_safe(rq, cfs_rq, pos) \
- list_for_each_entry_safe(cfs_rq, pos, &rq->leaf_cfs_rq_list, \
- leaf_cfs_rq_list)
+/* Iterate through all leaf cfs_rq's on a runqueue: */
+#define for_each_leaf_cfs_rq(rq, cfs_rq) \
+ list_for_each_entry_rcu(cfs_rq, &rq->leaf_cfs_rq_list, leaf_cfs_rq_list)
/* Do the two (enqueued) entities belong to the same group ? */
static inline struct cfs_rq *
@@ -447,8 +446,8 @@ static inline void list_del_leaf_cfs_rq(struct cfs_rq *cfs_rq)
{
}
-#define for_each_leaf_cfs_rq_safe(rq, cfs_rq, pos) \
- for (cfs_rq = &rq->cfs, pos = NULL; cfs_rq; cfs_rq = pos)
+#define for_each_leaf_cfs_rq(rq, cfs_rq) \
+ for (cfs_rq = &rq->cfs; cfs_rq; cfs_rq = NULL)
static inline struct sched_entity *parent_entity(struct sched_entity *se)
{
@@ -703,9 +702,9 @@ void init_entity_runnable_average(struct sched_entity *se)
memset(sa, 0, sizeof(*sa));
/*
- * Tasks are intialized with full load to be seen as heavy tasks until
+ * Tasks are initialized with full load to be seen as heavy tasks until
* they get a chance to stabilize to their real load level.
- * Group entities are intialized with zero load to reflect the fact that
+ * Group entities are initialized with zero load to reflect the fact that
* nothing has been attached to the task group yet.
*/
if (entity_is_task(se))
@@ -2734,6 +2733,17 @@ account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)
WRITE_ONCE(*ptr, res); \
} while (0)
+/*
+ * Remove and clamp on negative, from a local variable.
+ *
+ * A variant of sub_positive(), which does not use explicit load-store
+ * and is thus optimized for local variable updates.
+ */
+#define lsub_positive(_ptr, _val) do { \
+ typeof(_ptr) ptr = (_ptr); \
+ *ptr -= min_t(typeof(*ptr), *ptr, _val); \
+} while (0)
+
#ifdef CONFIG_SMP
static inline void
enqueue_runnable_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
@@ -3604,7 +3614,7 @@ static inline unsigned long _task_util_est(struct task_struct *p)
{
struct util_est ue = READ_ONCE(p->se.avg.util_est);
- return max(ue.ewma, ue.enqueued);
+ return (max(ue.ewma, ue.enqueued) | UTIL_AVG_UNCHANGED);
}
static inline unsigned long task_util_est(struct task_struct *p)
@@ -3622,7 +3632,7 @@ static inline void util_est_enqueue(struct cfs_rq *cfs_rq,
/* Update root cfs_rq's estimated utilization */
enqueued = cfs_rq->avg.util_est.enqueued;
- enqueued += (_task_util_est(p) | UTIL_AVG_UNCHANGED);
+ enqueued += _task_util_est(p);
WRITE_ONCE(cfs_rq->avg.util_est.enqueued, enqueued);
}
@@ -3650,8 +3660,7 @@ util_est_dequeue(struct cfs_rq *cfs_rq, struct task_struct *p, bool task_sleep)
/* Update root cfs_rq's estimated utilization */
ue.enqueued = cfs_rq->avg.util_est.enqueued;
- ue.enqueued -= min_t(unsigned int, ue.enqueued,
- (_task_util_est(p) | UTIL_AVG_UNCHANGED));
+ ue.enqueued -= min_t(unsigned int, ue.enqueued, _task_util_est(p));
WRITE_ONCE(cfs_rq->avg.util_est.enqueued, ue.enqueued);
/*
@@ -3966,8 +3975,8 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
/*
* When dequeuing a sched_entity, we must:
* - Update loads to have both entity and cfs_rq synced with now.
- * - Substract its load from the cfs_rq->runnable_avg.
- * - Substract its previous weight from cfs_rq->load.weight.
+ * - Subtract its load from the cfs_rq->runnable_avg.
+ * - Subtract its previous weight from cfs_rq->load.weight.
* - For group entity, update its weight to reflect the new share
* of its group cfs_rq.
*/
@@ -4640,7 +4649,7 @@ static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun)
cfs_b->distribute_running = 0;
throttled = !list_empty(&cfs_b->throttled_cfs_rq);
- cfs_b->runtime -= min(runtime, cfs_b->runtime);
+ lsub_positive(&cfs_b->runtime, runtime);
}
/*
@@ -4774,7 +4783,7 @@ static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b)
raw_spin_lock(&cfs_b->lock);
if (expires == cfs_b->runtime_expires)
- cfs_b->runtime -= min(runtime, cfs_b->runtime);
+ lsub_positive(&cfs_b->runtime, runtime);
cfs_b->distribute_running = 0;
raw_spin_unlock(&cfs_b->lock);
}
@@ -5072,6 +5081,24 @@ static inline void hrtick_update(struct rq *rq)
}
#endif
+#ifdef CONFIG_SMP
+static inline unsigned long cpu_util(int cpu);
+static unsigned long capacity_of(int cpu);
+
+static inline bool cpu_overutilized(int cpu)
+{
+ return (capacity_of(cpu) * 1024) < (cpu_util(cpu) * capacity_margin);
+}
+
+static inline void update_overutilized_status(struct rq *rq)
+{
+ if (!READ_ONCE(rq->rd->overutilized) && cpu_overutilized(rq->cpu))
+ WRITE_ONCE(rq->rd->overutilized, SG_OVERUTILIZED);
+}
+#else
+static inline void update_overutilized_status(struct rq *rq) { }
+#endif
+
/*
* The enqueue_task method is called before nr_running is
* increased. Here we update the fair scheduling stats and
@@ -5129,8 +5156,26 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
update_cfs_group(se);
}
- if (!se)
+ if (!se) {
add_nr_running(rq, 1);
+ /*
+ * Since new tasks are assigned an initial util_avg equal to
+ * half of the spare capacity of their CPU, tiny tasks have the
+ * ability to cross the overutilized threshold, which will
+ * result in the load balancer ruining all the task placement
+ * done by EAS. As a way to mitigate that effect, do not account
+ * for the first enqueue operation of new tasks during the
+ * overutilized flag detection.
+ *
+ * A better way of solving this problem would be to wait for
+ * the PELT signals of tasks to converge before taking them
+ * into account, but that is not straightforward to implement,
+ * and the following generally works well enough in practice.
+ */
+ if (flags & ENQUEUE_WAKEUP)
+ update_overutilized_status(rq);
+
+ }
hrtick_update(rq);
}
@@ -6241,7 +6286,7 @@ static unsigned long cpu_util_without(int cpu, struct task_struct *p)
util = READ_ONCE(cfs_rq->avg.util_avg);
/* Discount task's util from CPU's util */
- util -= min_t(unsigned int, util, task_util(p));
+ lsub_positive(&util, task_util(p));
/*
* Covered cases:
@@ -6290,10 +6335,9 @@ static unsigned long cpu_util_without(int cpu, struct task_struct *p)
* properly fix the execl regression and it helps in further
* reducing the chances for the above race.
*/
- if (unlikely(task_on_rq_queued(p) || current == p)) {
- estimated -= min_t(unsigned int, estimated,
- (_task_util_est(p) | UTIL_AVG_UNCHANGED));
- }
+ if (unlikely(task_on_rq_queued(p) || current == p))
+ lsub_positive(&estimated, _task_util_est(p));
+
util = max(util, estimated);
}
@@ -6333,6 +6377,213 @@ static int wake_cap(struct task_struct *p, int cpu, int prev_cpu)
}
/*
+ * Predicts what cpu_util(@cpu) would return if @p was migrated (and enqueued)
+ * to @dst_cpu.
+ */
+static unsigned long cpu_util_next(int cpu, struct task_struct *p, int dst_cpu)
+{
+ struct cfs_rq *cfs_rq = &cpu_rq(cpu)->cfs;
+ unsigned long util_est, util = READ_ONCE(cfs_rq->avg.util_avg);
+
+ /*
+ * If @p migrates from @cpu to another, remove its contribution. Or,
+ * if @p migrates from another CPU to @cpu, add its contribution. In
+ * the other cases, @cpu is not impacted by the migration, so the
+ * util_avg should already be correct.
+ */
+ if (task_cpu(p) == cpu && dst_cpu != cpu)
+ sub_positive(&util, task_util(p));
+ else if (task_cpu(p) != cpu && dst_cpu == cpu)
+ util += task_util(p);
+
+ if (sched_feat(UTIL_EST)) {
+ util_est = READ_ONCE(cfs_rq->avg.util_est.enqueued);
+
+ /*
+ * During wake-up, the task isn't enqueued yet and doesn't
+ * appear in the cfs_rq->avg.util_est.enqueued of any rq,
+ * so just add it (if needed) to "simulate" what will be
+ * cpu_util() after the task has been enqueued.
+ */
+ if (dst_cpu == cpu)
+ util_est += _task_util_est(p);
+
+ util = max(util, util_est);
+ }
+
+ return min(util, capacity_orig_of(cpu));
+}
+
+/*
+ * compute_energy(): Estimates the energy that would be consumed if @p was
+ * migrated to @dst_cpu. compute_energy() predicts what will be the utilization
+ * landscape of the * CPUs after the task migration, and uses the Energy Model
+ * to compute what would be the energy if we decided to actually migrate that
+ * task.
+ */
+static long
+compute_energy(struct task_struct *p, int dst_cpu, struct perf_domain *pd)
+{
+ long util, max_util, sum_util, energy = 0;
+ int cpu;
+
+ for (; pd; pd = pd->next) {
+ max_util = sum_util = 0;
+ /*
+ * The capacity state of CPUs of the current rd can be driven by
+ * CPUs of another rd if they belong to the same performance
+ * domain. So, account for the utilization of these CPUs too
+ * by masking pd with cpu_online_mask instead of the rd span.
+ *
+ * If an entire performance domain is outside of the current rd,
+ * it will not appear in its pd list and will not be accounted
+ * by compute_energy().
+ */
+ for_each_cpu_and(cpu, perf_domain_span(pd), cpu_online_mask) {
+ util = cpu_util_next(cpu, p, dst_cpu);
+ util = schedutil_energy_util(cpu, util);
+ max_util = max(util, max_util);
+ sum_util += util;
+ }
+
+ energy += em_pd_energy(pd->em_pd, max_util, sum_util);
+ }
+
+ return energy;
+}
+
+/*
+ * find_energy_efficient_cpu(): Find most energy-efficient target CPU for the
+ * waking task. find_energy_efficient_cpu() looks for the CPU with maximum
+ * spare capacity in each performance domain and uses it as a potential
+ * candidate to execute the task. Then, it uses the Energy Model to figure
+ * out which of the CPU candidates is the most energy-efficient.
+ *
+ * The rationale for this heuristic is as follows. In a performance domain,
+ * all the most energy efficient CPU candidates (according to the Energy
+ * Model) are those for which we'll request a low frequency. When there are
+ * several CPUs for which the frequency request will be the same, we don't
+ * have enough data to break the tie between them, because the Energy Model
+ * only includes active power costs. With this model, if we assume that
+ * frequency requests follow utilization (e.g. using schedutil), the CPU with
+ * the maximum spare capacity in a performance domain is guaranteed to be among
+ * the best candidates of the performance domain.
+ *
+ * In practice, it could be preferable from an energy standpoint to pack
+ * small tasks on a CPU in order to let other CPUs go in deeper idle states,
+ * but that could also hurt our chances to go cluster idle, and we have no
+ * ways to tell with the current Energy Model if this is actually a good
+ * idea or not. So, find_energy_efficient_cpu() basically favors
+ * cluster-packing, and spreading inside a cluster. That should at least be
+ * a good thing for latency, and this is consistent with the idea that most
+ * of the energy savings of EAS come from the asymmetry of the system, and
+ * not so much from breaking the tie between identical CPUs. That's also the
+ * reason why EAS is enabled in the topology code only for systems where
+ * SD_ASYM_CPUCAPACITY is set.
+ *
+ * NOTE: Forkees are not accepted in the energy-aware wake-up path because
+ * they don't have any useful utilization data yet and it's not possible to
+ * forecast their impact on energy consumption. Consequently, they will be
+ * placed by find_idlest_cpu() on the least loaded CPU, which might turn out
+ * to be energy-inefficient in some use-cases. The alternative would be to
+ * bias new tasks towards specific types of CPUs first, or to try to infer
+ * their util_avg from the parent task, but those heuristics could hurt
+ * other use-cases too. So, until someone finds a better way to solve this,
+ * let's keep things simple by re-using the existing slow path.
+ */
+
+static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu)
+{
+ unsigned long prev_energy = ULONG_MAX, best_energy = ULONG_MAX;
+ struct root_domain *rd = cpu_rq(smp_processor_id())->rd;
+ int cpu, best_energy_cpu = prev_cpu;
+ struct perf_domain *head, *pd;
+ unsigned long cpu_cap, util;
+ struct sched_domain *sd;
+
+ rcu_read_lock();
+ pd = rcu_dereference(rd->pd);
+ if (!pd || READ_ONCE(rd->overutilized))
+ goto fail;
+ head = pd;
+
+ /*
+ * Energy-aware wake-up happens on the lowest sched_domain starting
+ * from sd_asym_cpucapacity spanning over this_cpu and prev_cpu.
+ */
+ sd = rcu_dereference(*this_cpu_ptr(&sd_asym_cpucapacity));
+ while (sd && !cpumask_test_cpu(prev_cpu, sched_domain_span(sd)))
+ sd = sd->parent;
+ if (!sd)
+ goto fail;
+
+ sync_entity_load_avg(&p->se);
+ if (!task_util_est(p))
+ goto unlock;
+
+ for (; pd; pd = pd->next) {
+ unsigned long cur_energy, spare_cap, max_spare_cap = 0;
+ int max_spare_cap_cpu = -1;
+
+ for_each_cpu_and(cpu, perf_domain_span(pd), sched_domain_span(sd)) {
+ if (!cpumask_test_cpu(cpu, &p->cpus_allowed))
+ continue;
+
+ /* Skip CPUs that will be overutilized. */
+ util = cpu_util_next(cpu, p, cpu);
+ cpu_cap = capacity_of(cpu);
+ if (cpu_cap * 1024 < util * capacity_margin)
+ continue;
+
+ /* Always use prev_cpu as a candidate. */
+ if (cpu == prev_cpu) {
+ prev_energy = compute_energy(p, prev_cpu, head);
+ best_energy = min(best_energy, prev_energy);
+ continue;
+ }
+
+ /*
+ * Find the CPU with the maximum spare capacity in
+ * the performance domain
+ */
+ spare_cap = cpu_cap - util;
+ if (spare_cap > max_spare_cap) {
+ max_spare_cap = spare_cap;
+ max_spare_cap_cpu = cpu;
+ }
+ }
+
+ /* Evaluate the energy impact of using this CPU. */
+ if (max_spare_cap_cpu >= 0) {
+ cur_energy = compute_energy(p, max_spare_cap_cpu, head);
+ if (cur_energy < best_energy) {
+ best_energy = cur_energy;
+ best_energy_cpu = max_spare_cap_cpu;
+ }
+ }
+ }
+unlock:
+ rcu_read_unlock();
+
+ /*
+ * Pick the best CPU if prev_cpu cannot be used, or if it saves at
+ * least 6% of the energy used by prev_cpu.
+ */
+ if (prev_energy == ULONG_MAX)
+ return best_energy_cpu;
+
+ if ((prev_energy - best_energy) > (prev_energy >> 4))
+ return best_energy_cpu;
+
+ return prev_cpu;
+
+fail:
+ rcu_read_unlock();
+
+ return -1;
+}
+
+/*
* select_task_rq_fair: Select target runqueue for the waking task in domains
* that have the 'sd_flag' flag set. In practice, this is SD_BALANCE_WAKE,
* SD_BALANCE_FORK, or SD_BALANCE_EXEC.
@@ -6355,8 +6606,16 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f
if (sd_flag & SD_BALANCE_WAKE) {
record_wakee(p);
- want_affine = !wake_wide(p) && !wake_cap(p, cpu, prev_cpu)
- && cpumask_test_cpu(cpu, &p->cpus_allowed);
+
+ if (static_branch_unlikely(&sched_energy_present)) {
+ new_cpu = find_energy_efficient_cpu(p, prev_cpu);
+ if (new_cpu >= 0)
+ return new_cpu;
+ new_cpu = prev_cpu;
+ }
+
+ want_affine = !wake_wide(p) && !wake_cap(p, cpu, prev_cpu) &&
+ cpumask_test_cpu(cpu, &p->cpus_allowed);
}
rcu_read_lock();
@@ -6520,7 +6779,7 @@ wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se)
static void set_last_buddy(struct sched_entity *se)
{
- if (entity_is_task(se) && unlikely(task_of(se)->policy == SCHED_IDLE))
+ if (entity_is_task(se) && unlikely(task_has_idle_policy(task_of(se))))
return;
for_each_sched_entity(se) {
@@ -6532,7 +6791,7 @@ static void set_last_buddy(struct sched_entity *se)
static void set_next_buddy(struct sched_entity *se)
{
- if (entity_is_task(se) && unlikely(task_of(se)->policy == SCHED_IDLE))
+ if (entity_is_task(se) && unlikely(task_has_idle_policy(task_of(se))))
return;
for_each_sched_entity(se) {
@@ -6590,8 +6849,8 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_
return;
/* Idle tasks are by definition preempted by non-idle tasks. */
- if (unlikely(curr->policy == SCHED_IDLE) &&
- likely(p->policy != SCHED_IDLE))
+ if (unlikely(task_has_idle_policy(curr)) &&
+ likely(!task_has_idle_policy(p)))
goto preempt;
/*
@@ -7012,7 +7271,7 @@ static int task_hot(struct task_struct *p, struct lb_env *env)
if (p->sched_class != &fair_sched_class)
return 0;
- if (unlikely(p->policy == SCHED_IDLE))
+ if (unlikely(task_has_idle_policy(p)))
return 0;
/*
@@ -7387,27 +7646,10 @@ static inline bool others_have_blocked(struct rq *rq)
#ifdef CONFIG_FAIR_GROUP_SCHED
-static inline bool cfs_rq_is_decayed(struct cfs_rq *cfs_rq)
-{
- if (cfs_rq->load.weight)
- return false;
-
- if (cfs_rq->avg.load_sum)
- return false;
-
- if (cfs_rq->avg.util_sum)
- return false;
-
- if (cfs_rq->avg.runnable_load_sum)
- return false;
-
- return true;
-}
-
static void update_blocked_averages(int cpu)
{
struct rq *rq = cpu_rq(cpu);
- struct cfs_rq *cfs_rq, *pos;
+ struct cfs_rq *cfs_rq;
const struct sched_class *curr_class;
struct rq_flags rf;
bool done = true;
@@ -7419,7 +7661,7 @@ static void update_blocked_averages(int cpu)
* Iterates the task_group tree in a bottom up fashion, see
* list_add_leaf_cfs_rq() for details.
*/
- for_each_leaf_cfs_rq_safe(rq, cfs_rq, pos) {
+ for_each_leaf_cfs_rq(rq, cfs_rq) {
struct sched_entity *se;
/* throttled entities do not contribute to load */
@@ -7434,13 +7676,6 @@ static void update_blocked_averages(int cpu)
if (se && !skip_blocked_update(se))
update_load_avg(cfs_rq_of(se), se, 0);
- /*
- * There can be a lot of idle CPU cgroups. Don't let fully
- * decayed cfs_rqs linger on the list.
- */
- if (cfs_rq_is_decayed(cfs_rq))
- list_del_leaf_cfs_rq(cfs_rq);
-
/* Don't need periodic decay once load/util_avg are null */
if (cfs_rq_has_blocked(cfs_rq))
done = false;
@@ -7896,16 +8131,16 @@ static bool update_nohz_stats(struct rq *rq, bool force)
* update_sg_lb_stats - Update sched_group's statistics for load balancing.
* @env: The load balancing environment.
* @group: sched_group whose statistics are to be updated.
- * @load_idx: Load index of sched_domain of this_cpu for load calc.
- * @local_group: Does group contain this_cpu.
* @sgs: variable to hold the statistics for this group.
- * @overload: Indicate pullable load (e.g. >1 runnable task).
+ * @sg_status: Holds flag indicating the status of the sched_group
*/
static inline void update_sg_lb_stats(struct lb_env *env,
- struct sched_group *group, int load_idx,
- int local_group, struct sg_lb_stats *sgs,
- bool *overload)
+ struct sched_group *group,
+ struct sg_lb_stats *sgs,
+ int *sg_status)
{
+ int local_group = cpumask_test_cpu(env->dst_cpu, sched_group_span(group));
+ int load_idx = get_sd_load_idx(env->sd, env->idle);
unsigned long load;
int i, nr_running;
@@ -7929,7 +8164,10 @@ static inline void update_sg_lb_stats(struct lb_env *env,
nr_running = rq->nr_running;
if (nr_running > 1)
- *overload = true;
+ *sg_status |= SG_OVERLOAD;
+
+ if (cpu_overutilized(i))
+ *sg_status |= SG_OVERUTILIZED;
#ifdef CONFIG_NUMA_BALANCING
sgs->nr_numa_running += rq->nr_numa_running;
@@ -7945,7 +8183,7 @@ static inline void update_sg_lb_stats(struct lb_env *env,
if (env->sd->flags & SD_ASYM_CPUCAPACITY &&
sgs->group_misfit_task_load < rq->misfit_task_load) {
sgs->group_misfit_task_load = rq->misfit_task_load;
- *overload = 1;
+ *sg_status |= SG_OVERLOAD;
}
}
@@ -8090,17 +8328,14 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd
struct sched_group *sg = env->sd->groups;
struct sg_lb_stats *local = &sds->local_stat;
struct sg_lb_stats tmp_sgs;
- int load_idx;
- bool overload = false;
bool prefer_sibling = child && child->flags & SD_PREFER_SIBLING;
+ int sg_status = 0;
#ifdef CONFIG_NO_HZ_COMMON
if (env->idle == CPU_NEWLY_IDLE && READ_ONCE(nohz.has_blocked))
env->flags |= LBF_NOHZ_STATS;
#endif
- load_idx = get_sd_load_idx(env->sd, env->idle);
-
do {
struct sg_lb_stats *sgs = &tmp_sgs;
int local_group;
@@ -8115,8 +8350,7 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd
update_group_capacity(env->sd, env->dst_cpu);
}
- update_sg_lb_stats(env, sg, load_idx, local_group, sgs,
- &overload);
+ update_sg_lb_stats(env, sg, sgs, &sg_status);
if (local_group)
goto next_group;
@@ -8165,9 +8399,15 @@ next_group:
env->fbq_type = fbq_classify_group(&sds->busiest_stat);
if (!env->sd->parent) {
+ struct root_domain *rd = env->dst_rq->rd;
+
/* update overload indicator if we are at root domain */
- if (READ_ONCE(env->dst_rq->rd->overload) != overload)
- WRITE_ONCE(env->dst_rq->rd->overload, overload);
+ WRITE_ONCE(rd->overload, sg_status & SG_OVERLOAD);
+
+ /* Update over-utilization (tipping point, U >= 0) indicator */
+ WRITE_ONCE(rd->overutilized, sg_status & SG_OVERUTILIZED);
+ } else if (sg_status & SG_OVERUTILIZED) {
+ WRITE_ONCE(env->dst_rq->rd->overutilized, SG_OVERUTILIZED);
}
}
@@ -8394,6 +8634,14 @@ static struct sched_group *find_busiest_group(struct lb_env *env)
* this level.
*/
update_sd_lb_stats(env, &sds);
+
+ if (static_branch_unlikely(&sched_energy_present)) {
+ struct root_domain *rd = env->dst_rq->rd;
+
+ if (rcu_dereference(rd->pd) && !READ_ONCE(rd->overutilized))
+ goto out_balanced;
+ }
+
local = &sds.local_stat;
busiest = &sds.busiest_stat;
@@ -8910,13 +9158,22 @@ out_all_pinned:
sd->nr_balance_failed = 0;
out_one_pinned:
+ ld_moved = 0;
+
+ /*
+ * idle_balance() disregards balance intervals, so we could repeatedly
+ * reach this code, which would lead to balance_interval skyrocketting
+ * in a short amount of time. Skip the balance_interval increase logic
+ * to avoid that.
+ */
+ if (env.idle == CPU_NEWLY_IDLE)
+ goto out;
+
/* tune up the balancing interval */
- if (((env.flags & LBF_ALL_PINNED) &&
- sd->balance_interval < MAX_PINNED_INTERVAL) ||
- (sd->balance_interval < sd->max_interval))
+ if ((env.flags & LBF_ALL_PINNED &&
+ sd->balance_interval < MAX_PINNED_INTERVAL) ||
+ sd->balance_interval < sd->max_interval)
sd->balance_interval *= 2;
-
- ld_moved = 0;
out:
return ld_moved;
}
@@ -9281,7 +9538,7 @@ static void nohz_balancer_kick(struct rq *rq)
}
}
- sd = rcu_dereference(per_cpu(sd_asym, cpu));
+ sd = rcu_dereference(per_cpu(sd_asym_packing, cpu));
if (sd) {
for_each_cpu(i, sched_domain_span(sd)) {
if (i == cpu ||
@@ -9533,9 +9790,7 @@ static bool nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)
return false;
}
- /*
- * barrier, pairs with nohz_balance_enter_idle(), ensures ...
- */
+ /* could be _relaxed() */
flags = atomic_fetch_andnot(NOHZ_KICK_MASK, nohz_flags(this_cpu));
if (!(flags & NOHZ_KICK_MASK))
return false;
@@ -9785,6 +10040,7 @@ static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued)
task_tick_numa(rq, curr);
update_misfit_status(curr, rq);
+ update_overutilized_status(task_rq(curr));
}
/*
@@ -10289,10 +10545,10 @@ const struct sched_class fair_sched_class = {
#ifdef CONFIG_SCHED_DEBUG
void print_cfs_stats(struct seq_file *m, int cpu)
{
- struct cfs_rq *cfs_rq, *pos;
+ struct cfs_rq *cfs_rq;
rcu_read_lock();
- for_each_leaf_cfs_rq_safe(cpu_rq(cpu), cfs_rq, pos)
+ for_each_leaf_cfs_rq(cpu_rq(cpu), cfs_rq)
print_cfs_rq(m, cpu, cfs_rq);
rcu_read_unlock();
}