Diffstat (limited to 'kernel/sched/fair.c')
 -rw-r--r--  kernel/sched/fair.c | 187
 1 file changed, 140 insertions(+), 47 deletions(-)
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 908c9cdae2f0..ee271bb661cc 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -693,6 +693,7 @@ static u64 sched_vslice(struct cfs_rq *cfs_rq, struct sched_entity *se)
static int select_idle_sibling(struct task_struct *p, int prev_cpu, int cpu);
static unsigned long task_h_load(struct task_struct *p);
+static unsigned long capacity_of(int cpu);
/* Give new sched_entity start runnable values to heavy its load in infant time */
void init_entity_runnable_average(struct sched_entity *se)
@@ -1456,7 +1457,6 @@ bool should_numa_migrate_memory(struct task_struct *p, struct page * page,
static unsigned long weighted_cpuload(struct rq *rq);
static unsigned long source_load(int cpu, int type);
static unsigned long target_load(int cpu, int type);
-static unsigned long capacity_of(int cpu);
/* Cached statistics for all CPUs within a node */
struct numa_stats {
@@ -1464,8 +1464,6 @@ struct numa_stats {
/* Total compute capacity of CPUs on a node */
unsigned long compute_capacity;
-
- unsigned int nr_running;
};
/*
@@ -1473,36 +1471,16 @@ struct numa_stats {
*/
static void update_numa_stats(struct numa_stats *ns, int nid)
{
- int smt, cpu, cpus = 0;
- unsigned long capacity;
+ int cpu;
memset(ns, 0, sizeof(*ns));
for_each_cpu(cpu, cpumask_of_node(nid)) {
struct rq *rq = cpu_rq(cpu);
- ns->nr_running += rq->nr_running;
ns->load += weighted_cpuload(rq);
ns->compute_capacity += capacity_of(cpu);
-
- cpus++;
}
- /*
- * If we raced with hotplug and there are no CPUs left in our mask
- * the @ns structure is NULL'ed and task_numa_compare() will
- * not find this node attractive.
- *
- * We'll detect a huge imbalance and bail there.
- */
- if (!cpus)
- return;
-
- /* smt := ceil(cpus / capacity), assumes: 1 < smt_power < 2 */
- smt = DIV_ROUND_UP(SCHED_CAPACITY_SCALE * cpus, ns->compute_capacity);
- capacity = cpus / smt; /* cores */
-
- capacity = min_t(unsigned, capacity,
- DIV_ROUND_CLOSEST(ns->compute_capacity, SCHED_CAPACITY_SCALE));
}
struct task_numa_env {
@@ -3723,6 +3701,29 @@ util_est_dequeue(struct cfs_rq *cfs_rq, struct task_struct *p, bool task_sleep)
WRITE_ONCE(p->se.avg.util_est, ue);
}
+static inline int task_fits_capacity(struct task_struct *p, long capacity)
+{
+ return capacity * 1024 > task_util_est(p) * capacity_margin;
+}
+
+static inline void update_misfit_status(struct task_struct *p, struct rq *rq)
+{
+ if (!static_branch_unlikely(&sched_asym_cpucapacity))
+ return;
+
+ if (!p) {
+ rq->misfit_task_load = 0;
+ return;
+ }
+
+ if (task_fits_capacity(p, capacity_of(cpu_of(rq)))) {
+ rq->misfit_task_load = 0;
+ return;
+ }
+
+ rq->misfit_task_load = task_h_load(p);
+}
+
#else /* CONFIG_SMP */
#define UPDATE_TG 0x0
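The new task_fits_capacity() helper generalizes the margin test that wake_cap() used to open-code (see the wake_cap() hunk further down): assuming the default capacity_margin of 1280 (~20%, defined elsewhere in fair.c), a task fits a CPU only while its estimated utilization stays below roughly 80% of that CPU's capacity; otherwise update_misfit_status() records task_h_load(p) in rq->misfit_task_load for the balancer to see. A standalone sketch of the arithmetic, not part of the patch, with made-up capacity numbers:

    #include <stdio.h>

    /* Assumed default; the real variable is defined elsewhere in fair.c. */
    static const unsigned long capacity_margin = 1280;    /* ~20% headroom */

    /* Same inequality as task_fits_capacity(): capacity * 1024 > util * margin. */
    static int fits(unsigned long util_est, unsigned long capacity)
    {
        return capacity * 1024 > util_est * capacity_margin;
    }

    int main(void)
    {
        /* Illustrative values: a LITTLE CPU (capacity 446) vs. a big CPU (1024). */
        printf("util 400 on capacity  446: %s\n", fits(400, 446) ? "fits" : "misfit");
        printf("util 400 on capacity 1024: %s\n", fits(400, 1024) ? "fits" : "misfit");
        return 0;
    }

A utilization of 400 is a misfit on the 446-capacity CPU (400 * 1280 exceeds 446 * 1024) but fits comfortably on the 1024-capacity one.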
@@ -3752,6 +3753,7 @@ util_est_enqueue(struct cfs_rq *cfs_rq, struct task_struct *p) {}
static inline void
util_est_dequeue(struct cfs_rq *cfs_rq, struct task_struct *p,
bool task_sleep) {}
+static inline void update_misfit_status(struct task_struct *p, struct rq *rq) {}
#endif /* CONFIG_SMP */
@@ -6280,6 +6282,9 @@ static int wake_cap(struct task_struct *p, int cpu, int prev_cpu)
{
long min_cap, max_cap;
+ if (!static_branch_unlikely(&sched_asym_cpucapacity))
+ return 0;
+
min_cap = min(capacity_orig_of(prev_cpu), capacity_orig_of(cpu));
max_cap = cpu_rq(cpu)->rd->max_cpu_capacity;
@@ -6290,7 +6295,7 @@ static int wake_cap(struct task_struct *p, int cpu, int prev_cpu)
/* Bring task utilization in sync with prev_cpu */
sync_entity_load_avg(&p->se);
- return min_cap * 1024 < task_util(p) * capacity_margin;
+ return !task_fits_capacity(p, min_cap);
}
/*
@@ -6709,9 +6714,12 @@ done: __maybe_unused;
if (hrtick_enabled(rq))
hrtick_start_fair(rq, p);
+ update_misfit_status(p, rq);
+
return p;
idle:
+ update_misfit_status(NULL, rq);
new_tasks = idle_balance(rq, rf);
/*
@@ -6917,6 +6925,13 @@ static unsigned long __read_mostly max_load_balance_interval = HZ/10;
enum fbq_type { regular, remote, all };
+enum group_type {
+ group_other = 0,
+ group_misfit_task,
+ group_imbalanced,
+ group_overloaded,
+};
+
#define LBF_ALL_PINNED 0x01
#define LBF_NEED_BREAK 0x02
#define LBF_DST_PINNED 0x04
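The enum values are ordered by increasing severity, and update_sd_pick_busiest() below compares them numerically (sgs->group_type > busiest->group_type), so group_misfit_task outranks group_other but still yields to group_imbalanced and group_overloaded. A minimal sketch of that ordering, not part of the patch:

    #include <stdio.h>

    enum group_type {
        group_other = 0,
        group_misfit_task,
        group_imbalanced,
        group_overloaded,
    };

    /* Mirrors the "sgs->group_type > busiest->group_type" test. */
    static enum group_type busier(enum group_type a, enum group_type b)
    {
        return a > b ? a : b;
    }

    int main(void)
    {
        printf("misfit vs other:      %d\n", busier(group_misfit_task, group_other));      /* 1 */
        printf("misfit vs overloaded: %d\n", busier(group_misfit_task, group_overloaded)); /* 3 */
        return 0;
    }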
@@ -6947,6 +6962,7 @@ struct lb_env {
unsigned int loop_max;
enum fbq_type fbq_type;
+ enum group_type src_grp_type;
struct list_head tasks;
};
@@ -7327,7 +7343,7 @@ static inline bool others_have_blocked(struct rq *rq)
if (READ_ONCE(rq->avg_dl.util_avg))
return true;
-#if defined(CONFIG_IRQ_TIME_ACCOUNTING) || defined(CONFIG_PARAVIRT_TIME_ACCOUNTING)
+#ifdef CONFIG_HAVE_SCHED_AVG_IRQ
if (READ_ONCE(rq->avg_irq.util_avg))
return true;
#endif
@@ -7490,12 +7506,6 @@ static unsigned long task_h_load(struct task_struct *p)
/********** Helpers for find_busiest_group ************************/
-enum group_type {
- group_other = 0,
- group_imbalanced,
- group_overloaded,
-};
-
/*
* sg_lb_stats - stats of a sched_group required for load_balancing
*/
@@ -7511,6 +7521,7 @@ struct sg_lb_stats {
unsigned int group_weight;
enum group_type group_type;
int group_no_capacity;
+ unsigned long group_misfit_task_load; /* A CPU has a task too big for its capacity */
#ifdef CONFIG_NUMA_BALANCING
unsigned int nr_numa_running;
unsigned int nr_preferred_running;
@@ -7619,13 +7630,14 @@ static void update_cpu_capacity(struct sched_domain *sd, int cpu)
cpu_rq(cpu)->cpu_capacity = capacity;
sdg->sgc->capacity = capacity;
sdg->sgc->min_capacity = capacity;
+ sdg->sgc->max_capacity = capacity;
}
void update_group_capacity(struct sched_domain *sd, int cpu)
{
struct sched_domain *child = sd->child;
struct sched_group *group, *sdg = sd->groups;
- unsigned long capacity, min_capacity;
+ unsigned long capacity, min_capacity, max_capacity;
unsigned long interval;
interval = msecs_to_jiffies(sd->balance_interval);
@@ -7639,6 +7651,7 @@ void update_group_capacity(struct sched_domain *sd, int cpu)
capacity = 0;
min_capacity = ULONG_MAX;
+ max_capacity = 0;
if (child->flags & SD_OVERLAP) {
/*
@@ -7669,6 +7682,7 @@ void update_group_capacity(struct sched_domain *sd, int cpu)
}
min_capacity = min(capacity, min_capacity);
+ max_capacity = max(capacity, max_capacity);
}
} else {
/*
@@ -7682,12 +7696,14 @@ void update_group_capacity(struct sched_domain *sd, int cpu)
capacity += sgc->capacity;
min_capacity = min(sgc->min_capacity, min_capacity);
+ max_capacity = max(sgc->max_capacity, max_capacity);
group = group->next;
} while (group != child->groups);
}
sdg->sgc->capacity = capacity;
sdg->sgc->min_capacity = min_capacity;
+ sdg->sgc->max_capacity = max_capacity;
}
/*
@@ -7783,16 +7799,27 @@ group_is_overloaded(struct lb_env *env, struct sg_lb_stats *sgs)
}
/*
- * group_smaller_cpu_capacity: Returns true if sched_group sg has smaller
+ * group_smaller_min_cpu_capacity: Returns true if sched_group sg has smaller
* per-CPU capacity than sched_group ref.
*/
static inline bool
-group_smaller_cpu_capacity(struct sched_group *sg, struct sched_group *ref)
+group_smaller_min_cpu_capacity(struct sched_group *sg, struct sched_group *ref)
{
return sg->sgc->min_capacity * capacity_margin <
ref->sgc->min_capacity * 1024;
}
+/*
+ * group_smaller_max_cpu_capacity: Returns true if sched_group sg has smaller
+ * per-CPU capacity_orig than sched_group ref.
+ */
+static inline bool
+group_smaller_max_cpu_capacity(struct sched_group *sg, struct sched_group *ref)
+{
+ return sg->sgc->max_capacity * capacity_margin <
+ ref->sgc->max_capacity * 1024;
+}
+
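Both helpers scale by capacity_margin, so group_smaller_max_cpu_capacity(sg, ref) only reports true when sg's biggest CPU is more than ~20% smaller than ref's; that keeps misfit pulls going strictly uphill in capacity. A standalone sketch, not part of the patch, assuming the default margin of 1280 and big.LITTLE-style numbers:

    #include <stdio.h>

    static const unsigned long capacity_margin = 1280;    /* assumed default, ~20% */

    /* Same comparison as group_smaller_max_cpu_capacity(sg, ref). */
    static int sg_smaller_max(unsigned long sg_max, unsigned long ref_max)
    {
        return sg_max * capacity_margin < ref_max * 1024;
    }

    int main(void)
    {
        /* Hypothetical per-group max capacities: LITTLE 446, big 1024. */
        printf("LITTLE smaller than big: %d\n", sg_smaller_max(446, 1024));    /* 1 */
        printf("big smaller than LITTLE: %d\n", sg_smaller_max(1024, 446));    /* 0 */
        return 0;
    }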
static inline enum
group_type group_classify(struct sched_group *group,
struct sg_lb_stats *sgs)
@@ -7803,6 +7830,9 @@ group_type group_classify(struct sched_group *group,
if (sg_imbalanced(group))
return group_imbalanced;
+ if (sgs->group_misfit_task_load)
+ return group_misfit_task;
+
return group_other;
}
@@ -7835,7 +7865,7 @@ static bool update_nohz_stats(struct rq *rq, bool force)
* @load_idx: Load index of sched_domain of this_cpu for load calc.
* @local_group: Does group contain this_cpu.
* @sgs: variable to hold the statistics for this group.
- * @overload: Indicate more than one runnable task for any CPU.
+ * @overload: Indicate pullable load (e.g. >1 runnable task).
*/
static inline void update_sg_lb_stats(struct lb_env *env,
struct sched_group *group, int load_idx,
@@ -7877,6 +7907,12 @@ static inline void update_sg_lb_stats(struct lb_env *env,
*/
if (!nr_running && idle_cpu(i))
sgs->idle_cpus++;
+
+ if (env->sd->flags & SD_ASYM_CPUCAPACITY &&
+ sgs->group_misfit_task_load < rq->misfit_task_load) {
+ sgs->group_misfit_task_load = rq->misfit_task_load;
+ *overload = 1;
+ }
}
/* Adjust by relative CPU capacity of the group */
@@ -7912,6 +7948,17 @@ static bool update_sd_pick_busiest(struct lb_env *env,
{
struct sg_lb_stats *busiest = &sds->busiest_stat;
+ /*
+ * Don't try to pull misfit tasks we can't help.
+ * We can use max_capacity here as reduction in capacity on some
+ * CPUs in the group should either be possible to resolve
+ * internally or be covered by avg_load imbalance (eventually).
+ */
+ if (sgs->group_type == group_misfit_task &&
+ (!group_smaller_max_cpu_capacity(sg, sds->local) ||
+ !group_has_capacity(env, &sds->local_stat)))
+ return false;
+
if (sgs->group_type > busiest->group_type)
return true;
@@ -7931,7 +7978,14 @@ static bool update_sd_pick_busiest(struct lb_env *env,
* power/energy consequences are not considered.
*/
if (sgs->sum_nr_running <= sgs->group_weight &&
- group_smaller_cpu_capacity(sds->local, sg))
+ group_smaller_min_cpu_capacity(sds->local, sg))
+ return false;
+
+ /*
+ * If we have more than one misfit sg go with the biggest misfit.
+ */
+ if (sgs->group_type == group_misfit_task &&
+ sgs->group_misfit_task_load < busiest->group_misfit_task_load)
return false;
asym_packing:
@@ -8002,11 +8056,9 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd
struct sched_group *sg = env->sd->groups;
struct sg_lb_stats *local = &sds->local_stat;
struct sg_lb_stats tmp_sgs;
- int load_idx, prefer_sibling = 0;
+ int load_idx;
bool overload = false;
-
- if (child && child->flags & SD_PREFER_SIBLING)
- prefer_sibling = 1;
+ bool prefer_sibling = child && child->flags & SD_PREFER_SIBLING;
#ifdef CONFIG_NO_HZ_COMMON
if (env->idle == CPU_NEWLY_IDLE && READ_ONCE(nohz.has_blocked))
@@ -8080,8 +8132,8 @@ next_group:
if (!env->sd->parent) {
/* update overload indicator if we are at root domain */
- if (env->dst_rq->rd->overload != overload)
- env->dst_rq->rd->overload = overload;
+ if (READ_ONCE(env->dst_rq->rd->overload) != overload)
+ WRITE_ONCE(env->dst_rq->rd->overload, overload);
}
}
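rd->overload is updated here and read concurrently, without serialization, by idle_balance() (see the hunk near the end of this patch), so both sides move to READ_ONCE()/WRITE_ONCE() to make the lockless access explicit and keep the compiler from tearing or re-fetching the plain int. A user-space stand-in for the pattern, with simplified macro definitions (not the kernel's):

    #include <stdio.h>

    /* Simplified stand-ins for the kernel macros; GNU C typeof is assumed. */
    #define READ_ONCE(x)        (*(const volatile typeof(x) *)&(x))
    #define WRITE_ONCE(x, val)  do { *(volatile typeof(x) *)&(x) = (val); } while (0)

    static int overload;

    int main(void)
    {
        int now_overloaded = 1;    /* stand-in for what update_sd_lb_stats() computed */

        if (READ_ONCE(overload) != now_overloaded)
            WRITE_ONCE(overload, now_overloaded);

        printf("overload = %d\n", READ_ONCE(overload));
        return 0;
    }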
@@ -8231,8 +8283,9 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s
* factors in sg capacity and sgs with smaller group_type are
* skipped when updating the busiest sg:
*/
- if (busiest->avg_load <= sds->avg_load ||
- local->avg_load >= sds->avg_load) {
+ if (busiest->group_type != group_misfit_task &&
+ (busiest->avg_load <= sds->avg_load ||
+ local->avg_load >= sds->avg_load)) {
env->imbalance = 0;
return fix_small_imbalance(env, sds);
}
@@ -8266,6 +8319,12 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s
(sds->avg_load - local->avg_load) * local->group_capacity
) / SCHED_CAPACITY_SCALE;
+ /* Boost imbalance to allow misfit task to be balanced. */
+ if (busiest->group_type == group_misfit_task) {
+ env->imbalance = max_t(long, env->imbalance,
+ busiest->group_misfit_task_load);
+ }
+
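The boost makes env->imbalance at least as large as the misfit task's load; without it, a small avg_load-derived imbalance could make detach_tasks() stop before moving the one task that actually needs to leave. A toy illustration of the max_t() effect, not part of the patch, with invented numbers:

    #include <stdio.h>

    int main(void)
    {
        long imbalance = 75;       /* what the avg_load arithmetic produced */
        long misfit_load = 600;    /* task_h_load() of the stuck misfit task */

        /* The added max_t(long, ...) in calculate_imbalance(). */
        if (imbalance < misfit_load)
            imbalance = misfit_load;

        printf("imbalance handed to detach_tasks(): %ld\n", imbalance);
        return 0;
    }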
/*
* if *imbalance is less than the average load per runnable task
* there is no guarantee that any tasks will be moved so we'll have
@@ -8332,6 +8391,10 @@ static struct sched_group *find_busiest_group(struct lb_env *env)
busiest->group_no_capacity)
goto force_balance;
+ /* Misfit tasks should be dealt with regardless of the avg load */
+ if (busiest->group_type == group_misfit_task)
+ goto force_balance;
+
/*
* If the local group is busier than the selected busiest group
* don't try and pull any tasks.
@@ -8369,6 +8432,7 @@ static struct sched_group *find_busiest_group(struct lb_env *env)
force_balance:
/* Looks like there is an imbalance. Compute it */
+ env->src_grp_type = busiest->group_type;
calculate_imbalance(env, &sds);
return env->imbalance ? sds.busiest : NULL;
@@ -8416,8 +8480,32 @@ static struct rq *find_busiest_queue(struct lb_env *env,
if (rt > env->fbq_type)
continue;
+ /*
+ * For ASYM_CPUCAPACITY domains with misfit tasks we simply
+ * seek the "biggest" misfit task.
+ */
+ if (env->src_grp_type == group_misfit_task) {
+ if (rq->misfit_task_load > busiest_load) {
+ busiest_load = rq->misfit_task_load;
+ busiest = rq;
+ }
+
+ continue;
+ }
+
capacity = capacity_of(i);
+ /*
+ * For ASYM_CPUCAPACITY domains, don't pick a CPU that could
+ * eventually lead to active_balancing high->low capacity.
+ * Higher per-CPU capacity is considered better than balancing
+ * average load.
+ */
+ if (env->sd->flags & SD_ASYM_CPUCAPACITY &&
+ capacity_of(env->dst_cpu) < capacity &&
+ rq->nr_running == 1)
+ continue;
+
wl = weighted_cpuload(rq);
/*
@@ -8485,6 +8573,9 @@ static int need_active_balance(struct lb_env *env)
return 1;
}
+ if (env->src_grp_type == group_misfit_task)
+ return 1;
+
return unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2);
}
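These two hunks are what make the pull happen: when the source group was classified group_misfit_task, find_busiest_queue() skips the usual load/capacity heuristics and simply takes the runqueue with the largest misfit load, and need_active_balance() returns 1 so a currently running misfit task can still be moved by active balancing. A small sketch of that argmax scan, not part of the patch, over hypothetical per-CPU values:

    #include <stdio.h>

    struct fake_rq {
        int cpu;
        unsigned long misfit_task_load;
    };

    int main(void)
    {
        /* Hypothetical snapshot of the source group's runqueues. */
        struct fake_rq rqs[] = { {0, 0}, {1, 612}, {2, 430}, {3, 0} };
        struct fake_rq *busiest = NULL;
        unsigned long busiest_load = 0;

        for (int i = 0; i < 4; i++) {
            if (rqs[i].misfit_task_load > busiest_load) {
                busiest_load = rqs[i].misfit_task_load;
                busiest = &rqs[i];
            }
        }

        if (busiest)
            printf("pull from CPU%d (misfit load %lu)\n", busiest->cpu, busiest_load);
        return 0;
    }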
@@ -9127,7 +9218,7 @@ static void nohz_balancer_kick(struct rq *rq)
if (time_before(now, nohz.next_balance))
goto out;
- if (rq->nr_running >= 2) {
+ if (rq->nr_running >= 2 || rq->misfit_task_load) {
flags = NOHZ_KICK_MASK;
goto out;
}
@@ -9496,7 +9587,7 @@ static int idle_balance(struct rq *this_rq, struct rq_flags *rf)
rq_unpin_lock(this_rq, rf);
if (this_rq->avg_idle < sysctl_sched_migration_cost ||
- !this_rq->rd->overload) {
+ !READ_ONCE(this_rq->rd->overload)) {
rcu_read_lock();
sd = rcu_dereference_check_sched_domain(this_rq->sd);
@@ -9658,6 +9749,8 @@ static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued)
if (static_branch_unlikely(&sched_numa_balancing))
task_tick_numa(rq, curr);
+
+ update_misfit_status(curr, rq);
}
/*