Diffstat (limited to 'kernel/sched')
-rw-r--r--  kernel/sched/completion.c  |   5
-rw-r--r--  kernel/sched/core.c        | 361
-rw-r--r--  kernel/sched/cpudeadline.h |   3
-rw-r--r--  kernel/sched/cpupri.h      |   3
-rw-r--r--  kernel/sched/deadline.c    | 142
-rw-r--r--  kernel/sched/debug.c       |  11
-rw-r--r--  kernel/sched/fair.c        | 389
-rw-r--r--  kernel/sched/idle_task.c   |   5
-rw-r--r--  kernel/sched/rt.c          |  19
-rw-r--r--  kernel/sched/sched.h       |  45
-rw-r--r--  kernel/sched/stop_task.c   |   5
-rw-r--r--  kernel/sched/wait.c        |  66
12 files changed, 813 insertions, 241 deletions
diff --git a/kernel/sched/completion.c b/kernel/sched/completion.c
index a63f4dc27909..607f852b4d04 100644
--- a/kernel/sched/completion.c
+++ b/kernel/sched/completion.c
@@ -148,7 +148,7 @@ EXPORT_SYMBOL(wait_for_completion_timeout);
*
* This waits to be signaled for completion of a specific task. It is NOT
* interruptible and there is no timeout. The caller is accounted as waiting
- * for IO.
+ * for IO (which traditionally means blkio only).
*/
void __sched wait_for_completion_io(struct completion *x)
{
@@ -163,7 +163,8 @@ EXPORT_SYMBOL(wait_for_completion_io);
*
* This waits for either a completion of a specific task to be signaled or for a
* specified timeout to expire. The timeout is in jiffies. It is not
- * interruptible. The caller is accounted as waiting for IO.
+ * interruptible. The caller is accounted as waiting for IO (which traditionally
+ * means blkio only).
*
* Return: 0 if timed out, and positive (at least 1, or number of jiffies left
* till timeout) if completed.
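
For context, a minimal usage sketch of the IO-accounted wait documented above; the driver function, completion object, and timeout value are all hypothetical, not part of this diff:

#include <linux/completion.h>
#include <linux/errno.h>
#include <linux/jiffies.h>

/* Wait up to 100ms for a (hypothetical) DMA done event; the sleep is
 * charged to iowait and the wait is not interruptible. */
static int example_wait_for_dma(struct completion *done)
{
	unsigned long left;

	left = wait_for_completion_io_timeout(done, msecs_to_jiffies(100));
	if (!left)
		return -ETIMEDOUT;	/* 0 means the timeout expired */

	return 0;			/* >0 means completed, jiffies left */
}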
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 44999505e1bf..bb398c0c5f08 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -1008,6 +1008,9 @@ inline int task_curr(const struct task_struct *p)
return cpu_curr(task_cpu(p)) == p;
}
+/*
+ * We may drop rq->lock here, because sched_class::switched_from() methods may drop it.
+ */
static inline void check_class_changed(struct rq *rq, struct task_struct *p,
const struct sched_class *prev_class,
int oldprio)
@@ -1015,6 +1018,7 @@ static inline void check_class_changed(struct rq *rq, struct task_struct *p,
if (prev_class != p->sched_class) {
if (prev_class->switched_from)
prev_class->switched_from(rq, p);
+ /* Possible rq->lock 'hole'. */
p->sched_class->switched_to(rq, p);
} else if (oldprio != p->prio || dl_task(p))
p->sched_class->prio_changed(rq, p, oldprio);
@@ -1054,7 +1058,7 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
* ttwu() will sort out the placement.
*/
WARN_ON_ONCE(p->state != TASK_RUNNING && p->state != TASK_WAKING &&
- !(task_preempt_count(p) & PREEMPT_ACTIVE));
+ !p->on_rq);
#ifdef CONFIG_LOCKDEP
/*
@@ -1407,7 +1411,8 @@ out:
static inline
int select_task_rq(struct task_struct *p, int cpu, int sd_flags, int wake_flags)
{
- cpu = p->sched_class->select_task_rq(p, cpu, sd_flags, wake_flags);
+ if (p->nr_cpus_allowed > 1)
+ cpu = p->sched_class->select_task_rq(p, cpu, sd_flags, wake_flags);
/*
* In order not to call set_task_cpu() on a blocking task we need
@@ -1623,8 +1628,10 @@ void wake_up_if_idle(int cpu)
struct rq *rq = cpu_rq(cpu);
unsigned long flags;
- if (!is_idle_task(rq->curr))
- return;
+ rcu_read_lock();
+
+ if (!is_idle_task(rcu_dereference(rq->curr)))
+ goto out;
if (set_nr_if_polling(rq->idle)) {
trace_sched_wake_idle_without_ipi(cpu);
@@ -1635,6 +1642,9 @@ void wake_up_if_idle(int cpu)
/* Else cpu is not in idle, do nothing here */
raw_spin_unlock_irqrestore(&rq->lock, flags);
}
+
+out:
+ rcu_read_unlock();
}
bool cpus_share_cache(int this_cpu, int that_cpu)
@@ -1853,12 +1863,10 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p)
p->numa_scan_seq = p->mm ? p->mm->numa_scan_seq : 0;
p->numa_scan_period = sysctl_numa_balancing_scan_delay;
p->numa_work.next = &p->numa_work;
- p->numa_faults_memory = NULL;
- p->numa_faults_buffer_memory = NULL;
+ p->numa_faults = NULL;
p->last_task_numa_placement = 0;
p->last_sum_exec_runtime = 0;
- INIT_LIST_HEAD(&p->numa_entry);
p->numa_group = NULL;
#endif /* CONFIG_NUMA_BALANCING */
}
@@ -2034,25 +2042,6 @@ static inline int dl_bw_cpus(int i)
}
#endif
-static inline
-void __dl_clear(struct dl_bw *dl_b, u64 tsk_bw)
-{
- dl_b->total_bw -= tsk_bw;
-}
-
-static inline
-void __dl_add(struct dl_bw *dl_b, u64 tsk_bw)
-{
- dl_b->total_bw += tsk_bw;
-}
-
-static inline
-bool __dl_overflow(struct dl_bw *dl_b, int cpus, u64 old_bw, u64 new_bw)
-{
- return dl_b->bw != -1 &&
- dl_b->bw * cpus < dl_b->total_bw - old_bw + new_bw;
-}
-
/*
* We must be sure that accepting a new task (or allowing changing the
* parameters of an existing one) is consistent with the bandwidth
@@ -2220,7 +2209,6 @@ prepare_task_switch(struct rq *rq, struct task_struct *prev,
/**
* finish_task_switch - clean up after a task-switch
- * @rq: runqueue associated with task-switch
* @prev: the thread we just switched away from.
*
* finish_task_switch must be called after the context switch, paired
@@ -2232,10 +2220,16 @@ prepare_task_switch(struct rq *rq, struct task_struct *prev,
* so, we finish that here outside of the runqueue lock. (Doing it
* with the lock held can cause deadlocks; see schedule() for
* details.)
+ *
+ * The context switch has flipped the stack from under us and restored the
+ * local variables which were saved when this task called schedule() in the
+ * past. prev == current is still correct, but we need to recalculate this_rq
+ * because prev may have moved to another CPU.
*/
-static void finish_task_switch(struct rq *rq, struct task_struct *prev)
+static struct rq *finish_task_switch(struct task_struct *prev)
__releases(rq->lock)
{
+ struct rq *rq = this_rq();
struct mm_struct *mm = rq->prev_mm;
long prev_state;
@@ -2275,6 +2269,7 @@ static void finish_task_switch(struct rq *rq, struct task_struct *prev)
}
tick_nohz_task_switch(current);
+ return rq;
}
#ifdef CONFIG_SMP
@@ -2309,25 +2304,22 @@ static inline void post_schedule(struct rq *rq)
asmlinkage __visible void schedule_tail(struct task_struct *prev)
__releases(rq->lock)
{
- struct rq *rq = this_rq();
-
- finish_task_switch(rq, prev);
+ struct rq *rq;
- /*
- * FIXME: do we need to worry about rq being invalidated by the
- * task_switch?
- */
+ /* finish_task_switch() drops rq->lock and enables preemption */
+ preempt_disable();
+ rq = finish_task_switch(prev);
post_schedule(rq);
+ preempt_enable();
if (current->set_child_tid)
put_user(task_pid_vnr(current), current->set_child_tid);
}
/*
- * context_switch - switch to the new MM and the new
- * thread's register state.
+ * context_switch - switch to the new MM and the new thread's register state.
*/
-static inline void
+static inline struct rq *
context_switch(struct rq *rq, struct task_struct *prev,
struct task_struct *next)
{
@@ -2366,14 +2358,9 @@ context_switch(struct rq *rq, struct task_struct *prev,
context_tracking_task_switch(prev, next);
/* Here we just switch the register state and the stack. */
switch_to(prev, next, prev);
-
barrier();
- /*
- * this_rq must be evaluated again because prev may have moved
- * CPUs since it called schedule(), thus the 'rq' on its stack
- * frame will be invalid.
- */
- finish_task_switch(this_rq(), prev);
+
+ return finish_task_switch(prev);
}
/*
@@ -2475,44 +2462,6 @@ EXPORT_PER_CPU_SYMBOL(kstat);
EXPORT_PER_CPU_SYMBOL(kernel_cpustat);
/*
- * Return any ns on the sched_clock that have not yet been accounted in
- * @p in case that task is currently running.
- *
- * Called with task_rq_lock() held on @rq.
- */
-static u64 do_task_delta_exec(struct task_struct *p, struct rq *rq)
-{
- u64 ns = 0;
-
- /*
- * Must be ->curr _and_ ->on_rq. If dequeued, we would
- * project cycles that may never be accounted to this
- * thread, breaking clock_gettime().
- */
- if (task_current(rq, p) && task_on_rq_queued(p)) {
- update_rq_clock(rq);
- ns = rq_clock_task(rq) - p->se.exec_start;
- if ((s64)ns < 0)
- ns = 0;
- }
-
- return ns;
-}
-
-unsigned long long task_delta_exec(struct task_struct *p)
-{
- unsigned long flags;
- struct rq *rq;
- u64 ns = 0;
-
- rq = task_rq_lock(p, &flags);
- ns = do_task_delta_exec(p, rq);
- task_rq_unlock(rq, p, &flags);
-
- return ns;
-}
-
-/*
* Return accounted runtime for the task.
* In case the task is currently running, return the runtime plus current's
* pending runtime that have not been accounted yet.
@@ -2521,7 +2470,7 @@ unsigned long long task_sched_runtime(struct task_struct *p)
{
unsigned long flags;
struct rq *rq;
- u64 ns = 0;
+ u64 ns;
#if defined(CONFIG_64BIT) && defined(CONFIG_SMP)
/*
@@ -2540,7 +2489,16 @@ unsigned long long task_sched_runtime(struct task_struct *p)
#endif
rq = task_rq_lock(p, &flags);
- ns = p->se.sum_exec_runtime + do_task_delta_exec(p, rq);
+ /*
+ * Must be ->curr _and_ ->on_rq. If dequeued, we would
+ * project cycles that may never be accounted to this
+ * thread, breaking clock_gettime().
+ */
+ if (task_current(rq, p) && task_on_rq_queued(p)) {
+ update_rq_clock(rq);
+ p->sched_class->update_curr(rq);
+ }
+ ns = p->se.sum_exec_runtime;
task_rq_unlock(rq, p, &flags);
return ns;
@@ -2802,7 +2760,7 @@ need_resched:
preempt_disable();
cpu = smp_processor_id();
rq = cpu_rq(cpu);
- rcu_note_context_switch(cpu);
+ rcu_note_context_switch();
prev = rq->curr;
schedule_debug(prev);
@@ -2855,15 +2813,8 @@ need_resched:
rq->curr = next;
++*switch_count;
- context_switch(rq, prev, next); /* unlocks the rq */
- /*
- * The context switch have flipped the stack from under us
- * and restored the local variables which were saved when
- * this task called schedule() in the past. prev == current
- * is still correct, but it can be moved to another cpu/rq.
- */
- cpu = smp_processor_id();
- rq = cpu_rq(cpu);
+ rq = context_switch(rq, prev, next); /* unlocks the rq */
+ cpu = cpu_of(rq);
} else
raw_spin_unlock_irq(&rq->lock);
@@ -2903,10 +2854,14 @@ asmlinkage __visible void __sched schedule_user(void)
* or we have been woken up remotely but the IPI has not yet arrived,
* we haven't yet exited the RCU idle mode. Do it here manually until
* we find a better solution.
+ *
+ * NB: There are buggy callers of this function. Ideally we
+ * should warn if prev_state != IN_USER, but that will trigger
+ * too frequently to make sense yet.
*/
- user_exit();
+ enum ctx_state prev_state = exception_enter();
schedule();
- user_enter();
+ exception_exit(prev_state);
}
#endif
@@ -2951,6 +2906,47 @@ asmlinkage __visible void __sched notrace preempt_schedule(void)
}
NOKPROBE_SYMBOL(preempt_schedule);
EXPORT_SYMBOL(preempt_schedule);
+
+#ifdef CONFIG_CONTEXT_TRACKING
+/**
+ * preempt_schedule_context - preempt_schedule called by tracing
+ *
+ * The tracing infrastructure uses preempt_enable_notrace to prevent
+ * recursion and tracing preempt enabling caused by the tracing
+ * infrastructure itself. But as tracing can happen in areas coming
+ * from userspace or just about to enter userspace, a preempt enable
+ * can occur before user_exit() is called. This will cause the scheduler
+ * to be called when the system is still in usermode.
+ *
+ * To prevent this, the preempt_enable_notrace will use this function
+ * instead of preempt_schedule() to exit user context if needed before
+ * calling the scheduler.
+ */
+asmlinkage __visible void __sched notrace preempt_schedule_context(void)
+{
+ enum ctx_state prev_ctx;
+
+ if (likely(!preemptible()))
+ return;
+
+ do {
+ __preempt_count_add(PREEMPT_ACTIVE);
+ /*
+ * Needs preempt disabled in case user_exit() is traced
+ * and the tracer calls preempt_enable_notrace() causing
+ * an infinite recursion.
+ */
+ prev_ctx = exception_enter();
+ __schedule();
+ exception_exit(prev_ctx);
+
+ __preempt_count_sub(PREEMPT_ACTIVE);
+ barrier();
+ } while (need_resched());
+}
+EXPORT_SYMBOL_GPL(preempt_schedule_context);
+#endif /* CONFIG_CONTEXT_TRACKING */
+
#endif /* CONFIG_PREEMPT */
/*
@@ -4637,6 +4633,81 @@ void init_idle(struct task_struct *idle, int cpu)
#endif
}
+int cpuset_cpumask_can_shrink(const struct cpumask *cur,
+ const struct cpumask *trial)
+{
+ int ret = 1, trial_cpus;
+ struct dl_bw *cur_dl_b;
+ unsigned long flags;
+
+ rcu_read_lock_sched();
+ cur_dl_b = dl_bw_of(cpumask_any(cur));
+ trial_cpus = cpumask_weight(trial);
+
+ raw_spin_lock_irqsave(&cur_dl_b->lock, flags);
+ if (cur_dl_b->bw != -1 &&
+ cur_dl_b->bw * trial_cpus < cur_dl_b->total_bw)
+ ret = 0;
+ raw_spin_unlock_irqrestore(&cur_dl_b->lock, flags);
+ rcu_read_unlock_sched();
+
+ return ret;
+}
+
+int task_can_attach(struct task_struct *p,
+ const struct cpumask *cs_cpus_allowed)
+{
+ int ret = 0;
+
+ /*
+ * Kthreads which disallow setaffinity shouldn't be moved
+ * to a new cpuset; we don't want to change their cpu
+ * affinity and isolating such threads by their set of
+ * allowed nodes is unnecessary. Thus, cpusets are not
+ * applicable for such threads. This prevents checking for
+ * success of set_cpus_allowed_ptr() on all attached tasks
+ * before cpus_allowed may be changed.
+ */
+ if (p->flags & PF_NO_SETAFFINITY) {
+ ret = -EINVAL;
+ goto out;
+ }
+
+#ifdef CONFIG_SMP
+ if (dl_task(p) && !cpumask_intersects(task_rq(p)->rd->span,
+ cs_cpus_allowed)) {
+ unsigned int dest_cpu = cpumask_any_and(cpu_active_mask,
+ cs_cpus_allowed);
+ struct dl_bw *dl_b;
+ bool overflow;
+ int cpus;
+ unsigned long flags;
+
+ rcu_read_lock_sched();
+ dl_b = dl_bw_of(dest_cpu);
+ raw_spin_lock_irqsave(&dl_b->lock, flags);
+ cpus = dl_bw_cpus(dest_cpu);
+ overflow = __dl_overflow(dl_b, cpus, 0, p->dl.dl_bw);
+ if (overflow)
+ ret = -EBUSY;
+ else {
+ /*
+ * We reserve space for this task in the destination
+ * root_domain, as we can't fail after this point.
+ * We will free resources in the source root_domain
+ * later on (see set_cpus_allowed_dl()).
+ */
+ __dl_add(dl_b, p->dl.dl_bw);
+ }
+ raw_spin_unlock_irqrestore(&dl_b->lock, flags);
+ rcu_read_unlock_sched();
+
+ }
+#endif
+out:
+ return ret;
+}
+
#ifdef CONFIG_SMP
/*
* move_queued_task - move a queued task to new rq.
@@ -6087,7 +6158,9 @@ static void claim_allocations(int cpu, struct sched_domain *sd)
#ifdef CONFIG_NUMA
static int sched_domains_numa_levels;
+enum numa_topology_type sched_numa_topology_type;
static int *sched_domains_numa_distance;
+int sched_max_numa_distance;
static struct cpumask ***sched_domains_numa_masks;
static int sched_domains_curr_level;
#endif
@@ -6259,7 +6332,7 @@ static void sched_numa_warn(const char *str)
printk(KERN_WARNING "\n");
}
-static bool find_numa_distance(int distance)
+bool find_numa_distance(int distance)
{
int i;
@@ -6274,6 +6347,56 @@ static bool find_numa_distance(int distance)
return false;
}
+/*
+ * A system can have three types of NUMA topology:
+ * NUMA_DIRECT: all nodes are directly connected, or not a NUMA system
+ * NUMA_GLUELESS_MESH: some nodes reachable through intermediary nodes
+ * NUMA_BACKPLANE: nodes can reach other nodes through a backplane
+ *
+ * The difference between a glueless mesh topology and a backplane
+ * topology lies in whether communication between not directly
+ * connected nodes goes through intermediary nodes (where programs
+ * could run), or through backplane controllers. This affects
+ * placement of programs.
+ *
+ * The type of topology can be discerned with the following tests:
+ * - If the maximum distance between any nodes is 1 hop, the system
+ * is directly connected.
+ * - If for two nodes A and B, located N > 1 hops away from each other,
+ * there is an intermediary node C, which is < N hops away from both
+ * nodes A and B, the system is a glueless mesh.
+ */
+static void init_numa_topology_type(void)
+{
+ int a, b, c, n;
+
+ n = sched_max_numa_distance;
+
+ if (n <= 1)
+ sched_numa_topology_type = NUMA_DIRECT;
+
+ for_each_online_node(a) {
+ for_each_online_node(b) {
+ /* Find two nodes furthest removed from each other. */
+ if (node_distance(a, b) < n)
+ continue;
+
+ /* Is there an intermediary node between a and b? */
+ for_each_online_node(c) {
+ if (node_distance(a, c) < n &&
+ node_distance(b, c) < n) {
+ sched_numa_topology_type =
+ NUMA_GLUELESS_MESH;
+ return;
+ }
+ }
+
+ sched_numa_topology_type = NUMA_BACKPLANE;
+ return;
+ }
+ }
+}
+
static void sched_init_numa(void)
{
int next_distance, curr_distance = node_distance(0, 0);
@@ -6327,6 +6450,10 @@ static void sched_init_numa(void)
if (!sched_debug())
break;
}
+
+ if (!level)
+ return;
+
/*
* 'level' contains the number of unique distances, excluding the
* identity distance node_distance(i,i).
@@ -6406,6 +6533,9 @@ static void sched_init_numa(void)
sched_domain_topology = tl;
sched_domains_numa_levels = level;
+ sched_max_numa_distance = sched_domains_numa_distance[level - 1];
+
+ init_numa_topology_type();
}
static void sched_domains_numa_masks_set(int cpu)
@@ -7158,6 +7288,25 @@ static inline int preempt_count_equals(int preempt_offset)
void __might_sleep(const char *file, int line, int preempt_offset)
{
+ /*
+ * Blocking primitives will set (and therefore destroy) current->state.
+ * Since we will exit with TASK_RUNNING, make sure we enter with it,
+ * otherwise we would destroy the state the caller has set.
+ */
+ if (WARN_ONCE(current->state != TASK_RUNNING,
+ "do not call blocking ops when !TASK_RUNNING; "
+ "state=%lx set at [<%p>] %pS\n",
+ current->state,
+ (void *)current->task_state_change,
+ (void *)current->task_state_change))
+ __set_current_state(TASK_RUNNING);
+
+ ___might_sleep(file, line, preempt_offset);
+}
+EXPORT_SYMBOL(__might_sleep);
+
+void ___might_sleep(const char *file, int line, int preempt_offset)
+{
static unsigned long prev_jiffy; /* ratelimiting */
rcu_sleep_check(); /* WARN_ON_ONCE() by default, no rate limit reqd. */
@@ -7189,7 +7338,7 @@ void __might_sleep(const char *file, int line, int preempt_offset)
#endif
dump_stack();
}
-EXPORT_SYMBOL(__might_sleep);
+EXPORT_SYMBOL(___might_sleep);
#endif
#ifdef CONFIG_MAGIC_SYSRQ
@@ -7403,8 +7552,12 @@ void sched_move_task(struct task_struct *tsk)
if (unlikely(running))
put_prev_task(rq, tsk);
- tg = container_of(task_css_check(tsk, cpu_cgrp_id,
- lockdep_is_held(&tsk->sighand->siglock)),
+ /*
+ * All callers are synchronized by task_rq_lock(); we do not use RCU,
+ * which would be pointless here. Thus, we pass "true" to task_css_check()
+ * to prevent lockdep warnings.
+ */
+ tg = container_of(task_css_check(tsk, cpu_cgrp_id, true),
struct task_group, css);
tg = autogroup_task_group(tsk, tg);
tsk->sched_task_group = tg;
@@ -7833,6 +7986,11 @@ static void cpu_cgroup_css_offline(struct cgroup_subsys_state *css)
sched_offline_group(tg);
}
+static void cpu_cgroup_fork(struct task_struct *task)
+{
+ sched_move_task(task);
+}
+
static int cpu_cgroup_can_attach(struct cgroup_subsys_state *css,
struct cgroup_taskset *tset)
{
@@ -8205,6 +8363,7 @@ struct cgroup_subsys cpu_cgrp_subsys = {
.css_free = cpu_cgroup_css_free,
.css_online = cpu_cgroup_css_online,
.css_offline = cpu_cgroup_css_offline,
+ .fork = cpu_cgroup_fork,
.can_attach = cpu_cgroup_can_attach,
.attach = cpu_cgroup_attach,
.exit = cpu_cgroup_exit,
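
The new __might_sleep() state check above deserves an illustration. A hedged sketch (hypothetical code, not part of this diff) of the bug class it catches: calling a blocking primitive after set_current_state() silently resets the task to TASK_RUNNING, so a later schedule() no longer sleeps as intended.

#include <linux/delay.h>
#include <linux/sched.h>

static void broken_wait_example(void)
{
	set_current_state(TASK_UNINTERRUPTIBLE);
	/* Any blocking call here now triggers the WARN_ONCE() above:
	 * it may schedule and return with state == TASK_RUNNING... */
	msleep(1);
	/* ...so this schedule() no longer sleeps as the author intended. */
	schedule();
	__set_current_state(TASK_RUNNING);
}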
diff --git a/kernel/sched/cpudeadline.h b/kernel/sched/cpudeadline.h
index 538c9796ad4a..020039bd1326 100644
--- a/kernel/sched/cpudeadline.h
+++ b/kernel/sched/cpudeadline.h
@@ -25,9 +25,6 @@ int cpudl_find(struct cpudl *cp, struct task_struct *p,
void cpudl_set(struct cpudl *cp, int cpu, u64 dl, int is_valid);
int cpudl_init(struct cpudl *cp);
void cpudl_cleanup(struct cpudl *cp);
-#else
-#define cpudl_set(cp, cpu, dl) do { } while (0)
-#define cpudl_init() do { } while (0)
#endif /* CONFIG_SMP */
#endif /* _LINUX_CPUDL_H */
diff --git a/kernel/sched/cpupri.h b/kernel/sched/cpupri.h
index 6b033347fdfd..63cbb9ca0496 100644
--- a/kernel/sched/cpupri.h
+++ b/kernel/sched/cpupri.h
@@ -26,9 +26,6 @@ int cpupri_find(struct cpupri *cp,
void cpupri_set(struct cpupri *cp, int cpu, int pri);
int cpupri_init(struct cpupri *cp);
void cpupri_cleanup(struct cpupri *cp);
-#else
-#define cpupri_set(cp, cpu, pri) do { } while (0)
-#define cpupri_init() do { } while (0)
#endif
#endif /* _LINUX_CPUPRI_H */
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index 256e577faf1b..e5db8c6feebd 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -518,12 +518,20 @@ again:
}
/*
- * We need to take care of a possible races here. In fact, the
- * task might have changed its scheduling policy to something
- * different from SCHED_DEADLINE or changed its reservation
- * parameters (through sched_setattr()).
+ * We need to take care of several possible races here:
+ *
+ * - the task might have changed its scheduling policy
+ * to something different than SCHED_DEADLINE
+ * - the task might have changed its reservation parameters
+ * (through sched_setattr())
+ * - the task might have been boosted by someone else and
+ * might be in the boosting/deboosting path
+ *
+ * In all these cases we bail out, as the task is already
+ * in the runqueue or is going to be enqueued back anyway.
*/
- if (!dl_task(p) || dl_se->dl_new)
+ if (!dl_task(p) || dl_se->dl_new ||
+ dl_se->dl_boosted || !dl_se->dl_throttled)
goto unlock;
sched_clock_tick();
@@ -532,7 +540,7 @@ again:
dl_se->dl_yielded = 0;
if (task_on_rq_queued(p)) {
enqueue_task_dl(rq, p, ENQUEUE_REPLENISH);
- if (task_has_dl_policy(rq->curr))
+ if (dl_task(rq->curr))
check_preempt_curr_dl(rq, p, 0);
else
resched_curr(rq);
@@ -555,11 +563,6 @@ void init_dl_task_timer(struct sched_dl_entity *dl_se)
{
struct hrtimer *timer = &dl_se->dl_timer;
- if (hrtimer_active(timer)) {
- hrtimer_try_to_cancel(timer);
- return;
- }
-
hrtimer_init(timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
timer->function = dl_task_timer;
}
@@ -625,7 +628,7 @@ static void update_curr_dl(struct rq *rq)
sched_rt_avg_update(rq, delta_exec);
- dl_se->runtime -= delta_exec;
+ dl_se->runtime -= dl_se->dl_yielded ? 0 : delta_exec;
if (dl_runtime_exceeded(rq, dl_se)) {
__dequeue_task_dl(rq, curr, 0);
if (likely(start_dl_timer(dl_se, curr->dl.dl_boosted)))
@@ -847,8 +850,19 @@ static void enqueue_task_dl(struct rq *rq, struct task_struct *p, int flags)
* smaller than our one... OTW we keep our runtime and
* deadline.
*/
- if (pi_task && p->dl.dl_boosted && dl_prio(pi_task->normal_prio))
+ if (pi_task && p->dl.dl_boosted && dl_prio(pi_task->normal_prio)) {
pi_se = &pi_task->dl;
+ } else if (!dl_prio(p->normal_prio)) {
+ /*
+ * Special case in which we have a !SCHED_DEADLINE task
+ * that is going to be deboosted, but exceeds its
+ * runtime while doing so. There is no point in replenishing
+ * it, as it's going to return to its original
+ * scheduling class after this.
+ */
+ BUG_ON(!p->dl.dl_boosted || flags != ENQUEUE_REPLENISH);
+ return;
+ }
/*
* If p is throttled, we do nothing. In fact, if it exhausted
@@ -914,7 +928,7 @@ select_task_rq_dl(struct task_struct *p, int cpu, int sd_flag, int flags)
struct task_struct *curr;
struct rq *rq;
- if (sd_flag != SD_BALANCE_WAKE && sd_flag != SD_BALANCE_FORK)
+ if (sd_flag != SD_BALANCE_WAKE)
goto out;
rq = cpu_rq(cpu);
@@ -999,6 +1013,10 @@ static void start_hrtick_dl(struct rq *rq, struct task_struct *p)
{
hrtick_start(rq, p->dl.runtime);
}
+#else /* !CONFIG_SCHED_HRTICK */
+static void start_hrtick_dl(struct rq *rq, struct task_struct *p)
+{
+}
#endif
static struct sched_dl_entity *pick_next_dl_entity(struct rq *rq,
@@ -1052,10 +1070,8 @@ struct task_struct *pick_next_task_dl(struct rq *rq, struct task_struct *prev)
/* Running task will never be pushed. */
dequeue_pushable_dl_task(rq, p);
-#ifdef CONFIG_SCHED_HRTICK
if (hrtick_enabled(rq))
start_hrtick_dl(rq, p);
-#endif
set_post_schedule(rq);
@@ -1074,10 +1090,8 @@ static void task_tick_dl(struct rq *rq, struct task_struct *p, int queued)
{
update_curr_dl(rq);
-#ifdef CONFIG_SCHED_HRTICK
if (hrtick_enabled(rq) && queued && p->dl.runtime > 0)
start_hrtick_dl(rq, p);
-#endif
}
static void task_fork_dl(struct task_struct *p)
@@ -1314,6 +1328,7 @@ static int push_dl_task(struct rq *rq)
{
struct task_struct *next_task;
struct rq *later_rq;
+ int ret = 0;
if (!rq->dl.overloaded)
return 0;
@@ -1359,7 +1374,6 @@ retry:
* The task is still there. We don't try
* again, some other cpu will pull it when ready.
*/
- dequeue_pushable_dl_task(rq, next_task);
goto out;
}
@@ -1375,6 +1389,7 @@ retry:
deactivate_task(rq, next_task, 0);
set_task_cpu(next_task, later_rq->cpu);
activate_task(later_rq, next_task, 0);
+ ret = 1;
resched_curr(later_rq);
@@ -1383,7 +1398,7 @@ retry:
out:
put_task_struct(next_task);
- return 1;
+ return ret;
}
static void push_dl_tasks(struct rq *rq)
@@ -1489,7 +1504,7 @@ static void task_woken_dl(struct rq *rq, struct task_struct *p)
p->nr_cpus_allowed > 1 &&
dl_task(rq->curr) &&
(rq->curr->nr_cpus_allowed < 2 ||
- dl_entity_preempt(&rq->curr->dl, &p->dl))) {
+ !dl_entity_preempt(&p->dl, &rq->curr->dl))) {
push_dl_tasks(rq);
}
}
@@ -1498,10 +1513,33 @@ static void set_cpus_allowed_dl(struct task_struct *p,
const struct cpumask *new_mask)
{
struct rq *rq;
+ struct root_domain *src_rd;
int weight;
BUG_ON(!dl_task(p));
+ rq = task_rq(p);
+ src_rd = rq->rd;
+ /*
+ * Migrating a SCHED_DEADLINE task between exclusive
+ * cpusets (different root_domains) entails a bandwidth
+ * update. We already made space for us in the destination
+ * domain (see cpuset_can_attach()).
+ */
+ if (!cpumask_intersects(src_rd->span, new_mask)) {
+ struct dl_bw *src_dl_b;
+
+ src_dl_b = dl_bw_of(cpu_of(rq));
+ /*
+ * We now free resources of the root_domain we are migrating
+ * off. In the worst case, sched_setattr() may temporary fail
+ * until we complete the update.
+ */
+ raw_spin_lock(&src_dl_b->lock);
+ __dl_clear(src_dl_b, p->dl.dl_bw);
+ raw_spin_unlock(&src_dl_b->lock);
+ }
+
/*
* Update only if the task is actually running (i.e.,
* it is on the rq AND it is not throttled).
@@ -1518,8 +1556,6 @@ static void set_cpus_allowed_dl(struct task_struct *p,
if ((p->nr_cpus_allowed > 1) == (weight > 1))
return;
- rq = task_rq(p);
-
/*
* The process used to be able to migrate OR it can now migrate
*/
@@ -1567,22 +1603,48 @@ void init_sched_dl_class(void)
#endif /* CONFIG_SMP */
+/*
+ * Ensure p's dl_timer is cancelled. May drop rq->lock for a while.
+ */
+static void cancel_dl_timer(struct rq *rq, struct task_struct *p)
+{
+ struct hrtimer *dl_timer = &p->dl.dl_timer;
+
+ /* Nobody will change task's class if pi_lock is held */
+ lockdep_assert_held(&p->pi_lock);
+
+ if (hrtimer_active(dl_timer)) {
+ int ret = hrtimer_try_to_cancel(dl_timer);
+
+ if (unlikely(ret == -1)) {
+ /*
+ * Note, p may migrate OR new deadline tasks
+ * may appear in rq when we are unlocking it.
+ * Our callers must be fine with that.
+ */
+ raw_spin_unlock(&rq->lock);
+ hrtimer_cancel(dl_timer);
+ raw_spin_lock(&rq->lock);
+ }
+ }
+}
+
static void switched_from_dl(struct rq *rq, struct task_struct *p)
{
- if (hrtimer_active(&p->dl.dl_timer) && !dl_policy(p->policy))
- hrtimer_try_to_cancel(&p->dl.dl_timer);
+ cancel_dl_timer(rq, p);
__dl_clear_params(p);
-#ifdef CONFIG_SMP
/*
* Since this might be the only -deadline task on the rq,
* this is the right place to try to pull some other one
* from an overloaded cpu, if any.
*/
- if (!rq->dl.dl_nr_running)
- pull_dl_task(rq);
-#endif
+ if (!task_on_rq_queued(p) || rq->dl.dl_nr_running)
+ return;
+
+ if (pull_dl_task(rq))
+ resched_curr(rq);
}
/*
@@ -1603,12 +1665,17 @@ static void switched_to_dl(struct rq *rq, struct task_struct *p)
if (task_on_rq_queued(p) && rq->curr != p) {
#ifdef CONFIG_SMP
- if (rq->dl.overloaded && push_dl_task(rq) && rq != task_rq(p))
+ if (p->nr_cpus_allowed > 1 && rq->dl.overloaded &&
+ push_dl_task(rq) && rq != task_rq(p))
/* Only reschedule if pushing failed */
check_resched = 0;
#endif /* CONFIG_SMP */
- if (check_resched && task_has_dl_policy(rq->curr))
- check_preempt_curr_dl(rq, p, 0);
+ if (check_resched) {
+ if (dl_task(rq->curr))
+ check_preempt_curr_dl(rq, p, 0);
+ else
+ resched_curr(rq);
+ }
}
}
@@ -1678,4 +1745,15 @@ const struct sched_class dl_sched_class = {
.prio_changed = prio_changed_dl,
.switched_from = switched_from_dl,
.switched_to = switched_to_dl,
+
+ .update_curr = update_curr_dl,
};
+
+#ifdef CONFIG_SCHED_DEBUG
+extern void print_dl_rq(struct seq_file *m, int cpu, struct dl_rq *dl_rq);
+
+void print_dl_stats(struct seq_file *m, int cpu)
+{
+ print_dl_rq(m, cpu, &cpu_rq(cpu)->dl);
+}
+#endif /* CONFIG_SCHED_DEBUG */
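
For reference, the hrtimer return-value contract that cancel_dl_timer() above relies on (a summary of existing hrtimer semantics, not code added by this diff):

/*
 * hrtimer_try_to_cancel() returns:
 *   0  - the timer was not active; nothing to do
 *   1  - the timer was active and is now cancelled
 *  -1  - the callback is currently running and cannot be stopped;
 *        rq->lock must be dropped before the synchronous
 *        hrtimer_cancel(), because dl_task_timer() itself takes it.
 */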
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index ce33780d8f20..92cc52001e74 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -261,6 +261,12 @@ void print_rt_rq(struct seq_file *m, int cpu, struct rt_rq *rt_rq)
#undef P
}
+void print_dl_rq(struct seq_file *m, int cpu, struct dl_rq *dl_rq)
+{
+ SEQ_printf(m, "\ndl_rq[%d]:\n", cpu);
+ SEQ_printf(m, " .%-30s: %ld\n", "dl_nr_running", dl_rq->dl_nr_running);
+}
+
extern __read_mostly int sched_clock_running;
static void print_cpu(struct seq_file *m, int cpu)
@@ -329,6 +335,7 @@ do { \
spin_lock_irqsave(&sched_debug_lock, flags);
print_cfs_stats(m, cpu);
print_rt_stats(m, cpu);
+ print_dl_stats(m, cpu);
print_rq(m, rq, cpu);
spin_unlock_irqrestore(&sched_debug_lock, flags);
@@ -528,8 +535,8 @@ static void sched_show_numa(struct task_struct *p, struct seq_file *m)
unsigned long nr_faults = -1;
int cpu_current, home_node;
- if (p->numa_faults_memory)
- nr_faults = p->numa_faults_memory[2*node + i];
+ if (p->numa_faults)
+ nr_faults = p->numa_faults[2*node + i];
cpu_current = !i ? (task_node(p) == node) :
(pol && node_isset(node, pol->v.nodes));
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 0b069bf3e708..df2cdf77f899 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -726,6 +726,11 @@ static void update_curr(struct cfs_rq *cfs_rq)
account_cfs_rq_runtime(cfs_rq, delta_exec);
}
+static void update_curr_fair(struct rq *rq)
+{
+ update_curr(cfs_rq_of(&rq->curr->se));
+}
+
static inline void
update_stats_wait_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
@@ -828,11 +833,12 @@ static unsigned int task_nr_scan_windows(struct task_struct *p)
static unsigned int task_scan_min(struct task_struct *p)
{
+ unsigned int scan_size = ACCESS_ONCE(sysctl_numa_balancing_scan_size);
unsigned int scan, floor;
unsigned int windows = 1;
- if (sysctl_numa_balancing_scan_size < MAX_SCAN_WINDOW)
- windows = MAX_SCAN_WINDOW / sysctl_numa_balancing_scan_size;
+ if (scan_size < MAX_SCAN_WINDOW)
+ windows = MAX_SCAN_WINDOW / scan_size;
floor = 1000 / windows;
scan = sysctl_numa_balancing_scan_period_min / task_nr_scan_windows(p);
@@ -867,7 +873,6 @@ struct numa_group {
spinlock_t lock; /* nr_tasks, tasks */
int nr_tasks;
pid_t gid;
- struct list_head task_list;
struct rcu_head rcu;
nodemask_t active_nodes;
@@ -895,18 +900,24 @@ pid_t task_numa_group_id(struct task_struct *p)
return p->numa_group ? p->numa_group->gid : 0;
}
-static inline int task_faults_idx(int nid, int priv)
+/*
+ * The averaged statistics, shared & private, memory & cpu,
+ * occupy the first half of the array. The second half of the
+ * array is for current counters, which are averaged into the
+ * first set by task_numa_placement.
+ */
+static inline int task_faults_idx(enum numa_faults_stats s, int nid, int priv)
{
- return NR_NUMA_HINT_FAULT_TYPES * nid + priv;
+ return NR_NUMA_HINT_FAULT_TYPES * (s * nr_node_ids + nid) + priv;
}
static inline unsigned long task_faults(struct task_struct *p, int nid)
{
- if (!p->numa_faults_memory)
+ if (!p->numa_faults)
return 0;
- return p->numa_faults_memory[task_faults_idx(nid, 0)] +
- p->numa_faults_memory[task_faults_idx(nid, 1)];
+ return p->numa_faults[task_faults_idx(NUMA_MEM, nid, 0)] +
+ p->numa_faults[task_faults_idx(NUMA_MEM, nid, 1)];
}
static inline unsigned long group_faults(struct task_struct *p, int nid)
@@ -914,14 +925,79 @@ static inline unsigned long group_faults(struct task_struct *p, int nid)
if (!p->numa_group)
return 0;
- return p->numa_group->faults[task_faults_idx(nid, 0)] +
- p->numa_group->faults[task_faults_idx(nid, 1)];
+ return p->numa_group->faults[task_faults_idx(NUMA_MEM, nid, 0)] +
+ p->numa_group->faults[task_faults_idx(NUMA_MEM, nid, 1)];
}
static inline unsigned long group_faults_cpu(struct numa_group *group, int nid)
{
- return group->faults_cpu[task_faults_idx(nid, 0)] +
- group->faults_cpu[task_faults_idx(nid, 1)];
+ return group->faults_cpu[task_faults_idx(NUMA_MEM, nid, 0)] +
+ group->faults_cpu[task_faults_idx(NUMA_MEM, nid, 1)];
+}
+
+/* Handle placement on systems where not all nodes are directly connected. */
+static unsigned long score_nearby_nodes(struct task_struct *p, int nid,
+ int maxdist, bool task)
+{
+ unsigned long score = 0;
+ int node;
+
+ /*
+ * All nodes are directly connected, and the same distance
+ * from each other. No need for fancy placement algorithms.
+ */
+ if (sched_numa_topology_type == NUMA_DIRECT)
+ return 0;
+
+ /*
+ * This code is called for each node, introducing N^2 complexity,
+ * which should be ok given the number of nodes rarely exceeds 8.
+ */
+ for_each_online_node(node) {
+ unsigned long faults;
+ int dist = node_distance(nid, node);
+
+ /*
+ * The furthest away nodes in the system are not interesting
+ * for placement; nid was already counted.
+ */
+ if (dist == sched_max_numa_distance || node == nid)
+ continue;
+
+ /*
+ * On systems with a backplane NUMA topology, compare groups
+ * of nodes, and move tasks towards the group with the most
+ * memory accesses. When comparing two nodes at distance
+ * "hoplimit", only nodes closer by than "hoplimit" are part
+ * of each group. Skip other nodes.
+ */
+ if (sched_numa_topology_type == NUMA_BACKPLANE &&
+ dist > maxdist)
+ continue;
+
+ /* Add up the faults from nearby nodes. */
+ if (task)
+ faults = task_faults(p, node);
+ else
+ faults = group_faults(p, node);
+
+ /*
+ * On systems with a glueless mesh NUMA topology, there are
+ * no fixed "groups of nodes". Instead, nodes that are not
+ * directly connected bounce traffic through intermediate
+ * nodes; a numa_group can occupy any set of nodes.
+ * The further away a node is, the less the faults count.
+ * This seems to result in good task placement.
+ */
+ if (sched_numa_topology_type == NUMA_GLUELESS_MESH) {
+ faults *= (sched_max_numa_distance - dist);
+ faults /= (sched_max_numa_distance - LOCAL_DISTANCE);
+ }
+
+ score += faults;
+ }
+
+ return score;
}
/*
@@ -930,11 +1006,12 @@ static inline unsigned long group_faults_cpu(struct numa_group *group, int nid)
* larger multiplier, in order to group tasks together that are almost
* evenly spread out between numa nodes.
*/
-static inline unsigned long task_weight(struct task_struct *p, int nid)
+static inline unsigned long task_weight(struct task_struct *p, int nid,
+ int dist)
{
- unsigned long total_faults;
+ unsigned long faults, total_faults;
- if (!p->numa_faults_memory)
+ if (!p->numa_faults)
return 0;
total_faults = p->total_numa_faults;
@@ -942,15 +1019,29 @@ static inline unsigned long task_weight(struct task_struct *p, int nid)
if (!total_faults)
return 0;
- return 1000 * task_faults(p, nid) / total_faults;
+ faults = task_faults(p, nid);
+ faults += score_nearby_nodes(p, nid, dist, true);
+
+ return 1000 * faults / total_faults;
}
-static inline unsigned long group_weight(struct task_struct *p, int nid)
+static inline unsigned long group_weight(struct task_struct *p, int nid,
+ int dist)
{
- if (!p->numa_group || !p->numa_group->total_faults)
+ unsigned long faults, total_faults;
+
+ if (!p->numa_group)
+ return 0;
+
+ total_faults = p->numa_group->total_faults;
+
+ if (!total_faults)
return 0;
- return 1000 * group_faults(p, nid) / p->numa_group->total_faults;
+ faults = group_faults(p, nid);
+ faults += score_nearby_nodes(p, nid, dist, false);
+
+ return 1000 * faults / total_faults;
}
bool should_numa_migrate_memory(struct task_struct *p, struct page * page,
@@ -1083,6 +1174,7 @@ struct task_numa_env {
struct numa_stats src_stats, dst_stats;
int imbalance_pct;
+ int dist;
struct task_struct *best_task;
long best_imp;
@@ -1162,11 +1254,29 @@ static void task_numa_compare(struct task_numa_env *env,
long load;
long imp = env->p->numa_group ? groupimp : taskimp;
long moveimp = imp;
+ int dist = env->dist;
rcu_read_lock();
- cur = ACCESS_ONCE(dst_rq->curr);
- if (cur->pid == 0) /* idle */
+
+ raw_spin_lock_irq(&dst_rq->lock);
+ cur = dst_rq->curr;
+ /*
+ * No need to move the exiting task, and this ensures that ->curr
+ * wasn't reaped and thus get_task_struct() in task_numa_assign()
+ * is safe under RCU read lock.
+ * Note that rcu_read_lock() itself can't protect from the final
+ * put_task_struct() after the last schedule().
+ */
+ if ((cur->flags & PF_EXITING) || is_idle_task(cur))
cur = NULL;
+ raw_spin_unlock_irq(&dst_rq->lock);
+
+ /*
+ * Because we have preemption enabled we can get migrated around and
+ * end up trying to select ourselves (current == env->p) as a swap candidate.
+ */
+ if (cur == env->p)
+ goto unlock;
/*
* "imp" is the fault differential for the source task between the
@@ -1185,8 +1295,8 @@ static void task_numa_compare(struct task_numa_env *env,
* in any group then look only at task weights.
*/
if (cur->numa_group == env->p->numa_group) {
- imp = taskimp + task_weight(cur, env->src_nid) -
- task_weight(cur, env->dst_nid);
+ imp = taskimp + task_weight(cur, env->src_nid, dist) -
+ task_weight(cur, env->dst_nid, dist);
/*
* Add some hysteresis to prevent swapping the
* tasks within a group over tiny differences.
@@ -1200,11 +1310,11 @@ static void task_numa_compare(struct task_numa_env *env,
* instead.
*/
if (cur->numa_group)
- imp += group_weight(cur, env->src_nid) -
- group_weight(cur, env->dst_nid);
+ imp += group_weight(cur, env->src_nid, dist) -
+ group_weight(cur, env->dst_nid, dist);
else
- imp += task_weight(cur, env->src_nid) -
- task_weight(cur, env->dst_nid);
+ imp += task_weight(cur, env->src_nid, dist) -
+ task_weight(cur, env->dst_nid, dist);
}
}
@@ -1303,7 +1413,7 @@ static int task_numa_migrate(struct task_struct *p)
};
struct sched_domain *sd;
unsigned long taskweight, groupweight;
- int nid, ret;
+ int nid, ret, dist;
long taskimp, groupimp;
/*
@@ -1331,29 +1441,45 @@ static int task_numa_migrate(struct task_struct *p)
return -EINVAL;
}
- taskweight = task_weight(p, env.src_nid);
- groupweight = group_weight(p, env.src_nid);
- update_numa_stats(&env.src_stats, env.src_nid);
env.dst_nid = p->numa_preferred_nid;
- taskimp = task_weight(p, env.dst_nid) - taskweight;
- groupimp = group_weight(p, env.dst_nid) - groupweight;
+ dist = env.dist = node_distance(env.src_nid, env.dst_nid);
+ taskweight = task_weight(p, env.src_nid, dist);
+ groupweight = group_weight(p, env.src_nid, dist);
+ update_numa_stats(&env.src_stats, env.src_nid);
+ taskimp = task_weight(p, env.dst_nid, dist) - taskweight;
+ groupimp = group_weight(p, env.dst_nid, dist) - groupweight;
update_numa_stats(&env.dst_stats, env.dst_nid);
/* Try to find a spot on the preferred nid. */
task_numa_find_cpu(&env, taskimp, groupimp);
- /* No space available on the preferred nid. Look elsewhere. */
- if (env.best_cpu == -1) {
+ /*
+ * Look at other nodes in these cases:
+ * - there is no space available on the preferred_nid
+ * - the task is part of a numa_group that is interleaved across
+ * multiple NUMA nodes; in order to better consolidate the group,
+ * we need to check other locations.
+ */
+ if (env.best_cpu == -1 || (p->numa_group &&
+ nodes_weight(p->numa_group->active_nodes) > 1)) {
for_each_online_node(nid) {
if (nid == env.src_nid || nid == p->numa_preferred_nid)
continue;
+ dist = node_distance(env.src_nid, env.dst_nid);
+ if (sched_numa_topology_type == NUMA_BACKPLANE &&
+ dist != env.dist) {
+ taskweight = task_weight(p, env.src_nid, dist);
+ groupweight = group_weight(p, env.src_nid, dist);
+ }
+
/* Only consider nodes where both task and groups benefit */
- taskimp = task_weight(p, nid) - taskweight;
- groupimp = group_weight(p, nid) - groupweight;
+ taskimp = task_weight(p, nid, dist) - taskweight;
+ groupimp = group_weight(p, nid, dist) - groupweight;
if (taskimp < 0 && groupimp < 0)
continue;
+ env.dist = dist;
env.dst_nid = nid;
update_numa_stats(&env.dst_stats, env.dst_nid);
task_numa_find_cpu(&env, taskimp, groupimp);
@@ -1408,7 +1534,7 @@ static void numa_migrate_preferred(struct task_struct *p)
unsigned long interval = HZ;
/* This task has no NUMA fault statistics yet */
- if (unlikely(p->numa_preferred_nid == -1 || !p->numa_faults_memory))
+ if (unlikely(p->numa_preferred_nid == -1 || !p->numa_faults))
return;
/* Periodically retry migrating the task to the preferred node */
@@ -1520,7 +1646,7 @@ static void update_task_scan_period(struct task_struct *p,
* scanning faster if shared accesses dominate as it may
* simply bounce migrations uselessly
*/
- ratio = DIV_ROUND_UP(private * NUMA_PERIOD_SLOTS, (private + shared));
+ ratio = DIV_ROUND_UP(private * NUMA_PERIOD_SLOTS, (private + shared + 1));
diff = (diff * ratio) / NUMA_PERIOD_SLOTS;
}
@@ -1557,6 +1683,92 @@ static u64 numa_get_avg_runtime(struct task_struct *p, u64 *period)
return delta;
}
+/*
+ * Determine the preferred nid for a task in a numa_group. This needs to
+ * be done in a way that produces consistent results with group_weight,
+ * otherwise workloads might not converge.
+ */
+static int preferred_group_nid(struct task_struct *p, int nid)
+{
+ nodemask_t nodes;
+ int dist;
+
+ /* Direct connections between all NUMA nodes. */
+ if (sched_numa_topology_type == NUMA_DIRECT)
+ return nid;
+
+ /*
+ * On a system with glueless mesh NUMA topology, group_weight
+ * scores nodes according to the number of NUMA hinting faults on
+ * both the node itself, and on nearby nodes.
+ */
+ if (sched_numa_topology_type == NUMA_GLUELESS_MESH) {
+ unsigned long score, max_score = 0;
+ int node, max_node = nid;
+
+ dist = sched_max_numa_distance;
+
+ for_each_online_node(node) {
+ score = group_weight(p, node, dist);
+ if (score > max_score) {
+ max_score = score;
+ max_node = node;
+ }
+ }
+ return max_node;
+ }
+
+ /*
+ * Finding the preferred nid in a system with NUMA backplane
+ * interconnect topology is more involved. The goal is to locate
+ * tasks from numa_groups near each other in the system, and
+ * untangle workloads from different sides of the system. This requires
+ * searching down the hierarchy of node groups, recursively searching
+ * inside the highest scoring group of nodes. The nodemask tricks
+ * keep the complexity of the search down.
+ */
+ nodes = node_online_map;
+ for (dist = sched_max_numa_distance; dist > LOCAL_DISTANCE; dist--) {
+ unsigned long max_faults = 0;
+ nodemask_t max_group;
+ int a, b;
+
+ /* Are there nodes at this distance from each other? */
+ if (!find_numa_distance(dist))
+ continue;
+
+ for_each_node_mask(a, nodes) {
+ unsigned long faults = 0;
+ nodemask_t this_group;
+ nodes_clear(this_group);
+
+ /* Sum group's NUMA faults; includes a==b case. */
+ for_each_node_mask(b, nodes) {
+ if (node_distance(a, b) < dist) {
+ faults += group_faults(p, b);
+ node_set(b, this_group);
+ node_clear(b, nodes);
+ }
+ }
+
+ /* Remember the top group. */
+ if (faults > max_faults) {
+ max_faults = faults;
+ max_group = this_group;
+ /*
+ * subtle: at the smallest distance there is
+ * just one node left in each "group", the
+ * winner is the preferred nid.
+ */
+ nid = a;
+ }
+ }
+ /* Next round, evaluate the nodes within max_group. */
+ nodes = max_group;
+ }
+ return nid;
+}
+
static void task_numa_placement(struct task_struct *p)
{
int seq, nid, max_nid = -1, max_group_nid = -1;
@@ -1584,18 +1796,23 @@ static void task_numa_placement(struct task_struct *p)
/* Find the node with the highest number of faults */
for_each_online_node(nid) {
+ /* Keep track of the offsets in numa_faults array */
+ int mem_idx, membuf_idx, cpu_idx, cpubuf_idx;
unsigned long faults = 0, group_faults = 0;
- int priv, i;
+ int priv;
for (priv = 0; priv < NR_NUMA_HINT_FAULT_TYPES; priv++) {
long diff, f_diff, f_weight;
- i = task_faults_idx(nid, priv);
+ mem_idx = task_faults_idx(NUMA_MEM, nid, priv);
+ membuf_idx = task_faults_idx(NUMA_MEMBUF, nid, priv);
+ cpu_idx = task_faults_idx(NUMA_CPU, nid, priv);
+ cpubuf_idx = task_faults_idx(NUMA_CPUBUF, nid, priv);
/* Decay existing window, copy faults since last scan */
- diff = p->numa_faults_buffer_memory[i] - p->numa_faults_memory[i] / 2;
- fault_types[priv] += p->numa_faults_buffer_memory[i];
- p->numa_faults_buffer_memory[i] = 0;
+ diff = p->numa_faults[membuf_idx] - p->numa_faults[mem_idx] / 2;
+ fault_types[priv] += p->numa_faults[membuf_idx];
+ p->numa_faults[membuf_idx] = 0;
/*
* Normalize the faults_from, so all tasks in a group
@@ -1605,21 +1822,27 @@ static void task_numa_placement(struct task_struct *p)
* faults are less important.
*/
f_weight = div64_u64(runtime << 16, period + 1);
- f_weight = (f_weight * p->numa_faults_buffer_cpu[i]) /
+ f_weight = (f_weight * p->numa_faults[cpubuf_idx]) /
(total_faults + 1);
- f_diff = f_weight - p->numa_faults_cpu[i] / 2;
- p->numa_faults_buffer_cpu[i] = 0;
+ f_diff = f_weight - p->numa_faults[cpu_idx] / 2;
+ p->numa_faults[cpubuf_idx] = 0;
- p->numa_faults_memory[i] += diff;
- p->numa_faults_cpu[i] += f_diff;
- faults += p->numa_faults_memory[i];
+ p->numa_faults[mem_idx] += diff;
+ p->numa_faults[cpu_idx] += f_diff;
+ faults += p->numa_faults[mem_idx];
p->total_numa_faults += diff;
if (p->numa_group) {
- /* safe because we can only change our own group */
- p->numa_group->faults[i] += diff;
- p->numa_group->faults_cpu[i] += f_diff;
+ /*
+ * safe because we can only change our own group
+ *
+ * mem_idx represents the offset for a given
+ * nid and priv in a specific region because it
+ * is at the beginning of the numa_faults array.
+ */
+ p->numa_group->faults[mem_idx] += diff;
+ p->numa_group->faults_cpu[mem_idx] += f_diff;
p->numa_group->total_faults += diff;
- group_faults += p->numa_group->faults[i];
+ group_faults += p->numa_group->faults[mem_idx];
}
}
@@ -1639,7 +1862,7 @@ static void task_numa_placement(struct task_struct *p)
if (p->numa_group) {
update_numa_active_node_mask(p->numa_group);
spin_unlock_irq(group_lock);
- max_nid = max_group_nid;
+ max_nid = preferred_group_nid(p, max_group_nid);
}
if (max_faults) {
@@ -1682,7 +1905,6 @@ static void task_numa_group(struct task_struct *p, int cpupid, int flags,
atomic_set(&grp->refcount, 1);
spin_lock_init(&grp->lock);
- INIT_LIST_HEAD(&grp->task_list);
grp->gid = p->pid;
/* Second half of the array tracks nids where faults happen */
grp->faults_cpu = grp->faults + NR_NUMA_HINT_FAULT_TYPES *
@@ -1691,11 +1913,10 @@ static void task_numa_group(struct task_struct *p, int cpupid, int flags,
node_set(task_node(current), grp->active_nodes);
for (i = 0; i < NR_NUMA_HINT_FAULT_STATS * nr_node_ids; i++)
- grp->faults[i] = p->numa_faults_memory[i];
+ grp->faults[i] = p->numa_faults[i];
grp->total_faults = p->total_numa_faults;
- list_add(&p->numa_entry, &grp->task_list);
grp->nr_tasks++;
rcu_assign_pointer(p->numa_group, grp);
}
@@ -1750,13 +1971,12 @@ static void task_numa_group(struct task_struct *p, int cpupid, int flags,
double_lock_irq(&my_grp->lock, &grp->lock);
for (i = 0; i < NR_NUMA_HINT_FAULT_STATS * nr_node_ids; i++) {
- my_grp->faults[i] -= p->numa_faults_memory[i];
- grp->faults[i] += p->numa_faults_memory[i];
+ my_grp->faults[i] -= p->numa_faults[i];
+ grp->faults[i] += p->numa_faults[i];
}
my_grp->total_faults -= p->total_numa_faults;
grp->total_faults += p->total_numa_faults;
- list_move(&p->numa_entry, &grp->task_list);
my_grp->nr_tasks--;
grp->nr_tasks++;
@@ -1776,27 +1996,23 @@ no_join:
void task_numa_free(struct task_struct *p)
{
struct numa_group *grp = p->numa_group;
- void *numa_faults = p->numa_faults_memory;
+ void *numa_faults = p->numa_faults;
unsigned long flags;
int i;
if (grp) {
spin_lock_irqsave(&grp->lock, flags);
for (i = 0; i < NR_NUMA_HINT_FAULT_STATS * nr_node_ids; i++)
- grp->faults[i] -= p->numa_faults_memory[i];
+ grp->faults[i] -= p->numa_faults[i];
grp->total_faults -= p->total_numa_faults;
- list_del(&p->numa_entry);
grp->nr_tasks--;
spin_unlock_irqrestore(&grp->lock, flags);
RCU_INIT_POINTER(p->numa_group, NULL);
put_numa_group(grp);
}
- p->numa_faults_memory = NULL;
- p->numa_faults_buffer_memory = NULL;
- p->numa_faults_cpu= NULL;
- p->numa_faults_buffer_cpu = NULL;
+ p->numa_faults = NULL;
kfree(numa_faults);
}
@@ -1819,24 +2035,14 @@ void task_numa_fault(int last_cpupid, int mem_node, int pages, int flags)
return;
/* Allocate buffer to track faults on a per-node basis */
- if (unlikely(!p->numa_faults_memory)) {
- int size = sizeof(*p->numa_faults_memory) *
+ if (unlikely(!p->numa_faults)) {
+ int size = sizeof(*p->numa_faults) *
NR_NUMA_HINT_FAULT_BUCKETS * nr_node_ids;
- p->numa_faults_memory = kzalloc(size, GFP_KERNEL|__GFP_NOWARN);
- if (!p->numa_faults_memory)
+ p->numa_faults = kzalloc(size, GFP_KERNEL|__GFP_NOWARN);
+ if (!p->numa_faults)
return;
- BUG_ON(p->numa_faults_buffer_memory);
- /*
- * The averaged statistics, shared & private, memory & cpu,
- * occupy the first half of the array. The second half of the
- * array is for current counters, which are averaged into the
- * first set by task_numa_placement.
- */
- p->numa_faults_cpu = p->numa_faults_memory + (2 * nr_node_ids);
- p->numa_faults_buffer_memory = p->numa_faults_memory + (4 * nr_node_ids);
- p->numa_faults_buffer_cpu = p->numa_faults_memory + (6 * nr_node_ids);
p->total_numa_faults = 0;
memset(p->numa_faults_locality, 0, sizeof(p->numa_faults_locality));
}
@@ -1876,8 +2082,8 @@ void task_numa_fault(int last_cpupid, int mem_node, int pages, int flags)
if (migrated)
p->numa_pages_migrated += pages;
- p->numa_faults_buffer_memory[task_faults_idx(mem_node, priv)] += pages;
- p->numa_faults_buffer_cpu[task_faults_idx(cpu_node, priv)] += pages;
+ p->numa_faults[task_faults_idx(NUMA_MEMBUF, mem_node, priv)] += pages;
+ p->numa_faults[task_faults_idx(NUMA_CPUBUF, cpu_node, priv)] += pages;
p->numa_faults_locality[local] += pages;
}
@@ -4446,7 +4652,7 @@ find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu)
latest_idle_timestamp = rq->idle_stamp;
shallowest_idle_cpu = i;
}
- } else {
+ } else if (shallowest_idle_cpu == -1) {
load = weighted_cpuload(i);
if (load < min_load || (load == min_load && i == this_cpu)) {
min_load = load;
@@ -4524,9 +4730,6 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f
int want_affine = 0;
int sync = wake_flags & WF_SYNC;
- if (p->nr_cpus_allowed == 1)
- return prev_cpu;
-
if (sd_flag & SD_BALANCE_WAKE)
want_affine = cpumask_test_cpu(cpu, tsk_cpus_allowed(p));
@@ -5166,7 +5369,7 @@ static bool migrate_improves_locality(struct task_struct *p, struct lb_env *env)
struct numa_group *numa_group = rcu_dereference(p->numa_group);
int src_nid, dst_nid;
- if (!sched_feat(NUMA_FAVOUR_HIGHER) || !p->numa_faults_memory ||
+ if (!sched_feat(NUMA_FAVOUR_HIGHER) || !p->numa_faults ||
!(env->sd->flags & SD_NUMA)) {
return false;
}
@@ -5205,7 +5408,7 @@ static bool migrate_degrades_locality(struct task_struct *p, struct lb_env *env)
if (!sched_feat(NUMA) || !sched_feat(NUMA_RESIST_LOWER))
return false;
- if (!p->numa_faults_memory || !(env->sd->flags & SD_NUMA))
+ if (!p->numa_faults || !(env->sd->flags & SD_NUMA))
return false;
src_nid = cpu_to_node(env->src_cpu);
@@ -6149,8 +6352,10 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd
* with a large weight task outweighs the tasks on the system).
*/
if (prefer_sibling && sds->local &&
- sds->local_stat.group_has_free_capacity)
+ sds->local_stat.group_has_free_capacity) {
sgs->group_capacity_factor = min(sgs->group_capacity_factor, 1U);
+ sgs->group_type = group_classify(sg, sgs);
+ }
if (update_sd_pick_busiest(env, sds, sg, sgs)) {
sds->busiest = sg;
@@ -7938,6 +8143,8 @@ const struct sched_class fair_sched_class = {
.get_rr_interval = get_rr_interval_fair,
+ .update_curr = update_curr_fair,
+
#ifdef CONFIG_FAIR_GROUP_SCHED
.task_move_group = task_move_group_fair,
#endif
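
To make the new flat numa_faults layout concrete, a worked example of task_faults_idx() (the node count below is arbitrary):

/*
 * numa_faults is now a single array with four per-node regions, each
 * with NR_NUMA_HINT_FAULT_TYPES (= 2) private/shared slots:
 *
 *   [ NUMA_MEM | NUMA_CPU | NUMA_MEMBUF | NUMA_CPUBUF ]
 *     averaged   averaged   this scan     this scan
 *
 * With nr_node_ids == 4, the private (priv == 1) MEMBUF slot of
 * node 2 sits at:
 *
 *   task_faults_idx(NUMA_MEMBUF, 2, 1)
 *     == NR_NUMA_HINT_FAULT_TYPES * (NUMA_MEMBUF * nr_node_ids + nid) + priv
 *     == 2 * (2 * 4 + 2) + 1 == 21
 */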
diff --git a/kernel/sched/idle_task.c b/kernel/sched/idle_task.c
index 67ad4e7f506a..c65dac8c97cd 100644
--- a/kernel/sched/idle_task.c
+++ b/kernel/sched/idle_task.c
@@ -75,6 +75,10 @@ static unsigned int get_rr_interval_idle(struct rq *rq, struct task_struct *task
return 0;
}
+static void update_curr_idle(struct rq *rq)
+{
+}
+
/*
* Simple, special scheduling class for the per-CPU idle tasks:
*/
@@ -101,4 +105,5 @@ const struct sched_class idle_sched_class = {
.prio_changed = prio_changed_idle,
.switched_to = switched_to_idle,
+ .update_curr = update_curr_idle,
};
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index d024e6ce30ba..ee15f5a0d1c1 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -1301,9 +1301,6 @@ select_task_rq_rt(struct task_struct *p, int cpu, int sd_flag, int flags)
struct task_struct *curr;
struct rq *rq;
- if (p->nr_cpus_allowed == 1)
- goto out;
-
/* For anything but wake ups, just return the task_cpu */
if (sd_flag != SD_BALANCE_WAKE && sd_flag != SD_BALANCE_FORK)
goto out;
@@ -1351,16 +1348,22 @@ out:
static void check_preempt_equal_prio(struct rq *rq, struct task_struct *p)
{
- if (rq->curr->nr_cpus_allowed == 1)
+ /*
+ * Current can't be migrated; it is useless to reschedule.
+ * Let's hope p can move out.
+ */
+ if (rq->curr->nr_cpus_allowed == 1 ||
+ !cpupri_find(&rq->rd->cpupri, rq->curr, NULL))
return;
+ /*
+ * p is migratable, so let's not schedule it and
+ * see if it is pushed or pulled somewhere else.
+ */
if (p->nr_cpus_allowed != 1
&& cpupri_find(&rq->rd->cpupri, p, NULL))
return;
- if (!cpupri_find(&rq->rd->cpupri, rq->curr, NULL))
- return;
-
/*
* There appear to be other cpus that can accept
* current and none to run 'p', so let's reschedule
@@ -2128,6 +2131,8 @@ const struct sched_class rt_sched_class = {
.prio_changed = prio_changed_rt,
.switched_to = switched_to_rt,
+
+ .update_curr = update_curr_rt,
};
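
A compact summary of the reworked check_preempt_equal_prio() decision above (derived from the code shown, for readability):

/*
 * Equal-priority RT wakeup, in order:
 *   curr pinned, or no other CPU can take curr -> do nothing
 *   p can migrate and has a CPU to go to       -> do nothing, let
 *                                                 push/pull move p
 *   otherwise                                  -> resched_curr(),
 *                                                 swap curr out for p
 */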
#ifdef CONFIG_SCHED_DEBUG
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 24156c8434d1..9a2a45c970e7 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -176,6 +176,25 @@ struct dl_bw {
u64 bw, total_bw;
};
+static inline
+void __dl_clear(struct dl_bw *dl_b, u64 tsk_bw)
+{
+ dl_b->total_bw -= tsk_bw;
+}
+
+static inline
+void __dl_add(struct dl_bw *dl_b, u64 tsk_bw)
+{
+ dl_b->total_bw += tsk_bw;
+}
+
+static inline
+bool __dl_overflow(struct dl_bw *dl_b, int cpus, u64 old_bw, u64 new_bw)
+{
+ return dl_b->bw != -1 &&
+ dl_b->bw * cpus < dl_b->total_bw - old_bw + new_bw;
+}
+
extern struct mutex sched_domains_mutex;
#ifdef CONFIG_CGROUP_SCHED
@@ -678,7 +697,25 @@ static inline u64 rq_clock_task(struct rq *rq)
return rq->clock_task;
}
+#ifdef CONFIG_NUMA
+enum numa_topology_type {
+ NUMA_DIRECT,
+ NUMA_GLUELESS_MESH,
+ NUMA_BACKPLANE,
+};
+extern enum numa_topology_type sched_numa_topology_type;
+extern int sched_max_numa_distance;
+extern bool find_numa_distance(int distance);
+#endif
+
#ifdef CONFIG_NUMA_BALANCING
+/* The regions in numa_faults array from task_struct */
+enum numa_faults_stats {
+ NUMA_MEM = 0,
+ NUMA_CPU,
+ NUMA_MEMBUF,
+ NUMA_CPUBUF
+};
extern void sched_setnuma(struct task_struct *p, int node);
extern int migrate_task_to(struct task_struct *p, int cpu);
extern int migrate_swap(struct task_struct *, struct task_struct *);
@@ -1127,6 +1164,11 @@ struct sched_class {
void (*task_fork) (struct task_struct *p);
void (*task_dead) (struct task_struct *p);
+ /*
+ * The switched_from() call is allowed to drop rq->lock, therefore we
+ * cannot assume the switched_from/switched_to pair is serialized by
+ * rq->lock. They are however serialized by p->pi_lock.
+ */
void (*switched_from) (struct rq *this_rq, struct task_struct *task);
void (*switched_to) (struct rq *this_rq, struct task_struct *task);
void (*prio_changed) (struct rq *this_rq, struct task_struct *task,
@@ -1135,6 +1177,8 @@ struct sched_class {
unsigned int (*get_rr_interval) (struct rq *rq,
struct task_struct *task);
+ void (*update_curr) (struct rq *rq);
+
#ifdef CONFIG_FAIR_GROUP_SCHED
void (*task_move_group) (struct task_struct *p, int on_rq);
#endif
@@ -1502,6 +1546,7 @@ extern struct sched_entity *__pick_first_entity(struct cfs_rq *cfs_rq);
extern struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq);
extern void print_cfs_stats(struct seq_file *m, int cpu);
extern void print_rt_stats(struct seq_file *m, int cpu);
+extern void print_dl_stats(struct seq_file *m, int cpu);
extern void init_cfs_rq(struct cfs_rq *cfs_rq);
extern void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq);
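
A hedged numeric sketch of the admission test implemented by the relocated __dl_overflow() above; deadline bandwidths are utilizations in fixed point (runtime/period scaled by 2^20) and the figures below are invented for illustration:

static bool example_dl_admission(void)
{
	/* ~95% per-CPU cap; ~160% already admitted on a 2-CPU domain. */
	struct dl_bw b = { .bw = 996148, .total_bw = 1677722 };

	/*
	 * Admit a task with ~40% utilization (419430 ~= 0.4 << 20)?
	 * 996148 * 2 = 1992296 < 1677722 - 0 + 419430 = 2097152,
	 * so __dl_overflow() reports overflow: the task is rejected.
	 */
	return __dl_overflow(&b, 2, 0, 419430);
}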
diff --git a/kernel/sched/stop_task.c b/kernel/sched/stop_task.c
index 67426e529f59..79ffec45a6ac 100644
--- a/kernel/sched/stop_task.c
+++ b/kernel/sched/stop_task.c
@@ -102,6 +102,10 @@ get_rr_interval_stop(struct rq *rq, struct task_struct *task)
return 0;
}
+static void update_curr_stop(struct rq *rq)
+{
+}
+
/*
* Simple, special scheduling class for the per-CPU stop tasks:
*/
@@ -128,4 +132,5 @@ const struct sched_class stop_sched_class = {
.prio_changed = prio_changed_stop,
.switched_to = switched_to_stop,
+ .update_curr = update_curr_stop,
};
diff --git a/kernel/sched/wait.c b/kernel/sched/wait.c
index 5a62915f47a8..852143a79f36 100644
--- a/kernel/sched/wait.c
+++ b/kernel/sched/wait.c
@@ -9,6 +9,7 @@
#include <linux/mm.h>
#include <linux/wait.h>
#include <linux/hash.h>
+#include <linux/kthread.h>
void __init_waitqueue_head(wait_queue_head_t *q, const char *name, struct lock_class_key *key)
{
@@ -297,6 +298,71 @@ int autoremove_wake_function(wait_queue_t *wait, unsigned mode, int sync, void *
}
EXPORT_SYMBOL(autoremove_wake_function);
+static inline bool is_kthread_should_stop(void)
+{
+ return (current->flags & PF_KTHREAD) && kthread_should_stop();
+}
+
+/*
+ * DEFINE_WAIT_FUNC(wait, woken_wake_function);
+ *
+ * add_wait_queue(&wq, &wait);
+ * for (;;) {
+ * if (condition)
+ * break;
+ *
+ * p->state = mode; condition = true;
+ * smp_mb(); // A smp_wmb(); // C
+ * if (!wait->flags & WQ_FLAG_WOKEN) wait->flags |= WQ_FLAG_WOKEN;
+ * schedule() try_to_wake_up();
+ * p->state = TASK_RUNNING; ~~~~~~~~~~~~~~~~~~
+ * wait->flags &= ~WQ_FLAG_WOKEN; condition = true;
+ * smp_mb() // B smp_wmb(); // C
+ * wait->flags |= WQ_FLAG_WOKEN;
+ * }
+ * remove_wait_queue(&wq, &wait);
+ *
+ */
+long wait_woken(wait_queue_t *wait, unsigned mode, long timeout)
+{
+ set_current_state(mode); /* A */
+ /*
+ * The above implies an smp_mb(), which matches with the smp_wmb() from
+ * woken_wake_function() such that if we observe WQ_FLAG_WOKEN we must
+ * also observe all state before the wakeup.
+ */
+ if (!(wait->flags & WQ_FLAG_WOKEN) && !is_kthread_should_stop())
+ timeout = schedule_timeout(timeout);
+ __set_current_state(TASK_RUNNING);
+
+ /*
+ * The below implies an smp_mb(), it too pairs with the smp_wmb() from
+ * woken_wake_function() such that we must either observe the wait
+ * condition being true _OR_ WQ_FLAG_WOKEN such that we will not miss
+ * an event.
+ */
+ set_mb(wait->flags, wait->flags & ~WQ_FLAG_WOKEN); /* B */
+
+ return timeout;
+}
+EXPORT_SYMBOL(wait_woken);
+
+int woken_wake_function(wait_queue_t *wait, unsigned mode, int sync, void *key)
+{
+ /*
+ * Although this function is called under waitqueue lock, LOCK
+ * doesn't imply a write barrier and the users expect write
+ * barrier semantics on wakeup functions. The following
+ * smp_wmb() is equivalent to smp_wmb() in try_to_wake_up()
+ * and is paired with set_mb() in wait_woken().
+ */
+ smp_wmb(); /* C */
+ wait->flags |= WQ_FLAG_WOKEN;
+
+ return default_wake_function(wait, mode, sync, key);
+}
+EXPORT_SYMBOL(woken_wake_function);
+
int wake_bit_function(wait_queue_t *wait, unsigned mode, int sync, void *arg)
{
struct wait_bit_key *key = arg;
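
Finally, a hedged sketch of a waiter built on the new helpers, following the ladder diagram above (the wait queue head, condition pointer, and timeout policy are hypothetical):

#include <linux/sched.h>
#include <linux/wait.h>

static long example_wait(wait_queue_head_t *wq, bool *condition)
{
	long timeout = MAX_SCHEDULE_TIMEOUT;
	DEFINE_WAIT_FUNC(wait, woken_wake_function);

	add_wait_queue(wq, &wait);
	while (!*condition) {
		/* wait_woken() sets the task state and checks
		 * WQ_FLAG_WOKEN for us, so the race shown in the
		 * ladder diagram cannot be lost. */
		timeout = wait_woken(&wait, TASK_INTERRUPTIBLE, timeout);
		if (!timeout || signal_pending(current))
			break;
	}
	remove_wait_queue(wq, &wait);

	return timeout;
}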