diff options
author | Linus Torvalds <torvalds@linux-foundation.org> | 2023-10-31 02:12:15 +0300 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2023-10-31 02:12:15 +0300 |
commit | 63ce50fff9240d66cf3b59663f458f55ba6dcfcc (patch) | |
tree | c2613289116d6f80a50f534e9c5e4afc26064574 /kernel/sched | |
parent | 3cf3fabccb9dc821ffaec3ad6bf0cd6b278bd012 (diff) | |
parent | 984ffb6a4366752c949f7b39640aecdce222607f (diff) | |
download | linux-63ce50fff9240d66cf3b59663f458f55ba6dcfcc.tar.xz |
Merge tag 'sched-core-2023-10-28' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip
Pull scheduler updates from Ingo Molnar:
"Fair scheduler (SCHED_OTHER) improvements:
- Remove the old and now unused SIS_PROP code & option
- Scan cluster before LLC in the wake-up path
- Use candidate prev/recent_used CPU if scanning failed for cluster
wakeup
NUMA scheduling improvements:
- Improve the VMA access-PID code to better skip/scan VMAs
- Extend tracing to cover VMA-skipping decisions
- Improve/fix the recently introduced sched_numa_find_nth_cpu() code
- Generalize numa_map_to_online_node()
Energy scheduling improvements:
- Remove the EM_MAX_COMPLEXITY limit
- Add tracepoints to track energy computation
- Make the behavior of the 'sched_energy_aware' sysctl more
consistent
- Consolidate and clean up access to a CPU's max compute capacity
- Fix uclamp code corner cases
RT scheduling improvements:
- Drive dl_rq->overloaded with dl_rq->pushable_dl_tasks updates
- Drive the ->rto_mask with rt_rq->pushable_tasks updates
Scheduler scalability improvements:
- Rate-limit updates to tg->load_avg
- On x86 disable IBRS when CPU is offline to improve single-threaded
performance
- Micro-optimize in_task() and in_interrupt()
- Micro-optimize the PSI code
- Avoid updating PSI triggers and ->rtpoll_total when there are no
state changes
Core scheduler infrastructure improvements:
- Use saved_state to reduce some spurious freezer wakeups
- Bring in a handful of fast-headers improvements to scheduler
headers
- Make the scheduler UAPI headers more widely usable by user-space
- Simplify the control flow of scheduler syscalls by using lock
guards
- Fix sched_setaffinity() vs. CPU hotplug race
Scheduler debuggability improvements:
- Disallow writing invalid values to sched_rt_period_us
- Fix a race in the rq-clock debugging code triggering warnings
- Fix a warning in the bandwidth distribution code
- Micro-optimize in_atomic_preempt_off() checks
- Enforce that the tasklist_lock is held in for_each_thread()
- Print the TGID in sched_show_task()
- Remove the /proc/sys/kernel/sched_child_runs_first sysctl
... and misc cleanups & fixes"
* tag 'sched-core-2023-10-28' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: (82 commits)
sched/fair: Remove SIS_PROP
sched/fair: Use candidate prev/recent_used CPU if scanning failed for cluster wakeup
sched/fair: Scan cluster before scanning LLC in wake-up path
sched: Add cpus_share_resources API
sched/core: Fix RQCF_ACT_SKIP leak
sched/fair: Remove unused 'curr' argument from pick_next_entity()
sched/nohz: Update comments about NEWILB_KICK
sched/fair: Remove duplicate #include
sched/psi: Update poll => rtpoll in relevant comments
sched: Make PELT acronym definition searchable
sched: Fix stop_one_cpu_nowait() vs hotplug
sched/psi: Bail out early from irq time accounting
sched/topology: Rename 'DIE' domain to 'PKG'
sched/psi: Delete the 'update_total' function parameter from update_triggers()
sched/psi: Avoid updating PSI triggers and ->rtpoll_total when there are no state changes
sched/headers: Remove comment referring to rq::cpu_load, since this has been removed
sched/numa: Complete scanning of inactive VMAs when there is no alternative
sched/numa: Complete scanning of partial VMAs regardless of PID activity
sched/numa: Move up the access pid reset logic
sched/numa: Trace decisions related to skipping VMAs
...
Diffstat (limited to 'kernel/sched')
-rw-r--r-- | kernel/sched/build_utility.c | 1 | ||||
-rw-r--r-- | kernel/sched/core.c | 645 | ||||
-rw-r--r-- | kernel/sched/cpudeadline.c | 2 | ||||
-rw-r--r-- | kernel/sched/deadline.c | 73 | ||||
-rw-r--r-- | kernel/sched/debug.c | 7 | ||||
-rw-r--r-- | kernel/sched/fair.c | 432 | ||||
-rw-r--r-- | kernel/sched/features.h | 1 | ||||
-rw-r--r-- | kernel/sched/idle.c | 4 | ||||
-rw-r--r-- | kernel/sched/pelt.c | 2 | ||||
-rw-r--r-- | kernel/sched/psi.c | 58 | ||||
-rw-r--r-- | kernel/sched/rt.c | 95 | ||||
-rw-r--r-- | kernel/sched/sched.h | 45 | ||||
-rw-r--r-- | kernel/sched/stop_task.c | 4 | ||||
-rw-r--r-- | kernel/sched/topology.c | 215 |
14 files changed, 740 insertions, 844 deletions
diff --git a/kernel/sched/build_utility.c b/kernel/sched/build_utility.c index 99bdd96f454f..80a3df49ab47 100644 --- a/kernel/sched/build_utility.c +++ b/kernel/sched/build_utility.c @@ -34,7 +34,6 @@ #include <linux/nospec.h> #include <linux/proc_fs.h> #include <linux/psi.h> -#include <linux/psi.h> #include <linux/ptrace_api.h> #include <linux/sched_clock.h> #include <linux/security.h> diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 9b075b541865..81885748871d 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -85,7 +85,6 @@ #include "sched.h" #include "stats.h" -#include "autogroup.h" #include "autogroup.h" #include "pelt.h" @@ -114,6 +113,7 @@ EXPORT_TRACEPOINT_SYMBOL_GPL(sched_overutilized_tp); EXPORT_TRACEPOINT_SYMBOL_GPL(sched_util_est_cfs_tp); EXPORT_TRACEPOINT_SYMBOL_GPL(sched_util_est_se_tp); EXPORT_TRACEPOINT_SYMBOL_GPL(sched_update_nr_running_tp); +EXPORT_TRACEPOINT_SYMBOL_GPL(sched_compute_energy_tp); DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues); @@ -919,14 +919,13 @@ static bool set_nr_if_polling(struct task_struct *p) struct thread_info *ti = task_thread_info(p); typeof(ti->flags) val = READ_ONCE(ti->flags); - for (;;) { + do { if (!(val & _TIF_POLLING_NRFLAG)) return false; if (val & _TIF_NEED_RESCHED) return true; - if (try_cmpxchg(&ti->flags, &val, val | _TIF_NEED_RESCHED)) - break; - } + } while (!try_cmpxchg(&ti->flags, &val, val | _TIF_NEED_RESCHED)); + return true; } @@ -1480,16 +1479,12 @@ static void __uclamp_update_util_min_rt_default(struct task_struct *p) static void uclamp_update_util_min_rt_default(struct task_struct *p) { - struct rq_flags rf; - struct rq *rq; - if (!rt_task(p)) return; /* Protect updates to p->uclamp_* */ - rq = task_rq_lock(p, &rf); + guard(task_rq_lock)(p); __uclamp_update_util_min_rt_default(p); - task_rq_unlock(rq, p, &rf); } static inline struct uclamp_se @@ -1785,9 +1780,8 @@ static void uclamp_update_root_tg(void) uclamp_se_set(&tg->uclamp_req[UCLAMP_MAX], sysctl_sched_uclamp_util_max, false); - rcu_read_lock(); + guard(rcu)(); cpu_util_update_eff(&root_task_group.css); - rcu_read_unlock(); } #else static void uclamp_update_root_tg(void) { } @@ -1814,10 +1808,9 @@ static void uclamp_sync_util_min_rt_default(void) smp_mb__after_spinlock(); read_unlock(&tasklist_lock); - rcu_read_lock(); + guard(rcu)(); for_each_process_thread(g, p) uclamp_update_util_min_rt_default(p); - rcu_read_unlock(); } static int sysctl_sched_uclamp_handler(struct ctl_table *table, int write, @@ -2218,10 +2211,10 @@ static inline void check_class_changed(struct rq *rq, struct task_struct *p, p->sched_class->prio_changed(rq, p, oldprio); } -void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags) +void wakeup_preempt(struct rq *rq, struct task_struct *p, int flags) { if (p->sched_class == rq->curr->sched_class) - rq->curr->sched_class->check_preempt_curr(rq, p, flags); + rq->curr->sched_class->wakeup_preempt(rq, p, flags); else if (sched_class_above(p->sched_class, rq->curr->sched_class)) resched_curr(rq); @@ -2239,31 +2232,21 @@ int __task_state_match(struct task_struct *p, unsigned int state) if (READ_ONCE(p->__state) & state) return 1; -#ifdef CONFIG_PREEMPT_RT if (READ_ONCE(p->saved_state) & state) return -1; -#endif + return 0; } static __always_inline int task_state_match(struct task_struct *p, unsigned int state) { -#ifdef CONFIG_PREEMPT_RT - int match; - /* - * Serialize against current_save_and_set_rtlock_wait_state() and - * current_restore_rtlock_saved_state(). + * Serialize against current_save_and_set_rtlock_wait_state(), + * current_restore_rtlock_saved_state(), and __refrigerator(). */ - raw_spin_lock_irq(&p->pi_lock); - match = __task_state_match(p, state); - raw_spin_unlock_irq(&p->pi_lock); - - return match; -#else + guard(raw_spinlock_irq)(&p->pi_lock); return __task_state_match(p, state); -#endif } /* @@ -2417,10 +2400,9 @@ void migrate_disable(void) return; } - preempt_disable(); + guard(preempt)(); this_rq()->nr_pinned++; p->migration_disabled = 1; - preempt_enable(); } EXPORT_SYMBOL_GPL(migrate_disable); @@ -2444,7 +2426,7 @@ void migrate_enable(void) * Ensure stop_task runs either before or after this, and that * __set_cpus_allowed_ptr(SCA_MIGRATE_ENABLE) doesn't schedule(). */ - preempt_disable(); + guard(preempt)(); if (p->cpus_ptr != &p->cpus_mask) __set_cpus_allowed_ptr(p, &ac); /* @@ -2455,7 +2437,6 @@ void migrate_enable(void) barrier(); p->migration_disabled = 0; this_rq()->nr_pinned--; - preempt_enable(); } EXPORT_SYMBOL_GPL(migrate_enable); @@ -2527,7 +2508,7 @@ static struct rq *move_queued_task(struct rq *rq, struct rq_flags *rf, rq_lock(rq, rf); WARN_ON_ONCE(task_cpu(p) != new_cpu); activate_task(rq, p, 0); - check_preempt_curr(rq, p, 0); + wakeup_preempt(rq, p, 0); return rq; } @@ -2664,9 +2645,11 @@ static int migration_cpu_stop(void *data) * it. */ WARN_ON_ONCE(!pending->stop_pending); + preempt_disable(); task_rq_unlock(rq, p, &rf); stop_one_cpu_nowait(task_cpu(p), migration_cpu_stop, &pending->arg, &pending->stop_work); + preempt_enable(); return 0; } out: @@ -2986,12 +2969,13 @@ static int affine_move_task(struct rq *rq, struct task_struct *p, struct rq_flag complete = true; } + preempt_disable(); task_rq_unlock(rq, p, rf); - if (push_task) { stop_one_cpu_nowait(rq->cpu, push_cpu_stop, p, &rq->push_work); } + preempt_enable(); if (complete) complete_all(&pending->done); @@ -3057,12 +3041,13 @@ static int affine_move_task(struct rq *rq, struct task_struct *p, struct rq_flag if (flags & SCA_MIGRATE_ENABLE) p->migration_flags &= ~MDF_PUSH; + preempt_disable(); task_rq_unlock(rq, p, rf); - if (!stop_pending) { stop_one_cpu_nowait(cpu_of(rq), migration_cpu_stop, &pending->arg, &pending->stop_work); } + preempt_enable(); if (flags & SCA_MIGRATE_ENABLE) return 0; @@ -3409,7 +3394,7 @@ static void __migrate_swap_task(struct task_struct *p, int cpu) deactivate_task(src_rq, p, 0); set_task_cpu(p, cpu); activate_task(dst_rq, p, 0); - check_preempt_curr(dst_rq, p, 0); + wakeup_preempt(dst_rq, p, 0); rq_unpin_lock(dst_rq, &drf); rq_unpin_lock(src_rq, &srf); @@ -3516,13 +3501,11 @@ out: */ void kick_process(struct task_struct *p) { - int cpu; + guard(preempt)(); + int cpu = task_cpu(p); - preempt_disable(); - cpu = task_cpu(p); if ((cpu != smp_processor_id()) && task_curr(p)) smp_send_reschedule(cpu); - preempt_enable(); } EXPORT_SYMBOL_GPL(kick_process); @@ -3785,7 +3768,7 @@ ttwu_do_activate(struct rq *rq, struct task_struct *p, int wake_flags, } activate_task(rq, p, en_flags); - check_preempt_curr(rq, p, wake_flags); + wakeup_preempt(rq, p, wake_flags); ttwu_do_wakeup(p); @@ -3809,9 +3792,6 @@ ttwu_do_activate(struct rq *rq, struct task_struct *p, int wake_flags, if (rq->avg_idle > max) rq->avg_idle = max; - rq->wake_stamp = jiffies; - rq->wake_avg_idle = rq->avg_idle / 2; - rq->idle_stamp = 0; } #endif @@ -3856,7 +3836,7 @@ static int ttwu_runnable(struct task_struct *p, int wake_flags) * it should preempt the task that is current now. */ update_rq_clock(rq); - check_preempt_curr(rq, p, wake_flags); + wakeup_preempt(rq, p, wake_flags); } ttwu_do_wakeup(p); ret = 1; @@ -3956,6 +3936,18 @@ bool cpus_share_cache(int this_cpu, int that_cpu) return per_cpu(sd_llc_id, this_cpu) == per_cpu(sd_llc_id, that_cpu); } +/* + * Whether CPUs are share cache resources, which means LLC on non-cluster + * machines and LLC tag or L2 on machines with clusters. + */ +bool cpus_share_resources(int this_cpu, int that_cpu) +{ + if (this_cpu == that_cpu) + return true; + + return per_cpu(sd_share_id, this_cpu) == per_cpu(sd_share_id, that_cpu); +} + static inline bool ttwu_queue_cond(struct task_struct *p, int cpu) { /* @@ -4036,13 +4028,17 @@ static void ttwu_queue(struct task_struct *p, int cpu, int wake_flags) * The caller holds p::pi_lock if p != current or has preemption * disabled when p == current. * - * The rules of PREEMPT_RT saved_state: + * The rules of saved_state: * * The related locking code always holds p::pi_lock when updating * p::saved_state, which means the code is fully serialized in both cases. * - * The lock wait and lock wakeups happen via TASK_RTLOCK_WAIT. No other - * bits set. This allows to distinguish all wakeup scenarios. + * For PREEMPT_RT, the lock wait and lock wakeups happen via TASK_RTLOCK_WAIT. + * No other bits set. This allows to distinguish all wakeup scenarios. + * + * For FREEZER, the wakeup happens via TASK_FROZEN. No other bits set. This + * allows us to prevent early wakeup of tasks before they can be run on + * asymmetric ISA architectures (eg ARMv9). */ static __always_inline bool ttwu_state_match(struct task_struct *p, unsigned int state, int *success) @@ -4056,13 +4052,13 @@ bool ttwu_state_match(struct task_struct *p, unsigned int state, int *success) *success = !!(match = __task_state_match(p, state)); -#ifdef CONFIG_PREEMPT_RT /* * Saved state preserves the task state across blocking on - * an RT lock. If the state matches, set p::saved_state to - * TASK_RUNNING, but do not wake the task because it waits - * for a lock wakeup. Also indicate success because from - * the regular waker's point of view this has succeeded. + * an RT lock or TASK_FREEZABLE tasks. If the state matches, + * set p::saved_state to TASK_RUNNING, but do not wake the task + * because it waits for a lock wakeup or __thaw_task(). Also + * indicate success because from the regular waker's point of + * view this has succeeded. * * After acquiring the lock the task will restore p::__state * from p::saved_state which ensures that the regular @@ -4072,7 +4068,7 @@ bool ttwu_state_match(struct task_struct *p, unsigned int state, int *success) */ if (match < 0) p->saved_state = TASK_RUNNING; -#endif + return match > 0; } @@ -4254,7 +4250,7 @@ int try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) * Pairs with the LOCK+smp_mb__after_spinlock() on rq->lock in * __schedule(). See the comment for smp_mb__after_spinlock(). * - * A similar smb_rmb() lives in try_invoke_on_locked_down_task(). + * A similar smp_rmb() lives in __task_needs_rq_lock(). */ smp_rmb(); if (READ_ONCE(p->on_rq) && ttwu_runnable(p, wake_flags)) @@ -4871,7 +4867,7 @@ void wake_up_new_task(struct task_struct *p) activate_task(rq, p, ENQUEUE_NOCLOCK); trace_sched_wakeup_new(p); - check_preempt_curr(rq, p, WF_FORK); + wakeup_preempt(rq, p, WF_FORK); #ifdef CONFIG_SMP if (p->sched_class->task_woken) { /* @@ -5374,8 +5370,6 @@ context_switch(struct rq *rq, struct task_struct *prev, /* switch_mm_cid() requires the memory barriers above. */ switch_mm_cid(rq, prev, next); - rq->clock_update_flags &= ~(RQCF_ACT_SKIP|RQCF_REQ_SKIP); - prepare_lock_switch(rq, next, rf); /* Here we just switch the register state and the stack. */ @@ -5916,8 +5910,7 @@ static noinline void __schedule_bug(struct task_struct *prev) print_modules(); if (irqs_disabled()) print_irqtrace_events(prev); - if (IS_ENABLED(CONFIG_DEBUG_PREEMPT) - && in_atomic_preempt_off()) { + if (IS_ENABLED(CONFIG_DEBUG_PREEMPT)) { pr_err("Preemption disabled at:"); print_ip_sym(KERN_ERR, preempt_disable_ip); } @@ -6368,8 +6361,9 @@ static void sched_core_balance(struct rq *rq) struct sched_domain *sd; int cpu = cpu_of(rq); - preempt_disable(); - rcu_read_lock(); + guard(preempt)(); + guard(rcu)(); + raw_spin_rq_unlock_irq(rq); for_each_domain(cpu, sd) { if (need_resched()) @@ -6379,8 +6373,6 @@ static void sched_core_balance(struct rq *rq) break; } raw_spin_rq_lock_irq(rq); - rcu_read_unlock(); - preempt_enable(); } static DEFINE_PER_CPU(struct balance_callback, core_balance_head); @@ -6615,6 +6607,7 @@ static void __sched notrace __schedule(unsigned int sched_mode) /* Promote REQ to ACT */ rq->clock_update_flags <<= 1; update_rq_clock(rq); + rq->clock_update_flags = RQCF_UPDATED; switch_count = &prev->nivcsw; @@ -6694,8 +6687,6 @@ static void __sched notrace __schedule(unsigned int sched_mode) /* Also unlocks the rq: */ rq = context_switch(rq, prev, next, &rf); } else { - rq->clock_update_flags &= ~(RQCF_ACT_SKIP|RQCF_REQ_SKIP); - rq_unpin_lock(rq, &rf); __balance_callbacks(rq); raw_spin_rq_unlock_irq(rq); @@ -6734,12 +6725,10 @@ static inline void sched_submit_work(struct task_struct *tsk) * If a worker goes to sleep, notify and ask workqueue whether it * wants to wake up a task to maintain concurrency. */ - if (task_flags & (PF_WQ_WORKER | PF_IO_WORKER)) { - if (task_flags & PF_WQ_WORKER) - wq_worker_sleeping(tsk); - else - io_wq_worker_sleeping(tsk); - } + if (task_flags & PF_WQ_WORKER) + wq_worker_sleeping(tsk); + else if (task_flags & PF_IO_WORKER) + io_wq_worker_sleeping(tsk); /* * spinlock and rwlock must not flush block requests. This will @@ -7225,9 +7214,8 @@ static inline int rt_effective_prio(struct task_struct *p, int prio) void set_user_nice(struct task_struct *p, long nice) { bool queued, running; - int old_prio; - struct rq_flags rf; struct rq *rq; + int old_prio; if (task_nice(p) == nice || nice < MIN_NICE || nice > MAX_NICE) return; @@ -7235,7 +7223,9 @@ void set_user_nice(struct task_struct *p, long nice) * We have to be careful, if called from sys_setpriority(), * the task might be in the middle of scheduling on another CPU. */ - rq = task_rq_lock(p, &rf); + CLASS(task_rq_lock, rq_guard)(p); + rq = rq_guard.rq; + update_rq_clock(rq); /* @@ -7246,8 +7236,9 @@ void set_user_nice(struct task_struct *p, long nice) */ if (task_has_dl_policy(p) || task_has_rt_policy(p)) { p->static_prio = NICE_TO_PRIO(nice); - goto out_unlock; + return; } + queued = task_on_rq_queued(p); running = task_current(rq, p); if (queued) @@ -7270,9 +7261,6 @@ void set_user_nice(struct task_struct *p, long nice) * lowered its priority, then reschedule its CPU: */ p->sched_class->prio_changed(rq, p, old_prio); - -out_unlock: - task_rq_unlock(rq, p, &rf); } EXPORT_SYMBOL(set_user_nice); @@ -7545,6 +7533,21 @@ static struct task_struct *find_process_by_pid(pid_t pid) return pid ? find_task_by_vpid(pid) : current; } +static struct task_struct *find_get_task(pid_t pid) +{ + struct task_struct *p; + guard(rcu)(); + + p = find_process_by_pid(pid); + if (likely(p)) + get_task_struct(p); + + return p; +} + +DEFINE_CLASS(find_get_task, struct task_struct *, if (_T) put_task_struct(_T), + find_get_task(pid), pid_t pid) + /* * sched_setparam() passes in -1 for its policy, to let the functions * it calls know not to change it. @@ -7582,14 +7585,11 @@ static void __setscheduler_params(struct task_struct *p, static bool check_same_owner(struct task_struct *p) { const struct cred *cred = current_cred(), *pcred; - bool match; + guard(rcu)(); - rcu_read_lock(); pcred = __task_cred(p); - match = (uid_eq(cred->euid, pcred->euid) || - uid_eq(cred->euid, pcred->uid)); - rcu_read_unlock(); - return match; + return (uid_eq(cred->euid, pcred->euid) || + uid_eq(cred->euid, pcred->uid)); } /* @@ -8001,27 +8001,17 @@ static int do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param) { struct sched_param lparam; - struct task_struct *p; - int retval; if (!param || pid < 0) return -EINVAL; if (copy_from_user(&lparam, param, sizeof(struct sched_param))) return -EFAULT; - rcu_read_lock(); - retval = -ESRCH; - p = find_process_by_pid(pid); - if (likely(p)) - get_task_struct(p); - rcu_read_unlock(); - - if (likely(p)) { - retval = sched_setscheduler(p, policy, &lparam); - put_task_struct(p); - } + CLASS(find_get_task, p)(pid); + if (!p) + return -ESRCH; - return retval; + return sched_setscheduler(p, policy, &lparam); } /* @@ -8117,7 +8107,6 @@ SYSCALL_DEFINE3(sched_setattr, pid_t, pid, struct sched_attr __user *, uattr, unsigned int, flags) { struct sched_attr attr; - struct task_struct *p; int retval; if (!uattr || pid < 0 || flags) @@ -8132,21 +8121,14 @@ SYSCALL_DEFINE3(sched_setattr, pid_t, pid, struct sched_attr __user *, uattr, if (attr.sched_flags & SCHED_FLAG_KEEP_POLICY) attr.sched_policy = SETPARAM_POLICY; - rcu_read_lock(); - retval = -ESRCH; - p = find_process_by_pid(pid); - if (likely(p)) - get_task_struct(p); - rcu_read_unlock(); + CLASS(find_get_task, p)(pid); + if (!p) + return -ESRCH; - if (likely(p)) { - if (attr.sched_flags & SCHED_FLAG_KEEP_PARAMS) - get_params(p, &attr); - retval = sched_setattr(p, &attr); - put_task_struct(p); - } + if (attr.sched_flags & SCHED_FLAG_KEEP_PARAMS) + get_params(p, &attr); - return retval; + return sched_setattr(p, &attr); } /** @@ -8164,16 +8146,17 @@ SYSCALL_DEFINE1(sched_getscheduler, pid_t, pid) if (pid < 0) return -EINVAL; - retval = -ESRCH; - rcu_read_lock(); + guard(rcu)(); p = find_process_by_pid(pid); - if (p) { - retval = security_task_getscheduler(p); - if (!retval) - retval = p->policy - | (p->sched_reset_on_fork ? SCHED_RESET_ON_FORK : 0); + if (!p) + return -ESRCH; + + retval = security_task_getscheduler(p); + if (!retval) { + retval = p->policy; + if (p->sched_reset_on_fork) + retval |= SCHED_RESET_ON_FORK; } - rcu_read_unlock(); return retval; } @@ -8194,30 +8177,23 @@ SYSCALL_DEFINE2(sched_getparam, pid_t, pid, struct sched_param __user *, param) if (!param || pid < 0) return -EINVAL; - rcu_read_lock(); - p = find_process_by_pid(pid); - retval = -ESRCH; - if (!p) - goto out_unlock; + scoped_guard (rcu) { + p = find_process_by_pid(pid); + if (!p) + return -ESRCH; - retval = security_task_getscheduler(p); - if (retval) - goto out_unlock; + retval = security_task_getscheduler(p); + if (retval) + return retval; - if (task_has_rt_policy(p)) - lp.sched_priority = p->rt_priority; - rcu_read_unlock(); + if (task_has_rt_policy(p)) + lp.sched_priority = p->rt_priority; + } /* * This one might sleep, we cannot do it with a spinlock held ... */ - retval = copy_to_user(param, &lp, sizeof(*param)) ? -EFAULT : 0; - - return retval; - -out_unlock: - rcu_read_unlock(); - return retval; + return copy_to_user(param, &lp, sizeof(*param)) ? -EFAULT : 0; } /* @@ -8277,46 +8253,38 @@ SYSCALL_DEFINE4(sched_getattr, pid_t, pid, struct sched_attr __user *, uattr, usize < SCHED_ATTR_SIZE_VER0 || flags) return -EINVAL; - rcu_read_lock(); - p = find_process_by_pid(pid); - retval = -ESRCH; - if (!p) - goto out_unlock; + scoped_guard (rcu) { + p = find_process_by_pid(pid); + if (!p) + return -ESRCH; - retval = security_task_getscheduler(p); - if (retval) - goto out_unlock; + retval = security_task_getscheduler(p); + if (retval) + return retval; - kattr.sched_policy = p->policy; - if (p->sched_reset_on_fork) - kattr.sched_flags |= SCHED_FLAG_RESET_ON_FORK; - get_params(p, &kattr); - kattr.sched_flags &= SCHED_FLAG_ALL; + kattr.sched_policy = p->policy; + if (p->sched_reset_on_fork) + kattr.sched_flags |= SCHED_FLAG_RESET_ON_FORK; + get_params(p, &kattr); + kattr.sched_flags &= SCHED_FLAG_ALL; #ifdef CONFIG_UCLAMP_TASK - /* - * This could race with another potential updater, but this is fine - * because it'll correctly read the old or the new value. We don't need - * to guarantee who wins the race as long as it doesn't return garbage. - */ - kattr.sched_util_min = p->uclamp_req[UCLAMP_MIN].value; - kattr.sched_util_max = p->uclamp_req[UCLAMP_MAX].value; + /* + * This could race with another potential updater, but this is fine + * because it'll correctly read the old or the new value. We don't need + * to guarantee who wins the race as long as it doesn't return garbage. + */ + kattr.sched_util_min = p->uclamp_req[UCLAMP_MIN].value; + kattr.sched_util_max = p->uclamp_req[UCLAMP_MAX].value; #endif - - rcu_read_unlock(); + } return sched_attr_copy_to_user(uattr, &kattr, usize); - -out_unlock: - rcu_read_unlock(); - return retval; } #ifdef CONFIG_SMP int dl_task_check_affinity(struct task_struct *p, const struct cpumask *mask) { - int ret = 0; - /* * If the task isn't a deadline task or admission control is * disabled then we don't care about affinity changes. @@ -8330,11 +8298,11 @@ int dl_task_check_affinity(struct task_struct *p, const struct cpumask *mask) * tasks allowed to run on all the CPUs in the task's * root_domain. */ - rcu_read_lock(); + guard(rcu)(); if (!cpumask_subset(task_rq(p)->rd->span, mask)) - ret = -EBUSY; - rcu_read_unlock(); - return ret; + return -EBUSY; + + return 0; } #endif @@ -8404,39 +8372,24 @@ long sched_setaffinity(pid_t pid, const struct cpumask *in_mask) { struct affinity_context ac; struct cpumask *user_mask; - struct task_struct *p; int retval; - rcu_read_lock(); - - p = find_process_by_pid(pid); - if (!p) { - rcu_read_unlock(); + CLASS(find_get_task, p)(pid); + if (!p) return -ESRCH; - } - - /* Prevent p going away */ - get_task_struct(p); - rcu_read_unlock(); - if (p->flags & PF_NO_SETAFFINITY) { - retval = -EINVAL; - goto out_put_task; - } + if (p->flags & PF_NO_SETAFFINITY) + return -EINVAL; if (!check_same_owner(p)) { - rcu_read_lock(); - if (!ns_capable(__task_cred(p)->user_ns, CAP_SYS_NICE)) { - rcu_read_unlock(); - retval = -EPERM; - goto out_put_task; - } - rcu_read_unlock(); + guard(rcu)(); + if (!ns_capable(__task_cred(p)->user_ns, CAP_SYS_NICE)) + return -EPERM; } retval = security_task_setscheduler(p); if (retval) - goto out_put_task; + return retval; /* * With non-SMP configs, user_cpus_ptr/user_mask isn't used and @@ -8446,8 +8399,7 @@ long sched_setaffinity(pid_t pid, const struct cpumask *in_mask) if (user_mask) { cpumask_copy(user_mask, in_mask); } else if (IS_ENABLED(CONFIG_SMP)) { - retval = -ENOMEM; - goto out_put_task; + return -ENOMEM; } ac = (struct affinity_context){ @@ -8459,8 +8411,6 @@ long sched_setaffinity(pid_t pid, const struct cpumask *in_mask) retval = __sched_setaffinity(p, &ac); kfree(ac.user_mask); -out_put_task: - put_task_struct(p); return retval; } @@ -8502,28 +8452,21 @@ SYSCALL_DEFINE3(sched_setaffinity, pid_t, pid, unsigned int, len, long sched_getaffinity(pid_t pid, struct cpumask *mask) { struct task_struct *p; - unsigned long flags; int retval; - rcu_read_lock(); - - retval = -ESRCH; + guard(rcu)(); p = find_process_by_pid(pid); if (!p) - goto out_unlock; + return -ESRCH; retval = security_task_getscheduler(p); if (retval) - goto out_unlock; + return retval; - raw_spin_lock_irqsave(&p->pi_lock, flags); + guard(raw_spinlock_irqsave)(&p->pi_lock); cpumask_and(mask, &p->cpus_mask, cpu_active_mask); - raw_spin_unlock_irqrestore(&p->pi_lock, flags); -out_unlock: - rcu_read_unlock(); - - return retval; + return 0; } /** @@ -8970,55 +8913,46 @@ int __sched yield_to(struct task_struct *p, bool preempt) { struct task_struct *curr = current; struct rq *rq, *p_rq; - unsigned long flags; int yielded = 0; - local_irq_save(flags); - rq = this_rq(); + scoped_guard (irqsave) { + rq = this_rq(); again: - p_rq = task_rq(p); - /* - * If we're the only runnable task on the rq and target rq also - * has only one task, there's absolutely no point in yielding. - */ - if (rq->nr_running == 1 && p_rq->nr_running == 1) { - yielded = -ESRCH; - goto out_irq; - } + p_rq = task_rq(p); + /* + * If we're the only runnable task on the rq and target rq also + * has only one task, there's absolutely no point in yielding. + */ + if (rq->nr_running == 1 && p_rq->nr_running == 1) + return -ESRCH; - double_rq_lock(rq, p_rq); - if (task_rq(p) != p_rq) { - double_rq_unlock(rq, p_rq); - goto again; - } + guard(double_rq_lock)(rq, p_rq); + if (task_rq(p) != p_rq) + goto again; - if (!curr->sched_class->yield_to_task) - goto out_unlock; + if (!curr->sched_class->yield_to_task) + return 0; - if (curr->sched_class != p->sched_class) - goto out_unlock; + if (curr->sched_class != p->sched_class) + return 0; - if (task_on_cpu(p_rq, p) || !task_is_running(p)) - goto out_unlock; + if (task_on_cpu(p_rq, p) || !task_is_running(p)) + return 0; - yielded = curr->sched_class->yield_to_task(rq, p); - if (yielded) { - schedstat_inc(rq->yld_count); - /* - * Make p's CPU reschedule; pick_next_entity takes care of - * fairness. - */ - if (preempt && rq != p_rq) - resched_curr(p_rq); + yielded = curr->sched_class->yield_to_task(rq, p); + if (yielded) { + schedstat_inc(rq->yld_count); + /* + * Make p's CPU reschedule; pick_next_entity + * takes care of fairness. + */ + if (preempt && rq != p_rq) + resched_curr(p_rq); + } } -out_unlock: - double_rq_unlock(rq, p_rq); -out_irq: - local_irq_restore(flags); - - if (yielded > 0) + if (yielded) schedule(); return yielded; @@ -9121,38 +9055,30 @@ SYSCALL_DEFINE1(sched_get_priority_min, int, policy) static int sched_rr_get_interval(pid_t pid, struct timespec64 *t) { - struct task_struct *p; - unsigned int time_slice; - struct rq_flags rf; - struct rq *rq; + unsigned int time_slice = 0; int retval; if (pid < 0) return -EINVAL; - retval = -ESRCH; - rcu_read_lock(); - p = find_process_by_pid(pid); - if (!p) - goto out_unlock; + scoped_guard (rcu) { + struct task_struct *p = find_process_by_pid(pid); + if (!p) + return -ESRCH; - retval = security_task_getscheduler(p); - if (retval) - goto out_unlock; + retval = security_task_getscheduler(p); + if (retval) + return retval; - rq = task_rq_lock(p, &rf); - time_slice = 0; - if (p->sched_class->get_rr_interval) - time_slice = p->sched_class->get_rr_interval(rq, p); - task_rq_unlock(rq, p, &rf); + scoped_guard (task_rq_lock, p) { + struct rq *rq = scope.rq; + if (p->sched_class->get_rr_interval) + time_slice = p->sched_class->get_rr_interval(rq, p); + } + } - rcu_read_unlock(); jiffies_to_timespec64(time_slice, t); return 0; - -out_unlock: - rcu_read_unlock(); - return retval; } /** @@ -9211,9 +9137,9 @@ void sched_show_task(struct task_struct *p) if (pid_alive(p)) ppid = task_pid_nr(rcu_dereference(p->real_parent)); rcu_read_unlock(); - pr_cont(" stack:%-5lu pid:%-5d ppid:%-6d flags:0x%08lx\n", - free, task_pid_nr(p), ppid, - read_task_thread_flags(p)); + pr_cont(" stack:%-5lu pid:%-5d tgid:%-5d ppid:%-6d flags:0x%08lx\n", + free, task_pid_nr(p), task_tgid_nr(p), + ppid, read_task_thread_flags(p)); print_worker_info(KERN_INFO, p); print_stop_info(KERN_INFO, p); @@ -9543,9 +9469,11 @@ static void balance_push(struct rq *rq) * Temporarily drop rq->lock such that we can wake-up the stop task. * Both preemption and IRQs are still disabled. */ + preempt_disable(); raw_spin_rq_unlock(rq); stop_one_cpu_nowait(rq->cpu, __balance_push_cpu_stop, push_task, this_cpu_ptr(&push_work)); + preempt_enable(); /* * At this point need_resched() is true and we'll take the loop in * schedule(). The next pick is obviously going to be the stop task @@ -10051,7 +9979,7 @@ void __init sched_init(void) #ifdef CONFIG_SMP rq->sd = NULL; rq->rd = NULL; - rq->cpu_capacity = rq->cpu_capacity_orig = SCHED_CAPACITY_SCALE; + rq->cpu_capacity = SCHED_CAPACITY_SCALE; rq->balance_callback = &balance_push_callback; rq->active_balance = 0; rq->next_balance = jiffies; @@ -10060,8 +9988,6 @@ void __init sched_init(void) rq->online = 0; rq->idle_stamp = 0; rq->avg_idle = 2*sysctl_sched_migration_cost; - rq->wake_stamp = jiffies; - rq->wake_avg_idle = rq->avg_idle; rq->max_idle_balance_cost = sysctl_sched_migration_cost; INIT_LIST_HEAD(&rq->cfs_tasks); @@ -10536,17 +10462,18 @@ void sched_move_task(struct task_struct *tsk) int queued, running, queue_flags = DEQUEUE_SAVE | DEQUEUE_MOVE | DEQUEUE_NOCLOCK; struct task_group *group; - struct rq_flags rf; struct rq *rq; - rq = task_rq_lock(tsk, &rf); + CLASS(task_rq_lock, rq_guard)(tsk); + rq = rq_guard.rq; + /* * Esp. with SCHED_AUTOGROUP enabled it is possible to get superfluous * group changes. */ group = sched_get_task_group(tsk); if (group == tsk->sched_task_group) - goto unlock; + return; update_rq_clock(rq); @@ -10571,9 +10498,6 @@ void sched_move_task(struct task_struct *tsk) */ resched_curr(rq); } - -unlock: - task_rq_unlock(rq, tsk, &rf); } static inline struct task_group *css_tg(struct cgroup_subsys_state *css) @@ -10610,11 +10534,9 @@ static int cpu_cgroup_css_online(struct cgroup_subsys_state *css) #ifdef CONFIG_UCLAMP_TASK_GROUP /* Propagate the effective uclamp value for the new group */ - mutex_lock(&uclamp_mutex); - rcu_read_lock(); + guard(mutex)(&uclamp_mutex); + guard(rcu)(); cpu_util_update_eff(css); - rcu_read_unlock(); - mutex_unlock(&uclamp_mutex); #endif return 0; @@ -10765,8 +10687,8 @@ static ssize_t cpu_uclamp_write(struct kernfs_open_file *of, char *buf, static_branch_enable(&sched_uclamp_used); - mutex_lock(&uclamp_mutex); - rcu_read_lock(); + guard(mutex)(&uclamp_mutex); + guard(rcu)(); tg = css_tg(of_css(of)); if (tg->uclamp_req[clamp_id].value != req.util) @@ -10781,9 +10703,6 @@ static ssize_t cpu_uclamp_write(struct kernfs_open_file *of, char *buf, /* Update effective clamps to track the most restrictive value */ cpu_util_update_eff(of_css(of)); - rcu_read_unlock(); - mutex_unlock(&uclamp_mutex); - return nbytes; } @@ -10809,10 +10728,10 @@ static inline void cpu_uclamp_print(struct seq_file *sf, u64 percent; u32 rem; - rcu_read_lock(); - tg = css_tg(seq_css(sf)); - util_clamp = tg->uclamp_req[clamp_id].value; - rcu_read_unlock(); + scoped_guard (rcu) { + tg = css_tg(seq_css(sf)); + util_clamp = tg->uclamp_req[clamp_id].value; + } if (util_clamp == SCHED_CAPACITY_SCALE) { seq_puts(sf, "max\n"); @@ -10903,11 +10822,12 @@ static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota, * Prevent race between setting of cfs_rq->runtime_enabled and * unthrottle_offline_cfs_rqs(). */ - cpus_read_lock(); - mutex_lock(&cfs_constraints_mutex); + guard(cpus_read_lock)(); + guard(mutex)(&cfs_constraints_mutex); + ret = __cfs_schedulable(tg, period, quota); if (ret) - goto out_unlock; + return ret; runtime_enabled = quota != RUNTIME_INF; runtime_was_enabled = cfs_b->quota != RUNTIME_INF; @@ -10917,39 +10837,38 @@ static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota, */ if (runtime_enabled && !runtime_was_enabled) cfs_bandwidth_usage_inc(); - raw_spin_lock_irq(&cfs_b->lock); - cfs_b->period = ns_to_ktime(period); - cfs_b->quota = quota; - cfs_b->burst = burst; - __refill_cfs_bandwidth_runtime(cfs_b); + scoped_guard (raw_spinlock_irq, &cfs_b->lock) { + cfs_b->period = ns_to_ktime(period); + cfs_b->quota = quota; + cfs_b->burst = burst; - /* Restart the period timer (if active) to handle new period expiry: */ - if (runtime_enabled) - start_cfs_bandwidth(cfs_b); + __refill_cfs_bandwidth_runtime(cfs_b); - raw_spin_unlock_irq(&cfs_b->lock); + /* + * Restart the period timer (if active) to handle new + * period expiry: + */ + if (runtime_enabled) + start_cfs_bandwidth(cfs_b); + } for_each_online_cpu(i) { struct cfs_rq *cfs_rq = tg->cfs_rq[i]; struct rq *rq = cfs_rq->rq; - struct rq_flags rf; - rq_lock_irq(rq, &rf); + guard(rq_lock_irq)(rq); cfs_rq->runtime_enabled = runtime_enabled; cfs_rq->runtime_remaining = 0; if (cfs_rq->throttled) unthrottle_cfs_rq(cfs_rq); - rq_unlock_irq(rq, &rf); } + if (runtime_was_enabled && !runtime_enabled) cfs_bandwidth_usage_dec(); -out_unlock: - mutex_unlock(&cfs_constraints_mutex); - cpus_read_unlock(); - return ret; + return 0; } static int tg_set_cfs_quota(struct task_group *tg, long cfs_quota_us) @@ -11134,7 +11053,6 @@ static int tg_cfs_schedulable_down(struct task_group *tg, void *data) static int __cfs_schedulable(struct task_group *tg, u64 period, u64 quota) { - int ret; struct cfs_schedulable_data data = { .tg = tg, .period = period, @@ -11146,11 +11064,8 @@ static int __cfs_schedulable(struct task_group *tg, u64 period, u64 quota) do_div(data.quota, NSEC_PER_USEC); } - rcu_read_lock(); - ret = walk_tg_tree(tg_cfs_schedulable_down, tg_nop, &data); - rcu_read_unlock(); - - return ret; + guard(rcu)(); + return walk_tg_tree(tg_cfs_schedulable_down, tg_nop, &data); } static int cpu_cfs_stat_show(struct seq_file *sf, void *v) @@ -11755,14 +11670,12 @@ int __sched_mm_cid_migrate_from_fetch_cid(struct rq *src_rq, * are not the last task to be migrated from this cpu for this mm, so * there is no need to move src_cid to the destination cpu. */ - rcu_read_lock(); + guard(rcu)(); src_task = rcu_dereference(src_rq->curr); if (READ_ONCE(src_task->mm_cid_active) && src_task->mm == mm) { - rcu_read_unlock(); t->last_mm_cid = -1; return -1; } - rcu_read_unlock(); return src_cid; } @@ -11806,18 +11719,17 @@ int __sched_mm_cid_migrate_from_try_steal_cid(struct rq *src_rq, * the lazy-put flag, this task will be responsible for transitioning * from lazy-put flag set to MM_CID_UNSET. */ - rcu_read_lock(); - src_task = rcu_dereference(src_rq->curr); - if (READ_ONCE(src_task->mm_cid_active) && src_task->mm == mm) { - rcu_read_unlock(); - /* - * We observed an active task for this mm, there is therefore - * no point in moving this cid to the destination cpu. - */ - t->last_mm_cid = -1; - return -1; + scoped_guard (rcu) { + src_task = rcu_dereference(src_rq->curr); + if (READ_ONCE(src_task->mm_cid_active) && src_task->mm == mm) { + /* + * We observed an active task for this mm, there is therefore + * no point in moving this cid to the destination cpu. + */ + t->last_mm_cid = -1; + return -1; + } } - rcu_read_unlock(); /* * The src_cid is unused, so it can be unset. @@ -11890,7 +11802,6 @@ static void sched_mm_cid_remote_clear(struct mm_struct *mm, struct mm_cid *pcpu_ { struct rq *rq = cpu_rq(cpu); struct task_struct *t; - unsigned long flags; int cid, lazy_cid; cid = READ_ONCE(pcpu_cid->cid); @@ -11925,23 +11836,21 @@ static void sched_mm_cid_remote_clear(struct mm_struct *mm, struct mm_cid *pcpu_ * the lazy-put flag, that task will be responsible for transitioning * from lazy-put flag set to MM_CID_UNSET. */ - rcu_read_lock(); - t = rcu_dereference(rq->curr); - if (READ_ONCE(t->mm_cid_active) && t->mm == mm) { - rcu_read_unlock(); - return; + scoped_guard (rcu) { + t = rcu_dereference(rq->curr); + if (READ_ONCE(t->mm_cid_active) && t->mm == mm) + return; } - rcu_read_unlock(); /* * The cid is unused, so it can be unset. * Disable interrupts to keep the window of cid ownership without rq * lock small. */ - local_irq_save(flags); - if (try_cmpxchg(&pcpu_cid->cid, &lazy_cid, MM_CID_UNSET)) - __mm_cid_put(mm, cid); - local_irq_restore(flags); + scoped_guard (irqsave) { + if (try_cmpxchg(&pcpu_cid->cid, &lazy_cid, MM_CID_UNSET)) + __mm_cid_put(mm, cid); + } } static void sched_mm_cid_remote_clear_old(struct mm_struct *mm, int cpu) @@ -11963,14 +11872,13 @@ static void sched_mm_cid_remote_clear_old(struct mm_struct *mm, int cpu) * snapshot associated with this cid if an active task using the mm is * observed on this rq. */ - rcu_read_lock(); - curr = rcu_dereference(rq->curr); - if (READ_ONCE(curr->mm_cid_active) && curr->mm == mm) { - WRITE_ONCE(pcpu_cid->time, rq_clock); - rcu_read_unlock(); - return; + scoped_guard (rcu) { + curr = rcu_dereference(rq->curr); + if (READ_ONCE(curr->mm_cid_active) && curr->mm == mm) { + WRITE_ONCE(pcpu_cid->time, rq_clock); + return; + } } - rcu_read_unlock(); if (rq_clock < pcpu_cid->time + SCHED_MM_CID_PERIOD_NS) return; @@ -12064,7 +11972,6 @@ void task_tick_mm_cid(struct rq *rq, struct task_struct *curr) void sched_mm_cid_exit_signals(struct task_struct *t) { struct mm_struct *mm = t->mm; - struct rq_flags rf; struct rq *rq; if (!mm) @@ -12072,7 +11979,7 @@ void sched_mm_cid_exit_signals(struct task_struct *t) preempt_disable(); rq = this_rq(); - rq_lock_irqsave(rq, &rf); + guard(rq_lock_irqsave)(rq); preempt_enable_no_resched(); /* holding spinlock */ WRITE_ONCE(t->mm_cid_active, 0); /* @@ -12082,13 +11989,11 @@ void sched_mm_cid_exit_signals(struct task_struct *t) smp_mb(); mm_cid_put(mm); t->last_mm_cid = t->mm_cid = -1; - rq_unlock_irqrestore(rq, &rf); } void sched_mm_cid_before_execve(struct task_struct *t) { struct mm_struct *mm = t->mm; - struct rq_flags rf; struct rq *rq; if (!mm) @@ -12096,7 +12001,7 @@ void sched_mm_cid_before_execve(struct task_struct *t) preempt_disable(); rq = this_rq(); - rq_lock_irqsave(rq, &rf); + guard(rq_lock_irqsave)(rq); preempt_enable_no_resched(); /* holding spinlock */ WRITE_ONCE(t->mm_cid_active, 0); /* @@ -12106,13 +12011,11 @@ void sched_mm_cid_before_execve(struct task_struct *t) smp_mb(); mm_cid_put(mm); t->last_mm_cid = t->mm_cid = -1; - rq_unlock_irqrestore(rq, &rf); } void sched_mm_cid_after_execve(struct task_struct *t) { struct mm_struct *mm = t->mm; - struct rq_flags rf; struct rq *rq; if (!mm) @@ -12120,16 +12023,16 @@ void sched_mm_cid_after_execve(struct task_struct *t) preempt_disable(); rq = this_rq(); - rq_lock_irqsave(rq, &rf); - preempt_enable_no_resched(); /* holding spinlock */ - WRITE_ONCE(t->mm_cid_active, 1); - /* - * Store t->mm_cid_active before loading per-mm/cpu cid. - * Matches barrier in sched_mm_cid_remote_clear_old(). - */ - smp_mb(); - t->last_mm_cid = t->mm_cid = mm_cid_get(rq, mm); - rq_unlock_irqrestore(rq, &rf); + scoped_guard (rq_lock_irqsave, rq) { + preempt_enable_no_resched(); /* holding spinlock */ + WRITE_ONCE(t->mm_cid_active, 1); + /* + * Store t->mm_cid_active before loading per-mm/cpu cid. + * Matches barrier in sched_mm_cid_remote_clear_old(). + */ + smp_mb(); + t->last_mm_cid = t->mm_cid = mm_cid_get(rq, mm); + } rseq_set_notify_resume(t); } diff --git a/kernel/sched/cpudeadline.c b/kernel/sched/cpudeadline.c index 57c92d751bcd..95baa12a1029 100644 --- a/kernel/sched/cpudeadline.c +++ b/kernel/sched/cpudeadline.c @@ -131,7 +131,7 @@ int cpudl_find(struct cpudl *cp, struct task_struct *p, if (!dl_task_fits_capacity(p, cpu)) { cpumask_clear_cpu(cpu, later_mask); - cap = capacity_orig_of(cpu); + cap = arch_scale_cpu_capacity(cpu); if (cap > max_cap || (cpu == task_cpu(p) && cap == max_cap)) { diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index 58b542bf2893..b28114478b82 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -132,7 +132,7 @@ static inline unsigned long __dl_bw_capacity(const struct cpumask *mask) int i; for_each_cpu_and(i, mask, cpu_active_mask) - cap += capacity_orig_of(i); + cap += arch_scale_cpu_capacity(i); return cap; } @@ -144,7 +144,7 @@ static inline unsigned long __dl_bw_capacity(const struct cpumask *mask) static inline unsigned long dl_bw_capacity(int i) { if (!sched_asym_cpucap_active() && - capacity_orig_of(i) == SCHED_CAPACITY_SCALE) { + arch_scale_cpu_capacity(i) == SCHED_CAPACITY_SCALE) { return dl_bw_cpus(i) << SCHED_CAPACITY_SHIFT; } else { RCU_LOCKDEP_WARN(!rcu_read_lock_sched_held(), @@ -509,7 +509,6 @@ void init_dl_rq(struct dl_rq *dl_rq) /* zero means no -deadline tasks */ dl_rq->earliest_dl.curr = dl_rq->earliest_dl.next = 0; - dl_rq->dl_nr_migratory = 0; dl_rq->overloaded = 0; dl_rq->pushable_dl_tasks_root = RB_ROOT_CACHED; #else @@ -553,39 +552,6 @@ static inline void dl_clear_overload(struct rq *rq) cpumask_clear_cpu(rq->cpu, rq->rd->dlo_mask); } -static void update_dl_migration(struct dl_rq *dl_rq) -{ - if (dl_rq->dl_nr_migratory && dl_rq->dl_nr_running > 1) { - if (!dl_rq->overloaded) { - dl_set_overload(rq_of_dl_rq(dl_rq)); - dl_rq->overloaded = 1; - } - } else if (dl_rq->overloaded) { - dl_clear_overload(rq_of_dl_rq(dl_rq)); - dl_rq->overloaded = 0; - } -} - -static void inc_dl_migration(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq) -{ - struct task_struct *p = dl_task_of(dl_se); - - if (p->nr_cpus_allowed > 1) - dl_rq->dl_nr_migratory++; - - update_dl_migration(dl_rq); -} - -static void dec_dl_migration(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq) -{ - struct task_struct *p = dl_task_of(dl_se); - - if (p->nr_cpus_allowed > 1) - dl_rq->dl_nr_migratory--; - - update_dl_migration(dl_rq); -} - #define __node_2_pdl(node) \ rb_entry((node), struct task_struct, pushable_dl_tasks) @@ -594,6 +560,11 @@ static inline bool __pushable_less(struct rb_node *a, const struct rb_node *b) return dl_entity_preempt(&__node_2_pdl(a)->dl, &__node_2_pdl(b)->dl); } +static inline int has_pushable_dl_tasks(struct rq *rq) +{ + return !RB_EMPTY_ROOT(&rq->dl.pushable_dl_tasks_root.rb_root); +} + /* * The list of pushable -deadline task is not a plist, like in * sched_rt.c, it is an rb-tree with tasks ordered by deadline. @@ -609,6 +580,11 @@ static void enqueue_pushable_dl_task(struct rq *rq, struct task_struct *p) __pushable_less); if (leftmost) rq->dl.earliest_dl.next = p->dl.deadline; + + if (!rq->dl.overloaded) { + dl_set_overload(rq); + rq->dl.overloaded = 1; + } } static void dequeue_pushable_dl_task(struct rq *rq, struct task_struct *p) @@ -625,11 +601,11 @@ static void dequeue_pushable_dl_task(struct rq *rq, struct task_struct *p) dl_rq->earliest_dl.next = __node_2_pdl(leftmost)->dl.deadline; RB_CLEAR_NODE(&p->pushable_dl_tasks); -} -static inline int has_pushable_dl_tasks(struct rq *rq) -{ - return !RB_EMPTY_ROOT(&rq->dl.pushable_dl_tasks_root.rb_root); + if (!has_pushable_dl_tasks(rq) && rq->dl.overloaded) { + dl_clear_overload(rq); + rq->dl.overloaded = 0; + } } static int push_dl_task(struct rq *rq); @@ -763,7 +739,7 @@ static inline void deadline_queue_pull_task(struct rq *rq) static void enqueue_task_dl(struct rq *rq, struct task_struct *p, int flags); static void __dequeue_task_dl(struct rq *rq, struct task_struct *p, int flags); -static void check_preempt_curr_dl(struct rq *rq, struct task_struct *p, int flags); +static void wakeup_preempt_dl(struct rq *rq, struct task_struct *p, int flags); static inline void replenish_dl_new_period(struct sched_dl_entity *dl_se, struct rq *rq) @@ -1175,7 +1151,7 @@ static enum hrtimer_restart dl_task_timer(struct hrtimer *timer) enqueue_task_dl(rq, p, ENQUEUE_REPLENISH); if (dl_task(rq->curr)) - check_preempt_curr_dl(rq, p, 0); + wakeup_preempt_dl(rq, p, 0); else resched_curr(rq); @@ -1504,7 +1480,6 @@ void inc_dl_tasks(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq) add_nr_running(rq_of_dl_rq(dl_rq), 1); inc_dl_deadline(dl_rq, deadline); - inc_dl_migration(dl_se, dl_rq); } static inline @@ -1518,7 +1493,6 @@ void dec_dl_tasks(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq) sub_nr_running(rq_of_dl_rq(dl_rq), 1); dec_dl_deadline(dl_rq, dl_se->deadline); - dec_dl_migration(dl_se, dl_rq); } static inline bool __dl_less(struct rb_node *a, const struct rb_node *b) @@ -1939,7 +1913,7 @@ static int balance_dl(struct rq *rq, struct task_struct *p, struct rq_flags *rf) * Only called when both the current and waking task are -deadline * tasks. */ -static void check_preempt_curr_dl(struct rq *rq, struct task_struct *p, +static void wakeup_preempt_dl(struct rq *rq, struct task_struct *p, int flags) { if (dl_entity_preempt(&p->dl, &rq->curr->dl)) { @@ -2291,9 +2265,6 @@ static int push_dl_task(struct rq *rq) struct rq *later_rq; int ret = 0; - if (!rq->dl.overloaded) - return 0; - next_task = pick_next_pushable_dl_task(rq); if (!next_task) return 0; @@ -2449,9 +2420,11 @@ skip: double_unlock_balance(this_rq, src_rq); if (push_task) { + preempt_disable(); raw_spin_rq_unlock(this_rq); stop_one_cpu_nowait(src_rq->cpu, push_cpu_stop, push_task, &src_rq->push_work); + preempt_enable(); raw_spin_rq_lock(this_rq); } } @@ -2652,7 +2625,7 @@ static void switched_to_dl(struct rq *rq, struct task_struct *p) deadline_queue_push_tasks(rq); #endif if (dl_task(rq->curr)) - check_preempt_curr_dl(rq, p, 0); + wakeup_preempt_dl(rq, p, 0); else resched_curr(rq); } else { @@ -2721,7 +2694,7 @@ DEFINE_SCHED_CLASS(dl) = { .dequeue_task = dequeue_task_dl, .yield_task = yield_task_dl, - .check_preempt_curr = check_preempt_curr_dl, + .wakeup_preempt = wakeup_preempt_dl, .pick_next_task = pick_next_task_dl, .put_prev_task = put_prev_task_dl, diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index 4c3d0d9f3db6..4580a450700e 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c @@ -8,7 +8,7 @@ */ /* - * This allows printing both to /proc/sched_debug and + * This allows printing both to /sys/kernel/debug/sched/debug and * to the console */ #define SEQ_printf(m, x...) \ @@ -724,9 +724,6 @@ void print_rt_rq(struct seq_file *m, int cpu, struct rt_rq *rt_rq) SEQ_printf(m, " .%-30s: %Ld.%06ld\n", #x, SPLIT_NS(rt_rq->x)) PU(rt_nr_running); -#ifdef CONFIG_SMP - PU(rt_nr_migratory); -#endif P(rt_throttled); PN(rt_time); PN(rt_runtime); @@ -748,7 +745,6 @@ void print_dl_rq(struct seq_file *m, int cpu, struct dl_rq *dl_rq) PU(dl_nr_running); #ifdef CONFIG_SMP - PU(dl_nr_migratory); dl_bw = &cpu_rq(cpu)->rd->dl_bw; #else dl_bw = &dl_rq->dl_bw; @@ -864,7 +860,6 @@ static void sched_debug_header(struct seq_file *m) #define PN(x) \ SEQ_printf(m, " .%-40s: %Ld.%06ld\n", #x, SPLIT_NS(x)) PN(sysctl_sched_base_slice); - P(sysctl_sched_child_runs_first); P(sysctl_sched_features); #undef PN #undef P diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index df348aa55d3c..8767988242ee 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -51,8 +51,6 @@ #include <asm/switch_to.h> -#include <linux/sched/cond_resched.h> - #include "sched.h" #include "stats.h" #include "autogroup.h" @@ -78,12 +76,6 @@ unsigned int sysctl_sched_tunable_scaling = SCHED_TUNABLESCALING_LOG; unsigned int sysctl_sched_base_slice = 750000ULL; static unsigned int normalized_sysctl_sched_base_slice = 750000ULL; -/* - * After fork, child runs first. If set to 0 (default) then - * parent will (try to) run first. - */ -unsigned int sysctl_sched_child_runs_first __read_mostly; - const_debug unsigned int sysctl_sched_migration_cost = 500000UL; int sched_thermal_decay_shift; @@ -145,13 +137,6 @@ static unsigned int sysctl_numa_balancing_promote_rate_limit = 65536; #ifdef CONFIG_SYSCTL static struct ctl_table sched_fair_sysctls[] = { - { - .procname = "sched_child_runs_first", - .data = &sysctl_sched_child_runs_first, - .maxlen = sizeof(unsigned int), - .mode = 0644, - .proc_handler = proc_dointvec, - }, #ifdef CONFIG_CFS_BANDWIDTH { .procname = "sched_cfs_bandwidth_slice_us", @@ -2899,19 +2884,7 @@ static void task_numa_placement(struct task_struct *p) } /* Cannot migrate task to CPU-less node */ - if (max_nid != NUMA_NO_NODE && !node_state(max_nid, N_CPU)) { - int near_nid = max_nid; - int distance, near_distance = INT_MAX; - - for_each_node_state(nid, N_CPU) { - distance = node_distance(max_nid, nid); - if (distance < near_distance) { - near_nid = nid; - near_distance = distance; - } - } - max_nid = near_nid; - } + max_nid = numa_nearest_node(max_nid, N_CPU); if (ng) { numa_group_count_active_nodes(ng); @@ -3182,7 +3155,7 @@ static void reset_ptenuma_scan(struct task_struct *p) p->mm->numa_scan_offset = 0; } -static bool vma_is_accessed(struct vm_area_struct *vma) +static bool vma_is_accessed(struct mm_struct *mm, struct vm_area_struct *vma) { unsigned long pids; /* @@ -3194,8 +3167,20 @@ static bool vma_is_accessed(struct vm_area_struct *vma) if (READ_ONCE(current->mm->numa_scan_seq) < 2) return true; - pids = vma->numab_state->access_pids[0] | vma->numab_state->access_pids[1]; - return test_bit(hash_32(current->pid, ilog2(BITS_PER_LONG)), &pids); + pids = vma->numab_state->pids_active[0] | vma->numab_state->pids_active[1]; + if (test_bit(hash_32(current->pid, ilog2(BITS_PER_LONG)), &pids)) + return true; + + /* + * Complete a scan that has already started regardless of PID access, or + * some VMAs may never be scanned in multi-threaded applications: + */ + if (mm->numa_scan_offset > vma->vm_start) { + trace_sched_skip_vma_numa(mm, vma, NUMAB_SKIP_IGNORE_PID); + return true; + } + + return false; } #define VMA_PID_RESET_PERIOD (4 * sysctl_numa_balancing_scan_delay) @@ -3215,6 +3200,8 @@ static void task_numa_work(struct callback_head *work) unsigned long nr_pte_updates = 0; long pages, virtpages; struct vma_iterator vmi; + bool vma_pids_skipped; + bool vma_pids_forced = false; SCHED_WARN_ON(p != container_of(work, struct task_struct, numa_work)); @@ -3257,7 +3244,6 @@ static void task_numa_work(struct callback_head *work) */ p->node_stamp += 2 * TICK_NSEC; - start = mm->numa_scan_offset; pages = sysctl_numa_balancing_scan_size; pages <<= 20 - PAGE_SHIFT; /* MB in pages */ virtpages = pages * 8; /* Scan up to this much virtual space */ @@ -3267,6 +3253,16 @@ static void task_numa_work(struct callback_head *work) if (!mmap_read_trylock(mm)) return; + + /* + * VMAs are skipped if the current PID has not trapped a fault within + * the VMA recently. Allow scanning to be forced if there is no + * suitable VMA remaining. + */ + vma_pids_skipped = false; + +retry_pids: + start = mm->numa_scan_offset; vma_iter_init(&vmi, mm, start); vma = vma_next(&vmi); if (!vma) { @@ -3279,6 +3275,7 @@ static void task_numa_work(struct callback_head *work) do { if (!vma_migratable(vma) || !vma_policy_mof(vma) || is_vm_hugetlb_page(vma) || (vma->vm_flags & VM_MIXEDMAP)) { + trace_sched_skip_vma_numa(mm, vma, NUMAB_SKIP_UNSUITABLE); continue; } @@ -3289,15 +3286,19 @@ static void task_numa_work(struct callback_head *work) * as migrating the pages will be of marginal benefit. */ if (!vma->vm_mm || - (vma->vm_file && (vma->vm_flags & (VM_READ|VM_WRITE)) == (VM_READ))) + (vma->vm_file && (vma->vm_flags & (VM_READ|VM_WRITE)) == (VM_READ))) { + trace_sched_skip_vma_numa(mm, vma, NUMAB_SKIP_SHARED_RO); continue; + } /* * Skip inaccessible VMAs to avoid any confusion between * PROT_NONE and NUMA hinting ptes */ - if (!vma_is_accessible(vma)) + if (!vma_is_accessible(vma)) { + trace_sched_skip_vma_numa(mm, vma, NUMAB_SKIP_INACCESSIBLE); continue; + } /* Initialise new per-VMA NUMAB state. */ if (!vma->numab_state) { @@ -3310,8 +3311,15 @@ static void task_numa_work(struct callback_head *work) msecs_to_jiffies(sysctl_numa_balancing_scan_delay); /* Reset happens after 4 times scan delay of scan start */ - vma->numab_state->next_pid_reset = vma->numab_state->next_scan + + vma->numab_state->pids_active_reset = vma->numab_state->next_scan + msecs_to_jiffies(VMA_PID_RESET_PERIOD); + + /* + * Ensure prev_scan_seq does not match numa_scan_seq, + * to prevent VMAs being skipped prematurely on the + * first scan: + */ + vma->numab_state->prev_scan_seq = mm->numa_scan_seq - 1; } /* @@ -3319,23 +3327,35 @@ static void task_numa_work(struct callback_head *work) * delay the scan for new VMAs. */ if (mm->numa_scan_seq && time_before(jiffies, - vma->numab_state->next_scan)) + vma->numab_state->next_scan)) { + trace_sched_skip_vma_numa(mm, vma, NUMAB_SKIP_SCAN_DELAY); continue; + } - /* Do not scan the VMA if task has not accessed */ - if (!vma_is_accessed(vma)) + /* RESET access PIDs regularly for old VMAs. */ + if (mm->numa_scan_seq && + time_after(jiffies, vma->numab_state->pids_active_reset)) { + vma->numab_state->pids_active_reset = vma->numab_state->pids_active_reset + + msecs_to_jiffies(VMA_PID_RESET_PERIOD); + vma->numab_state->pids_active[0] = READ_ONCE(vma->numab_state->pids_active[1]); + vma->numab_state->pids_active[1] = 0; + } + + /* Do not rescan VMAs twice within the same sequence. */ + if (vma->numab_state->prev_scan_seq == mm->numa_scan_seq) { + mm->numa_scan_offset = vma->vm_end; + trace_sched_skip_vma_numa(mm, vma, NUMAB_SKIP_SEQ_COMPLETED); continue; + } /* - * RESET access PIDs regularly for old VMAs. Resetting after checking - * vma for recent access to avoid clearing PID info before access.. + * Do not scan the VMA if task has not accessed it, unless no other + * VMA candidate exists. */ - if (mm->numa_scan_seq && - time_after(jiffies, vma->numab_state->next_pid_reset)) { - vma->numab_state->next_pid_reset = vma->numab_state->next_pid_reset + - msecs_to_jiffies(VMA_PID_RESET_PERIOD); - vma->numab_state->access_pids[0] = READ_ONCE(vma->numab_state->access_pids[1]); - vma->numab_state->access_pids[1] = 0; + if (!vma_pids_forced && !vma_is_accessed(mm, vma)) { + vma_pids_skipped = true; + trace_sched_skip_vma_numa(mm, vma, NUMAB_SKIP_PID_INACTIVE); + continue; } do { @@ -3362,8 +3382,28 @@ static void task_numa_work(struct callback_head *work) cond_resched(); } while (end != vma->vm_end); + + /* VMA scan is complete, do not scan until next sequence. */ + vma->numab_state->prev_scan_seq = mm->numa_scan_seq; + + /* + * Only force scan within one VMA at a time, to limit the + * cost of scanning a potentially uninteresting VMA. + */ + if (vma_pids_forced) + break; } for_each_vma(vmi, vma); + /* + * If no VMAs are remaining and VMAs were skipped due to the PID + * not accessing the VMA previously, then force a scan to ensure + * forward progress: + */ + if (!vma && !vma_pids_forced && vma_pids_skipped) { + vma_pids_forced = true; + goto retry_pids; + } + out: /* * It is possible to reach the end of the VMA list but the last few @@ -3942,7 +3982,8 @@ static inline bool cfs_rq_is_decayed(struct cfs_rq *cfs_rq) */ static inline void update_tg_load_avg(struct cfs_rq *cfs_rq) { - long delta = cfs_rq->avg.load_avg - cfs_rq->tg_load_avg_contrib; + long delta; + u64 now; /* * No need to update load_avg for root_task_group as it is not used. @@ -3950,9 +3991,19 @@ static inline void update_tg_load_avg(struct cfs_rq *cfs_rq) if (cfs_rq->tg == &root_task_group) return; + /* + * For migration heavy workloads, access to tg->load_avg can be + * unbound. Limit the update rate to at most once per ms. + */ + now = sched_clock_cpu(cpu_of(rq_of(cfs_rq))); + if (now - cfs_rq->last_update_tg_load_avg < NSEC_PER_MSEC) + return; + + delta = cfs_rq->avg.load_avg - cfs_rq->tg_load_avg_contrib; if (abs(delta) > cfs_rq->tg_load_avg_contrib / 64) { atomic_long_add(delta, &cfs_rq->tg->load_avg); cfs_rq->tg_load_avg_contrib = cfs_rq->avg.load_avg; + cfs_rq->last_update_tg_load_avg = now; } } @@ -4626,22 +4677,6 @@ static inline unsigned long task_util_est(struct task_struct *p) return max(task_util(p), _task_util_est(p)); } -#ifdef CONFIG_UCLAMP_TASK -static inline unsigned long uclamp_task_util(struct task_struct *p, - unsigned long uclamp_min, - unsigned long uclamp_max) -{ - return clamp(task_util_est(p), uclamp_min, uclamp_max); -} -#else -static inline unsigned long uclamp_task_util(struct task_struct *p, - unsigned long uclamp_min, - unsigned long uclamp_max) -{ - return task_util_est(p); -} -#endif - static inline void util_est_enqueue(struct cfs_rq *cfs_rq, struct task_struct *p) { @@ -4745,7 +4780,7 @@ static inline void util_est_update(struct cfs_rq *cfs_rq, * To avoid overestimation of actual task utilization, skip updates if * we cannot grant there is idle time in this CPU. */ - if (task_util(p) > capacity_orig_of(cpu_of(rq_of(cfs_rq)))) + if (task_util(p) > arch_scale_cpu_capacity(cpu_of(rq_of(cfs_rq)))) return; /* @@ -4793,14 +4828,14 @@ static inline int util_fits_cpu(unsigned long util, return fits; /* - * We must use capacity_orig_of() for comparing against uclamp_min and + * We must use arch_scale_cpu_capacity() for comparing against uclamp_min and * uclamp_max. We only care about capacity pressure (by using * capacity_of()) for comparing against the real util. * * If a task is boosted to 1024 for example, we don't want a tiny * pressure to skew the check whether it fits a CPU or not. * - * Similarly if a task is capped to capacity_orig_of(little_cpu), it + * Similarly if a task is capped to arch_scale_cpu_capacity(little_cpu), it * should fit a little cpu even if there's some pressure. * * Only exception is for thermal pressure since it has a direct impact @@ -4812,7 +4847,7 @@ static inline int util_fits_cpu(unsigned long util, * For uclamp_max, we can tolerate a drop in performance level as the * goal is to cap the task. So it's okay if it's getting less. */ - capacity_orig = capacity_orig_of(cpu); + capacity_orig = arch_scale_cpu_capacity(cpu); capacity_orig_thermal = capacity_orig - arch_scale_thermal_pressure(cpu); /* @@ -4932,7 +4967,7 @@ static inline void update_misfit_status(struct task_struct *p, struct rq *rq) static inline bool cfs_rq_is_decayed(struct cfs_rq *cfs_rq) { - return true; + return !cfs_rq->nr_running; } #define UPDATE_TG 0x0 @@ -5267,7 +5302,7 @@ set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) * 4) do not run the "skip" process, if something else is available */ static struct sched_entity * -pick_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *curr) +pick_next_entity(struct cfs_rq *cfs_rq) { /* * Enabling NEXT_BUDDY will affect latency but not fairness. @@ -5811,13 +5846,13 @@ static void unthrottle_cfs_rq_async(struct cfs_rq *cfs_rq) static bool distribute_cfs_runtime(struct cfs_bandwidth *cfs_b) { - struct cfs_rq *local_unthrottle = NULL; int this_cpu = smp_processor_id(); u64 runtime, remaining = 1; bool throttled = false; - struct cfs_rq *cfs_rq; + struct cfs_rq *cfs_rq, *tmp; struct rq_flags rf; struct rq *rq; + LIST_HEAD(local_unthrottle); rcu_read_lock(); list_for_each_entry_rcu(cfs_rq, &cfs_b->throttled_cfs_rq, @@ -5833,11 +5868,9 @@ static bool distribute_cfs_runtime(struct cfs_bandwidth *cfs_b) if (!cfs_rq_throttled(cfs_rq)) goto next; -#ifdef CONFIG_SMP /* Already queued for async unthrottle */ if (!list_empty(&cfs_rq->throttled_csd_list)) goto next; -#endif /* By the above checks, this should never be true */ SCHED_WARN_ON(cfs_rq->runtime_remaining > 0); @@ -5854,11 +5887,17 @@ static bool distribute_cfs_runtime(struct cfs_bandwidth *cfs_b) /* we check whether we're throttled above */ if (cfs_rq->runtime_remaining > 0) { - if (cpu_of(rq) != this_cpu || - SCHED_WARN_ON(local_unthrottle)) + if (cpu_of(rq) != this_cpu) { unthrottle_cfs_rq_async(cfs_rq); - else - local_unthrottle = cfs_rq; + } else { + /* + * We currently only expect to be unthrottling + * a single cfs_rq locally. + */ + SCHED_WARN_ON(!list_empty(&local_unthrottle)); + list_add_tail(&cfs_rq->throttled_csd_list, + &local_unthrottle); + } } else { throttled = true; } @@ -5866,15 +5905,23 @@ static bool distribute_cfs_runtime(struct cfs_bandwidth *cfs_b) next: rq_unlock_irqrestore(rq, &rf); } - rcu_read_unlock(); - if (local_unthrottle) { - rq = cpu_rq(this_cpu); + list_for_each_entry_safe(cfs_rq, tmp, &local_unthrottle, + throttled_csd_list) { + struct rq *rq = rq_of(cfs_rq); + rq_lock_irqsave(rq, &rf); - if (cfs_rq_throttled(local_unthrottle)) - unthrottle_cfs_rq(local_unthrottle); + + list_del_init(&cfs_rq->throttled_csd_list); + + if (cfs_rq_throttled(cfs_rq)) + unthrottle_cfs_rq(cfs_rq); + rq_unlock_irqrestore(rq, &rf); } + SCHED_WARN_ON(!list_empty(&local_unthrottle)); + + rcu_read_unlock(); return throttled; } @@ -6204,9 +6251,7 @@ static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq) { cfs_rq->runtime_enabled = 0; INIT_LIST_HEAD(&cfs_rq->throttled_list); -#ifdef CONFIG_SMP INIT_LIST_HEAD(&cfs_rq->throttled_csd_list); -#endif } void start_cfs_bandwidth(struct cfs_bandwidth *cfs_b) @@ -7164,45 +7209,9 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, bool struct cpumask *cpus = this_cpu_cpumask_var_ptr(select_rq_mask); int i, cpu, idle_cpu = -1, nr = INT_MAX; struct sched_domain_shared *sd_share; - struct rq *this_rq = this_rq(); - int this = smp_processor_id(); - struct sched_domain *this_sd = NULL; - u64 time = 0; cpumask_and(cpus, sched_domain_span(sd), p->cpus_ptr); - if (sched_feat(SIS_PROP) && !has_idle_core) { - u64 avg_cost, avg_idle, span_avg; - unsigned long now = jiffies; - - this_sd = rcu_dereference(*this_cpu_ptr(&sd_llc)); - if (!this_sd) - return -1; - - /* - * If we're busy, the assumption that the last idle period - * predicts the future is flawed; age away the remaining - * predicted idle time. - */ - if (unlikely(this_rq->wake_stamp < now)) { - while (this_rq->wake_stamp < now && this_rq->wake_avg_idle) { - this_rq->wake_stamp++; - this_rq->wake_avg_idle >>= 1; - } - } - - avg_idle = this_rq->wake_avg_idle; - avg_cost = this_sd->avg_scan_cost + 1; - - span_avg = sd->span_weight * avg_idle; - if (span_avg > 4*avg_cost) - nr = div_u64(span_avg, avg_cost); - else - nr = 4; - - time = cpu_clock(this); - } - if (sched_feat(SIS_UTIL)) { sd_share = rcu_dereference(per_cpu(sd_llc_shared, target)); if (sd_share) { @@ -7214,6 +7223,30 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, bool } } + if (static_branch_unlikely(&sched_cluster_active)) { + struct sched_group *sg = sd->groups; + + if (sg->flags & SD_CLUSTER) { + for_each_cpu_wrap(cpu, sched_group_span(sg), target + 1) { + if (!cpumask_test_cpu(cpu, cpus)) + continue; + + if (has_idle_core) { + i = select_idle_core(p, cpu, cpus, &idle_cpu); + if ((unsigned int)i < nr_cpumask_bits) + return i; + } else { + if (--nr <= 0) + return -1; + idle_cpu = __select_idle_cpu(cpu, p); + if ((unsigned int)idle_cpu < nr_cpumask_bits) + return idle_cpu; + } + } + cpumask_andnot(cpus, cpus, sched_group_span(sg)); + } + } + for_each_cpu_wrap(cpu, cpus, target + 1) { if (has_idle_core) { i = select_idle_core(p, cpu, cpus, &idle_cpu); @@ -7221,7 +7254,7 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, bool return i; } else { - if (!--nr) + if (--nr <= 0) return -1; idle_cpu = __select_idle_cpu(cpu, p); if ((unsigned int)idle_cpu < nr_cpumask_bits) @@ -7232,18 +7265,6 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, bool if (has_idle_core) set_idle_cores(target, false); - if (sched_feat(SIS_PROP) && this_sd && !has_idle_core) { - time = cpu_clock(this) - time; - - /* - * Account for the scan cost of wakeups against the average - * idle time. - */ - this_rq->wake_avg_idle -= min(this_rq->wake_avg_idle, time); - - update_avg(&this_sd->avg_scan_cost, time); - } - return idle_cpu; } @@ -7283,7 +7304,7 @@ select_idle_capacity(struct task_struct *p, struct sched_domain *sd, int target) * Look for the CPU with best capacity. */ else if (fits < 0) - cpu_cap = capacity_orig_of(cpu) - thermal_load_avg(cpu_rq(cpu)); + cpu_cap = arch_scale_cpu_capacity(cpu) - thermal_load_avg(cpu_rq(cpu)); /* * First, select CPU which fits better (-1 being better than 0). @@ -7323,7 +7344,7 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target) bool has_idle_core = false; struct sched_domain *sd; unsigned long task_util, util_min, util_max; - int i, recent_used_cpu; + int i, recent_used_cpu, prev_aff = -1; /* * On asymmetric system, update task utilization because we will check @@ -7350,8 +7371,14 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target) */ if (prev != target && cpus_share_cache(prev, target) && (available_idle_cpu(prev) || sched_idle_cpu(prev)) && - asym_fits_cpu(task_util, util_min, util_max, prev)) - return prev; + asym_fits_cpu(task_util, util_min, util_max, prev)) { + + if (!static_branch_unlikely(&sched_cluster_active) || + cpus_share_resources(prev, target)) + return prev; + + prev_aff = prev; + } /* * Allow a per-cpu kthread to stack with the wakee if the @@ -7378,7 +7405,13 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target) (available_idle_cpu(recent_used_cpu) || sched_idle_cpu(recent_used_cpu)) && cpumask_test_cpu(recent_used_cpu, p->cpus_ptr) && asym_fits_cpu(task_util, util_min, util_max, recent_used_cpu)) { - return recent_used_cpu; + + if (!static_branch_unlikely(&sched_cluster_active) || + cpus_share_resources(recent_used_cpu, target)) + return recent_used_cpu; + + } else { + recent_used_cpu = -1; } /* @@ -7419,6 +7452,17 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target) if ((unsigned)i < nr_cpumask_bits) return i; + /* + * For cluster machines which have lower sharing cache like L2 or + * LLC Tag, we tend to find an idle CPU in the target's cluster + * first. But prev_cpu or recent_used_cpu may also be a good candidate, + * use them if possible when no idle CPU found in select_idle_cpu(). + */ + if ((unsigned int)prev_aff < nr_cpumask_bits) + return prev_aff; + if ((unsigned int)recent_used_cpu < nr_cpumask_bits) + return recent_used_cpu; + return target; } @@ -7525,7 +7569,7 @@ cpu_util(int cpu, struct task_struct *p, int dst_cpu, int boost) util = max(util, util_est); } - return min(util, capacity_orig_of(cpu)); + return min(util, arch_scale_cpu_capacity(cpu)); } unsigned long cpu_util_cfs(int cpu) @@ -7677,11 +7721,16 @@ compute_energy(struct energy_env *eenv, struct perf_domain *pd, { unsigned long max_util = eenv_pd_max_util(eenv, pd_cpus, p, dst_cpu); unsigned long busy_time = eenv->pd_busy_time; + unsigned long energy; if (dst_cpu >= 0) busy_time = min(eenv->pd_cap, busy_time + eenv->task_busy_time); - return em_cpu_energy(pd->em_pd, max_util, busy_time, eenv->cpu_cap); + energy = em_cpu_energy(pd->em_pd, max_util, busy_time, eenv->cpu_cap); + + trace_sched_compute_energy_tp(p, dst_cpu, energy, max_util, busy_time); + + return energy; } /* @@ -7756,7 +7805,7 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu) target = prev_cpu; sync_entity_load_avg(&p->se); - if (!uclamp_task_util(p, p_util_min, p_util_max)) + if (!task_util_est(p) && p_util_min == 0) goto unlock; eenv_task_busy_time(&eenv, p, prev_cpu); @@ -7764,11 +7813,10 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu) for (; pd; pd = pd->next) { unsigned long util_min = p_util_min, util_max = p_util_max; unsigned long cpu_cap, cpu_thermal_cap, util; - unsigned long cur_delta, max_spare_cap = 0; + long prev_spare_cap = -1, max_spare_cap = -1; unsigned long rq_util_min, rq_util_max; - unsigned long prev_spare_cap = 0; + unsigned long cur_delta, base_energy; int max_spare_cap_cpu = -1; - unsigned long base_energy; int fits, max_fits = -1; cpumask_and(cpus, perf_domain_span(pd), cpu_online_mask); @@ -7831,7 +7879,7 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu) prev_spare_cap = cpu_cap; prev_fits = fits; } else if ((fits > max_fits) || - ((fits == max_fits) && (cpu_cap > max_spare_cap))) { + ((fits == max_fits) && ((long)cpu_cap > max_spare_cap))) { /* * Find the CPU with the maximum spare capacity * among the remaining CPUs in the performance @@ -7843,7 +7891,7 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu) } } - if (max_spare_cap_cpu < 0 && prev_spare_cap == 0) + if (max_spare_cap_cpu < 0 && prev_spare_cap < 0) continue; eenv_pd_busy_time(&eenv, cpus, p); @@ -7851,7 +7899,7 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu) base_energy = compute_energy(&eenv, pd, cpus, p, -1); /* Evaluate the energy impact of using prev_cpu. */ - if (prev_spare_cap > 0) { + if (prev_spare_cap > -1) { prev_delta = compute_energy(&eenv, pd, cpus, p, prev_cpu); /* CPU utilization has changed */ @@ -8052,7 +8100,7 @@ static void set_next_buddy(struct sched_entity *se) /* * Preempt the current task with a newly woken task if needed: */ -static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_flags) +static void check_preempt_wakeup_fair(struct rq *rq, struct task_struct *p, int wake_flags) { struct task_struct *curr = rq->curr; struct sched_entity *se = &curr->se, *pse = &p->se; @@ -8065,7 +8113,7 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_ /* * This is possible from callers such as attach_tasks(), in which we - * unconditionally check_preempt_curr() after an enqueue (which may have + * unconditionally wakeup_preempt() after an enqueue (which may have * lead to a throttle). This both saves work and prevents false * next-buddy nomination below. */ @@ -8157,7 +8205,7 @@ again: goto again; } - se = pick_next_entity(cfs_rq, curr); + se = pick_next_entity(cfs_rq); cfs_rq = group_cfs_rq(se); } while (cfs_rq); @@ -8220,7 +8268,7 @@ again: } } - se = pick_next_entity(cfs_rq, curr); + se = pick_next_entity(cfs_rq); cfs_rq = group_cfs_rq(se); } while (cfs_rq); @@ -8259,7 +8307,7 @@ simple: put_prev_task(rq, prev); do { - se = pick_next_entity(cfs_rq, NULL); + se = pick_next_entity(cfs_rq); set_next_entity(cfs_rq, se); cfs_rq = group_cfs_rq(se); } while (cfs_rq); @@ -8972,7 +9020,7 @@ static void attach_task(struct rq *rq, struct task_struct *p) WARN_ON_ONCE(task_rq(p) != rq); activate_task(rq, p, ENQUEUE_NOCLOCK); - check_preempt_curr(rq, p, 0); + wakeup_preempt(rq, p, 0); } /* @@ -9312,8 +9360,6 @@ static void update_cpu_capacity(struct sched_domain *sd, int cpu) unsigned long capacity = scale_rt_capacity(cpu); struct sched_group *sdg = sd->groups; - cpu_rq(cpu)->cpu_capacity_orig = arch_scale_cpu_capacity(cpu); - if (!capacity) capacity = 1; @@ -9389,7 +9435,7 @@ static inline int check_cpu_capacity(struct rq *rq, struct sched_domain *sd) { return ((rq->cpu_capacity * sd->imbalance_pct) < - (rq->cpu_capacity_orig * 100)); + (arch_scale_cpu_capacity(cpu_of(rq)) * 100)); } /* @@ -9400,7 +9446,7 @@ check_cpu_capacity(struct rq *rq, struct sched_domain *sd) static inline int check_misfit_status(struct rq *rq, struct sched_domain *sd) { return rq->misfit_task_load && - (rq->cpu_capacity_orig < rq->rd->max_cpu_capacity || + (arch_scale_cpu_capacity(rq->cpu) < rq->rd->max_cpu_capacity || check_cpu_capacity(rq, sd)); } @@ -9552,7 +9598,7 @@ static bool sched_use_asym_prio(struct sched_domain *sd, int cpu) * can only do it if @group is an SMT group and has exactly on busy CPU. Larger * imbalances in the number of CPUS are dealt with in find_busiest_group(). * - * If we are balancing load within an SMT core, or at DIE domain level, always + * If we are balancing load within an SMT core, or at PKG domain level, always * proceed. * * Return: true if @env::dst_cpu can do with asym_packing load balance. False @@ -11251,13 +11297,15 @@ more_balance: busiest->push_cpu = this_cpu; active_balance = 1; } - raw_spin_rq_unlock_irqrestore(busiest, flags); + preempt_disable(); + raw_spin_rq_unlock_irqrestore(busiest, flags); if (active_balance) { stop_one_cpu_nowait(cpu_of(busiest), active_load_balance_cpu_stop, busiest, &busiest->active_balance_work); } + preempt_enable(); } } else { sd->nr_balance_failed = 0; @@ -11565,36 +11613,39 @@ static inline int on_null_domain(struct rq *rq) #ifdef CONFIG_NO_HZ_COMMON /* - * idle load balancing details - * - When one of the busy CPUs notice that there may be an idle rebalancing + * NOHZ idle load balancing (ILB) details: + * + * - When one of the busy CPUs notices that there may be an idle rebalancing * needed, they will kick the idle load balancer, which then does idle * load balancing for all the idle CPUs. - * - HK_TYPE_MISC CPUs are used for this task, because HK_TYPE_SCHED not set + * + * - HK_TYPE_MISC CPUs are used for this task, because HK_TYPE_SCHED is not set * anywhere yet. */ - static inline int find_new_ilb(void) { - int ilb; const struct cpumask *hk_mask; + int ilb_cpu; hk_mask = housekeeping_cpumask(HK_TYPE_MISC); - for_each_cpu_and(ilb, nohz.idle_cpus_mask, hk_mask) { + for_each_cpu_and(ilb_cpu, nohz.idle_cpus_mask, hk_mask) { - if (ilb == smp_processor_id()) + if (ilb_cpu == smp_processor_id()) continue; - if (idle_cpu(ilb)) - return ilb; + if (idle_cpu(ilb_cpu)) + return ilb_cpu; } - return nr_cpu_ids; + return -1; } /* - * Kick a CPU to do the nohz balancing, if it is time for it. We pick any - * idle CPU in the HK_TYPE_MISC housekeeping set (if there is one). + * Kick a CPU to do the NOHZ balancing, if it is time for it, via a cross-CPU + * SMP function call (IPI). + * + * We pick the first idle CPU in the HK_TYPE_MISC housekeeping set (if there is one). */ static void kick_ilb(unsigned int flags) { @@ -11608,8 +11659,7 @@ static void kick_ilb(unsigned int flags) nohz.next_balance = jiffies+1; ilb_cpu = find_new_ilb(); - - if (ilb_cpu >= nr_cpu_ids) + if (ilb_cpu < 0) return; /* @@ -11622,7 +11672,7 @@ static void kick_ilb(unsigned int flags) /* * This way we generate an IPI on the target CPU which - * is idle. And the softirq performing nohz idle load balance + * is idle, and the softirq performing NOHZ idle load balancing * will be run before returning from the IPI. */ smp_call_function_single_async(ilb_cpu, &cpu_rq(ilb_cpu)->nohz_csd); @@ -11651,7 +11701,7 @@ static void nohz_balancer_kick(struct rq *rq) /* * None are in tickless mode and hence no need for NOHZ idle load - * balancing. + * balancing: */ if (likely(!atomic_read(&nohz.nr_cpus))) return; @@ -11673,9 +11723,8 @@ static void nohz_balancer_kick(struct rq *rq) sd = rcu_dereference(rq->sd); if (sd) { /* - * If there's a CFS task and the current CPU has reduced - * capacity; kick the ILB to see if there's a better CPU to run - * on. + * If there's a runnable CFS task and the current CPU has reduced + * capacity, kick the ILB to see if there's a better CPU to run on: */ if (rq->cfs.h_nr_running >= 1 && check_cpu_capacity(rq, sd)) { flags = NOHZ_STATS_KICK | NOHZ_BALANCE_KICK; @@ -11727,11 +11776,11 @@ static void nohz_balancer_kick(struct rq *rq) if (sds) { /* * If there is an imbalance between LLC domains (IOW we could - * increase the overall cache use), we need some less-loaded LLC - * domain to pull some load. Likewise, we may need to spread + * increase the overall cache utilization), we need a less-loaded LLC + * domain to pull some load from. Likewise, we may need to spread * load within the current LLC domain (e.g. packed SMT cores but * other CPUs are idle). We can't really know from here how busy - * the others are - so just get a nohz balance going if it looks + * the others are - so just get a NOHZ balance going if it looks * like this LLC domain has tasks we could move. */ nr_busy = atomic_read(&sds->nr_busy_cpus); @@ -12001,8 +12050,19 @@ static bool nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle) } /* - * Check if we need to run the ILB for updating blocked load before entering - * idle state. + * Check if we need to directly run the ILB for updating blocked load before + * entering idle state. Here we run ILB directly without issuing IPIs. + * + * Note that when this function is called, the tick may not yet be stopped on + * this CPU yet. nohz.idle_cpus_mask is updated only when tick is stopped and + * cleared on the next busy tick. In other words, nohz.idle_cpus_mask updates + * don't align with CPUs enter/exit idle to avoid bottlenecks due to high idle + * entry/exit rate (usec). So it is possible that _nohz_idle_balance() is + * called from this function on (this) CPU that's not yet in the mask. That's + * OK because the goal of nohz_run_idle_balance() is to run ILB only for + * updating the blocked load of already idle CPUs without waking up one of + * those idle CPUs and outside the preempt disable / irq off phase of the local + * cpu about to enter idle, because it can take a long time. */ void nohz_run_idle_balance(int cpu) { @@ -12447,7 +12507,7 @@ prio_changed_fair(struct rq *rq, struct task_struct *p, int oldprio) if (p->prio > oldprio) resched_curr(rq); } else - check_preempt_curr(rq, p, 0); + wakeup_preempt(rq, p, 0); } #ifdef CONFIG_FAIR_GROUP_SCHED @@ -12549,7 +12609,7 @@ static void switched_to_fair(struct rq *rq, struct task_struct *p) if (task_current(rq, p)) resched_curr(rq); else - check_preempt_curr(rq, p, 0); + wakeup_preempt(rq, p, 0); } } @@ -12908,7 +12968,7 @@ DEFINE_SCHED_CLASS(fair) = { .yield_task = yield_task_fair, .yield_to_task = yield_to_task_fair, - .check_preempt_curr = check_preempt_wakeup, + .wakeup_preempt = check_preempt_wakeup_fair, .pick_next_task = __pick_next_task_fair, .put_prev_task = put_prev_task_fair, diff --git a/kernel/sched/features.h b/kernel/sched/features.h index f770168230ae..a3ddf84de430 100644 --- a/kernel/sched/features.h +++ b/kernel/sched/features.h @@ -49,7 +49,6 @@ SCHED_FEAT(TTWU_QUEUE, true) /* * When doing wakeups, attempt to limit superfluous scans of the LLC domain. */ -SCHED_FEAT(SIS_PROP, false) SCHED_FEAT(SIS_UTIL, true) /* diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c index 5007b25c5bc6..565f8374ddbb 100644 --- a/kernel/sched/idle.c +++ b/kernel/sched/idle.c @@ -401,7 +401,7 @@ balance_idle(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) /* * Idle tasks are unconditionally rescheduled: */ -static void check_preempt_curr_idle(struct rq *rq, struct task_struct *p, int flags) +static void wakeup_preempt_idle(struct rq *rq, struct task_struct *p, int flags) { resched_curr(rq); } @@ -482,7 +482,7 @@ DEFINE_SCHED_CLASS(idle) = { /* dequeue is not valid, we print a debug message there: */ .dequeue_task = dequeue_task_idle, - .check_preempt_curr = check_preempt_curr_idle, + .wakeup_preempt = wakeup_preempt_idle, .pick_next_task = pick_next_task_idle, .put_prev_task = put_prev_task_idle, diff --git a/kernel/sched/pelt.c b/kernel/sched/pelt.c index 0f310768260c..63b6cf898220 100644 --- a/kernel/sched/pelt.c +++ b/kernel/sched/pelt.c @@ -1,6 +1,6 @@ // SPDX-License-Identifier: GPL-2.0 /* - * Per Entity Load Tracking + * Per Entity Load Tracking (PELT) * * Copyright (C) 2007 Red Hat, Inc., Ingo Molnar <mingo@redhat.com> * diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c index 1d0f634725a6..7b4aa5809c0f 100644 --- a/kernel/sched/psi.c +++ b/kernel/sched/psi.c @@ -434,14 +434,13 @@ static u64 window_update(struct psi_window *win, u64 now, u64 value) return growth; } -static u64 update_triggers(struct psi_group *group, u64 now, bool *update_total, +static void update_triggers(struct psi_group *group, u64 now, enum psi_aggregators aggregator) { struct psi_trigger *t; u64 *total = group->total[aggregator]; struct list_head *triggers; u64 *aggregator_total; - *update_total = false; if (aggregator == PSI_AVGS) { triggers = &group->avg_triggers; @@ -471,14 +470,6 @@ static u64 update_triggers(struct psi_group *group, u64 now, bool *update_total, * events without dropping any). */ if (new_stall) { - /* - * Multiple triggers might be looking at the same state, - * remember to update group->polling_total[] once we've - * been through all of them. Also remember to extend the - * polling time if we see new stall activity. - */ - *update_total = true; - /* Calculate growth since last update */ growth = window_update(&t->win, now, total[t->state]); if (!t->pending_event) { @@ -503,8 +494,6 @@ static u64 update_triggers(struct psi_group *group, u64 now, bool *update_total, /* Reset threshold breach flag once event got generated */ t->pending_event = false; } - - return now + group->rtpoll_min_period; } static u64 update_averages(struct psi_group *group, u64 now) @@ -565,7 +554,6 @@ static void psi_avgs_work(struct work_struct *work) struct delayed_work *dwork; struct psi_group *group; u32 changed_states; - bool update_total; u64 now; dwork = to_delayed_work(work); @@ -584,7 +572,7 @@ static void psi_avgs_work(struct work_struct *work) * go - see calc_avgs() and missed_periods. */ if (now >= group->avg_next_update) { - update_triggers(group, now, &update_total, PSI_AVGS); + update_triggers(group, now, PSI_AVGS); group->avg_next_update = update_averages(group, now); } @@ -608,7 +596,7 @@ static void init_rtpoll_triggers(struct psi_group *group, u64 now) group->rtpoll_next_update = now + group->rtpoll_min_period; } -/* Schedule polling if it's not already scheduled or forced. */ +/* Schedule rtpolling if it's not already scheduled or forced. */ static void psi_schedule_rtpoll_work(struct psi_group *group, unsigned long delay, bool force) { @@ -640,7 +628,6 @@ static void psi_rtpoll_work(struct psi_group *group) { bool force_reschedule = false; u32 changed_states; - bool update_total; u64 now; mutex_lock(&group->rtpoll_trigger_lock); @@ -649,37 +636,37 @@ static void psi_rtpoll_work(struct psi_group *group) if (now > group->rtpoll_until) { /* - * We are either about to start or might stop polling if no - * state change was recorded. Resetting poll_scheduled leaves + * We are either about to start or might stop rtpolling if no + * state change was recorded. Resetting rtpoll_scheduled leaves * a small window for psi_group_change to sneak in and schedule - * an immediate poll_work before we get to rescheduling. One - * potential extra wakeup at the end of the polling window - * should be negligible and polling_next_update still keeps + * an immediate rtpoll_work before we get to rescheduling. One + * potential extra wakeup at the end of the rtpolling window + * should be negligible and rtpoll_next_update still keeps * updates correctly on schedule. */ atomic_set(&group->rtpoll_scheduled, 0); /* - * A task change can race with the poll worker that is supposed to + * A task change can race with the rtpoll worker that is supposed to * report on it. To avoid missing events, ensure ordering between - * poll_scheduled and the task state accesses, such that if the poll - * worker misses the state update, the task change is guaranteed to - * reschedule the poll worker: + * rtpoll_scheduled and the task state accesses, such that if the + * rtpoll worker misses the state update, the task change is + * guaranteed to reschedule the rtpoll worker: * - * poll worker: - * atomic_set(poll_scheduled, 0) + * rtpoll worker: + * atomic_set(rtpoll_scheduled, 0) * smp_mb() * LOAD states * * task change: * STORE states - * if atomic_xchg(poll_scheduled, 1) == 0: - * schedule poll worker + * if atomic_xchg(rtpoll_scheduled, 1) == 0: + * schedule rtpoll worker * * The atomic_xchg() implies a full barrier. */ smp_mb(); } else { - /* Polling window is not over, keep rescheduling */ + /* The rtpolling window is not over, keep rescheduling */ force_reschedule = true; } @@ -687,7 +674,7 @@ static void psi_rtpoll_work(struct psi_group *group) collect_percpu_times(group, PSI_POLL, &changed_states); if (changed_states & group->rtpoll_states) { - /* Initialize trigger windows when entering polling mode */ + /* Initialize trigger windows when entering rtpolling mode */ if (now > group->rtpoll_until) init_rtpoll_triggers(group, now); @@ -706,10 +693,12 @@ static void psi_rtpoll_work(struct psi_group *group) } if (now >= group->rtpoll_next_update) { - group->rtpoll_next_update = update_triggers(group, now, &update_total, PSI_POLL); - if (update_total) + if (changed_states & group->rtpoll_states) { + update_triggers(group, now, PSI_POLL); memcpy(group->rtpoll_total, group->total[PSI_POLL], sizeof(group->rtpoll_total)); + } + group->rtpoll_next_update = now + group->rtpoll_min_period; } psi_schedule_rtpoll_work(group, @@ -1009,6 +998,9 @@ void psi_account_irqtime(struct task_struct *task, u32 delta) struct psi_group_cpu *groupc; u64 now; + if (static_branch_likely(&psi_disabled)) + return; + if (!task->pid) return; diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index 0597ba0f85ff..6aaf0a3d6081 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -16,7 +16,7 @@ struct rt_bandwidth def_rt_bandwidth; * period over which we measure -rt task CPU usage in us. * default: 1s */ -unsigned int sysctl_sched_rt_period = 1000000; +int sysctl_sched_rt_period = 1000000; /* * part of the period that we allow rt tasks to run in us. @@ -34,9 +34,11 @@ static struct ctl_table sched_rt_sysctls[] = { { .procname = "sched_rt_period_us", .data = &sysctl_sched_rt_period, - .maxlen = sizeof(unsigned int), + .maxlen = sizeof(int), .mode = 0644, .proc_handler = sched_rt_handler, + .extra1 = SYSCTL_ONE, + .extra2 = SYSCTL_INT_MAX, }, { .procname = "sched_rt_runtime_us", @@ -44,6 +46,8 @@ static struct ctl_table sched_rt_sysctls[] = { .maxlen = sizeof(int), .mode = 0644, .proc_handler = sched_rt_handler, + .extra1 = SYSCTL_NEG_ONE, + .extra2 = (void *)&sysctl_sched_rt_period, }, { .procname = "sched_rr_timeslice_ms", @@ -143,7 +147,6 @@ void init_rt_rq(struct rt_rq *rt_rq) #if defined CONFIG_SMP rt_rq->highest_prio.curr = MAX_RT_PRIO-1; rt_rq->highest_prio.next = MAX_RT_PRIO-1; - rt_rq->rt_nr_migratory = 0; rt_rq->overloaded = 0; plist_head_init(&rt_rq->pushable_tasks); #endif /* CONFIG_SMP */ @@ -358,53 +361,6 @@ static inline void rt_clear_overload(struct rq *rq) cpumask_clear_cpu(rq->cpu, rq->rd->rto_mask); } -static void update_rt_migration(struct rt_rq *rt_rq) -{ - if (rt_rq->rt_nr_migratory && rt_rq->rt_nr_total > 1) { - if (!rt_rq->overloaded) { - rt_set_overload(rq_of_rt_rq(rt_rq)); - rt_rq->overloaded = 1; - } - } else if (rt_rq->overloaded) { - rt_clear_overload(rq_of_rt_rq(rt_rq)); - rt_rq->overloaded = 0; - } -} - -static void inc_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) -{ - struct task_struct *p; - - if (!rt_entity_is_task(rt_se)) - return; - - p = rt_task_of(rt_se); - rt_rq = &rq_of_rt_rq(rt_rq)->rt; - - rt_rq->rt_nr_total++; - if (p->nr_cpus_allowed > 1) - rt_rq->rt_nr_migratory++; - - update_rt_migration(rt_rq); -} - -static void dec_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) -{ - struct task_struct *p; - - if (!rt_entity_is_task(rt_se)) - return; - - p = rt_task_of(rt_se); - rt_rq = &rq_of_rt_rq(rt_rq)->rt; - - rt_rq->rt_nr_total--; - if (p->nr_cpus_allowed > 1) - rt_rq->rt_nr_migratory--; - - update_rt_migration(rt_rq); -} - static inline int has_pushable_tasks(struct rq *rq) { return !plist_head_empty(&rq->rt.pushable_tasks); @@ -438,6 +394,11 @@ static void enqueue_pushable_task(struct rq *rq, struct task_struct *p) /* Update the highest prio pushable task */ if (p->prio < rq->rt.highest_prio.next) rq->rt.highest_prio.next = p->prio; + + if (!rq->rt.overloaded) { + rt_set_overload(rq); + rq->rt.overloaded = 1; + } } static void dequeue_pushable_task(struct rq *rq, struct task_struct *p) @@ -451,6 +412,11 @@ static void dequeue_pushable_task(struct rq *rq, struct task_struct *p) rq->rt.highest_prio.next = p->prio; } else { rq->rt.highest_prio.next = MAX_RT_PRIO-1; + + if (rq->rt.overloaded) { + rt_clear_overload(rq); + rq->rt.overloaded = 0; + } } } @@ -464,16 +430,6 @@ static inline void dequeue_pushable_task(struct rq *rq, struct task_struct *p) { } -static inline -void inc_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) -{ -} - -static inline -void dec_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) -{ -} - static inline void rt_queue_push_tasks(struct rq *rq) { } @@ -515,7 +471,7 @@ static inline bool rt_task_fits_capacity(struct task_struct *p, int cpu) min_cap = uclamp_eff_value(p, UCLAMP_MIN); max_cap = uclamp_eff_value(p, UCLAMP_MAX); - cpu_cap = capacity_orig_of(cpu); + cpu_cap = arch_scale_cpu_capacity(cpu); return cpu_cap >= min(min_cap, max_cap); } @@ -953,7 +909,7 @@ static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun) /* * When we're idle and a woken (rt) task is - * throttled check_preempt_curr() will set + * throttled wakeup_preempt() will set * skip_update and the time between the wakeup * and this unthrottle will get accounted as * 'runtime'. @@ -1281,7 +1237,6 @@ void inc_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) rt_rq->rr_nr_running += rt_se_rr_nr_running(rt_se); inc_rt_prio(rt_rq, prio); - inc_rt_migration(rt_se, rt_rq); inc_rt_group(rt_se, rt_rq); } @@ -1294,7 +1249,6 @@ void dec_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) rt_rq->rr_nr_running -= rt_se_rr_nr_running(rt_se); dec_rt_prio(rt_rq, rt_se_prio(rt_se)); - dec_rt_migration(rt_se, rt_rq); dec_rt_group(rt_se, rt_rq); } @@ -1715,7 +1669,7 @@ static int balance_rt(struct rq *rq, struct task_struct *p, struct rq_flags *rf) /* * Preempt the current task with a newly woken task if needed: */ -static void check_preempt_curr_rt(struct rq *rq, struct task_struct *p, int flags) +static void wakeup_preempt_rt(struct rq *rq, struct task_struct *p, int flags) { if (p->prio < rq->curr->prio) { resched_curr(rq); @@ -2109,9 +2063,11 @@ retry: */ push_task = get_push_task(rq); if (push_task) { + preempt_disable(); raw_spin_rq_unlock(rq); stop_one_cpu_nowait(rq->cpu, push_cpu_stop, push_task, &rq->push_work); + preempt_enable(); raw_spin_rq_lock(rq); } @@ -2448,9 +2404,11 @@ skip: double_unlock_balance(this_rq, src_rq); if (push_task) { + preempt_disable(); raw_spin_rq_unlock(this_rq); stop_one_cpu_nowait(src_rq->cpu, push_cpu_stop, push_task, &src_rq->push_work); + preempt_enable(); raw_spin_rq_lock(this_rq); } } @@ -2702,7 +2660,7 @@ DEFINE_SCHED_CLASS(rt) = { .dequeue_task = dequeue_task_rt, .yield_task = yield_task_rt, - .check_preempt_curr = check_preempt_curr_rt, + .wakeup_preempt = wakeup_preempt_rt, .pick_next_task = pick_next_task_rt, .put_prev_task = put_prev_task_rt, @@ -2985,9 +2943,6 @@ static int sched_rt_global_constraints(void) #ifdef CONFIG_SYSCTL static int sched_rt_global_validate(void) { - if (sysctl_sched_rt_period <= 0) - return -EINVAL; - if ((sysctl_sched_rt_runtime != RUNTIME_INF) && ((sysctl_sched_rt_runtime > sysctl_sched_rt_period) || ((u64)sysctl_sched_rt_runtime * @@ -3018,7 +2973,7 @@ static int sched_rt_handler(struct ctl_table *table, int write, void *buffer, old_period = sysctl_sched_rt_period; old_runtime = sysctl_sched_rt_runtime; - ret = proc_dointvec(table, write, buffer, lenp, ppos); + ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos); if (!ret && write) { ret = sched_rt_global_validate(); diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 04846272409c..2e5a95486a42 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -74,15 +74,6 @@ #include "../workqueue_internal.h" -#ifdef CONFIG_CGROUP_SCHED -#include <linux/cgroup.h> -#include <linux/psi.h> -#endif - -#ifdef CONFIG_SCHED_DEBUG -# include <linux/static_key.h> -#endif - #ifdef CONFIG_PARAVIRT # include <asm/paravirt.h> # include <asm/paravirt_api_clock.h> @@ -109,14 +100,12 @@ extern __read_mostly int scheduler_running; extern unsigned long calc_load_update; extern atomic_long_t calc_load_tasks; -extern unsigned int sysctl_sched_child_runs_first; - extern void calc_global_load_tick(struct rq *this_rq); extern long calc_load_fold_active(struct rq *this_rq, long adjust); extern void call_trace_sched_update_nr_running(struct rq *rq, int count); -extern unsigned int sysctl_sched_rt_period; +extern int sysctl_sched_rt_period; extern int sysctl_sched_rt_runtime; extern int sched_rr_timeslice; @@ -594,6 +583,7 @@ struct cfs_rq { } removed; #ifdef CONFIG_FAIR_GROUP_SCHED + u64 last_update_tg_load_avg; unsigned long tg_load_avg_contrib; long propagate; long prop_runnable_sum; @@ -644,9 +634,7 @@ struct cfs_rq { int throttled; int throttle_count; struct list_head throttled_list; -#ifdef CONFIG_SMP struct list_head throttled_csd_list; -#endif #endif /* CONFIG_CFS_BANDWIDTH */ #endif /* CONFIG_FAIR_GROUP_SCHED */ }; @@ -675,8 +663,6 @@ struct rt_rq { } highest_prio; #endif #ifdef CONFIG_SMP - unsigned int rt_nr_migratory; - unsigned int rt_nr_total; int overloaded; struct plist_head pushable_tasks; @@ -721,7 +707,6 @@ struct dl_rq { u64 next; } earliest_dl; - unsigned int dl_nr_migratory; int overloaded; /* @@ -963,10 +948,6 @@ struct rq { /* runqueue lock: */ raw_spinlock_t __lock; - /* - * nr_running and cpu_load should be in the same cacheline because - * remote CPUs use both these fields when doing load calculation. - */ unsigned int nr_running; #ifdef CONFIG_NUMA_BALANCING unsigned int nr_numa_running; @@ -1048,7 +1029,6 @@ struct rq { struct sched_domain __rcu *sd; unsigned long cpu_capacity; - unsigned long cpu_capacity_orig; struct balance_callback *balance_callback; @@ -1079,9 +1059,6 @@ struct rq { u64 idle_stamp; u64 avg_idle; - unsigned long wake_stamp; - u64 wake_avg_idle; - /* This is used to determine avg_idle's max value */ u64 max_idle_balance_cost; @@ -1658,6 +1635,11 @@ task_rq_unlock(struct rq *rq, struct task_struct *p, struct rq_flags *rf) raw_spin_unlock_irqrestore(&p->pi_lock, rf->flags); } +DEFINE_LOCK_GUARD_1(task_rq_lock, struct task_struct, + _T->rq = task_rq_lock(_T->lock, &_T->rf), + task_rq_unlock(_T->rq, _T->lock, &_T->rf), + struct rq *rq; struct rq_flags rf) + static inline void rq_lock_irqsave(struct rq *rq, struct rq_flags *rf) __acquires(rq->lock) @@ -1868,11 +1850,13 @@ static inline struct sched_domain *lowest_flag_domain(int cpu, int flag) DECLARE_PER_CPU(struct sched_domain __rcu *, sd_llc); DECLARE_PER_CPU(int, sd_llc_size); DECLARE_PER_CPU(int, sd_llc_id); +DECLARE_PER_CPU(int, sd_share_id); DECLARE_PER_CPU(struct sched_domain_shared __rcu *, sd_llc_shared); DECLARE_PER_CPU(struct sched_domain __rcu *, sd_numa); DECLARE_PER_CPU(struct sched_domain __rcu *, sd_asym_packing); DECLARE_PER_CPU(struct sched_domain __rcu *, sd_asym_cpucapacity); extern struct static_key_false sched_asym_cpucapacity; +extern struct static_key_false sched_cluster_active; static __always_inline bool sched_asym_cpucap_active(void) { @@ -2239,7 +2223,7 @@ struct sched_class { void (*yield_task) (struct rq *rq); bool (*yield_to_task)(struct rq *rq, struct task_struct *p); - void (*check_preempt_curr)(struct rq *rq, struct task_struct *p, int flags); + void (*wakeup_preempt)(struct rq *rq, struct task_struct *p, int flags); struct task_struct *(*pick_next_task)(struct rq *rq); @@ -2513,7 +2497,7 @@ static inline void sub_nr_running(struct rq *rq, unsigned count) extern void activate_task(struct rq *rq, struct task_struct *p, int flags); extern void deactivate_task(struct rq *rq, struct task_struct *p, int flags); -extern void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags); +extern void wakeup_preempt(struct rq *rq, struct task_struct *p, int flags); #ifdef CONFIG_PREEMPT_RT #define SCHED_NR_MIGRATE_BREAK 8 @@ -2977,11 +2961,6 @@ static inline void cpufreq_update_util(struct rq *rq, unsigned int flags) {} #endif #ifdef CONFIG_SMP -static inline unsigned long capacity_orig_of(int cpu) -{ - return cpu_rq(cpu)->cpu_capacity_orig; -} - /** * enum cpu_util_type - CPU utilization type * @FREQUENCY_UTIL: Utilization used to select frequency @@ -3219,6 +3198,8 @@ static inline bool sched_energy_enabled(void) return static_branch_unlikely(&sched_energy_present); } +extern struct cpufreq_governor schedutil_gov; + #else /* ! (CONFIG_ENERGY_MODEL && CONFIG_CPU_FREQ_GOV_SCHEDUTIL) */ #define perf_domain_span(pd) NULL diff --git a/kernel/sched/stop_task.c b/kernel/sched/stop_task.c index 85590599b4d6..6cf7304e6449 100644 --- a/kernel/sched/stop_task.c +++ b/kernel/sched/stop_task.c @@ -23,7 +23,7 @@ balance_stop(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) #endif /* CONFIG_SMP */ static void -check_preempt_curr_stop(struct rq *rq, struct task_struct *p, int flags) +wakeup_preempt_stop(struct rq *rq, struct task_struct *p, int flags) { /* we're never preempted */ } @@ -120,7 +120,7 @@ DEFINE_SCHED_CLASS(stop) = { .dequeue_task = dequeue_task_stop, .yield_task = yield_task_stop, - .check_preempt_curr = check_preempt_curr_stop, + .wakeup_preempt = wakeup_preempt_stop, .pick_next_task = pick_next_task_stop, .put_prev_task = put_prev_task_stop, diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c index 05a5bc678c08..10d1391e7416 100644 --- a/kernel/sched/topology.c +++ b/kernel/sched/topology.c @@ -212,6 +212,69 @@ static unsigned int sysctl_sched_energy_aware = 1; static DEFINE_MUTEX(sched_energy_mutex); static bool sched_energy_update; +static bool sched_is_eas_possible(const struct cpumask *cpu_mask) +{ + bool any_asym_capacity = false; + struct cpufreq_policy *policy; + struct cpufreq_governor *gov; + int i; + + /* EAS is enabled for asymmetric CPU capacity topologies. */ + for_each_cpu(i, cpu_mask) { + if (rcu_access_pointer(per_cpu(sd_asym_cpucapacity, i))) { + any_asym_capacity = true; + break; + } + } + if (!any_asym_capacity) { + if (sched_debug()) { + pr_info("rd %*pbl: Checking EAS, CPUs do not have asymmetric capacities\n", + cpumask_pr_args(cpu_mask)); + } + return false; + } + + /* EAS definitely does *not* handle SMT */ + if (sched_smt_active()) { + if (sched_debug()) { + pr_info("rd %*pbl: Checking EAS, SMT is not supported\n", + cpumask_pr_args(cpu_mask)); + } + return false; + } + + if (!arch_scale_freq_invariant()) { + if (sched_debug()) { + pr_info("rd %*pbl: Checking EAS: frequency-invariant load tracking not yet supported", + cpumask_pr_args(cpu_mask)); + } + return false; + } + + /* Do not attempt EAS if schedutil is not being used. */ + for_each_cpu(i, cpu_mask) { + policy = cpufreq_cpu_get(i); + if (!policy) { + if (sched_debug()) { + pr_info("rd %*pbl: Checking EAS, cpufreq policy not set for CPU: %d", + cpumask_pr_args(cpu_mask), i); + } + return false; + } + gov = policy->governor; + cpufreq_cpu_put(policy); + if (gov != &schedutil_gov) { + if (sched_debug()) { + pr_info("rd %*pbl: Checking EAS, schedutil is mandatory\n", + cpumask_pr_args(cpu_mask)); + } + return false; + } + } + + return true; +} + void rebuild_sched_domains_energy(void) { mutex_lock(&sched_energy_mutex); @@ -230,6 +293,15 @@ static int sched_energy_aware_handler(struct ctl_table *table, int write, if (write && !capable(CAP_SYS_ADMIN)) return -EPERM; + if (!sched_is_eas_possible(cpu_active_mask)) { + if (write) { + return -EOPNOTSUPP; + } else { + *lenp = 0; + return 0; + } + } + ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos); if (!ret && write) { state = static_branch_unlikely(&sched_energy_present); @@ -348,103 +420,33 @@ static void sched_energy_set(bool has_eas) * 1. an Energy Model (EM) is available; * 2. the SD_ASYM_CPUCAPACITY flag is set in the sched_domain hierarchy. * 3. no SMT is detected. - * 4. the EM complexity is low enough to keep scheduling overheads low; - * 5. schedutil is driving the frequency of all CPUs of the rd; - * 6. frequency invariance support is present; - * - * The complexity of the Energy Model is defined as: - * - * C = nr_pd * (nr_cpus + nr_ps) - * - * with parameters defined as: - * - nr_pd: the number of performance domains - * - nr_cpus: the number of CPUs - * - nr_ps: the sum of the number of performance states of all performance - * domains (for example, on a system with 2 performance domains, - * with 10 performance states each, nr_ps = 2 * 10 = 20). - * - * It is generally not a good idea to use such a model in the wake-up path on - * very complex platforms because of the associated scheduling overheads. The - * arbitrary constraint below prevents that. It makes EAS usable up to 16 CPUs - * with per-CPU DVFS and less than 8 performance states each, for example. + * 4. schedutil is driving the frequency of all CPUs of the rd; + * 5. frequency invariance support is present; */ -#define EM_MAX_COMPLEXITY 2048 - -extern struct cpufreq_governor schedutil_gov; static bool build_perf_domains(const struct cpumask *cpu_map) { - int i, nr_pd = 0, nr_ps = 0, nr_cpus = cpumask_weight(cpu_map); + int i; struct perf_domain *pd = NULL, *tmp; int cpu = cpumask_first(cpu_map); struct root_domain *rd = cpu_rq(cpu)->rd; - struct cpufreq_policy *policy; - struct cpufreq_governor *gov; if (!sysctl_sched_energy_aware) goto free; - /* EAS is enabled for asymmetric CPU capacity topologies. */ - if (!per_cpu(sd_asym_cpucapacity, cpu)) { - if (sched_debug()) { - pr_info("rd %*pbl: CPUs do not have asymmetric capacities\n", - cpumask_pr_args(cpu_map)); - } + if (!sched_is_eas_possible(cpu_map)) goto free; - } - - /* EAS definitely does *not* handle SMT */ - if (sched_smt_active()) { - pr_warn("rd %*pbl: Disabling EAS, SMT is not supported\n", - cpumask_pr_args(cpu_map)); - goto free; - } - - if (!arch_scale_freq_invariant()) { - if (sched_debug()) { - pr_warn("rd %*pbl: Disabling EAS: frequency-invariant load tracking not yet supported", - cpumask_pr_args(cpu_map)); - } - goto free; - } for_each_cpu(i, cpu_map) { /* Skip already covered CPUs. */ if (find_pd(pd, i)) continue; - /* Do not attempt EAS if schedutil is not being used. */ - policy = cpufreq_cpu_get(i); - if (!policy) - goto free; - gov = policy->governor; - cpufreq_cpu_put(policy); - if (gov != &schedutil_gov) { - if (rd->pd) - pr_warn("rd %*pbl: Disabling EAS, schedutil is mandatory\n", - cpumask_pr_args(cpu_map)); - goto free; - } - /* Create the new pd and add it to the local list. */ tmp = pd_init(i); if (!tmp) goto free; tmp->next = pd; pd = tmp; - - /* - * Count performance domains and performance states for the - * complexity check. - */ - nr_pd++; - nr_ps += em_pd_nr_perf_states(pd->em_pd); - } - - /* Bail out if the Energy Model complexity is too high. */ - if (nr_pd * (nr_ps + nr_cpus) > EM_MAX_COMPLEXITY) { - WARN(1, "rd %*pbl: Failed to start EAS, EM complexity is too high\n", - cpumask_pr_args(cpu_map)); - goto free; } perf_domain_debug(cpu_map, pd); @@ -666,11 +668,14 @@ static void destroy_sched_domains(struct sched_domain *sd) DEFINE_PER_CPU(struct sched_domain __rcu *, sd_llc); DEFINE_PER_CPU(int, sd_llc_size); DEFINE_PER_CPU(int, sd_llc_id); +DEFINE_PER_CPU(int, sd_share_id); DEFINE_PER_CPU(struct sched_domain_shared __rcu *, sd_llc_shared); DEFINE_PER_CPU(struct sched_domain __rcu *, sd_numa); DEFINE_PER_CPU(struct sched_domain __rcu *, sd_asym_packing); DEFINE_PER_CPU(struct sched_domain __rcu *, sd_asym_cpucapacity); + DEFINE_STATIC_KEY_FALSE(sched_asym_cpucapacity); +DEFINE_STATIC_KEY_FALSE(sched_cluster_active); static void update_top_cache_domain(int cpu) { @@ -691,6 +696,17 @@ static void update_top_cache_domain(int cpu) per_cpu(sd_llc_id, cpu) = id; rcu_assign_pointer(per_cpu(sd_llc_shared, cpu), sds); + sd = lowest_flag_domain(cpu, SD_CLUSTER); + if (sd) + id = cpumask_first(sched_domain_span(sd)); + + /* + * This assignment should be placed after the sd_llc_id as + * we want this id equals to cluster id on cluster machines + * but equals to LLC id on non-Cluster machines. + */ + per_cpu(sd_share_id, cpu) = id; + sd = lowest_flag_domain(cpu, SD_NUMA); rcu_assign_pointer(per_cpu(sd_numa, cpu), sd); @@ -1117,7 +1133,7 @@ fail: * * - Simultaneous multithreading (SMT) * - Multi-Core Cache (MC) - * - Package (DIE) + * - Package (PKG) * * Where the last one more or less denotes everything up to a NUMA node. * @@ -1139,13 +1155,13 @@ fail: * * CPU 0 1 2 3 4 5 6 7 * - * DIE [ ] + * PKG [ ] * MC [ ] [ ] * SMT [ ] [ ] [ ] [ ] * * - or - * - * DIE 0-7 0-7 0-7 0-7 0-7 0-7 0-7 0-7 + * PKG 0-7 0-7 0-7 0-7 0-7 0-7 0-7 0-7 * MC 0-3 0-3 0-3 0-3 4-7 4-7 4-7 4-7 * SMT 0-1 0-1 2-3 2-3 4-5 4-5 6-7 6-7 * @@ -1548,6 +1564,7 @@ static struct cpumask ***sched_domains_numa_masks; */ #define TOPOLOGY_SD_FLAGS \ (SD_SHARE_CPUCAPACITY | \ + SD_CLUSTER | \ SD_SHARE_PKG_RESOURCES | \ SD_NUMA | \ SD_ASYM_PACKING) @@ -1679,7 +1696,7 @@ static struct sched_domain_topology_level default_topology[] = { #ifdef CONFIG_SCHED_MC { cpu_coregroup_mask, cpu_core_flags, SD_INIT_NAME(MC) }, #endif - { cpu_cpu_mask, SD_INIT_NAME(DIE) }, + { cpu_cpu_mask, SD_INIT_NAME(PKG) }, { NULL, }, }; @@ -2112,22 +2129,31 @@ static int hop_cmp(const void *a, const void *b) return -1; } -/* - * sched_numa_find_nth_cpu() - given the NUMA topology, find the Nth next cpu - * closest to @cpu from @cpumask. - * cpumask: cpumask to find a cpu from - * cpu: Nth cpu to find - * - * returns: cpu, or nr_cpu_ids when nothing found. +/** + * sched_numa_find_nth_cpu() - given the NUMA topology, find the Nth closest CPU + * from @cpus to @cpu, taking into account distance + * from a given @node. + * @cpus: cpumask to find a cpu from + * @cpu: CPU to start searching + * @node: NUMA node to order CPUs by distance + * + * Return: cpu, or nr_cpu_ids when nothing found. */ int sched_numa_find_nth_cpu(const struct cpumask *cpus, int cpu, int node) { - struct __cmp_key k = { .cpus = cpus, .node = node, .cpu = cpu }; + struct __cmp_key k = { .cpus = cpus, .cpu = cpu }; struct cpumask ***hop_masks; int hop, ret = nr_cpu_ids; + if (node == NUMA_NO_NODE) + return cpumask_nth_and(cpu, cpus, cpu_online_mask); + rcu_read_lock(); + /* CPU-less node entries are uninitialized in sched_domains_numa_masks */ + node = numa_nearest_node(node, N_CPU); + k.node = node; + k.masks = rcu_dereference(sched_domains_numa_masks); if (!k.masks) goto unlock; @@ -2362,6 +2388,7 @@ build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *att struct rq *rq = NULL; int i, ret = -ENOMEM; bool has_asym = false; + bool has_cluster = false; if (WARN_ON(cpumask_empty(cpu_map))) goto error; @@ -2479,20 +2506,29 @@ build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *att /* Attach the domains */ rcu_read_lock(); for_each_cpu(i, cpu_map) { + unsigned long capacity; + rq = cpu_rq(i); sd = *per_cpu_ptr(d.sd, i); + capacity = arch_scale_cpu_capacity(i); /* Use READ_ONCE()/WRITE_ONCE() to avoid load/store tearing: */ - if (rq->cpu_capacity_orig > READ_ONCE(d.rd->max_cpu_capacity)) - WRITE_ONCE(d.rd->max_cpu_capacity, rq->cpu_capacity_orig); + if (capacity > READ_ONCE(d.rd->max_cpu_capacity)) + WRITE_ONCE(d.rd->max_cpu_capacity, capacity); cpu_attach_domain(sd, d.rd, i); + + if (lowest_flag_domain(i, SD_CLUSTER)) + has_cluster = true; } rcu_read_unlock(); if (has_asym) static_branch_inc_cpuslocked(&sched_asym_cpucapacity); + if (has_cluster) + static_branch_inc_cpuslocked(&sched_cluster_active); + if (rq && sched_debug_verbose) { pr_info("root domain span: %*pbl (max cpu_capacity = %lu)\n", cpumask_pr_args(cpu_map), rq->rd->max_cpu_capacity); @@ -2592,6 +2628,9 @@ static void detach_destroy_domains(const struct cpumask *cpu_map) if (rcu_access_pointer(per_cpu(sd_asym_cpucapacity, cpu))) static_branch_dec_cpuslocked(&sched_asym_cpucapacity); + if (static_branch_unlikely(&sched_cluster_active)) + static_branch_dec_cpuslocked(&sched_cluster_active); + rcu_read_lock(); for_each_cpu(i, cpu_map) cpu_attach_domain(NULL, &def_root_domain, i); |