diff options
author | Linus Torvalds <torvalds@linux-foundation.org> | 2024-01-09 06:49:17 +0300 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2024-01-09 06:49:17 +0300 |
commit | bfe8eb3b85c571f7e94e1039f59b462505b8e0fc (patch) | |
tree | 2084624e1d6e2c7f570239aad1bbdd9741cfe5e5 /include/linux | |
parent | aac4de465af08ccec90ef47bdcc13435e48a7223 (diff) | |
parent | cdb3033e191fd03da2d7da23b9cd448dfa180a8e (diff) | |
download | linux-bfe8eb3b85c571f7e94e1039f59b462505b8e0fc.tar.xz |
Merge tag 'sched-core-2024-01-08' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip
Pull scheduler updates from Ingo Molnar:
"Energy scheduling:
- Consolidate how the max compute capacity is used in the scheduler
and how we calculate the frequency for a level of utilization.
- Rework interface between the scheduler and the schedutil governor
- Simplify the util_est logic
Deadline scheduler:
- Work more towards reducing SCHED_DEADLINE starvation of low
priority tasks (e.g., SCHED_OTHER) tasks when higher priority tasks
monopolize CPU cycles, via the introduction of 'deadline servers'
(nested/2-level scheduling).
"Fair servers" to make use of this facility are not introduced yet.
EEVDF:
- Introduce O(1) fastpath for EEVDF task selection
NUMA balancing:
- Tune the NUMA-balancing vma scanning logic some more, to better
distribute the probability of a particular vma getting scanned.
Plus misc fixes, cleanups and updates"
* tag 'sched-core-2024-01-08' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: (30 commits)
sched/fair: Fix tg->load when offlining a CPU
sched/fair: Remove unused 'next_buddy_marked' local variable in check_preempt_wakeup_fair()
sched/fair: Use all little CPUs for CPU-bound workloads
sched/fair: Simplify util_est
sched/fair: Remove SCHED_FEAT(UTIL_EST_FASTUP, true)
arm64/amu: Use capacity_ref_freq() to set AMU ratio
cpufreq/cppc: Set the frequency used for computing the capacity
cpufreq/cppc: Move and rename cppc_cpufreq_{perf_to_khz|khz_to_perf}()
energy_model: Use a fixed reference frequency
cpufreq/schedutil: Use a fixed reference frequency
cpufreq: Use the fixed and coherent frequency for scaling capacity
sched/topology: Add a new arch_scale_freq_ref() method
freezer,sched: Clean saved_state when restoring it during thaw
sched/fair: Update min_vruntime for reweight_entity() correctly
sched/doc: Update documentation after renames and synchronize Chinese version
sched/cpufreq: Rework iowait boost
sched/cpufreq: Rework schedutil governor performance estimation
sched/pelt: Avoid underestimation of task utilization
sched/timers: Explain why idle task schedules out on remote timer enqueue
sched/cpuidle: Comment about timers requirements VS idle handler
...
Diffstat (limited to 'include/linux')
-rw-r--r-- | include/linux/arch_topology.h | 8 | ||||
-rw-r--r-- | include/linux/cpufreq.h | 1 | ||||
-rw-r--r-- | include/linux/energy_model.h | 7 | ||||
-rw-r--r-- | include/linux/mm_types.h | 3 | ||||
-rw-r--r-- | include/linux/sched.h | 75 | ||||
-rw-r--r-- | include/linux/sched/topology.h | 8 |
6 files changed, 58 insertions, 44 deletions
diff --git a/include/linux/arch_topology.h b/include/linux/arch_topology.h index a07b510e7dc5..a63d61ca55af 100644 --- a/include/linux/arch_topology.h +++ b/include/linux/arch_topology.h @@ -27,6 +27,13 @@ static inline unsigned long topology_get_cpu_scale(int cpu) void topology_set_cpu_scale(unsigned int cpu, unsigned long capacity); +DECLARE_PER_CPU(unsigned long, capacity_freq_ref); + +static inline unsigned long topology_get_freq_ref(int cpu) +{ + return per_cpu(capacity_freq_ref, cpu); +} + DECLARE_PER_CPU(unsigned long, arch_freq_scale); static inline unsigned long topology_get_freq_scale(int cpu) @@ -92,6 +99,7 @@ void update_siblings_masks(unsigned int cpu); void remove_cpu_topology(unsigned int cpuid); void reset_cpu_topology(void); int parse_acpi_topology(void); +void freq_inv_set_max_ratio(int cpu, u64 max_rate); #endif #endif /* _LINUX_ARCH_TOPOLOGY_H_ */ diff --git a/include/linux/cpufreq.h b/include/linux/cpufreq.h index 1c5ca92a0555..afda5f24d3dd 100644 --- a/include/linux/cpufreq.h +++ b/include/linux/cpufreq.h @@ -1203,6 +1203,7 @@ void arch_set_freq_scale(const struct cpumask *cpus, { } #endif + /* the following are really really optional */ extern struct freq_attr cpufreq_freq_attr_scaling_available_freqs; extern struct freq_attr cpufreq_freq_attr_scaling_boost_freqs; diff --git a/include/linux/energy_model.h b/include/linux/energy_model.h index b9caa01dfac4..88d91e087471 100644 --- a/include/linux/energy_model.h +++ b/include/linux/energy_model.h @@ -224,7 +224,7 @@ static inline unsigned long em_cpu_energy(struct em_perf_domain *pd, unsigned long max_util, unsigned long sum_util, unsigned long allowed_cpu_cap) { - unsigned long freq, scale_cpu; + unsigned long freq, ref_freq, scale_cpu; struct em_perf_state *ps; int cpu; @@ -241,11 +241,10 @@ static inline unsigned long em_cpu_energy(struct em_perf_domain *pd, */ cpu = cpumask_first(to_cpumask(pd->cpus)); scale_cpu = arch_scale_cpu_capacity(cpu); - ps = &pd->table[pd->nr_perf_states - 1]; + ref_freq = arch_scale_freq_ref(cpu); - max_util = map_util_perf(max_util); max_util = min(max_util, allowed_cpu_cap); - freq = map_util_freq(max_util, ps->frequency, scale_cpu); + freq = map_util_freq(max_util, ref_freq, scale_cpu); /* * Find the lowest performance state of the Energy Model above the diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 957ce38768b2..950df415d7de 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -600,6 +600,9 @@ struct vma_numab_state { */ unsigned long pids_active[2]; + /* MM scan sequence ID when scan first started after VMA creation */ + int start_scan_seq; + /* * MM scan sequence ID when the VMA was last completely scanned. * A VMA is not eligible for scanning if prev_scan_seq == numa_scan_seq diff --git a/include/linux/sched.h b/include/linux/sched.h index 292c31697248..03bfe9ab2951 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -63,11 +63,13 @@ struct robust_list_head; struct root_domain; struct rq; struct sched_attr; +struct sched_dl_entity; struct seq_file; struct sighand_struct; struct signal_struct; struct task_delay_info; struct task_group; +struct task_struct; struct user_event_mm; /* @@ -413,42 +415,6 @@ struct load_weight { u32 inv_weight; }; -/** - * struct util_est - Estimation utilization of FAIR tasks - * @enqueued: instantaneous estimated utilization of a task/cpu - * @ewma: the Exponential Weighted Moving Average (EWMA) - * utilization of a task - * - * Support data structure to track an Exponential Weighted Moving Average - * (EWMA) of a FAIR task's utilization. New samples are added to the moving - * average each time a task completes an activation. Sample's weight is chosen - * so that the EWMA will be relatively insensitive to transient changes to the - * task's workload. - * - * The enqueued attribute has a slightly different meaning for tasks and cpus: - * - task: the task's util_avg at last task dequeue time - * - cfs_rq: the sum of util_est.enqueued for each RUNNABLE task on that CPU - * Thus, the util_est.enqueued of a task represents the contribution on the - * estimated utilization of the CPU where that task is currently enqueued. - * - * Only for tasks we track a moving average of the past instantaneous - * estimated utilization. This allows to absorb sporadic drops in utilization - * of an otherwise almost periodic task. - * - * The UTIL_AVG_UNCHANGED flag is used to synchronize util_est with util_avg - * updates. When a task is dequeued, its util_est should not be updated if its - * util_avg has not been updated in the meantime. - * This information is mapped into the MSB bit of util_est.enqueued at dequeue - * time. Since max value of util_est.enqueued for a task is 1024 (PELT util_avg - * for a task) it is safe to use MSB. - */ -struct util_est { - unsigned int enqueued; - unsigned int ewma; -#define UTIL_EST_WEIGHT_SHIFT 2 -#define UTIL_AVG_UNCHANGED 0x80000000 -} __attribute__((__aligned__(sizeof(u64)))); - /* * The load/runnable/util_avg accumulates an infinite geometric series * (see __update_load_avg_cfs_rq() in kernel/sched/pelt.c). @@ -503,9 +469,20 @@ struct sched_avg { unsigned long load_avg; unsigned long runnable_avg; unsigned long util_avg; - struct util_est util_est; + unsigned int util_est; } ____cacheline_aligned; +/* + * The UTIL_AVG_UNCHANGED flag is used to synchronize util_est with util_avg + * updates. When a task is dequeued, its util_est should not be updated if its + * util_avg has not been updated in the meantime. + * This information is mapped into the MSB bit of util_est at dequeue time. + * Since max value of util_est for a task is 1024 (PELT util_avg for a task) + * it is safe to use MSB. + */ +#define UTIL_EST_WEIGHT_SHIFT 2 +#define UTIL_AVG_UNCHANGED 0x80000000 + struct sched_statistics { #ifdef CONFIG_SCHEDSTATS u64 wait_start; @@ -523,7 +500,7 @@ struct sched_statistics { u64 block_max; s64 sum_block_runtime; - u64 exec_max; + s64 exec_max; u64 slice_max; u64 nr_migrations_cold; @@ -553,7 +530,7 @@ struct sched_entity { struct load_weight load; struct rb_node run_node; u64 deadline; - u64 min_deadline; + u64 min_vruntime; struct list_head group_node; unsigned int on_rq; @@ -607,6 +584,9 @@ struct sched_rt_entity { #endif } __randomize_layout; +typedef bool (*dl_server_has_tasks_f)(struct sched_dl_entity *); +typedef struct task_struct *(*dl_server_pick_f)(struct sched_dl_entity *); + struct sched_dl_entity { struct rb_node rb_node; @@ -654,6 +634,7 @@ struct sched_dl_entity { unsigned int dl_yielded : 1; unsigned int dl_non_contending : 1; unsigned int dl_overrun : 1; + unsigned int dl_server : 1; /* * Bandwidth enforcement timer. Each -deadline task has its @@ -668,7 +649,20 @@ struct sched_dl_entity { * timer is needed to decrease the active utilization at the correct * time. */ - struct hrtimer inactive_timer; + struct hrtimer inactive_timer; + + /* + * Bits for DL-server functionality. Also see the comment near + * dl_server_update(). + * + * @rq the runqueue this server is for + * + * @server_has_tasks() returns true if @server_pick return a + * runnable task. + */ + struct rq *rq; + dl_server_has_tasks_f server_has_tasks; + dl_server_pick_f server_pick; #ifdef CONFIG_RT_MUTEXES /* @@ -795,6 +789,7 @@ struct task_struct { struct sched_entity se; struct sched_rt_entity rt; struct sched_dl_entity dl; + struct sched_dl_entity *dl_server; const struct sched_class *sched_class; #ifdef CONFIG_SCHED_CORE diff --git a/include/linux/sched/topology.h b/include/linux/sched/topology.h index de545ba85218..a6e04b4a21d7 100644 --- a/include/linux/sched/topology.h +++ b/include/linux/sched/topology.h @@ -279,6 +279,14 @@ void arch_update_thermal_pressure(const struct cpumask *cpus, { } #endif +#ifndef arch_scale_freq_ref +static __always_inline +unsigned int arch_scale_freq_ref(int cpu) +{ + return 0; +} +#endif + static inline int task_node(const struct task_struct *p) { return cpu_to_node(task_cpu(p)); |