From 6fb45460615358157a6d3c990e74f9c1395247e2 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 9 Jun 2023 20:45:16 +0200 Subject: sched: Simplify tg_set_cfs_bandwidth() Use guards to reduce gotos and simplify control flow. Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Ingo Molnar --- include/linux/cpu.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/cpu.h b/include/linux/cpu.h index 0abd60a7987b..f19f56501809 100644 --- a/include/linux/cpu.h +++ b/include/linux/cpu.h @@ -153,6 +153,8 @@ static inline int remove_cpu(unsigned int cpu) { return -EPERM; } static inline void smp_shutdown_nonboot_cpus(unsigned int primary_cpu) { } #endif /* !CONFIG_HOTPLUG_CPU */ +DEFINE_LOCK_GUARD_0(cpus_read_lock, cpus_read_lock(), cpus_read_unlock()) + #ifdef CONFIG_PM_SLEEP_SMP extern int freeze_secondary_cpus(int primary); extern void thaw_secondary_cpus(void); -- cgit v1.2.3 From 4de7b17fd05d03fa919e8c47fc66122bd24d7b6c Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Mon, 21 Aug 2023 14:44:28 +0100 Subject: sched: Assert for_each_thread() is properly locked list_for_each_entry_rcu() takes an optional fourth argument which allows RCU to assert that the correct lock is held. Several callers of for_each_thread() rely on their caller to be holding the appropriate lock, so this is a useful assertion to include. Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Ingo Molnar Reviewed-by: Joel Fernandes (Google) Link: https://lore.kernel.org/r/20230821134428.2504912-1-willy@infradead.org --- include/linux/sched/signal.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/sched/signal.h b/include/linux/sched/signal.h index 0014d3adaf84..9610bad018a3 100644 --- a/include/linux/sched/signal.h +++ b/include/linux/sched/signal.h @@ -656,7 +656,8 @@ extern bool current_is_single_threaded(void); while ((t = next_thread(t)) != g) #define __for_each_thread(signal, t) \ - list_for_each_entry_rcu(t, &(signal)->thread_head, thread_node) + list_for_each_entry_rcu(t, &(signal)->thread_head, thread_node, \ + lockdep_is_held(&tasklist_lock)) #define for_each_thread(p, t) \ __for_each_thread((p)->signal, t) -- cgit v1.2.3 From b1f099b1cf51d553c510c6c8141c27d9ba7ea1fe Mon Sep 17 00:00:00 2001 From: Yury Norov Date: Sat, 19 Aug 2023 07:12:33 -0700 Subject: numa: Generalize numa_map_to_online_node() The function in fact searches the nearest node for a given one, based on the N_ONLINE state. This is a common pattern to search for the nearest node. This patch converts numa_map_to_online_node() to numa_nearest_node() so that others won't need to open-code the logic.
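[ Illustrative sketch only, not part of this patch: how a caller might use the generalized helper. The pick_nearby_node() wrapper and the N_MEMORY lookup are assumptions made for the example. ]

	#include <linux/numa.h>
	#include <linux/nodemask.h>

	/* Hypothetical caller: prefer the nearest node that has memory. */
	static int pick_nearby_node(int nid)
	{
		/* Old behaviour is preserved via the wrapper macro: */
		int online = numa_map_to_online_node(nid);	/* == numa_nearest_node(nid, N_ONLINE) */

		/* New capability: search by an arbitrary node state. */
		int with_mem = numa_nearest_node(nid, N_MEMORY);

		return with_mem != NUMA_NO_NODE ? with_mem : online;
	}
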
Signed-off-by: Yury Norov Signed-off-by: Ingo Molnar Cc: Mel Gorman Link: https://lore.kernel.org/r/20230819141239.287290-2-yury.norov@gmail.com --- include/linux/numa.h | 7 +++++-- mm/mempolicy.c | 18 +++++++++++------- 2 files changed, 16 insertions(+), 9 deletions(-) (limited to 'include/linux') diff --git a/include/linux/numa.h b/include/linux/numa.h index 59df211d051f..fb30a42f0700 100644 --- a/include/linux/numa.h +++ b/include/linux/numa.h @@ -25,7 +25,7 @@ #include /* Generic implementation available */ -int numa_map_to_online_node(int node); +int numa_nearest_node(int node, unsigned int state); #ifndef memory_add_physaddr_to_nid static inline int memory_add_physaddr_to_nid(u64 start) @@ -44,10 +44,11 @@ static inline int phys_to_target_node(u64 start) } #endif #else /* !CONFIG_NUMA */ -static inline int numa_map_to_online_node(int node) +static inline int numa_nearest_node(int node, unsigned int state) { return NUMA_NO_NODE; } + static inline int memory_add_physaddr_to_nid(u64 start) { return 0; @@ -58,6 +59,8 @@ static inline int phys_to_target_node(u64 start) } #endif +#define numa_map_to_online_node(node) numa_nearest_node(node, N_ONLINE) + #ifdef CONFIG_HAVE_ARCH_NODE_DEV_GROUP extern const struct attribute_group arch_node_dev_group; #endif diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 42b5567e3773..d4c0fff79758 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -131,22 +131,26 @@ static struct mempolicy default_policy = { static struct mempolicy preferred_node_policy[MAX_NUMNODES]; /** - * numa_map_to_online_node - Find closest online node + * numa_nearest_node - Find nearest node by state * @node: Node id to start the search + * @state: State to filter the search * - * Lookup the next closest node by distance if @nid is not online. + * Lookup the closest node by distance if @nid is not in state. * - * Return: this @node if it is online, otherwise the closest node by distance + * Return: this @node if it is in state, otherwise the closest node by distance */ -int numa_map_to_online_node(int node) +int numa_nearest_node(int node, unsigned int state) { int min_dist = INT_MAX, dist, n, min_node; - if (node == NUMA_NO_NODE || node_online(node)) + if (state >= NR_NODE_STATES) + return -EINVAL; + + if (node == NUMA_NO_NODE || node_state(node, state)) return node; min_node = node; - for_each_online_node(n) { + for_each_node_state(n, state) { dist = node_distance(node, n); if (dist < min_dist) { min_dist = dist; @@ -156,7 +160,7 @@ int numa_map_to_online_node(int node) return min_node; } -EXPORT_SYMBOL_GPL(numa_map_to_online_node); +EXPORT_SYMBOL_GPL(numa_nearest_node); struct mempolicy *get_task_policy(struct task_struct *p) { -- cgit v1.2.3 From 8ab63d418d4339d996f80d02a00dbce0aa3ff972 Mon Sep 17 00:00:00 2001 From: Yury Norov Date: Sat, 19 Aug 2023 07:12:36 -0700 Subject: sched/topology: Fix sched_numa_find_nth_cpu() in non-NUMA case When CONFIG_NUMA is enabled, sched_numa_find_nth_cpu() searches for a CPU in sched_domains_numa_masks. The masks includes only online CPUs, so effectively offline CPUs are skipped. When CONFIG_NUMA is disabled, the fallback function should be consistent. 
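[ Illustrative sketch only, not part of this patch: a hypothetical caller showing the intended semantics — the i-th CPU is taken from the intersection of the supplied mask and the online mask, so offline CPUs are skipped in both the NUMA and !NUMA builds. pick_nth_cpu() and the out-of-range check are assumptions made for the example. ]

	#include <linux/cpumask.h>
	#include <linux/errno.h>
	#include <linux/topology.h>

	/* Hypothetical caller: i-th usable CPU from @mask, preferring @node. */
	static int pick_nth_cpu(const struct cpumask *mask, int i, int node)
	{
		int cpu = sched_numa_find_nth_cpu(mask, i, node);

		/* Fewer than i+1 online CPUs in @mask: treat as "not found". */
		if (cpu >= nr_cpu_ids)
			return -ENODEV;

		return cpu;
	}
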
Fixes: cd7f55359c90 ("sched: add sched_numa_find_nth_cpu()") Signed-off-by: Yury Norov Signed-off-by: Ingo Molnar Cc: Mel Gorman Link: https://lore.kernel.org/r/20230819141239.287290-5-yury.norov@gmail.com --- include/linux/topology.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/topology.h b/include/linux/topology.h index fea32377f7c7..52f5850730b3 100644 --- a/include/linux/topology.h +++ b/include/linux/topology.h @@ -251,7 +251,7 @@ extern const struct cpumask *sched_numa_hop_mask(unsigned int node, unsigned int #else static __always_inline int sched_numa_find_nth_cpu(const struct cpumask *cpus, int cpu, int node) { - return cpumask_nth(cpu, cpus); + return cpumask_nth_and(cpu, cpus, cpu_online_mask); } static inline const struct cpumask * -- cgit v1.2.3 From fbaa6a181a4b1886cbf4214abdf9a2df68471510 Mon Sep 17 00:00:00 2001 From: Elliot Berman Date: Fri, 8 Sep 2023 15:49:15 -0700 Subject: sched/core: Remove ifdeffery for saved_state In preparation for freezer to also use saved_state, remove the CONFIG_PREEMPT_RT compilation guard around saved_state. On the arm64 platform I tested which did not have CONFIG_PREEMPT_RT, there was no statistically significant deviation by applying this patch. Test methodology: perf bench sched message -g 40 -l 40 Signed-off-by: Elliot Berman Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Ingo Molnar --- include/linux/sched.h | 2 -- kernel/sched/core.c | 8 ++------ 2 files changed, 2 insertions(+), 8 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sched.h b/include/linux/sched.h index 77f01ac385f7..dc37ae787e33 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -750,10 +750,8 @@ struct task_struct { #endif unsigned int __state; -#ifdef CONFIG_PREEMPT_RT /* saved state for "spinlock sleepers" */ unsigned int saved_state; -#endif /* * This begins the randomizable portion of task_struct. Only diff --git a/kernel/sched/core.c b/kernel/sched/core.c index f39482d6a6e6..49541e3c1295 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -2232,23 +2232,20 @@ int __task_state_match(struct task_struct *p, unsigned int state) if (READ_ONCE(p->__state) & state) return 1; -#ifdef CONFIG_PREEMPT_RT if (READ_ONCE(p->saved_state) & state) return -1; -#endif + return 0; } static __always_inline int task_state_match(struct task_struct *p, unsigned int state) { -#ifdef CONFIG_PREEMPT_RT /* * Serialize against current_save_and_set_rtlock_wait_state() and * current_restore_rtlock_saved_state(). */ guard(raw_spinlock_irq)(&p->pi_lock); -#endif return __task_state_match(p, state); } @@ -4038,7 +4035,6 @@ bool ttwu_state_match(struct task_struct *p, unsigned int state, int *success) *success = !!(match = __task_state_match(p, state)); -#ifdef CONFIG_PREEMPT_RT /* * Saved state preserves the task state across blocking on * an RT lock. If the state matches, set p::saved_state to @@ -4054,7 +4050,7 @@ bool ttwu_state_match(struct task_struct *p, unsigned int state, int *success) */ if (match < 0) p->saved_state = TASK_RUNNING; -#endif + return match > 0; } -- cgit v1.2.3 From 87c3a5893e865739ce78aa7192d36011022e0af7 Mon Sep 17 00:00:00 2001 From: Finn Thain Date: Fri, 15 Sep 2023 15:47:11 +1000 Subject: sched/core: Optimize in_task() and in_interrupt() a bit Except on x86, preempt_count is always accessed with READ_ONCE(). Repeated invocations in macros like irq_count() produce repeated loads. These redundant instructions appear in various fast paths. 
In the one shown below, for example, irq_count() is evaluated during kernel entry if !tick_nohz_full_cpu(smp_processor_id()).

0001ed0a <irq_enter_rcu>:
   1ed0a:	4e56 0000	linkw %fp,#0
   1ed0e:	200f		movel %sp,%d0
   1ed10:	0280 ffff e000	andil #-8192,%d0
   1ed16:	2040		moveal %d0,%a0
   1ed18:	2028 0008	movel %a0@(8),%d0
   1ed1c:	0680 0001 0000	addil #65536,%d0
   1ed22:	2140 0008	movel %d0,%a0@(8)
   1ed26:	082a 0001 000f	btst #1,%a2@(15)
   1ed2c:	670c		beqs 1ed3a
   1ed2e:	2028 0008	movel %a0@(8),%d0
   1ed32:	2028 0008	movel %a0@(8),%d0
   1ed36:	2028 0008	movel %a0@(8),%d0
   1ed3a:	4e5e		unlk %fp
   1ed3c:	4e75		rts

This patch doesn't prevent the pointless btst and beqs instructions above, but it does eliminate 2 of the 3 pointless move instructions here and elsewhere. On x86, preempt_count is per-cpu data and the problem does not arise, presumably because the compiler is free to optimize more effectively. This patch was tested on m68k and x86. I was expecting no changes to object code for x86 and mostly that's what I saw. However, there were a few places where code generation was perturbed for some reason. The performance issue addressed here is minor on uniprocessor m68k. I got a 0.01% improvement from this patch for a simple "find /sys -false" benchmark. For architectures and workloads susceptible to cache line bounce the improvement is expected to be larger. The only SMP architecture I have is x86, and as x86 is unaffected I have not done any further measurements. Fixes: 15115830c887 ("preempt: Cleanup the macro maze a bit") Signed-off-by: Finn Thain Signed-off-by: Ingo Molnar Link: https://lore.kernel.org/r/0a403120a682a525e6db2d81d1a3ffcc137c3742.1694756831.git.fthain@linux-m68k.org --- include/linux/preempt.h | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/preempt.h b/include/linux/preempt.h index 1424670df161..9aa6358a1a16 100644 --- a/include/linux/preempt.h +++ b/include/linux/preempt.h @@ -99,14 +99,21 @@ static __always_inline unsigned char interrupt_context_level(void) return level; } +/* + * These macro definitions avoid redundant invocations of preempt_count() + * because such invocations would result in redundant loads given that + * preempt_count() is commonly implemented with READ_ONCE().
+ */ + #define nmi_count() (preempt_count() & NMI_MASK) #define hardirq_count() (preempt_count() & HARDIRQ_MASK) #ifdef CONFIG_PREEMPT_RT # define softirq_count() (current->softirq_disable_cnt & SOFTIRQ_MASK) +# define irq_count() ((preempt_count() & (NMI_MASK | HARDIRQ_MASK)) | softirq_count()) #else # define softirq_count() (preempt_count() & SOFTIRQ_MASK) +# define irq_count() (preempt_count() & (NMI_MASK | HARDIRQ_MASK | SOFTIRQ_MASK)) #endif -#define irq_count() (nmi_count() | hardirq_count() | softirq_count()) /* * Macros to retrieve the current execution context: @@ -119,7 +126,11 @@ static __always_inline unsigned char interrupt_context_level(void) #define in_nmi() (nmi_count()) #define in_hardirq() (hardirq_count()) #define in_serving_softirq() (softirq_count() & SOFTIRQ_OFFSET) -#define in_task() (!(in_nmi() | in_hardirq() | in_serving_softirq())) +#ifdef CONFIG_PREEMPT_RT +# define in_task() (!((preempt_count() & (NMI_MASK | HARDIRQ_MASK)) | in_serving_softirq())) +#else +# define in_task() (!(preempt_count() & (NMI_MASK | HARDIRQ_MASK | SOFTIRQ_OFFSET))) +#endif /* * The following macros are deprecated and should not be used in new code: -- cgit v1.2.3 From 3ba78da711940ce07c39c4cdd1f4ad284067a42d Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Sun, 6 Jun 2021 13:27:15 +0200 Subject: sched/headers: Add header guard to <linux/sched/deadline.h> It's the only non-trivial header in include/linux/sched/ missing a header guard. Signed-off-by: Ingo Molnar --- include/linux/sched/deadline.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include/linux') diff --git a/include/linux/sched/deadline.h b/include/linux/sched/deadline.h index 7c83d4d5a971..df3aca89d4f5 100644 --- a/include/linux/sched/deadline.h +++ b/include/linux/sched/deadline.h @@ -1,4 +1,6 @@ /* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _LINUX_SCHED_DEADLINE_H +#define _LINUX_SCHED_DEADLINE_H /* * SCHED_DEADLINE tasks has negative priorities, reflecting @@ -34,3 +36,5 @@ extern void dl_add_task_root_domain(struct task_struct *p); extern void dl_clear_root_domain(struct root_domain *rd); #endif /* CONFIG_SMP */ + +#endif /* _LINUX_SCHED_DEADLINE_H */ -- cgit v1.2.3 From 6eddb116dd830436afbd922568292867de6c8b9e Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Thu, 21 Sep 2023 11:24:17 +0200 Subject: sched/headers: Standardize the header guard name Use the same _LINUX_SCHED_ prefix nomenclature as the other 29 header guards in include/linux/sched/ do.
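[ Illustrative sketch only, not part of this patch: the guard naming pattern being standardized, shown on a hypothetical header <linux/sched/foo.h>. ]

	/* SPDX-License-Identifier: GPL-2.0 */
	#ifndef _LINUX_SCHED_FOO_H
	#define _LINUX_SCHED_FOO_H

	struct foo;	/* hypothetical contents */

	#endif /* _LINUX_SCHED_FOO_H */
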
Signed-off-by: Ingo Molnar --- include/linux/sched/vhost_task.h | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sched/vhost_task.h b/include/linux/sched/vhost_task.h index 837a23624a66..bc60243d43b3 100644 --- a/include/linux/sched/vhost_task.h +++ b/include/linux/sched/vhost_task.h @@ -1,7 +1,6 @@ /* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _LINUX_VHOST_TASK_H -#define _LINUX_VHOST_TASK_H - +#ifndef _LINUX_SCHED_VHOST_TASK_H +#define _LINUX_SCHED_VHOST_TASK_H struct vhost_task; @@ -11,4 +10,4 @@ void vhost_task_start(struct vhost_task *vtsk); void vhost_task_stop(struct vhost_task *vtsk); void vhost_task_wake(struct vhost_task *vtsk); -#endif +#endif /* _LINUX_SCHED_VHOST_TASK_H */ -- cgit v1.2.3 From 0f9a1a4d234c064d8dff69cf3f3755554dd479ec Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Thu, 21 Sep 2023 11:27:37 +0200 Subject: sched/headers: Standardize the header guard #endif Signed-off-by: Ingo Molnar --- include/linux/sched/types.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/sched/types.h b/include/linux/sched/types.h index 3c3e049224ae..969aaf5ef9d6 100644 --- a/include/linux/sched/types.h +++ b/include/linux/sched/types.h @@ -20,4 +20,4 @@ struct task_cputime { unsigned long long sum_exec_runtime; }; -#endif +#endif /* _LINUX_SCHED_TYPES_H */ -- cgit v1.2.3 From 1632d47fae2f2d229dd432854c4443ebb0bb27a4 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Thu, 21 Sep 2023 11:28:48 +0200 Subject: sched/headers: Standardize the header guard #endif Signed-off-by: Ingo Molnar --- include/linux/sched/smt.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/sched/smt.h b/include/linux/sched/smt.h index 59d3736c454c..fb1e295e7e63 100644 --- a/include/linux/sched/smt.h +++ b/include/linux/sched/smt.h @@ -17,4 +17,4 @@ static inline bool sched_smt_active(void) { return false; } void arch_smt_update(void); -#endif +#endif /* _LINUX_SCHED_SMT_H */ -- cgit v1.2.3 From 8bf0cdfac7f8aa3fa6151b5c5f5eebdb44a64e89 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Thu, 21 Sep 2023 11:32:58 +0200 Subject: <linux/list.h>: Introduce the list_for_each_reverse() method The list_head counterpart of list_for_each_entry_reverse() was missing, add it to complete the list handling APIs in <linux/list.h>. [ This new API is also relied on by a WIP scheduler patch, so this variant is not a theoretical possibility only. ] Signed-off-by: Ingo Molnar Cc: Linus Torvalds Cc: linux-kernel@vger.kernel.org --- include/linux/list.h | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'include/linux') diff --git a/include/linux/list.h b/include/linux/list.h index 164b4d0e9d2a..1837caedf723 100644 --- a/include/linux/list.h +++ b/include/linux/list.h @@ -686,6 +686,14 @@ static inline void list_splice_tail_init(struct list_head *list, #define list_for_each(pos, head) \ for (pos = (head)->next; !list_is_head(pos, (head)); pos = pos->next) +/** + * list_for_each_reverse - iterate backwards over a list + * @pos: the &struct list_head to use as a loop cursor. + * @head: the head for your list. + */ +#define list_for_each_reverse(pos, head) \ + for (pos = (head)->prev; pos != (head); pos = pos->prev) + /** * list_for_each_rcu - Iterate over a list in an RCU-safe fashion * @pos: the &struct list_head to use as a loop cursor.
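[ Illustrative usage sketch only, not part of this patch: the struct item type and dump_items_backwards() helper are invented for the example. ]

	#include <linux/list.h>
	#include <linux/printk.h>

	struct item {
		int val;
		struct list_head node;
	};

	/* Walk a list of items from tail to head. */
	static void dump_items_backwards(struct list_head *head)
	{
		struct list_head *pos;

		list_for_each_reverse(pos, head) {
			struct item *it = list_entry(pos, struct item, node);

			pr_info("item val=%d\n", it->val);
		}
	}
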
-- cgit v1.2.3 From d844fe65f0957024c3e1b0bf2a0615246184d9bc Mon Sep 17 00:00:00 2001 From: Kir Kolyshkin Date: Mon, 7 Aug 2023 20:03:57 -0700 Subject: sched/headers: Move 'struct sched_param' out of uapi, to work around glibc/musl breakage MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Both glibc and musl define 'struct sched_param' in sched.h, while the kernel has it in uapi/linux/sched/types.h, making it cumbersome to use sched_getattr(2) or sched_setattr(2) from userspace. For example, something like this:

	#include <sched.h>
	#include <linux/sched/types.h>

	struct sched_attr sa;

will result in "error: redefinition of ‘struct sched_param’" (note the code doesn't need sched_param at all -- it needs struct sched_attr plus some stuff from sched.h). The situation is, glibc is not going to provide a wrapper for sched_{get,set}attr, thus the need to include linux/sched/types.h directly, which leads to the above problem. Thus, userspace is left with a few sub-par choices when it wants to use e.g. sched_setattr(2), such as maintaining a copy of the struct sched_attr definition, or using some other ugly tricks. OTOH, 'struct sched_param' is well known, defined in POSIX, and it won't ever be changed (as that would break backward compatibility). So, while 'struct sched_param' is indeed part of the kernel uapi, exposing it the way it's done now creates an issue, and hiding it (like this patch does) fixes that issue, hopefully without creating another one: common userspace software relies on libc headers, and as for "special" software (like libc), it looks like glibc and musl do not rely on kernel headers for 'struct sched_param' definition (but let's Cc their mailing lists in case it's otherwise). The alternative to this patch would be to move struct sched_attr to, say, linux/sched.h, or linux/sched/attr.h (the new file). Oh, and here is the previous attempt to fix the issue: https://lore.kernel.org/all/20200528135552.GA87103@google.com/ While I support Linus' arguments, the issue is still here and needs to be fixed. [ mingo: Linus is right, this shouldn't be needed - but on the other hand I agree that this header is not really helpful to user-space as-is. So let's pretend that it is only about sched_attr, and call this commit a workaround for the user-space breakage that it in reality is ... Also, remove the Fixes tag.
] Signed-off-by: Kir Kolyshkin Signed-off-by: Ingo Molnar Cc: Linus Torvalds Cc: Peter Zijlstra Link: https://lore.kernel.org/r/20230808030357.1213829-1-kolyshkin@gmail.com --- include/linux/sched.h | 5 ++++- include/uapi/linux/sched/types.h | 4 ---- 2 files changed, 4 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sched.h b/include/linux/sched.h index dc37ae787e33..e4235bbfad77 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -63,7 +63,6 @@ struct robust_list_head; struct root_domain; struct rq; struct sched_attr; -struct sched_param; struct seq_file; struct sighand_struct; struct signal_struct; @@ -370,6 +369,10 @@ extern struct root_domain def_root_domain; extern struct mutex sched_domains_mutex; #endif +struct sched_param { + int sched_priority; +}; + struct sched_info { #ifdef CONFIG_SCHED_INFO /* Cumulative counters: */ diff --git a/include/uapi/linux/sched/types.h b/include/uapi/linux/sched/types.h index f2c4589d4dbf..90662385689b 100644 --- a/include/uapi/linux/sched/types.h +++ b/include/uapi/linux/sched/types.h @@ -4,10 +4,6 @@ #include -struct sched_param { - int sched_priority; -}; - #define SCHED_ATTR_SIZE_VER0 48 /* sizeof first published struct */ #define SCHED_ATTR_SIZE_VER1 56 /* add: util_{min,max} */ -- cgit v1.2.3 From 9ae5c00ea2e600a8b823f9b95606dd244f3096bf Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Tue, 10 Oct 2023 09:31:38 +0100 Subject: sched/numa: Document vma_numab_state fields Document the intended usage of the fields. [ mingo: Reformatted to take less vertical space & tidied it up. ] Signed-off-by: Mel Gorman Signed-off-by: Ingo Molnar Link: https://lore.kernel.org/r/20231010083143.19593-2-mgorman@techsingularity.net --- include/linux/mm_types.h | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 36c5b43999e6..d7f042ec1f33 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -551,8 +551,29 @@ struct vma_lock { }; struct vma_numab_state { + /* + * Initialised as time in 'jiffies' after which VMA + * should be scanned. Delays first scan of new VMA by at + * least sysctl_numa_balancing_scan_delay: + */ unsigned long next_scan; + + /* + * Time in jiffies when access_pids[] is reset to + * detect phase change behaviour: + */ unsigned long next_pid_reset; + + /* + * Approximate tracking of PIDs that trapped a NUMA hinting + * fault. May produce false positives due to hash collisions. + * + * [0] Previous PID tracking + * [1] Current PID tracking + * + * Window moves after next_pid_reset has expired approximately + * every VMA_PID_RESET_PERIOD jiffies: + */ unsigned long access_pids[2]; }; -- cgit v1.2.3 From f3a6c97940fbd25d6c84c2d5642338fc99a9b35b Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Tue, 10 Oct 2023 09:31:39 +0100 Subject: sched/numa: Rename vma_numab_state::access_pids[] => ::pids_active[], ::next_pid_reset => ::pids_active_reset The access_pids[] field name is somewhat ambiguous as no PIDs are accessed. Similarly, it's not clear that next_pid_reset is related to access_pids[]. Rename the fields to more accurately reflect their purpose. [ mingo: Rename in the comments too. 
] Signed-off-by: Mel Gorman Signed-off-by: Ingo Molnar Link: https://lore.kernel.org/r/20231010083143.19593-3-mgorman@techsingularity.net --- include/linux/mm.h | 4 ++-- include/linux/mm_types.h | 6 +++--- kernel/sched/fair.c | 12 ++++++------ 3 files changed, 11 insertions(+), 11 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mm.h b/include/linux/mm.h index bf5d0b1b16f4..19fc73b02c9f 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1726,8 +1726,8 @@ static inline void vma_set_access_pid_bit(struct vm_area_struct *vma) unsigned int pid_bit; pid_bit = hash_32(current->pid, ilog2(BITS_PER_LONG)); - if (vma->numab_state && !test_bit(pid_bit, &vma->numab_state->access_pids[1])) { - __set_bit(pid_bit, &vma->numab_state->access_pids[1]); + if (vma->numab_state && !test_bit(pid_bit, &vma->numab_state->pids_active[1])) { + __set_bit(pid_bit, &vma->numab_state->pids_active[1]); } } #else /* !CONFIG_NUMA_BALANCING */ diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index d7f042ec1f33..e7571eca1131 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -559,10 +559,10 @@ struct vma_numab_state { unsigned long next_scan; /* - * Time in jiffies when access_pids[] is reset to + * Time in jiffies when pids_active[] is reset to * detect phase change behaviour: */ - unsigned long next_pid_reset; + unsigned long pids_active_reset; /* * Approximate tracking of PIDs that trapped a NUMA hinting @@ -574,7 +574,7 @@ struct vma_numab_state { * Window moves after next_pid_reset has expired approximately * every VMA_PID_RESET_PERIOD jiffies: */ - unsigned long access_pids[2]; + unsigned long pids_active[2]; }; /* diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index e7c1bafc0460..6b47edcbe834 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -3125,7 +3125,7 @@ static bool vma_is_accessed(struct vm_area_struct *vma) if (READ_ONCE(current->mm->numa_scan_seq) < 2) return true; - pids = vma->numab_state->access_pids[0] | vma->numab_state->access_pids[1]; + pids = vma->numab_state->pids_active[0] | vma->numab_state->pids_active[1]; return test_bit(hash_32(current->pid, ilog2(BITS_PER_LONG)), &pids); } @@ -3241,7 +3241,7 @@ static void task_numa_work(struct callback_head *work) msecs_to_jiffies(sysctl_numa_balancing_scan_delay); /* Reset happens after 4 times scan delay of scan start */ - vma->numab_state->next_pid_reset = vma->numab_state->next_scan + + vma->numab_state->pids_active_reset = vma->numab_state->next_scan + msecs_to_jiffies(VMA_PID_RESET_PERIOD); } @@ -3262,11 +3262,11 @@ static void task_numa_work(struct callback_head *work) * vma for recent access to avoid clearing PID info before access.. 
*/ if (mm->numa_scan_seq && - time_after(jiffies, vma->numab_state->next_pid_reset)) { - vma->numab_state->next_pid_reset = vma->numab_state->next_pid_reset + + time_after(jiffies, vma->numab_state->pids_active_reset)) { + vma->numab_state->pids_active_reset = vma->numab_state->pids_active_reset + msecs_to_jiffies(VMA_PID_RESET_PERIOD); - vma->numab_state->access_pids[0] = READ_ONCE(vma->numab_state->access_pids[1]); - vma->numab_state->access_pids[1] = 0; + vma->numab_state->pids_active[0] = READ_ONCE(vma->numab_state->pids_active[1]); + vma->numab_state->pids_active[1] = 0; } do { -- cgit v1.2.3 From ed2da8b725b932b1e2b2f4835bb664d47ed03031 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Tue, 10 Oct 2023 09:31:40 +0100 Subject: sched/numa: Trace decisions related to skipping VMAs NUMA balancing skips or scans VMAs for a variety of reasons. In preparation for completing scans of VMAs regardless of PID access, trace the reasons why a VMA was skipped. In a later patch, the tracing will be used to track if a VMA was forcibly scanned. Signed-off-by: Mel Gorman Signed-off-by: Ingo Molnar Link: https://lore.kernel.org/r/20231010083143.19593-4-mgorman@techsingularity.net --- include/linux/sched/numa_balancing.h | 8 ++++++ include/trace/events/sched.h | 50 ++++++++++++++++++++++++++++++++++++ kernel/sched/fair.c | 17 +++++++++--- 3 files changed, 71 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sched/numa_balancing.h b/include/linux/sched/numa_balancing.h index 3988762efe15..c127a1509e2f 100644 --- a/include/linux/sched/numa_balancing.h +++ b/include/linux/sched/numa_balancing.h @@ -15,6 +15,14 @@ #define TNF_FAULT_LOCAL 0x08 #define TNF_MIGRATE_FAIL 0x10 +enum numa_vmaskip_reason { + NUMAB_SKIP_UNSUITABLE, + NUMAB_SKIP_SHARED_RO, + NUMAB_SKIP_INACCESSIBLE, + NUMAB_SKIP_SCAN_DELAY, + NUMAB_SKIP_PID_INACTIVE, +}; + #ifdef CONFIG_NUMA_BALANCING extern void task_numa_fault(int last_node, int node, int pages, int flags); extern pid_t task_numa_group_id(struct task_struct *p); diff --git a/include/trace/events/sched.h b/include/trace/events/sched.h index a13d5d06be9d..d82a04d6a1bc 100644 --- a/include/trace/events/sched.h +++ b/include/trace/events/sched.h @@ -664,6 +664,56 @@ DEFINE_EVENT(sched_numa_pair_template, sched_swap_numa, TP_ARGS(src_tsk, src_cpu, dst_tsk, dst_cpu) ); +#ifdef CONFIG_NUMA_BALANCING +#define NUMAB_SKIP_REASON \ + EM( NUMAB_SKIP_UNSUITABLE, "unsuitable" ) \ + EM( NUMAB_SKIP_SHARED_RO, "shared_ro" ) \ + EM( NUMAB_SKIP_INACCESSIBLE, "inaccessible" ) \ + EM( NUMAB_SKIP_SCAN_DELAY, "scan_delay" ) \ + EMe(NUMAB_SKIP_PID_INACTIVE, "pid_inactive" ) + +/* Redefine for export. */ +#undef EM +#undef EMe +#define EM(a, b) TRACE_DEFINE_ENUM(a); +#define EMe(a, b) TRACE_DEFINE_ENUM(a); + +NUMAB_SKIP_REASON + +/* Redefine for symbolic printing. 
*/ +#undef EM +#undef EMe +#define EM(a, b) { a, b }, +#define EMe(a, b) { a, b } + +TRACE_EVENT(sched_skip_vma_numa, + + TP_PROTO(struct mm_struct *mm, struct vm_area_struct *vma, + enum numa_vmaskip_reason reason), + + TP_ARGS(mm, vma, reason), + + TP_STRUCT__entry( + __field(unsigned long, numa_scan_offset) + __field(unsigned long, vm_start) + __field(unsigned long, vm_end) + __field(enum numa_vmaskip_reason, reason) + ), + + TP_fast_assign( + __entry->numa_scan_offset = mm->numa_scan_offset; + __entry->vm_start = vma->vm_start; + __entry->vm_end = vma->vm_end; + __entry->reason = reason; + ), + + TP_printk("numa_scan_offset=%lX vm_start=%lX vm_end=%lX reason=%s", + __entry->numa_scan_offset, + __entry->vm_start, + __entry->vm_end, + __print_symbolic(__entry->reason, NUMAB_SKIP_REASON)) +); +#endif /* CONFIG_NUMA_BALANCING */ /* * Tracepoint for waking a polling cpu without an IPI. diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 6b47edcbe834..31cfdb0794fb 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -3210,6 +3210,7 @@ static void task_numa_work(struct callback_head *work) do { if (!vma_migratable(vma) || !vma_policy_mof(vma) || is_vm_hugetlb_page(vma) || (vma->vm_flags & VM_MIXEDMAP)) { + trace_sched_skip_vma_numa(mm, vma, NUMAB_SKIP_UNSUITABLE); continue; } @@ -3220,15 +3221,19 @@ static void task_numa_work(struct callback_head *work) * as migrating the pages will be of marginal benefit. */ if (!vma->vm_mm || - (vma->vm_file && (vma->vm_flags & (VM_READ|VM_WRITE)) == (VM_READ))) + (vma->vm_file && (vma->vm_flags & (VM_READ|VM_WRITE)) == (VM_READ))) { + trace_sched_skip_vma_numa(mm, vma, NUMAB_SKIP_SHARED_RO); continue; + } /* * Skip inaccessible VMAs to avoid any confusion between * PROT_NONE and NUMA hinting ptes */ - if (!vma_is_accessible(vma)) + if (!vma_is_accessible(vma)) { + trace_sched_skip_vma_numa(mm, vma, NUMAB_SKIP_INACCESSIBLE); continue; + } /* Initialise new per-VMA NUMAB state. */ if (!vma->numab_state) { @@ -3250,12 +3255,16 @@ static void task_numa_work(struct callback_head *work) * delay the scan for new VMAs. */ if (mm->numa_scan_seq && time_before(jiffies, - vma->numab_state->next_scan)) + vma->numab_state->next_scan)) { + trace_sched_skip_vma_numa(mm, vma, NUMAB_SKIP_SCAN_DELAY); continue; + } /* Do not scan the VMA if task has not accessed */ - if (!vma_is_accessed(vma)) + if (!vma_is_accessed(vma)) { + trace_sched_skip_vma_numa(mm, vma, NUMAB_SKIP_PID_INACTIVE); continue; + } /* * RESET access PIDs regularly for old VMAs. Resetting after checking -- cgit v1.2.3 From b7a5b537c55c088d891ae554103d1b281abef781 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Tue, 10 Oct 2023 09:31:42 +0100 Subject: sched/numa: Complete scanning of partial VMAs regardless of PID activity NUMA Balancing skips VMAs when the current task has not trapped a NUMA fault within the VMA. If the VMA is skipped then mm->numa_scan_offset advances and a task that is trapping faults within the VMA may never fully update PTEs within the VMA. Force tasks to update PTEs for partially scanned PTEs. The VMA will be tagged for NUMA hints by some task but this removes some of the benefit of tracking PID activity within a VMA. A follow-on patch will mitigate this problem. The test cases and machines evaluated did not trigger the corner case so the performance results are neutral with only small changes within the noise from normal test-to-test variance. However, the next patch makes the corner case easier to trigger. 
Signed-off-by: Mel Gorman Signed-off-by: Ingo Molnar Tested-by: Raghavendra K T Link: https://lore.kernel.org/r/20231010083143.19593-6-mgorman@techsingularity.net --- include/linux/sched/numa_balancing.h | 1 + include/trace/events/sched.h | 3 ++- kernel/sched/fair.c | 18 +++++++++++++++--- 3 files changed, 18 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sched/numa_balancing.h b/include/linux/sched/numa_balancing.h index c127a1509e2f..7dcc0bdfddbb 100644 --- a/include/linux/sched/numa_balancing.h +++ b/include/linux/sched/numa_balancing.h @@ -21,6 +21,7 @@ enum numa_vmaskip_reason { NUMAB_SKIP_INACCESSIBLE, NUMAB_SKIP_SCAN_DELAY, NUMAB_SKIP_PID_INACTIVE, + NUMAB_SKIP_IGNORE_PID, }; #ifdef CONFIG_NUMA_BALANCING diff --git a/include/trace/events/sched.h b/include/trace/events/sched.h index d82a04d6a1bc..bfc07c10541a 100644 --- a/include/trace/events/sched.h +++ b/include/trace/events/sched.h @@ -670,7 +670,8 @@ DEFINE_EVENT(sched_numa_pair_template, sched_swap_numa, EM( NUMAB_SKIP_SHARED_RO, "shared_ro" ) \ EM( NUMAB_SKIP_INACCESSIBLE, "inaccessible" ) \ EM( NUMAB_SKIP_SCAN_DELAY, "scan_delay" ) \ - EMe(NUMAB_SKIP_PID_INACTIVE, "pid_inactive" ) + EM( NUMAB_SKIP_PID_INACTIVE, "pid_inactive" ) \ + EMe(NUMAB_SKIP_IGNORE_PID, "ignore_pid_inactive" ) /* Redefine for export. */ #undef EM diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index ce36969625bd..ab79013f6e91 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -3113,7 +3113,7 @@ static void reset_ptenuma_scan(struct task_struct *p) p->mm->numa_scan_offset = 0; } -static bool vma_is_accessed(struct vm_area_struct *vma) +static bool vma_is_accessed(struct mm_struct *mm, struct vm_area_struct *vma) { unsigned long pids; /* @@ -3126,7 +3126,19 @@ static bool vma_is_accessed(struct vm_area_struct *vma) return true; pids = vma->numab_state->pids_active[0] | vma->numab_state->pids_active[1]; - return test_bit(hash_32(current->pid, ilog2(BITS_PER_LONG)), &pids); + if (test_bit(hash_32(current->pid, ilog2(BITS_PER_LONG)), &pids)) + return true; + + /* + * Complete a scan that has already started regardless of PID access, or + * some VMAs may never be scanned in multi-threaded applications: + */ + if (mm->numa_scan_offset > vma->vm_start) { + trace_sched_skip_vma_numa(mm, vma, NUMAB_SKIP_IGNORE_PID); + return true; + } + + return false; } #define VMA_PID_RESET_PERIOD (4 * sysctl_numa_balancing_scan_delay) @@ -3270,7 +3282,7 @@ static void task_numa_work(struct callback_head *work) } /* Do not scan the VMA if task has not accessed */ - if (!vma_is_accessed(vma)) { + if (!vma_is_accessed(mm, vma)) { trace_sched_skip_vma_numa(mm, vma, NUMAB_SKIP_PID_INACTIVE); continue; } -- cgit v1.2.3 From f169c62ff7cd1acf8bac8ae17bfeafa307d9e6fa Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Tue, 10 Oct 2023 09:31:43 +0100 Subject: sched/numa: Complete scanning of inactive VMAs when there is no alternative VMAs are skipped if there is no recent fault activity but this represents a chicken-and-egg problem as there may be no fault activity if the PTEs are never updated to trap NUMA hints. There is an indirect reliance on scanning to be forced early in the lifetime of a task but this may fail to detect changes in phase behaviour. Force inactive VMAs to be scanned when all other eligible VMAs have been updated within the same scan sequence. 
Test results in general look good with some changes in performance, both negative and positive, depending on whether the additional scanning and faulting was beneficial or not to the workload. The autonuma benchmark workload NUMA01_THREADLOCAL was picked for closer examination. The workload creates two processes with numerous threads and thread-local storage that is zero-filled in a loop. It exercises the corner case where unrelated threads may skip VMAs that are thread-local to another thread and still has some VMAs that are inactive while the workload executes. The VMA skipping activity frequency with and without the patch:

6.6.0-rc2-sched-numabtrace-v1
=============================
    649 reason=scan_delay
  9,094 reason=unsuitable
 48,915 reason=shared_ro
143,919 reason=inaccessible
193,050 reason=pid_inactive

6.6.0-rc2-sched-numabselective-v1
=============================
    146 reason=seq_completed
    622 reason=ignore_pid_inactive
    624 reason=scan_delay
  6,570 reason=unsuitable
 16,101 reason=shared_ro
 27,608 reason=inaccessible
 41,939 reason=pid_inactive

Note that with the patch applied, the PID activity is ignored (ignore_pid_inactive) to ensure a VMA with some activity is completely scanned. In addition, a small number of VMAs are scanned when no other eligible VMA is available during a single scan window (seq_completed). The number of times a VMA is skipped due to no PID activity from the scanning task (pid_inactive) drops dramatically. It is expected that this will increase the number of PTEs updated for NUMA hinting faults as well as hinting faults but these represent PTEs that would otherwise have been missed. The tradeoff is scan+fault overhead versus improving locality due to migration. On a 2-socket Cascade Lake test machine, the time to complete the workload is as follows:

                                        6.6.0-rc2              6.6.0-rc2
                              sched-numabtrace-v1 sched-numabselective-v1
Min       elsp-NUMA01_THREADLOCAL   174.22 (  0.00%)      117.64 ( 32.48%)
Amean     elsp-NUMA01_THREADLOCAL   175.68 (  0.00%)      123.34 * 29.79%*
Stddev    elsp-NUMA01_THREADLOCAL     1.20 (  0.00%)        4.06 (-238.20%)
CoeffVar  elsp-NUMA01_THREADLOCAL     0.68 (  0.00%)        3.29 (-381.70%)
Max       elsp-NUMA01_THREADLOCAL   177.18 (  0.00%)      128.03 ( 27.74%)

The time to complete the workload is reduced by almost 30%:

                           6.6.0-rc2              6.6.0-rc2
                 sched-numabtrace-v1 sched-numabselective-v1 /
Duration User         91201.80               63506.64
Duration System        2015.53                1819.78
Duration Elapsed       1234.77                 868.37

In this specific case, system CPU time was not increased but it's not universally true. From vmstat, the NUMA scanning and fault activity is as follows:

                                         6.6.0-rc2              6.6.0-rc2
                               sched-numabtrace-v1 sched-numabselective-v1
Ops NUMA base-page range updates      64272.00            26374386.00
Ops NUMA PTE updates                  36624.00               55538.00
Ops NUMA PMD updates                     54.00               51404.00
Ops NUMA hint faults                  15504.00               75786.00
Ops NUMA hint local faults %          14860.00               56763.00
Ops NUMA hint local percent              95.85                  74.90
Ops NUMA pages migrated                1629.00             6469222.00

Both the number of PTE updates and hint faults are dramatically increased. While this is superficially unfortunate, it represents ranges that were simply skipped without the patch. As a result of the scanning and hinting faults, many more pages were also migrated but as the time to completion is reduced, the overhead is offset by the gain.
Signed-off-by: Mel Gorman Signed-off-by: Ingo Molnar Tested-by: Raghavendra K T Link: https://lore.kernel.org/r/20231010083143.19593-7-mgorman@techsingularity.net --- include/linux/mm_types.h | 6 ++++ include/linux/sched/numa_balancing.h | 1 + include/trace/events/sched.h | 3 +- kernel/sched/fair.c | 55 ++++++++++++++++++++++++++++++++++-- 4 files changed, 61 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index e7571eca1131..589f31ef2e84 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -575,6 +575,12 @@ struct vma_numab_state { * every VMA_PID_RESET_PERIOD jiffies: */ unsigned long pids_active[2]; + + /* + * MM scan sequence ID when the VMA was last completely scanned. + * A VMA is not eligible for scanning if prev_scan_seq == numa_scan_seq + */ + int prev_scan_seq; }; /* diff --git a/include/linux/sched/numa_balancing.h b/include/linux/sched/numa_balancing.h index 7dcc0bdfddbb..b69afb8630db 100644 --- a/include/linux/sched/numa_balancing.h +++ b/include/linux/sched/numa_balancing.h @@ -22,6 +22,7 @@ enum numa_vmaskip_reason { NUMAB_SKIP_SCAN_DELAY, NUMAB_SKIP_PID_INACTIVE, NUMAB_SKIP_IGNORE_PID, + NUMAB_SKIP_SEQ_COMPLETED, }; #ifdef CONFIG_NUMA_BALANCING diff --git a/include/trace/events/sched.h b/include/trace/events/sched.h index bfc07c10541a..6188ad0d9e0d 100644 --- a/include/trace/events/sched.h +++ b/include/trace/events/sched.h @@ -671,7 +671,8 @@ DEFINE_EVENT(sched_numa_pair_template, sched_swap_numa, EM( NUMAB_SKIP_INACCESSIBLE, "inaccessible" ) \ EM( NUMAB_SKIP_SCAN_DELAY, "scan_delay" ) \ EM( NUMAB_SKIP_PID_INACTIVE, "pid_inactive" ) \ - EMe(NUMAB_SKIP_IGNORE_PID, "ignore_pid_inactive" ) + EM( NUMAB_SKIP_IGNORE_PID, "ignore_pid_inactive" ) \ + EMe(NUMAB_SKIP_SEQ_COMPLETED, "seq_completed" ) /* Redefine for export. */ #undef EM diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index ab79013f6e91..922905194c0c 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -3158,6 +3158,8 @@ static void task_numa_work(struct callback_head *work) unsigned long nr_pte_updates = 0; long pages, virtpages; struct vma_iterator vmi; + bool vma_pids_skipped; + bool vma_pids_forced = false; SCHED_WARN_ON(p != container_of(work, struct task_struct, numa_work)); @@ -3200,7 +3202,6 @@ static void task_numa_work(struct callback_head *work) */ p->node_stamp += 2 * TICK_NSEC; - start = mm->numa_scan_offset; pages = sysctl_numa_balancing_scan_size; pages <<= 20 - PAGE_SHIFT; /* MB in pages */ virtpages = pages * 8; /* Scan up to this much virtual space */ @@ -3210,6 +3211,16 @@ static void task_numa_work(struct callback_head *work) if (!mmap_read_trylock(mm)) return; + + /* + * VMAs are skipped if the current PID has not trapped a fault within + * the VMA recently. Allow scanning to be forced if there is no + * suitable VMA remaining. 
+ */ + vma_pids_skipped = false; + +retry_pids: + start = mm->numa_scan_offset; vma_iter_init(&vmi, mm, start); vma = vma_next(&vmi); if (!vma) { @@ -3260,6 +3271,13 @@ static void task_numa_work(struct callback_head *work) /* Reset happens after 4 times scan delay of scan start */ vma->numab_state->pids_active_reset = vma->numab_state->next_scan + msecs_to_jiffies(VMA_PID_RESET_PERIOD); + + /* + * Ensure prev_scan_seq does not match numa_scan_seq, + * to prevent VMAs being skipped prematurely on the + * first scan: + */ + vma->numab_state->prev_scan_seq = mm->numa_scan_seq - 1; } /* @@ -3281,8 +3299,19 @@ static void task_numa_work(struct callback_head *work) vma->numab_state->pids_active[1] = 0; } - /* Do not scan the VMA if task has not accessed */ - if (!vma_is_accessed(mm, vma)) { + /* Do not rescan VMAs twice within the same sequence. */ + if (vma->numab_state->prev_scan_seq == mm->numa_scan_seq) { + mm->numa_scan_offset = vma->vm_end; + trace_sched_skip_vma_numa(mm, vma, NUMAB_SKIP_SEQ_COMPLETED); + continue; + } + + /* + * Do not scan the VMA if task has not accessed it, unless no other + * VMA candidate exists. + */ + if (!vma_pids_forced && !vma_is_accessed(mm, vma)) { + vma_pids_skipped = true; trace_sched_skip_vma_numa(mm, vma, NUMAB_SKIP_PID_INACTIVE); continue; } @@ -3311,8 +3340,28 @@ static void task_numa_work(struct callback_head *work) cond_resched(); } while (end != vma->vm_end); + + /* VMA scan is complete, do not scan until next sequence. */ + vma->numab_state->prev_scan_seq = mm->numa_scan_seq; + + /* + * Only force scan within one VMA at a time, to limit the + * cost of scanning a potentially uninteresting VMA. + */ + if (vma_pids_forced) + break; } for_each_vma(vmi, vma); + /* + * If no VMAs are remaining and VMAs were skipped due to the PID + * not accessing the VMA previously, then force a scan to ensure + * forward progress: + */ + if (!vma && !vma_pids_forced && vma_pids_skipped) { + vma_pids_forced = true; + goto retry_pids; + } + out: /* * It is possible to reach the end of the VMA list but the last few -- cgit v1.2.3 From b95303e0aeaf446b65169dd4142cacdaeb7d4c8b Mon Sep 17 00:00:00 2001 From: Barry Song Date: Thu, 19 Oct 2023 11:33:21 +0800 Subject: sched: Add cpus_share_resources API Add cpus_share_resources() API. This is the preparation for the optimization of select_idle_cpu() on platforms with cluster scheduler level. On a machine with clusters cpus_share_resources() will test whether two cpus are within the same cluster. On a non-cluster machine it will behaves the same as cpus_share_cache(). So we use "resources" here for cache resources. Signed-off-by: Barry Song Signed-off-by: Yicong Yang Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Gautham R. 
Shenoy Reviewed-by: Tim Chen Reviewed-by: Vincent Guittot Tested-and-reviewed-by: Chen Yu Tested-by: K Prateek Nayak Link: https://lkml.kernel.org/r/20231019033323.54147-2-yangyicong@huawei.com --- include/linux/sched/sd_flags.h | 7 +++++++ include/linux/sched/topology.h | 8 +++++++- kernel/sched/core.c | 12 ++++++++++++ kernel/sched/sched.h | 1 + kernel/sched/topology.c | 13 +++++++++++++ 5 files changed, 40 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/sched/sd_flags.h b/include/linux/sched/sd_flags.h index fad77b5172e2..a8b28647aafc 100644 --- a/include/linux/sched/sd_flags.h +++ b/include/linux/sched/sd_flags.h @@ -109,6 +109,13 @@ SD_FLAG(SD_ASYM_CPUCAPACITY_FULL, SDF_SHARED_PARENT | SDF_NEEDS_GROUPS) */ SD_FLAG(SD_SHARE_CPUCAPACITY, SDF_SHARED_CHILD | SDF_NEEDS_GROUPS) +/* + * Domain members share CPU cluster (LLC tags or L2 cache) + * + * NEEDS_GROUPS: Clusters are shared between groups. + */ +SD_FLAG(SD_CLUSTER, SDF_NEEDS_GROUPS) + /* * Domain members share CPU package resources (i.e. caches) * diff --git a/include/linux/sched/topology.h b/include/linux/sched/topology.h index 67b573d5bf28..4c14fe127223 100644 --- a/include/linux/sched/topology.h +++ b/include/linux/sched/topology.h @@ -45,7 +45,7 @@ static inline int cpu_smt_flags(void) #ifdef CONFIG_SCHED_CLUSTER static inline int cpu_cluster_flags(void) { - return SD_SHARE_PKG_RESOURCES; + return SD_CLUSTER | SD_SHARE_PKG_RESOURCES; } #endif @@ -179,6 +179,7 @@ cpumask_var_t *alloc_sched_domains(unsigned int ndoms); void free_sched_domains(cpumask_var_t doms[], unsigned int ndoms); bool cpus_share_cache(int this_cpu, int that_cpu); +bool cpus_share_resources(int this_cpu, int that_cpu); typedef const struct cpumask *(*sched_domain_mask_f)(int cpu); typedef int (*sched_domain_flags_f)(void); @@ -232,6 +233,11 @@ static inline bool cpus_share_cache(int this_cpu, int that_cpu) return true; } +static inline bool cpus_share_resources(int this_cpu, int that_cpu) +{ + return true; +} + #endif /* !CONFIG_SMP */ #if defined(CONFIG_ENERGY_MODEL) && defined(CONFIG_CPU_FREQ_GOV_SCHEDUTIL) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index dc724f59e495..5e1fb8a63b2e 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -3939,6 +3939,18 @@ bool cpus_share_cache(int this_cpu, int that_cpu) return per_cpu(sd_llc_id, this_cpu) == per_cpu(sd_llc_id, that_cpu); } +/* + * Whether CPUs are share cache resources, which means LLC on non-cluster + * machines and LLC tag or L2 on machines with clusters. 
+ */ +bool cpus_share_resources(int this_cpu, int that_cpu) +{ + if (this_cpu == that_cpu) + return true; + + return per_cpu(sd_share_id, this_cpu) == per_cpu(sd_share_id, that_cpu); +} + static inline bool ttwu_queue_cond(struct task_struct *p, int cpu) { /* diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 65cad0e5729e..998f03d02de0 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -1853,6 +1853,7 @@ static inline struct sched_domain *lowest_flag_domain(int cpu, int flag) DECLARE_PER_CPU(struct sched_domain __rcu *, sd_llc); DECLARE_PER_CPU(int, sd_llc_size); DECLARE_PER_CPU(int, sd_llc_id); +DECLARE_PER_CPU(int, sd_share_id); DECLARE_PER_CPU(struct sched_domain_shared __rcu *, sd_llc_shared); DECLARE_PER_CPU(struct sched_domain __rcu *, sd_numa); DECLARE_PER_CPU(struct sched_domain __rcu *, sd_asym_packing); diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c index a63729f87c21..dbb8c328e8ad 100644 --- a/kernel/sched/topology.c +++ b/kernel/sched/topology.c @@ -668,6 +668,7 @@ static void destroy_sched_domains(struct sched_domain *sd) DEFINE_PER_CPU(struct sched_domain __rcu *, sd_llc); DEFINE_PER_CPU(int, sd_llc_size); DEFINE_PER_CPU(int, sd_llc_id); +DEFINE_PER_CPU(int, sd_share_id); DEFINE_PER_CPU(struct sched_domain_shared __rcu *, sd_llc_shared); DEFINE_PER_CPU(struct sched_domain __rcu *, sd_numa); DEFINE_PER_CPU(struct sched_domain __rcu *, sd_asym_packing); @@ -693,6 +694,17 @@ static void update_top_cache_domain(int cpu) per_cpu(sd_llc_id, cpu) = id; rcu_assign_pointer(per_cpu(sd_llc_shared, cpu), sds); + sd = lowest_flag_domain(cpu, SD_CLUSTER); + if (sd) + id = cpumask_first(sched_domain_span(sd)); + + /* + * This assignment should be placed after the sd_llc_id as + * we want this id equals to cluster id on cluster machines + * but equals to LLC id on non-Cluster machines. + */ + per_cpu(sd_share_id, cpu) = id; + sd = lowest_flag_domain(cpu, SD_NUMA); rcu_assign_pointer(per_cpu(sd_numa, cpu), sd); @@ -1550,6 +1562,7 @@ static struct cpumask ***sched_domains_numa_masks; */ #define TOPOLOGY_SD_FLAGS \ (SD_SHARE_CPUCAPACITY | \ + SD_CLUSTER | \ SD_SHARE_PKG_RESOURCES | \ SD_NUMA | \ SD_ASYM_PACKING) -- cgit v1.2.3 From 984ffb6a4366752c949f7b39640aecdce222607f Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 20 Oct 2023 12:35:33 +0200 Subject: sched/fair: Remove SIS_PROP SIS_UTIL seems to work well, lets remove the old thing. 
Signed-off-by: Peter Zijlstra (Intel) Acked-by: Vincent Guittot Link: https://lkml.kernel.org/r/20231020134337.GD33965@noisy.programming.kicks-ass.net --- include/linux/sched/topology.h | 2 -- kernel/sched/core.c | 5 ----- kernel/sched/fair.c | 48 ------------------------------------------ kernel/sched/features.h | 1 - kernel/sched/sched.h | 3 --- 5 files changed, 59 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sched/topology.h b/include/linux/sched/topology.h index 4c14fe127223..de545ba85218 100644 --- a/include/linux/sched/topology.h +++ b/include/linux/sched/topology.h @@ -109,8 +109,6 @@ struct sched_domain { u64 max_newidle_lb_cost; unsigned long last_decay_max_lb_cost; - u64 avg_scan_cost; /* select_idle_sibling */ - #ifdef CONFIG_SCHEDSTATS /* load_balance() stats */ unsigned int lb_count[CPU_MAX_IDLE_TYPES]; diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 5e1fb8a63b2e..7a0c16115b79 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -3792,9 +3792,6 @@ ttwu_do_activate(struct rq *rq, struct task_struct *p, int wake_flags, if (rq->avg_idle > max) rq->avg_idle = max; - rq->wake_stamp = jiffies; - rq->wake_avg_idle = rq->avg_idle / 2; - rq->idle_stamp = 0; } #endif @@ -9953,8 +9950,6 @@ void __init sched_init(void) rq->online = 0; rq->idle_stamp = 0; rq->avg_idle = 2*sysctl_sched_migration_cost; - rq->wake_stamp = jiffies; - rq->wake_avg_idle = rq->avg_idle; rq->max_idle_balance_cost = sysctl_sched_migration_cost; INIT_LIST_HEAD(&rq->cfs_tasks); diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 523b5aee2d6a..8767988242ee 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -7209,45 +7209,9 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, bool struct cpumask *cpus = this_cpu_cpumask_var_ptr(select_rq_mask); int i, cpu, idle_cpu = -1, nr = INT_MAX; struct sched_domain_shared *sd_share; - struct rq *this_rq = this_rq(); - int this = smp_processor_id(); - struct sched_domain *this_sd = NULL; - u64 time = 0; cpumask_and(cpus, sched_domain_span(sd), p->cpus_ptr); - if (sched_feat(SIS_PROP) && !has_idle_core) { - u64 avg_cost, avg_idle, span_avg; - unsigned long now = jiffies; - - this_sd = rcu_dereference(*this_cpu_ptr(&sd_llc)); - if (!this_sd) - return -1; - - /* - * If we're busy, the assumption that the last idle period - * predicts the future is flawed; age away the remaining - * predicted idle time. - */ - if (unlikely(this_rq->wake_stamp < now)) { - while (this_rq->wake_stamp < now && this_rq->wake_avg_idle) { - this_rq->wake_stamp++; - this_rq->wake_avg_idle >>= 1; - } - } - - avg_idle = this_rq->wake_avg_idle; - avg_cost = this_sd->avg_scan_cost + 1; - - span_avg = sd->span_weight * avg_idle; - if (span_avg > 4*avg_cost) - nr = div_u64(span_avg, avg_cost); - else - nr = 4; - - time = cpu_clock(this); - } - if (sched_feat(SIS_UTIL)) { sd_share = rcu_dereference(per_cpu(sd_llc_shared, target)); if (sd_share) { @@ -7301,18 +7265,6 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, bool if (has_idle_core) set_idle_cores(target, false); - if (sched_feat(SIS_PROP) && this_sd && !has_idle_core) { - time = cpu_clock(this) - time; - - /* - * Account for the scan cost of wakeups against the average - * idle time. 
- */ - this_rq->wake_avg_idle -= min(this_rq->wake_avg_idle, time); - - update_avg(&this_sd->avg_scan_cost, time); - } - return idle_cpu; } diff --git a/kernel/sched/features.h b/kernel/sched/features.h index f770168230ae..a3ddf84de430 100644 --- a/kernel/sched/features.h +++ b/kernel/sched/features.h @@ -49,7 +49,6 @@ SCHED_FEAT(TTWU_QUEUE, true) /* * When doing wakeups, attempt to limit superfluous scans of the LLC domain. */ -SCHED_FEAT(SIS_PROP, false) SCHED_FEAT(SIS_UTIL, true) /* diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index ef4fe7bcf740..2e5a95486a42 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -1059,9 +1059,6 @@ struct rq { u64 idle_stamp; u64 avg_idle; - unsigned long wake_stamp; - u64 wake_avg_idle; - /* This is used to determine avg_idle's max value */ u64 max_idle_balance_cost; -- cgit v1.2.3