summaryrefslogtreecommitdiff
path: root/kernel
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2021-11-14 20:39:03 +0300
committerLinus Torvalds <torvalds@linux-foundation.org>2021-11-14 20:39:03 +0300
commitfc661f2dcb7e41dcda9ae862efb822bb2f461646 (patch)
tree2aae84cc82d8c0e2b390d313ea36c2550327da49 /kernel
parentf7018be29253b89175d03284f8f49ac4ffed0472 (diff)
parenta8b76910e465d718effce0cad306a21fa4f3526b (diff)
downloadlinux-fc661f2dcb7e41dcda9ae862efb822bb2f461646.tar.xz
Merge tag 'sched_urgent_for_v5.16_rc1' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip
Pull scheduler fixes from Borislav Petkov: - Avoid touching ~100 config files in order to be able to select the preemption model - clear cluster CPU masks too, on the CPU unplug path - prevent use-after-free in cfs - Prevent a race condition when updating CPU cache domains - Factor out common shared part of smp_prepare_cpus() into a common helper which can be called by both baremetal and Xen, in order to fix a booting of Xen PV guests * tag 'sched_urgent_for_v5.16_rc1' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: preempt: Restore preemption model selection configs arch_topology: Fix missing clear cluster_cpumask in remove_cpu_topology() sched/fair: Prevent dead task groups from regaining cfs_rq's sched/core: Mitigate race cpus_share_cache()/update_top_cache_domain() x86/smp: Factor out parts of native_smp_prepare_cpus()
Diffstat (limited to 'kernel')
-rw-r--r--kernel/Kconfig.preempt42
-rw-r--r--kernel/sched/autogroup.c2
-rw-r--r--kernel/sched/core.c53
-rw-r--r--kernel/sched/fair.c4
-rw-r--r--kernel/sched/rt.c12
-rw-r--r--kernel/sched/sched.h3
6 files changed, 76 insertions, 40 deletions
diff --git a/kernel/Kconfig.preempt b/kernel/Kconfig.preempt
index 60f1bfc3c7b2..ce77f0265660 100644
--- a/kernel/Kconfig.preempt
+++ b/kernel/Kconfig.preempt
@@ -1,12 +1,23 @@
# SPDX-License-Identifier: GPL-2.0-only
+config PREEMPT_NONE_BUILD
+ bool
+
+config PREEMPT_VOLUNTARY_BUILD
+ bool
+
+config PREEMPT_BUILD
+ bool
+ select PREEMPTION
+ select UNINLINE_SPIN_UNLOCK if !ARCH_INLINE_SPIN_UNLOCK
+
choice
prompt "Preemption Model"
- default PREEMPT_NONE_BEHAVIOUR
+ default PREEMPT_NONE
-config PREEMPT_NONE_BEHAVIOUR
+config PREEMPT_NONE
bool "No Forced Preemption (Server)"
- select PREEMPT_NONE if !PREEMPT_DYNAMIC
+ select PREEMPT_NONE_BUILD if !PREEMPT_DYNAMIC
help
This is the traditional Linux preemption model, geared towards
throughput. It will still provide good latencies most of the
@@ -18,10 +29,10 @@ config PREEMPT_NONE_BEHAVIOUR
raw processing power of the kernel, irrespective of scheduling
latencies.
-config PREEMPT_VOLUNTARY_BEHAVIOUR
+config PREEMPT_VOLUNTARY
bool "Voluntary Kernel Preemption (Desktop)"
depends on !ARCH_NO_PREEMPT
- select PREEMPT_VOLUNTARY if !PREEMPT_DYNAMIC
+ select PREEMPT_VOLUNTARY_BUILD if !PREEMPT_DYNAMIC
help
This option reduces the latency of the kernel by adding more
"explicit preemption points" to the kernel code. These new
@@ -37,10 +48,10 @@ config PREEMPT_VOLUNTARY_BEHAVIOUR
Select this if you are building a kernel for a desktop system.
-config PREEMPT_BEHAVIOUR
+config PREEMPT
bool "Preemptible Kernel (Low-Latency Desktop)"
depends on !ARCH_NO_PREEMPT
- select PREEMPT
+ select PREEMPT_BUILD
help
This option reduces the latency of the kernel by making
all kernel code (that is not executing in a critical section)
@@ -58,7 +69,7 @@ config PREEMPT_BEHAVIOUR
config PREEMPT_RT
bool "Fully Preemptible Kernel (Real-Time)"
- depends on EXPERT && ARCH_SUPPORTS_RT && !PREEMPT_DYNAMIC
+ depends on EXPERT && ARCH_SUPPORTS_RT
select PREEMPTION
help
This option turns the kernel into a real-time kernel by replacing
@@ -75,17 +86,6 @@ config PREEMPT_RT
endchoice
-config PREEMPT_NONE
- bool
-
-config PREEMPT_VOLUNTARY
- bool
-
-config PREEMPT
- bool
- select PREEMPTION
- select UNINLINE_SPIN_UNLOCK if !ARCH_INLINE_SPIN_UNLOCK
-
config PREEMPT_COUNT
bool
@@ -95,8 +95,8 @@ config PREEMPTION
config PREEMPT_DYNAMIC
bool "Preemption behaviour defined on boot"
- depends on HAVE_PREEMPT_DYNAMIC
- select PREEMPT
+ depends on HAVE_PREEMPT_DYNAMIC && !PREEMPT_RT
+ select PREEMPT_BUILD
default y
help
This option allows to define the preemption model on the kernel
diff --git a/kernel/sched/autogroup.c b/kernel/sched/autogroup.c
index 2067080bb235..8629b37d118e 100644
--- a/kernel/sched/autogroup.c
+++ b/kernel/sched/autogroup.c
@@ -31,7 +31,7 @@ static inline void autogroup_destroy(struct kref *kref)
ag->tg->rt_se = NULL;
ag->tg->rt_rq = NULL;
#endif
- sched_offline_group(ag->tg);
+ sched_release_group(ag->tg);
sched_destroy_group(ag->tg);
}
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 523fd602ea90..3c9b0fda64ac 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -3726,6 +3726,9 @@ out:
bool cpus_share_cache(int this_cpu, int that_cpu)
{
+ if (this_cpu == that_cpu)
+ return true;
+
return per_cpu(sd_llc_id, this_cpu) == per_cpu(sd_llc_id, that_cpu);
}
@@ -6625,13 +6628,13 @@ __setup("preempt=", setup_preempt_mode);
static void __init preempt_dynamic_init(void)
{
if (preempt_dynamic_mode == preempt_dynamic_undefined) {
- if (IS_ENABLED(CONFIG_PREEMPT_NONE_BEHAVIOUR)) {
+ if (IS_ENABLED(CONFIG_PREEMPT_NONE)) {
sched_dynamic_update(preempt_dynamic_none);
- } else if (IS_ENABLED(CONFIG_PREEMPT_VOLUNTARY_BEHAVIOUR)) {
+ } else if (IS_ENABLED(CONFIG_PREEMPT_VOLUNTARY)) {
sched_dynamic_update(preempt_dynamic_voluntary);
} else {
/* Default static call setting, nothing to do */
- WARN_ON_ONCE(!IS_ENABLED(CONFIG_PREEMPT_BEHAVIOUR));
+ WARN_ON_ONCE(!IS_ENABLED(CONFIG_PREEMPT));
preempt_dynamic_mode = preempt_dynamic_full;
pr_info("Dynamic Preempt: full\n");
}
@@ -9716,6 +9719,22 @@ static void sched_free_group(struct task_group *tg)
kmem_cache_free(task_group_cache, tg);
}
+static void sched_free_group_rcu(struct rcu_head *rcu)
+{
+ sched_free_group(container_of(rcu, struct task_group, rcu));
+}
+
+static void sched_unregister_group(struct task_group *tg)
+{
+ unregister_fair_sched_group(tg);
+ unregister_rt_sched_group(tg);
+ /*
+ * We have to wait for yet another RCU grace period to expire, as
+ * print_cfs_stats() might run concurrently.
+ */
+ call_rcu(&tg->rcu, sched_free_group_rcu);
+}
+
/* allocate runqueue etc for a new task group */
struct task_group *sched_create_group(struct task_group *parent)
{
@@ -9759,25 +9778,35 @@ void sched_online_group(struct task_group *tg, struct task_group *parent)
}
/* rcu callback to free various structures associated with a task group */
-static void sched_free_group_rcu(struct rcu_head *rhp)
+static void sched_unregister_group_rcu(struct rcu_head *rhp)
{
/* Now it should be safe to free those cfs_rqs: */
- sched_free_group(container_of(rhp, struct task_group, rcu));
+ sched_unregister_group(container_of(rhp, struct task_group, rcu));
}
void sched_destroy_group(struct task_group *tg)
{
/* Wait for possible concurrent references to cfs_rqs complete: */
- call_rcu(&tg->rcu, sched_free_group_rcu);
+ call_rcu(&tg->rcu, sched_unregister_group_rcu);
}
-void sched_offline_group(struct task_group *tg)
+void sched_release_group(struct task_group *tg)
{
unsigned long flags;
- /* End participation in shares distribution: */
- unregister_fair_sched_group(tg);
-
+ /*
+ * Unlink first, to avoid walk_tg_tree_from() from finding us (via
+ * sched_cfs_period_timer()).
+ *
+ * For this to be effective, we have to wait for all pending users of
+ * this task group to leave their RCU critical section to ensure no new
+ * user will see our dying task group any more. Specifically ensure
+ * that tg_unthrottle_up() won't add decayed cfs_rq's to it.
+ *
+ * We therefore defer calling unregister_fair_sched_group() to
+ * sched_unregister_group() which is guarantied to get called only after the
+ * current RCU grace period has expired.
+ */
spin_lock_irqsave(&task_group_lock, flags);
list_del_rcu(&tg->list);
list_del_rcu(&tg->siblings);
@@ -9896,7 +9925,7 @@ static void cpu_cgroup_css_released(struct cgroup_subsys_state *css)
{
struct task_group *tg = css_tg(css);
- sched_offline_group(tg);
+ sched_release_group(tg);
}
static void cpu_cgroup_css_free(struct cgroup_subsys_state *css)
@@ -9906,7 +9935,7 @@ static void cpu_cgroup_css_free(struct cgroup_subsys_state *css)
/*
* Relies on the RCU grace period between css_released() and this.
*/
- sched_free_group(tg);
+ sched_unregister_group(tg);
}
/*
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 13950beb01a2..6e476f6d9435 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -11456,8 +11456,6 @@ void free_fair_sched_group(struct task_group *tg)
{
int i;
- destroy_cfs_bandwidth(tg_cfs_bandwidth(tg));
-
for_each_possible_cpu(i) {
if (tg->cfs_rq)
kfree(tg->cfs_rq[i]);
@@ -11534,6 +11532,8 @@ void unregister_fair_sched_group(struct task_group *tg)
struct rq *rq;
int cpu;
+ destroy_cfs_bandwidth(tg_cfs_bandwidth(tg));
+
for_each_possible_cpu(cpu) {
if (tg->se[cpu])
remove_entity_load_avg(tg->se[cpu]);
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index bb945f8faeca..b48baaba2fc2 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -137,13 +137,17 @@ static inline struct rq *rq_of_rt_se(struct sched_rt_entity *rt_se)
return rt_rq->rq;
}
-void free_rt_sched_group(struct task_group *tg)
+void unregister_rt_sched_group(struct task_group *tg)
{
- int i;
-
if (tg->rt_se)
destroy_rt_bandwidth(&tg->rt_bandwidth);
+}
+
+void free_rt_sched_group(struct task_group *tg)
+{
+ int i;
+
for_each_possible_cpu(i) {
if (tg->rt_rq)
kfree(tg->rt_rq[i]);
@@ -250,6 +254,8 @@ static inline struct rt_rq *rt_rq_of_se(struct sched_rt_entity *rt_se)
return &rq->rt;
}
+void unregister_rt_sched_group(struct task_group *tg) { }
+
void free_rt_sched_group(struct task_group *tg) { }
int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent)
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 7f1612d26c18..0e66749486e7 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -488,6 +488,7 @@ extern void __refill_cfs_bandwidth_runtime(struct cfs_bandwidth *cfs_b);
extern void start_cfs_bandwidth(struct cfs_bandwidth *cfs_b);
extern void unthrottle_cfs_rq(struct cfs_rq *cfs_rq);
+extern void unregister_rt_sched_group(struct task_group *tg);
extern void free_rt_sched_group(struct task_group *tg);
extern int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent);
extern void init_tg_rt_entry(struct task_group *tg, struct rt_rq *rt_rq,
@@ -503,7 +504,7 @@ extern struct task_group *sched_create_group(struct task_group *parent);
extern void sched_online_group(struct task_group *tg,
struct task_group *parent);
extern void sched_destroy_group(struct task_group *tg);
-extern void sched_offline_group(struct task_group *tg);
+extern void sched_release_group(struct task_group *tg);
extern void sched_move_task(struct task_struct *tsk);