summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorPeng Liu <iwtbavbm@gmail.com>2020-10-08 18:49:42 +0300
committerGreg Kroah-Hartman <gregkh@linuxfoundation.org>2020-12-29 15:46:51 +0300
commitf4689cad10f5c92d1a2f5589ce1fba1c84b724e2 (patch)
tree60d3d3fe4492e6224dc245adbf6348240fcdd289
parent93543a276647d0bfebfbca80f05367dc6c107b90 (diff)
downloadlinux-f4689cad10f5c92d1a2f5589ce1fba1c84b724e2.tar.xz
sched/deadline: Fix sched_dl_global_validate()
[ Upstream commit a57415f5d1e43c3a5c5d412cd85e2792d7ed9b11 ] When change sched_rt_{runtime, period}_us, we validate that the new settings should at least accommodate the currently allocated -dl bandwidth: sched_rt_handler() --> sched_dl_bandwidth_validate() { new_bw = global_rt_runtime()/global_rt_period(); for_each_possible_cpu(cpu) { dl_b = dl_bw_of(cpu); if (new_bw < dl_b->total_bw) <------- ret = -EBUSY; } } But under CONFIG_SMP, dl_bw is per root domain , but not per CPU, dl_b->total_bw is the allocated bandwidth of the whole root domain. Instead, we should compare dl_b->total_bw against "cpus*new_bw", where 'cpus' is the number of CPUs of the root domain. Also, below annotation(in kernel/sched/sched.h) implied implementation only appeared in SCHED_DEADLINE v2[1], then deadline scheduler kept evolving till got merged(v9), but the annotation remains unchanged, meaningless and misleading, update it. * With respect to SMP, the bandwidth is given on a per-CPU basis, * meaning that: * - dl_bw (< 100%) is the bandwidth of the system (group) on each CPU; * - dl_total_bw array contains, in the i-eth element, the currently * allocated bandwidth on the i-eth CPU. [1]: https://lore.kernel.org/lkml/1267385230.13676.101.camel@Palantir/ Fixes: 332ac17ef5bf ("sched/deadline: Add bandwidth management for SCHED_DEADLINE tasks") Signed-off-by: Peng Liu <iwtbavbm@gmail.com> Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org> Reviewed-by: Daniel Bristot de Oliveira <bristot@redhat.com> Acked-by: Juri Lelli <juri.lelli@redhat.com> Link: https://lkml.kernel.org/r/db6bbda316048cda7a1bbc9571defde193a8d67e.1602171061.git.iwtbavbm@gmail.com Signed-off-by: Sasha Levin <sashal@kernel.org>
-rw-r--r--kernel/sched/deadline.c5
-rw-r--r--kernel/sched/sched.h42
2 files changed, 21 insertions, 26 deletions
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index 22770168bff8..06a6bcd6cfa6 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -2345,7 +2345,7 @@ int sched_dl_global_validate(void)
u64 period = global_rt_period();
u64 new_bw = to_ratio(period, runtime);
struct dl_bw *dl_b;
- int cpu, ret = 0;
+ int cpu, cpus, ret = 0;
unsigned long flags;
/*
@@ -2360,9 +2360,10 @@ int sched_dl_global_validate(void)
for_each_possible_cpu(cpu) {
rcu_read_lock_sched();
dl_b = dl_bw_of(cpu);
+ cpus = dl_bw_cpus(cpu);
raw_spin_lock_irqsave(&dl_b->lock, flags);
- if (new_bw < dl_b->total_bw)
+ if (new_bw * cpus < dl_b->total_bw)
ret = -EBUSY;
raw_spin_unlock_irqrestore(&dl_b->lock, flags);
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 391d73a12ad7..e5cfec6bc891 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -191,30 +191,6 @@ struct rt_bandwidth {
void __dl_clear_params(struct task_struct *p);
-/*
- * To keep the bandwidth of -deadline tasks and groups under control
- * we need some place where:
- * - store the maximum -deadline bandwidth of the system (the group);
- * - cache the fraction of that bandwidth that is currently allocated.
- *
- * This is all done in the data structure below. It is similar to the
- * one used for RT-throttling (rt_bandwidth), with the main difference
- * that, since here we are only interested in admission control, we
- * do not decrease any runtime while the group "executes", neither we
- * need a timer to replenish it.
- *
- * With respect to SMP, the bandwidth is given on a per-CPU basis,
- * meaning that:
- * - dl_bw (< 100%) is the bandwidth of the system (group) on each CPU;
- * - dl_total_bw array contains, in the i-eth element, the currently
- * allocated bandwidth on the i-eth CPU.
- * Moreover, groups consume bandwidth on each CPU, while tasks only
- * consume bandwidth on the CPU they're running on.
- * Finally, dl_total_bw_cpu is used to cache the index of dl_total_bw
- * that will be shown the next time the proc or cgroup controls will
- * be red. It on its turn can be changed by writing on its own
- * control.
- */
struct dl_bandwidth {
raw_spinlock_t dl_runtime_lock;
u64 dl_runtime;
@@ -226,6 +202,24 @@ static inline int dl_bandwidth_enabled(void)
return sysctl_sched_rt_runtime >= 0;
}
+/*
+ * To keep the bandwidth of -deadline tasks under control
+ * we need some place where:
+ * - store the maximum -deadline bandwidth of each cpu;
+ * - cache the fraction of bandwidth that is currently allocated in
+ * each root domain;
+ *
+ * This is all done in the data structure below. It is similar to the
+ * one used for RT-throttling (rt_bandwidth), with the main difference
+ * that, since here we are only interested in admission control, we
+ * do not decrease any runtime while the group "executes", neither we
+ * need a timer to replenish it.
+ *
+ * With respect to SMP, bandwidth is given on a per root domain basis,
+ * meaning that:
+ * - bw (< 100%) is the deadline bandwidth of each CPU;
+ * - total_bw is the currently allocated bandwidth in each root domain;
+ */
struct dl_bw {
raw_spinlock_t lock;
u64 bw, total_bw;