summaryrefslogtreecommitdiff
path: root/include
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2021-06-28 22:14:19 +0300
committerLinus Torvalds <torvalds@linux-foundation.org>2021-06-28 22:14:19 +0300
commit54a728dc5e4feb0a9278ad62b19f34ad21ed0ee4 (patch)
tree2737c23d4dbc6426d6d9467626a7634cbbb40fcd /include
parent28a27cbd86076c1a6be311c751b421c4c17a7dd9 (diff)
parentadf3c31e18b765ea24eba7b0c1efc076b8ee3d55 (diff)
downloadlinux-54a728dc5e4feb0a9278ad62b19f34ad21ed0ee4.tar.xz
Merge tag 'sched-core-2021-06-28' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip
Pull scheduler udpates from Ingo Molnar: - Changes to core scheduling facilities: - Add "Core Scheduling" via CONFIG_SCHED_CORE=y, which enables coordinated scheduling across SMT siblings. This is a much requested feature for cloud computing platforms, to allow the flexible utilization of SMT siblings, without exposing untrusted domains to information leaks & side channels, plus to ensure more deterministic computing performance on SMT systems used by heterogenous workloads. There are new prctls to set core scheduling groups, which allows more flexible management of workloads that can share siblings. - Fix task->state access anti-patterns that may result in missed wakeups and rename it to ->__state in the process to catch new abuses. - Load-balancing changes: - Tweak newidle_balance for fair-sched, to improve 'memcache'-like workloads. - "Age" (decay) average idle time, to better track & improve workloads such as 'tbench'. - Fix & improve energy-aware (EAS) balancing logic & metrics. - Fix & improve the uclamp metrics. - Fix task migration (taskset) corner case on !CONFIG_CPUSET. - Fix RT and deadline utilization tracking across policy changes - Introduce a "burstable" CFS controller via cgroups, which allows bursty CPU-bound workloads to borrow a bit against their future quota to improve overall latencies & batching. Can be tweaked via /sys/fs/cgroup/cpu/<X>/cpu.cfs_burst_us. - Rework assymetric topology/capacity detection & handling. - Scheduler statistics & tooling: - Disable delayacct by default, but add a sysctl to enable it at runtime if tooling needs it. Use static keys and other optimizations to make it more palatable. - Use sched_clock() in delayacct, instead of ktime_get_ns(). - Misc cleanups and fixes. * tag 'sched-core-2021-06-28' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: (72 commits) sched/doc: Update the CPU capacity asymmetry bits sched/topology: Rework CPU capacity asymmetry detection sched/core: Introduce SD_ASYM_CPUCAPACITY_FULL sched_domain flag psi: Fix race between psi_trigger_create/destroy sched/fair: Introduce the burstable CFS controller sched/uclamp: Fix uclamp_tg_restrict() sched/rt: Fix Deadline utilization tracking during policy change sched/rt: Fix RT utilization tracking during policy change sched: Change task_struct::state sched,arch: Remove unused TASK_STATE offsets sched,timer: Use __set_current_state() sched: Add get_current_state() sched,perf,kvm: Fix preemption condition sched: Introduce task_is_running() sched: Unbreak wakeups sched/fair: Age the average idle time sched/cpufreq: Consider reduced CPU capacity in energy calculation sched/fair: Take thermal pressure into account while estimating energy thermal/cpufreq_cooling: Update offline CPUs per-cpu thermal_pressure sched/fair: Return early from update_tg_cfs_load() if delta == 0 ...
Diffstat (limited to 'include')
-rw-r--r--include/asm-generic/preempt.h2
-rw-r--r--include/linux/delayacct.h22
-rw-r--r--include/linux/energy_model.h16
-rw-r--r--include/linux/kthread.h2
-rw-r--r--include/linux/sched.h50
-rw-r--r--include/linux/sched/cpufreq.h2
-rw-r--r--include/linux/sched/debug.h2
-rw-r--r--include/linux/sched/sd_flags.h10
-rw-r--r--include/linux/sched/signal.h2
-rw-r--r--include/linux/sched/stat.h16
-rw-r--r--include/linux/sched_clock.h2
-rw-r--r--include/uapi/linux/prctl.h8
12 files changed, 91 insertions, 43 deletions
diff --git a/include/asm-generic/preempt.h b/include/asm-generic/preempt.h
index d683f5e6d791..b4d43a4af5f7 100644
--- a/include/asm-generic/preempt.h
+++ b/include/asm-generic/preempt.h
@@ -29,7 +29,7 @@ static __always_inline void preempt_count_set(int pc)
} while (0)
#define init_idle_preempt_count(p, cpu) do { \
- task_thread_info(p)->preempt_count = PREEMPT_ENABLED; \
+ task_thread_info(p)->preempt_count = PREEMPT_DISABLED; \
} while (0)
static __always_inline void set_preempt_need_resched(void)
diff --git a/include/linux/delayacct.h b/include/linux/delayacct.h
index 21651f946751..af7e6eb50283 100644
--- a/include/linux/delayacct.h
+++ b/include/linux/delayacct.h
@@ -58,16 +58,22 @@ struct task_delay_info {
#include <linux/sched.h>
#include <linux/slab.h>
+#include <linux/jump_label.h>
#ifdef CONFIG_TASK_DELAY_ACCT
+DECLARE_STATIC_KEY_FALSE(delayacct_key);
extern int delayacct_on; /* Delay accounting turned on/off */
extern struct kmem_cache *delayacct_cache;
extern void delayacct_init(void);
+
+extern int sysctl_delayacct(struct ctl_table *table, int write, void *buffer,
+ size_t *lenp, loff_t *ppos);
+
extern void __delayacct_tsk_init(struct task_struct *);
extern void __delayacct_tsk_exit(struct task_struct *);
extern void __delayacct_blkio_start(void);
extern void __delayacct_blkio_end(struct task_struct *);
-extern int __delayacct_add_tsk(struct taskstats *, struct task_struct *);
+extern int delayacct_add_tsk(struct taskstats *, struct task_struct *);
extern __u64 __delayacct_blkio_ticks(struct task_struct *);
extern void __delayacct_freepages_start(void);
extern void __delayacct_freepages_end(void);
@@ -114,6 +120,9 @@ static inline void delayacct_tsk_free(struct task_struct *tsk)
static inline void delayacct_blkio_start(void)
{
+ if (!static_branch_unlikely(&delayacct_key))
+ return;
+
delayacct_set_flag(current, DELAYACCT_PF_BLKIO);
if (current->delays)
__delayacct_blkio_start();
@@ -121,19 +130,14 @@ static inline void delayacct_blkio_start(void)
static inline void delayacct_blkio_end(struct task_struct *p)
{
+ if (!static_branch_unlikely(&delayacct_key))
+ return;
+
if (p->delays)
__delayacct_blkio_end(p);
delayacct_clear_flag(p, DELAYACCT_PF_BLKIO);
}
-static inline int delayacct_add_tsk(struct taskstats *d,
- struct task_struct *tsk)
-{
- if (!delayacct_on || !tsk->delays)
- return 0;
- return __delayacct_add_tsk(d, tsk);
-}
-
static inline __u64 delayacct_blkio_ticks(struct task_struct *tsk)
{
if (tsk->delays)
diff --git a/include/linux/energy_model.h b/include/linux/energy_model.h
index 757fc60658fa..3f221dbf5f95 100644
--- a/include/linux/energy_model.h
+++ b/include/linux/energy_model.h
@@ -91,6 +91,8 @@ void em_dev_unregister_perf_domain(struct device *dev);
* @pd : performance domain for which energy has to be estimated
* @max_util : highest utilization among CPUs of the domain
* @sum_util : sum of the utilization of all CPUs in the domain
+ * @allowed_cpu_cap : maximum allowed CPU capacity for the @pd, which
+ might reflect reduced frequency (due to thermal)
*
* This function must be used only for CPU devices. There is no validation,
* i.e. if the EM is a CPU type and has cpumask allocated. It is called from
@@ -100,7 +102,8 @@ void em_dev_unregister_perf_domain(struct device *dev);
* a capacity state satisfying the max utilization of the domain.
*/
static inline unsigned long em_cpu_energy(struct em_perf_domain *pd,
- unsigned long max_util, unsigned long sum_util)
+ unsigned long max_util, unsigned long sum_util,
+ unsigned long allowed_cpu_cap)
{
unsigned long freq, scale_cpu;
struct em_perf_state *ps;
@@ -112,11 +115,17 @@ static inline unsigned long em_cpu_energy(struct em_perf_domain *pd,
/*
* In order to predict the performance state, map the utilization of
* the most utilized CPU of the performance domain to a requested
- * frequency, like schedutil.
+ * frequency, like schedutil. Take also into account that the real
+ * frequency might be set lower (due to thermal capping). Thus, clamp
+ * max utilization to the allowed CPU capacity before calculating
+ * effective frequency.
*/
cpu = cpumask_first(to_cpumask(pd->cpus));
scale_cpu = arch_scale_cpu_capacity(cpu);
ps = &pd->table[pd->nr_perf_states - 1];
+
+ max_util = map_util_perf(max_util);
+ max_util = min(max_util, allowed_cpu_cap);
freq = map_util_freq(max_util, ps->frequency, scale_cpu);
/*
@@ -209,7 +218,8 @@ static inline struct em_perf_domain *em_pd_get(struct device *dev)
return NULL;
}
static inline unsigned long em_cpu_energy(struct em_perf_domain *pd,
- unsigned long max_util, unsigned long sum_util)
+ unsigned long max_util, unsigned long sum_util,
+ unsigned long allowed_cpu_cap)
{
return 0;
}
diff --git a/include/linux/kthread.h b/include/linux/kthread.h
index 2484ed97e72f..d9133d6db308 100644
--- a/include/linux/kthread.h
+++ b/include/linux/kthread.h
@@ -33,6 +33,8 @@ struct task_struct *kthread_create_on_cpu(int (*threadfn)(void *data),
unsigned int cpu,
const char *namefmt);
+void set_kthread_struct(struct task_struct *p);
+
void kthread_set_per_cpu(struct task_struct *k, int cpu);
bool kthread_is_per_cpu(struct task_struct *k);
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 32813c345115..31aaf9d7b3e9 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -113,11 +113,13 @@ struct task_group;
__TASK_TRACED | EXIT_DEAD | EXIT_ZOMBIE | \
TASK_PARKED)
-#define task_is_traced(task) ((task->state & __TASK_TRACED) != 0)
+#define task_is_running(task) (READ_ONCE((task)->__state) == TASK_RUNNING)
-#define task_is_stopped(task) ((task->state & __TASK_STOPPED) != 0)
+#define task_is_traced(task) ((READ_ONCE(task->__state) & __TASK_TRACED) != 0)
-#define task_is_stopped_or_traced(task) ((task->state & (__TASK_STOPPED | __TASK_TRACED)) != 0)
+#define task_is_stopped(task) ((READ_ONCE(task->__state) & __TASK_STOPPED) != 0)
+
+#define task_is_stopped_or_traced(task) ((READ_ONCE(task->__state) & (__TASK_STOPPED | __TASK_TRACED)) != 0)
#ifdef CONFIG_DEBUG_ATOMIC_SLEEP
@@ -132,14 +134,14 @@ struct task_group;
do { \
WARN_ON_ONCE(is_special_task_state(state_value));\
current->task_state_change = _THIS_IP_; \
- current->state = (state_value); \
+ WRITE_ONCE(current->__state, (state_value)); \
} while (0)
#define set_current_state(state_value) \
do { \
WARN_ON_ONCE(is_special_task_state(state_value));\
current->task_state_change = _THIS_IP_; \
- smp_store_mb(current->state, (state_value)); \
+ smp_store_mb(current->__state, (state_value)); \
} while (0)
#define set_special_state(state_value) \
@@ -148,7 +150,7 @@ struct task_group;
WARN_ON_ONCE(!is_special_task_state(state_value)); \
raw_spin_lock_irqsave(&current->pi_lock, flags); \
current->task_state_change = _THIS_IP_; \
- current->state = (state_value); \
+ WRITE_ONCE(current->__state, (state_value)); \
raw_spin_unlock_irqrestore(&current->pi_lock, flags); \
} while (0)
#else
@@ -190,10 +192,10 @@ struct task_group;
* Also see the comments of try_to_wake_up().
*/
#define __set_current_state(state_value) \
- current->state = (state_value)
+ WRITE_ONCE(current->__state, (state_value))
#define set_current_state(state_value) \
- smp_store_mb(current->state, (state_value))
+ smp_store_mb(current->__state, (state_value))
/*
* set_special_state() should be used for those states when the blocking task
@@ -205,12 +207,14 @@ struct task_group;
do { \
unsigned long flags; /* may shadow */ \
raw_spin_lock_irqsave(&current->pi_lock, flags); \
- current->state = (state_value); \
+ WRITE_ONCE(current->__state, (state_value)); \
raw_spin_unlock_irqrestore(&current->pi_lock, flags); \
} while (0)
#endif
+#define get_current_state() READ_ONCE(current->__state)
+
/* Task command name length: */
#define TASK_COMM_LEN 16
@@ -662,8 +666,7 @@ struct task_struct {
*/
struct thread_info thread_info;
#endif
- /* -1 unrunnable, 0 runnable, >0 stopped: */
- volatile long state;
+ unsigned int __state;
/*
* This begins the randomizable portion of task_struct. Only
@@ -708,10 +711,17 @@ struct task_struct {
const struct sched_class *sched_class;
struct sched_entity se;
struct sched_rt_entity rt;
+ struct sched_dl_entity dl;
+
+#ifdef CONFIG_SCHED_CORE
+ struct rb_node core_node;
+ unsigned long core_cookie;
+ unsigned int core_occupation;
+#endif
+
#ifdef CONFIG_CGROUP_SCHED
struct task_group *sched_task_group;
#endif
- struct sched_dl_entity dl;
#ifdef CONFIG_UCLAMP_TASK
/*
@@ -1520,7 +1530,7 @@ static inline pid_t task_pgrp_nr(struct task_struct *tsk)
static inline unsigned int task_state_index(struct task_struct *tsk)
{
- unsigned int tsk_state = READ_ONCE(tsk->state);
+ unsigned int tsk_state = READ_ONCE(tsk->__state);
unsigned int state = (tsk_state | tsk->exit_state) & TASK_REPORT;
BUILD_BUG_ON_NOT_POWER_OF_2(TASK_REPORT_MAX);
@@ -1828,10 +1838,10 @@ static __always_inline void scheduler_ipi(void)
*/
preempt_fold_need_resched();
}
-extern unsigned long wait_task_inactive(struct task_struct *, long match_state);
+extern unsigned long wait_task_inactive(struct task_struct *, unsigned int match_state);
#else
static inline void scheduler_ipi(void) { }
-static inline unsigned long wait_task_inactive(struct task_struct *p, long match_state)
+static inline unsigned long wait_task_inactive(struct task_struct *p, unsigned int match_state)
{
return 1;
}
@@ -2179,4 +2189,14 @@ int sched_trace_rq_nr_running(struct rq *rq);
const struct cpumask *sched_trace_rd_span(struct root_domain *rd);
+#ifdef CONFIG_SCHED_CORE
+extern void sched_core_free(struct task_struct *tsk);
+extern void sched_core_fork(struct task_struct *p);
+extern int sched_core_share_pid(unsigned int cmd, pid_t pid, enum pid_type type,
+ unsigned long uaddr);
+#else
+static inline void sched_core_free(struct task_struct *tsk) { }
+static inline void sched_core_fork(struct task_struct *p) { }
+#endif
+
#endif
diff --git a/include/linux/sched/cpufreq.h b/include/linux/sched/cpufreq.h
index 6205578ab6ee..bdd31ab93bc5 100644
--- a/include/linux/sched/cpufreq.h
+++ b/include/linux/sched/cpufreq.h
@@ -26,7 +26,7 @@ bool cpufreq_this_cpu_can_update(struct cpufreq_policy *policy);
static inline unsigned long map_util_freq(unsigned long util,
unsigned long freq, unsigned long cap)
{
- return (freq + (freq >> 2)) * util / cap;
+ return freq * util / cap;
}
static inline unsigned long map_util_perf(unsigned long util)
diff --git a/include/linux/sched/debug.h b/include/linux/sched/debug.h
index ae51f4529fc9..b5035afa2396 100644
--- a/include/linux/sched/debug.h
+++ b/include/linux/sched/debug.h
@@ -14,7 +14,7 @@ extern void dump_cpu_task(int cpu);
/*
* Only dump TASK_* tasks. (0 for all tasks)
*/
-extern void show_state_filter(unsigned long state_filter);
+extern void show_state_filter(unsigned int state_filter);
static inline void show_state(void)
{
diff --git a/include/linux/sched/sd_flags.h b/include/linux/sched/sd_flags.h
index 34b21e971d77..57bde66d95f7 100644
--- a/include/linux/sched/sd_flags.h
+++ b/include/linux/sched/sd_flags.h
@@ -91,6 +91,16 @@ SD_FLAG(SD_WAKE_AFFINE, SDF_SHARED_CHILD)
SD_FLAG(SD_ASYM_CPUCAPACITY, SDF_SHARED_PARENT | SDF_NEEDS_GROUPS)
/*
+ * Domain members have different CPU capacities spanning all unique CPU
+ * capacity values.
+ *
+ * SHARED_PARENT: Set from the topmost domain down to the first domain where
+ * all available CPU capacities are visible
+ * NEEDS_GROUPS: Per-CPU capacity is asymmetric between groups.
+ */
+SD_FLAG(SD_ASYM_CPUCAPACITY_FULL, SDF_SHARED_PARENT | SDF_NEEDS_GROUPS)
+
+/*
* Domain members share CPU capacity (i.e. SMT)
*
* SHARED_CHILD: Set from the base domain up until spanned CPUs no longer share
diff --git a/include/linux/sched/signal.h b/include/linux/sched/signal.h
index 7f4278fa21fe..c9cf678c347d 100644
--- a/include/linux/sched/signal.h
+++ b/include/linux/sched/signal.h
@@ -382,7 +382,7 @@ static inline int fatal_signal_pending(struct task_struct *p)
return task_sigpending(p) && __fatal_signal_pending(p);
}
-static inline int signal_pending_state(long state, struct task_struct *p)
+static inline int signal_pending_state(unsigned int state, struct task_struct *p)
{
if (!(state & (TASK_INTERRUPTIBLE | TASK_WAKEKILL)))
return 0;
diff --git a/include/linux/sched/stat.h b/include/linux/sched/stat.h
index 568286411b43..0108a38bb64d 100644
--- a/include/linux/sched/stat.h
+++ b/include/linux/sched/stat.h
@@ -3,6 +3,7 @@
#define _LINUX_SCHED_STAT_H
#include <linux/percpu.h>
+#include <linux/kconfig.h>
/*
* Various counters maintained by the scheduler and fork(),
@@ -16,21 +17,14 @@ extern unsigned long total_forks;
extern int nr_threads;
DECLARE_PER_CPU(unsigned long, process_counts);
extern int nr_processes(void);
-extern unsigned long nr_running(void);
+extern unsigned int nr_running(void);
extern bool single_task_running(void);
-extern unsigned long nr_iowait(void);
-extern unsigned long nr_iowait_cpu(int cpu);
+extern unsigned int nr_iowait(void);
+extern unsigned int nr_iowait_cpu(int cpu);
static inline int sched_info_on(void)
{
-#ifdef CONFIG_SCHEDSTATS
- return 1;
-#elif defined(CONFIG_TASK_DELAY_ACCT)
- extern int delayacct_on;
- return delayacct_on;
-#else
- return 0;
-#endif
+ return IS_ENABLED(CONFIG_SCHED_INFO);
}
#ifdef CONFIG_SCHEDSTATS
diff --git a/include/linux/sched_clock.h b/include/linux/sched_clock.h
index 528718e4ed52..835ee87ed792 100644
--- a/include/linux/sched_clock.h
+++ b/include/linux/sched_clock.h
@@ -14,7 +14,7 @@
* @sched_clock_mask: Bitmask for two's complement subtraction of non 64bit
* clocks.
* @read_sched_clock: Current clock source (or dummy source when suspended).
- * @mult: Multipler for scaled math conversion.
+ * @mult: Multiplier for scaled math conversion.
* @shift: Shift value for scaled math conversion.
*
* Care must be taken when updating this structure; it is read by
diff --git a/include/uapi/linux/prctl.h b/include/uapi/linux/prctl.h
index 18a9f59dc067..967d9c55323d 100644
--- a/include/uapi/linux/prctl.h
+++ b/include/uapi/linux/prctl.h
@@ -259,4 +259,12 @@ struct prctl_mm_map {
#define PR_PAC_SET_ENABLED_KEYS 60
#define PR_PAC_GET_ENABLED_KEYS 61
+/* Request the scheduler to share a core */
+#define PR_SCHED_CORE 62
+# define PR_SCHED_CORE_GET 0
+# define PR_SCHED_CORE_CREATE 1 /* create unique core_sched cookie */
+# define PR_SCHED_CORE_SHARE_TO 2 /* push core_sched cookie to pid */
+# define PR_SCHED_CORE_SHARE_FROM 3 /* pull core_sched cookie to pid */
+# define PR_SCHED_CORE_MAX 4
+
#endif /* _LINUX_PRCTL_H */