diff options
author | Chengming Zhou <zhouchengming@bytedance.com> | 2022-08-25 19:41:08 +0300 |
---|---|---|
committer | Peter Zijlstra <peterz@infradead.org> | 2022-09-09 12:08:32 +0300 |
commit | 52b1364ba0b105122d6de0e719b36db705011ac1 (patch) | |
tree | a6601f432fbe2c006addc068eab9b98febf0449c /kernel/sched | |
parent | 71dbdde7914d32e86f01ac1f6e54e964c9dfdbd9 (diff) | |
download | linux-52b1364ba0b105122d6de0e719b36db705011ac1.tar.xz |
sched/psi: Add PSI_IRQ to track IRQ/SOFTIRQ pressure
Now PSI already tracked workload pressure stall information for
CPU, memory and IO. Apart from these, IRQ/SOFTIRQ could have
obvious impact on some workload productivity, such as web service
workload.
When CONFIG_IRQ_TIME_ACCOUNTING, we can get IRQ/SOFTIRQ delta time
from update_rq_clock_task(), in which we can record that delta
to CPU curr task's cgroups as PSI_IRQ_FULL status.
Note we don't use PSI_IRQ_SOME since IRQ/SOFTIRQ always happen in
the current task on the CPU, make nothing productive could run
even if it were runnable, so we only use PSI_IRQ_FULL.
Signed-off-by: Chengming Zhou <zhouchengming@bytedance.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Link: https://lore.kernel.org/r/20220825164111.29534-8-zhouchengming@bytedance.com
Diffstat (limited to 'kernel/sched')
-rw-r--r-- | kernel/sched/core.c | 1 | ||||
-rw-r--r-- | kernel/sched/psi.c | 74 | ||||
-rw-r--r-- | kernel/sched/stats.h | 2 |
3 files changed, 75 insertions, 2 deletions
diff --git a/kernel/sched/core.c b/kernel/sched/core.c index ee28253c9ac0..7d1ea9240af0 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -708,6 +708,7 @@ static void update_rq_clock_task(struct rq *rq, s64 delta) rq->prev_irq_time += irq_delta; delta -= irq_delta; + psi_account_irqtime(rq->curr, irq_delta); #endif #ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING if (static_key_false((¶virt_steal_rq_enabled))) { diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c index 4702a770e272..2545a78f82d8 100644 --- a/kernel/sched/psi.c +++ b/kernel/sched/psi.c @@ -904,6 +904,36 @@ void psi_task_switch(struct task_struct *prev, struct task_struct *next, } } +#ifdef CONFIG_IRQ_TIME_ACCOUNTING +void psi_account_irqtime(struct task_struct *task, u32 delta) +{ + int cpu = task_cpu(task); + void *iter = NULL; + struct psi_group *group; + struct psi_group_cpu *groupc; + u64 now; + + if (!task->pid) + return; + + now = cpu_clock(cpu); + + while ((group = iterate_groups(task, &iter))) { + groupc = per_cpu_ptr(group->pcpu, cpu); + + write_seqcount_begin(&groupc->seq); + + record_times(groupc, now); + groupc->times[PSI_IRQ_FULL] += delta; + + write_seqcount_end(&groupc->seq); + + if (group->poll_states & (1 << PSI_IRQ_FULL)) + psi_schedule_poll_work(group, 1); + } +} +#endif + /** * psi_memstall_enter - mark the beginning of a memory stall section * @flags: flags to handle nested sections @@ -1065,6 +1095,7 @@ void cgroup_move_task(struct task_struct *task, struct css_set *to) int psi_show(struct seq_file *m, struct psi_group *group, enum psi_res res) { + bool only_full = false; int full; u64 now; @@ -1079,7 +1110,11 @@ int psi_show(struct seq_file *m, struct psi_group *group, enum psi_res res) group->avg_next_update = update_averages(group, now); mutex_unlock(&group->avgs_lock); - for (full = 0; full < 2; full++) { +#ifdef CONFIG_IRQ_TIME_ACCOUNTING + only_full = res == PSI_IRQ; +#endif + + for (full = 0; full < 2 - only_full; full++) { unsigned long avg[3] = { 0, }; u64 total = 0; int w; @@ -1093,7 +1128,7 @@ int psi_show(struct seq_file *m, struct psi_group *group, enum psi_res res) } seq_printf(m, "%s avg10=%lu.%02lu avg60=%lu.%02lu avg300=%lu.%02lu total=%llu\n", - full ? "full" : "some", + full || only_full ? "full" : "some", LOAD_INT(avg[0]), LOAD_FRAC(avg[0]), LOAD_INT(avg[1]), LOAD_FRAC(avg[1]), LOAD_INT(avg[2]), LOAD_FRAC(avg[2]), @@ -1121,6 +1156,11 @@ struct psi_trigger *psi_trigger_create(struct psi_group *group, else return ERR_PTR(-EINVAL); +#ifdef CONFIG_IRQ_TIME_ACCOUNTING + if (res == PSI_IRQ && --state != PSI_IRQ_FULL) + return ERR_PTR(-EINVAL); +#endif + if (state >= PSI_NONIDLE) return ERR_PTR(-EINVAL); @@ -1405,6 +1445,33 @@ static const struct proc_ops psi_cpu_proc_ops = { .proc_release = psi_fop_release, }; +#ifdef CONFIG_IRQ_TIME_ACCOUNTING +static int psi_irq_show(struct seq_file *m, void *v) +{ + return psi_show(m, &psi_system, PSI_IRQ); +} + +static int psi_irq_open(struct inode *inode, struct file *file) +{ + return psi_open(file, psi_irq_show); +} + +static ssize_t psi_irq_write(struct file *file, const char __user *user_buf, + size_t nbytes, loff_t *ppos) +{ + return psi_write(file, user_buf, nbytes, PSI_IRQ); +} + +static const struct proc_ops psi_irq_proc_ops = { + .proc_open = psi_irq_open, + .proc_read = seq_read, + .proc_lseek = seq_lseek, + .proc_write = psi_irq_write, + .proc_poll = psi_fop_poll, + .proc_release = psi_fop_release, +}; +#endif + static int __init psi_proc_init(void) { if (psi_enable) { @@ -1412,6 +1479,9 @@ static int __init psi_proc_init(void) proc_create("pressure/io", 0666, NULL, &psi_io_proc_ops); proc_create("pressure/memory", 0666, NULL, &psi_memory_proc_ops); proc_create("pressure/cpu", 0666, NULL, &psi_cpu_proc_ops); +#ifdef CONFIG_IRQ_TIME_ACCOUNTING + proc_create("pressure/irq", 0666, NULL, &psi_irq_proc_ops); +#endif } return 0; } diff --git a/kernel/sched/stats.h b/kernel/sched/stats.h index c39b467ece43..84a188913cc9 100644 --- a/kernel/sched/stats.h +++ b/kernel/sched/stats.h @@ -110,6 +110,7 @@ __schedstats_from_se(struct sched_entity *se) void psi_task_change(struct task_struct *task, int clear, int set); void psi_task_switch(struct task_struct *prev, struct task_struct *next, bool sleep); +void psi_account_irqtime(struct task_struct *task, u32 delta); /* * PSI tracks state that persists across sleeps, such as iowaits and @@ -205,6 +206,7 @@ static inline void psi_ttwu_dequeue(struct task_struct *p) {} static inline void psi_sched_switch(struct task_struct *prev, struct task_struct *next, bool sleep) {} +static inline void psi_account_irqtime(struct task_struct *task, u32 delta) {} #endif /* CONFIG_PSI */ #ifdef CONFIG_SCHED_INFO |