From 64eea63c19a2c386a96638f4e54a1355510709e3 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Fri, 25 Oct 2019 04:03:03 +0200 Subject: sched/kcpustat: Introduce vtime-aware kcpustat accessor for CPUTIME_SYSTEM Kcpustat is not correctly supported on nohz_full CPUs. The tick doesn't fire and the cputime therefore doesn't move forward. The issue has shown up after the vanishing of the remaining 1Hz which has made the stall visible. We are solving that with checking the task running on a CPU through RCU and reading its vtime delta that we add to the raw kcpustat values. We make sure that we fetch a coherent raw-kcpustat/vtime-delta couple sequence while checking that the CPU referred by the target vtime is the correct one, under the locked vtime seqcount. Only CPUTIME_SYSTEM is handled here as a start because it's the trivial case. User and guest time will require more preparation work to correctly handle niceness. Reported-by: Yauheni Kaliuta Signed-off-by: Frederic Weisbecker Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Rik van Riel Cc: Thomas Gleixner Cc: Wanpeng Li Link: https://lkml.kernel.org/r/20191025020303.19342-1-frederic@kernel.org Signed-off-by: Ingo Molnar --- include/linux/kernel_stat.h | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'include/linux/kernel_stat.h') diff --git a/include/linux/kernel_stat.h b/include/linux/kernel_stat.h index 7ee2bb43b251..79781196eb25 100644 --- a/include/linux/kernel_stat.h +++ b/include/linux/kernel_stat.h @@ -78,6 +78,17 @@ static inline unsigned int kstat_cpu_irqs_sum(unsigned int cpu) return kstat_cpu(cpu).irqs_sum; } +#ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN +extern u64 kcpustat_field(struct kernel_cpustat *kcpustat, + enum cpu_usage_stat usage, int cpu); +#else +static inline u64 kcpustat_field(struct kernel_cpustat *kcpustat, + enum cpu_usage_stat usage, int cpu) +{ + return kcpustat->cpustat[usage]; +} +#endif + extern void account_user_time(struct task_struct *, u64); extern void account_guest_time(struct task_struct *, u64); extern void account_system_time(struct task_struct *, int, u64); -- cgit v1.2.3 From 74722bb223d0f236303b60c9509ff924a9713780 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Thu, 21 Nov 2019 03:44:26 +0100 Subject: sched/vtime: Bring up complete kcpustat accessor Many callsites want to fetch the values of system, user, user_nice, guest or guest_nice kcpustat fields altogether or at least a pair of these. In that case calling kcpustat_field() for each requested field brings unecessary overhead when we could fetch all of them in a row. So provide kcpustat_cpu_fetch() that fetches the whole kcpustat array in a vtime safe way under the same RCU and seqcount block. Signed-off-by: Frederic Weisbecker Cc: Peter Zijlstra Cc: Wanpeng Li Cc: Yauheni Kaliuta Link: https://lkml.kernel.org/r/20191121024430.19938-3-frederic@kernel.org Signed-off-by: Ingo Molnar --- include/linux/kernel_stat.h | 7 +++ kernel/sched/cputime.c | 136 +++++++++++++++++++++++++++++++++++++------- 2 files changed, 123 insertions(+), 20 deletions(-) (limited to 'include/linux/kernel_stat.h') diff --git a/include/linux/kernel_stat.h b/include/linux/kernel_stat.h index 79781196eb25..89f0745c096d 100644 --- a/include/linux/kernel_stat.h +++ b/include/linux/kernel_stat.h @@ -81,12 +81,19 @@ static inline unsigned int kstat_cpu_irqs_sum(unsigned int cpu) #ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN extern u64 kcpustat_field(struct kernel_cpustat *kcpustat, enum cpu_usage_stat usage, int cpu); +extern void kcpustat_cpu_fetch(struct kernel_cpustat *dst, int cpu); #else static inline u64 kcpustat_field(struct kernel_cpustat *kcpustat, enum cpu_usage_stat usage, int cpu) { return kcpustat->cpustat[usage]; } + +static inline void kcpustat_cpu_fetch(struct kernel_cpustat *dst, int cpu) +{ + *dst = kcpustat_cpu(cpu); +} + #endif extern void account_user_time(struct task_struct *, u64); diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c index 27b5406222fc..d43318a489f2 100644 --- a/kernel/sched/cputime.c +++ b/kernel/sched/cputime.c @@ -912,6 +912,30 @@ void task_cputime(struct task_struct *t, u64 *utime, u64 *stime) } while (read_seqcount_retry(&vtime->seqcount, seq)); } +static int vtime_state_check(struct vtime *vtime, int cpu) +{ + /* + * We raced against a context switch, fetch the + * kcpustat task again. + */ + if (vtime->cpu != cpu && vtime->cpu != -1) + return -EAGAIN; + + /* + * Two possible things here: + * 1) We are seeing the scheduling out task (prev) or any past one. + * 2) We are seeing the scheduling in task (next) but it hasn't + * passed though vtime_task_switch() yet so the pending + * cputime of the prev task may not be flushed yet. + * + * Case 1) is ok but 2) is not. So wait for a safe VTIME state. + */ + if (vtime->state == VTIME_INACTIVE) + return -EAGAIN; + + return 0; +} + static u64 kcpustat_user_vtime(struct vtime *vtime) { if (vtime->state == VTIME_USER) @@ -933,26 +957,9 @@ static int kcpustat_field_vtime(u64 *cpustat, do { seq = read_seqcount_begin(&vtime->seqcount); - /* - * We raced against context switch, fetch the - * kcpustat task again. - */ - if (vtime->cpu != cpu && vtime->cpu != -1) - return -EAGAIN; - - /* - * Two possible things here: - * 1) We are seeing the scheduling out task (prev) or any past one. - * 2) We are seeing the scheduling in task (next) but it hasn't - * passed though vtime_task_switch() yet so the pending - * cputime of the prev task may not be flushed yet. - * - * Case 1) is ok but 2) is not. So wait for a safe VTIME state. - */ - if (vtime->state == VTIME_INACTIVE) - return -EAGAIN; - - err = 0; + err = vtime_state_check(vtime, cpu); + if (err < 0) + return err; *val = cpustat[usage]; @@ -1025,4 +1032,93 @@ u64 kcpustat_field(struct kernel_cpustat *kcpustat, } } EXPORT_SYMBOL_GPL(kcpustat_field); + +static int kcpustat_cpu_fetch_vtime(struct kernel_cpustat *dst, + const struct kernel_cpustat *src, + struct task_struct *tsk, int cpu) +{ + struct vtime *vtime = &tsk->vtime; + unsigned int seq; + int err; + + do { + u64 *cpustat; + u64 delta; + + seq = read_seqcount_begin(&vtime->seqcount); + + err = vtime_state_check(vtime, cpu); + if (err < 0) + return err; + + *dst = *src; + cpustat = dst->cpustat; + + /* Task is sleeping, dead or idle, nothing to add */ + if (vtime->state < VTIME_SYS) + continue; + + delta = vtime_delta(vtime); + + /* + * Task runs either in user (including guest) or kernel space, + * add pending nohz time to the right place. + */ + if (vtime->state == VTIME_SYS) { + cpustat[CPUTIME_SYSTEM] += vtime->stime + delta; + } else if (vtime->state == VTIME_USER) { + if (task_nice(tsk) > 0) + cpustat[CPUTIME_NICE] += vtime->utime + delta; + else + cpustat[CPUTIME_USER] += vtime->utime + delta; + } else { + WARN_ON_ONCE(vtime->state != VTIME_GUEST); + if (task_nice(tsk) > 0) { + cpustat[CPUTIME_GUEST_NICE] += vtime->gtime + delta; + cpustat[CPUTIME_NICE] += vtime->gtime + delta; + } else { + cpustat[CPUTIME_GUEST] += vtime->gtime + delta; + cpustat[CPUTIME_USER] += vtime->gtime + delta; + } + } + } while (read_seqcount_retry(&vtime->seqcount, seq)); + + return err; +} + +void kcpustat_cpu_fetch(struct kernel_cpustat *dst, int cpu) +{ + const struct kernel_cpustat *src = &kcpustat_cpu(cpu); + struct rq *rq; + int err; + + if (!vtime_accounting_enabled_cpu(cpu)) { + *dst = *src; + return; + } + + rq = cpu_rq(cpu); + + for (;;) { + struct task_struct *curr; + + rcu_read_lock(); + curr = rcu_dereference(rq->curr); + if (WARN_ON_ONCE(!curr)) { + rcu_read_unlock(); + *dst = *src; + return; + } + + err = kcpustat_cpu_fetch_vtime(dst, src, curr, cpu); + rcu_read_unlock(); + + if (!err) + return; + + cpu_relax(); + } +} +EXPORT_SYMBOL_GPL(kcpustat_cpu_fetch); + #endif /* CONFIG_VIRT_CPU_ACCOUNTING_GEN */ -- cgit v1.2.3