From 62c43dd9864dbd52ff158922d1d08c75f20335af Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 7 Jul 2008 14:16:50 -0400 Subject: sched_clock: record from last tick The sched_clock code tries to keep within the gtod time by one tick (jiffy). The current code mistakenly keeps track of the delta jiffies between updates of the clock, where the the delta is used to compare with the number of jiffies that have past since an update of the gtod. The gtod is updated at each schedule tick not each sched_clock update. After one jiffy passes the clock is updated fine. But the delta is taken from the last update so if the next update happens before the next tick the delta jiffies used will be incorrect. This patch changes the code to check the delta of jiffies between ticks and not updates to match the comparison of the updates with the gtod. Signed-off-by: Steven Rostedt Cc: Steven Rostedt Cc: Peter Zijlstra Cc: Andrew Morton Signed-off-by: Ingo Molnar --- kernel/sched_clock.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/kernel/sched_clock.c b/kernel/sched_clock.c index ce05271219ab..e383bc7df6dd 100644 --- a/kernel/sched_clock.c +++ b/kernel/sched_clock.c @@ -40,7 +40,7 @@ struct sched_clock_data { */ raw_spinlock_t lock; - unsigned long prev_jiffies; + unsigned long tick_jiffies; u64 prev_raw; u64 tick_raw; u64 tick_gtod; @@ -71,7 +71,7 @@ void sched_clock_init(void) struct sched_clock_data *scd = cpu_sdc(cpu); scd->lock = (raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED; - scd->prev_jiffies = now_jiffies; + scd->tick_jiffies = now_jiffies; scd->prev_raw = 0; scd->tick_raw = 0; scd->tick_gtod = ktime_now; @@ -90,7 +90,7 @@ void sched_clock_init(void) static void __update_sched_clock(struct sched_clock_data *scd, u64 now) { unsigned long now_jiffies = jiffies; - long delta_jiffies = now_jiffies - scd->prev_jiffies; + long delta_jiffies = now_jiffies - scd->tick_jiffies; u64 clock = scd->clock; u64 min_clock, max_clock; s64 delta = now - scd->prev_raw; @@ -119,7 +119,6 @@ static void __update_sched_clock(struct sched_clock_data *scd, u64 now) clock = min_clock; scd->prev_raw = now; - scd->prev_jiffies = now_jiffies; scd->clock = clock; } @@ -179,6 +178,7 @@ u64 sched_clock_cpu(int cpu) void sched_clock_tick(void) { struct sched_clock_data *scd = this_scd(); + unsigned long now_jiffies = jiffies; u64 now, now_gtod; if (unlikely(!sched_clock_running)) @@ -196,6 +196,7 @@ void sched_clock_tick(void) * already observe 1 new jiffy; adding a new tick_gtod to that would * increase the clock 2 jiffies. */ + scd->tick_jiffies = now_jiffies; scd->tick_raw = now; scd->tick_gtod = now_gtod; __raw_spin_unlock(&scd->lock); -- cgit v1.2.3 From f7cce27f5605b9e137b829a47949cb2d3c7e1cab Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 7 Jul 2008 14:16:51 -0400 Subject: sched_clock: widen the max and min time With keeping the max and min sched time within one jiffy of the gtod clock was too tight. Just before a schedule tick the max could easily be hit, as well as just after a schedule_tick the min could be hit. This caused the clock to jump around by a jiffy. This patch widens the minimum to last gtod + (delta_jiffies ? delta_jiffies - 1 : 0) * TICK_NSECS and the maximum to last gtod + (2 + delta_jiffies) * TICK_NSECS This keeps the minum to gtod or if one jiffy less than delta jiffies and the maxim 2 jiffies ahead of gtod. This may cause unstable TSCs to be a bit more sporadic, but it helps keep a clock with a stable TSC working well. Signed-off-by: Steven Rostedt Cc: Steven Rostedt Cc: Peter Zijlstra Cc: Andrew Morton Signed-off-by: Ingo Molnar --- kernel/sched_clock.c | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/kernel/sched_clock.c b/kernel/sched_clock.c index e383bc7df6dd..42b81fa38cbd 100644 --- a/kernel/sched_clock.c +++ b/kernel/sched_clock.c @@ -96,14 +96,21 @@ static void __update_sched_clock(struct sched_clock_data *scd, u64 now) s64 delta = now - scd->prev_raw; WARN_ON_ONCE(!irqs_disabled()); - min_clock = scd->tick_gtod + delta_jiffies * TICK_NSEC; + + min_clock = scd->tick_gtod + + (delta_jiffies ? delta_jiffies - 1 : 0) * TICK_NSEC; if (unlikely(delta < 0)) { clock++; goto out; } - max_clock = min_clock + TICK_NSEC; + /* + * The clock must stay within a jiffie of the gtod. + * But since we may be at the start of a jiffy or the end of one + * we add another jiffy buffer. + */ + max_clock = scd->tick_gtod + (2 + delta_jiffies) * TICK_NSEC; if (unlikely(clock + delta > max_clock)) { if (clock < max_clock) -- cgit v1.2.3 From af52a90a14cdaa54ecbfb6e6982abb13466a4b56 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 7 Jul 2008 14:16:52 -0400 Subject: sched_clock: stop maximum check on NO HZ Working with ftrace I would get large jumps of 11 millisecs or more with the clock tracer. This killed the latencing timings of ftrace and also caused the irqoff self tests to fail. What was happening is with NO_HZ the idle would stop the jiffy counter and before the jiffy counter was updated the sched_clock would have a bad delta jiffies to compare with the gtod with the maximum. The jiffies would stop and the last sched_tick would record the last gtod. On wakeup, the sched clock update would compare the gtod + delta jiffies (which would be zero) and compare it to the TSC. The TSC would have correctly (with a stable TSC) moved forward several jiffies. But because the jiffies has not been updated yet the clock would be prevented from moving forward because it would appear that the TSC jumped too far ahead. The clock would then virtually stop, until the jiffies are updated. Then the next sched clock update would see that the clock was very much behind since the delta jiffies is now correct. This would then jump the clock forward by several jiffies. This caused ftrace to report several milliseconds of interrupts off latency at every resume from NO_HZ idle. This patch adds hooks into the nohz code to disable the checking of the maximum clock update when nohz is in effect. It resumes the max check when nohz has updated the jiffies again. Signed-off-by: Steven Rostedt Cc: Steven Rostedt Cc: Peter Zijlstra Cc: Andrew Morton Signed-off-by: Ingo Molnar --- include/linux/sched.h | 17 ++++++++++++++++- kernel/sched_clock.c | 39 ++++++++++++++++++++++++++++++++++++++- kernel/time/tick-sched.c | 2 ++ 3 files changed, 56 insertions(+), 2 deletions(-) diff --git a/include/linux/sched.h b/include/linux/sched.h index c5d3f847ca8d..33a8f42041fa 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1573,13 +1573,28 @@ static inline void sched_clock_idle_sleep_event(void) static inline void sched_clock_idle_wakeup_event(u64 delta_ns) { } -#else + +#ifdef CONFIG_NO_HZ +static inline void sched_clock_tick_stop(int cpu) +{ +} + +static inline void sched_clock_tick_start(int cpu) +{ +} +#endif + +#else /* CONFIG_HAVE_UNSTABLE_SCHED_CLOCK */ extern void sched_clock_init(void); extern u64 sched_clock_cpu(int cpu); extern void sched_clock_tick(void); extern void sched_clock_idle_sleep_event(void); extern void sched_clock_idle_wakeup_event(u64 delta_ns); +#ifdef CONFIG_NO_HZ +extern void sched_clock_tick_stop(int cpu); +extern void sched_clock_tick_start(int cpu); #endif +#endif /* CONFIG_HAVE_UNSTABLE_SCHED_CLOCK */ /* * For kernel-internal use: high-speed (but slightly incorrect) per-cpu diff --git a/kernel/sched_clock.c b/kernel/sched_clock.c index 42b81fa38cbd..97159e225a77 100644 --- a/kernel/sched_clock.c +++ b/kernel/sched_clock.c @@ -45,6 +45,9 @@ struct sched_clock_data { u64 tick_raw; u64 tick_gtod; u64 clock; +#ifdef CONFIG_NO_HZ + int check_max; +#endif }; static DEFINE_PER_CPU_SHARED_ALIGNED(struct sched_clock_data, sched_clock_data); @@ -76,11 +79,45 @@ void sched_clock_init(void) scd->tick_raw = 0; scd->tick_gtod = ktime_now; scd->clock = ktime_now; +#ifdef CONFIG_NO_HZ + scd->check_max = 1; +#endif } sched_clock_running = 1; } +#ifdef CONFIG_NO_HZ +/* + * The dynamic ticks makes the delta jiffies inaccurate. This + * prevents us from checking the maximum time update. + * Disable the maximum check during stopped ticks. + */ +void sched_clock_tick_stop(int cpu) +{ + struct sched_clock_data *scd = cpu_sdc(cpu); + + scd->check_max = 0; +} + +void sched_clock_tick_start(int cpu) +{ + struct sched_clock_data *scd = cpu_sdc(cpu); + + scd->check_max = 1; +} + +static int check_max(struct sched_clock_data *scd) +{ + return scd->check_max; +} +#else +static int check_max(struct sched_clock_data *scd) +{ + return 1; +} +#endif /* CONFIG_NO_HZ */ + /* * update the percpu scd from the raw @now value * @@ -112,7 +149,7 @@ static void __update_sched_clock(struct sched_clock_data *scd, u64 now) */ max_clock = scd->tick_gtod + (2 + delta_jiffies) * TICK_NSEC; - if (unlikely(clock + delta > max_clock)) { + if (unlikely(clock + delta > max_clock) && check_max(scd)) { if (clock < max_clock) clock = max_clock; else diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index b854a895591e..d63008b09a4c 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -276,6 +276,7 @@ void tick_nohz_stop_sched_tick(void) ts->tick_stopped = 1; ts->idle_jiffies = last_jiffies; rcu_enter_nohz(); + sched_clock_tick_stop(cpu); } /* @@ -375,6 +376,7 @@ void tick_nohz_restart_sched_tick(void) select_nohz_load_balancer(0); now = ktime_get(); tick_do_update_jiffies64(now); + sched_clock_tick_start(cpu); cpu_clear(cpu, nohz_cpu_mask); /* -- cgit v1.2.3 From 2b8a0cf4890d7537a77b51caa8f508e4a05a0e67 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 7 Jul 2008 19:49:41 -0400 Subject: sched_clock: fix calculation of other CPU The algorithm to calculate the 'now' of another CPU is not correct. At each scheduler tick, each CPU records the last sched_clock and gtod (tick_raw and tick_gtod respectively). If the TSC is somewhat the same in speed between two clocks the algorithm would be: tick_gtod1 + (now1 - tick_raw1) = tick_gtod2 + (now2 - tick_raw2) To calculate now2 we would have: now2 = (tick_gtod1 - tick_gtod2) + (tick_raw2 - tick_raw1) + now1 Currently the algorithm is: now2 = (tick_gtod1 - tick_gtod2) + (tick_raw1 - tick_raw2) + now1 This solves most of the rest of the issues I've had with timestamps in ftace. Signed-off-by: Steven Rostedt Cc: Andrew Morton Cc: john stultz Cc: Peter Zijlstra Signed-off-by: Ingo Molnar --- kernel/sched_clock.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/sched_clock.c b/kernel/sched_clock.c index 97159e225a77..55fca1e9e12a 100644 --- a/kernel/sched_clock.c +++ b/kernel/sched_clock.c @@ -203,8 +203,8 @@ u64 sched_clock_cpu(int cpu) now -= my_scd->tick_raw; now += scd->tick_raw; - now -= my_scd->tick_gtod; - now += scd->tick_gtod; + now += my_scd->tick_gtod; + now -= scd->tick_gtod; __raw_spin_unlock(&my_scd->lock); } else { -- cgit v1.2.3 From c0c87734f125d2fa8ebc70310f3257fa6209f2b6 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Wed, 9 Jul 2008 00:15:31 -0400 Subject: sched_clock: only update deltas with local reads. Reading the CPU clock should try to stay accurate within the CPU. By reading the CPU clock from another CPU and updating the deltas can cause unneeded jumps when reading from the local CPU. This patch changes the code to update the last read TSC only when read from the local CPU. Signed-off-by: Steven Rostedt Cc: Steven Rostedt Cc: Peter Zijlstra Cc: Andrew Morton Cc: john stultz Signed-off-by: Ingo Molnar --- kernel/sched_clock.c | 25 ++++++++++++++++--------- 1 file changed, 16 insertions(+), 9 deletions(-) diff --git a/kernel/sched_clock.c b/kernel/sched_clock.c index 55fca1e9e12a..ee7cce5029ce 100644 --- a/kernel/sched_clock.c +++ b/kernel/sched_clock.c @@ -124,7 +124,7 @@ static int check_max(struct sched_clock_data *scd) * - filter out backward motion * - use jiffies to generate a min,max window to clip the raw values */ -static void __update_sched_clock(struct sched_clock_data *scd, u64 now) +static void __update_sched_clock(struct sched_clock_data *scd, u64 now, u64 *time) { unsigned long now_jiffies = jiffies; long delta_jiffies = now_jiffies - scd->tick_jiffies; @@ -162,8 +162,12 @@ static void __update_sched_clock(struct sched_clock_data *scd, u64 now) if (unlikely(clock < min_clock)) clock = min_clock; - scd->prev_raw = now; - scd->clock = clock; + if (time) + *time = clock; + else { + scd->prev_raw = now; + scd->clock = clock; + } } static void lock_double_clock(struct sched_clock_data *data1, @@ -207,15 +211,18 @@ u64 sched_clock_cpu(int cpu) now -= scd->tick_gtod; __raw_spin_unlock(&my_scd->lock); + + __update_sched_clock(scd, now, &clock); + + __raw_spin_unlock(&scd->lock); + } else { __raw_spin_lock(&scd->lock); + __update_sched_clock(scd, now, NULL); + clock = scd->clock; + __raw_spin_unlock(&scd->lock); } - __update_sched_clock(scd, now); - clock = scd->clock; - - __raw_spin_unlock(&scd->lock); - return clock; } @@ -234,7 +241,7 @@ void sched_clock_tick(void) now_gtod = ktime_to_ns(ktime_get()); __raw_spin_lock(&scd->lock); - __update_sched_clock(scd, now); + __update_sched_clock(scd, now, NULL); /* * update tick_gtod after __update_sched_clock() because that will * already observe 1 new jiffy; adding a new tick_gtod to that would -- cgit v1.2.3 From a83bc47c33ab182f1e48977fd5a04024d713c75e Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Wed, 9 Jul 2008 00:15:32 -0400 Subject: sched_clock: record TSC after gtod To read the gtod we need to grab the xtime lock for read. Reading the gtod before the TSC can cause a bigger gab if the xtime lock is contended. This patch simply reverses the order to read the TSC after the gtod. The locking in the reading of the gtod handles any barriers one might think is needed. Signed-off-by: Steven Rostedt Cc: Steven Rostedt Cc: Peter Zijlstra Cc: Andrew Morton Cc: john stultz Signed-off-by: Ingo Molnar --- kernel/sched_clock.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/sched_clock.c b/kernel/sched_clock.c index ee7cce5029ce..28ff6bf5e02b 100644 --- a/kernel/sched_clock.c +++ b/kernel/sched_clock.c @@ -237,8 +237,8 @@ void sched_clock_tick(void) WARN_ON_ONCE(!irqs_disabled()); - now = sched_clock(); now_gtod = ktime_to_ns(ktime_get()); + now = sched_clock(); __raw_spin_lock(&scd->lock); __update_sched_clock(scd, now, NULL); -- cgit v1.2.3 From c300ba252829e9325e08f0af60687add94445b25 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Wed, 9 Jul 2008 00:15:33 -0400 Subject: sched_clock: and multiplier for TSC to gtod drift The sched_clock code currently tries to keep all CPU clocks of all CPUS somewhat in sync. At every clock tick it records the gtod clock and uses that and jiffies and the TSC to calculate a CPU clock that tries to stay in sync with all the other CPUs. ftrace depends heavily on this timer and it detects when this timer "jumps". One problem is that the TSC and the gtod also drift. When the TSC is 0.1% faster or slower than the gtod it is very noticeable in ftrace. To help compensate for this, I've added a multiplier that tries to keep the CPU clock updating at the same rate as the gtod. I've tried various ways to get it to be in sync and this ended up being the most reliable. At every scheduler tick we calculate the new multiplier: multi = delta_gtod / delta_TSC This means we perform a 64 bit divide at the tick (once a HZ). A shift is used to handle the accuracy. Other methods that failed due to dynamic HZ are: (not used) multi += (gtod - tsc) / delta_gtod (not used) multi += (gtod - (last_tsc + delta_tsc)) / delta_gtod as well as other variants. This code still allows for a slight drift between TSC and gtod, but it keeps the damage down to a minimum. Signed-off-by: Steven Rostedt Cc: Steven Rostedt Cc: Peter Zijlstra Cc: Andrew Morton Cc: john stultz Signed-off-by: Ingo Molnar --- kernel/sched_clock.c | 40 +++++++++++++++++++++++++++++++++++++--- 1 file changed, 37 insertions(+), 3 deletions(-) diff --git a/kernel/sched_clock.c b/kernel/sched_clock.c index 28ff6bf5e02b..8affbfd0cdb0 100644 --- a/kernel/sched_clock.c +++ b/kernel/sched_clock.c @@ -3,6 +3,9 @@ * * Copyright (C) 2008 Red Hat, Inc., Peter Zijlstra * + * Updates and enhancements: + * Copyright (C) 2008 Red Hat, Inc. Steven Rostedt + * * Based on code by: * Ingo Molnar * Guillaume Chazarain @@ -32,6 +35,11 @@ #ifdef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK +#define MULTI_SHIFT 15 +/* Max is double, Min is 1/2 */ +#define MAX_MULTI (2LL << MULTI_SHIFT) +#define MIN_MULTI (1LL << (MULTI_SHIFT-1)) + struct sched_clock_data { /* * Raw spinlock - this is a special case: this might be called @@ -45,6 +53,7 @@ struct sched_clock_data { u64 tick_raw; u64 tick_gtod; u64 clock; + s64 multi; #ifdef CONFIG_NO_HZ int check_max; #endif @@ -79,6 +88,7 @@ void sched_clock_init(void) scd->tick_raw = 0; scd->tick_gtod = ktime_now; scd->clock = ktime_now; + scd->multi = 1 << MULTI_SHIFT; #ifdef CONFIG_NO_HZ scd->check_max = 1; #endif @@ -134,8 +144,13 @@ static void __update_sched_clock(struct sched_clock_data *scd, u64 now, u64 *tim WARN_ON_ONCE(!irqs_disabled()); - min_clock = scd->tick_gtod + - (delta_jiffies ? delta_jiffies - 1 : 0) * TICK_NSEC; + /* + * At schedule tick the clock can be just under the gtod. We don't + * want to push it too prematurely. + */ + min_clock = scd->tick_gtod + (delta_jiffies * TICK_NSEC); + if (min_clock > TICK_NSEC) + min_clock -= TICK_NSEC / 2; if (unlikely(delta < 0)) { clock++; @@ -149,6 +164,9 @@ static void __update_sched_clock(struct sched_clock_data *scd, u64 now, u64 *tim */ max_clock = scd->tick_gtod + (2 + delta_jiffies) * TICK_NSEC; + delta *= scd->multi; + delta >>= MULTI_SHIFT; + if (unlikely(clock + delta > max_clock) && check_max(scd)) { if (clock < max_clock) clock = max_clock; @@ -230,6 +248,7 @@ void sched_clock_tick(void) { struct sched_clock_data *scd = this_scd(); unsigned long now_jiffies = jiffies; + s64 mult, delta_gtod, delta_raw; u64 now, now_gtod; if (unlikely(!sched_clock_running)) @@ -247,9 +266,23 @@ void sched_clock_tick(void) * already observe 1 new jiffy; adding a new tick_gtod to that would * increase the clock 2 jiffies. */ - scd->tick_jiffies = now_jiffies; + delta_gtod = now_gtod - scd->tick_gtod; + delta_raw = now - scd->tick_raw; + + if ((long)delta_raw > 0) { + mult = delta_gtod << MULTI_SHIFT; + do_div(mult, delta_raw); + scd->multi = mult; + if (scd->multi > MAX_MULTI) + scd->multi = MAX_MULTI; + else if (scd->multi < MIN_MULTI) + scd->multi = MIN_MULTI; + } else + scd->multi = 1 << MULTI_SHIFT; + scd->tick_raw = now; scd->tick_gtod = now_gtod; + scd->tick_jiffies = now_jiffies; __raw_spin_unlock(&scd->lock); } @@ -279,6 +312,7 @@ void sched_clock_idle_wakeup_event(u64 delta_ns) __raw_spin_lock(&scd->lock); scd->prev_raw = now; scd->clock += delta_ns; + scd->multi = 1 << MULTI_SHIFT; __raw_spin_unlock(&scd->lock); touch_softlockup_watchdog(); -- cgit v1.2.3