From 425b753645767049f1a3eab02f39120c02156b72 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Mon, 13 Jan 2025 19:34:22 +0100 Subject: cpuidle: teo: Rearrange idle state lookup code Rearrange code in the idle state lookup loop in teo_select() to make it somewhat easier to follow and update comments around it. No intentional functional impact. Signed-off-by: Rafael J. Wysocki Reviewed-by: Christian Loehle Tested-by: Aboorva Devarajan Tested-by: Christian Loehle Link: https://patch.msgid.link/4619938.LvFx2qVVIh@rjwysocki.net --- drivers/cpuidle/governors/teo.c | 34 ++++++++++++++++++++-------------- 1 file changed, 20 insertions(+), 14 deletions(-) diff --git a/drivers/cpuidle/governors/teo.c b/drivers/cpuidle/governors/teo.c index 173ddcac540a..68af712f7064 100644 --- a/drivers/cpuidle/governors/teo.c +++ b/drivers/cpuidle/governors/teo.c @@ -367,7 +367,7 @@ static int teo_select(struct cpuidle_driver *drv, struct cpuidle_device *dev, * If the sum of the intercepts metric for all of the idle states * shallower than the current candidate one (idx) is greater than the * sum of the intercepts and hits metrics for the candidate state and - * all of the deeper states a shallower idle state is likely to be a + * all of the deeper states, a shallower idle state is likely to be a * better choice. */ prev_intercept_idx = idx; @@ -396,30 +396,36 @@ static int teo_select(struct cpuidle_driver *drv, struct cpuidle_device *dev, * first enabled state that is deep enough. */ if (teo_state_ok(i, drv) && - !dev->states_usage[i].disable) + !dev->states_usage[i].disable) { idx = i; - else - idx = first_suitable_idx; - + break; + } + idx = first_suitable_idx; break; } if (dev->states_usage[i].disable) continue; - if (!teo_state_ok(i, drv)) { + if (teo_state_ok(i, drv)) { /* - * The current state is too shallow, but if an - * alternative candidate state has been found, - * it may still turn out to be a better choice. + * The current state is deep enough, but still + * there may be a better one. */ - if (first_suitable_idx != idx) - continue; - - break; + first_suitable_idx = i; + continue; } - first_suitable_idx = i; + /* + * The current state is too shallow, so if no suitable + * states other than the initial candidate have been + * found, give up (the remaining states to check are + * shallower still), but otherwise the first suitable + * state other than the initial candidate may turn out + * to be preferable. + */ + if (first_suitable_idx == idx) + break; } } if (!idx && prev_intercept_idx) { -- cgit v1.2.3 From 92ce5c07b7a1913246fd5492aee52db8a0c66f55 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Mon, 13 Jan 2025 19:36:57 +0100 Subject: cpuidle: teo: Reorder candidate state index checks Since constraint_idx may be 0, the candidate state index may change to 0 after assigning constraint_idx to it, so first check if it is greater than constraint_idx (and update it if so) and then check it against 0. Signed-off-by: Rafael J. Wysocki Reviewed-by: Christian Loehle Tested-by: Aboorva Devarajan Tested-by: Christian Loehle Link: https://patch.msgid.link/1907276.tdWV9SEqCh@rjwysocki.net --- drivers/cpuidle/governors/teo.c | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/drivers/cpuidle/governors/teo.c b/drivers/cpuidle/governors/teo.c index 68af712f7064..30e444c9c40b 100644 --- a/drivers/cpuidle/governors/teo.c +++ b/drivers/cpuidle/governors/teo.c @@ -428,6 +428,14 @@ static int teo_select(struct cpuidle_driver *drv, struct cpuidle_device *dev, break; } } + + /* + * If there is a latency constraint, it may be necessary to select an + * idle state shallower than the current candidate one. + */ + if (idx > constraint_idx) + idx = constraint_idx; + if (!idx && prev_intercept_idx) { /* * We have to query the sleep length here otherwise we don't @@ -438,13 +446,6 @@ static int teo_select(struct cpuidle_driver *drv, struct cpuidle_device *dev, goto out_tick; } - /* - * If there is a latency constraint, it may be necessary to select an - * idle state shallower than the current candidate one. - */ - if (idx > constraint_idx) - idx = constraint_idx; - /* * Skip the timers check if state 0 is the current candidate one, * because an immediate non-timer wakeup is expected in that case. -- cgit v1.2.3 From ea185406d1ed90493ef0868a03ddcb6b2701b11b Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Mon, 13 Jan 2025 19:39:00 +0100 Subject: cpuidle: teo: Combine candidate state index checks against 0 There are two candidate state index checks against 0 in teo_select() that need not be separate, so combine them and update comments around them. No intentional functional impact. Signed-off-by: Rafael J. Wysocki Reviewed-by: Christian Loehle Tested-by: Aboorva Devarajan Tested-by: Christian Loehle Link: https://patch.msgid.link/13676346.uLZWGnKmhe@rjwysocki.net --- drivers/cpuidle/governors/teo.c | 23 +++++++++-------------- 1 file changed, 9 insertions(+), 14 deletions(-) diff --git a/drivers/cpuidle/governors/teo.c b/drivers/cpuidle/governors/teo.c index 30e444c9c40b..bd2fe41b4287 100644 --- a/drivers/cpuidle/governors/teo.c +++ b/drivers/cpuidle/governors/teo.c @@ -436,23 +436,18 @@ static int teo_select(struct cpuidle_driver *drv, struct cpuidle_device *dev, if (idx > constraint_idx) idx = constraint_idx; - if (!idx && prev_intercept_idx) { - /* - * We have to query the sleep length here otherwise we don't - * know after wakeup if our guess was correct. - */ - duration_ns = tick_nohz_get_sleep_length(&delta_tick); - cpu_data->sleep_length_ns = duration_ns; + if (!idx) { + if (prev_intercept_idx) { + /* + * Query the sleep length to be able to count the wakeup + * as a hit if it is caused by a timer. + */ + duration_ns = tick_nohz_get_sleep_length(&delta_tick); + cpu_data->sleep_length_ns = duration_ns; + } goto out_tick; } - /* - * Skip the timers check if state 0 is the current candidate one, - * because an immediate non-timer wakeup is expected in that case. - */ - if (!idx) - goto out_tick; - /* * If state 0 is a polling one, check if the target residency of * the current candidate state is low enough and skip the timers -- cgit v1.2.3 From b9a6af26bd83fb23d15c53b1ce63df77dda15513 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Mon, 13 Jan 2025 19:40:49 +0100 Subject: cpuidle: teo: Drop local variable prev_intercept_idx Local variable prev_intercept_idx in teo_select() is redundant because it cannot be 0 when candidate state index is 0. The prev_intercept_idx value is the index of the deepest enabled idle state, so if it is 0, state 0 is the deepest enabled idle state, in which case it must be the only enabled idle state, but then teo_select() would have returned early before initializing prev_intercept_idx. Thus prev_intercept_idx must be nonzero and the check of it against 0 always passes, so it can be dropped altogether. No intentional functional impact. Signed-off-by: Rafael J. Wysocki Reviewed-by: Christian Loehle Tested-by: Aboorva Devarajan Tested-by: Christian Loehle Link: https://patch.msgid.link/3327997.aeNJFYEL58@rjwysocki.net [ rjw: Fixed typo in the changelog ] Signed-off-by: Rafael J. Wysocki --- drivers/cpuidle/governors/teo.c | 15 +++++---------- 1 file changed, 5 insertions(+), 10 deletions(-) diff --git a/drivers/cpuidle/governors/teo.c b/drivers/cpuidle/governors/teo.c index bd2fe41b4287..95d76c8e0d12 100644 --- a/drivers/cpuidle/governors/teo.c +++ b/drivers/cpuidle/governors/teo.c @@ -292,7 +292,6 @@ static int teo_select(struct cpuidle_driver *drv, struct cpuidle_device *dev, unsigned int hit_sum = 0; int constraint_idx = 0; int idx0 = 0, idx = -1; - int prev_intercept_idx; s64 duration_ns; int i; @@ -370,7 +369,6 @@ static int teo_select(struct cpuidle_driver *drv, struct cpuidle_device *dev, * all of the deeper states, a shallower idle state is likely to be a * better choice. */ - prev_intercept_idx = idx; if (2 * idx_intercept_sum > cpu_data->total - idx_hit_sum) { int first_suitable_idx = idx; @@ -437,14 +435,11 @@ static int teo_select(struct cpuidle_driver *drv, struct cpuidle_device *dev, idx = constraint_idx; if (!idx) { - if (prev_intercept_idx) { - /* - * Query the sleep length to be able to count the wakeup - * as a hit if it is caused by a timer. - */ - duration_ns = tick_nohz_get_sleep_length(&delta_tick); - cpu_data->sleep_length_ns = duration_ns; - } + /* + * Query the sleep length to be able to count the wakeup as a + * hit if it is caused by a timer. + */ + cpu_data->sleep_length_ns = tick_nohz_get_sleep_length(&delta_tick); goto out_tick; } -- cgit v1.2.3 From e24f8a55de509ba26726f094e084d90428cbcf26 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Mon, 13 Jan 2025 19:41:55 +0100 Subject: cpuidle: teo: Clarify two code comments Rewrite two code comments suposed to explain its behavior that are too concise or not sufficiently clear. No functional impact. Signed-off-by: Rafael J. Wysocki Reviewed-by: Christian Loehle Tested-by: Aboorva Devarajan Tested-by: Christian Loehle Link: https://patch.msgid.link/8472971.T7Z3S40VBb@rjwysocki.net [ rjw: Fixed 2 typos in new comments ] Signed-off-by: Rafael J. Wysocki --- drivers/cpuidle/governors/teo.c | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/drivers/cpuidle/governors/teo.c b/drivers/cpuidle/governors/teo.c index 95d76c8e0d12..411b315081f8 100644 --- a/drivers/cpuidle/governors/teo.c +++ b/drivers/cpuidle/governors/teo.c @@ -154,9 +154,10 @@ static void teo_update(struct cpuidle_driver *drv, struct cpuidle_device *dev) if (cpu_data->time_span_ns >= cpu_data->sleep_length_ns) { /* - * One of the safety nets has triggered or the wakeup was close - * enough to the closest timer event expected at the idle state - * selection time to be discarded. + * This causes the wakeup to be counted as a hit regardless of + * the real idle duration which doesn't need to be computed + * because the wakeup has been close enough to an anticipated + * timer. */ measured_ns = U64_MAX; } else { @@ -302,8 +303,13 @@ static int teo_select(struct cpuidle_driver *drv, struct cpuidle_device *dev, cpu_data->time_span_ns = local_clock(); /* - * Set the expected sleep length to infinity in case of an early - * return. + * Set the sleep length to infinity in case the invocation of + * tick_nohz_get_sleep_length() below is skipped, in which case it won't + * be known whether or not the subsequent wakeup is caused by a timer. + * It is generally fine to count the wakeup as an intercept then, except + * for the cases when the CPU is mostly woken up by timers and there may + * be opportunities to ask for a deeper idle state when no imminent + * timers are scheduled which may be missed. */ cpu_data->sleep_length_ns = KTIME_MAX; -- cgit v1.2.3 From d619b5cc678024fa5ed7eb3702c3991a2aa96823 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Mon, 13 Jan 2025 19:45:50 +0100 Subject: cpuidle: teo: Simplify counting events used for tick management Replace the tick_hits metric with a new tick_intercepts one that can be used directly when deciding whether or not to stop the scheduler tick and update the governor functional description accordingly. No intentional functional impact. Signed-off-by: Rafael J. Wysocki Reviewed-by: Christian Loehle Tested-by: Aboorva Devarajan Tested-by: Christian Loehle Link: https://patch.msgid.link/1987985.PYKUYFuaPT@rjwysocki.net --- drivers/cpuidle/governors/teo.c | 49 ++++++++++++----------------------------- 1 file changed, 14 insertions(+), 35 deletions(-) diff --git a/drivers/cpuidle/governors/teo.c b/drivers/cpuidle/governors/teo.c index 411b315081f8..62f323f2e245 100644 --- a/drivers/cpuidle/governors/teo.c +++ b/drivers/cpuidle/governors/teo.c @@ -41,11 +41,7 @@ * idle state 2, the third bin spans from the target residency of idle state 2 * up to, but not including, the target residency of idle state 3 and so on. * The last bin spans from the target residency of the deepest idle state - * supplied by the driver to the scheduler tick period length or to infinity if - * the tick period length is less than the target residency of that state. In - * the latter case, the governor also counts events with the measured idle - * duration between the tick period length and the target residency of the - * deepest idle state. + * supplied by the driver to infinity. * * Two metrics called "hits" and "intercepts" are associated with each bin. * They are updated every time before selecting an idle state for the given CPU @@ -60,6 +56,10 @@ * into by the sleep length (these events are also referred to as "intercepts" * below). * + * The governor also counts "intercepts" with the measured idle duration below + * the tick period length and uses this information when deciding whether or not + * to stop the scheduler tick. + * * In order to select an idle state for a CPU, the governor takes the following * steps (modulo the possible latency constraint that must be taken into account * too): @@ -128,14 +128,14 @@ struct teo_bin { * @sleep_length_ns: Time till the closest timer event (at the selection time). * @state_bins: Idle state data bins for this CPU. * @total: Grand total of the "intercepts" and "hits" metrics for all bins. - * @tick_hits: Number of "hits" after TICK_NSEC. + * @tick_intercepts: "Intercepts" before TICK_NSEC. */ struct teo_cpu { s64 time_span_ns; s64 sleep_length_ns; struct teo_bin state_bins[CPUIDLE_STATE_MAX]; unsigned int total; - unsigned int tick_hits; + unsigned int tick_intercepts; }; static DEFINE_PER_CPU(struct teo_cpu, teo_cpus); @@ -207,38 +207,21 @@ static void teo_update(struct cpuidle_driver *drv, struct cpuidle_device *dev) } } - /* - * If the deepest state's target residency is below the tick length, - * make a record of it to help teo_select() decide whether or not - * to stop the tick. This effectively adds an extra hits-only bin - * beyond the last state-related one. - */ - if (target_residency_ns < TICK_NSEC) { - cpu_data->tick_hits -= cpu_data->tick_hits >> DECAY_SHIFT; - - cpu_data->total += cpu_data->tick_hits; - - if (TICK_NSEC <= cpu_data->sleep_length_ns) { - idx_timer = drv->state_count; - if (TICK_NSEC <= measured_ns) { - cpu_data->tick_hits += PULSE; - goto end; - } - } - } - + cpu_data->tick_intercepts -= cpu_data->tick_intercepts >> DECAY_SHIFT; /* * If the measured idle duration falls into the same bin as the sleep * length, this is a "hit", so update the "hits" metric for that bin. * Otherwise, update the "intercepts" metric for the bin fallen into by * the measured idle duration. */ - if (idx_timer == idx_duration) + if (idx_timer == idx_duration) { cpu_data->state_bins[idx_timer].hits += PULSE; - else + } else { cpu_data->state_bins[idx_duration].intercepts += PULSE; + if (TICK_NSEC <= measured_ns) + cpu_data->tick_intercepts += PULSE; + } -end: cpu_data->total += PULSE; } @@ -286,7 +269,6 @@ static int teo_select(struct cpuidle_driver *drv, struct cpuidle_device *dev, struct teo_cpu *cpu_data = per_cpu_ptr(&teo_cpus, dev->cpu); s64 latency_req = cpuidle_governor_latency_req(dev->cpu); ktime_t delta_tick = TICK_NSEC / 2; - unsigned int tick_intercept_sum = 0; unsigned int idx_intercept_sum = 0; unsigned int intercept_sum = 0; unsigned int idx_hit_sum = 0; @@ -365,9 +347,6 @@ static int teo_select(struct cpuidle_driver *drv, struct cpuidle_device *dev, goto end; } - tick_intercept_sum = intercept_sum + - cpu_data->state_bins[drv->state_count-1].intercepts; - /* * If the sum of the intercepts metric for all of the idle states * shallower than the current candidate one (idx) is greater than the @@ -477,7 +456,7 @@ static int teo_select(struct cpuidle_driver *drv, struct cpuidle_device *dev, * total wakeup events, do not stop the tick. */ if (drv->states[idx].target_residency_ns < TICK_NSEC && - tick_intercept_sum > cpu_data->total / 2 + cpu_data->total / 8) + cpu_data->tick_intercepts > cpu_data->total / 2 + cpu_data->total / 8) duration_ns = TICK_NSEC / 2; end: -- cgit v1.2.3 From 13ed5c4a6d9c91755227b7b0fab7b2543f6adfd2 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Mon, 13 Jan 2025 19:48:47 +0100 Subject: cpuidle: teo: Skip getting the sleep length if wakeups are very frequent Commit 6da8f9ba5a87 ("cpuidle: teo: Skip tick_nohz_get_sleep_length() call in some cases") attempted to reduce the governor overhead in some cases by making it avoid obtaining the sleep length (the time till the next timer event) which may be costly. Among other things, after the above commit, tick_nohz_get_sleep_length() was not called any more when idle state 0 was to be returned, which turned out to be problematic and the previous behavior in that respect was restored by commit 4b20b07ce72f ("cpuidle: teo: Don't count non- existent intercepts"). However, commit 6da8f9ba5a87 also caused the governor to avoid calling tick_nohz_get_sleep_length() on systems where idle state 0 is a "polling" one (that is, it is not really an idle state, but a loop continuously executed by the CPU) when the target residency of the idle state to be returned was low enough, so there was no practical need to refine the idle state selection in any way. This change was not removed by the other commit, so now on systems where idle state 0 is a "polling" one, tick_nohz_get_sleep_length() is called when idle state 0 is to be returned, but it is not called when a deeper idle state with sufficiently low target residency is to be returned. That is arguably confusing and inconsistent. Moreover, there is no specific reason why the behavior in question should depend on whether or not idle state 0 is a "polling" one. One way to address this would be to make the governor always call tick_nohz_get_sleep_length() to obtain the sleep length, but that would effectively mean reverting commit 6da8f9ba5a87 and restoring the latency issue that was the reason for doing it. This approach is thus not particularly attractive. To address it differently, notice that if a CPU is woken up very often, this is not likely to be caused by timers in the first place (user space has a default timer slack of 50 us and there are relatively few timers with a deadline shorter than several microseconds in the kernel) and even if it were the case, the potential benefit from using a deep idle state would then be questionable for latency reasons. Therefore, if the majority of CPU wakeups occur within several microseconds, it can be assumed that all wakeups in that range are non-timer and the sleep length need not be determined. Accordingly, introduce a new metric for counting wakeups with the measured idle duration below RESIDENCY_THRESHOLD_NS and modify the idle state selection to skip the tick_nohz_get_sleep_length() invocation if idle state 0 has been selected or the target residency of the candidate idle state is below RESIDENCY_THRESHOLD_NS and the value of the new metric is at least 1/2 of the total event count. Since the above requires the measured idle duration to be determined every time, except for the cases when one of the safety nets has triggered in which the wakeup is counted as a hit in the deepest idle state idle residency range, update the handling of those cases to avoid skipping the idle duration computation when the CPU wakeup is "genuine". Signed-off-by: Rafael J. Wysocki Link: https://patch.msgid.link/3851791.kQq0lBPeGt@rjwysocki.net Tested-by: Aboorva Devarajan Tested-by: Christian Loehle Reviewed-by: Christian Loehle [ rjw: Renamed a struct field ] [ rjw: Fixed typo in the subject and one in a comment ] Signed-off-by: Rafael J. Wysocki --- drivers/cpuidle/governors/teo.c | 58 +++++++++++++++++++++++++---------------- 1 file changed, 36 insertions(+), 22 deletions(-) diff --git a/drivers/cpuidle/governors/teo.c b/drivers/cpuidle/governors/teo.c index 62f323f2e245..d772a3b7ccbd 100644 --- a/drivers/cpuidle/governors/teo.c +++ b/drivers/cpuidle/governors/teo.c @@ -129,6 +129,7 @@ struct teo_bin { * @state_bins: Idle state data bins for this CPU. * @total: Grand total of the "intercepts" and "hits" metrics for all bins. * @tick_intercepts: "Intercepts" before TICK_NSEC. + * @short_idles: Wakeups after short idle periods. */ struct teo_cpu { s64 time_span_ns; @@ -136,6 +137,7 @@ struct teo_cpu { struct teo_bin state_bins[CPUIDLE_STATE_MAX]; unsigned int total; unsigned int tick_intercepts; + unsigned int short_idles; }; static DEFINE_PER_CPU(struct teo_cpu, teo_cpus); @@ -152,12 +154,12 @@ static void teo_update(struct cpuidle_driver *drv, struct cpuidle_device *dev) s64 target_residency_ns; u64 measured_ns; - if (cpu_data->time_span_ns >= cpu_data->sleep_length_ns) { + cpu_data->short_idles -= cpu_data->short_idles >> DECAY_SHIFT; + + if (cpu_data->time_span_ns < 0) { /* - * This causes the wakeup to be counted as a hit regardless of - * the real idle duration which doesn't need to be computed - * because the wakeup has been close enough to an anticipated - * timer. + * If one of the safety nets has triggered, assume that this + * might have been a long sleep. */ measured_ns = U64_MAX; } else { @@ -177,10 +179,14 @@ static void teo_update(struct cpuidle_driver *drv, struct cpuidle_device *dev) * time, so take 1/2 of the exit latency as a very rough * approximation of the average of it. */ - if (measured_ns >= lat_ns) + if (measured_ns >= lat_ns) { measured_ns -= lat_ns / 2; - else + if (measured_ns < RESIDENCY_THRESHOLD_NS) + cpu_data->short_idles += PULSE; + } else { measured_ns /= 2; + cpu_data->short_idles += PULSE; + } } cpu_data->total = 0; @@ -419,27 +425,35 @@ static int teo_select(struct cpuidle_driver *drv, struct cpuidle_device *dev, if (idx > constraint_idx) idx = constraint_idx; - if (!idx) { - /* - * Query the sleep length to be able to count the wakeup as a - * hit if it is caused by a timer. - */ - cpu_data->sleep_length_ns = tick_nohz_get_sleep_length(&delta_tick); - goto out_tick; - } - /* - * If state 0 is a polling one, check if the target residency of - * the current candidate state is low enough and skip the timers - * check in that case too. + * If either the candidate state is state 0 or its target residency is + * low enough, there is basically nothing more to do, but if the sleep + * length is not updated, the subsequent wakeup will be counted as an + * "intercept" which may be problematic in the cases when timer wakeups + * are dominant. Namely, it may effectively prevent deeper idle states + * from being selected at one point even if no imminent timers are + * scheduled. + * + * However, frequent timers in the RESIDENCY_THRESHOLD_NS range on one + * CPU are unlikely (user space has a default 50 us slack value for + * hrtimers and there are relatively few timers with a lower deadline + * value in the kernel), and even if they did happen, the potential + * benefit from using a deep idle state in that case would be + * questionable anyway for latency reasons. Thus if the measured idle + * duration falls into that range in the majority of cases, assume + * non-timer wakeups to be dominant and skip updating the sleep length + * to reduce latency. */ - if ((drv->states[0].flags & CPUIDLE_FLAG_POLLING) && - drv->states[idx].target_residency_ns < RESIDENCY_THRESHOLD_NS) + if ((!idx || drv->states[idx].target_residency_ns < RESIDENCY_THRESHOLD_NS) && + 2 * cpu_data->short_idles >= cpu_data->total) goto out_tick; duration_ns = tick_nohz_get_sleep_length(&delta_tick); cpu_data->sleep_length_ns = duration_ns; + if (!idx) + goto out_tick; + /* * If the closest expected timer is before the target residency of the * candidate state, a shallower one needs to be found. @@ -501,7 +515,7 @@ static void teo_reflect(struct cpuidle_device *dev, int state) if (dev->poll_time_limit || (tick_nohz_idle_got_tick() && cpu_data->sleep_length_ns > TICK_NSEC)) { dev->poll_time_limit = false; - cpu_data->time_span_ns = cpu_data->sleep_length_ns; + cpu_data->time_span_ns = KTIME_MIN; } else { cpu_data->time_span_ns = local_clock() - cpu_data->time_span_ns; } -- cgit v1.2.3 From ddcfa7964677b1298712edb931a98ac25ffd2fb6 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Mon, 13 Jan 2025 19:50:23 +0100 Subject: cpuidle: teo: Simplify handling of total events count Instead of computing the total events count from scratch every time, decay it and add a PULSE value to it in teo_update(). No intentional functional impact. Signed-off-by: Rafael J. Wysocki Reviewed-by: Christian Loehle Tested-by: Aboorva Devarajan Tested-by: Christian Loehle Link: https://patch.msgid.link/9388883.CDJkKcVGEf@rjwysocki.net --- drivers/cpuidle/governors/teo.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/drivers/cpuidle/governors/teo.c b/drivers/cpuidle/governors/teo.c index d772a3b7ccbd..600b54a9f1e1 100644 --- a/drivers/cpuidle/governors/teo.c +++ b/drivers/cpuidle/governors/teo.c @@ -189,8 +189,6 @@ static void teo_update(struct cpuidle_driver *drv, struct cpuidle_device *dev) } } - cpu_data->total = 0; - /* * Decay the "hits" and "intercepts" metrics for all of the bins and * find the bins that the sleep length and the measured idle duration @@ -202,8 +200,6 @@ static void teo_update(struct cpuidle_driver *drv, struct cpuidle_device *dev) bin->hits -= bin->hits >> DECAY_SHIFT; bin->intercepts -= bin->intercepts >> DECAY_SHIFT; - cpu_data->total += bin->hits + bin->intercepts; - target_residency_ns = drv->states[i].target_residency_ns; if (target_residency_ns <= cpu_data->sleep_length_ns) { @@ -228,6 +224,7 @@ static void teo_update(struct cpuidle_driver *drv, struct cpuidle_device *dev) cpu_data->tick_intercepts += PULSE; } + cpu_data->total -= cpu_data->total >> DECAY_SHIFT; cpu_data->total += PULSE; } -- cgit v1.2.3 From 65e18e6544751ac25dc284794566ee90d65a379e Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Mon, 13 Jan 2025 19:51:59 +0100 Subject: cpuidle: teo: Replace time_span_ns with a flag After recent updates, the time_span_ns field in struct teo_cpu has become an indicator on whether or not the most recent wakeup has been "genuine" which may as well be indicated by a bool field without calling local_clock(), so update the code accordingly. No intentional functional impact. Signed-off-by: Rafael J. Wysocki Reviewed-by: Christian Loehle Tested-by: Aboorva Devarajan Tested-by: Christian Loehle Link: https://patch.msgid.link/6010475.MhkbZ0Pkbq@rjwysocki.net --- drivers/cpuidle/governors/teo.c | 27 +++++++++------------------ 1 file changed, 9 insertions(+), 18 deletions(-) diff --git a/drivers/cpuidle/governors/teo.c b/drivers/cpuidle/governors/teo.c index 600b54a9f1e1..c232c95ca7fa 100644 --- a/drivers/cpuidle/governors/teo.c +++ b/drivers/cpuidle/governors/teo.c @@ -124,20 +124,20 @@ struct teo_bin { /** * struct teo_cpu - CPU data used by the TEO cpuidle governor. - * @time_span_ns: Time between idle state selection and post-wakeup update. * @sleep_length_ns: Time till the closest timer event (at the selection time). * @state_bins: Idle state data bins for this CPU. * @total: Grand total of the "intercepts" and "hits" metrics for all bins. * @tick_intercepts: "Intercepts" before TICK_NSEC. * @short_idles: Wakeups after short idle periods. + * @artificial_wakeup: Set if the wakeup has been triggered by a safety net. */ struct teo_cpu { - s64 time_span_ns; s64 sleep_length_ns; struct teo_bin state_bins[CPUIDLE_STATE_MAX]; unsigned int total; unsigned int tick_intercepts; unsigned int short_idles; + bool artificial_wakeup; }; static DEFINE_PER_CPU(struct teo_cpu, teo_cpus); @@ -156,7 +156,7 @@ static void teo_update(struct cpuidle_driver *drv, struct cpuidle_device *dev) cpu_data->short_idles -= cpu_data->short_idles >> DECAY_SHIFT; - if (cpu_data->time_span_ns < 0) { + if (cpu_data->artificial_wakeup) { /* * If one of the safety nets has triggered, assume that this * might have been a long sleep. @@ -165,13 +165,6 @@ static void teo_update(struct cpuidle_driver *drv, struct cpuidle_device *dev) } else { u64 lat_ns = drv->states[dev->last_state_idx].exit_latency_ns; - /* - * The computations below are to determine whether or not the - * (saved) time till the next timer event and the measured idle - * duration fall into the same "bin", so use last_residency_ns - * for that instead of time_span_ns which includes the cpuidle - * overhead. - */ measured_ns = dev->last_residency_ns; /* * The delay between the wakeup and the first instruction @@ -286,7 +279,6 @@ static int teo_select(struct cpuidle_driver *drv, struct cpuidle_device *dev, dev->last_state_idx = -1; } - cpu_data->time_span_ns = local_clock(); /* * Set the sleep length to infinity in case the invocation of * tick_nohz_get_sleep_length() below is skipped, in which case it won't @@ -504,17 +496,16 @@ static void teo_reflect(struct cpuidle_device *dev, int state) struct teo_cpu *cpu_data = per_cpu_ptr(&teo_cpus, dev->cpu); dev->last_state_idx = state; - /* - * If the wakeup was not "natural", but triggered by one of the safety - * nets, assume that the CPU might have been idle for the entire sleep - * length time. - */ if (dev->poll_time_limit || (tick_nohz_idle_got_tick() && cpu_data->sleep_length_ns > TICK_NSEC)) { + /* + * The wakeup was not "genuine", but triggered by one of the + * safety nets. + */ dev->poll_time_limit = false; - cpu_data->time_span_ns = KTIME_MIN; + cpu_data->artificial_wakeup = true; } else { - cpu_data->time_span_ns = local_clock() - cpu_data->time_span_ns; + cpu_data->artificial_wakeup = false; } } -- cgit v1.2.3 From 16c8d7586c196cddcc8822a946ef03c9cfabae30 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Mon, 20 Jan 2025 17:08:50 +0100 Subject: cpuidle: teo: Skip sleep length computation for low latency constraints If the idle state exit latency constraint is sufficiently low, it is better to avoid the additional latency related to calling tick_nohz_get_sleep_length(). It is also not necessary to compute the sleep length in that case because shallow idle state selection will be forced then regardless of the recent wakeup history. Accordingly, skip the sleep length computation and subsequent checks of the exit latency constraint is low enough. Signed-off-by: Rafael J. Wysocki Reviewed-by: Christian Loehle Link: https://patch.msgid.link/6122398.lOV4Wx5bFT@rjwysocki.net --- drivers/cpuidle/governors/teo.c | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/drivers/cpuidle/governors/teo.c b/drivers/cpuidle/governors/teo.c index c232c95ca7fa..8fe5e1b47ef9 100644 --- a/drivers/cpuidle/governors/teo.c +++ b/drivers/cpuidle/governors/teo.c @@ -105,6 +105,12 @@ #include "gov.h" +/* + * Idle state exit latency threshold used for deciding whether or not to check + * the time till the closest expected timer event. + */ +#define LATENCY_THRESHOLD_NS (RESIDENCY_THRESHOLD_NS / 2) + /* * The PULSE value is added to metrics when they grow and the DECAY_SHIFT value * is used for decreasing metrics on a regular basis. @@ -432,9 +438,14 @@ static int teo_select(struct cpuidle_driver *drv, struct cpuidle_device *dev, * duration falls into that range in the majority of cases, assume * non-timer wakeups to be dominant and skip updating the sleep length * to reduce latency. + * + * Also, if the latency constraint is sufficiently low, it will force + * shallow idle states regardless of the wakeup type, so the sleep + * length need not be known in that case. */ if ((!idx || drv->states[idx].target_residency_ns < RESIDENCY_THRESHOLD_NS) && - 2 * cpu_data->short_idles >= cpu_data->total) + (2 * cpu_data->short_idles >= cpu_data->total || + latency_req < LATENCY_THRESHOLD_NS)) goto out_tick; duration_ns = tick_nohz_get_sleep_length(&delta_tick); -- cgit v1.2.3