summaryrefslogtreecommitdiff
path: root/include
diff options
context:
space:
mode:
authorThomas Gleixner <tglx@kernel.org>2026-02-24 19:36:40 +0300
committerPeter Zijlstra <peterz@infradead.org>2026-02-27 18:40:08 +0300
commitcd38bdb8e696a1a1eb12fc6662a6e420977aacfd (patch)
treeaa4d6b0eedb0b48549e92eb928dfa1f6d7c1b00f /include
parent23028286128d817a414eee0c0a2c6cdc57a83e6f (diff)
downloadlinux-cd38bdb8e696a1a1eb12fc6662a6e420977aacfd.tar.xz
timekeeping: Provide infrastructure for coupled clockevents
Some architectures have clockevent devices which are coupled to the system clocksource by implementing a less than or equal comparator which compares the programmed absolute expiry time against the underlying time counter. Well known examples are TSC/TSC deadline timer and the S390 TOD clocksource/comparator. While the concept is nice it has some downsides: 1) The clockevents core code is strictly based on relative expiry times as that's the most common case for clockevent device hardware. That requires to convert the absolute expiry time provided by the caller (hrtimers, NOHZ code) to a relative expiry time by reading and substracting the current time. The clockevent::set_next_event() callback must then read the counter again to convert the relative expiry back into a absolute one. 2) The conversion factors from nanoseconds to counter clock cycles are set up when the clockevent is registered. When NTP applies corrections then the clockevent conversion factors can deviate from the clocksource conversion substantially which either results in timers firing late or in the worst case early. The early expiry then needs to do a reprogam with a short delta. In most cases this is papered over by the fact that the read in the set_next_event() callback happens after the read which is used to calculate the delta. So the tendency is that timers expire mostly late. All of this can be avoided by providing support for these devices in the core code: 1) The timekeeping core keeps track of the last update to the clocksource by storing the base nanoseconds and the corresponding clocksource counter value. That's used to keep the conversion math for reading the time within 64-bit in the common case. This information can be used to avoid both reads of the underlying clocksource in the clockevents reprogramming path: delta = expiry - base_ns; cycles = base_cycles + ((delta * clockevent::mult) >> clockevent::shift); The resulting cycles value can be directly used to program the comparator. 2) As #1 does not longer provide the "compensation" through the second read the deviation of the clocksource and clockevent conversions caused by NTP become more prominent. This can be cured by letting the timekeeping core compute and store the reverse conversion factors when the clocksource cycles to nanoseconds factors are modified by NTP: CS::MULT (1 << NS_TO_CYC_SHIFT) --------------- = ---------------------- (1 << CS:SHIFT) NS_TO_CYC_MULT Ergo: NS_TO_CYC_MULT = (1 << (CS::SHIFT + NS_TO_CYC_SHIFT)) / CS::MULT The NS_TO_CYC_SHIFT value is calculated when the clocksource is installed so that it aims for a one hour maximum sleep time. Signed-off-by: Thomas Gleixner <tglx@kernel.org> Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org> Link: https://patch.msgid.link/20260224163429.944763521@kernel.org
Diffstat (limited to 'include')
-rw-r--r--include/linux/clocksource.h1
-rw-r--r--include/linux/timekeeper_internal.h8
2 files changed, 9 insertions, 0 deletions
diff --git a/include/linux/clocksource.h b/include/linux/clocksource.h
index 54366d5c4d19..25774fc5b53d 100644
--- a/include/linux/clocksource.h
+++ b/include/linux/clocksource.h
@@ -150,6 +150,7 @@ struct clocksource {
#define CLOCK_SOURCE_RESELECT 0x100
#define CLOCK_SOURCE_VERIFY_PERCPU 0x200
#define CLOCK_SOURCE_CAN_INLINE_READ 0x400
+#define CLOCK_SOURCE_HAS_COUPLED_CLOCK_EVENT 0x800
/* simplify initialization of mask field */
#define CLOCKSOURCE_MASK(bits) GENMASK_ULL((bits) - 1, 0)
diff --git a/include/linux/timekeeper_internal.h b/include/linux/timekeeper_internal.h
index b8ae89ea28ab..e36d11e33e0c 100644
--- a/include/linux/timekeeper_internal.h
+++ b/include/linux/timekeeper_internal.h
@@ -72,6 +72,10 @@ struct tk_read_base {
* @id: The timekeeper ID
* @tkr_raw: The readout base structure for CLOCK_MONOTONIC_RAW
* @raw_sec: CLOCK_MONOTONIC_RAW time in seconds
+ * @cs_id: The ID of the current clocksource
+ * @cs_ns_to_cyc_mult: Multiplicator for nanoseconds to cycles conversion
+ * @cs_ns_to_cyc_shift: Shift value for nanoseconds to cycles conversion
+ * @cs_ns_to_cyc_maxns: Maximum nanoseconds to cyles conversion range
* @clock_was_set_seq: The sequence number of clock was set events
* @cs_was_changed_seq: The sequence number of clocksource change events
* @clock_valid: Indicator for valid clock
@@ -159,6 +163,10 @@ struct timekeeper {
u64 raw_sec;
/* Cachline 3 and 4 (timekeeping internal variables): */
+ enum clocksource_ids cs_id;
+ u32 cs_ns_to_cyc_mult;
+ u32 cs_ns_to_cyc_shift;
+ u64 cs_ns_to_cyc_maxns;
unsigned int clock_was_set_seq;
u8 cs_was_changed_seq;
u8 clock_valid;