From ecfa23b486b22844855844202424bc1966cebb33 Mon Sep 17 00:00:00 2001 From: Petr Pavlu Date: Tue, 17 Feb 2026 12:26:15 +0100 Subject: jiffies: Remove unused __jiffy_arch_data The __jiffy_arch_data definition was added in 2017 by commit 60b0a8c3d248 ("frv: declare jiffies to be located in the .data section") for the needs of the frv port. The frv support was removed in 2018 by commit fd8773f9f544 ("arch: remove frv port") and no other architecture has required __jiffy_arch_data. Therefore, remove this unused definition. Signed-off-by: Petr Pavlu Signed-off-by: Thomas Gleixner Link: https://patch.msgid.link/20260217112638.1525094-1-petr.pavlu@suse.com --- include/linux/jiffies.h | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/jiffies.h b/include/linux/jiffies.h index fdef2c155c27..1a393d160420 100644 --- a/include/linux/jiffies.h +++ b/include/linux/jiffies.h @@ -67,10 +67,6 @@ extern void register_refined_jiffies(long clock_tick_rate); /* USER_TICK_USEC is the time between ticks in usec assuming fake USER_HZ */ #define USER_TICK_USEC ((1000000UL + USER_HZ/2) / USER_HZ) -#ifndef __jiffy_arch_data -#define __jiffy_arch_data -#endif - /* * The 64-bit value is not atomic on 32-bit systems - you MUST NOT read it * without sampling the sequence number in jiffies_lock. @@ -83,7 +79,7 @@ extern void register_refined_jiffies(long clock_tick_rate); * See arch/ARCH/kernel/vmlinux.lds.S */ extern u64 __cacheline_aligned_in_smp jiffies_64; -extern unsigned long volatile __cacheline_aligned_in_smp __jiffy_arch_data jiffies; +extern unsigned long volatile __cacheline_aligned_in_smp jiffies; #if (BITS_PER_LONG < 64) u64 get_jiffies_64(void); -- cgit v1.2.3 From 0a93d30861617ecf207dcc4c6c736435fac36dae Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 24 Feb 2026 17:35:42 +0100 Subject: hrtimer: Provide a static branch based hrtimer_hres_enabled() The scheduler evaluates this via hrtimer_is_hres_active() every time it has to update HRTICK. This needs to follow three pointers, which is expensive. Provide a static branch based mechanism to avoid that. 
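For illustration, a condensed sketch of the static key pattern this change applies (the key, the accessor and the workqueue handler below simply mirror the hunks further down; this is not additional code). The enable side goes through a workqueue because static_branch_enable() may sleep, while the switch to high resolution mode happens deep in interrupt context:

    #include <linux/jump_label.h>
    #include <linux/workqueue.h>

    DEFINE_STATIC_KEY_FALSE(hrtimer_highres_enabled_key);

    static inline bool hrtimer_highres_enabled(void)
    {
            /* Patched NOP/JMP at runtime, no pointer chasing */
            return static_branch_likely(&hrtimer_highres_enabled_key);
    }

    /* Flipped once, when the first CPU switches to high resolution mode */
    static void hrtimer_hres_workfn(struct work_struct *work)
    {
            static_branch_enable(&hrtimer_highres_enabled_key);
    }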
Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Link: https://patch.msgid.link/20260224163429.136503358@kernel.org --- include/linux/hrtimer.h | 13 +++++++++---- kernel/time/hrtimer.c | 28 +++++++++++++++++++++++++--- 2 files changed, 34 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/include/linux/hrtimer.h b/include/linux/hrtimer.h index 74adbd4e7003..c9ca105ba009 100644 --- a/include/linux/hrtimer.h +++ b/include/linux/hrtimer.h @@ -153,17 +153,22 @@ static inline int hrtimer_is_hres_active(struct hrtimer *timer) } #ifdef CONFIG_HIGH_RES_TIMERS +extern unsigned int hrtimer_resolution; struct clock_event_device; extern void hrtimer_interrupt(struct clock_event_device *dev); -extern unsigned int hrtimer_resolution; +extern struct static_key_false hrtimer_highres_enabled_key; -#else +static inline bool hrtimer_highres_enabled(void) +{ + return static_branch_likely(&hrtimer_highres_enabled_key); +} +#else /* CONFIG_HIGH_RES_TIMERS */ #define hrtimer_resolution (unsigned int)LOW_RES_NSEC - -#endif +static inline bool hrtimer_highres_enabled(void) { return false; } +#endif /* !CONFIG_HIGH_RES_TIMERS */ static inline ktime_t __hrtimer_expires_remaining_adjusted(const struct hrtimer *timer, ktime_t now) diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c index 3088db419aa6..67917ce696d4 100644 --- a/kernel/time/hrtimer.c +++ b/kernel/time/hrtimer.c @@ -126,6 +126,25 @@ static inline bool hrtimer_base_is_online(struct hrtimer_cpu_base *base) return likely(base->online); } +#ifdef CONFIG_HIGH_RES_TIMERS +DEFINE_STATIC_KEY_FALSE(hrtimer_highres_enabled_key); + +static void hrtimer_hres_workfn(struct work_struct *work) +{ + static_branch_enable(&hrtimer_highres_enabled_key); +} + +static DECLARE_WORK(hrtimer_hres_work, hrtimer_hres_workfn); + +static inline void hrtimer_schedule_hres_work(void) +{ + if (!hrtimer_highres_enabled()) + schedule_work(&hrtimer_hres_work); +} +#else +static inline void hrtimer_schedule_hres_work(void) { } +#endif + /* * Functions and macros which are different for UP/SMP systems are kept in a * single place @@ -649,7 +668,9 @@ static inline ktime_t hrtimer_update_base(struct hrtimer_cpu_base *base) } /* - * Is the high resolution mode active ? + * Is the high resolution mode active in the CPU base. This cannot use the + * static key as the CPUs are switched to high resolution mode + * asynchronously. 
*/ static inline int hrtimer_hres_active(struct hrtimer_cpu_base *cpu_base) { @@ -750,6 +771,7 @@ static void hrtimer_switch_to_hres(void) tick_setup_sched_timer(true); /* "Retrigger" the interrupt to get things going */ retrigger_next_event(NULL); + hrtimer_schedule_hres_work(); } #else @@ -947,11 +969,10 @@ static bool update_needs_ipi(struct hrtimer_cpu_base *cpu_base, */ void clock_was_set(unsigned int bases) { - struct hrtimer_cpu_base *cpu_base = raw_cpu_ptr(&hrtimer_bases); cpumask_var_t mask; int cpu; - if (!hrtimer_hres_active(cpu_base) && !tick_nohz_is_active()) + if (!hrtimer_highres_enabled() && !tick_nohz_is_active()) goto out_timerfd; if (!zalloc_cpumask_var(&mask, GFP_KERNEL)) { @@ -962,6 +983,7 @@ void clock_was_set(unsigned int bases) /* Avoid interrupting CPUs if possible */ cpus_read_lock(); for_each_online_cpu(cpu) { + struct hrtimer_cpu_base *cpu_base; unsigned long flags; cpu_base = &per_cpu(hrtimer_bases, cpu); -- cgit v1.2.3 From c3a92213eb3dd8ea6f664d16a08eda800e34eaad Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 24 Feb 2026 17:35:47 +0100 Subject: sched: Use hrtimer_highres_enabled() Use the static branch based variant and thereby avoid following three pointers. Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Link: https://patch.msgid.link/20260224163429.203610956@kernel.org --- include/linux/hrtimer.h | 6 ------ kernel/sched/sched.h | 37 +++++++++---------------------------- 2 files changed, 9 insertions(+), 34 deletions(-) (limited to 'include/linux') diff --git a/include/linux/hrtimer.h b/include/linux/hrtimer.h index c9ca105ba009..b5003856fd60 100644 --- a/include/linux/hrtimer.h +++ b/include/linux/hrtimer.h @@ -146,12 +146,6 @@ static inline ktime_t hrtimer_expires_remaining(const struct hrtimer *timer) return ktime_sub(timer->node.expires, hrtimer_cb_get_time(timer)); } -static inline int hrtimer_is_hres_active(struct hrtimer *timer) -{ - return IS_ENABLED(CONFIG_HIGH_RES_TIMERS) ? 
- timer->base->cpu_base->hres_active : 0; -} - #ifdef CONFIG_HIGH_RES_TIMERS extern unsigned int hrtimer_resolution; struct clock_event_device; diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 73bc20c47631..0aa089dfaaa4 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -3019,25 +3019,19 @@ extern unsigned int sysctl_numa_balancing_hot_threshold; * - enabled by features * - hrtimer is actually high res */ -static inline int hrtick_enabled(struct rq *rq) +static inline bool hrtick_enabled(struct rq *rq) { - if (!cpu_active(cpu_of(rq))) - return 0; - return hrtimer_is_hres_active(&rq->hrtick_timer); + return cpu_active(cpu_of(rq)) && hrtimer_highres_enabled(); } -static inline int hrtick_enabled_fair(struct rq *rq) +static inline bool hrtick_enabled_fair(struct rq *rq) { - if (!sched_feat(HRTICK)) - return 0; - return hrtick_enabled(rq); + return sched_feat(HRTICK) && hrtick_enabled(rq); } -static inline int hrtick_enabled_dl(struct rq *rq) +static inline bool hrtick_enabled_dl(struct rq *rq) { - if (!sched_feat(HRTICK_DL)) - return 0; - return hrtick_enabled(rq); + return sched_feat(HRTICK_DL) && hrtick_enabled(rq); } extern void hrtick_start(struct rq *rq, u64 delay); @@ -3047,22 +3041,9 @@ static inline bool hrtick_active(struct rq *rq) } #else /* !CONFIG_SCHED_HRTICK: */ - -static inline int hrtick_enabled_fair(struct rq *rq) -{ - return 0; -} - -static inline int hrtick_enabled_dl(struct rq *rq) -{ - return 0; -} - -static inline int hrtick_enabled(struct rq *rq) -{ - return 0; -} - +static inline bool hrtick_enabled_fair(struct rq *rq) { return false; } +static inline bool hrtick_enabled_dl(struct rq *rq) { return false; } +static inline bool hrtick_enabled(struct rq *rq) { return false; } #endif /* !CONFIG_SCHED_HRTICK */ #ifndef arch_scale_freq_tick -- cgit v1.2.3 From b7dd64778aa3f89de9afa1e81171cfe110ddc525 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 24 Feb 2026 17:36:01 +0100 Subject: hrtimer: Provide LAZY_REARM mode The hrtick timer is frequently rearmed before expiry and most of the time the new expiry is past the armed one. As this happens on every context switch it becomes expensive with scheduling heavy work loads especially in virtual machines as the "hardware" reprogamming implies a VM exit. Add a lazy rearm mode flag which skips the reprogamming if: 1) The timer was the first expiring timer before the rearm 2) The new expiry time is farther out than the armed time This avoids a massive amount of reprogramming operations of the hrtick timer for the price of eventually taking the alredy armed interrupt for nothing. Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Link: https://patch.msgid.link/20260224163429.408524456@kernel.org --- include/linux/hrtimer.h | 8 ++++++++ include/linux/hrtimer_types.h | 3 +++ kernel/time/hrtimer.c | 17 ++++++++++++++++- 3 files changed, 27 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/hrtimer.h b/include/linux/hrtimer.h index b5003856fd60..c924bb2498db 100644 --- a/include/linux/hrtimer.h +++ b/include/linux/hrtimer.h @@ -31,6 +31,13 @@ * soft irq context * HRTIMER_MODE_HARD - Timer callback function will be executed in * hard irq context even on PREEMPT_RT. + * HRTIMER_MODE_LAZY_REARM - Avoid reprogramming if the timer was the + * first expiring timer and is moved into the + * future. 
Special mode for the HRTICK timer to + * avoid extensive reprogramming of the hardware, + * which is expensive in virtual machines. Risks + * a pointless expiry, but that's better than + * reprogramming on every context switch, */ enum hrtimer_mode { HRTIMER_MODE_ABS = 0x00, @@ -38,6 +45,7 @@ enum hrtimer_mode { HRTIMER_MODE_PINNED = 0x02, HRTIMER_MODE_SOFT = 0x04, HRTIMER_MODE_HARD = 0x08, + HRTIMER_MODE_LAZY_REARM = 0x10, HRTIMER_MODE_ABS_PINNED = HRTIMER_MODE_ABS | HRTIMER_MODE_PINNED, HRTIMER_MODE_REL_PINNED = HRTIMER_MODE_REL | HRTIMER_MODE_PINNED, diff --git a/include/linux/hrtimer_types.h b/include/linux/hrtimer_types.h index 8fbbb6bdf7a1..64381c64cdbd 100644 --- a/include/linux/hrtimer_types.h +++ b/include/linux/hrtimer_types.h @@ -33,6 +33,8 @@ enum hrtimer_restart { * @is_soft: Set if hrtimer will be expired in soft interrupt context. * @is_hard: Set if hrtimer will be expired in hard interrupt context * even on RT. + * @is_lazy: Set if the timer is frequently rearmed to avoid updates + * of the clock event device * * The hrtimer structure must be initialized by hrtimer_setup() */ @@ -45,6 +47,7 @@ struct hrtimer { u8 is_rel; u8 is_soft; u8 is_hard; + u8 is_lazy; }; #endif /* _LINUX_HRTIMER_TYPES_H */ diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c index 67917ce696d4..e54f8b59f6b4 100644 --- a/kernel/time/hrtimer.c +++ b/kernel/time/hrtimer.c @@ -1152,7 +1152,7 @@ static void __remove_hrtimer(struct hrtimer *timer, * an superfluous call to hrtimer_force_reprogram() on the * remote cpu later on if the same timer gets enqueued again. */ - if (reprogram && timer == cpu_base->next_timer) + if (reprogram && timer == cpu_base->next_timer && !timer->is_lazy) hrtimer_force_reprogram(cpu_base, 1); } @@ -1321,6 +1321,20 @@ static int __hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim, return 0; } + /* + * Special case for the HRTICK timer. It is frequently rearmed and most + * of the time moves the expiry into the future. That's expensive in + * virtual machines and it's better to take the pointless already armed + * interrupt than reprogramming the hardware on every context switch. + * + * If the new expiry is before the armed time, then reprogramming is + * required. + */ + if (timer->is_lazy) { + if (new_base->cpu_base->expires_next <= hrtimer_get_expires(timer)) + return 0; + } + /* * Timer was forced to stay on the current CPU to avoid * reprogramming on removal and enqueue. Force reprogram the @@ -1675,6 +1689,7 @@ static void __hrtimer_setup(struct hrtimer *timer, base += hrtimer_clockid_to_base(clock_id); timer->is_soft = softtimer; timer->is_hard = !!(mode & HRTIMER_MODE_HARD); + timer->is_lazy = !!(mode & HRTIMER_MODE_LAZY_REARM); timer->base = &cpu_base->clock_base[base]; timerqueue_init(&timer->node); -- cgit v1.2.3 From 70802807398c65f5a49b2baec87e1f6c8db43de6 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 24 Feb 2026 17:36:15 +0100 Subject: clockevents: Remove redundant CLOCK_EVT_FEAT_KTIME The only real usecase for this is the hrtimer based broadcast device. No point in using two different feature flags for this. 
Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Link: https://patch.msgid.link/20260224163429.609049777@kernel.org --- include/linux/clockchips.h | 1 - kernel/time/clockevents.c | 4 ++-- kernel/time/tick-broadcast-hrtimer.c | 1 - 3 files changed, 2 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/clockchips.h b/include/linux/clockchips.h index b0df28ddd394..5e8f7819f6a6 100644 --- a/include/linux/clockchips.h +++ b/include/linux/clockchips.h @@ -45,7 +45,6 @@ enum clock_event_state { */ # define CLOCK_EVT_FEAT_PERIODIC 0x000001 # define CLOCK_EVT_FEAT_ONESHOT 0x000002 -# define CLOCK_EVT_FEAT_KTIME 0x000004 /* * x86(64) specific (mis)features: diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c index eaae1ce9f060..5abaeef08e6a 100644 --- a/kernel/time/clockevents.c +++ b/kernel/time/clockevents.c @@ -319,8 +319,8 @@ int clockevents_program_event(struct clock_event_device *dev, ktime_t expires, WARN_ONCE(!clockevent_state_oneshot(dev), "Current state: %d\n", clockevent_get_state(dev)); - /* Shortcut for clockevent devices that can deal with ktime. */ - if (dev->features & CLOCK_EVT_FEAT_KTIME) + /* ktime_t based reprogramming for the broadcast hrtimer device */ + if (unlikely(dev->features & CLOCK_EVT_FEAT_HRTIMER)) return dev->set_next_ktime(expires, dev); delta = ktime_to_ns(ktime_sub(expires, ktime_get())); diff --git a/kernel/time/tick-broadcast-hrtimer.c b/kernel/time/tick-broadcast-hrtimer.c index a88b72b0f35e..51f6a1032c83 100644 --- a/kernel/time/tick-broadcast-hrtimer.c +++ b/kernel/time/tick-broadcast-hrtimer.c @@ -78,7 +78,6 @@ static struct clock_event_device ce_broadcast_hrtimer = { .set_state_shutdown = bc_shutdown, .set_next_ktime = bc_set_next, .features = CLOCK_EVT_FEAT_ONESHOT | - CLOCK_EVT_FEAT_KTIME | CLOCK_EVT_FEAT_HRTIMER, .rating = 0, .bound_on = -1, -- cgit v1.2.3 From 2e27beeb66e43f3b84aef5a07e486a5d50695c06 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 24 Feb 2026 17:36:20 +0100 Subject: timekeeping: Allow inlining clocksource::read() On some architectures clocksource::read() boils down to a single instruction, so the indirect function call is just a massive overhead especially with speculative execution mitigations in effect. Allow architectures to enable conditional inlining of that read to avoid that by: - providing a static branch to switch to the inlined variant - disabling the branch before clocksource changes - enabling the branch after a clocksource change, when the clocksource indicates in a feature flag that it is the one which provides the inlined variant This is intentionally not a static call as that would only remove the indirect call, but not the rest of the overhead. 
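As a minimal sketch of the architecture side (the Kconfig symbol, the clocksource flag and the arch_inlined_clocksource_read() hook are the ones introduced below; the choice of x86/TSC and the header placement are illustrative assumptions, not part of this patch): the architecture selects ARCH_WANTS_CLOCKSOURCE_READ_INLINE, provides the inline read in its clocksource header and sets CLOCK_SOURCE_CAN_INLINE_READ on the clocksource that backs it:

    /* Illustrative only, e.g. in the x86 asm clocksource header */
    static __always_inline u64 arch_inlined_clocksource_read(struct clocksource *cs)
    {
            /*
             * @cs is known to be the flagged clocksource (here: TSC),
             * because the core only enables the static branch after a
             * clocksource with CLOCK_SOURCE_CAN_INLINE_READ has been
             * installed.
             */
            return rdtsc_ordered();
    }

Architectures which do not opt in keep the indirect clock->read(clock) path unchanged.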
Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Link: https://patch.msgid.link/20260224163429.675151545@kernel.org --- include/linux/clocksource.h | 2 ++ kernel/time/Kconfig | 3 ++ kernel/time/timekeeping.c | 74 +++++++++++++++++++++++++++++++++------------ 3 files changed, 60 insertions(+), 19 deletions(-) (limited to 'include/linux') diff --git a/include/linux/clocksource.h b/include/linux/clocksource.h index 65b7c41471c3..54366d5c4d19 100644 --- a/include/linux/clocksource.h +++ b/include/linux/clocksource.h @@ -149,6 +149,8 @@ struct clocksource { #define CLOCK_SOURCE_SUSPEND_NONSTOP 0x80 #define CLOCK_SOURCE_RESELECT 0x100 #define CLOCK_SOURCE_VERIFY_PERCPU 0x200 +#define CLOCK_SOURCE_CAN_INLINE_READ 0x400 + /* simplify initialization of mask field */ #define CLOCKSOURCE_MASK(bits) GENMASK_ULL((bits) - 1, 0) diff --git a/kernel/time/Kconfig b/kernel/time/Kconfig index 7c6a52f7836c..07b048ba0cca 100644 --- a/kernel/time/Kconfig +++ b/kernel/time/Kconfig @@ -17,6 +17,9 @@ config ARCH_CLOCKSOURCE_DATA config ARCH_CLOCKSOURCE_INIT bool +config ARCH_WANTS_CLOCKSOURCE_READ_INLINE + bool + # Timekeeping vsyscall support config GENERIC_TIME_VSYSCALL bool diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 91fa2003351c..63aa31f02ebc 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -3,34 +3,30 @@ * Kernel timekeeping code and accessor functions. Based on code from * timer.c, moved in commit 8524070b7982. */ -#include -#include -#include +#include +#include +#include +#include #include -#include -#include -#include +#include #include -#include -#include +#include +#include #include +#include +#include +#include #include -#include -#include +#include #include #include -#include -#include -#include -#include -#include -#include +#include #include #include "tick-internal.h" -#include "ntp_internal.h" #include "timekeeping_internal.h" +#include "ntp_internal.h" #define TK_CLEAR_NTP (1 << 0) #define TK_CLOCK_WAS_SET (1 << 1) @@ -275,6 +271,11 @@ static inline void tk_update_sleep_time(struct timekeeper *tk, ktime_t delta) tk->monotonic_to_boot = ktime_to_timespec64(tk->offs_boot); } +#ifdef CONFIG_ARCH_WANTS_CLOCKSOURCE_READ_INLINE +#include + +static DEFINE_STATIC_KEY_FALSE(clocksource_read_inlined); + /* * tk_clock_read - atomic clocksource read() helper * @@ -288,13 +289,36 @@ static inline void tk_update_sleep_time(struct timekeeper *tk, ktime_t delta) * a read of the fast-timekeeper tkrs (which is protected by its own locking * and update logic). */ -static inline u64 tk_clock_read(const struct tk_read_base *tkr) +static __always_inline u64 tk_clock_read(const struct tk_read_base *tkr) { struct clocksource *clock = READ_ONCE(tkr->clock); + if (static_branch_likely(&clocksource_read_inlined)) + return arch_inlined_clocksource_read(clock); + return clock->read(clock); } +static inline void clocksource_disable_inline_read(void) +{ + static_branch_disable(&clocksource_read_inlined); +} + +static inline void clocksource_enable_inline_read(void) +{ + static_branch_enable(&clocksource_read_inlined); +} +#else +static __always_inline u64 tk_clock_read(const struct tk_read_base *tkr) +{ + struct clocksource *clock = READ_ONCE(tkr->clock); + + return clock->read(clock); +} +static inline void clocksource_disable_inline_read(void) { } +static inline void clocksource_enable_inline_read(void) { } +#endif + /** * tk_setup_internals - Set up internals to use clocksource clock. 
* @@ -375,7 +399,7 @@ static noinline u64 delta_to_ns_safe(const struct tk_read_base *tkr, u64 delta) return mul_u64_u32_add_u64_shr(delta, tkr->mult, tkr->xtime_nsec, tkr->shift); } -static inline u64 timekeeping_cycles_to_ns(const struct tk_read_base *tkr, u64 cycles) +static __always_inline u64 timekeeping_cycles_to_ns(const struct tk_read_base *tkr, u64 cycles) { /* Calculate the delta since the last update_wall_time() */ u64 mask = tkr->mask, delta = (cycles - tkr->cycle_last) & mask; @@ -1631,7 +1655,19 @@ int timekeeping_notify(struct clocksource *clock) if (tk->tkr_mono.clock == clock) return 0; + + /* Disable inlined reads accross the clocksource switch */ + clocksource_disable_inline_read(); + stop_machine(change_clocksource, clock, NULL); + + /* + * If the clocksource has been selected and supports inlined reads + * enable the branch. + */ + if (tk->tkr_mono.clock == clock && clock->flags & CLOCK_SOURCE_CAN_INLINE_READ) + clocksource_enable_inline_read(); + tick_clock_notify(); return tk->tkr_mono.clock == clock ? 0 : -1; } -- cgit v1.2.3 From cd38bdb8e696a1a1eb12fc6662a6e420977aacfd Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 24 Feb 2026 17:36:40 +0100 Subject: timekeeping: Provide infrastructure for coupled clockevents Some architectures have clockevent devices which are coupled to the system clocksource by implementing a less than or equal comparator which compares the programmed absolute expiry time against the underlying time counter. Well known examples are TSC/TSC deadline timer and the S390 TOD clocksource/comparator. While the concept is nice it has some downsides: 1) The clockevents core code is strictly based on relative expiry times as that's the most common case for clockevent device hardware. That requires to convert the absolute expiry time provided by the caller (hrtimers, NOHZ code) to a relative expiry time by reading and substracting the current time. The clockevent::set_next_event() callback must then read the counter again to convert the relative expiry back into a absolute one. 2) The conversion factors from nanoseconds to counter clock cycles are set up when the clockevent is registered. When NTP applies corrections then the clockevent conversion factors can deviate from the clocksource conversion substantially which either results in timers firing late or in the worst case early. The early expiry then needs to do a reprogam with a short delta. In most cases this is papered over by the fact that the read in the set_next_event() callback happens after the read which is used to calculate the delta. So the tendency is that timers expire mostly late. All of this can be avoided by providing support for these devices in the core code: 1) The timekeeping core keeps track of the last update to the clocksource by storing the base nanoseconds and the corresponding clocksource counter value. That's used to keep the conversion math for reading the time within 64-bit in the common case. This information can be used to avoid both reads of the underlying clocksource in the clockevents reprogramming path: delta = expiry - base_ns; cycles = base_cycles + ((delta * clockevent::mult) >> clockevent::shift); The resulting cycles value can be directly used to program the comparator. 2) As #1 does not longer provide the "compensation" through the second read the deviation of the clocksource and clockevent conversions caused by NTP become more prominent. 
This can be cured by letting the timekeeping core compute and store the reverse conversion factors when the clocksource cycles to nanoseconds factors are modified by NTP: CS::MULT (1 << NS_TO_CYC_SHIFT) --------------- = ---------------------- (1 << CS:SHIFT) NS_TO_CYC_MULT Ergo: NS_TO_CYC_MULT = (1 << (CS::SHIFT + NS_TO_CYC_SHIFT)) / CS::MULT The NS_TO_CYC_SHIFT value is calculated when the clocksource is installed so that it aims for a one hour maximum sleep time. Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Link: https://patch.msgid.link/20260224163429.944763521@kernel.org --- include/linux/clocksource.h | 1 + include/linux/timekeeper_internal.h | 8 +++ kernel/time/Kconfig | 3 + kernel/time/timekeeping.c | 110 ++++++++++++++++++++++++++++++++++++ kernel/time/timekeeping.h | 2 + 5 files changed, 124 insertions(+) (limited to 'include/linux') diff --git a/include/linux/clocksource.h b/include/linux/clocksource.h index 54366d5c4d19..25774fc5b53d 100644 --- a/include/linux/clocksource.h +++ b/include/linux/clocksource.h @@ -150,6 +150,7 @@ struct clocksource { #define CLOCK_SOURCE_RESELECT 0x100 #define CLOCK_SOURCE_VERIFY_PERCPU 0x200 #define CLOCK_SOURCE_CAN_INLINE_READ 0x400 +#define CLOCK_SOURCE_HAS_COUPLED_CLOCK_EVENT 0x800 /* simplify initialization of mask field */ #define CLOCKSOURCE_MASK(bits) GENMASK_ULL((bits) - 1, 0) diff --git a/include/linux/timekeeper_internal.h b/include/linux/timekeeper_internal.h index b8ae89ea28ab..e36d11e33e0c 100644 --- a/include/linux/timekeeper_internal.h +++ b/include/linux/timekeeper_internal.h @@ -72,6 +72,10 @@ struct tk_read_base { * @id: The timekeeper ID * @tkr_raw: The readout base structure for CLOCK_MONOTONIC_RAW * @raw_sec: CLOCK_MONOTONIC_RAW time in seconds + * @cs_id: The ID of the current clocksource + * @cs_ns_to_cyc_mult: Multiplicator for nanoseconds to cycles conversion + * @cs_ns_to_cyc_shift: Shift value for nanoseconds to cycles conversion + * @cs_ns_to_cyc_maxns: Maximum nanoseconds to cyles conversion range * @clock_was_set_seq: The sequence number of clock was set events * @cs_was_changed_seq: The sequence number of clocksource change events * @clock_valid: Indicator for valid clock @@ -159,6 +163,10 @@ struct timekeeper { u64 raw_sec; /* Cachline 3 and 4 (timekeeping internal variables): */ + enum clocksource_ids cs_id; + u32 cs_ns_to_cyc_mult; + u32 cs_ns_to_cyc_shift; + u64 cs_ns_to_cyc_maxns; unsigned int clock_was_set_seq; u8 cs_was_changed_seq; u8 clock_valid; diff --git a/kernel/time/Kconfig b/kernel/time/Kconfig index 07b048ba0cca..b51bc5625129 100644 --- a/kernel/time/Kconfig +++ b/kernel/time/Kconfig @@ -47,6 +47,9 @@ config GENERIC_CLOCKEVENTS_BROADCAST_IDLE config GENERIC_CLOCKEVENTS_MIN_ADJUST bool +config GENERIC_CLOCKEVENTS_COUPLED + bool + # Generic update of CMOS clock config GENERIC_CMOS_UPDATE bool diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 63aa31f02ebc..b7a0f93011e0 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -391,6 +391,20 @@ static void tk_setup_internals(struct timekeeper *tk, struct clocksource *clock) tk->tkr_raw.mult = clock->mult; tk->ntp_err_mult = 0; tk->skip_second_overflow = 0; + + tk->cs_id = clock->id; + + /* Coupled clockevent data */ + if (IS_ENABLED(CONFIG_GENERIC_CLOCKEVENTS_COUPLED) && + clock->flags & CLOCK_SOURCE_HAS_COUPLED_CLOCK_EVENT) { + /* + * Aim for an one hour maximum delta and use KHz to handle + * clocksources with a frequency above 4GHz correctly as + * the frequency argument of 
clocks_calc_mult_shift() is u32. + */ + clocks_calc_mult_shift(&tk->cs_ns_to_cyc_mult, &tk->cs_ns_to_cyc_shift, + NSEC_PER_MSEC, clock->freq_khz, 3600 * 1000); + } } /* Timekeeper helper functions. */ @@ -720,6 +734,36 @@ static inline void tk_update_ktime_data(struct timekeeper *tk) tk->tkr_raw.base = ns_to_ktime(tk->raw_sec * NSEC_PER_SEC); } +static inline void tk_update_ns_to_cyc(struct timekeeper *tks, struct timekeeper *tkc) +{ + struct tk_read_base *tkrs = &tks->tkr_mono; + struct tk_read_base *tkrc = &tkc->tkr_mono; + unsigned int shift; + + if (!IS_ENABLED(CONFIG_GENERIC_CLOCKEVENTS_COUPLED) || + !(tkrs->clock->flags & CLOCK_SOURCE_HAS_COUPLED_CLOCK_EVENT)) + return; + + if (tkrs->mult == tkrc->mult && tkrs->shift == tkrc->shift) + return; + /* + * The conversion math is simple: + * + * CS::MULT (1 << NS_TO_CYC_SHIFT) + * --------------- = ---------------------- + * (1 << CS:SHIFT) NS_TO_CYC_MULT + * + * Ergo: + * + * NS_TO_CYC_MULT = (1 << (CS::SHIFT + NS_TO_CYC_SHIFT)) / CS::MULT + * + * NS_TO_CYC_SHIFT has been set up in tk_setup_internals() + */ + shift = tkrs->shift + tks->cs_ns_to_cyc_shift; + tks->cs_ns_to_cyc_mult = (u32)div_u64(1ULL << shift, tkrs->mult); + tks->cs_ns_to_cyc_maxns = div_u64(tkrs->clock->mask, tks->cs_ns_to_cyc_mult); +} + /* * Restore the shadow timekeeper from the real timekeeper. */ @@ -754,6 +798,7 @@ static void timekeeping_update_from_shadow(struct tk_data *tkd, unsigned int act tk->tkr_mono.base_real = tk->tkr_mono.base + tk->offs_real; if (tk->id == TIMEKEEPER_CORE) { + tk_update_ns_to_cyc(tk, &tkd->timekeeper); update_vsyscall(tk); update_pvclock_gtod(tk, action & TK_CLOCK_WAS_SET); @@ -808,6 +853,71 @@ static void timekeeping_forward_now(struct timekeeper *tk) tk_update_coarse_nsecs(tk); } +/* + * ktime_expiry_to_cycles - Convert a expiry time to clocksource cycles + * @id: Clocksource ID which is required for validity + * @expires_ns: Absolute CLOCK_MONOTONIC expiry time (nsecs) to be converted + * @cycles: Pointer to storage for corresponding absolute cycles value + * + * Convert a CLOCK_MONOTONIC based absolute expiry time to a cycles value + * based on the correlated clocksource of the clockevent device by using + * the base nanoseconds and cycles values of the last timekeeper update and + * converting the delta between @expires_ns and base nanoseconds to cycles. + * + * This only works for clockevent devices which are using a less than or + * equal comparator against the clocksource. + * + * Utilizing this avoids two clocksource reads for such devices, the + * ktime_get() in clockevents_program_event() to calculate the delta expiry + * value and the readout in the device::set_next_event() callback to + * convert the delta back to a absolute comparator value. + * + * Returns: True if @id matches the current clocksource ID, false otherwise + */ +bool ktime_expiry_to_cycles(enum clocksource_ids id, ktime_t expires_ns, u64 *cycles) +{ + struct timekeeper *tk = &tk_core.timekeeper; + struct tk_read_base *tkrm = &tk->tkr_mono; + ktime_t base_ns, delta_ns, max_ns; + u64 base_cycles, delta_cycles; + unsigned int seq; + u32 mult, shift; + + /* + * Racy check to avoid the seqcount overhead when ID does not match. If + * the relevant clocksource is installed concurrently, then this will + * just delay the switch over to this mechanism until the next event is + * programmed. If the ID is not matching the clock events code will use + * the regular relative set_next_event() callback as before. 
+ */ + if (data_race(tk->cs_id) != id) + return false; + + do { + seq = read_seqcount_begin(&tk_core.seq); + + if (tk->cs_id != id) + return false; + + base_cycles = tkrm->cycle_last; + base_ns = tkrm->base + (tkrm->xtime_nsec >> tkrm->shift); + + mult = tk->cs_ns_to_cyc_mult; + shift = tk->cs_ns_to_cyc_shift; + max_ns = tk->cs_ns_to_cyc_maxns; + + } while (read_seqcount_retry(&tk_core.seq, seq)); + + /* Prevent negative deltas and multiplication overflows */ + delta_ns = min(expires_ns - base_ns, max_ns); + delta_ns = max(delta_ns, 0); + + /* Convert to cycles */ + delta_cycles = ((u64)delta_ns * mult) >> shift; + *cycles = base_cycles + delta_cycles; + return true; +} + /** * ktime_get_real_ts64 - Returns the time of day in a timespec64. * @ts: pointer to the timespec to be set diff --git a/kernel/time/timekeeping.h b/kernel/time/timekeeping.h index 543beba096c7..198d0608db74 100644 --- a/kernel/time/timekeeping.h +++ b/kernel/time/timekeeping.h @@ -9,6 +9,8 @@ extern ktime_t ktime_get_update_offsets_now(unsigned int *cwsseq, ktime_t *offs_boot, ktime_t *offs_tai); +bool ktime_expiry_to_cycles(enum clocksource_ids id, ktime_t expires_ns, u64 *cycles); + extern int timekeeping_valid_for_hres(void); extern u64 timekeeping_max_deferment(void); extern void timekeeping_warp_clock(void); -- cgit v1.2.3 From 89f951a1e8ad781e7ac70eccddab0e0c270485f9 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 24 Feb 2026 17:36:45 +0100 Subject: clockevents: Provide support for clocksource coupled comparators Some clockevent devices are coupled to the system clocksource by implementing a less than or equal comparator which compares the programmed absolute expiry time against the underlying time counter. The timekeeping core provides a function to convert and absolute CLOCK_MONOTONIC based expiry time to a absolute clock cycles time which can be directly fed into the comparator. That spares two time reads in the next event progamming path, one to convert the absolute nanoseconds time to a delta value and the other to convert the delta value back to a absolute time value suitable for the comparator. Provide a new clocksource callback which takes the absolute cycle value and wire it up in clockevents_program_event(). Similar to clocksources allow architectures to inline the rearm operation. 
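To illustrate the driver side, a sketch of how a comparator style device could be wired up (CLOCK_EVT_FEAT_CLOCKSOURCE_COUPLED, ::set_next_coupled and ::cs_id are the additions below; the device itself, its register write helper and the use of the TSC clocksource id are made up for the example):

    /* Illustrative only: a deadline/comparator style clockevent device */
    static void example_set_next_coupled(u64 cycles, struct clock_event_device *evt)
    {
            /*
             * The absolute cycles value comes straight from
             * ktime_expiry_to_cycles() and can be written into the
             * less than or equal comparator without further conversion.
             */
            example_write_deadline(cycles);         /* hypothetical register write */
    }

    static struct clock_event_device example_clockevent = {
            .name              = "example-deadline",
            .features          = CLOCK_EVT_FEAT_ONESHOT |
                                 CLOCK_EVT_FEAT_CLOCKSOURCE_COUPLED,
            .set_next_coupled  = example_set_next_coupled,
            .cs_id             = CSID_X86_TSC,      /* id of the coupled clocksource */
    };

clockevents_program_event() only takes the coupled path when ::cs_id matches the currently installed clocksource; otherwise it falls back to the relative set_next_event() programming as before.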
Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Link: https://patch.msgid.link/20260224163430.010425428@kernel.org --- include/linux/clockchips.h | 7 +++++-- kernel/time/Kconfig | 4 ++++ kernel/time/clockevents.c | 44 +++++++++++++++++++++++++++++++++++++++----- 3 files changed, 48 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/include/linux/clockchips.h b/include/linux/clockchips.h index 5e8f7819f6a6..92d90220c0d4 100644 --- a/include/linux/clockchips.h +++ b/include/linux/clockchips.h @@ -43,8 +43,9 @@ enum clock_event_state { /* * Clock event features */ -# define CLOCK_EVT_FEAT_PERIODIC 0x000001 -# define CLOCK_EVT_FEAT_ONESHOT 0x000002 +# define CLOCK_EVT_FEAT_PERIODIC 0x000001 +# define CLOCK_EVT_FEAT_ONESHOT 0x000002 +# define CLOCK_EVT_FEAT_CLOCKSOURCE_COUPLED 0x000004 /* * x86(64) specific (mis)features: @@ -100,6 +101,7 @@ struct clock_event_device { void (*event_handler)(struct clock_event_device *); int (*set_next_event)(unsigned long evt, struct clock_event_device *); int (*set_next_ktime)(ktime_t expires, struct clock_event_device *); + void (*set_next_coupled)(u64 cycles, struct clock_event_device *); ktime_t next_event; u64 max_delta_ns; u64 min_delta_ns; @@ -107,6 +109,7 @@ struct clock_event_device { u32 shift; enum clock_event_state state_use_accessors; unsigned int features; + enum clocksource_ids cs_id; unsigned long retries; int (*set_state_periodic)(struct clock_event_device *); diff --git a/kernel/time/Kconfig b/kernel/time/Kconfig index b51bc5625129..e1968ab8b37f 100644 --- a/kernel/time/Kconfig +++ b/kernel/time/Kconfig @@ -50,6 +50,10 @@ config GENERIC_CLOCKEVENTS_MIN_ADJUST config GENERIC_CLOCKEVENTS_COUPLED bool +config GENERIC_CLOCKEVENTS_COUPLED_INLINE + select GENERIC_CLOCKEVENTS_COUPLED + bool + # Generic update of CMOS clock config GENERIC_CMOS_UPDATE bool diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c index 5abaeef08e6a..83712aa1d385 100644 --- a/kernel/time/clockevents.c +++ b/kernel/time/clockevents.c @@ -292,6 +292,38 @@ static int clockevents_program_min_delta(struct clock_event_device *dev) #endif /* CONFIG_GENERIC_CLOCKEVENTS_MIN_ADJUST */ +#ifdef CONFIG_GENERIC_CLOCKEVENTS_COUPLED +#ifdef CONFIG_GENERIC_CLOCKEVENTS_COUPLED_INLINE +#include +#else +static __always_inline void +arch_inlined_clockevent_set_next_coupled(u64 u64 cycles, struct clock_event_device *dev) { } +#endif + +static inline bool clockevent_set_next_coupled(struct clock_event_device *dev, ktime_t expires) +{ + u64 cycles; + + if (unlikely(!(dev->features & CLOCK_EVT_FEAT_CLOCKSOURCE_COUPLED))) + return false; + + if (unlikely(!ktime_expiry_to_cycles(dev->cs_id, expires, &cycles))) + return false; + + if (IS_ENABLED(CONFIG_GENERIC_CLOCKEVENTS_COUPLED_INLINE)) + arch_inlined_clockevent_set_next_coupled(cycles, dev); + else + dev->set_next_coupled(cycles, dev); + return true; +} + +#else +static inline bool clockevent_set_next_coupled(struct clock_event_device *dev, ktime_t expires) +{ + return false; +} +#endif + /** * clockevents_program_event - Reprogram the clock event device. * @dev: device to program @@ -300,11 +332,10 @@ static int clockevents_program_min_delta(struct clock_event_device *dev) * * Returns 0 on success, -ETIME when the event is in the past. 
*/ -int clockevents_program_event(struct clock_event_device *dev, ktime_t expires, - bool force) +int clockevents_program_event(struct clock_event_device *dev, ktime_t expires, bool force) { - unsigned long long clc; int64_t delta; + u64 cycles; int rc; if (WARN_ON_ONCE(expires < 0)) @@ -323,6 +354,9 @@ int clockevents_program_event(struct clock_event_device *dev, ktime_t expires, if (unlikely(dev->features & CLOCK_EVT_FEAT_HRTIMER)) return dev->set_next_ktime(expires, dev); + if (likely(clockevent_set_next_coupled(dev, expires))) + return 0; + delta = ktime_to_ns(ktime_sub(expires, ktime_get())); if (delta <= 0) return force ? clockevents_program_min_delta(dev) : -ETIME; @@ -330,8 +364,8 @@ int clockevents_program_event(struct clock_event_device *dev, ktime_t expires, delta = min(delta, (int64_t) dev->max_delta_ns); delta = max(delta, (int64_t) dev->min_delta_ns); - clc = ((unsigned long long) delta * dev->mult) >> dev->shift; - rc = dev->set_next_event((unsigned long) clc, dev); + cycles = ((u64)delta * dev->mult) >> dev->shift; + rc = dev->set_next_event((unsigned long) cycles, dev); return (rc && force) ? clockevents_program_min_delta(dev) : rc; } -- cgit v1.2.3 From 7d27eafe54659d19cef10dab4520cbcdfb17b0e3 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 24 Feb 2026 17:37:18 +0100 Subject: hrtimer: Replace the bitfield in hrtimer_cpu_base Use bool for the various flags as that creates better code in the hot path. Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Link: https://patch.msgid.link/20260224163430.475262618@kernel.org --- include/linux/hrtimer_defs.h | 10 +++++----- kernel/time/hrtimer.c | 25 +++++++++++++------------ 2 files changed, 18 insertions(+), 17 deletions(-) (limited to 'include/linux') diff --git a/include/linux/hrtimer_defs.h b/include/linux/hrtimer_defs.h index 02b010df6570..f9fbf9a48f59 100644 --- a/include/linux/hrtimer_defs.h +++ b/include/linux/hrtimer_defs.h @@ -83,11 +83,11 @@ struct hrtimer_cpu_base { unsigned int cpu; unsigned int active_bases; unsigned int clock_was_set_seq; - unsigned int hres_active : 1, - in_hrtirq : 1, - hang_detected : 1, - softirq_activated : 1, - online : 1; + bool hres_active; + bool in_hrtirq; + bool hang_detected; + bool softirq_activated; + bool online; #ifdef CONFIG_HIGH_RES_TIMERS unsigned int nr_events; unsigned short nr_retries; diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c index e6f02e980371..3b80a4453ee6 100644 --- a/kernel/time/hrtimer.c +++ b/kernel/time/hrtimer.c @@ -741,7 +741,7 @@ static void hrtimer_switch_to_hres(void) pr_warn("Could not switch to high resolution mode on CPU %u\n", base->cpu); return; } - base->hres_active = 1; + base->hres_active = true; hrtimer_resolution = HIGH_RES_NSEC; tick_setup_sched_timer(true); @@ -1854,7 +1854,7 @@ static __latent_entropy void hrtimer_run_softirq(void) now = hrtimer_update_base(cpu_base); __hrtimer_run_queues(cpu_base, now, flags, HRTIMER_ACTIVE_SOFT); - cpu_base->softirq_activated = 0; + cpu_base->softirq_activated = false; hrtimer_update_softirq_timer(cpu_base, true); raw_spin_unlock_irqrestore(&cpu_base->lock, flags); @@ -1881,7 +1881,7 @@ void hrtimer_interrupt(struct clock_event_device *dev) raw_spin_lock_irqsave(&cpu_base->lock, flags); entry_time = now = hrtimer_update_base(cpu_base); retry: - cpu_base->in_hrtirq = 1; + cpu_base->in_hrtirq = true; /* * Set expires_next to KTIME_MAX, which prevents that remote CPUs queue * timers while __hrtimer_run_queues() is expiring the clock bases. 
@@ -1892,7 +1892,7 @@ retry: if (!ktime_before(now, cpu_base->softirq_expires_next)) { cpu_base->softirq_expires_next = KTIME_MAX; - cpu_base->softirq_activated = 1; + cpu_base->softirq_activated = true; raise_timer_softirq(HRTIMER_SOFTIRQ); } @@ -1905,12 +1905,12 @@ retry: * against it. */ cpu_base->expires_next = expires_next; - cpu_base->in_hrtirq = 0; + cpu_base->in_hrtirq = false; raw_spin_unlock_irqrestore(&cpu_base->lock, flags); /* Reprogramming necessary ? */ if (!tick_program_event(expires_next, 0)) { - cpu_base->hang_detected = 0; + cpu_base->hang_detected = false; return; } @@ -1939,7 +1939,7 @@ retry: * time away. */ cpu_base->nr_hangs++; - cpu_base->hang_detected = 1; + cpu_base->hang_detected = true; raw_spin_unlock_irqrestore(&cpu_base->lock, flags); delta = ktime_sub(now, entry_time); @@ -1987,7 +1987,7 @@ void hrtimer_run_queues(void) if (!ktime_before(now, cpu_base->softirq_expires_next)) { cpu_base->softirq_expires_next = KTIME_MAX; - cpu_base->softirq_activated = 1; + cpu_base->softirq_activated = true; raise_timer_softirq(HRTIMER_SOFTIRQ); } @@ -2239,13 +2239,14 @@ int hrtimers_cpu_starting(unsigned int cpu) /* Clear out any left over state from a CPU down operation */ cpu_base->active_bases = 0; - cpu_base->hres_active = 0; - cpu_base->hang_detected = 0; + cpu_base->hres_active = false; + cpu_base->hang_detected = false; cpu_base->next_timer = NULL; cpu_base->softirq_next_timer = NULL; cpu_base->expires_next = KTIME_MAX; cpu_base->softirq_expires_next = KTIME_MAX; - cpu_base->online = 1; + cpu_base->softirq_activated = false; + cpu_base->online = true; return 0; } @@ -2303,7 +2304,7 @@ int hrtimers_cpu_dying(unsigned int dying_cpu) smp_call_function_single(ncpu, retrigger_next_event, NULL, 0); raw_spin_unlock(&new_base->lock); - old_base->online = 0; + old_base->online = false; raw_spin_unlock(&old_base->lock); return 0; -- cgit v1.2.3 From 22f011be7aaa77ca8f502b9dd07b7334f9965d18 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 24 Feb 2026 17:37:23 +0100 Subject: hrtimer: Convert state and properties to boolean All 'u8' flags are true booleans, so make it entirely clear that these can only contain true or false. This is especially true for hrtimer::state, which has a historical leftover of using the state with bitwise operations. That was used in the early hrtimer implementation with several bits, but then converted to a boolean state. But that conversion missed to replace the bit OR and bit check operations all over the place, which creates suboptimal code. As of today 'state' is a misnomer because it's only purpose is to reflect whether the timer is enqueued into the RB-tree or not. Rename it to 'is_queued' and make all operations on it boolean. This reduces text size from 8926 to 8732 bytes. 
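As a minimal illustration (not taken from the patch; struct and helper names are made up) of why the plain boolean form generates better code than the bit operations it replaces:

    #include <linux/types.h>

    /* Old style: single-bit information kept behind bit operations */
    struct qstate_bits { u8 state; };                       /* bit 0 == queued */

    static void set_queued_bits(struct qstate_bits *s)
    {
            s->state |= 0x01;                               /* load, OR, store */
    }

    /* New style: the information is a plain boolean */
    struct qstate_bool { bool is_queued; };

    static void set_queued_bool(struct qstate_bool *s)
    {
            s->is_queued = true;                            /* single byte store */
    }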
Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Link: https://patch.msgid.link/20260224163430.542427240@kernel.org --- include/linux/hrtimer.h | 31 ++--------------------- include/linux/hrtimer_types.h | 12 ++++----- kernel/time/hrtimer.c | 58 +++++++++++++++++++++++++++++-------------- kernel/time/timer_list.c | 2 +- 4 files changed, 49 insertions(+), 54 deletions(-) (limited to 'include/linux') diff --git a/include/linux/hrtimer.h b/include/linux/hrtimer.h index c924bb2498db..4ad4a454b4c5 100644 --- a/include/linux/hrtimer.h +++ b/include/linux/hrtimer.h @@ -63,33 +63,6 @@ enum hrtimer_mode { HRTIMER_MODE_REL_PINNED_HARD = HRTIMER_MODE_REL_PINNED | HRTIMER_MODE_HARD, }; -/* - * Values to track state of the timer - * - * Possible states: - * - * 0x00 inactive - * 0x01 enqueued into rbtree - * - * The callback state is not part of the timer->state because clearing it would - * mean touching the timer after the callback, this makes it impossible to free - * the timer from the callback function. - * - * Therefore we track the callback state in: - * - * timer->base->cpu_base->running == timer - * - * On SMP it is possible to have a "callback function running and enqueued" - * status. It happens for example when a posix timer expired and the callback - * queued a signal. Between dropping the lock which protects the posix timer - * and reacquiring the base lock of the hrtimer, another CPU can deliver the - * signal and rearm the timer. - * - * All state transitions are protected by cpu_base->lock. - */ -#define HRTIMER_STATE_INACTIVE 0x00 -#define HRTIMER_STATE_ENQUEUED 0x01 - /** * struct hrtimer_sleeper - simple sleeper structure * @timer: embedded timer structure @@ -300,8 +273,8 @@ extern bool hrtimer_active(const struct hrtimer *timer); */ static inline bool hrtimer_is_queued(struct hrtimer *timer) { - /* The READ_ONCE pairs with the update functions of timer->state */ - return !!(READ_ONCE(timer->state) & HRTIMER_STATE_ENQUEUED); + /* The READ_ONCE pairs with the update functions of timer->is_queued */ + return READ_ONCE(timer->is_queued); } /* diff --git a/include/linux/hrtimer_types.h b/include/linux/hrtimer_types.h index 64381c64cdbd..0e22bc91d00f 100644 --- a/include/linux/hrtimer_types.h +++ b/include/linux/hrtimer_types.h @@ -28,7 +28,7 @@ enum hrtimer_restart { * was armed. * @function: timer expiry callback function * @base: pointer to the timer base (per cpu and per clock) - * @state: state information (See bit values above) + * @is_queued: Indicates whether a timer is enqueued or not * @is_rel: Set if the timer was armed relative * @is_soft: Set if hrtimer will be expired in soft interrupt context. 
* @is_hard: Set if hrtimer will be expired in hard interrupt context @@ -43,11 +43,11 @@ struct hrtimer { ktime_t _softexpires; enum hrtimer_restart (*__private function)(struct hrtimer *); struct hrtimer_clock_base *base; - u8 state; - u8 is_rel; - u8 is_soft; - u8 is_hard; - u8 is_lazy; + bool is_queued; + bool is_rel; + bool is_soft; + bool is_hard; + bool is_lazy; }; #endif /* _LINUX_HRTIMER_TYPES_H */ diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c index 3b80a4453ee6..6bab3b7eb0de 100644 --- a/kernel/time/hrtimer.c +++ b/kernel/time/hrtimer.c @@ -49,6 +49,28 @@ #include "tick-internal.h" +/* + * Constants to set the queued state of the timer (INACTIVE, ENQUEUED) + * + * The callback state is kept separate in the CPU base because having it in + * the timer would required touching the timer after the callback, which + * makes it impossible to free the timer from the callback function. + * + * Therefore we track the callback state in: + * + * timer->base->cpu_base->running == timer + * + * On SMP it is possible to have a "callback function running and enqueued" + * status. It happens for example when a posix timer expired and the callback + * queued a signal. Between dropping the lock which protects the posix timer + * and reacquiring the base lock of the hrtimer, another CPU can deliver the + * signal and rearm the timer. + * + * All state transitions are protected by cpu_base->lock. + */ +#define HRTIMER_STATE_INACTIVE false +#define HRTIMER_STATE_ENQUEUED true + /* * The resolution of the clocks. The resolution value is returned in * the clock_getres() system call to give application programmers an @@ -1038,7 +1060,7 @@ u64 hrtimer_forward(struct hrtimer *timer, ktime_t now, ktime_t interval) if (delta < 0) return 0; - if (WARN_ON(timer->state & HRTIMER_STATE_ENQUEUED)) + if (WARN_ON(timer->is_queued)) return 0; if (interval < hrtimer_resolution) @@ -1082,7 +1104,7 @@ static bool enqueue_hrtimer(struct hrtimer *timer, struct hrtimer_clock_base *ba base->cpu_base->active_bases |= 1 << base->index; /* Pairs with the lockless read in hrtimer_is_queued() */ - WRITE_ONCE(timer->state, HRTIMER_STATE_ENQUEUED); + WRITE_ONCE(timer->is_queued, HRTIMER_STATE_ENQUEUED); return timerqueue_add(&base->active, &timer->node); } @@ -1096,18 +1118,18 @@ static bool enqueue_hrtimer(struct hrtimer *timer, struct hrtimer_clock_base *ba * anyway (e.g. 
timer interrupt) */ static void __remove_hrtimer(struct hrtimer *timer, struct hrtimer_clock_base *base, - u8 newstate, bool reprogram) + bool newstate, bool reprogram) { struct hrtimer_cpu_base *cpu_base = base->cpu_base; - u8 state = timer->state; lockdep_assert_held(&cpu_base->lock); - /* Pairs with the lockless read in hrtimer_is_queued() */ - WRITE_ONCE(timer->state, newstate); - if (!(state & HRTIMER_STATE_ENQUEUED)) + if (!timer->is_queued) return; + /* Pairs with the lockless read in hrtimer_is_queued() */ + WRITE_ONCE(timer->is_queued, newstate); + if (!timerqueue_del(&base->active, &timer->node)) cpu_base->active_bases &= ~(1 << base->index); @@ -1127,11 +1149,11 @@ static void __remove_hrtimer(struct hrtimer *timer, struct hrtimer_clock_base *b static inline bool remove_hrtimer(struct hrtimer *timer, struct hrtimer_clock_base *base, bool restart, bool keep_local) { - u8 state = timer->state; + bool queued_state = timer->is_queued; lockdep_assert_held(&base->cpu_base->lock); - if (state & HRTIMER_STATE_ENQUEUED) { + if (queued_state) { bool reprogram; debug_hrtimer_deactivate(timer); @@ -1153,11 +1175,11 @@ static inline bool remove_hrtimer(struct hrtimer *timer, struct hrtimer_clock_ba * and a moment later when it's requeued). */ if (!restart) - state = HRTIMER_STATE_INACTIVE; + queued_state = HRTIMER_STATE_INACTIVE; else reprogram &= !keep_local; - __remove_hrtimer(timer, base, state, reprogram); + __remove_hrtimer(timer, base, queued_state, reprogram); return true; } return false; @@ -1704,7 +1726,7 @@ bool hrtimer_active(const struct hrtimer *timer) base = READ_ONCE(timer->base); seq = raw_read_seqcount_begin(&base->seq); - if (timer->state != HRTIMER_STATE_INACTIVE || base->running == timer) + if (timer->is_queued || base->running == timer) return true; } while (read_seqcount_retry(&base->seq, seq) || base != READ_ONCE(timer->base)); @@ -1721,7 +1743,7 @@ EXPORT_SYMBOL_GPL(hrtimer_active); * - callback: the timer is being ran * - post: the timer is inactive or (re)queued * - * On the read side we ensure we observe timer->state and cpu_base->running + * On the read side we ensure we observe timer->is_queued and cpu_base->running * from the same section, if anything changed while we looked at it, we retry. * This includes timer->base changing because sequence numbers alone are * insufficient for that. @@ -1744,11 +1766,11 @@ static void __run_hrtimer(struct hrtimer_cpu_base *cpu_base, struct hrtimer_cloc base->running = timer; /* - * Separate the ->running assignment from the ->state assignment. + * Separate the ->running assignment from the ->is_queued assignment. * * As with a regular write barrier, this ensures the read side in * hrtimer_active() cannot observe base->running == NULL && - * timer->state == INACTIVE. + * timer->is_queued == INACTIVE. */ raw_write_seqcount_barrier(&base->seq); @@ -1787,15 +1809,15 @@ static void __run_hrtimer(struct hrtimer_cpu_base *cpu_base, struct hrtimer_cloc * hrtimer_start_range_ns() can have popped in and enqueued the timer * for us already. */ - if (restart != HRTIMER_NORESTART && !(timer->state & HRTIMER_STATE_ENQUEUED)) + if (restart == HRTIMER_RESTART && !timer->is_queued) enqueue_hrtimer(timer, base, HRTIMER_MODE_ABS, false); /* - * Separate the ->running assignment from the ->state assignment. + * Separate the ->running assignment from the ->is_queued assignment. * * As with a regular write barrier, this ensures the read side in * hrtimer_active() cannot observe base->running.timer == NULL && - * timer->state == INACTIVE. 
+ * timer->is_queued == INACTIVE. */ raw_write_seqcount_barrier(&base->seq); diff --git a/kernel/time/timer_list.c b/kernel/time/timer_list.c index 488e47e96e93..19e61826b7de 100644 --- a/kernel/time/timer_list.c +++ b/kernel/time/timer_list.c @@ -47,7 +47,7 @@ print_timer(struct seq_file *m, struct hrtimer *taddr, struct hrtimer *timer, int idx, u64 now) { SEQ_printf(m, " #%d: <%p>, %ps", idx, taddr, ACCESS_PRIVATE(timer, function)); - SEQ_printf(m, ", S:%02x", timer->state); + SEQ_printf(m, ", S:%02x", timer->is_queued); SEQ_printf(m, "\n"); SEQ_printf(m, " # expires at %Lu-%Lu nsecs [in %Ld to %Ld nsecs]\n", (unsigned long long)ktime_to_ns(hrtimer_get_softexpires(timer)), -- cgit v1.2.3 From 9e07a9c980eaa93fd1bba722d31eeb4bf0cbbfb4 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 24 Feb 2026 17:37:53 +0100 Subject: hrtimer: Rename hrtimer_cpu_base::in_hrtirq to deferred_rearm The upcoming deferred rearming scheme has the same effect as the deferred rearming when the hrtimer interrupt is executing. So it can reuse the in_hrtirq flag, but when it gets deferred beyond the hrtimer interrupt path, then the name does not make sense anymore. Rename it to deferred_rearm upfront to keep the actual functional change separate from the mechanical rename churn. Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Link: https://patch.msgid.link/20260224163430.935623347@kernel.org --- include/linux/hrtimer_defs.h | 4 ++-- kernel/time/hrtimer.c | 28 +++++++++------------------- 2 files changed, 11 insertions(+), 21 deletions(-) (limited to 'include/linux') diff --git a/include/linux/hrtimer_defs.h b/include/linux/hrtimer_defs.h index f9fbf9a48f59..2c3bdbd562d2 100644 --- a/include/linux/hrtimer_defs.h +++ b/include/linux/hrtimer_defs.h @@ -53,7 +53,7 @@ enum hrtimer_base_type { * @active_bases: Bitfield to mark bases with active timers * @clock_was_set_seq: Sequence counter of clock was set events * @hres_active: State of high resolution mode - * @in_hrtirq: hrtimer_interrupt() is currently executing + * @deferred_rearm: A deferred rearm is pending * @hang_detected: The last hrtimer interrupt detected a hang * @softirq_activated: displays, if the softirq is raised - update of softirq * related settings is not required then. @@ -84,7 +84,7 @@ struct hrtimer_cpu_base { unsigned int active_bases; unsigned int clock_was_set_seq; bool hres_active; - bool in_hrtirq; + bool deferred_rearm; bool hang_detected; bool softirq_activated; bool online; diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c index 2e05a1885d24..6f05d2569286 100644 --- a/kernel/time/hrtimer.c +++ b/kernel/time/hrtimer.c @@ -883,11 +883,8 @@ static void hrtimer_reprogram(struct hrtimer *timer, bool reprogram) if (expires >= cpu_base->expires_next) return; - /* - * If the hrtimer interrupt is running, then it will reevaluate the - * clock bases and reprogram the clock event device. - */ - if (cpu_base->in_hrtirq) + /* If a deferred rearm is pending skip reprogramming the device */ + if (cpu_base->deferred_rearm) return; cpu_base->next_timer = timer; @@ -921,12 +918,8 @@ static bool update_needs_ipi(struct hrtimer_cpu_base *cpu_base, unsigned int act if (seq == cpu_base->clock_was_set_seq) return false; - /* - * If the remote CPU is currently handling an hrtimer interrupt, it - * will reevaluate the first expiring timer of all clock bases - * before reprogramming. Nothing to do here. 
- */ - if (cpu_base->in_hrtirq) + /* If a deferred rearm is pending the remote CPU will take care of it */ + if (cpu_base->deferred_rearm) return false; /* @@ -1334,11 +1327,8 @@ static bool __hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim, u64 del first = enqueue_hrtimer(timer, base, mode, was_armed); } - /* - * If the hrtimer interrupt is running, then it will reevaluate the - * clock bases and reprogram the clock event device. - */ - if (cpu_base->in_hrtirq) + /* If a deferred rearm is pending skip reprogramming the device */ + if (cpu_base->deferred_rearm) return false; if (!was_first || cpu_base != this_cpu_base) { @@ -1947,14 +1937,14 @@ static __latent_entropy void hrtimer_run_softirq(void) /* * Very similar to hrtimer_force_reprogram(), except it deals with - * in_hrtirq and hang_detected. + * deferred_rearm and hang_detected. */ static void hrtimer_rearm(struct hrtimer_cpu_base *cpu_base, ktime_t now) { ktime_t expires_next = hrtimer_update_next_event(cpu_base); cpu_base->expires_next = expires_next; - cpu_base->in_hrtirq = false; + cpu_base->deferred_rearm = false; if (unlikely(cpu_base->hang_detected)) { /* @@ -1985,7 +1975,7 @@ void hrtimer_interrupt(struct clock_event_device *dev) raw_spin_lock_irqsave(&cpu_base->lock, flags); entry_time = now = hrtimer_update_base(cpu_base); retry: - cpu_base->in_hrtirq = true; + cpu_base->deferred_rearm = true; /* * Set expires_next to KTIME_MAX, which prevents that remote CPUs queue * timers while __hrtimer_run_queues() is expiring the clock bases. -- cgit v1.2.3 From a43b4856bc039675165a50d9ef5f41b28520f0f4 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 24 Feb 2026 17:37:58 +0100 Subject: hrtimer: Prepare stubs for deferred rearming The hrtimer interrupt expires timers and at the end of the interrupt it rearms the clockevent device for the next expiring timer. That's obviously correct, but in the case that a expired timer set NEED_RESCHED the return from interrupt ends up in schedule(). If HRTICK is enabled then schedule() will modify the hrtick timer, which causes another reprogramming of the hardware. That can be avoided by deferring the rearming to the return from interrupt path and if the return results in a immediate schedule() invocation then it can be deferred until the end of schedule(). To make this correct the affected code parts need to be made aware of this. Provide empty stubs for the deferred rearming mechanism, so that the relevant code changes for entry, softirq and scheduler can be split up into separate changes independent of the actual enablement in the hrtimer code. 
Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Link: https://patch.msgid.link/20260224163431.000891171@kernel.org --- include/linux/hrtimer.h | 1 + include/linux/hrtimer_rearm.h | 21 +++++++++++++++++++++ kernel/time/Kconfig | 4 ++++ 3 files changed, 26 insertions(+) create mode 100644 include/linux/hrtimer_rearm.h (limited to 'include/linux') diff --git a/include/linux/hrtimer.h b/include/linux/hrtimer.h index 4ad4a454b4c5..c087b7142330 100644 --- a/include/linux/hrtimer.h +++ b/include/linux/hrtimer.h @@ -13,6 +13,7 @@ #define _LINUX_HRTIMER_H #include +#include #include #include #include diff --git a/include/linux/hrtimer_rearm.h b/include/linux/hrtimer_rearm.h new file mode 100644 index 000000000000..6293076c03a6 --- /dev/null +++ b/include/linux/hrtimer_rearm.h @@ -0,0 +1,21 @@ +// SPDX-License-Identifier: GPL-2.0 +#ifndef _LINUX_HRTIMER_REARM_H +#define _LINUX_HRTIMER_REARM_H + +#ifdef CONFIG_HRTIMER_REARM_DEFERRED +static __always_inline void __hrtimer_rearm_deferred(void) { } +static __always_inline void hrtimer_rearm_deferred(void) { } +static __always_inline void hrtimer_rearm_deferred_tif(unsigned long tif_work) { } +static __always_inline bool +hrtimer_rearm_deferred_user_irq(unsigned long *tif_work, const unsigned long tif_mask) { return false; } +static __always_inline bool hrtimer_test_and_clear_rearm_deferred(void) { return false; } +#else /* CONFIG_HRTIMER_REARM_DEFERRED */ +static __always_inline void __hrtimer_rearm_deferred(void) { } +static __always_inline void hrtimer_rearm_deferred(void) { } +static __always_inline void hrtimer_rearm_deferred_tif(unsigned long tif_work) { } +static __always_inline bool +hrtimer_rearm_deferred_user_irq(unsigned long *tif_work, const unsigned long tif_mask) { return false; } +static __always_inline bool hrtimer_test_and_clear_rearm_deferred(void) { return false; } +#endif /* !CONFIG_HRTIMER_REARM_DEFERRED */ + +#endif diff --git a/kernel/time/Kconfig b/kernel/time/Kconfig index e1968ab8b37f..b95bfee3f592 100644 --- a/kernel/time/Kconfig +++ b/kernel/time/Kconfig @@ -58,6 +58,10 @@ config GENERIC_CLOCKEVENTS_COUPLED_INLINE config GENERIC_CMOS_UPDATE bool +# Deferred rearming of the hrtimer interrupt +config HRTIMER_REARM_DEFERRED + def_bool n + # Select to handle posix CPU timers from task_work # and not from the timer interrupt context config HAVE_POSIX_CPU_TIMERS_TASK_WORK -- cgit v1.2.3 From 0e98eb14814ef669e07ca6effaa03df2e57ef956 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 24 Feb 2026 17:38:03 +0100 Subject: entry: Prepare for deferred hrtimer rearming The hrtimer interrupt expires timers and at the end of the interrupt it rearms the clockevent device for the next expiring timer. That's obviously correct, but in the case that a expired timer sets NEED_RESCHED the return from interrupt ends up in schedule(). If HRTICK is enabled then schedule() will modify the hrtick timer, which causes another reprogramming of the hardware. That can be avoided by deferring the rearming to the return from interrupt path and if the return results in a immediate schedule() invocation then it can be deferred until the end of schedule(), which avoids multiple rearms and re-evaluation of the timer wheel. As this is only relevant for interrupt to user return split the work masks up and hand them in as arguments from the relevant exit to user functions, which allows the compiler to optimize the deferred handling out for the syscall exit to user case. 
Add the rearm checks to the appropriate places in the exit to user loop and the interrupt return to kernel path, so that the rearming is always guaranteed. In the return to user space path this is handled in the same way as TIF_RSEQ to avoid extra instructions in the fast path, which are truly hurtful for device interrupt heavy workloads, as the extra instructions and conditionals, while benign at first sight, quickly accumulate into measurable regressions. The return from syscall path is completely unaffected due to the above mentioned split, so syscall heavy workloads won't have any extra burden. For now this is just placing empty stubs at the right places, which are all optimized out by the compiler until the actual functionality is in place. Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Link: https://patch.msgid.link/20260224163431.066469985@kernel.org --- include/linux/irq-entry-common.h | 25 +++++++++++++++++++------ include/linux/rseq_entry.h | 16 +++++++++++++--- kernel/entry/common.c | 4 +++- 3 files changed, 35 insertions(+), 10 deletions(-) (limited to 'include/linux') diff --git a/include/linux/irq-entry-common.h b/include/linux/irq-entry-common.h index d26d1b1bcbfb..b976946b3cdb 100644 --- a/include/linux/irq-entry-common.h +++ b/include/linux/irq-entry-common.h @@ -3,6 +3,7 @@ #define __LINUX_IRQENTRYCOMMON_H #include +#include #include #include #include @@ -33,6 +34,14 @@ _TIF_PATCH_PENDING | _TIF_NOTIFY_SIGNAL | _TIF_RSEQ | \ ARCH_EXIT_TO_USER_MODE_WORK) +#ifdef CONFIG_HRTIMER_REARM_DEFERRED +# define EXIT_TO_USER_MODE_WORK_SYSCALL (EXIT_TO_USER_MODE_WORK) +# define EXIT_TO_USER_MODE_WORK_IRQ (EXIT_TO_USER_MODE_WORK | _TIF_HRTIMER_REARM) +#else +# define EXIT_TO_USER_MODE_WORK_SYSCALL (EXIT_TO_USER_MODE_WORK) +# define EXIT_TO_USER_MODE_WORK_IRQ (EXIT_TO_USER_MODE_WORK) +#endif + /** * arch_enter_from_user_mode - Architecture specific sanity check for user mode regs * @regs: Pointer to currents pt_regs @@ -203,6 +212,7 @@ unsigned long exit_to_user_mode_loop(struct pt_regs *regs, unsigned long ti_work /** * __exit_to_user_mode_prepare - call exit_to_user_mode_loop() if required * @regs: Pointer to pt_regs on entry stack + * @work_mask: Which TIF bits need to be evaluated * * 1) check that interrupts are disabled * 2) call tick_nohz_user_enter_prepare() @@ -212,7 +222,8 @@ unsigned long exit_to_user_mode_loop(struct pt_regs *regs, unsigned long ti_work * * Don't invoke directly, use the syscall/irqentry_ prefixed variants below */ -static __always_inline void __exit_to_user_mode_prepare(struct pt_regs *regs) +static __always_inline void __exit_to_user_mode_prepare(struct pt_regs *regs, + const unsigned long work_mask) { unsigned long ti_work; @@ -222,8 +233,10 @@ static __always_inline void __exit_to_user_mode_prepare(struct pt_regs *regs) tick_nohz_user_enter_prepare(); ti_work = read_thread_flags(); - if (unlikely(ti_work & EXIT_TO_USER_MODE_WORK)) - ti_work = exit_to_user_mode_loop(regs, ti_work); + if (unlikely(ti_work & work_mask)) { + if (!hrtimer_rearm_deferred_user_irq(&ti_work, work_mask)) + ti_work = exit_to_user_mode_loop(regs, ti_work); + } arch_exit_to_user_mode_prepare(regs, ti_work); } @@ -239,7 +252,7 @@ static __always_inline void __exit_to_user_mode_validate(void) /* Temporary workaround to keep ARM64 alive */ static __always_inline void exit_to_user_mode_prepare_legacy(struct pt_regs *regs) { - __exit_to_user_mode_prepare(regs); + __exit_to_user_mode_prepare(regs, EXIT_TO_USER_MODE_WORK);
rseq_exit_to_user_mode_legacy(); __exit_to_user_mode_validate(); } @@ -253,7 +266,7 @@ static __always_inline void exit_to_user_mode_prepare_legacy(struct pt_regs *reg */ static __always_inline void syscall_exit_to_user_mode_prepare(struct pt_regs *regs) { - __exit_to_user_mode_prepare(regs); + __exit_to_user_mode_prepare(regs, EXIT_TO_USER_MODE_WORK_SYSCALL); rseq_syscall_exit_to_user_mode(); __exit_to_user_mode_validate(); } @@ -267,7 +280,7 @@ static __always_inline void syscall_exit_to_user_mode_prepare(struct pt_regs *re */ static __always_inline void irqentry_exit_to_user_mode_prepare(struct pt_regs *regs) { - __exit_to_user_mode_prepare(regs); + __exit_to_user_mode_prepare(regs, EXIT_TO_USER_MODE_WORK_IRQ); rseq_irqentry_exit_to_user_mode(); __exit_to_user_mode_validate(); } diff --git a/include/linux/rseq_entry.h b/include/linux/rseq_entry.h index cbc4a791618b..17956e119e81 100644 --- a/include/linux/rseq_entry.h +++ b/include/linux/rseq_entry.h @@ -40,6 +40,7 @@ DECLARE_PER_CPU(struct rseq_stats, rseq_stats); #endif /* !CONFIG_RSEQ_STATS */ #ifdef CONFIG_RSEQ +#include #include #include #include @@ -110,7 +111,7 @@ static __always_inline void rseq_slice_clear_grant(struct task_struct *t) t->rseq.slice.state.granted = false; } -static __always_inline bool rseq_grant_slice_extension(bool work_pending) +static __always_inline bool __rseq_grant_slice_extension(bool work_pending) { struct task_struct *curr = current; struct rseq_slice_ctrl usr_ctrl; @@ -215,11 +216,20 @@ efault: return false; } +static __always_inline bool rseq_grant_slice_extension(unsigned long ti_work, unsigned long mask) +{ + if (unlikely(__rseq_grant_slice_extension(ti_work & mask))) { + hrtimer_rearm_deferred_tif(ti_work); + return true; + } + return false; +} + #else /* CONFIG_RSEQ_SLICE_EXTENSION */ static inline bool rseq_slice_extension_enabled(void) { return false; } static inline bool rseq_arm_slice_extension_timer(void) { return false; } static inline void rseq_slice_clear_grant(struct task_struct *t) { } -static inline bool rseq_grant_slice_extension(bool work_pending) { return false; } +static inline bool rseq_grant_slice_extension(unsigned long ti_work, unsigned long mask) { return false; } #endif /* !CONFIG_RSEQ_SLICE_EXTENSION */ bool rseq_debug_update_user_cs(struct task_struct *t, struct pt_regs *regs, unsigned long csaddr); @@ -778,7 +788,7 @@ static inline void rseq_syscall_exit_to_user_mode(void) { } static inline void rseq_irqentry_exit_to_user_mode(void) { } static inline void rseq_exit_to_user_mode_legacy(void) { } static inline void rseq_debug_syscall_return(struct pt_regs *regs) { } -static inline bool rseq_grant_slice_extension(bool work_pending) { return false; } +static inline bool rseq_grant_slice_extension(unsigned long ti_work, unsigned long mask) { return false; } #endif /* !CONFIG_RSEQ */ #endif /* _LINUX_RSEQ_ENTRY_H */ diff --git a/kernel/entry/common.c b/kernel/entry/common.c index 9ef63e414791..9e1a6afb07f2 100644 --- a/kernel/entry/common.c +++ b/kernel/entry/common.c @@ -50,7 +50,7 @@ static __always_inline unsigned long __exit_to_user_mode_loop(struct pt_regs *re local_irq_enable_exit_to_user(ti_work); if (ti_work & (_TIF_NEED_RESCHED | _TIF_NEED_RESCHED_LAZY)) { - if (!rseq_grant_slice_extension(ti_work & TIF_SLICE_EXT_DENY)) + if (!rseq_grant_slice_extension(ti_work, TIF_SLICE_EXT_DENY)) schedule(); } @@ -225,6 +225,7 @@ noinstr void irqentry_exit(struct pt_regs *regs, irqentry_state_t state) */ if (state.exit_rcu) { instrumentation_begin(); + hrtimer_rearm_deferred(); /* 
Tell the tracer that IRET will enable interrupts */ trace_hardirqs_on_prepare(); lockdep_hardirqs_on_prepare(); @@ -238,6 +239,7 @@ noinstr void irqentry_exit(struct pt_regs *regs, irqentry_state_t state) if (IS_ENABLED(CONFIG_PREEMPTION)) irqentry_exit_cond_resched(); + hrtimer_rearm_deferred(); /* Covers both tracing and lockdep */ trace_hardirqs_on(); instrumentation_end(); -- cgit v1.2.3 From 15dd3a9488557d3e6ebcecacab79f4e56b69ab54 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 24 Feb 2026 17:38:18 +0100 Subject: hrtimer: Push reprogramming timers into the interrupt return path Currently hrtimer_interrupt() runs expired timers, which can re-arm themselves, after which it computes the next expiration time and re-programs the hardware. However, things like HRTICK, a highres timer driving preemption, cannot re-arm itself at the point of running, since the next task has not been determined yet. The schedule() in the interrupt return path will switch to the next task, which then causes a new hrtimer to be programmed. This then results in reprogramming the hardware at least twice, once after running the timers, and once upon selecting the new task. Notably, *both* events happen in the interrupt. By pushing the hrtimer reprogram all the way into the interrupt return path, it runs after schedule() picks the new task and the double reprogram can be avoided. Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Link: https://patch.msgid.link/20260224163431.273488269@kernel.org --- include/asm-generic/thread_info_tif.h | 5 ++- include/linux/hrtimer_rearm.h | 72 ++++++++++++++++++++++++++++++++--- kernel/time/Kconfig | 4 +- kernel/time/hrtimer.c | 38 +++++++++++++++--- 4 files changed, 107 insertions(+), 12 deletions(-) (limited to 'include/linux') diff --git a/include/asm-generic/thread_info_tif.h b/include/asm-generic/thread_info_tif.h index da1610a78f92..528e6fc7efe9 100644 --- a/include/asm-generic/thread_info_tif.h +++ b/include/asm-generic/thread_info_tif.h @@ -41,11 +41,14 @@ #define _TIF_PATCH_PENDING BIT(TIF_PATCH_PENDING) #ifdef HAVE_TIF_RESTORE_SIGMASK -# define TIF_RESTORE_SIGMASK 10 // Restore signal mask in do_signal() */ +# define TIF_RESTORE_SIGMASK 10 // Restore signal mask in do_signal() # define _TIF_RESTORE_SIGMASK BIT(TIF_RESTORE_SIGMASK) #endif #define TIF_RSEQ 11 // Run RSEQ fast path #define _TIF_RSEQ BIT(TIF_RSEQ) +#define TIF_HRTIMER_REARM 12 // re-arm the timer +#define _TIF_HRTIMER_REARM BIT(TIF_HRTIMER_REARM) + #endif /* _ASM_GENERIC_THREAD_INFO_TIF_H_ */ diff --git a/include/linux/hrtimer_rearm.h b/include/linux/hrtimer_rearm.h index 6293076c03a6..a6f2e5d5e1c7 100644 --- a/include/linux/hrtimer_rearm.h +++ b/include/linux/hrtimer_rearm.h @@ -3,12 +3,74 @@ #define _LINUX_HRTIMER_REARM_H #ifdef CONFIG_HRTIMER_REARM_DEFERRED -static __always_inline void __hrtimer_rearm_deferred(void) { } -static __always_inline void hrtimer_rearm_deferred(void) { } -static __always_inline void hrtimer_rearm_deferred_tif(unsigned long tif_work) { } +#include + +void __hrtimer_rearm_deferred(void); + +/* + * This is purely CPU local, so check the TIF bit first to avoid the overhead of + * the atomic test_and_clear_bit() operation for the common case where the bit + * is not set. 
+ */ +static __always_inline bool hrtimer_test_and_clear_rearm_deferred_tif(unsigned long tif_work) +{ + lockdep_assert_irqs_disabled(); + + if (unlikely(tif_work & _TIF_HRTIMER_REARM)) { + clear_thread_flag(TIF_HRTIMER_REARM); + return true; + } + return false; +} + +#define TIF_REARM_MASK (_TIF_NEED_RESCHED | _TIF_NEED_RESCHED_LAZY | _TIF_HRTIMER_REARM) + +/* Invoked from the exit to user before invoking exit_to_user_mode_loop() */ static __always_inline bool -hrtimer_rearm_deferred_user_irq(unsigned long *tif_work, const unsigned long tif_mask) { return false; } -static __always_inline bool hrtimer_test_and_clear_rearm_deferred(void) { return false; } +hrtimer_rearm_deferred_user_irq(unsigned long *tif_work, const unsigned long tif_mask) +{ + /* Help the compiler to optimize the function out for syscall returns */ + if (!(tif_mask & _TIF_HRTIMER_REARM)) + return false; + /* + * Rearm the timer if none of the resched flags is set before going into + * the loop which re-enables interrupts. + */ + if (unlikely((*tif_work & TIF_REARM_MASK) == _TIF_HRTIMER_REARM)) { + clear_thread_flag(TIF_HRTIMER_REARM); + __hrtimer_rearm_deferred(); + /* Don't go into the loop if HRTIMER_REARM was the only flag */ + *tif_work &= ~TIF_HRTIMER_REARM; + return !*tif_work; + } + return false; +} + +/* Invoked from the time slice extension decision function */ +static __always_inline void hrtimer_rearm_deferred_tif(unsigned long tif_work) +{ + if (hrtimer_test_and_clear_rearm_deferred_tif(tif_work)) + __hrtimer_rearm_deferred(); +} + +/* + * This is to be called on all irqentry_exit() paths that will enable + * interrupts. + */ +static __always_inline void hrtimer_rearm_deferred(void) +{ + hrtimer_rearm_deferred_tif(read_thread_flags()); +} + +/* + * Invoked from the scheduler on entry to __schedule() so it can defer + * rearming after the load balancing callbacks which might change hrtick. + */ +static __always_inline bool hrtimer_test_and_clear_rearm_deferred(void) +{ + return hrtimer_test_and_clear_rearm_deferred_tif(read_thread_flags()); +} + #else /* CONFIG_HRTIMER_REARM_DEFERRED */ static __always_inline void __hrtimer_rearm_deferred(void) { } static __always_inline void hrtimer_rearm_deferred(void) { } diff --git a/kernel/time/Kconfig b/kernel/time/Kconfig index b95bfee3f592..6d6aace0a693 100644 --- a/kernel/time/Kconfig +++ b/kernel/time/Kconfig @@ -60,7 +60,9 @@ config GENERIC_CMOS_UPDATE # Deferred rearming of the hrtimer interrupt config HRTIMER_REARM_DEFERRED - def_bool n + def_bool y + depends on GENERIC_ENTRY && HAVE_GENERIC_TIF_BITS + depends on HIGH_RES_TIMERS && SCHED_HRTICK # Select to handle posix CPU timers from task_work # and not from the timer interrupt context diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c index 6f05d2569286..2e5f0e292efb 100644 --- a/kernel/time/hrtimer.c +++ b/kernel/time/hrtimer.c @@ -1939,10 +1939,9 @@ static __latent_entropy void hrtimer_run_softirq(void) * Very similar to hrtimer_force_reprogram(), except it deals with * deferred_rearm and hang_detected. 
*/ -static void hrtimer_rearm(struct hrtimer_cpu_base *cpu_base, ktime_t now) +static void hrtimer_rearm(struct hrtimer_cpu_base *cpu_base, ktime_t now, + ktime_t expires_next, bool deferred) { - ktime_t expires_next = hrtimer_update_next_event(cpu_base); - cpu_base->expires_next = expires_next; cpu_base->deferred_rearm = false; @@ -1954,9 +1953,37 @@ static void hrtimer_rearm(struct hrtimer_cpu_base *cpu_base, ktime_t now) expires_next = ktime_add_ns(now, 100 * NSEC_PER_MSEC); cpu_base->hang_detected = false; } - hrtimer_rearm_event(expires_next, false); + hrtimer_rearm_event(expires_next, deferred); +} + +#ifdef CONFIG_HRTIMER_REARM_DEFERRED +void __hrtimer_rearm_deferred(void) +{ + struct hrtimer_cpu_base *cpu_base = this_cpu_ptr(&hrtimer_bases); + ktime_t now, expires_next; + + if (!cpu_base->deferred_rearm) + return; + + guard(raw_spinlock)(&cpu_base->lock); + now = hrtimer_update_base(cpu_base); + expires_next = hrtimer_update_next_event(cpu_base); + hrtimer_rearm(cpu_base, now, expires_next, true); } +static __always_inline void +hrtimer_interrupt_rearm(struct hrtimer_cpu_base *cpu_base, ktime_t now, ktime_t expires_next) +{ + set_thread_flag(TIF_HRTIMER_REARM); +} +#else /* CONFIG_HRTIMER_REARM_DEFERRED */ +static __always_inline void +hrtimer_interrupt_rearm(struct hrtimer_cpu_base *cpu_base, ktime_t now, ktime_t expires_next) +{ + hrtimer_rearm(cpu_base, now, expires_next, false); +} +#endif /* !CONFIG_HRTIMER_REARM_DEFERRED */ + /* * High resolution timer interrupt * Called with interrupts disabled @@ -2014,9 +2041,10 @@ retry: cpu_base->hang_detected = true; } - hrtimer_rearm(cpu_base, now); + hrtimer_interrupt_rearm(cpu_base, now, expires_next); raw_spin_unlock_irqrestore(&cpu_base->lock, flags); } + #endif /* !CONFIG_HIGH_RES_TIMERS */ /* -- cgit v1.2.3 From b95c4442b02162904e9012e670b602ebeb3c6c1b Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 24 Feb 2026 17:38:23 +0100 Subject: hrtimer: Avoid re-evaluation when nothing changed Most times there is no change between hrtimer_interrupt() deferring the rearm and the invocation of hrtimer_rearm_deferred(). In those cases it's a pointless exercise to re-evaluate the next expiring timer. Cache the required data and use it if nothing changed. Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Link: https://patch.msgid.link/20260224163431.338569372@kernel.org --- include/linux/hrtimer_defs.h | 53 ++++++++++++++++++++++---------------------- kernel/time/hrtimer.c | 45 +++++++++++++++++++++++++------------ 2 files changed, 58 insertions(+), 40 deletions(-) (limited to 'include/linux') diff --git a/include/linux/hrtimer_defs.h b/include/linux/hrtimer_defs.h index 2c3bdbd562d2..b6846efec210 100644 --- a/include/linux/hrtimer_defs.h +++ b/include/linux/hrtimer_defs.h @@ -47,32 +47,31 @@ enum hrtimer_base_type { /** * struct hrtimer_cpu_base - the per cpu clock bases - * @lock: lock protecting the base and associated clock bases - * and timers - * @cpu: cpu number - * @active_bases: Bitfield to mark bases with active timers - * @clock_was_set_seq: Sequence counter of clock was set events - * @hres_active: State of high resolution mode - * @deferred_rearm: A deferred rearm is pending - * @hang_detected: The last hrtimer interrupt detected a hang - * @softirq_activated: displays, if the softirq is raised - update of softirq - * related settings is not required then. 
- * @nr_events: Total number of hrtimer interrupt events - * @nr_retries: Total number of hrtimer interrupt retries - * @nr_hangs: Total number of hrtimer interrupt hangs - * @max_hang_time: Maximum time spent in hrtimer_interrupt - * @softirq_expiry_lock: Lock which is taken while softirq based hrtimer are - * expired - * @online: CPU is online from an hrtimers point of view - * @timer_waiters: A hrtimer_cancel() invocation waits for the timer - * callback to finish. - * @expires_next: absolute time of the next event, is required for remote - * hrtimer enqueue; it is the total first expiry time (hard - * and soft hrtimer are taken into account) - * @next_timer: Pointer to the first expiring timer - * @softirq_expires_next: Time to check, if soft queues needs also to be expired - * @softirq_next_timer: Pointer to the first expiring softirq based timer - * @clock_base: array of clock bases for this cpu + * @lock: lock protecting the base and associated clock bases and timers + * @cpu: cpu number + * @active_bases: Bitfield to mark bases with active timers + * @clock_was_set_seq: Sequence counter of clock was set events + * @hres_active: State of high resolution mode + * @deferred_rearm: A deferred rearm is pending + * @deferred_needs_update: The deferred rearm must re-evaluate the first timer + * @hang_detected: The last hrtimer interrupt detected a hang + * @softirq_activated: displays, if the softirq is raised - update of softirq + * related settings is not required then. + * @nr_events: Total number of hrtimer interrupt events + * @nr_retries: Total number of hrtimer interrupt retries + * @nr_hangs: Total number of hrtimer interrupt hangs + * @max_hang_time: Maximum time spent in hrtimer_interrupt + * @softirq_expiry_lock: Lock which is taken while softirq based hrtimer are expired + * @online: CPU is online from an hrtimers point of view + * @timer_waiters: A hrtimer_cancel() waiters for the timer callback to finish. + * @expires_next: Absolute time of the next event, is required for remote + * hrtimer enqueue; it is the total first expiry time (hard + * and soft hrtimer are taken into account) + * @next_timer: Pointer to the first expiring timer + * @softirq_expires_next: Time to check, if soft queues needs also to be expired + * @softirq_next_timer: Pointer to the first expiring softirq based timer + * @deferred_expires_next: Cached expires next value for deferred rearm + * @clock_base: Array of clock bases for this cpu * * Note: next_timer is just an optimization for __remove_hrtimer(). 
* Do not dereference the pointer because it is not reliable on @@ -85,6 +84,7 @@ struct hrtimer_cpu_base { unsigned int clock_was_set_seq; bool hres_active; bool deferred_rearm; + bool deferred_needs_update; bool hang_detected; bool softirq_activated; bool online; @@ -102,6 +102,7 @@ struct hrtimer_cpu_base { struct hrtimer *next_timer; ktime_t softirq_expires_next; struct hrtimer *softirq_next_timer; + ktime_t deferred_expires_next; struct hrtimer_clock_base clock_base[HRTIMER_MAX_CLOCK_BASES]; call_single_data_t csd; } ____cacheline_aligned; diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c index 2e5f0e292efb..e9592cb1e39a 100644 --- a/kernel/time/hrtimer.c +++ b/kernel/time/hrtimer.c @@ -919,8 +919,10 @@ static bool update_needs_ipi(struct hrtimer_cpu_base *cpu_base, unsigned int act return false; /* If a deferred rearm is pending the remote CPU will take care of it */ - if (cpu_base->deferred_rearm) + if (cpu_base->deferred_rearm) { + cpu_base->deferred_needs_update = true; return false; + } /* * Walk the affected clock bases and check whether the first expiring @@ -1141,7 +1143,12 @@ static void __remove_hrtimer(struct hrtimer *timer, struct hrtimer_clock_base *b * a local timer is removed to be immediately restarted. That's handled * at the call site. */ - if (reprogram && timer == cpu_base->next_timer && !timer->is_lazy) + if (!reprogram || timer != cpu_base->next_timer || timer->is_lazy) + return; + + if (cpu_base->deferred_rearm) + cpu_base->deferred_needs_update = true; + else hrtimer_force_reprogram(cpu_base, /* skip_equal */ true); } @@ -1328,8 +1335,10 @@ static bool __hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim, u64 del } /* If a deferred rearm is pending skip reprogramming the device */ - if (cpu_base->deferred_rearm) + if (cpu_base->deferred_rearm) { + cpu_base->deferred_needs_update = true; return false; + } if (!was_first || cpu_base != this_cpu_base) { /* @@ -1939,8 +1948,7 @@ static __latent_entropy void hrtimer_run_softirq(void) * Very similar to hrtimer_force_reprogram(), except it deals with * deferred_rearm and hang_detected. */ -static void hrtimer_rearm(struct hrtimer_cpu_base *cpu_base, ktime_t now, - ktime_t expires_next, bool deferred) +static void hrtimer_rearm(struct hrtimer_cpu_base *cpu_base, ktime_t expires_next, bool deferred) { cpu_base->expires_next = expires_next; cpu_base->deferred_rearm = false; @@ -1950,7 +1958,7 @@ static void hrtimer_rearm(struct hrtimer_cpu_base *cpu_base, ktime_t now, * Give the system a chance to do something else than looping * on hrtimer interrupts. */ - expires_next = ktime_add_ns(now, 100 * NSEC_PER_MSEC); + expires_next = ktime_add_ns(ktime_get(), 100 * NSEC_PER_MSEC); cpu_base->hang_detected = false; } hrtimer_rearm_event(expires_next, deferred); @@ -1960,27 +1968,36 @@ static void hrtimer_rearm(struct hrtimer_cpu_base *cpu_base, ktime_t now, void __hrtimer_rearm_deferred(void) { struct hrtimer_cpu_base *cpu_base = this_cpu_ptr(&hrtimer_bases); - ktime_t now, expires_next; + ktime_t expires_next; if (!cpu_base->deferred_rearm) return; guard(raw_spinlock)(&cpu_base->lock); - now = hrtimer_update_base(cpu_base); - expires_next = hrtimer_update_next_event(cpu_base); - hrtimer_rearm(cpu_base, now, expires_next, true); + if (cpu_base->deferred_needs_update) { + hrtimer_update_base(cpu_base); + expires_next = hrtimer_update_next_event(cpu_base); + } else { + /* No timer added/removed. 
Use the cached value */ + expires_next = cpu_base->deferred_expires_next; + } + hrtimer_rearm(cpu_base, expires_next, true); } static __always_inline void -hrtimer_interrupt_rearm(struct hrtimer_cpu_base *cpu_base, ktime_t now, ktime_t expires_next) +hrtimer_interrupt_rearm(struct hrtimer_cpu_base *cpu_base, ktime_t expires_next) { + /* hrtimer_interrupt() just re-evaluated the first expiring timer */ + cpu_base->deferred_needs_update = false; + /* Cache the expiry time */ + cpu_base->deferred_expires_next = expires_next; set_thread_flag(TIF_HRTIMER_REARM); } #else /* CONFIG_HRTIMER_REARM_DEFERRED */ static __always_inline void -hrtimer_interrupt_rearm(struct hrtimer_cpu_base *cpu_base, ktime_t now, ktime_t expires_next) +hrtimer_interrupt_rearm(struct hrtimer_cpu_base *cpu_base, ktime_t expires_next) { - hrtimer_rearm(cpu_base, now, expires_next, false); + hrtimer_rearm(cpu_base, expires_next, false); } #endif /* !CONFIG_HRTIMER_REARM_DEFERRED */ @@ -2041,7 +2058,7 @@ retry: cpu_base->hang_detected = true; } - hrtimer_interrupt_rearm(cpu_base, now, expires_next); + hrtimer_interrupt_rearm(cpu_base, expires_next); raw_spin_unlock_irqrestore(&cpu_base->lock, flags); } -- cgit v1.2.3 From eddffab8282e388dddf032f3295fcec87eb08095 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 24 Feb 2026 17:38:28 +0100 Subject: hrtimer: Keep track of first expiring timer per clock base Evaluating the next expiry time of all clock bases is cache line expensive as the expiry time of the first expiring timer is not cached in the base and requires to access the timer itself, which is definitely in a different cache line. It's way more efficient to keep track of the expiry time on enqueue and dequeue operations as the relevant data is already in the cache at that point. Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Link: https://patch.msgid.link/20260224163431.404839710@kernel.org --- include/linux/hrtimer_defs.h | 2 ++ kernel/time/hrtimer.c | 37 ++++++++++++++++++++++++++++++++++--- 2 files changed, 36 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/hrtimer_defs.h b/include/linux/hrtimer_defs.h index b6846efec210..fb38df4c0b64 100644 --- a/include/linux/hrtimer_defs.h +++ b/include/linux/hrtimer_defs.h @@ -19,6 +19,7 @@ * timer to a base on another cpu. 
* @clockid: clock id for per_cpu support * @seq: seqcount around __run_hrtimer + * @expires_next: Absolute time of the next event in this clock base * @running: pointer to the currently running hrtimer * @active: red black tree root node for the active timers * @offset: offset of this clock to the monotonic base @@ -28,6 +29,7 @@ struct hrtimer_clock_base { unsigned int index; clockid_t clockid; seqcount_raw_spinlock_t seq; + ktime_t expires_next; struct hrtimer *running; struct timerqueue_head active; ktime_t offset; diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c index e9592cb1e39a..d70899a9ddc1 100644 --- a/kernel/time/hrtimer.c +++ b/kernel/time/hrtimer.c @@ -1107,7 +1107,18 @@ static bool enqueue_hrtimer(struct hrtimer *timer, struct hrtimer_clock_base *ba /* Pairs with the lockless read in hrtimer_is_queued() */ WRITE_ONCE(timer->is_queued, HRTIMER_STATE_ENQUEUED); - return timerqueue_add(&base->active, &timer->node); + if (!timerqueue_add(&base->active, &timer->node)) + return false; + + base->expires_next = hrtimer_get_expires(timer); + return true; +} + +static inline void base_update_next_timer(struct hrtimer_clock_base *base) +{ + struct timerqueue_node *next = timerqueue_getnext(&base->active); + + base->expires_next = next ? next->expires : KTIME_MAX; } /* @@ -1122,6 +1133,7 @@ static void __remove_hrtimer(struct hrtimer *timer, struct hrtimer_clock_base *b bool newstate, bool reprogram) { struct hrtimer_cpu_base *cpu_base = base->cpu_base; + bool was_first; lockdep_assert_held(&cpu_base->lock); @@ -1131,9 +1143,17 @@ static void __remove_hrtimer(struct hrtimer *timer, struct hrtimer_clock_base *b /* Pairs with the lockless read in hrtimer_is_queued() */ WRITE_ONCE(timer->is_queued, newstate); + was_first = &timer->node == timerqueue_getnext(&base->active); + if (!timerqueue_del(&base->active, &timer->node)) cpu_base->active_bases &= ~(1 << base->index); + /* Nothing to update if this was not the first timer in the base */ + if (!was_first) + return; + + base_update_next_timer(base); + /* * If reprogram is false don't update cpu_base->next_timer and do not * touch the clock event device. @@ -1182,9 +1202,12 @@ static inline bool remove_and_enqueue_same_base(struct hrtimer *timer, struct hrtimer_clock_base *base, const enum hrtimer_mode mode, ktime_t expires, u64 delta_ns) { + bool was_first = false; + /* Remove it from the timer queue if active */ if (timer->is_queued) { debug_hrtimer_deactivate(timer); + was_first = &timer->node == timerqueue_getnext(&base->active); timerqueue_del(&base->active, &timer->node); } @@ -1197,8 +1220,16 @@ remove_and_enqueue_same_base(struct hrtimer *timer, struct hrtimer_clock_base *b /* Pairs with the lockless read in hrtimer_is_queued() */ WRITE_ONCE(timer->is_queued, HRTIMER_STATE_ENQUEUED); - /* Returns true if this is the first expiring timer */ - return timerqueue_add(&base->active, &timer->node); + /* If it's the first expiring timer now or again, update base */ + if (timerqueue_add(&base->active, &timer->node)) { + base->expires_next = expires; + return true; + } + + if (was_first) + base_update_next_timer(base); + + return false; } static inline ktime_t hrtimer_update_lowres(struct hrtimer *timer, ktime_t tim, -- cgit v1.2.3 From 671047943dce5af24e023bca3c5cc244d7565f5a Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 24 Feb 2026 17:38:47 +0100 Subject: rbtree: Provide rbtree with links Some RB tree users require quick access to the next and the previous node, e.g. 
to check whether a modification of the node results in a change of the node's position in the tree. If the node's position does not change, then the modification can happen in place without going through a full enqueue/requeue cycle. An upcoming use case for this is the timer queues of the hrtimer subsystem, as they can optimize for timers which are frequently rearmed while enqueued. This can obviously be achieved with rb_next() and rb_prev(), but those turned out to be quite expensive for hotpath operations, depending on the tree depth. Add a linked RB tree variant where add() and erase() maintain the links between the nodes. Like the cached variant it provides a pointer to the leftmost node in the root. It intentionally does not use a [h]list head as there is no real need for true list operations: the list is strictly coupled to the tree and cannot be manipulated independently. It sets the node's previous pointer to NULL for the leftmost node and the next pointer to NULL for the rightmost node. This allows a quick check, especially for the leftmost node, without consulting the list head address, which creates better code. Aside from the rb_leftmost cached pointer this could trivially provide a rb_rightmost pointer as well, but there is no usage for that (yet). Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Link: https://patch.msgid.link/20260224163431.668401024@kernel.org --- include/linux/rbtree.h | 81 +++++++++++++++++++++++++++++++++++++++----- include/linux/rbtree_types.h | 16 +++++++++ lib/rbtree.c | 17 ++++++++++ 3 files changed, 105 insertions(+), 9 deletions(-) (limited to 'include/linux') diff --git a/include/linux/rbtree.h b/include/linux/rbtree.h index 4091e978aef2..48acdc3889dd 100644 --- a/include/linux/rbtree.h +++ b/include/linux/rbtree.h @@ -35,10 +35,15 @@ #define RB_CLEAR_NODE(node) \ ((node)->__rb_parent_color = (unsigned long)(node)) +#define RB_EMPTY_LINKED_NODE(lnode) RB_EMPTY_NODE(&(lnode)->node) +#define RB_CLEAR_LINKED_NODE(lnode) ({ \ + RB_CLEAR_NODE(&(lnode)->node); \ + (lnode)->prev = (lnode)->next = NULL; \ +}) extern void rb_insert_color(struct rb_node *, struct rb_root *); extern void rb_erase(struct rb_node *, struct rb_root *); - +extern bool rb_erase_linked(struct rb_node_linked *, struct rb_root_linked *); /* Find logical next and previous nodes in a tree */ extern struct rb_node *rb_next(const struct rb_node *); @@ -213,15 +218,10 @@ rb_add_cached(struct rb_node *node, struct rb_root_cached *tree, return leftmost ?
node : NULL; } -/** - * rb_add() - insert @node into @tree - * @node: node to insert - * @tree: tree to insert @node into - * @less: operator defining the (partial) node order - */ static __always_inline void -rb_add(struct rb_node *node, struct rb_root *tree, - bool (*less)(struct rb_node *, const struct rb_node *)) +__rb_add(struct rb_node *node, struct rb_root *tree, + bool (*less)(struct rb_node *, const struct rb_node *), + void (*linkop)(struct rb_node *, struct rb_node *, struct rb_node **)) { struct rb_node **link = &tree->rb_node; struct rb_node *parent = NULL; @@ -234,10 +234,73 @@ rb_add(struct rb_node *node, struct rb_root *tree, link = &parent->rb_right; } + linkop(node, parent, link); rb_link_node(node, parent, link); rb_insert_color(node, tree); } +#define __node_2_linked_node(_n) \ + rb_entry((_n), struct rb_node_linked, node) + +static inline void +rb_link_linked_node(struct rb_node *node, struct rb_node *parent, struct rb_node **link) +{ + if (!parent) + return; + + struct rb_node_linked *nnew = __node_2_linked_node(node); + struct rb_node_linked *npar = __node_2_linked_node(parent); + + if (link == &parent->rb_left) { + nnew->prev = npar->prev; + nnew->next = npar; + npar->prev = nnew; + if (nnew->prev) + nnew->prev->next = nnew; + } else { + nnew->next = npar->next; + nnew->prev = npar; + npar->next = nnew; + if (nnew->next) + nnew->next->prev = nnew; + } +} + +/** + * rb_add_linked() - insert @node into the leftmost linked tree @tree + * @node: node to insert + * @tree: linked tree to insert @node into + * @less: operator defining the (partial) node order + * + * Returns @true when @node is the new leftmost, @false otherwise. + */ +static __always_inline bool +rb_add_linked(struct rb_node_linked *node, struct rb_root_linked *tree, + bool (*less)(struct rb_node *, const struct rb_node *)) +{ + __rb_add(&node->node, &tree->rb_root, less, rb_link_linked_node); + if (!node->prev) + tree->rb_leftmost = node; + return !node->prev; +} + +/* Empty linkop function which is optimized away by the compiler */ +static __always_inline void +rb_link_noop(struct rb_node *n, struct rb_node *p, struct rb_node **l) { } + +/** + * rb_add() - insert @node into @tree + * @node: node to insert + * @tree: tree to insert @node into + * @less: operator defining the (partial) node order + */ +static __always_inline void +rb_add(struct rb_node *node, struct rb_root *tree, + bool (*less)(struct rb_node *, const struct rb_node *)) +{ + __rb_add(node, tree, less, rb_link_noop); +} + /** * rb_find_add_cached() - find equivalent @node in @tree, or add @node * @node: node to look-for / insert diff --git a/include/linux/rbtree_types.h b/include/linux/rbtree_types.h index 45b6ecde3665..3c7ae53e8139 100644 --- a/include/linux/rbtree_types.h +++ b/include/linux/rbtree_types.h @@ -9,6 +9,12 @@ struct rb_node { } __attribute__((aligned(sizeof(long)))); /* The alignment might seem pointless, but allegedly CRIS needs it */ +struct rb_node_linked { + struct rb_node node; + struct rb_node_linked *prev; + struct rb_node_linked *next; +}; + struct rb_root { struct rb_node *rb_node; }; @@ -28,7 +34,17 @@ struct rb_root_cached { struct rb_node *rb_leftmost; }; +/* + * Leftmost tree with links. This would allow a trivial rb_rightmost update, + * but that has been omitted due to the lack of users. 
+ */ +struct rb_root_linked { + struct rb_root rb_root; + struct rb_node_linked *rb_leftmost; +}; + #define RB_ROOT (struct rb_root) { NULL, } #define RB_ROOT_CACHED (struct rb_root_cached) { {NULL, }, NULL } +#define RB_ROOT_LINKED (struct rb_root_linked) { {NULL, }, NULL } #endif diff --git a/lib/rbtree.c b/lib/rbtree.c index 18d42bcf4ec9..5790d6ecba4e 100644 --- a/lib/rbtree.c +++ b/lib/rbtree.c @@ -446,6 +446,23 @@ void rb_erase(struct rb_node *node, struct rb_root *root) } EXPORT_SYMBOL(rb_erase); +bool rb_erase_linked(struct rb_node_linked *node, struct rb_root_linked *root) +{ + if (node->prev) + node->prev->next = node->next; + else + root->rb_leftmost = node->next; + + if (node->next) + node->next->prev = node->prev; + + rb_erase(&node->node, &root->rb_root); + RB_CLEAR_LINKED_NODE(node); + + return !!root->rb_leftmost; +} +EXPORT_SYMBOL_GPL(rb_erase_linked); + /* * Augmented rbtree manipulation functions. * -- cgit v1.2.3 From 1339eeb73d6b99cf3aa9981f3f91d6ac4a49c72e Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 24 Feb 2026 17:38:52 +0100 Subject: timerqueue: Provide linked timerqueue The hrtimer subsystem wants to peak ahead to the next and previous timer to evaluated whether a to be rearmed timer can stay at the same position in the RB tree with the new expiry time. The linked RB tree provides the infrastructure for this as it maintains links to the previous and next nodes for each entry in the tree. Provide timerqueue wrappers around that. Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Link: https://patch.msgid.link/20260224163431.734827095@kernel.org --- include/linux/timerqueue.h | 56 ++++++++++++++++++++++++++++++++++------ include/linux/timerqueue_types.h | 15 ++++++++--- lib/timerqueue.c | 14 ++++++++++ 3 files changed, 74 insertions(+), 11 deletions(-) (limited to 'include/linux') diff --git a/include/linux/timerqueue.h b/include/linux/timerqueue.h index d306d9dd2207..7d0aaa766580 100644 --- a/include/linux/timerqueue.h +++ b/include/linux/timerqueue.h @@ -5,12 +5,11 @@ #include #include -extern bool timerqueue_add(struct timerqueue_head *head, - struct timerqueue_node *node); -extern bool timerqueue_del(struct timerqueue_head *head, - struct timerqueue_node *node); -extern struct timerqueue_node *timerqueue_iterate_next( - struct timerqueue_node *node); +bool timerqueue_add(struct timerqueue_head *head, struct timerqueue_node *node); +bool timerqueue_del(struct timerqueue_head *head, struct timerqueue_node *node); +struct timerqueue_node *timerqueue_iterate_next(struct timerqueue_node *node); + +bool timerqueue_linked_add(struct timerqueue_linked_head *head, struct timerqueue_linked_node *node); /** * timerqueue_getnext - Returns the timer with the earliest expiration time @@ -19,8 +18,7 @@ extern struct timerqueue_node *timerqueue_iterate_next( * * Returns a pointer to the timer node that has the earliest expiration time. 
*/ -static inline -struct timerqueue_node *timerqueue_getnext(struct timerqueue_head *head) +static inline struct timerqueue_node *timerqueue_getnext(struct timerqueue_head *head) { struct rb_node *leftmost = rb_first_cached(&head->rb_root); @@ -41,4 +39,46 @@ static inline void timerqueue_init_head(struct timerqueue_head *head) { head->rb_root = RB_ROOT_CACHED; } + +/* Timer queues with linked nodes */ + +static __always_inline +struct timerqueue_linked_node *timerqueue_linked_first(struct timerqueue_linked_head *head) +{ + return rb_entry_safe(head->rb_root.rb_leftmost, struct timerqueue_linked_node, node); +} + +static __always_inline +struct timerqueue_linked_node *timerqueue_linked_next(struct timerqueue_linked_node *node) +{ + return rb_entry_safe(node->node.next, struct timerqueue_linked_node, node); +} + +static __always_inline +struct timerqueue_linked_node *timerqueue_linked_prev(struct timerqueue_linked_node *node) +{ + return rb_entry_safe(node->node.prev, struct timerqueue_linked_node, node); +} + +static __always_inline +bool timerqueue_linked_del(struct timerqueue_linked_head *head, struct timerqueue_linked_node *node) +{ + return rb_erase_linked(&node->node, &head->rb_root); +} + +static __always_inline void timerqueue_linked_init(struct timerqueue_linked_node *node) +{ + RB_CLEAR_LINKED_NODE(&node->node); +} + +static __always_inline bool timerqueue_linked_node_queued(struct timerqueue_linked_node *node) +{ + return !RB_EMPTY_LINKED_NODE(&node->node); +} + +static __always_inline void timerqueue_linked_init_head(struct timerqueue_linked_head *head) +{ + head->rb_root = RB_ROOT_LINKED; +} + #endif /* _LINUX_TIMERQUEUE_H */ diff --git a/include/linux/timerqueue_types.h b/include/linux/timerqueue_types.h index dc298d0923e3..be2218b147c4 100644 --- a/include/linux/timerqueue_types.h +++ b/include/linux/timerqueue_types.h @@ -6,12 +6,21 @@ #include struct timerqueue_node { - struct rb_node node; - ktime_t expires; + struct rb_node node; + ktime_t expires; }; struct timerqueue_head { - struct rb_root_cached rb_root; + struct rb_root_cached rb_root; +}; + +struct timerqueue_linked_node { + struct rb_node_linked node; + ktime_t expires; +}; + +struct timerqueue_linked_head { + struct rb_root_linked rb_root; }; #endif /* _LINUX_TIMERQUEUE_TYPES_H */ diff --git a/lib/timerqueue.c b/lib/timerqueue.c index cdb9c7658478..e2a1e08cb4bd 100644 --- a/lib/timerqueue.c +++ b/lib/timerqueue.c @@ -82,3 +82,17 @@ struct timerqueue_node *timerqueue_iterate_next(struct timerqueue_node *node) return container_of(next, struct timerqueue_node, node); } EXPORT_SYMBOL_GPL(timerqueue_iterate_next); + +#define __node_2_tq_linked(_n) \ + container_of(rb_entry((_n), struct rb_node_linked, node), struct timerqueue_linked_node, node) + +static __always_inline bool __tq_linked_less(struct rb_node *a, const struct rb_node *b) +{ + return __node_2_tq_linked(a)->expires < __node_2_tq_linked(b)->expires; +} + +bool timerqueue_linked_add(struct timerqueue_linked_head *head, struct timerqueue_linked_node *node) +{ + return rb_add_linked(&node->node, &head->rb_root, __tq_linked_less); +} +EXPORT_SYMBOL_GPL(timerqueue_linked_add); -- cgit v1.2.3 From b7418e6e9b87b849af4df93d527ff83498d1e4c3 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 24 Feb 2026 17:38:57 +0100 Subject: hrtimer: Use linked timerqueue To prepare for optimizing the rearming of enqueued timers, switch to the linked timerqueue. 
That allows to check whether the new expiry time changes the position of the timer in the RB tree or not, by checking the new expiry time against the previous and the next timers expiry. Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Link: https://patch.msgid.link/20260224163431.806643179@kernel.org --- include/linux/hrtimer_defs.h | 16 ++++++++-------- include/linux/hrtimer_types.h | 8 ++++---- kernel/time/hrtimer.c | 34 +++++++++++++++++----------------- kernel/time/timer_list.c | 10 ++++------ 4 files changed, 33 insertions(+), 35 deletions(-) (limited to 'include/linux') diff --git a/include/linux/hrtimer_defs.h b/include/linux/hrtimer_defs.h index fb38df4c0b64..0f851b2432c3 100644 --- a/include/linux/hrtimer_defs.h +++ b/include/linux/hrtimer_defs.h @@ -25,14 +25,14 @@ * @offset: offset of this clock to the monotonic base */ struct hrtimer_clock_base { - struct hrtimer_cpu_base *cpu_base; - unsigned int index; - clockid_t clockid; - seqcount_raw_spinlock_t seq; - ktime_t expires_next; - struct hrtimer *running; - struct timerqueue_head active; - ktime_t offset; + struct hrtimer_cpu_base *cpu_base; + unsigned int index; + clockid_t clockid; + seqcount_raw_spinlock_t seq; + ktime_t expires_next; + struct hrtimer *running; + struct timerqueue_linked_head active; + ktime_t offset; } __hrtimer_clock_base_align; enum hrtimer_base_type { diff --git a/include/linux/hrtimer_types.h b/include/linux/hrtimer_types.h index 0e22bc91d00f..b5dacc8271a4 100644 --- a/include/linux/hrtimer_types.h +++ b/include/linux/hrtimer_types.h @@ -17,7 +17,7 @@ enum hrtimer_restart { /** * struct hrtimer - the basic hrtimer structure - * @node: timerqueue node, which also manages node.expires, + * @node: Linked timerqueue node, which also manages node.expires, * the absolute expiry time in the hrtimers internal * representation. The time is related to the clock on * which the timer is based. Is setup by adding @@ -39,15 +39,15 @@ enum hrtimer_restart { * The hrtimer structure must be initialized by hrtimer_setup() */ struct hrtimer { - struct timerqueue_node node; - ktime_t _softexpires; - enum hrtimer_restart (*__private function)(struct hrtimer *); + struct timerqueue_linked_node node; struct hrtimer_clock_base *base; bool is_queued; bool is_rel; bool is_soft; bool is_hard; bool is_lazy; + ktime_t _softexpires; + enum hrtimer_restart (*__private function)(struct hrtimer *); }; #endif /* _LINUX_HRTIMER_TYPES_H */ diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c index d1e58482e0a9..5e45982363ce 100644 --- a/kernel/time/hrtimer.c +++ b/kernel/time/hrtimer.c @@ -557,10 +557,10 @@ static ktime_t hrtimer_bases_next_event_without(struct hrtimer_cpu_base *cpu_bas * If the excluded timer is the first on this base evaluate the * next timer. 
*/ - struct timerqueue_node *node = timerqueue_getnext(&base->active); + struct timerqueue_linked_node *node = timerqueue_linked_first(&base->active); if (unlikely(&exclude->node == node)) { - node = timerqueue_iterate_next(node); + node = timerqueue_linked_next(node); if (!node) continue; expires = ktime_sub(node->expires, base->offset); @@ -576,7 +576,7 @@ static ktime_t hrtimer_bases_next_event_without(struct hrtimer_cpu_base *cpu_bas static __always_inline struct hrtimer *clock_base_next_timer(struct hrtimer_clock_base *base) { - struct timerqueue_node *next = timerqueue_getnext(&base->active); + struct timerqueue_linked_node *next = timerqueue_linked_first(&base->active); return container_of(next, struct hrtimer, node); } @@ -938,9 +938,9 @@ static bool update_needs_ipi(struct hrtimer_cpu_base *cpu_base, unsigned int act active &= cpu_base->active_bases; for_each_active_base(base, cpu_base, active) { - struct timerqueue_node *next; + struct timerqueue_linked_node *next; - next = timerqueue_getnext(&base->active); + next = timerqueue_linked_first(&base->active); expires = ktime_sub(next->expires, base->offset); if (expires < cpu_base->expires_next) return true; @@ -1112,7 +1112,7 @@ static bool enqueue_hrtimer(struct hrtimer *timer, struct hrtimer_clock_base *ba /* Pairs with the lockless read in hrtimer_is_queued() */ WRITE_ONCE(timer->is_queued, HRTIMER_STATE_ENQUEUED); - if (!timerqueue_add(&base->active, &timer->node)) + if (!timerqueue_linked_add(&base->active, &timer->node)) return false; base->expires_next = hrtimer_get_expires(timer); @@ -1121,7 +1121,7 @@ static bool enqueue_hrtimer(struct hrtimer *timer, struct hrtimer_clock_base *ba static inline void base_update_next_timer(struct hrtimer_clock_base *base) { - struct timerqueue_node *next = timerqueue_getnext(&base->active); + struct timerqueue_linked_node *next = timerqueue_linked_first(&base->active); base->expires_next = next ? 
next->expires : KTIME_MAX; } @@ -1148,9 +1148,9 @@ static void __remove_hrtimer(struct hrtimer *timer, struct hrtimer_clock_base *b /* Pairs with the lockless read in hrtimer_is_queued() */ WRITE_ONCE(timer->is_queued, newstate); - was_first = &timer->node == timerqueue_getnext(&base->active); + was_first = !timerqueue_linked_prev(&timer->node); - if (!timerqueue_del(&base->active, &timer->node)) + if (!timerqueue_linked_del(&base->active, &timer->node)) cpu_base->active_bases &= ~(1 << base->index); /* Nothing to update if this was not the first timer in the base */ @@ -1212,8 +1212,8 @@ remove_and_enqueue_same_base(struct hrtimer *timer, struct hrtimer_clock_base *b /* Remove it from the timer queue if active */ if (timer->is_queued) { debug_hrtimer_deactivate(timer); - was_first = &timer->node == timerqueue_getnext(&base->active); - timerqueue_del(&base->active, &timer->node); + was_first = !timerqueue_linked_prev(&timer->node); + timerqueue_linked_del(&base->active, &timer->node); } /* Set the new expiry time */ @@ -1226,7 +1226,7 @@ remove_and_enqueue_same_base(struct hrtimer *timer, struct hrtimer_clock_base *b WRITE_ONCE(timer->is_queued, HRTIMER_STATE_ENQUEUED); /* If it's the first expiring timer now or again, update base */ - if (timerqueue_add(&base->active, &timer->node)) { + if (timerqueue_linked_add(&base->active, &timer->node)) { base->expires_next = expires; return true; } @@ -1758,7 +1758,7 @@ static void __hrtimer_setup(struct hrtimer *timer, enum hrtimer_restart (*fn)(st timer->is_hard = !!(mode & HRTIMER_MODE_HARD); timer->is_lazy = !!(mode & HRTIMER_MODE_LAZY_REARM); timer->base = &cpu_base->clock_base[base]; - timerqueue_init(&timer->node); + timerqueue_linked_init(&timer->node); if (WARN_ON_ONCE(!fn)) ACCESS_PRIVATE(timer, function) = hrtimer_dummy_timeout; @@ -1923,7 +1923,7 @@ static void __run_hrtimer(struct hrtimer_cpu_base *cpu_base, struct hrtimer_cloc static __always_inline struct hrtimer *clock_base_next_timer_safe(struct hrtimer_clock_base *base) { - struct timerqueue_node *next = timerqueue_getnext(&base->active); + struct timerqueue_linked_node *next = timerqueue_linked_first(&base->active); return next ? 
container_of(next, struct hrtimer, node) : NULL; } @@ -2369,7 +2369,7 @@ int hrtimers_prepare_cpu(unsigned int cpu) clock_b->cpu_base = cpu_base; seqcount_raw_spinlock_init(&clock_b->seq, &cpu_base->lock); - timerqueue_init_head(&clock_b->active); + timerqueue_linked_init_head(&clock_b->active); } cpu_base->cpu = cpu; @@ -2399,10 +2399,10 @@ int hrtimers_cpu_starting(unsigned int cpu) static void migrate_hrtimer_list(struct hrtimer_clock_base *old_base, struct hrtimer_clock_base *new_base) { - struct timerqueue_node *node; + struct timerqueue_linked_node *node; struct hrtimer *timer; - while ((node = timerqueue_getnext(&old_base->active))) { + while ((node = timerqueue_linked_first(&old_base->active))) { timer = container_of(node, struct hrtimer, node); BUG_ON(hrtimer_callback_running(timer)); debug_hrtimer_deactivate(timer); diff --git a/kernel/time/timer_list.c b/kernel/time/timer_list.c index 19e61826b7de..e2e14fd1b466 100644 --- a/kernel/time/timer_list.c +++ b/kernel/time/timer_list.c @@ -56,13 +56,11 @@ print_timer(struct seq_file *m, struct hrtimer *taddr, struct hrtimer *timer, (long long)(ktime_to_ns(hrtimer_get_expires(timer)) - now)); } -static void -print_active_timers(struct seq_file *m, struct hrtimer_clock_base *base, - u64 now) +static void print_active_timers(struct seq_file *m, struct hrtimer_clock_base *base, u64 now) { + struct timerqueue_linked_node *curr; struct hrtimer *timer, tmp; unsigned long next = 0, i; - struct timerqueue_node *curr; unsigned long flags; next_one: @@ -72,13 +70,13 @@ next_one: raw_spin_lock_irqsave(&base->cpu_base->lock, flags); - curr = timerqueue_getnext(&base->active); + curr = timerqueue_linked_first(&base->active); /* * Crude but we have to do this O(N*N) thing, because * we have to unlock the base when printing: */ while (curr && i < next) { - curr = timerqueue_iterate_next(curr); + curr = timerqueue_linked_next(curr); i++; } -- cgit v1.2.3 From 754e38d2d1aeeadddac5220f34e07cf263502a46 Mon Sep 17 00:00:00 2001 From: "Thomas Weißschuh (Schneider Electric)" Date: Wed, 11 Mar 2026 11:15:11 +0100 Subject: tracing: Use explicit array size instead of sentinel elements in symbol printing MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The sentinel value added by the wrapper macros __print_symbolic() et al prevents the callers from adding their own trailing comma. This makes constructing symbol list dynamically based on kconfig values tedious. Drop the sentinel elements, so callers can either specify the trailing comma or not, just like in regular array initializers. 
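For illustration only: a minimal userspace C sketch (hypothetical names, not the tracing code) of why iterating over an explicit array size instead of stopping at a { -1, NULL } sentinel allows trailing commas and Kconfig-style conditional entries:

#include <stdio.h>
#include <stddef.h>

struct sym { unsigned long mask; const char *name; };
#define ARRAY_SIZE(a) (sizeof(a) / sizeof((a)[0]))

#define DEMO_HAVE_OPTIONAL_SYM 1

static const struct sym syms[] = {
	{ 1, "ONE" },
	{ 2, "TWO" },
#if DEMO_HAVE_OPTIONAL_SYM
	{ 4, "FOUR" },	/* conditional entry, trailing comma is harmless */
#endif
};

static const char *sym_name(unsigned long val)
{
	for (size_t i = 0; i < ARRAY_SIZE(syms); i++) {
		if (syms[i].mask == val)
			return syms[i].name;
	}
	return "UNKNOWN";
}

int main(void)
{
	printf("%s %s\n", sym_name(2), sym_name(8));
	return 0;
}

With a sentinel appended by a wrapper macro, a trailing comma in the caller's list would produce an invalid ",," sequence; sizing the loop with ARRAY_SIZE() removes that constraint.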
Signed-off-by: Thomas Weißschuh (Schneider Electric) Signed-off-by: Thomas Gleixner Link: https://patch.msgid.link/20260311-hrtimer-cleanups-v1-2-095357392669@linutronix.de --- include/linux/trace_events.h | 13 ++++++---- include/trace/stages/stage3_trace_output.h | 40 +++++++++++++++--------------- kernel/trace/trace_events_synth.c | 4 +-- kernel/trace/trace_output.c | 20 +++++++++------ kernel/trace/trace_syscalls.c | 3 +-- 5 files changed, 43 insertions(+), 37 deletions(-) (limited to 'include/linux') diff --git a/include/linux/trace_events.h b/include/linux/trace_events.h index 37eb2f0f3dd8..40a43a4c7caf 100644 --- a/include/linux/trace_events.h +++ b/include/linux/trace_events.h @@ -22,20 +22,23 @@ union bpf_attr; const char *trace_print_flags_seq(struct trace_seq *p, const char *delim, unsigned long flags, - const struct trace_print_flags *flag_array); + const struct trace_print_flags *flag_array, + size_t flag_array_size); const char *trace_print_symbols_seq(struct trace_seq *p, unsigned long val, - const struct trace_print_flags *symbol_array); + const struct trace_print_flags *symbol_array, + size_t symbol_array_size); #if BITS_PER_LONG == 32 const char *trace_print_flags_seq_u64(struct trace_seq *p, const char *delim, unsigned long long flags, - const struct trace_print_flags_u64 *flag_array); + const struct trace_print_flags_u64 *flag_array, + size_t flag_array_size); const char *trace_print_symbols_seq_u64(struct trace_seq *p, unsigned long long val, - const struct trace_print_flags_u64 - *symbol_array); + const struct trace_print_flags_u64 *symbol_array, + size_t symbol_array_size); #endif struct trace_iterator; diff --git a/include/trace/stages/stage3_trace_output.h b/include/trace/stages/stage3_trace_output.h index fce85ea2df1c..b7d8ef4b9fe1 100644 --- a/include/trace/stages/stage3_trace_output.h +++ b/include/trace/stages/stage3_trace_output.h @@ -64,36 +64,36 @@ #define __get_rel_sockaddr(field) ((struct sockaddr *)__get_rel_dynamic_array(field)) #undef __print_flags -#define __print_flags(flag, delim, flag_array...) \ - ({ \ - static const struct trace_print_flags __flags[] = \ - { flag_array, { -1, NULL }}; \ - trace_print_flags_seq(p, delim, flag, __flags); \ +#define __print_flags(flag, delim, flag_array...) \ + ({ \ + static const struct trace_print_flags __flags[] = \ + { flag_array }; \ + trace_print_flags_seq(p, delim, flag, __flags, ARRAY_SIZE(__flags)); \ }) #undef __print_symbolic -#define __print_symbolic(value, symbol_array...) \ - ({ \ - static const struct trace_print_flags symbols[] = \ - { symbol_array, { -1, NULL }}; \ - trace_print_symbols_seq(p, value, symbols); \ +#define __print_symbolic(value, symbol_array...) \ + ({ \ + static const struct trace_print_flags symbols[] = \ + { symbol_array }; \ + trace_print_symbols_seq(p, value, symbols, ARRAY_SIZE(symbols)); \ }) #undef __print_flags_u64 #undef __print_symbolic_u64 #if BITS_PER_LONG == 32 -#define __print_flags_u64(flag, delim, flag_array...) \ - ({ \ - static const struct trace_print_flags_u64 __flags[] = \ - { flag_array, { -1, NULL } }; \ - trace_print_flags_seq_u64(p, delim, flag, __flags); \ +#define __print_flags_u64(flag, delim, flag_array...) \ + ({ \ + static const struct trace_print_flags_u64 __flags[] = \ + { flag_array }; \ + trace_print_flags_seq_u64(p, delim, flag, __flags, ARRAY_SIZE(__flags)); \ }) -#define __print_symbolic_u64(value, symbol_array...) 
\ - ({ \ - static const struct trace_print_flags_u64 symbols[] = \ - { symbol_array, { -1, NULL } }; \ - trace_print_symbols_seq_u64(p, value, symbols); \ +#define __print_symbolic_u64(value, symbol_array...) \ + ({ \ + static const struct trace_print_flags_u64 symbols[] = \ + { symbol_array }; \ + trace_print_symbols_seq_u64(p, value, symbols, ARRAY_SIZE(symbols)); \ }) #else #define __print_flags_u64(flag, delim, flag_array...) \ diff --git a/kernel/trace/trace_events_synth.c b/kernel/trace/trace_events_synth.c index 8bb95b2a6fcf..39ac4eba0702 100644 --- a/kernel/trace/trace_events_synth.c +++ b/kernel/trace/trace_events_synth.c @@ -395,7 +395,7 @@ static enum print_line_t print_synth_event(struct trace_iterator *iter, n_u64++; } else { struct trace_print_flags __flags[] = { - __def_gfpflag_names, {-1, NULL} }; + __def_gfpflag_names }; char *space = (i == se->n_fields - 1 ? "" : " "); print_synth_event_num_val(s, print_fmt, @@ -408,7 +408,7 @@ static enum print_line_t print_synth_event(struct trace_iterator *iter, trace_seq_puts(s, " ("); trace_print_flags_seq(s, "|", entry->fields[n_u64].as_u64, - __flags); + __flags, ARRAY_SIZE(__flags)); trace_seq_putc(s, ')'); } n_u64++; diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c index 1996d7aba038..96e2d22b4364 100644 --- a/kernel/trace/trace_output.c +++ b/kernel/trace/trace_output.c @@ -69,14 +69,15 @@ enum print_line_t trace_print_printk_msg_only(struct trace_iterator *iter) const char * trace_print_flags_seq(struct trace_seq *p, const char *delim, unsigned long flags, - const struct trace_print_flags *flag_array) + const struct trace_print_flags *flag_array, + size_t flag_array_size) { unsigned long mask; const char *str; const char *ret = trace_seq_buffer_ptr(p); int i, first = 1; - for (i = 0; flag_array[i].name && flags; i++) { + for (i = 0; i < flag_array_size && flags; i++) { mask = flag_array[i].mask; if ((flags & mask) != mask) @@ -106,12 +107,13 @@ EXPORT_SYMBOL(trace_print_flags_seq); const char * trace_print_symbols_seq(struct trace_seq *p, unsigned long val, - const struct trace_print_flags *symbol_array) + const struct trace_print_flags *symbol_array, + size_t symbol_array_size) { int i; const char *ret = trace_seq_buffer_ptr(p); - for (i = 0; symbol_array[i].name; i++) { + for (i = 0; i < symbol_array_size; i++) { if (val != symbol_array[i].mask) continue; @@ -133,14 +135,15 @@ EXPORT_SYMBOL(trace_print_symbols_seq); const char * trace_print_flags_seq_u64(struct trace_seq *p, const char *delim, unsigned long long flags, - const struct trace_print_flags_u64 *flag_array) + const struct trace_print_flags_u64 *flag_array, + size_t flag_array_size) { unsigned long long mask; const char *str; const char *ret = trace_seq_buffer_ptr(p); int i, first = 1; - for (i = 0; flag_array[i].name && flags; i++) { + for (i = 0; i < flag_array_size && flags; i++) { mask = flag_array[i].mask; if ((flags & mask) != mask) @@ -170,12 +173,13 @@ EXPORT_SYMBOL(trace_print_flags_seq_u64); const char * trace_print_symbols_seq_u64(struct trace_seq *p, unsigned long long val, - const struct trace_print_flags_u64 *symbol_array) + const struct trace_print_flags_u64 *symbol_array, + size_t symbol_array_size) { int i; const char *ret = trace_seq_buffer_ptr(p); - for (i = 0; symbol_array[i].name; i++) { + for (i = 0; i < symbol_array_size; i++) { if (val != symbol_array[i].mask) continue; diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c index 37317b81fcda..8ad72e17d8eb 100644 --- a/kernel/trace/trace_syscalls.c 
+++ b/kernel/trace/trace_syscalls.c @@ -174,7 +174,6 @@ sys_enter_openat_print(struct syscall_trace_enter *trace, struct syscall_metadat { O_NOFOLLOW, "O_NOFOLLOW" }, { O_NOATIME, "O_NOATIME" }, { O_CLOEXEC, "O_CLOEXEC" }, - { -1, NULL } }; trace_seq_printf(s, "%s(", entry->name); @@ -205,7 +204,7 @@ sys_enter_openat_print(struct syscall_trace_enter *trace, struct syscall_metadat trace_seq_puts(s, "O_RDONLY|"); } - trace_print_flags_seq(s, "|", bits, __flags); + trace_print_flags_seq(s, "|", bits, __flags, ARRAY_SIZE(__flags)); /* * trace_print_flags_seq() adds a '\0' to the * buffer, but this needs to append more to the seq. -- cgit v1.2.3 From 8ef2807042d0886a85bbcb0aba1a2a277680dc4a Mon Sep 17 00:00:00 2001 From: "Thomas Weißschuh (Schneider Electric)" Date: Wed, 11 Mar 2026 11:15:15 +0100 Subject: hrtimer: Remove hrtimer_get_expires_ns() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit There are no users left. Signed-off-by: Thomas Weißschuh (Schneider Electric) Signed-off-by: Thomas Gleixner Link: https://patch.msgid.link/20260311-hrtimer-cleanups-v1-6-095357392669@linutronix.de --- include/linux/hrtimer.h | 5 ----- 1 file changed, 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/hrtimer.h b/include/linux/hrtimer.h index c087b7142330..9ced498fefaa 100644 --- a/include/linux/hrtimer.h +++ b/include/linux/hrtimer.h @@ -116,11 +116,6 @@ static inline ktime_t hrtimer_get_softexpires(const struct hrtimer *timer) return timer->_softexpires; } -static inline s64 hrtimer_get_expires_ns(const struct hrtimer *timer) -{ - return ktime_to_ns(timer->node.expires); -} - ktime_t hrtimer_cb_get_time(const struct hrtimer *timer); static inline ktime_t hrtimer_expires_remaining(const struct hrtimer *timer) -- cgit v1.2.3 From b94c076dd949426d09e5d415304acb3f951d9069 Mon Sep 17 00:00:00 2001 From: "Thomas Weißschuh (Schneider Electric)" Date: Wed, 11 Mar 2026 11:15:17 +0100 Subject: hrtimer: Drop spurious space in 'enum hrtimer_base_type' MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This spurious space makes grepping for the enum definition annoying. Remove it. Signed-off-by: Thomas Weißschuh (Schneider Electric) Signed-off-by: Thomas Gleixner Link: https://patch.msgid.link/20260311-hrtimer-cleanups-v1-8-095357392669@linutronix.de --- include/linux/hrtimer_defs.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/hrtimer_defs.h b/include/linux/hrtimer_defs.h index 0f851b2432c3..e6d4dc1b61e0 100644 --- a/include/linux/hrtimer_defs.h +++ b/include/linux/hrtimer_defs.h @@ -35,7 +35,7 @@ struct hrtimer_clock_base { ktime_t offset; } __hrtimer_clock_base_align; -enum hrtimer_base_type { +enum hrtimer_base_type { HRTIMER_BASE_MONOTONIC, HRTIMER_BASE_REALTIME, HRTIMER_BASE_BOOTTIME, -- cgit v1.2.3 From f12ef5cb4e035e15f0c324c41ff402441578ffda Mon Sep 17 00:00:00 2001 From: "Thomas Weißschuh (Schneider Electric)" Date: Wed, 11 Mar 2026 11:15:19 +0100 Subject: hrtimer: Mark index and clockid of clock base as const MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit These fields are initialized once and are never supposed to change. Mark them as const to make this explicit. 
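To illustrate the effect, here is a minimal user-space sketch (not kernel code; the struct and field names are made up for illustration): const members can still be set when the object is created via a designated initializer, but any later assignment is rejected at compile time, while non-const members stay writable.

#include <stdio.h>

struct clock_base_demo {
	const unsigned int index;	/* set once at creation */
	const int clockid;		/* set once at creation */
	long long expires_next;		/* still writable */
};

int main(void)
{
	struct clock_base_demo base = { .index = 0, .clockid = 1 };

	base.expires_next = 1000;	/* fine: non-const member */
	/* base.index = 2; */		/* would not compile: assignment of read-only member */

	printf("index=%u clockid=%d expires_next=%lld\n",
	       base.index, base.clockid, base.expires_next);
	return 0;
}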
Signed-off-by: Thomas Weißschuh (Schneider Electric) Signed-off-by: Thomas Gleixner Link: https://patch.msgid.link/20260311-hrtimer-cleanups-v1-10-095357392669@linutronix.de --- include/linux/hrtimer_defs.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/hrtimer_defs.h b/include/linux/hrtimer_defs.h index e6d4dc1b61e0..a03240c0b14f 100644 --- a/include/linux/hrtimer_defs.h +++ b/include/linux/hrtimer_defs.h @@ -26,8 +26,8 @@ */ struct hrtimer_clock_base { struct hrtimer_cpu_base *cpu_base; - unsigned int index; - clockid_t clockid; + const unsigned int index; + const clockid_t clockid; seqcount_raw_spinlock_t seq; ktime_t expires_next; struct hrtimer *running; -- cgit v1.2.3 From f27fc117cf8fba56e0619694e685f9bca9b9cb82 Mon Sep 17 00:00:00 2001 From: "Thomas Weißschuh (Schneider Electric)" Date: Wed, 11 Mar 2026 11:15:20 +0100 Subject: hrtimer: Remove trailing comma after HRTIMER_MAX_CLOCK_BASES MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit HRTIMER_MAX_CLOCK_BASES is required to stay the last value of the enum. Drop the trailing comma so no new members are added after it by mistake. Signed-off-by: Thomas Weißschuh (Schneider Electric) Signed-off-by: Thomas Gleixner Link: https://patch.msgid.link/20260311-hrtimer-cleanups-v1-11-095357392669@linutronix.de --- include/linux/hrtimer_defs.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/hrtimer_defs.h b/include/linux/hrtimer_defs.h index a03240c0b14f..52ed9e46ff13 100644 --- a/include/linux/hrtimer_defs.h +++ b/include/linux/hrtimer_defs.h @@ -44,7 +44,7 @@ enum hrtimer_base_type { HRTIMER_BASE_REALTIME_SOFT, HRTIMER_BASE_BOOTTIME_SOFT, HRTIMER_BASE_TAI_SOFT, - HRTIMER_MAX_CLOCK_BASES, + HRTIMER_MAX_CLOCK_BASES }; /** -- cgit v1.2.3 From 763aacf86f1baefb134c70813aa8c72d1675d738 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 17 Mar 2026 10:01:54 +0100 Subject: clocksource: Rewrite watchdog code completely The clocksource watchdog code has over time reached the state of an impenetrable maze of duct tape and staples. The original design, which was made in the context of systems far smaller than today, is based on the assumption that the to be monitored clocksource (TSC) can be trivially compared against a known to be stable clocksource (HPET/ACPI-PM timer). Over the years it turned out that this approach has major flaws: - Long delays between watchdog invocations can result in wrap arounds of the reference clocksource - Scalability of the reference clocksource readout can degrade on large multi-socket systems due to interconnect congestion This was addressed with various heuristics which degraded the accuracy of the watchdog to the point that it fails to detect actual TSC problems on older hardware which exposes slow inter CPU drifts due to firmware manipulating the TSC to hide SMI time. To address this and bring back sanity to the watchdog, rewrite the code completely with a different approach: 1) Restrict the validation against a reference clocksource to the boot CPU, which is usually the CPU/Socket closest to the legacy block which contains the reference source (HPET/ACPI-PM timer). Validate that the reference readout is within a bound latency so that the actual comparison against the TSC stays within 500ppm as long as the clocks are stable. 
2) Compare the TSCs of the other CPUs in a round robin fashion against the boot CPU in the same way the TSC synchronization on CPU hotplug works. This still can suffer from delayed reaction of the remote CPU to the SMP function call and the latency of the control variable cache line. But this latency is not affecting correctness. It only affects the accuracy. With low contention the readout latency is in the low nanoseconds range, which detects even slight skews between CPUs. Under high contention this becomes obviously less accurate, but still detects slow skews reliably as it solely relies on subsequent readouts being monotonically increasing. It just can take slightly longer to detect the issue. 3) Rewrite the watchdog test so it tests the various mechanisms one by one and validating the result against the expectation. Signed-off-by: Thomas Gleixner Tested-by: Borislav Petkov (AMD) Tested-by: Daniel J Blueman Reviewed-by: Jiri Wiesner Reviewed-by: Daniel J Blueman Link: https://patch.msgid.link/20260123231521.926490888@kernel.org Link: https://patch.msgid.link/87h5qeomm5.ffs@tglx --- Documentation/admin-guide/kernel-parameters.txt | 7 +- arch/x86/include/asm/time.h | 1 - arch/x86/kernel/hpet.c | 4 +- arch/x86/kernel/tsc.c | 49 +- drivers/clocksource/acpi_pm.c | 4 +- include/linux/clocksource.h | 28 +- kernel/time/Kconfig | 12 - kernel/time/clocksource-wdtest.c | 268 ++++---- kernel/time/clocksource.c | 793 +++++++++++++----------- kernel/time/jiffies.c | 1 - 10 files changed, 584 insertions(+), 583 deletions(-) (limited to 'include/linux') diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index cb850e5290c2..9b0be127d4a8 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -7950,12 +7950,7 @@ Kernel parameters (HPET or PM timer) on systems whose TSC frequency was obtained from HW or FW using either an MSR or CPUID(0x15). Warn if the difference is more than 500 ppm. - [x86] watchdog: Use TSC as the watchdog clocksource with - which to check other HW timers (HPET or PM timer), but - only on systems where TSC has been deemed trustworthy. - This will be suppressed by an earlier tsc=nowatchdog and - can be overridden by a later tsc=nowatchdog. A console - message will flag any such suppression or overriding. + [x86] watchdog: Enforce the clocksource watchdog on TSC tsc_early_khz= [X86,EARLY] Skip early TSC calibration and use the given value instead. 
Useful when the early TSC frequency discovery diff --git a/arch/x86/include/asm/time.h b/arch/x86/include/asm/time.h index f360104ed172..459780c3ed1f 100644 --- a/arch/x86/include/asm/time.h +++ b/arch/x86/include/asm/time.h @@ -7,7 +7,6 @@ extern void hpet_time_init(void); extern bool pit_timer_init(void); -extern bool tsc_clocksource_watchdog_disabled(void); extern struct clock_event_device *global_clock_event; diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c index 610590e83445..8dc7b710e125 100644 --- a/arch/x86/kernel/hpet.c +++ b/arch/x86/kernel/hpet.c @@ -854,7 +854,7 @@ static struct clocksource clocksource_hpet = { .rating = 250, .read = read_hpet, .mask = HPET_MASK, - .flags = CLOCK_SOURCE_IS_CONTINUOUS, + .flags = CLOCK_SOURCE_IS_CONTINUOUS | CLOCK_SOURCE_CALIBRATED, .resume = hpet_resume_counter, }; @@ -1082,8 +1082,6 @@ int __init hpet_enable(void) if (!hpet_counting()) goto out_nohpet; - if (tsc_clocksource_watchdog_disabled()) - clocksource_hpet.flags |= CLOCK_SOURCE_MUST_VERIFY; clocksource_register_hz(&clocksource_hpet, (u32)hpet_freq); if (id & HPET_ID_LEGSUP) { diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c index 9ccd58c29409..c5110eb554bc 100644 --- a/arch/x86/kernel/tsc.c +++ b/arch/x86/kernel/tsc.c @@ -322,12 +322,16 @@ int __init notsc_setup(char *str) return 1; } #endif - __setup("notsc", notsc_setup); +enum { + TSC_WATCHDOG_AUTO, + TSC_WATCHDOG_OFF, + TSC_WATCHDOG_ON, +}; + static int no_sched_irq_time; -static int no_tsc_watchdog; -static int tsc_as_watchdog; +static int tsc_watchdog; static int __init tsc_setup(char *str) { @@ -337,25 +341,14 @@ static int __init tsc_setup(char *str) no_sched_irq_time = 1; if (!strcmp(str, "unstable")) mark_tsc_unstable("boot parameter"); - if (!strcmp(str, "nowatchdog")) { - no_tsc_watchdog = 1; - if (tsc_as_watchdog) - pr_alert("%s: Overriding earlier tsc=watchdog with tsc=nowatchdog\n", - __func__); - tsc_as_watchdog = 0; - } + if (!strcmp(str, "nowatchdog")) + tsc_watchdog = TSC_WATCHDOG_OFF; if (!strcmp(str, "recalibrate")) tsc_force_recalibrate = 1; - if (!strcmp(str, "watchdog")) { - if (no_tsc_watchdog) - pr_alert("%s: tsc=watchdog overridden by earlier tsc=nowatchdog\n", - __func__); - else - tsc_as_watchdog = 1; - } + if (!strcmp(str, "watchdog")) + tsc_watchdog = TSC_WATCHDOG_ON; return 1; } - __setup("tsc=", tsc_setup); #define MAX_RETRIES 5 @@ -1175,7 +1168,6 @@ static int tsc_cs_enable(struct clocksource *cs) static struct clocksource clocksource_tsc_early = { .name = "tsc-early", .rating = 299, - .uncertainty_margin = 32 * NSEC_PER_MSEC, .read = read_tsc, .mask = CLOCKSOURCE_MASK(64), .flags = CLOCK_SOURCE_IS_CONTINUOUS | @@ -1202,7 +1194,6 @@ static struct clocksource clocksource_tsc = { .flags = CLOCK_SOURCE_IS_CONTINUOUS | CLOCK_SOURCE_CAN_INLINE_READ | CLOCK_SOURCE_MUST_VERIFY | - CLOCK_SOURCE_VERIFY_PERCPU | CLOCK_SOURCE_HAS_COUPLED_CLOCK_EVENT, .id = CSID_X86_TSC, .vdso_clock_mode = VDSO_CLOCKMODE_TSC, @@ -1231,16 +1222,12 @@ EXPORT_SYMBOL_GPL(mark_tsc_unstable); static void __init tsc_disable_clocksource_watchdog(void) { + if (tsc_watchdog == TSC_WATCHDOG_ON) + return; clocksource_tsc_early.flags &= ~CLOCK_SOURCE_MUST_VERIFY; clocksource_tsc.flags &= ~CLOCK_SOURCE_MUST_VERIFY; } -bool tsc_clocksource_watchdog_disabled(void) -{ - return !(clocksource_tsc.flags & CLOCK_SOURCE_MUST_VERIFY) && - tsc_as_watchdog && !no_tsc_watchdog; -} - static void __init check_system_tsc_reliable(void) { #if defined(CONFIG_MGEODEGX1) || defined(CONFIG_MGEODE_LX) || defined(CONFIG_X86_GENERIC) @@ -1395,6 
+1382,8 @@ restart: (unsigned long)tsc_khz / 1000, (unsigned long)tsc_khz % 1000); + clocksource_tsc.flags |= CLOCK_SOURCE_CALIBRATED; + /* Inform the TSC deadline clockevent devices about the recalibration */ lapic_update_tsc_freq(); @@ -1470,12 +1459,10 @@ static bool __init determine_cpu_tsc_frequencies(bool early) if (early) { cpu_khz = x86_platform.calibrate_cpu(); - if (tsc_early_khz) { + if (tsc_early_khz) tsc_khz = tsc_early_khz; - } else { + else tsc_khz = x86_platform.calibrate_tsc(); - clocksource_tsc.freq_khz = tsc_khz; - } } else { /* We should not be here with non-native cpu calibration */ WARN_ON(x86_platform.calibrate_cpu != native_calibrate_cpu); @@ -1579,7 +1566,7 @@ void __init tsc_init(void) return; } - if (tsc_clocksource_reliable || no_tsc_watchdog) + if (tsc_clocksource_reliable || tsc_watchdog == TSC_WATCHDOG_OFF) tsc_disable_clocksource_watchdog(); clocksource_register_khz(&clocksource_tsc_early, tsc_khz); diff --git a/drivers/clocksource/acpi_pm.c b/drivers/clocksource/acpi_pm.c index b4330a01a566..67792937242f 100644 --- a/drivers/clocksource/acpi_pm.c +++ b/drivers/clocksource/acpi_pm.c @@ -98,7 +98,7 @@ static struct clocksource clocksource_acpi_pm = { .rating = 200, .read = acpi_pm_read, .mask = (u64)ACPI_PM_MASK, - .flags = CLOCK_SOURCE_IS_CONTINUOUS, + .flags = CLOCK_SOURCE_IS_CONTINUOUS | CLOCK_SOURCE_CALIBRATED, .suspend = acpi_pm_suspend, .resume = acpi_pm_resume, }; @@ -243,8 +243,6 @@ static int __init init_acpi_pm_clocksource(void) return -ENODEV; } - if (tsc_clocksource_watchdog_disabled()) - clocksource_acpi_pm.flags |= CLOCK_SOURCE_MUST_VERIFY; return clocksource_register_hz(&clocksource_acpi_pm, PMTMR_TICKS_PER_SEC); } diff --git a/include/linux/clocksource.h b/include/linux/clocksource.h index 25774fc5b53d..ccf5c0ca26b7 100644 --- a/include/linux/clocksource.h +++ b/include/linux/clocksource.h @@ -44,8 +44,6 @@ struct module; * @shift: Cycle to nanosecond divisor (power of two) * @max_idle_ns: Maximum idle time permitted by the clocksource (nsecs) * @maxadj: Maximum adjustment value to mult (~11%) - * @uncertainty_margin: Maximum uncertainty in nanoseconds per half second. - * Zero says to use default WATCHDOG_THRESHOLD. 
* @archdata: Optional arch-specific data * @max_cycles: Maximum safe cycle value which won't overflow on * multiplication @@ -105,7 +103,6 @@ struct clocksource { u32 shift; u64 max_idle_ns; u32 maxadj; - u32 uncertainty_margin; #ifdef CONFIG_ARCH_CLOCKSOURCE_DATA struct arch_clocksource_data archdata; #endif @@ -133,6 +130,7 @@ struct clocksource { struct list_head wd_list; u64 cs_last; u64 wd_last; + unsigned int wd_cpu; #endif struct module *owner; }; @@ -142,15 +140,18 @@ struct clocksource { */ #define CLOCK_SOURCE_IS_CONTINUOUS 0x01 #define CLOCK_SOURCE_MUST_VERIFY 0x02 +#define CLOCK_SOURCE_CALIBRATED 0x04 #define CLOCK_SOURCE_WATCHDOG 0x10 #define CLOCK_SOURCE_VALID_FOR_HRES 0x20 #define CLOCK_SOURCE_UNSTABLE 0x40 #define CLOCK_SOURCE_SUSPEND_NONSTOP 0x80 #define CLOCK_SOURCE_RESELECT 0x100 -#define CLOCK_SOURCE_VERIFY_PERCPU 0x200 -#define CLOCK_SOURCE_CAN_INLINE_READ 0x400 -#define CLOCK_SOURCE_HAS_COUPLED_CLOCK_EVENT 0x800 +#define CLOCK_SOURCE_CAN_INLINE_READ 0x200 +#define CLOCK_SOURCE_HAS_COUPLED_CLOCK_EVENT 0x400 + +#define CLOCK_SOURCE_WDTEST 0x800 +#define CLOCK_SOURCE_WDTEST_PERCPU 0x1000 /* simplify initialization of mask field */ #define CLOCKSOURCE_MASK(bits) GENMASK_ULL((bits) - 1, 0) @@ -301,21 +302,6 @@ static inline void timer_probe(void) {} #define TIMER_ACPI_DECLARE(name, table_id, fn) \ ACPI_DECLARE_PROBE_ENTRY(timer, name, table_id, 0, NULL, 0, fn) -static inline unsigned int clocksource_get_max_watchdog_retry(void) -{ - /* - * When system is in the boot phase or under heavy workload, there - * can be random big latencies during the clocksource/watchdog - * read, so allow retries to filter the noise latency. As the - * latency's frequency and maximum value goes up with the number of - * CPUs, scale the number of retries with the number of online - * CPUs. - */ - return (ilog2(num_online_cpus()) / 2) + 1; -} - -void clocksource_verify_percpu(struct clocksource *cs); - /** * struct clocksource_base - hardware abstraction for clock on which a clocksource * is based diff --git a/kernel/time/Kconfig b/kernel/time/Kconfig index 6d6aace0a693..6a11964377e6 100644 --- a/kernel/time/Kconfig +++ b/kernel/time/Kconfig @@ -212,18 +212,6 @@ config HIGH_RES_TIMERS hardware is not capable then this option only increases the size of the kernel image. -config CLOCKSOURCE_WATCHDOG_MAX_SKEW_US - int "Clocksource watchdog maximum allowable skew (in microseconds)" - depends on CLOCKSOURCE_WATCHDOG - range 50 1000 - default 125 - help - Specify the maximum amount of allowable watchdog skew in - microseconds before reporting the clocksource to be unstable. - The default is based on a half-second clocksource watchdog - interval and NTP's maximum frequency drift of 500 parts - per million. If the clocksource is good enough for NTP, - it is good enough for the clocksource watchdog! endif config POSIX_AUX_CLOCKS diff --git a/kernel/time/clocksource-wdtest.c b/kernel/time/clocksource-wdtest.c index 38dae590b29f..b4cf17b4aeed 100644 --- a/kernel/time/clocksource-wdtest.c +++ b/kernel/time/clocksource-wdtest.c @@ -3,202 +3,196 @@ * Unit test for the clocksource watchdog. * * Copyright (C) 2021 Facebook, Inc. + * Copyright (C) 2026 Intel Corp. * * Author: Paul E. 
McKenney + * Author: Thomas Gleixner */ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt -#include #include -#include +#include #include -#include /* for spin_unlock_irq() using preempt_count() m68k */ -#include #include -#include -#include -#include #include "tick-internal.h" +#include "timekeeping_internal.h" MODULE_LICENSE("GPL"); MODULE_DESCRIPTION("Clocksource watchdog unit test"); MODULE_AUTHOR("Paul E. McKenney "); +MODULE_AUTHOR("Thomas Gleixner "); + +enum wdtest_states { + WDTEST_INJECT_NONE, + WDTEST_INJECT_DELAY, + WDTEST_INJECT_POSITIVE, + WDTEST_INJECT_NEGATIVE, + WDTEST_INJECT_PERCPU = 0x100, +}; -static int holdoff = IS_BUILTIN(CONFIG_TEST_CLOCKSOURCE_WATCHDOG) ? 10 : 0; -module_param(holdoff, int, 0444); -MODULE_PARM_DESC(holdoff, "Time to wait to start test (s)."); +static enum wdtest_states wdtest_state; +static unsigned long wdtest_test_count; +static ktime_t wdtest_last_ts, wdtest_offset; -/* Watchdog kthread's task_struct pointer for debug purposes. */ -static struct task_struct *wdtest_task; +#define SHIFT_4000PPM 8 -static u64 wdtest_jiffies_read(struct clocksource *cs) +static ktime_t wdtest_get_offset(struct clocksource *cs) { - return (u64)jiffies; -} - -static struct clocksource clocksource_wdtest_jiffies = { - .name = "wdtest-jiffies", - .rating = 1, /* lowest valid rating*/ - .uncertainty_margin = TICK_NSEC, - .read = wdtest_jiffies_read, - .mask = CLOCKSOURCE_MASK(32), - .flags = CLOCK_SOURCE_MUST_VERIFY, - .mult = TICK_NSEC << JIFFIES_SHIFT, /* details above */ - .shift = JIFFIES_SHIFT, - .max_cycles = 10, -}; + if (wdtest_state < WDTEST_INJECT_PERCPU) + return wdtest_test_count & 0x1 ? 0 : wdtest_offset >> SHIFT_4000PPM; -static int wdtest_ktime_read_ndelays; -static bool wdtest_ktime_read_fuzz; + /* Only affect the readout of the "remote" CPU */ + return cs->wd_cpu == smp_processor_id() ? 0 : NSEC_PER_MSEC; +} static u64 wdtest_ktime_read(struct clocksource *cs) { - int wkrn = READ_ONCE(wdtest_ktime_read_ndelays); - static int sign = 1; - u64 ret; + ktime_t now = ktime_get_raw_fast_ns(); + ktime_t intv = now - wdtest_last_ts; - if (wkrn) { - udelay(cs->uncertainty_margin / 250); - WRITE_ONCE(wdtest_ktime_read_ndelays, wkrn - 1); - } - ret = ktime_get_real_fast_ns(); - if (READ_ONCE(wdtest_ktime_read_fuzz)) { - sign = -sign; - ret = ret + sign * 100 * NSEC_PER_MSEC; + /* + * Only increment the test counter once per watchdog interval and + * store the interval for the offset calculation of this step. This + * guarantees a consistent behaviour even if the other side needs + * to repeat due to a watchdog read timeout. 
+ */ + if (intv > (NSEC_PER_SEC / 4)) { + WRITE_ONCE(wdtest_test_count, wdtest_test_count + 1); + wdtest_last_ts = now; + wdtest_offset = intv; } - return ret; -} -static void wdtest_ktime_cs_mark_unstable(struct clocksource *cs) -{ - pr_info("--- Marking %s unstable due to clocksource watchdog.\n", cs->name); + switch (wdtest_state & ~WDTEST_INJECT_PERCPU) { + case WDTEST_INJECT_POSITIVE: + return now + wdtest_get_offset(cs); + case WDTEST_INJECT_NEGATIVE: + return now - wdtest_get_offset(cs); + case WDTEST_INJECT_DELAY: + udelay(500); + return now; + default: + return now; + } } -#define KTIME_FLAGS (CLOCK_SOURCE_IS_CONTINUOUS | \ - CLOCK_SOURCE_VALID_FOR_HRES | \ - CLOCK_SOURCE_MUST_VERIFY | \ - CLOCK_SOURCE_VERIFY_PERCPU) +#define KTIME_FLAGS (CLOCK_SOURCE_IS_CONTINUOUS | \ + CLOCK_SOURCE_CALIBRATED | \ + CLOCK_SOURCE_MUST_VERIFY | \ + CLOCK_SOURCE_WDTEST) static struct clocksource clocksource_wdtest_ktime = { .name = "wdtest-ktime", - .rating = 300, + .rating = 10, .read = wdtest_ktime_read, .mask = CLOCKSOURCE_MASK(64), .flags = KTIME_FLAGS, - .mark_unstable = wdtest_ktime_cs_mark_unstable, .list = LIST_HEAD_INIT(clocksource_wdtest_ktime.list), }; -/* Reset the clocksource if needed. */ -static void wdtest_ktime_clocksource_reset(void) +static void wdtest_clocksource_reset(enum wdtest_states which, bool percpu) +{ + clocksource_unregister(&clocksource_wdtest_ktime); + + pr_info("Test: State %d percpu %d\n", which, percpu); + + wdtest_state = which; + if (percpu) + wdtest_state |= WDTEST_INJECT_PERCPU; + wdtest_test_count = 0; + wdtest_last_ts = 0; + + clocksource_wdtest_ktime.rating = 10; + clocksource_wdtest_ktime.flags = KTIME_FLAGS; + if (percpu) + clocksource_wdtest_ktime.flags |= CLOCK_SOURCE_WDTEST_PERCPU; + clocksource_register_khz(&clocksource_wdtest_ktime, 1000 * 1000); +} + +static bool wdtest_execute(enum wdtest_states which, bool percpu, unsigned int expect, + unsigned long calls) { - if (clocksource_wdtest_ktime.flags & CLOCK_SOURCE_UNSTABLE) { - clocksource_unregister(&clocksource_wdtest_ktime); - clocksource_wdtest_ktime.flags = KTIME_FLAGS; - schedule_timeout_uninterruptible(HZ / 10); - clocksource_register_khz(&clocksource_wdtest_ktime, 1000 * 1000); + wdtest_clocksource_reset(which, percpu); + + for (; READ_ONCE(wdtest_test_count) < calls; msleep(100)) { + unsigned int flags = READ_ONCE(clocksource_wdtest_ktime.flags); + + if (kthread_should_stop()) + return false; + + if (flags & CLOCK_SOURCE_UNSTABLE) { + if (expect & CLOCK_SOURCE_UNSTABLE) + return true; + pr_warn("Fail: Unexpected unstable\n"); + return false; + } + if (flags & CLOCK_SOURCE_VALID_FOR_HRES) { + if (expect & CLOCK_SOURCE_VALID_FOR_HRES) + return true; + pr_warn("Fail: Unexpected valid for highres\n"); + return false; + } } + + if (!expect) + return true; + + pr_warn("Fail: Timed out\n"); + return false; } -/* Run the specified series of watchdog tests. */ -static int wdtest_func(void *arg) +static bool wdtest_run(bool percpu) { - unsigned long j1, j2; - int i, max_retries; - char *s; + if (!wdtest_execute(WDTEST_INJECT_NONE, percpu, CLOCK_SOURCE_VALID_FOR_HRES, 8)) + return false; - schedule_timeout_uninterruptible(holdoff * HZ); + if (!wdtest_execute(WDTEST_INJECT_DELAY, percpu, 0, 4)) + return false; - /* - * Verify that jiffies-like clocksources get the manually - * specified uncertainty margin. 
- */ - pr_info("--- Verify jiffies-like uncertainty margin.\n"); - __clocksource_register(&clocksource_wdtest_jiffies); - WARN_ON_ONCE(clocksource_wdtest_jiffies.uncertainty_margin != TICK_NSEC); + if (!wdtest_execute(WDTEST_INJECT_POSITIVE, percpu, CLOCK_SOURCE_UNSTABLE, 8)) + return false; - j1 = clocksource_wdtest_jiffies.read(&clocksource_wdtest_jiffies); - schedule_timeout_uninterruptible(HZ); - j2 = clocksource_wdtest_jiffies.read(&clocksource_wdtest_jiffies); - WARN_ON_ONCE(j1 == j2); + if (!wdtest_execute(WDTEST_INJECT_NEGATIVE, percpu, CLOCK_SOURCE_UNSTABLE, 8)) + return false; - clocksource_unregister(&clocksource_wdtest_jiffies); + return true; +} - /* - * Verify that tsc-like clocksources are assigned a reasonable - * uncertainty margin. - */ - pr_info("--- Verify tsc-like uncertainty margin.\n"); +static int wdtest_func(void *arg) +{ clocksource_register_khz(&clocksource_wdtest_ktime, 1000 * 1000); - WARN_ON_ONCE(clocksource_wdtest_ktime.uncertainty_margin < NSEC_PER_USEC); - - j1 = clocksource_wdtest_ktime.read(&clocksource_wdtest_ktime); - udelay(1); - j2 = clocksource_wdtest_ktime.read(&clocksource_wdtest_ktime); - pr_info("--- tsc-like times: %lu - %lu = %lu.\n", j2, j1, j2 - j1); - WARN_ONCE(time_before(j2, j1 + NSEC_PER_USEC), - "Expected at least 1000ns, got %lu.\n", j2 - j1); - - /* Verify tsc-like stability with various numbers of errors injected. */ - max_retries = clocksource_get_max_watchdog_retry(); - for (i = 0; i <= max_retries + 1; i++) { - if (i <= 1 && i < max_retries) - s = ""; - else if (i <= max_retries) - s = ", expect message"; - else - s = ", expect clock skew"; - pr_info("--- Watchdog with %dx error injection, %d retries%s.\n", i, max_retries, s); - WRITE_ONCE(wdtest_ktime_read_ndelays, i); - schedule_timeout_uninterruptible(2 * HZ); - WARN_ON_ONCE(READ_ONCE(wdtest_ktime_read_ndelays)); - WARN_ON_ONCE((i <= max_retries) != - !(clocksource_wdtest_ktime.flags & CLOCK_SOURCE_UNSTABLE)); - wdtest_ktime_clocksource_reset(); + if (wdtest_run(false)) { + if (wdtest_run(true)) + pr_info("Success: All tests passed\n"); } - - /* Verify tsc-like stability with clock-value-fuzz error injection. */ - pr_info("--- Watchdog clock-value-fuzz error injection, expect clock skew and per-CPU mismatches.\n"); - WRITE_ONCE(wdtest_ktime_read_fuzz, true); - schedule_timeout_uninterruptible(2 * HZ); - WARN_ON_ONCE(!(clocksource_wdtest_ktime.flags & CLOCK_SOURCE_UNSTABLE)); - clocksource_verify_percpu(&clocksource_wdtest_ktime); - WRITE_ONCE(wdtest_ktime_read_fuzz, false); - clocksource_unregister(&clocksource_wdtest_ktime); - pr_info("--- Done with test.\n"); - return 0; -} + if (!IS_MODULE(CONFIG_TEST_CLOCKSOURCE_WATCHDOG)) + return 0; -static void wdtest_print_module_parms(void) -{ - pr_alert("--- holdoff=%d\n", holdoff); + while (!kthread_should_stop()) + schedule_timeout_interruptible(3600 * HZ); + return 0; } -/* Cleanup function. */ -static void clocksource_wdtest_cleanup(void) -{ -} +static struct task_struct *wdtest_thread; static int __init clocksource_wdtest_init(void) { - int ret = 0; - - wdtest_print_module_parms(); + struct task_struct *t = kthread_run(wdtest_func, NULL, "wdtest"); - /* Create watchdog-test task. 
*/ - wdtest_task = kthread_run(wdtest_func, NULL, "wdtest"); - if (IS_ERR(wdtest_task)) { - ret = PTR_ERR(wdtest_task); - pr_warn("%s: Failed to create wdtest kthread.\n", __func__); - wdtest_task = NULL; - return ret; + if (IS_ERR(t)) { + pr_warn("Failed to create wdtest kthread.\n"); + return PTR_ERR(t); } - + wdtest_thread = t; return 0; } - module_init(clocksource_wdtest_init); + +static void clocksource_wdtest_cleanup(void) +{ + if (wdtest_thread) + kthread_stop(wdtest_thread); +} module_exit(clocksource_wdtest_cleanup); diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c index e753a0632ac8..baee13a1f87f 100644 --- a/kernel/time/clocksource.c +++ b/kernel/time/clocksource.c @@ -7,15 +7,17 @@ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt -#include #include +#include +#include +#include #include -#include -#include /* for spin_unlock_irq() using preempt_count() m68k */ -#include #include +#include #include -#include +#include +#include +#include #include "tick-internal.h" #include "timekeeping_internal.h" @@ -107,48 +109,6 @@ static char override_name[CS_NAME_LEN]; static int finished_booting; static u64 suspend_start; -/* - * Interval: 0.5sec. - */ -#define WATCHDOG_INTERVAL (HZ >> 1) -#define WATCHDOG_INTERVAL_MAX_NS ((2 * WATCHDOG_INTERVAL) * (NSEC_PER_SEC / HZ)) - -/* - * Threshold: 0.0312s, when doubled: 0.0625s. - */ -#define WATCHDOG_THRESHOLD (NSEC_PER_SEC >> 5) - -/* - * Maximum permissible delay between two readouts of the watchdog - * clocksource surrounding a read of the clocksource being validated. - * This delay could be due to SMIs, NMIs, or to VCPU preemptions. Used as - * a lower bound for cs->uncertainty_margin values when registering clocks. - * - * The default of 500 parts per million is based on NTP's limits. - * If a clocksource is good enough for NTP, it is good enough for us! - * - * In other words, by default, even if a clocksource is extremely - * precise (for example, with a sub-nanosecond period), the maximum - * permissible skew between the clocksource watchdog and the clocksource - * under test is not permitted to go below the 500ppm minimum defined - * by MAX_SKEW_USEC. This 500ppm minimum may be overridden using the - * CLOCKSOURCE_WATCHDOG_MAX_SKEW_US Kconfig option. - */ -#ifdef CONFIG_CLOCKSOURCE_WATCHDOG_MAX_SKEW_US -#define MAX_SKEW_USEC CONFIG_CLOCKSOURCE_WATCHDOG_MAX_SKEW_US -#else -#define MAX_SKEW_USEC (125 * WATCHDOG_INTERVAL / HZ) -#endif - -/* - * Default for maximum permissible skew when cs->uncertainty_margin is - * not specified, and the lower bound even when cs->uncertainty_margin - * is specified. This is also the default that is used when registering - * clocks with unspecified cs->uncertainty_margin, so this macro is used - * even in CONFIG_CLOCKSOURCE_WATCHDOG=n kernels. - */ -#define WATCHDOG_MAX_SKEW (MAX_SKEW_USEC * NSEC_PER_USEC) - #ifdef CONFIG_CLOCKSOURCE_WATCHDOG static void clocksource_watchdog_work(struct work_struct *work); static void clocksource_select(void); @@ -160,7 +120,42 @@ static DECLARE_WORK(watchdog_work, clocksource_watchdog_work); static DEFINE_SPINLOCK(watchdog_lock); static int watchdog_running; static atomic_t watchdog_reset_pending; -static int64_t watchdog_max_interval; + +/* Watchdog interval: 0.5sec. */ +#define WATCHDOG_INTERVAL (HZ >> 1) +#define WATCHDOG_INTERVAL_NS (WATCHDOG_INTERVAL * (NSEC_PER_SEC / HZ)) + +/* Maximum time between two reference watchdog readouts */ +#define WATCHDOG_READOUT_MAX_NS (50U * NSEC_PER_USEC) + +/* + * Maximum time between two remote readouts for NUMA=n. 
On NUMA enabled systems + * the timeout is calculated from the numa distance. + */ +#define WATCHDOG_DEFAULT_TIMEOUT_NS (50U * NSEC_PER_USEC) + +/* + * Remote timeout NUMA distance multiplier. The local distance is 10. The + * default remote distance is 20. ACPI tables provide more accurate numbers + * which are guaranteed to be greater than the local distance. + * + * This results in a 5us base value, which is equivalent to the above !NUMA + * default. + */ +#define WATCHDOG_NUMA_MULTIPLIER_NS ((u64)(WATCHDOG_DEFAULT_TIMEOUT_NS / LOCAL_DISTANCE)) + +/* Limit the NUMA timeout in case the distance values are insanely big */ +#define WATCHDOG_NUMA_MAX_TIMEOUT_NS ((u64)(500U * NSEC_PER_USEC)) + +/* Shift values to calculate the approximate $N ppm of a given delta. */ +#define SHIFT_500PPM 11 +#define SHIFT_4000PPM 8 + +/* Number of attempts to read the watchdog */ +#define WATCHDOG_FREQ_RETRIES 3 + +/* Five reads local and remote for inter CPU skew detection */ +#define WATCHDOG_REMOTE_MAX_SEQ 10 static inline void clocksource_watchdog_lock(unsigned long *flags) { @@ -241,204 +236,422 @@ void clocksource_mark_unstable(struct clocksource *cs) spin_unlock_irqrestore(&watchdog_lock, flags); } -static int verify_n_cpus = 8; -module_param(verify_n_cpus, int, 0644); +static inline void clocksource_reset_watchdog(void) +{ + struct clocksource *cs; + + list_for_each_entry(cs, &watchdog_list, wd_list) + cs->flags &= ~CLOCK_SOURCE_WATCHDOG; +} + +enum wd_result { + WD_SUCCESS, + WD_FREQ_NO_WATCHDOG, + WD_FREQ_TIMEOUT, + WD_FREQ_RESET, + WD_FREQ_SKEWED, + WD_CPU_TIMEOUT, + WD_CPU_SKEWED, +}; -enum wd_read_status { - WD_READ_SUCCESS, - WD_READ_UNSTABLE, - WD_READ_SKIP +struct watchdog_cpu_data { + /* Keep first as it is 32 byte aligned */ + call_single_data_t csd; + atomic_t remote_inprogress; + enum wd_result result; + u64 cpu_ts[2]; + struct clocksource *cs; + /* Ensure that the sequence is in a separate cache line */ + atomic_t seq ____cacheline_aligned; + /* Set by the control CPU according to NUMA distance */ + u64 timeout_ns; }; -static enum wd_read_status cs_watchdog_read(struct clocksource *cs, u64 *csnow, u64 *wdnow) -{ - int64_t md = watchdog->uncertainty_margin; - unsigned int nretries, max_retries; - int64_t wd_delay, wd_seq_delay; - u64 wd_end, wd_end2; - - max_retries = clocksource_get_max_watchdog_retry(); - for (nretries = 0; nretries <= max_retries; nretries++) { - local_irq_disable(); - *wdnow = watchdog->read(watchdog); - *csnow = cs->read(cs); - wd_end = watchdog->read(watchdog); - wd_end2 = watchdog->read(watchdog); - local_irq_enable(); - - wd_delay = cycles_to_nsec_safe(watchdog, *wdnow, wd_end); - if (wd_delay <= md + cs->uncertainty_margin) { - if (nretries > 1 && nretries >= max_retries) { - pr_warn("timekeeping watchdog on CPU%d: %s retried %d times before success\n", - smp_processor_id(), watchdog->name, nretries); +struct watchdog_data { + raw_spinlock_t lock; + enum wd_result result; + + u64 wd_seq; + u64 wd_delta; + u64 cs_delta; + u64 cpu_ts[2]; + + unsigned int curr_cpu; +} ____cacheline_aligned_in_smp; + +static void watchdog_check_skew_remote(void *unused); + +static DEFINE_PER_CPU_ALIGNED(struct watchdog_cpu_data, watchdog_cpu_data) = { + .csd = CSD_INIT(watchdog_check_skew_remote, NULL), +}; + +static struct watchdog_data watchdog_data = { + .lock = __RAW_SPIN_LOCK_UNLOCKED(watchdog_data.lock), +}; + +static inline void watchdog_set_result(struct watchdog_cpu_data *wd, enum wd_result result) +{ + guard(raw_spinlock)(&watchdog_data.lock); + if (!wd->result) { + 
atomic_set(&wd->seq, WATCHDOG_REMOTE_MAX_SEQ); + WRITE_ONCE(wd->result, result); + } +} + +/* Wait for the sequence number to hand over control. */ +static bool watchdog_wait_seq(struct watchdog_cpu_data *wd, u64 start, int seq) +{ + for(int cnt = 0; atomic_read(&wd->seq) < seq; cnt++) { + /* Bail if the other side set an error result */ + if (READ_ONCE(wd->result) != WD_SUCCESS) + return false; + + /* Prevent endless loops if the other CPU does not react. */ + if (cnt == 5000) { + u64 nsecs = ktime_get_raw_fast_ns(); + + if (nsecs - start >=wd->timeout_ns) { + watchdog_set_result(wd, WD_CPU_TIMEOUT); + return false; } - return WD_READ_SUCCESS; + cnt = 0; } + cpu_relax(); + } + return seq < WATCHDOG_REMOTE_MAX_SEQ; +} - /* - * Now compute delay in consecutive watchdog read to see if - * there is too much external interferences that cause - * significant delay in reading both clocksource and watchdog. - * - * If consecutive WD read-back delay > md, report - * system busy, reinit the watchdog and skip the current - * watchdog test. - */ - wd_seq_delay = cycles_to_nsec_safe(watchdog, wd_end, wd_end2); - if (wd_seq_delay > md) - goto skip_test; +static void watchdog_check_skew(struct watchdog_cpu_data *wd, int index) +{ + u64 prev, now, delta, start = ktime_get_raw_fast_ns(); + int local = index, remote = (index + 1) & 0x1; + struct clocksource *cs = wd->cs; + + /* Set the local timestamp so that the first iteration works correctly */ + wd->cpu_ts[local] = cs->read(cs); + + /* Signal arrival */ + atomic_inc(&wd->seq); + + for (int seq = local + 2; seq < WATCHDOG_REMOTE_MAX_SEQ; seq += 2) { + if (!watchdog_wait_seq(wd, start, seq)) + return; + + /* Capture local timestamp before possible non-local coherency overhead */ + now = cs->read(cs); + + /* Store local timestamp before reading remote to limit coherency stalls */ + wd->cpu_ts[local] = now; + + prev = wd->cpu_ts[remote]; + delta = (now - prev) & cs->mask; + + if (delta > cs->max_raw_delta) { + watchdog_set_result(wd, WD_CPU_SKEWED); + return; + } + + /* Hand over to the remote CPU */ + atomic_inc(&wd->seq); } +} - pr_warn("timekeeping watchdog on CPU%d: wd-%s-wd excessive read-back delay of %lldns vs. limit of %ldns, wd-wd read-back delay only %lldns, attempt %d, marking %s unstable\n", - smp_processor_id(), cs->name, wd_delay, WATCHDOG_MAX_SKEW, wd_seq_delay, nretries, cs->name); - return WD_READ_UNSTABLE; +static void watchdog_check_skew_remote(void *unused) +{ + struct watchdog_cpu_data *wd = this_cpu_ptr(&watchdog_cpu_data); -skip_test: - pr_info("timekeeping watchdog on CPU%d: %s wd-wd read-back delay of %lldns\n", - smp_processor_id(), watchdog->name, wd_seq_delay); - pr_info("wd-%s-wd read-back delay of %lldns, clock-skew test skipped!\n", - cs->name, wd_delay); - return WD_READ_SKIP; + atomic_inc(&wd->remote_inprogress); + watchdog_check_skew(wd, 1); + atomic_dec(&wd->remote_inprogress); } -static u64 csnow_mid; -static cpumask_t cpus_ahead; -static cpumask_t cpus_behind; -static cpumask_t cpus_chosen; +static inline bool wd_csd_locked(struct watchdog_cpu_data *wd) +{ + return READ_ONCE(wd->csd.node.u_flags) & CSD_FLAG_LOCK; +} -static void clocksource_verify_choose_cpus(void) +/* + * This is only invoked for remote CPUs. See watchdog_check_cpu_skew(). 
+ */ +static inline u64 wd_get_remote_timeout(unsigned int remote_cpu) { - int cpu, i, n = verify_n_cpus; + unsigned int n1, n2; + u64 ns; + + if (nr_node_ids == 1) + return WATCHDOG_DEFAULT_TIMEOUT_NS; + + n1 = cpu_to_node(smp_processor_id()); + n2 = cpu_to_node(remote_cpu); + ns = WATCHDOG_NUMA_MULTIPLIER_NS * node_distance(n1, n2); + return min(ns, WATCHDOG_NUMA_MAX_TIMEOUT_NS); +} - if (n < 0 || n >= num_online_cpus()) { - /* Check all of the CPUs. */ - cpumask_copy(&cpus_chosen, cpu_online_mask); - cpumask_clear_cpu(smp_processor_id(), &cpus_chosen); +static void __watchdog_check_cpu_skew(struct clocksource *cs, unsigned int cpu) +{ + struct watchdog_cpu_data *wd; + + wd = per_cpu_ptr(&watchdog_cpu_data, cpu); + if (atomic_read(&wd->remote_inprogress) || wd_csd_locked(wd)) { + watchdog_data.result = WD_CPU_TIMEOUT; + return; + } + + atomic_set(&wd->seq, 0); + wd->result = WD_SUCCESS; + wd->cs = cs; + /* Store the current CPU ID for the watchdog test unit */ + cs->wd_cpu = smp_processor_id(); + + wd->timeout_ns = wd_get_remote_timeout(cpu); + + /* Kick the remote CPU into the watchdog function */ + if (WARN_ON_ONCE(smp_call_function_single_async(cpu, &wd->csd))) { + watchdog_data.result = WD_CPU_TIMEOUT; return; } - /* If no checking desired, or no other CPU to check, leave. */ - cpumask_clear(&cpus_chosen); - if (n == 0 || num_online_cpus() <= 1) + scoped_guard(irq) + watchdog_check_skew(wd, 0); + + scoped_guard(raw_spinlock_irq, &watchdog_data.lock) { + watchdog_data.result = wd->result; + memcpy(watchdog_data.cpu_ts, wd->cpu_ts, sizeof(wd->cpu_ts)); + } +} + +static void watchdog_check_cpu_skew(struct clocksource *cs) +{ + unsigned int cpu = watchdog_data.curr_cpu; + + cpu = cpumask_next_wrap(cpu, cpu_online_mask); + watchdog_data.curr_cpu = cpu; + + /* Skip the current CPU. Handles num_online_cpus() == 1 as well */ + if (cpu == smp_processor_id()) return; - /* Make sure to select at least one CPU other than the current CPU. */ - cpu = cpumask_any_but(cpu_online_mask, smp_processor_id()); - if (WARN_ON_ONCE(cpu >= nr_cpu_ids)) + /* Don't interfere with the test mechanics */ + if ((cs->flags & CLOCK_SOURCE_WDTEST) && !(cs->flags & CLOCK_SOURCE_WDTEST_PERCPU)) return; - cpumask_set_cpu(cpu, &cpus_chosen); - /* Force a sane value for the boot parameter. */ - if (n > nr_cpu_ids) - n = nr_cpu_ids; + __watchdog_check_cpu_skew(cs, cpu); +} + +static bool watchdog_check_freq(struct clocksource *cs, bool reset_pending) +{ + unsigned int ppm_shift = SHIFT_4000PPM; + u64 wd_ts0, wd_ts1, cs_ts; + + watchdog_data.result = WD_SUCCESS; + if (!watchdog) { + watchdog_data.result = WD_FREQ_NO_WATCHDOG; + return false; + } + + if (cs->flags & CLOCK_SOURCE_WDTEST_PERCPU) + return true; /* - * Randomly select the specified number of CPUs. If the same - * CPU is selected multiple times, that CPU is checked only once, - * and no replacement CPU is selected. This gracefully handles - * situations where verify_n_cpus is greater than the number of - * CPUs that are currently online. + * If both the clocksource and the watchdog claim they are + * calibrated use 500ppm limit. Uncalibrated clocksources need a + * larger allowance because thefirmware supplied frequencies can be + * way off. 
*/ - for (i = 1; i < n; i++) { - cpu = cpumask_random(cpu_online_mask); - if (!WARN_ON_ONCE(cpu >= nr_cpu_ids)) - cpumask_set_cpu(cpu, &cpus_chosen); + if (watchdog->flags & CLOCK_SOURCE_CALIBRATED && cs->flags & CLOCK_SOURCE_CALIBRATED) + ppm_shift = SHIFT_500PPM; + + for (int retries = 0; retries < WATCHDOG_FREQ_RETRIES; retries++) { + s64 wd_last, cs_last, wd_seq, wd_delta, cs_delta, max_delta; + + scoped_guard(irq) { + wd_ts0 = watchdog->read(watchdog); + cs_ts = cs->read(cs); + wd_ts1 = watchdog->read(watchdog); + } + + wd_last = cs->wd_last; + cs_last = cs->cs_last; + + /* Validate the watchdog readout window */ + wd_seq = cycles_to_nsec_safe(watchdog, wd_ts0, wd_ts1); + if (wd_seq > WATCHDOG_READOUT_MAX_NS) { + /* Store for printout in case all retries fail */ + watchdog_data.wd_seq = wd_seq; + continue; + } + + /* Store for subsequent processing */ + cs->wd_last = wd_ts0; + cs->cs_last = cs_ts; + + /* First round or reset pending? */ + if (!(cs->flags & CLOCK_SOURCE_WATCHDOG) || reset_pending) + goto reset; + + /* Calculate the nanosecond deltas from the last invocation */ + wd_delta = cycles_to_nsec_safe(watchdog, wd_last, wd_ts0); + cs_delta = cycles_to_nsec_safe(cs, cs_last, cs_ts); + + watchdog_data.wd_delta = wd_delta; + watchdog_data.cs_delta = cs_delta; + + /* + * Ensure that the deltas are within the readout limits of + * the clocksource and the watchdog. Long delays can cause + * clocksources to overflow. + */ + max_delta = max(wd_delta, cs_delta); + if (max_delta > cs->max_idle_ns || max_delta > watchdog->max_idle_ns) + goto reset; + + /* + * Calculate and validate the skew against the allowed PPM + * value of the maximum delta plus the watchdog readout + * time. + */ + if (abs(wd_delta - cs_delta) < (max_delta >> ppm_shift) + wd_seq) + return true; + + watchdog_data.result = WD_FREQ_SKEWED; + return false; } - /* Don't verify ourselves. */ - cpumask_clear_cpu(smp_processor_id(), &cpus_chosen); + watchdog_data.result = WD_FREQ_TIMEOUT; + return false; + +reset: + cs->flags |= CLOCK_SOURCE_WATCHDOG; + watchdog_data.result = WD_FREQ_RESET; + return false; } -static void clocksource_verify_one_cpu(void *csin) +/* Synchronization for sched clock */ +static void clocksource_tick_stable(struct clocksource *cs) { - struct clocksource *cs = (struct clocksource *)csin; - - csnow_mid = cs->read(cs); + if (cs == curr_clocksource && cs->tick_stable) + cs->tick_stable(cs); } -void clocksource_verify_percpu(struct clocksource *cs) +/* Conditionaly enable high resolution mode */ +static void clocksource_enable_highres(struct clocksource *cs) { - int64_t cs_nsec, cs_nsec_max = 0, cs_nsec_min = LLONG_MAX; - u64 csnow_begin, csnow_end; - int cpu, testcpu; - s64 delta; + if ((cs->flags & CLOCK_SOURCE_VALID_FOR_HRES) || + !(cs->flags & CLOCK_SOURCE_IS_CONTINUOUS) || + !watchdog || !(watchdog->flags & CLOCK_SOURCE_IS_CONTINUOUS)) + return; + + /* Mark it valid for high-res. */ + cs->flags |= CLOCK_SOURCE_VALID_FOR_HRES; - if (verify_n_cpus == 0) + /* + * Can't schedule work before finished_booting is + * true. clocksource_done_booting will take care of it. 
+ */ + if (!finished_booting) return; - cpumask_clear(&cpus_ahead); - cpumask_clear(&cpus_behind); - cpus_read_lock(); - migrate_disable(); - clocksource_verify_choose_cpus(); - if (cpumask_empty(&cpus_chosen)) { - migrate_enable(); - cpus_read_unlock(); - pr_warn("Not enough CPUs to check clocksource '%s'.\n", cs->name); + + if (cs->flags & CLOCK_SOURCE_WDTEST) return; + + /* + * If this is not the current clocksource let the watchdog thread + * reselect it. Due to the change to high res this clocksource + * might be preferred now. If it is the current clocksource let the + * tick code know about that change. + */ + if (cs != curr_clocksource) { + cs->flags |= CLOCK_SOURCE_RESELECT; + schedule_work(&watchdog_work); + } else { + tick_clock_notify(); } - testcpu = smp_processor_id(); - pr_info("Checking clocksource %s synchronization from CPU %d to CPUs %*pbl.\n", - cs->name, testcpu, cpumask_pr_args(&cpus_chosen)); - preempt_disable(); - for_each_cpu(cpu, &cpus_chosen) { - if (cpu == testcpu) - continue; - csnow_begin = cs->read(cs); - smp_call_function_single(cpu, clocksource_verify_one_cpu, cs, 1); - csnow_end = cs->read(cs); - delta = (s64)((csnow_mid - csnow_begin) & cs->mask); - if (delta < 0) - cpumask_set_cpu(cpu, &cpus_behind); - delta = (csnow_end - csnow_mid) & cs->mask; - if (delta < 0) - cpumask_set_cpu(cpu, &cpus_ahead); - cs_nsec = cycles_to_nsec_safe(cs, csnow_begin, csnow_end); - if (cs_nsec > cs_nsec_max) - cs_nsec_max = cs_nsec; - if (cs_nsec < cs_nsec_min) - cs_nsec_min = cs_nsec; +} + +static DEFINE_RATELIMIT_STATE(ratelimit_state, 5 * HZ, 2); + +static void watchdog_print_freq_timeout(struct clocksource *cs) +{ + if (!__ratelimit(&ratelimit_state)) + return; + pr_info("Watchdog %s read timed out. Readout sequence took: %lluns\n", + watchdog->name, watchdog_data.wd_seq); +} + +static void watchdog_print_freq_skew(struct clocksource *cs) +{ + pr_warn("Marking clocksource %s unstable due to frequency skew\n", cs->name); + pr_warn("Watchdog %20s interval: %16lluns\n", watchdog->name, watchdog_data.wd_delta); + pr_warn("Clocksource %20s interval: %16lluns\n", cs->name, watchdog_data.cs_delta); +} + +static void watchdog_handle_remote_timeout(struct clocksource *cs) +{ + pr_info_once("Watchdog remote CPU %u read timed out\n", watchdog_data.curr_cpu); +} + +static void watchdog_print_remote_skew(struct clocksource *cs) +{ + pr_warn("Marking clocksource %s unstable due to inter CPU skew\n", cs->name); + if (watchdog_data.cpu_ts[0] < watchdog_data.cpu_ts[1]) { + pr_warn("CPU%u %16llu < CPU%u %16llu (cycles)\n", smp_processor_id(), + watchdog_data.cpu_ts[0], watchdog_data.curr_cpu, watchdog_data.cpu_ts[1]); + } else { + pr_warn("CPU%u %16llu < CPU%u %16llu (cycles)\n", watchdog_data.curr_cpu, + watchdog_data.cpu_ts[1], smp_processor_id(), watchdog_data.cpu_ts[0]); } - preempt_enable(); - migrate_enable(); - cpus_read_unlock(); - if (!cpumask_empty(&cpus_ahead)) - pr_warn(" CPUs %*pbl ahead of CPU %d for clocksource %s.\n", - cpumask_pr_args(&cpus_ahead), testcpu, cs->name); - if (!cpumask_empty(&cpus_behind)) - pr_warn(" CPUs %*pbl behind CPU %d for clocksource %s.\n", - cpumask_pr_args(&cpus_behind), testcpu, cs->name); - pr_info(" CPU %d check durations %lldns - %lldns for clocksource %s.\n", - testcpu, cs_nsec_min, cs_nsec_max, cs->name); -} -EXPORT_SYMBOL_GPL(clocksource_verify_percpu); +} -static inline void clocksource_reset_watchdog(void) +static void watchdog_check_result(struct clocksource *cs) { - struct clocksource *cs; + switch (watchdog_data.result) { + case 
WD_SUCCESS: + clocksource_tick_stable(cs); + clocksource_enable_highres(cs); + return; - list_for_each_entry(cs, &watchdog_list, wd_list) + case WD_FREQ_TIMEOUT: + watchdog_print_freq_timeout(cs); + /* Try again later and invalidate the reference timestamps. */ cs->flags &= ~CLOCK_SOURCE_WATCHDOG; -} + return; + case WD_FREQ_NO_WATCHDOG: + case WD_FREQ_RESET: + /* + * Nothing to do when the reference timestamps were reset + * or no watchdog clocksource registered. + */ + return; + + case WD_FREQ_SKEWED: + watchdog_print_freq_skew(cs); + break; + + case WD_CPU_TIMEOUT: + /* Remote check timed out. Try again next cycle. */ + watchdog_handle_remote_timeout(cs); + return; + + case WD_CPU_SKEWED: + watchdog_print_remote_skew(cs); + break; + } + __clocksource_unstable(cs); +} static void clocksource_watchdog(struct timer_list *unused) { - int64_t wd_nsec, cs_nsec, interval; - u64 csnow, wdnow, cslast, wdlast; - int next_cpu, reset_pending; struct clocksource *cs; - enum wd_read_status read_ret; - unsigned long extra_wait = 0; - u32 md; + bool reset_pending; - spin_lock(&watchdog_lock); + guard(spinlock)(&watchdog_lock); if (!watchdog_running) - goto out; + return; reset_pending = atomic_read(&watchdog_reset_pending); list_for_each_entry(cs, &watchdog_list, wd_list) { - /* Clocksource already marked unstable? */ if (cs->flags & CLOCK_SOURCE_UNSTABLE) { if (finished_booting) @@ -446,170 +659,40 @@ static void clocksource_watchdog(struct timer_list *unused) continue; } - read_ret = cs_watchdog_read(cs, &csnow, &wdnow); - - if (read_ret == WD_READ_UNSTABLE) { - /* Clock readout unreliable, so give it up. */ - __clocksource_unstable(cs); - continue; - } - - /* - * When WD_READ_SKIP is returned, it means the system is likely - * under very heavy load, where the latency of reading - * watchdog/clocksource is very big, and affect the accuracy of - * watchdog check. So give system some space and suspend the - * watchdog check for 5 minutes. - */ - if (read_ret == WD_READ_SKIP) { - /* - * As the watchdog timer will be suspended, and - * cs->last could keep unchanged for 5 minutes, reset - * the counters. - */ - clocksource_reset_watchdog(); - extra_wait = HZ * 300; - break; - } - - /* Clocksource initialized ? */ - if (!(cs->flags & CLOCK_SOURCE_WATCHDOG) || - atomic_read(&watchdog_reset_pending)) { - cs->flags |= CLOCK_SOURCE_WATCHDOG; - cs->wd_last = wdnow; - cs->cs_last = csnow; - continue; - } - - wd_nsec = cycles_to_nsec_safe(watchdog, cs->wd_last, wdnow); - cs_nsec = cycles_to_nsec_safe(cs, cs->cs_last, csnow); - wdlast = cs->wd_last; /* save these in case we print them */ - cslast = cs->cs_last; - cs->cs_last = csnow; - cs->wd_last = wdnow; - - if (atomic_read(&watchdog_reset_pending)) - continue; - - /* - * The processing of timer softirqs can get delayed (usually - * on account of ksoftirqd not getting to run in a timely - * manner), which causes the watchdog interval to stretch. - * Skew detection may fail for longer watchdog intervals - * on account of fixed margins being used. - * Some clocksources, e.g. acpi_pm, cannot tolerate - * watchdog intervals longer than a few seconds. 
- */ - interval = max(cs_nsec, wd_nsec); - if (unlikely(interval > WATCHDOG_INTERVAL_MAX_NS)) { - if (system_state > SYSTEM_SCHEDULING && - interval > 2 * watchdog_max_interval) { - watchdog_max_interval = interval; - pr_warn("Long readout interval, skipping watchdog check: cs_nsec: %lld wd_nsec: %lld\n", - cs_nsec, wd_nsec); - } - watchdog_timer.expires = jiffies; - continue; - } - - /* Check the deviation from the watchdog clocksource. */ - md = cs->uncertainty_margin + watchdog->uncertainty_margin; - if (abs(cs_nsec - wd_nsec) > md) { - s64 cs_wd_msec; - s64 wd_msec; - u32 wd_rem; - - pr_warn("timekeeping watchdog on CPU%d: Marking clocksource '%s' as unstable because the skew is too large:\n", - smp_processor_id(), cs->name); - pr_warn(" '%s' wd_nsec: %lld wd_now: %llx wd_last: %llx mask: %llx\n", - watchdog->name, wd_nsec, wdnow, wdlast, watchdog->mask); - pr_warn(" '%s' cs_nsec: %lld cs_now: %llx cs_last: %llx mask: %llx\n", - cs->name, cs_nsec, csnow, cslast, cs->mask); - cs_wd_msec = div_s64_rem(cs_nsec - wd_nsec, 1000 * 1000, &wd_rem); - wd_msec = div_s64_rem(wd_nsec, 1000 * 1000, &wd_rem); - pr_warn(" Clocksource '%s' skewed %lld ns (%lld ms) over watchdog '%s' interval of %lld ns (%lld ms)\n", - cs->name, cs_nsec - wd_nsec, cs_wd_msec, watchdog->name, wd_nsec, wd_msec); - if (curr_clocksource == cs) - pr_warn(" '%s' is current clocksource.\n", cs->name); - else if (curr_clocksource) - pr_warn(" '%s' (not '%s') is current clocksource.\n", curr_clocksource->name, cs->name); - else - pr_warn(" No current clocksource.\n"); - __clocksource_unstable(cs); - continue; + /* Compare against watchdog clocksource if available */ + if (watchdog_check_freq(cs, reset_pending)) { + /* Check for inter CPU skew */ + watchdog_check_cpu_skew(cs); } - if (cs == curr_clocksource && cs->tick_stable) - cs->tick_stable(cs); - - if (!(cs->flags & CLOCK_SOURCE_VALID_FOR_HRES) && - (cs->flags & CLOCK_SOURCE_IS_CONTINUOUS) && - (watchdog->flags & CLOCK_SOURCE_IS_CONTINUOUS)) { - /* Mark it valid for high-res. */ - cs->flags |= CLOCK_SOURCE_VALID_FOR_HRES; - - /* - * clocksource_done_booting() will sort it if - * finished_booting is not set yet. - */ - if (!finished_booting) - continue; - - /* - * If this is not the current clocksource let - * the watchdog thread reselect it. Due to the - * change to high res this clocksource might - * be preferred now. If it is the current - * clocksource let the tick code know about - * that change. - */ - if (cs != curr_clocksource) { - cs->flags |= CLOCK_SOURCE_RESELECT; - schedule_work(&watchdog_work); - } else { - tick_clock_notify(); - } - } + watchdog_check_result(cs); } - /* - * We only clear the watchdog_reset_pending, when we did a - * full cycle through all clocksources. - */ + /* Clear after the full clocksource walk */ if (reset_pending) atomic_dec(&watchdog_reset_pending); - /* - * Cycle through CPUs to check if the CPUs stay synchronized - * to each other. - */ - next_cpu = cpumask_next_wrap(raw_smp_processor_id(), cpu_online_mask); - - /* - * Arm timer if not already pending: could race with concurrent - * pair clocksource_stop_watchdog() clocksource_start_watchdog(). 
- */ + /* Could have been rearmed by a stop/start cycle */ if (!timer_pending(&watchdog_timer)) { - watchdog_timer.expires += WATCHDOG_INTERVAL + extra_wait; - add_timer_on(&watchdog_timer, next_cpu); + watchdog_timer.expires += WATCHDOG_INTERVAL; + add_timer_local(&watchdog_timer); } -out: - spin_unlock(&watchdog_lock); } static inline void clocksource_start_watchdog(void) { - if (watchdog_running || !watchdog || list_empty(&watchdog_list)) + if (watchdog_running || list_empty(&watchdog_list)) return; - timer_setup(&watchdog_timer, clocksource_watchdog, 0); + timer_setup(&watchdog_timer, clocksource_watchdog, TIMER_PINNED); watchdog_timer.expires = jiffies + WATCHDOG_INTERVAL; - add_timer_on(&watchdog_timer, cpumask_first(cpu_online_mask)); + + add_timer_on(&watchdog_timer, get_boot_cpu_id()); watchdog_running = 1; } static inline void clocksource_stop_watchdog(void) { - if (!watchdog_running || (watchdog && !list_empty(&watchdog_list))) + if (!watchdog_running || !list_empty(&watchdog_list)) return; timer_delete(&watchdog_timer); watchdog_running = 0; @@ -697,12 +780,6 @@ static int __clocksource_watchdog_kthread(void) unsigned long flags; int select = 0; - /* Do any required per-CPU skew verification. */ - if (curr_clocksource && - curr_clocksource->flags & CLOCK_SOURCE_UNSTABLE && - curr_clocksource->flags & CLOCK_SOURCE_VERIFY_PERCPU) - clocksource_verify_percpu(curr_clocksource); - spin_lock_irqsave(&watchdog_lock, flags); list_for_each_entry_safe(cs, tmp, &watchdog_list, wd_list) { if (cs->flags & CLOCK_SOURCE_UNSTABLE) { @@ -1023,6 +1100,8 @@ static struct clocksource *clocksource_find_best(bool oneshot, bool skipcur) continue; if (oneshot && !(cs->flags & CLOCK_SOURCE_VALID_FOR_HRES)) continue; + if (cs->flags & CLOCK_SOURCE_WDTEST) + continue; return cs; } return NULL; @@ -1047,6 +1126,8 @@ static void __clocksource_select(bool skipcur) continue; if (strcmp(cs->name, override_name) != 0) continue; + if (cs->flags & CLOCK_SOURCE_WDTEST) + continue; /* * Check to make sure we don't switch to a non-highres * capable clocksource if the tick code is in oneshot @@ -1181,30 +1262,6 @@ void __clocksource_update_freq_scale(struct clocksource *cs, u32 scale, u32 freq cs->freq_khz = div_u64((u64)freq * scale, 1000); } - /* - * If the uncertainty margin is not specified, calculate it. If - * both scale and freq are non-zero, calculate the clock period, but - * bound below at 2*WATCHDOG_MAX_SKEW, that is, 500ppm by default. - * However, if either of scale or freq is zero, be very conservative - * and take the tens-of-milliseconds WATCHDOG_THRESHOLD value - * for the uncertainty margin. Allow stupidly small uncertainty - * margins to be specified by the caller for testing purposes, - * but warn to discourage production use of this capability. - * - * Bottom line: The sum of the uncertainty margins of the - * watchdog clocksource and the clocksource under test will be at - * least 500ppm by default. For more information, please see the - * comment preceding CONFIG_CLOCKSOURCE_WATCHDOG_MAX_SKEW_US above. - */ - if (scale && freq && !cs->uncertainty_margin) { - cs->uncertainty_margin = NSEC_PER_SEC / (scale * freq); - if (cs->uncertainty_margin < 2 * WATCHDOG_MAX_SKEW) - cs->uncertainty_margin = 2 * WATCHDOG_MAX_SKEW; - } else if (!cs->uncertainty_margin) { - cs->uncertainty_margin = WATCHDOG_THRESHOLD; - } - WARN_ON_ONCE(cs->uncertainty_margin < 2 * WATCHDOG_MAX_SKEW); - /* * Ensure clocksources that have large 'mult' values don't overflow * when adjusted. 
diff --git a/kernel/time/jiffies.c b/kernel/time/jiffies.c index a5c7d15fce72..4aebcc80b8e2 100644 --- a/kernel/time/jiffies.c +++ b/kernel/time/jiffies.c @@ -32,7 +32,6 @@ static u64 jiffies_read(struct clocksource *cs) static struct clocksource clocksource_jiffies = { .name = "jiffies", .rating = 1, /* lowest valid rating*/ - .uncertainty_margin = 32 * NSEC_PER_MSEC, .read = jiffies_read, .mask = CLOCKSOURCE_MASK(32), .mult = TICK_NSEC << JIFFIES_SHIFT, /* details above */ -- cgit v1.2.3
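As a footnote to the clocksource watchdog rewrite above, the following is a minimal user-space sketch of the ping-pong monotonicity check it describes for inter-CPU skew detection. It is not the kernel implementation: CLOCK_MONOTONIC stands in for the clocksource under test, plain pthreads replace the SMP function call, the timeout and error handoff of the real code are omitted, and all names and the sequence count are illustrative assumptions.

#include <pthread.h>
#include <stdatomic.h>
#include <stdint.h>
#include <stdio.h>
#include <time.h>

#define MAX_SEQ	10

static _Atomic int seq;
static uint64_t ts[2];			/* ts[0]: "boot CPU" side, ts[1]: "remote" side */

static uint64_t read_clock(void)
{
	struct timespec t;

	/* Stand-in for reading the clocksource under test */
	clock_gettime(CLOCK_MONOTONIC, &t);
	return (uint64_t)t.tv_sec * 1000000000ULL + t.tv_nsec;
}

static int check_skew(int local)
{
	int remote = local ^ 1;

	/* Set the local timestamp so the first iteration works, then signal arrival */
	ts[local] = read_clock();
	atomic_fetch_add(&seq, 1);

	for (int s = local + 2; s < MAX_SEQ; s += 2) {
		while (atomic_load(&seq) < s)	/* wait for the other side's turn to end */
			;
		uint64_t now = read_clock();

		ts[local] = now;
		if (ts[remote] > now)		/* remote stamp lies in our future: skew */
			return -1;
		atomic_fetch_add(&seq, 1);	/* hand over to the other side */
	}
	return 0;
}

static void *remote_side(void *arg)
{
	(void)arg;
	return (void *)(intptr_t)check_skew(1);
}

int main(void)
{
	pthread_t thr;
	void *res;

	pthread_create(&thr, NULL, remote_side, NULL);
	int local = check_skew(0);
	pthread_join(thr, &res);

	printf("skew check: local=%d remote=%d\n", local, (int)(intptr_t)res);
	return 0;
}

The sketch only relies on alternating readouts across the two sides being monotonically increasing, which is why a slow handover hurts the accuracy of detection but not its correctness, matching the reasoning given in the commit message.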