From bd5956166d20adbde3af0f6f265dc2f0ce5f4df9 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 8 Apr 2026 13:53:46 +0200 Subject: hrtimer: Provide hrtimer_start_range_ns_user() Calvin reported an odd NMI watchdog lockup which claims that the CPU locked up in user space. He provided a reproducer, which set's up a timerfd based timer and then rearms it in a loop with an absolute expiry time of 1ns. As the expiry time is in the past, the timer ends up as the first expiring timer in the per CPU hrtimer base and the clockevent device is programmed with the minimum delta value. If the machine is fast enough, this ends up in a endless loop of programming the delta value to the minimum value defined by the clock event device, before the timer interrupt can fire, which starves the interrupt and consequently triggers the lockup detector because the hrtimer callback of the lockup mechanism is never invoked. The clockevents code already has a last resort mechanism to prevent that, but it's sensible to catch such issues before trying to reprogram the clock event device. Provide a variant of hrtimer_start_range_ns(), which sanity checks the timer after queueing it. It does not so before because the timer might be armed and therefore needs to be dequeued. also we optimize for the latest possible point to check, so that the clock event prevention is avoided as much as possible. If the timer is already expired _before_ the clock event is reprogrammed, remove the timer from the queue and signal to the caller that the operation failed by returning false. That allows the caller to take immediate action without going through the loops and hoops of the hrtimer interrupt. The queueing code can't invoke the timer callback as the caller might hold a lock which is taken in the callback. Add a tracepoint which allows to analyze the expired at start situation. Reported-by: Calvin Owens Signed-off-by: Thomas Gleixner Tested-by: Calvin Owens Reviewed-by: Frederic Weisbecker Acked-by: Peter Zijlstra (Intel) Link: https://patch.msgid.link/20260408114951.995031895@kernel.org --- include/linux/hrtimer.h | 20 +++++++++++++++++--- 1 file changed, 17 insertions(+), 3 deletions(-) (limited to 'include/linux/hrtimer.h') diff --git a/include/linux/hrtimer.h b/include/linux/hrtimer.h index 9ced498fefaa..66cc98c773cf 100644 --- a/include/linux/hrtimer.h +++ b/include/linux/hrtimer.h @@ -206,6 +206,9 @@ static inline void destroy_hrtimer_on_stack(struct hrtimer *timer) { } extern void hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim, u64 range_ns, const enum hrtimer_mode mode); +extern bool hrtimer_start_range_ns_user(struct hrtimer *timer, ktime_t tim, + u64 range_ns, const enum hrtimer_mode mode); + /** * hrtimer_start - (re)start an hrtimer * @timer: the timer to be added @@ -223,17 +226,28 @@ static inline void hrtimer_start(struct hrtimer *timer, ktime_t tim, extern int hrtimer_cancel(struct hrtimer *timer); extern int hrtimer_try_to_cancel(struct hrtimer *timer); -static inline void hrtimer_start_expires(struct hrtimer *timer, - enum hrtimer_mode mode) +static inline void hrtimer_start_expires(struct hrtimer *timer, enum hrtimer_mode mode) { - u64 delta; ktime_t soft, hard; + u64 delta; + soft = hrtimer_get_softexpires(timer); hard = hrtimer_get_expires(timer); delta = ktime_to_ns(ktime_sub(hard, soft)); hrtimer_start_range_ns(timer, soft, delta, mode); } +static inline bool hrtimer_start_expires_user(struct hrtimer *timer, enum hrtimer_mode mode) +{ + ktime_t soft, hard; + u64 delta; + + soft = hrtimer_get_softexpires(timer); + hard = hrtimer_get_expires(timer); + delta = ktime_to_ns(ktime_sub(hard, soft)); + return hrtimer_start_range_ns_user(timer, soft, delta, mode); +} + void hrtimer_sleeper_start_expires(struct hrtimer_sleeper *sl, enum hrtimer_mode mode); -- cgit v1.2.3 From 3af1f49f415dcac8c0df8bfc593df0371c219876 Mon Sep 17 00:00:00 2001 From: Thomas Weißschuh Date: Mon, 4 May 2026 08:56:10 +0200 Subject: hrtimer: Return ktime_t from hrtimer_get_next_event()/hrtimer_next_event_without() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit These functions really work in terms of ktime_t and not u64. Change their return types and adapt the callers. Signed-off-by: Thomas Weißschuh Signed-off-by: Thomas Gleixner Link: https://patch.msgid.link/20260504-hrtimer-next_event-v2-1-7a5d0550b42f@linutronix.de --- include/linux/hrtimer.h | 4 ++-- kernel/time/hrtimer.c | 8 ++++---- kernel/time/tick-sched.c | 3 +-- kernel/time/timer.c | 2 +- 4 files changed, 8 insertions(+), 9 deletions(-) (limited to 'include/linux/hrtimer.h') diff --git a/include/linux/hrtimer.h b/include/linux/hrtimer.h index 66cc98c773cf..6862dea0acc5 100644 --- a/include/linux/hrtimer.h +++ b/include/linux/hrtimer.h @@ -268,8 +268,8 @@ static inline ktime_t hrtimer_get_remaining(const struct hrtimer *timer) return __hrtimer_get_remaining(timer, false); } -extern u64 hrtimer_get_next_event(void); -extern u64 hrtimer_next_event_without(const struct hrtimer *exclude); +extern ktime_t hrtimer_get_next_event(void); +extern ktime_t hrtimer_next_event_without(const struct hrtimer *exclude); extern bool hrtimer_active(const struct hrtimer *timer); diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c index 8cfc7aacece1..8a19a61f6fee 100644 --- a/kernel/time/hrtimer.c +++ b/kernel/time/hrtimer.c @@ -1783,10 +1783,10 @@ EXPORT_SYMBOL_GPL(__hrtimer_get_remaining); * * Returns the next expiry time or KTIME_MAX if no timer is pending. */ -u64 hrtimer_get_next_event(void) +ktime_t hrtimer_get_next_event(void) { struct hrtimer_cpu_base *cpu_base = this_cpu_ptr(&hrtimer_bases); - u64 expires = KTIME_MAX; + ktime_t expires = KTIME_MAX; guard(raw_spinlock_irqsave)(&cpu_base->lock); if (!hrtimer_hres_active(cpu_base)) @@ -1802,10 +1802,10 @@ u64 hrtimer_get_next_event(void) * Returns the next expiry time over all timers except for the @exclude one or * KTIME_MAX if none of them is pending. */ -u64 hrtimer_next_event_without(const struct hrtimer *exclude) +ktime_t hrtimer_next_event_without(const struct hrtimer *exclude) { struct hrtimer_cpu_base *cpu_base = this_cpu_ptr(&hrtimer_bases); - u64 expires = KTIME_MAX; + ktime_t expires = KTIME_MAX; unsigned int active; guard(raw_spinlock_irqsave)(&cpu_base->lock); diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index cbbb87a0c6e7..3026a301dff7 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -1407,8 +1407,7 @@ ktime_t tick_nohz_get_sleep_length(ktime_t *delta_next) * If the next highres timer to expire is earlier than 'next_event', the * idle governor needs to know that. */ - next_event = min_t(u64, next_event, - hrtimer_next_event_without(&ts->sched_timer)); + next_event = min(next_event, hrtimer_next_event_without(&ts->sched_timer)); return ktime_sub(next_event, now); } diff --git a/kernel/time/timer.c b/kernel/time/timer.c index 04d928c21aba..655a8c6cd84d 100644 --- a/kernel/time/timer.c +++ b/kernel/time/timer.c @@ -1932,7 +1932,7 @@ static void timer_recalc_next_expiry(struct timer_base *base) */ static u64 cmp_next_hrtimer_event(u64 basem, u64 expires) { - u64 nextevt = hrtimer_get_next_event(); + u64 nextevt = ktime_to_ns(hrtimer_get_next_event()); /* * If high resolution timers are enabled -- cgit v1.2.3