 Documentation/admin-guide/kernel-parameters.txt | 10
 arch/x86/include/asm/time.h                     |  1
 arch/x86/kernel/hpet.c                          |  2
 arch/x86/kernel/tsc.c                           | 55
 drivers/clocksource/acpi_pm.c                   |  6
 kernel/time/Kconfig                             |  6
 kernel/time/clocksource.c                       | 72
 7 files changed, 123 insertions(+), 29 deletions(-)
diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt
index 6cfa6e3996cf..7b4df6d89d3c 100644
--- a/Documentation/admin-guide/kernel-parameters.txt
+++ b/Documentation/admin-guide/kernel-parameters.txt
@@ -6369,6 +6369,16 @@
in situations with strict latency requirements (where
interruptions from clocksource watchdog are not
acceptable).
+ [x86] recalibrate: force recalibration against a HW timer
+ (HPET or PM timer) on systems whose TSC frequency was
+ obtained from HW or FW using either an MSR or CPUID(0x15).
+ Warn if the difference is more than 500 ppm.
+ [x86] watchdog: Use TSC as the watchdog clocksource with
+ which to check other HW timers (HPET or PM timer), but
+ only on systems where TSC has been deemed trustworthy.
+ This will be suppressed by an earlier tsc=nowatchdog and
+ can be overridden by a later tsc=nowatchdog. A console
+ message will flag any such suppression or overriding.
tsc_early_khz= [X86] Skip early TSC calibration and use the given
value instead. Useful when the early TSC frequency discovery
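
A quick way to confirm which clocksource the kernel actually selected after booting with one of these options is the standard sysfs interface. A tiny userspace sketch (not part of this patch) that reads it:

#include <stdio.h>

int main(void)
{
	char buf[64];
	FILE *f = fopen("/sys/devices/system/clocksource/clocksource0/current_clocksource", "r");

	if (!f) {
		perror("fopen");
		return 1;
	}
	if (fgets(buf, sizeof(buf), f))
		printf("active clocksource: %s", buf);  /* e.g. "tsc" */
	fclose(f);
	return 0;
}
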
diff --git a/arch/x86/include/asm/time.h b/arch/x86/include/asm/time.h
index 8ac563abb567..a53961c64a56 100644
--- a/arch/x86/include/asm/time.h
+++ b/arch/x86/include/asm/time.h
@@ -8,6 +8,7 @@
extern void hpet_time_init(void);
extern void time_init(void);
extern bool pit_timer_init(void);
+extern bool tsc_clocksource_watchdog_disabled(void);
extern struct clock_event_device *global_clock_event;
diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c
index 71f336425e58..c8eb1ac5125a 100644
--- a/arch/x86/kernel/hpet.c
+++ b/arch/x86/kernel/hpet.c
@@ -1091,6 +1091,8 @@ int __init hpet_enable(void)
if (!hpet_counting())
goto out_nohpet;
+ if (tsc_clocksource_watchdog_disabled())
+ clocksource_hpet.flags |= CLOCK_SOURCE_MUST_VERIFY;
clocksource_register_hz(&clocksource_hpet, (u32)hpet_freq);
if (id & HPET_ID_LEGSUP) {
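
To make the intent of this hunk concrete, here is a standalone userspace model (not kernel code) of the flag logic: the HPET is marked CLOCK_SOURCE_MUST_VERIFY only when the TSC itself skips verification yet serves as the watchdog. The flag value matches the kernel's definition; everything else is simplified:

#include <stdio.h>
#include <stdbool.h>

#define CLOCK_SOURCE_MUST_VERIFY 0x02	/* same value as include/linux/clocksource.h */

struct cs_model { const char *name; unsigned int flags; };

/* Mirrors tsc_clocksource_watchdog_disabled() added by this patch */
static bool tsc_watchdog_disabled(unsigned int tsc_flags,
				  int tsc_as_watchdog, int no_tsc_watchdog)
{
	return !(tsc_flags & CLOCK_SOURCE_MUST_VERIFY) &&
	       tsc_as_watchdog && !no_tsc_watchdog;
}

int main(void)
{
	struct cs_model hpet = { "hpet", 0 };
	unsigned int tsc_flags = 0;	/* TSC deemed trustworthy: no MUST_VERIFY */
	int tsc_as_watchdog = 1;	/* booted with tsc=watchdog */
	int no_tsc_watchdog = 0;

	if (tsc_watchdog_disabled(tsc_flags, tsc_as_watchdog, no_tsc_watchdog))
		hpet.flags |= CLOCK_SOURCE_MUST_VERIFY;

	printf("%s must be verified: %s\n", hpet.name,
	       (hpet.flags & CLOCK_SOURCE_MUST_VERIFY) ? "yes" : "no");
	return 0;
}
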
diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c
index a78e73da4a74..306c233c98d8 100644
--- a/arch/x86/kernel/tsc.c
+++ b/arch/x86/kernel/tsc.c
@@ -48,6 +48,8 @@ static DEFINE_STATIC_KEY_FALSE(__use_tsc);
int tsc_clocksource_reliable;
+static int __read_mostly tsc_force_recalibrate;
+
static u32 art_to_tsc_numerator;
static u32 art_to_tsc_denominator;
static u64 art_to_tsc_offset;
@@ -292,6 +294,7 @@ __setup("notsc", notsc_setup);
static int no_sched_irq_time;
static int no_tsc_watchdog;
+static int tsc_as_watchdog;
static int __init tsc_setup(char *str)
{
@@ -301,8 +304,22 @@ static int __init tsc_setup(char *str)
no_sched_irq_time = 1;
if (!strcmp(str, "unstable"))
mark_tsc_unstable("boot parameter");
- if (!strcmp(str, "nowatchdog"))
+ if (!strcmp(str, "nowatchdog")) {
no_tsc_watchdog = 1;
+ if (tsc_as_watchdog)
+ pr_alert("%s: Overriding earlier tsc=watchdog with tsc=nowatchdog\n",
+ __func__);
+ tsc_as_watchdog = 0;
+ }
+ if (!strcmp(str, "recalibrate"))
+ tsc_force_recalibrate = 1;
+ if (!strcmp(str, "watchdog")) {
+ if (no_tsc_watchdog)
+ pr_alert("%s: tsc=watchdog overridden by earlier tsc=nowatchdog\n",
+ __func__);
+ else
+ tsc_as_watchdog = 1;
+ }
return 1;
}
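
The two pr_alert() paths make the parsing order-sensitive. A standalone replay (not kernel code) of the same logic shows the effect of the two possible command-line orderings:

#include <stdio.h>
#include <string.h>

static int no_tsc_watchdog, tsc_as_watchdog;

/* Simplified copy of the tsc_setup() logic above */
static void tsc_setup_model(const char *str)
{
	if (!strcmp(str, "nowatchdog")) {
		no_tsc_watchdog = 1;
		if (tsc_as_watchdog)
			printf("Overriding earlier tsc=watchdog with tsc=nowatchdog\n");
		tsc_as_watchdog = 0;
	}
	if (!strcmp(str, "watchdog")) {
		if (no_tsc_watchdog)
			printf("tsc=watchdog overridden by earlier tsc=nowatchdog\n");
		else
			tsc_as_watchdog = 1;
	}
}

int main(void)
{
	/* "tsc=watchdog tsc=nowatchdog": the later option wins */
	const char *cmdline[] = { "watchdog", "nowatchdog" };

	for (unsigned int i = 0; i < sizeof(cmdline) / sizeof(cmdline[0]); i++)
		tsc_setup_model(cmdline[i]);
	printf("tsc_as_watchdog=%d no_tsc_watchdog=%d\n",
	       tsc_as_watchdog, no_tsc_watchdog);
	return 0;
}
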
@@ -1186,6 +1203,12 @@ static void __init tsc_disable_clocksource_watchdog(void)
clocksource_tsc.flags &= ~CLOCK_SOURCE_MUST_VERIFY;
}
+bool tsc_clocksource_watchdog_disabled(void)
+{
+ return !(clocksource_tsc.flags & CLOCK_SOURCE_MUST_VERIFY) &&
+ tsc_as_watchdog && !no_tsc_watchdog;
+}
+
static void __init check_system_tsc_reliable(void)
{
#if defined(CONFIG_MGEODEGX1) || defined(CONFIG_MGEODE_LX) || defined(CONFIG_X86_GENERIC)
@@ -1374,6 +1397,25 @@ restart:
else
freq = calc_pmtimer_ref(delta, ref_start, ref_stop);
+ /* Will hit this only if tsc_force_recalibrate has been set */
+ if (boot_cpu_has(X86_FEATURE_TSC_KNOWN_FREQ)) {
+
+ /* Warn if the deviation exceeds 500 ppm */
+ if (abs(tsc_khz - freq) > (tsc_khz >> 11)) {
+ pr_warn("Warning: TSC freq calibrated by CPUID/MSR differs from what is calibrated by HW timer, please check with vendor!!\n");
+ pr_info("Previous calibrated TSC freq:\t %lu.%03lu MHz\n",
+ (unsigned long)tsc_khz / 1000,
+ (unsigned long)tsc_khz % 1000);
+ }
+
+ pr_info("TSC freq recalibrated by [%s]:\t %lu.%03lu MHz\n",
+ hpet ? "HPET" : "PM_TIMER",
+ (unsigned long)freq / 1000,
+ (unsigned long)freq % 1000);
+
+ return;
+ }
+
/* Make sure we're within 1% */
if (abs(tsc_khz - freq) > tsc_khz/100)
goto out;
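
The "tsc_khz >> 11" threshold is 1/2048 of the frequency, i.e. roughly 488 ppm, a cheap integer approximation of the 500 ppm limit quoted in the documentation hunk. A worked example with an assumed 2.496 GHz TSC and a hypothetical HW-timer result:

#include <stdio.h>
#include <stdlib.h>

int main(void)
{
	long tsc_khz = 2496000;		/* assumed CPUID/MSR-reported frequency */
	long freq    = 2494500;		/* hypothetical HPET/PM-timer result */
	long limit   = tsc_khz >> 11;	/* 1218 kHz, about 488 ppm */

	printf("limit = %ld kHz (%.0f ppm)\n", limit, 1e6 * limit / tsc_khz);
	if (labs(tsc_khz - freq) > limit)
		printf("deviation %ld kHz exceeds limit -> warn\n",
		       labs(tsc_khz - freq));
	else
		printf("deviation %ld kHz within limit\n",
		       labs(tsc_khz - freq));
	return 0;
}
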
@@ -1407,8 +1449,10 @@ static int __init init_tsc_clocksource(void)
if (!boot_cpu_has(X86_FEATURE_TSC) || !tsc_khz)
return 0;
- if (tsc_unstable)
- goto unreg;
+ if (tsc_unstable) {
+ clocksource_unregister(&clocksource_tsc_early);
+ return 0;
+ }
if (boot_cpu_has(X86_FEATURE_NONSTOP_TSC_S3))
clocksource_tsc.flags |= CLOCK_SOURCE_SUSPEND_NONSTOP;
@@ -1421,9 +1465,10 @@ static int __init init_tsc_clocksource(void)
if (boot_cpu_has(X86_FEATURE_ART))
art_related_clocksource = &clocksource_tsc;
clocksource_register_khz(&clocksource_tsc, tsc_khz);
-unreg:
clocksource_unregister(&clocksource_tsc_early);
- return 0;
+
+ if (!tsc_force_recalibrate)
+ return 0;
}
schedule_delayed_work(&tsc_irqwork, 0);
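
The restructuring is easier to see in isolation: registration still happens when the TSC frequency is known, but tsc=recalibrate now falls through to the delayed work instead of returning early. A minimal sketch (not kernel code) of the new flow:

#include <stdio.h>

static int tsc_force_recalibrate;

static void init_tsc_clocksource_model(int tsc_known_freq)
{
	if (tsc_known_freq) {
		printf("register TSC clocksource, unregister early TSC\n");
		if (!tsc_force_recalibrate)
			return;		/* previous behavior: done here */
	}
	printf("schedule tsc_irqwork for HW-timer (re)calibration\n");
}

int main(void)
{
	tsc_force_recalibrate = 1;	/* booted with tsc=recalibrate */
	init_tsc_clocksource_model(1);
	return 0;
}
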
diff --git a/drivers/clocksource/acpi_pm.c b/drivers/clocksource/acpi_pm.c
index 279ddff81ab4..82338773602c 100644
--- a/drivers/clocksource/acpi_pm.c
+++ b/drivers/clocksource/acpi_pm.c
@@ -23,6 +23,7 @@
#include <linux/pci.h>
#include <linux/delay.h>
#include <asm/io.h>
+#include <asm/time.h>
/*
* The I/O port the PMTMR resides at.
@@ -210,8 +211,9 @@ static int __init init_acpi_pm_clocksource(void)
return -ENODEV;
}
- return clocksource_register_hz(&clocksource_acpi_pm,
- PMTMR_TICKS_PER_SEC);
+ if (tsc_clocksource_watchdog_disabled())
+ clocksource_acpi_pm.flags |= CLOCK_SOURCE_MUST_VERIFY;
+ return clocksource_register_hz(&clocksource_acpi_pm, PMTMR_TICKS_PER_SEC);
}
/* We use fs_initcall because we want the PCI fixups to have run
diff --git a/kernel/time/Kconfig b/kernel/time/Kconfig
index a41753be1a2b..bae8f11070be 100644
--- a/kernel/time/Kconfig
+++ b/kernel/time/Kconfig
@@ -200,10 +200,14 @@ config CLOCKSOURCE_WATCHDOG_MAX_SKEW_US
int "Clocksource watchdog maximum allowable skew (in μs)"
depends on CLOCKSOURCE_WATCHDOG
range 50 1000
- default 100
+ default 125
help
Specify the maximum amount of allowable watchdog skew in
microseconds before reporting the clocksource to be unstable.
+ The default is based on a half-second clocksource watchdog
+ interval and NTP's maximum frequency drift of 500 parts
+ per million. If the clocksource is good enough for NTP,
+ it is good enough for the clocksource watchdog!
endmenu
endif
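
The arithmetic behind the new default, sketched below: a 0.5 s watchdog interval at NTP's 500 ppm drift limit permits 250 μs of skew, and since the watchdog check sums the margins of the clocksource under test and the watchdog itself (see the "md = cs->uncertainty_margin + watchdog->uncertainty_margin" line in the clocksource.c hunk below), each side gets half:

#include <stdio.h>

int main(void)
{
	double interval_s = 0.5;	/* WATCHDOG_INTERVAL in seconds */
	double ntp_ppm    = 500.0;	/* NTP max frequency drift */
	double budget_us  = interval_s * ntp_ppm;	/* 250 us total */
	double per_cs_us  = budget_us / 2;		/* 125 us each */

	printf("total skew budget: %.0f us, per clocksource: %.0f us\n",
	       budget_us, per_cs_us);
	return 0;
}
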
diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c
index 9cf32ccda715..91836b727cef 100644
--- a/kernel/time/clocksource.c
+++ b/kernel/time/clocksource.c
@@ -96,6 +96,11 @@ static int finished_booting;
static u64 suspend_start;
/*
+ * Interval: 0.5sec.
+ */
+#define WATCHDOG_INTERVAL (HZ >> 1)
+
+/*
* Threshold: 0.0312s, when doubled: 0.0625s.
* Also a default for cs->uncertainty_margin when registering clocks.
*/
@@ -106,11 +111,14 @@ static u64 suspend_start;
* clocksource surrounding a read of the clocksource being validated.
* This delay could be due to SMIs, NMIs, or to VCPU preemptions. Used as
* a lower bound for cs->uncertainty_margin values when registering clocks.
+ *
+ * The default of 500 parts per million is based on NTP's limits.
+ * If a clocksource is good enough for NTP, it is good enough for us!
*/
#ifdef CONFIG_CLOCKSOURCE_WATCHDOG_MAX_SKEW_US
#define MAX_SKEW_USEC CONFIG_CLOCKSOURCE_WATCHDOG_MAX_SKEW_US
#else
-#define MAX_SKEW_USEC 100
+#define MAX_SKEW_USEC (125 * WATCHDOG_INTERVAL / HZ)
#endif
#define WATCHDOG_MAX_SKEW (MAX_SKEW_USEC * NSEC_PER_USEC)
@@ -140,11 +148,6 @@ static inline void clocksource_watchdog_unlock(unsigned long *flags)
static int clocksource_watchdog_kthread(void *data);
static void __clocksource_change_rating(struct clocksource *cs, int rating);
-/*
- * Interval: 0.5sec.
- */
-#define WATCHDOG_INTERVAL (HZ >> 1)
-
static void clocksource_watchdog_work(struct work_struct *work)
{
/*
@@ -257,8 +260,8 @@ static enum wd_read_status cs_watchdog_read(struct clocksource *cs, u64 *csnow,
goto skip_test;
}
- pr_warn("timekeeping watchdog on CPU%d: %s read-back delay of %lldns, attempt %d, marking unstable\n",
- smp_processor_id(), watchdog->name, wd_delay, nretries);
+ pr_warn("timekeeping watchdog on CPU%d: wd-%s-wd excessive read-back delay of %lldns vs. limit of %ldns, wd-wd read-back delay only %lldns, attempt %d, marking %s unstable\n",
+ smp_processor_id(), cs->name, wd_delay, WATCHDOG_MAX_SKEW, wd_seq_delay, nretries, cs->name);
return WD_READ_UNSTABLE;
skip_test:
@@ -384,6 +387,15 @@ void clocksource_verify_percpu(struct clocksource *cs)
}
EXPORT_SYMBOL_GPL(clocksource_verify_percpu);
+static inline void clocksource_reset_watchdog(void)
+{
+ struct clocksource *cs;
+
+ list_for_each_entry(cs, &watchdog_list, wd_list)
+ cs->flags &= ~CLOCK_SOURCE_WATCHDOG;
+}
+
static void clocksource_watchdog(struct timer_list *unused)
{
u64 csnow, wdnow, cslast, wdlast, delta;
@@ -391,6 +403,7 @@ static void clocksource_watchdog(struct timer_list *unused)
int64_t wd_nsec, cs_nsec;
struct clocksource *cs;
enum wd_read_status read_ret;
+ unsigned long extra_wait = 0;
u32 md;
spin_lock(&watchdog_lock);
@@ -410,13 +423,30 @@ static void clocksource_watchdog(struct timer_list *unused)
read_ret = cs_watchdog_read(cs, &csnow, &wdnow);
- if (read_ret != WD_READ_SUCCESS) {
- if (read_ret == WD_READ_UNSTABLE)
- /* Clock readout unreliable, so give it up. */
- __clocksource_unstable(cs);
+ if (read_ret == WD_READ_UNSTABLE) {
+ /* Clock readout unreliable, so give it up. */
+ __clocksource_unstable(cs);
continue;
}
+ /*
+	 * When WD_READ_SKIP is returned, the system is likely under
+	 * very heavy load, where the latency of reading the
+	 * watchdog/clocksource is very high and compromises the
+	 * accuracy of the watchdog check. So give the system some
+	 * breathing room and suspend the watchdog check for 5 minutes.
+ */
+ if (read_ret == WD_READ_SKIP) {
+ /*
+		 * The watchdog timer will be suspended and cs->last
+		 * could stay unchanged for 5 minutes, so reset
+		 * the counters.
+ */
+ clocksource_reset_watchdog();
+ extra_wait = HZ * 300;
+ break;
+ }
+
/* Clocksource initialized ? */
if (!(cs->flags & CLOCK_SOURCE_WATCHDOG) ||
atomic_read(&watchdog_reset_pending)) {
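
A standalone model (not kernel code) of this suspension path: a skipped read clears the per-clocksource WATCHDOG flags and pushes the next timer expiry out by 300 seconds. The HZ and flag values here are assumed stand-ins:

#include <stdio.h>

#define MODEL_HZ          250		/* assumed CONFIG_HZ */
#define WATCHDOG_INTERVAL (MODEL_HZ >> 1)

enum wd_read_status { WD_READ_SUCCESS, WD_READ_UNSTABLE, WD_READ_SKIP };

int main(void)
{
	unsigned long expires = 1000;	/* jiffy the timer is pending at */
	unsigned long extra_wait = 0;
	enum wd_read_status read_ret = WD_READ_SKIP;	/* heavy-load case */
	unsigned int cs_flags = 0x100;	/* stand-in for CLOCK_SOURCE_WATCHDOG */

	if (read_ret == WD_READ_SKIP) {
		cs_flags &= ~0x100;	/* clocksource_reset_watchdog() */
		extra_wait = MODEL_HZ * 300;
	}
	expires += WATCHDOG_INTERVAL + extra_wait;
	printf("flags=%#x, timer re-armed for jiffy %lu (~%lu s away)\n",
	       cs_flags, expires, (WATCHDOG_INTERVAL + extra_wait) / MODEL_HZ);
	return 0;
}
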
@@ -443,12 +473,20 @@ static void clocksource_watchdog(struct timer_list *unused)
/* Check the deviation from the watchdog clocksource. */
md = cs->uncertainty_margin + watchdog->uncertainty_margin;
if (abs(cs_nsec - wd_nsec) > md) {
+ s64 cs_wd_msec;
+ s64 wd_msec;
+ s32 wd_rem;
+
pr_warn("timekeeping watchdog on CPU%d: Marking clocksource '%s' as unstable because the skew is too large:\n",
smp_processor_id(), cs->name);
pr_warn(" '%s' wd_nsec: %lld wd_now: %llx wd_last: %llx mask: %llx\n",
watchdog->name, wd_nsec, wdnow, wdlast, watchdog->mask);
pr_warn(" '%s' cs_nsec: %lld cs_now: %llx cs_last: %llx mask: %llx\n",
cs->name, cs_nsec, csnow, cslast, cs->mask);
+ cs_wd_msec = div_s64_rem(cs_nsec - wd_nsec, 1000 * 1000, &wd_rem);
+ wd_msec = div_s64_rem(wd_nsec, 1000 * 1000, &wd_rem);
+ pr_warn(" Clocksource '%s' skewed %lld ns (%lld ms) over watchdog '%s' interval of %lld ns (%lld ms)\n",
+ cs->name, cs_nsec - wd_nsec, cs_wd_msec, watchdog->name, wd_nsec, wd_msec);
if (curr_clocksource == cs)
pr_warn(" '%s' is current clocksource.\n", cs->name);
else if (curr_clocksource)
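
The millisecond conversion added here must handle a negative skew, since the clocksource under test may run slow relative to the watchdog (the trigger condition uses abs()), hence the signed 64-bit division helper. A userspace demonstration with a stand-in for the kernel's div_s64_rem():

#include <stdio.h>
#include <inttypes.h>

/* Userspace stand-in for the kernel's div_s64_rem() */
static int64_t div_s64_rem_model(int64_t dividend, int32_t divisor,
				 int32_t *remainder)
{
	*remainder = (int32_t)(dividend % divisor);
	return dividend / divisor;
}

int main(void)
{
	int64_t cs_nsec = 499000000;	/* assumed clocksource interval, ns */
	int64_t wd_nsec = 501500000;	/* assumed watchdog interval, ns */
	int32_t rem;
	int64_t skew_ms = div_s64_rem_model(cs_nsec - wd_nsec, 1000 * 1000, &rem);

	/* Prints: skewed -2500000 ns (-2 ms) */
	printf("skewed %" PRId64 " ns (%" PRId64 " ms)\n",
	       cs_nsec - wd_nsec, skew_ms);
	return 0;
}
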
@@ -512,7 +550,7 @@ static void clocksource_watchdog(struct timer_list *unused)
* pair clocksource_stop_watchdog() clocksource_start_watchdog().
*/
if (!timer_pending(&watchdog_timer)) {
- watchdog_timer.expires += WATCHDOG_INTERVAL;
+ watchdog_timer.expires += WATCHDOG_INTERVAL + extra_wait;
add_timer_on(&watchdog_timer, next_cpu);
}
out:
@@ -537,14 +575,6 @@ static inline void clocksource_stop_watchdog(void)
watchdog_running = 0;
}
-static inline void clocksource_reset_watchdog(void)
-{
- struct clocksource *cs;
-
- list_for_each_entry(cs, &watchdog_list, wd_list)
- cs->flags &= ~CLOCK_SOURCE_WATCHDOG;
-}
-
static void clocksource_resume_watchdog(void)
{
atomic_inc(&watchdog_reset_pending);