From dbd0fbc76c77daac08ddd245afdcbade0d506e19 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Fri, 29 Jun 2018 22:31:10 +0300 Subject: x86/tsc: Add missing header to tsc_msr.c MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add a missing header otherwise compiler warns about missed prototype: CC arch/x86/kernel/tsc_msr.o arch/x86/kernel/tsc_msr.c:73:15: warning: no previous prototype for ‘cpu_khz_from_msr’ [-Wmissing-prototypes] unsigned long cpu_khz_from_msr(void) ^~~~~~~~~~~~~~~~ Signed-off-by: Andy Shevchenko Signed-off-by: Thomas Gleixner Cc: "H. Peter Anvin" Cc: Pavel Tatashin Link: https://lkml.kernel.org/r/20180629193113.84425-4-andriy.shevchenko@linux.intel.com --- arch/x86/kernel/tsc_msr.c | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/tsc_msr.c b/arch/x86/kernel/tsc_msr.c index 19afdbd7d0a7..5532d1be7687 100644 --- a/arch/x86/kernel/tsc_msr.c +++ b/arch/x86/kernel/tsc_msr.c @@ -12,6 +12,7 @@ #include #include #include +#include #define MAX_NUM_FREQS 9 -- cgit v1.2.3 From 397d3ad18dc431456baf8bce96606fa1d18b30b0 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Fri, 29 Jun 2018 22:31:09 +0300 Subject: x86/tsc: Convert to use x86_match_cpu() and INTEL_CPU_FAM6() Move the code to use recently introduced INTEL_CPU_FAM6() macro and drop custom version of x86_match_cpu() function. No functional change intended. Signed-off-by: Andy Shevchenko Signed-off-by: Thomas Gleixner Cc: "H. Peter Anvin" Cc: Pavel Tatashin Link: https://lkml.kernel.org/r/20180629193113.84425-3-andriy.shevchenko@linux.intel.com --- arch/x86/kernel/tsc_msr.c | 83 ++++++++++++++++++++++++----------------------- 1 file changed, 42 insertions(+), 41 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/tsc_msr.c b/arch/x86/kernel/tsc_msr.c index 5532d1be7687..1465aaee543a 100644 --- a/arch/x86/kernel/tsc_msr.c +++ b/arch/x86/kernel/tsc_msr.c @@ -8,9 +8,11 @@ */ #include -#include -#include + #include +#include +#include +#include #include #include @@ -24,44 +26,43 @@ * field msr_plat does. 
*/ struct freq_desc { - u8 x86_family; /* CPU family */ - u8 x86_model; /* model */ u8 msr_plat; /* 1: use MSR_PLATFORM_INFO, 0: MSR_IA32_PERF_STATUS */ u32 freqs[MAX_NUM_FREQS]; }; -static struct freq_desc freq_desc_tables[] = { - /* PNW */ - { 6, 0x27, 0, { 0, 0, 0, 0, 0, 99840, 0, 83200 } }, - /* CLV+ */ - { 6, 0x35, 0, { 0, 133200, 0, 0, 0, 99840, 0, 83200 } }, - /* TNG - Intel Atom processor Z3400 series */ - { 6, 0x4a, 1, { 0, 100000, 133300, 0, 0, 0, 0, 0 } }, - /* VLV2 - Intel Atom processor E3000, Z3600, Z3700 series */ - { 6, 0x37, 1, { 83300, 100000, 133300, 116700, 80000, 0, 0, 0 } }, - /* ANN - Intel Atom processor Z3500 series */ - { 6, 0x5a, 1, { 83300, 100000, 133300, 100000, 0, 0, 0, 0 } }, - /* AMT - Intel Atom processor X7-Z8000 and X5-Z8000 series */ - { 6, 0x4c, 1, { 83300, 100000, 133300, 116700, - 80000, 93300, 90000, 88900, 87500 } }, +static const struct freq_desc freq_desc_pnw = { + 0, { 0, 0, 0, 0, 0, 99840, 0, 83200 } }; -static int match_cpu(u8 family, u8 model) -{ - int i; +static const struct freq_desc freq_desc_clv = { + 0, { 0, 133200, 0, 0, 0, 99840, 0, 83200 } +}; - for (i = 0; i < ARRAY_SIZE(freq_desc_tables); i++) { - if ((family == freq_desc_tables[i].x86_family) && - (model == freq_desc_tables[i].x86_model)) - return i; - } +static const struct freq_desc freq_desc_byt = { + 1, { 83300, 100000, 133300, 116700, 80000, 0, 0, 0 } +}; - return -1; -} +static const struct freq_desc freq_desc_cht = { + 1, { 83300, 100000, 133300, 116700, 80000, 93300, 90000, 88900, 87500 } +}; -/* Map CPU reference clock freq ID(0-7) to CPU reference clock freq(KHz) */ -#define id_to_freq(cpu_index, freq_id) \ - (freq_desc_tables[cpu_index].freqs[freq_id]) +static const struct freq_desc freq_desc_tng = { + 1, { 0, 100000, 133300, 0, 0, 0, 0, 0 } +}; + +static const struct freq_desc freq_desc_ann = { + 1, { 83300, 100000, 133300, 100000, 0, 0, 0, 0 } +}; + +static const struct x86_cpu_id tsc_msr_cpu_ids[] = { + INTEL_CPU_FAM6(ATOM_PENWELL, freq_desc_pnw), + INTEL_CPU_FAM6(ATOM_CLOVERVIEW, freq_desc_clv), + INTEL_CPU_FAM6(ATOM_SILVERMONT1, freq_desc_byt), + INTEL_CPU_FAM6(ATOM_AIRMONT, freq_desc_cht), + INTEL_CPU_FAM6(ATOM_MERRIFIELD, freq_desc_tng), + INTEL_CPU_FAM6(ATOM_MOOREFIELD, freq_desc_ann), + {} +}; /* * MSR-based CPU/TSC frequency discovery for certain CPUs. 
@@ -71,18 +72,17 @@ static int match_cpu(u8 family, u8 model) */ unsigned long cpu_khz_from_msr(void) { - u32 lo, hi, ratio, freq_id, freq; + u32 lo, hi, ratio, freq; + const struct freq_desc *freq_desc; + const struct x86_cpu_id *id; unsigned long res; - int cpu_index; - if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL) + id = x86_match_cpu(tsc_msr_cpu_ids); + if (!id) return 0; - cpu_index = match_cpu(boot_cpu_data.x86, boot_cpu_data.x86_model); - if (cpu_index < 0) - return 0; - - if (freq_desc_tables[cpu_index].msr_plat) { + freq_desc = (struct freq_desc *)id->driver_data; + if (freq_desc->msr_plat) { rdmsr(MSR_PLATFORM_INFO, lo, hi); ratio = (lo >> 8) & 0xff; } else { @@ -92,8 +92,9 @@ unsigned long cpu_khz_from_msr(void) /* Get FSB FREQ ID */ rdmsr(MSR_FSB_FREQ, lo, hi); - freq_id = lo & 0x7; - freq = id_to_freq(cpu_index, freq_id); + + /* Map CPU reference clock freq ID(0-7) to CPU reference clock freq(KHz) */ + freq = freq_desc->freqs[lo & 0x7]; /* TSC frequency = maximum resolved freq * maximum resolved bus ratio */ res = freq * ratio; -- cgit v1.2.3 From 5067b087cf5b2fa4de00443cdc6a66acb28a4953 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Fri, 29 Jun 2018 22:31:11 +0300 Subject: x86/tsc: Use SPDX identifier and update Intel copyright Use SPDX identifier and update year in Intel copyright line. While here, remove file name from the file itself. Signed-off-by: Andy Shevchenko Signed-off-by: Thomas Gleixner Cc: "H. Peter Anvin" Cc: Pavel Tatashin Link: https://lkml.kernel.org/r/20180629193113.84425-5-andriy.shevchenko@linux.intel.com --- arch/x86/kernel/tsc_msr.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/tsc_msr.c b/arch/x86/kernel/tsc_msr.c index 1465aaee543a..f0951c2e9f28 100644 --- a/arch/x86/kernel/tsc_msr.c +++ b/arch/x86/kernel/tsc_msr.c @@ -1,10 +1,9 @@ +// SPDX-License-Identifier: GPL-2.0 /* - * tsc_msr.c - TSC frequency enumeration via MSR + * TSC frequency enumeration via MSR * - * Copyright (C) 2013 Intel Corporation + * Copyright (C) 2013, 2018 Intel Corporation * Author: Bin Gao - * - * This file is released under the GPLv2. */ #include -- cgit v1.2.3 From d99e5da91b36db5c35ddaf3653b280ee060971da Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Fri, 29 Jun 2018 22:31:12 +0300 Subject: x86/platform/intel-mid: Remove custom TSC calibration Since the commit 7da7c1561366 ("x86, tsc: Add static (MSR) TSC calibration on Intel Atom SoCs") introduced a common way for all Intel MID chips to get their TSC frequency via MSRs, there is no need to keep a duplication in each of Intel MID platform code. Thus, remove the custom calibration code for good. Note, there is slight difference in how to get frequency for (reserved?) values in MSRs, i.e. legacy code enforces some defaults while new code just uses 0 in that cases. Suggested-by: Alexander Shishkin Signed-off-by: Andy Shevchenko Signed-off-by: Thomas Gleixner Cc: "H. 
Peter Anvin" Cc: Pavel Tatashin Cc: Bin Gao Link: https://lkml.kernel.org/r/20180629193113.84425-6-andriy.shevchenko@linux.intel.com --- arch/x86/include/asm/intel-mid.h | 14 ------- arch/x86/kernel/tsc_msr.c | 5 +++ arch/x86/platform/intel-mid/intel-mid.c | 6 --- arch/x86/platform/intel-mid/mfld.c | 36 ---------------- arch/x86/platform/intel-mid/mrfld.c | 74 --------------------------------- 5 files changed, 5 insertions(+), 130 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/include/asm/intel-mid.h b/arch/x86/include/asm/intel-mid.h index fe04491130ae..376eb8ada62d 100644 --- a/arch/x86/include/asm/intel-mid.h +++ b/arch/x86/include/asm/intel-mid.h @@ -136,20 +136,6 @@ enum intel_mid_timer_options { extern enum intel_mid_timer_options intel_mid_timer_options; -/* - * Penwell uses spread spectrum clock, so the freq number is not exactly - * the same as reported by MSR based on SDM. - */ -#define FSB_FREQ_83SKU 83200 -#define FSB_FREQ_100SKU 99840 -#define FSB_FREQ_133SKU 133000 - -#define FSB_FREQ_167SKU 167000 -#define FSB_FREQ_200SKU 200000 -#define FSB_FREQ_267SKU 267000 -#define FSB_FREQ_333SKU 333000 -#define FSB_FREQ_400SKU 400000 - /* Bus Select SoC Fuse value */ #define BSEL_SOC_FUSE_MASK 0x7 /* FSB 133MHz */ diff --git a/arch/x86/kernel/tsc_msr.c b/arch/x86/kernel/tsc_msr.c index f0951c2e9f28..27ef714d886c 100644 --- a/arch/x86/kernel/tsc_msr.c +++ b/arch/x86/kernel/tsc_msr.c @@ -29,6 +29,11 @@ struct freq_desc { u32 freqs[MAX_NUM_FREQS]; }; +/* + * Penwell and Clovertrail use spread spectrum clock, + * so the freq number is not exactly the same as reported + * by MSR based on SDM. + */ static const struct freq_desc freq_desc_pnw = { 0, { 0, 0, 0, 0, 0, 99840, 0, 83200 } }; diff --git a/arch/x86/platform/intel-mid/intel-mid.c b/arch/x86/platform/intel-mid/intel-mid.c index 2ebdf31d9996..aac15a4018d5 100644 --- a/arch/x86/platform/intel-mid/intel-mid.c +++ b/arch/x86/platform/intel-mid/intel-mid.c @@ -82,11 +82,6 @@ static void intel_mid_reboot(void) intel_scu_ipc_simple_command(IPCMSG_COLD_RESET, 0); } -static unsigned long __init intel_mid_calibrate_tsc(void) -{ - return 0; -} - static void __init intel_mid_setup_bp_timer(void) { apbt_time_init(); @@ -191,7 +186,6 @@ void __init x86_intel_mid_early_setup(void) x86_cpuinit.setup_percpu_clockev = apbt_setup_secondary_clock; - x86_platform.calibrate_tsc = intel_mid_calibrate_tsc; x86_platform.get_nmi_reason = intel_mid_get_nmi_reason; x86_init.pci.arch_init = intel_mid_pci_init; diff --git a/arch/x86/platform/intel-mid/mfld.c b/arch/x86/platform/intel-mid/mfld.c index e42978d4deaf..e66b51f5c206 100644 --- a/arch/x86/platform/intel-mid/mfld.c +++ b/arch/x86/platform/intel-mid/mfld.c @@ -11,48 +11,12 @@ #include -#include #include -#include #include "intel_mid_weak_decls.h" -static unsigned long __init mfld_calibrate_tsc(void) -{ - unsigned long fast_calibrate; - u32 lo, hi, ratio, fsb; - - rdmsr(MSR_IA32_PERF_STATUS, lo, hi); - pr_debug("IA32 perf status is 0x%x, 0x%0x\n", lo, hi); - ratio = (hi >> 8) & 0x1f; - pr_debug("ratio is %d\n", ratio); - if (!ratio) { - pr_err("read a zero ratio, should be incorrect!\n"); - pr_err("force tsc ratio to 16 ...\n"); - ratio = 16; - } - rdmsr(MSR_FSB_FREQ, lo, hi); - if ((lo & 0x7) == 0x7) - fsb = FSB_FREQ_83SKU; - else - fsb = FSB_FREQ_100SKU; - fast_calibrate = ratio * fsb; - pr_debug("read penwell tsc %lu khz\n", fast_calibrate); - lapic_timer_frequency = fsb * 1000 / HZ; - - /* - * TSC on Intel Atom SoCs is reliable and of known frequency. - * See tsc_msr.c for details. 
- */ - setup_force_cpu_cap(X86_FEATURE_TSC_KNOWN_FREQ); - setup_force_cpu_cap(X86_FEATURE_TSC_RELIABLE); - - return fast_calibrate; -} - static void __init penwell_arch_setup(void) { - x86_platform.calibrate_tsc = mfld_calibrate_tsc; } static struct intel_mid_ops penwell_ops = { diff --git a/arch/x86/platform/intel-mid/mrfld.c b/arch/x86/platform/intel-mid/mrfld.c index ae7bdeb0e507..c5538ec2d62d 100644 --- a/arch/x86/platform/intel-mid/mrfld.c +++ b/arch/x86/platform/intel-mid/mrfld.c @@ -11,86 +11,12 @@ #include -#include #include #include "intel_mid_weak_decls.h" -static unsigned long __init tangier_calibrate_tsc(void) -{ - unsigned long fast_calibrate; - u32 lo, hi, ratio, fsb, bus_freq; - - /* *********************** */ - /* Compute TSC:Ratio * FSB */ - /* *********************** */ - - /* Compute Ratio */ - rdmsr(MSR_PLATFORM_INFO, lo, hi); - pr_debug("IA32 PLATFORM_INFO is 0x%x : %x\n", hi, lo); - - ratio = (lo >> 8) & 0xFF; - pr_debug("ratio is %d\n", ratio); - if (!ratio) { - pr_err("Read a zero ratio, force tsc ratio to 4 ...\n"); - ratio = 4; - } - - /* Compute FSB */ - rdmsr(MSR_FSB_FREQ, lo, hi); - pr_debug("Actual FSB frequency detected by SOC 0x%x : %x\n", - hi, lo); - - bus_freq = lo & 0x7; - pr_debug("bus_freq = 0x%x\n", bus_freq); - - if (bus_freq == 0) - fsb = FSB_FREQ_100SKU; - else if (bus_freq == 1) - fsb = FSB_FREQ_100SKU; - else if (bus_freq == 2) - fsb = FSB_FREQ_133SKU; - else if (bus_freq == 3) - fsb = FSB_FREQ_167SKU; - else if (bus_freq == 4) - fsb = FSB_FREQ_83SKU; - else if (bus_freq == 5) - fsb = FSB_FREQ_400SKU; - else if (bus_freq == 6) - fsb = FSB_FREQ_267SKU; - else if (bus_freq == 7) - fsb = FSB_FREQ_333SKU; - else { - BUG(); - pr_err("Invalid bus_freq! Setting to minimal value!\n"); - fsb = FSB_FREQ_100SKU; - } - - /* TSC = FSB Freq * Resolved HFM Ratio */ - fast_calibrate = ratio * fsb; - pr_debug("calculate tangier tsc %lu KHz\n", fast_calibrate); - - /* ************************************ */ - /* Calculate Local APIC Timer Frequency */ - /* ************************************ */ - lapic_timer_frequency = (fsb * 1000) / HZ; - - pr_debug("Setting lapic_timer_frequency = %d\n", - lapic_timer_frequency); - - /* - * TSC on Intel Atom SoCs is reliable and of known frequency. - * See tsc_msr.c for details. - */ - setup_force_cpu_cap(X86_FEATURE_TSC_KNOWN_FREQ); - setup_force_cpu_cap(X86_FEATURE_TSC_RELIABLE); - - return fast_calibrate; -} - static void __init tangier_arch_setup(void) { - x86_platform.calibrate_tsc = tangier_calibrate_tsc; x86_platform.legacy.rtc = 1; } -- cgit v1.2.3 From 368a540e0232ad446931f5a4e8a5e06f69f21343 Mon Sep 17 00:00:00 2001 From: Pavel Tatashin Date: Thu, 19 Jul 2018 16:55:20 -0400 Subject: x86/kvmclock: Remove memblock dependency KVM clock is initialized later compared to other hypervisor clocks because it has a dependency on the memblock allocator. Bring it in line with other hypervisors by using memory from the BSS instead of allocating it. The benefits: - Remove ifdef from common code - Earlier availability of the clock - Remove dependency on memblock, and reduce code The downside: - Static allocation of the per cpu data structures sized NR_CPUS * 64byte Will be addressed in follow up patches. 
[ tglx: Split out from larger series ] Signed-off-by: Pavel Tatashin Signed-off-by: Thomas Gleixner Acked-by: Paolo Bonzini Cc: steven.sistare@oracle.com Cc: daniel.m.jordan@oracle.com Cc: linux@armlinux.org.uk Cc: schwidefsky@de.ibm.com Cc: heiko.carstens@de.ibm.com Cc: john.stultz@linaro.org Cc: sboyd@codeaurora.org Cc: hpa@zytor.com Cc: douly.fnst@cn.fujitsu.com Cc: peterz@infradead.org Cc: prarit@redhat.com Cc: feng.tang@intel.com Cc: pmladek@suse.com Cc: gnomes@lxorguk.ukuu.org.uk Cc: linux-s390@vger.kernel.org Cc: boris.ostrovsky@oracle.com Cc: jgross@suse.com Link: https://lkml.kernel.org/r/20180719205545.16512-2-pasha.tatashin@oracle.com --- arch/x86/kernel/kvm.c | 1 + arch/x86/kernel/kvmclock.c | 66 ++++++++-------------------------------------- arch/x86/kernel/setup.c | 4 --- 3 files changed, 12 insertions(+), 59 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c index 5b2300b818af..c65c232d3ddd 100644 --- a/arch/x86/kernel/kvm.c +++ b/arch/x86/kernel/kvm.c @@ -628,6 +628,7 @@ const __initconst struct hypervisor_x86 x86_hyper_kvm = { .name = "KVM", .detect = kvm_detect, .type = X86_HYPER_KVM, + .init.init_platform = kvmclock_init, .init.guest_late_init = kvm_guest_init, .init.x2apic_available = kvm_para_available, }; diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c index 3b8e7c13c614..1f6ac5aaa904 100644 --- a/arch/x86/kernel/kvmclock.c +++ b/arch/x86/kernel/kvmclock.c @@ -23,9 +23,9 @@ #include #include #include -#include #include #include +#include #include #include @@ -44,6 +44,13 @@ static int parse_no_kvmclock(char *arg) } early_param("no-kvmclock", parse_no_kvmclock); +/* Aligned to page sizes to match whats mapped via vsyscalls to userspace */ +#define HV_CLOCK_SIZE (sizeof(struct pvclock_vsyscall_time_info) * NR_CPUS) +#define WALL_CLOCK_SIZE (sizeof(struct pvclock_wall_clock)) + +static u8 hv_clock_mem[PAGE_ALIGN(HV_CLOCK_SIZE)] __aligned(PAGE_SIZE); +static u8 wall_clock_mem[PAGE_ALIGN(WALL_CLOCK_SIZE)] __aligned(PAGE_SIZE); + /* The hypervisor will put information about time periodically here */ static struct pvclock_vsyscall_time_info *hv_clock; static struct pvclock_wall_clock *wall_clock; @@ -245,43 +252,12 @@ static void kvm_shutdown(void) native_machine_shutdown(); } -static phys_addr_t __init kvm_memblock_alloc(phys_addr_t size, - phys_addr_t align) -{ - phys_addr_t mem; - - mem = memblock_alloc(size, align); - if (!mem) - return 0; - - if (sev_active()) { - if (early_set_memory_decrypted((unsigned long)__va(mem), size)) - goto e_free; - } - - return mem; -e_free: - memblock_free(mem, size); - return 0; -} - -static void __init kvm_memblock_free(phys_addr_t addr, phys_addr_t size) -{ - if (sev_active()) - early_set_memory_encrypted((unsigned long)__va(addr), size); - - memblock_free(addr, size); -} - void __init kvmclock_init(void) { struct pvclock_vcpu_time_info *vcpu_time; - unsigned long mem, mem_wall_clock; - int size, cpu, wall_clock_size; + int cpu; u8 flags; - size = PAGE_ALIGN(sizeof(struct pvclock_vsyscall_time_info)*NR_CPUS); - if (!kvm_para_available()) return; @@ -291,28 +267,11 @@ void __init kvmclock_init(void) } else if (!(kvmclock && kvm_para_has_feature(KVM_FEATURE_CLOCKSOURCE))) return; - wall_clock_size = PAGE_ALIGN(sizeof(struct pvclock_wall_clock)); - mem_wall_clock = kvm_memblock_alloc(wall_clock_size, PAGE_SIZE); - if (!mem_wall_clock) - return; - - wall_clock = __va(mem_wall_clock); - memset(wall_clock, 0, wall_clock_size); - - mem = kvm_memblock_alloc(size, 
PAGE_SIZE); - if (!mem) { - kvm_memblock_free(mem_wall_clock, wall_clock_size); - wall_clock = NULL; - return; - } - - hv_clock = __va(mem); - memset(hv_clock, 0, size); + wall_clock = (struct pvclock_wall_clock *)wall_clock_mem; + hv_clock = (struct pvclock_vsyscall_time_info *)hv_clock_mem; if (kvm_register_clock("primary cpu clock")) { hv_clock = NULL; - kvm_memblock_free(mem, size); - kvm_memblock_free(mem_wall_clock, wall_clock_size); wall_clock = NULL; return; } @@ -357,13 +316,10 @@ int __init kvm_setup_vsyscall_timeinfo(void) int cpu; u8 flags; struct pvclock_vcpu_time_info *vcpu_time; - unsigned int size; if (!hv_clock) return 0; - size = PAGE_ALIGN(sizeof(struct pvclock_vsyscall_time_info)*NR_CPUS); - cpu = get_cpu(); vcpu_time = &hv_clock[cpu].pvti; diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index 2f86d883dd95..da1dbd99cb6e 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -1197,10 +1197,6 @@ void __init setup_arch(char **cmdline_p) memblock_find_dma_reserve(); -#ifdef CONFIG_KVM_GUEST - kvmclock_init(); -#endif - tsc_early_delay_calibrate(); if (!early_xdbc_setup_hardware()) early_xdbc_register_console(); -- cgit v1.2.3 From 7ef363a39514ed8a6f2333fbae1875ac0953715a Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 19 Jul 2018 16:55:21 -0400 Subject: x86/kvmclock: Remove page size requirement from wall_clock There is no requirement for wall_clock data to be page aligned or page sized. Signed-off-by: Thomas Gleixner Signed-off-by: Pavel Tatashin Acked-by: Paolo Bonzini Cc: steven.sistare@oracle.com Cc: daniel.m.jordan@oracle.com Cc: linux@armlinux.org.uk Cc: schwidefsky@de.ibm.com Cc: heiko.carstens@de.ibm.com Cc: john.stultz@linaro.org Cc: sboyd@codeaurora.org Cc: hpa@zytor.com Cc: douly.fnst@cn.fujitsu.com Cc: peterz@infradead.org Cc: prarit@redhat.com Cc: feng.tang@intel.com Cc: pmladek@suse.com Cc: gnomes@lxorguk.ukuu.org.uk Cc: linux-s390@vger.kernel.org Cc: boris.ostrovsky@oracle.com Cc: jgross@suse.com Link: https://lkml.kernel.org/r/20180719205545.16512-3-pasha.tatashin@oracle.com --- arch/x86/kernel/kvmclock.c | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c index 1f6ac5aaa904..a995d7d7164c 100644 --- a/arch/x86/kernel/kvmclock.c +++ b/arch/x86/kernel/kvmclock.c @@ -46,14 +46,12 @@ early_param("no-kvmclock", parse_no_kvmclock); /* Aligned to page sizes to match whats mapped via vsyscalls to userspace */ #define HV_CLOCK_SIZE (sizeof(struct pvclock_vsyscall_time_info) * NR_CPUS) -#define WALL_CLOCK_SIZE (sizeof(struct pvclock_wall_clock)) static u8 hv_clock_mem[PAGE_ALIGN(HV_CLOCK_SIZE)] __aligned(PAGE_SIZE); -static u8 wall_clock_mem[PAGE_ALIGN(WALL_CLOCK_SIZE)] __aligned(PAGE_SIZE); /* The hypervisor will put information about time periodically here */ static struct pvclock_vsyscall_time_info *hv_clock; -static struct pvclock_wall_clock *wall_clock; +static struct pvclock_wall_clock wall_clock; /* * The wallclock is the time of day when we booted. 
Since then, some time may @@ -66,15 +64,15 @@ static void kvm_get_wallclock(struct timespec64 *now) int low, high; int cpu; - low = (int)slow_virt_to_phys(wall_clock); - high = ((u64)slow_virt_to_phys(wall_clock) >> 32); + low = (int)slow_virt_to_phys(&wall_clock); + high = ((u64)slow_virt_to_phys(&wall_clock) >> 32); native_write_msr(msr_kvm_wall_clock, low, high); cpu = get_cpu(); vcpu_time = &hv_clock[cpu].pvti; - pvclock_read_wallclock(wall_clock, vcpu_time, now); + pvclock_read_wallclock(&wall_clock, vcpu_time, now); put_cpu(); } @@ -267,12 +265,10 @@ void __init kvmclock_init(void) } else if (!(kvmclock && kvm_para_has_feature(KVM_FEATURE_CLOCKSOURCE))) return; - wall_clock = (struct pvclock_wall_clock *)wall_clock_mem; hv_clock = (struct pvclock_vsyscall_time_info *)hv_clock_mem; if (kvm_register_clock("primary cpu clock")) { hv_clock = NULL; - wall_clock = NULL; return; } -- cgit v1.2.3 From 7a5ddc8fe0ea9518cd7fb6a929cac7d864c6f300 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 19 Jul 2018 16:55:22 -0400 Subject: x86/kvmclock: Decrapify kvm_register_clock() The return value is pointless because the wrmsr cannot fail if KVM_FEATURE_CLOCKSOURCE or KVM_FEATURE_CLOCKSOURCE2 are set. kvm_register_clock() is only called locally so wants to be static. Signed-off-by: Thomas Gleixner Signed-off-by: Pavel Tatashin Acked-by: Paolo Bonzini Cc: steven.sistare@oracle.com Cc: daniel.m.jordan@oracle.com Cc: linux@armlinux.org.uk Cc: schwidefsky@de.ibm.com Cc: heiko.carstens@de.ibm.com Cc: john.stultz@linaro.org Cc: sboyd@codeaurora.org Cc: hpa@zytor.com Cc: douly.fnst@cn.fujitsu.com Cc: peterz@infradead.org Cc: prarit@redhat.com Cc: feng.tang@intel.com Cc: pmladek@suse.com Cc: gnomes@lxorguk.ukuu.org.uk Cc: linux-s390@vger.kernel.org Cc: boris.ostrovsky@oracle.com Cc: jgross@suse.com Link: https://lkml.kernel.org/r/20180719205545.16512-4-pasha.tatashin@oracle.com --- arch/x86/include/asm/kvm_para.h | 1 - arch/x86/kernel/kvmclock.c | 33 ++++++++++----------------------- 2 files changed, 10 insertions(+), 24 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/include/asm/kvm_para.h b/arch/x86/include/asm/kvm_para.h index 3aea2658323a..4c723632c036 100644 --- a/arch/x86/include/asm/kvm_para.h +++ b/arch/x86/include/asm/kvm_para.h @@ -7,7 +7,6 @@ #include extern void kvmclock_init(void); -extern int kvm_register_clock(char *txt); #ifdef CONFIG_KVM_GUEST bool kvm_check_and_clear_guest_paused(void); diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c index a995d7d7164c..f0a0aef5e9fa 100644 --- a/arch/x86/kernel/kvmclock.c +++ b/arch/x86/kernel/kvmclock.c @@ -187,23 +187,19 @@ struct clocksource kvm_clock = { }; EXPORT_SYMBOL_GPL(kvm_clock); -int kvm_register_clock(char *txt) +static void kvm_register_clock(char *txt) { - int cpu = smp_processor_id(); - int low, high, ret; struct pvclock_vcpu_time_info *src; + int cpu = smp_processor_id(); + u64 pa; if (!hv_clock) - return 0; + return; src = &hv_clock[cpu].pvti; - low = (int)slow_virt_to_phys(src) | 1; - high = ((u64)slow_virt_to_phys(src) >> 32); - ret = native_write_msr_safe(msr_kvm_system_time, low, high); - printk(KERN_INFO "kvm-clock: cpu %d, msr %x:%x, %s\n", - cpu, high, low, txt); - - return ret; + pa = slow_virt_to_phys(src) | 0x01ULL; + wrmsrl(msr_kvm_system_time, pa); + pr_info("kvm-clock: cpu %d, msr %llx, %s\n", cpu, pa, txt); } static void kvm_save_sched_clock_state(void) @@ -218,11 +214,7 @@ static void kvm_restore_sched_clock_state(void) #ifdef CONFIG_X86_LOCAL_APIC static void 
kvm_setup_secondary_clock(void) { - /* - * Now that the first cpu already had this clocksource initialized, - * we shouldn't fail. - */ - WARN_ON(kvm_register_clock("secondary cpu clock")); + kvm_register_clock("secondary cpu clock"); } #endif @@ -265,16 +257,11 @@ void __init kvmclock_init(void) } else if (!(kvmclock && kvm_para_has_feature(KVM_FEATURE_CLOCKSOURCE))) return; - hv_clock = (struct pvclock_vsyscall_time_info *)hv_clock_mem; - - if (kvm_register_clock("primary cpu clock")) { - hv_clock = NULL; - return; - } - printk(KERN_INFO "kvm-clock: Using msrs %x and %x", msr_kvm_system_time, msr_kvm_wall_clock); + hv_clock = (struct pvclock_vsyscall_time_info *)hv_clock_mem; + kvm_register_clock("primary cpu clock"); pvclock_set_pvti_cpu0_va(hv_clock); if (kvm_para_has_feature(KVM_FEATURE_CLOCKSOURCE_STABLE_BIT)) -- cgit v1.2.3 From 146c394d0c3c8e88df433a179c2b0b85fd8cf247 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 19 Jul 2018 16:55:23 -0400 Subject: x86/kvmclock: Cleanup the code - Cleanup the mrs write for wall clock. The type casts to (int) are sloppy because the wrmsr parameters are u32 and aside of that wrmsrl() already provides the high/low split for free. - Remove the pointless get_cpu()/put_cpu() dance from various functions. Either they are called during early init where CPU is guaranteed to be 0 or they are already called from non preemptible context where smp_processor_id() can be used safely - Simplify the convoluted check for kvmclock in the init function. - Mark the parameter parsing function __init. No point in keeping it around. - Convert to pr_info() Signed-off-by: Thomas Gleixner Signed-off-by: Pavel Tatashin Acked-by: Paolo Bonzini Cc: steven.sistare@oracle.com Cc: daniel.m.jordan@oracle.com Cc: linux@armlinux.org.uk Cc: schwidefsky@de.ibm.com Cc: heiko.carstens@de.ibm.com Cc: john.stultz@linaro.org Cc: sboyd@codeaurora.org Cc: hpa@zytor.com Cc: douly.fnst@cn.fujitsu.com Cc: peterz@infradead.org Cc: prarit@redhat.com Cc: feng.tang@intel.com Cc: pmladek@suse.com Cc: gnomes@lxorguk.ukuu.org.uk Cc: linux-s390@vger.kernel.org Cc: boris.ostrovsky@oracle.com Cc: jgross@suse.com Link: https://lkml.kernel.org/r/20180719205545.16512-5-pasha.tatashin@oracle.com --- arch/x86/kernel/kvmclock.c | 72 ++++++++++++++-------------------------------- 1 file changed, 22 insertions(+), 50 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c index f0a0aef5e9fa..4afb03e49a4f 100644 --- a/arch/x86/kernel/kvmclock.c +++ b/arch/x86/kernel/kvmclock.c @@ -37,7 +37,7 @@ static int msr_kvm_system_time = MSR_KVM_SYSTEM_TIME; static int msr_kvm_wall_clock = MSR_KVM_WALL_CLOCK; static u64 kvm_sched_clock_offset; -static int parse_no_kvmclock(char *arg) +static int __init parse_no_kvmclock(char *arg) { kvmclock = 0; return 0; @@ -61,13 +61,9 @@ static struct pvclock_wall_clock wall_clock; static void kvm_get_wallclock(struct timespec64 *now) { struct pvclock_vcpu_time_info *vcpu_time; - int low, high; int cpu; - low = (int)slow_virt_to_phys(&wall_clock); - high = ((u64)slow_virt_to_phys(&wall_clock) >> 32); - - native_write_msr(msr_kvm_wall_clock, low, high); + wrmsrl(msr_kvm_wall_clock, slow_virt_to_phys(&wall_clock)); cpu = get_cpu(); @@ -117,11 +113,11 @@ static inline void kvm_sched_clock_init(bool stable) kvm_sched_clock_offset = kvm_clock_read(); pv_time_ops.sched_clock = kvm_sched_clock_read; - printk(KERN_INFO "kvm-clock: using sched offset of %llu cycles\n", - kvm_sched_clock_offset); + pr_info("kvm-clock: using sched 
offset of %llu cycles", + kvm_sched_clock_offset); BUILD_BUG_ON(sizeof(kvm_sched_clock_offset) > - sizeof(((struct pvclock_vcpu_time_info *)NULL)->system_time)); + sizeof(((struct pvclock_vcpu_time_info *)NULL)->system_time)); } /* @@ -135,16 +131,8 @@ static inline void kvm_sched_clock_init(bool stable) */ static unsigned long kvm_get_tsc_khz(void) { - struct pvclock_vcpu_time_info *src; - int cpu; - unsigned long tsc_khz; - - cpu = get_cpu(); - src = &hv_clock[cpu].pvti; - tsc_khz = pvclock_tsc_khz(src); - put_cpu(); setup_force_cpu_cap(X86_FEATURE_TSC_KNOWN_FREQ); - return tsc_khz; + return pvclock_tsc_khz(&hv_clock[0].pvti); } static void kvm_get_preset_lpj(void) @@ -161,29 +149,27 @@ static void kvm_get_preset_lpj(void) bool kvm_check_and_clear_guest_paused(void) { - bool ret = false; struct pvclock_vcpu_time_info *src; - int cpu = smp_processor_id(); + bool ret = false; if (!hv_clock) return ret; - src = &hv_clock[cpu].pvti; + src = &hv_clock[smp_processor_id()].pvti; if ((src->flags & PVCLOCK_GUEST_STOPPED) != 0) { src->flags &= ~PVCLOCK_GUEST_STOPPED; pvclock_touch_watchdogs(); ret = true; } - return ret; } struct clocksource kvm_clock = { - .name = "kvm-clock", - .read = kvm_clock_get_cycles, - .rating = 400, - .mask = CLOCKSOURCE_MASK(64), - .flags = CLOCK_SOURCE_IS_CONTINUOUS, + .name = "kvm-clock", + .read = kvm_clock_get_cycles, + .rating = 400, + .mask = CLOCKSOURCE_MASK(64), + .flags = CLOCK_SOURCE_IS_CONTINUOUS, }; EXPORT_SYMBOL_GPL(kvm_clock); @@ -199,7 +185,7 @@ static void kvm_register_clock(char *txt) src = &hv_clock[cpu].pvti; pa = slow_virt_to_phys(src) | 0x01ULL; wrmsrl(msr_kvm_system_time, pa); - pr_info("kvm-clock: cpu %d, msr %llx, %s\n", cpu, pa, txt); + pr_info("kvm-clock: cpu %d, msr %llx, %s", cpu, pa, txt); } static void kvm_save_sched_clock_state(void) @@ -244,20 +230,19 @@ static void kvm_shutdown(void) void __init kvmclock_init(void) { - struct pvclock_vcpu_time_info *vcpu_time; - int cpu; u8 flags; - if (!kvm_para_available()) + if (!kvm_para_available() || !kvmclock) return; - if (kvmclock && kvm_para_has_feature(KVM_FEATURE_CLOCKSOURCE2)) { + if (kvm_para_has_feature(KVM_FEATURE_CLOCKSOURCE2)) { msr_kvm_system_time = MSR_KVM_SYSTEM_TIME_NEW; msr_kvm_wall_clock = MSR_KVM_WALL_CLOCK_NEW; - } else if (!(kvmclock && kvm_para_has_feature(KVM_FEATURE_CLOCKSOURCE))) + } else if (!kvm_para_has_feature(KVM_FEATURE_CLOCKSOURCE)) { return; + } - printk(KERN_INFO "kvm-clock: Using msrs %x and %x", + pr_info("kvm-clock: Using msrs %x and %x", msr_kvm_system_time, msr_kvm_wall_clock); hv_clock = (struct pvclock_vsyscall_time_info *)hv_clock_mem; @@ -267,20 +252,15 @@ void __init kvmclock_init(void) if (kvm_para_has_feature(KVM_FEATURE_CLOCKSOURCE_STABLE_BIT)) pvclock_set_flags(PVCLOCK_TSC_STABLE_BIT); - cpu = get_cpu(); - vcpu_time = &hv_clock[cpu].pvti; - flags = pvclock_read_flags(vcpu_time); - + flags = pvclock_read_flags(&hv_clock[0].pvti); kvm_sched_clock_init(flags & PVCLOCK_TSC_STABLE_BIT); - put_cpu(); x86_platform.calibrate_tsc = kvm_get_tsc_khz; x86_platform.calibrate_cpu = kvm_get_tsc_khz; x86_platform.get_wallclock = kvm_get_wallclock; x86_platform.set_wallclock = kvm_set_wallclock; #ifdef CONFIG_X86_LOCAL_APIC - x86_cpuinit.early_percpu_clock_init = - kvm_setup_secondary_clock; + x86_cpuinit.early_percpu_clock_init = kvm_setup_secondary_clock; #endif x86_platform.save_sched_clock_state = kvm_save_sched_clock_state; x86_platform.restore_sched_clock_state = kvm_restore_sched_clock_state; @@ -296,20 +276,12 @@ void __init kvmclock_init(void) int __init 
kvm_setup_vsyscall_timeinfo(void) { #ifdef CONFIG_X86_64 - int cpu; u8 flags; - struct pvclock_vcpu_time_info *vcpu_time; if (!hv_clock) return 0; - cpu = get_cpu(); - - vcpu_time = &hv_clock[cpu].pvti; - flags = pvclock_read_flags(vcpu_time); - - put_cpu(); - + flags = pvclock_read_flags(&hv_clock[0].pvti); if (!(flags & PVCLOCK_TSC_STABLE_BIT)) return 1; -- cgit v1.2.3 From 42f8df935efefba51d0c5321b1325436523e3377 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 19 Jul 2018 16:55:24 -0400 Subject: x86/kvmclock: Mark variables __initdata and __ro_after_init The kvmclock parameter is init data and the other variables are not modified after init. Signed-off-by: Thomas Gleixner Signed-off-by: Pavel Tatashin Acked-by: Paolo Bonzini Cc: steven.sistare@oracle.com Cc: daniel.m.jordan@oracle.com Cc: linux@armlinux.org.uk Cc: schwidefsky@de.ibm.com Cc: heiko.carstens@de.ibm.com Cc: john.stultz@linaro.org Cc: sboyd@codeaurora.org Cc: hpa@zytor.com Cc: douly.fnst@cn.fujitsu.com Cc: peterz@infradead.org Cc: prarit@redhat.com Cc: feng.tang@intel.com Cc: pmladek@suse.com Cc: gnomes@lxorguk.ukuu.org.uk Cc: linux-s390@vger.kernel.org Cc: boris.ostrovsky@oracle.com Cc: jgross@suse.com Link: https://lkml.kernel.org/r/20180719205545.16512-6-pasha.tatashin@oracle.com --- arch/x86/kernel/kvmclock.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c index 4afb03e49a4f..78aec160f5e0 100644 --- a/arch/x86/kernel/kvmclock.c +++ b/arch/x86/kernel/kvmclock.c @@ -32,10 +32,10 @@ #include #include -static int kvmclock __ro_after_init = 1; -static int msr_kvm_system_time = MSR_KVM_SYSTEM_TIME; -static int msr_kvm_wall_clock = MSR_KVM_WALL_CLOCK; -static u64 kvm_sched_clock_offset; +static int kvmclock __initdata = 1; +static int msr_kvm_system_time __ro_after_init = MSR_KVM_SYSTEM_TIME; +static int msr_kvm_wall_clock __ro_after_init = MSR_KVM_WALL_CLOCK; +static u64 kvm_sched_clock_offset __ro_after_init; static int __init parse_no_kvmclock(char *arg) { @@ -50,7 +50,7 @@ early_param("no-kvmclock", parse_no_kvmclock); static u8 hv_clock_mem[PAGE_ALIGN(HV_CLOCK_SIZE)] __aligned(PAGE_SIZE); /* The hypervisor will put information about time periodically here */ -static struct pvclock_vsyscall_time_info *hv_clock; +static struct pvclock_vsyscall_time_info *hv_clock __ro_after_init; static struct pvclock_wall_clock wall_clock; /* -- cgit v1.2.3 From e499a9b6dc488aff7f284bee51936f510ab7ad15 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 19 Jul 2018 16:55:25 -0400 Subject: x86/kvmclock: Move kvmclock vsyscall param and init to kvmclock There is no point to have this in the kvm code itself and call it from there. This can be called from an initcall and the parameter is cleared when the hypervisor is not KVM. 
Signed-off-by: Thomas Gleixner Signed-off-by: Pavel Tatashin Acked-by: Paolo Bonzini Cc: steven.sistare@oracle.com Cc: daniel.m.jordan@oracle.com Cc: linux@armlinux.org.uk Cc: schwidefsky@de.ibm.com Cc: heiko.carstens@de.ibm.com Cc: john.stultz@linaro.org Cc: sboyd@codeaurora.org Cc: hpa@zytor.com Cc: douly.fnst@cn.fujitsu.com Cc: peterz@infradead.org Cc: prarit@redhat.com Cc: feng.tang@intel.com Cc: pmladek@suse.com Cc: gnomes@lxorguk.ukuu.org.uk Cc: linux-s390@vger.kernel.org Cc: boris.ostrovsky@oracle.com Cc: jgross@suse.com Link: https://lkml.kernel.org/r/20180719205545.16512-7-pasha.tatashin@oracle.com --- arch/x86/include/asm/kvm_guest.h | 7 ------- arch/x86/kernel/kvm.c | 13 ------------ arch/x86/kernel/kvmclock.c | 44 ++++++++++++++++++++++++---------------- 3 files changed, 27 insertions(+), 37 deletions(-) delete mode 100644 arch/x86/include/asm/kvm_guest.h (limited to 'arch/x86/kernel') diff --git a/arch/x86/include/asm/kvm_guest.h b/arch/x86/include/asm/kvm_guest.h deleted file mode 100644 index 46185263d9c2..000000000000 --- a/arch/x86/include/asm/kvm_guest.h +++ /dev/null @@ -1,7 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _ASM_X86_KVM_GUEST_H -#define _ASM_X86_KVM_GUEST_H - -int kvm_setup_vsyscall_timeinfo(void); - -#endif /* _ASM_X86_KVM_GUEST_H */ diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c index c65c232d3ddd..a560750cc76f 100644 --- a/arch/x86/kernel/kvm.c +++ b/arch/x86/kernel/kvm.c @@ -45,7 +45,6 @@ #include #include #include -#include static int kvmapf = 1; @@ -66,15 +65,6 @@ static int __init parse_no_stealacc(char *arg) early_param("no-steal-acc", parse_no_stealacc); -static int kvmclock_vsyscall = 1; -static int __init parse_no_kvmclock_vsyscall(char *arg) -{ - kvmclock_vsyscall = 0; - return 0; -} - -early_param("no-kvmclock-vsyscall", parse_no_kvmclock_vsyscall); - static DEFINE_PER_CPU_DECRYPTED(struct kvm_vcpu_pv_apf_data, apf_reason) __aligned(64); static DEFINE_PER_CPU_DECRYPTED(struct kvm_steal_time, steal_time) __aligned(64); static int has_steal_clock = 0; @@ -560,9 +550,6 @@ static void __init kvm_guest_init(void) if (kvm_para_has_feature(KVM_FEATURE_PV_EOI)) apic_set_eoi_write(kvm_guest_apic_eoi_write); - if (kvmclock_vsyscall) - kvm_setup_vsyscall_timeinfo(); - #ifdef CONFIG_SMP smp_ops.smp_prepare_cpus = kvm_smp_prepare_cpus; smp_ops.smp_prepare_boot_cpu = kvm_smp_prepare_boot_cpu; diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c index 78aec160f5e0..7d690d2238f8 100644 --- a/arch/x86/kernel/kvmclock.c +++ b/arch/x86/kernel/kvmclock.c @@ -27,12 +27,14 @@ #include #include +#include #include #include #include #include static int kvmclock __initdata = 1; +static int kvmclock_vsyscall __initdata = 1; static int msr_kvm_system_time __ro_after_init = MSR_KVM_SYSTEM_TIME; static int msr_kvm_wall_clock __ro_after_init = MSR_KVM_WALL_CLOCK; static u64 kvm_sched_clock_offset __ro_after_init; @@ -44,6 +46,13 @@ static int __init parse_no_kvmclock(char *arg) } early_param("no-kvmclock", parse_no_kvmclock); +static int __init parse_no_kvmclock_vsyscall(char *arg) +{ + kvmclock_vsyscall = 0; + return 0; +} +early_param("no-kvmclock-vsyscall", parse_no_kvmclock_vsyscall); + /* Aligned to page sizes to match whats mapped via vsyscalls to userspace */ #define HV_CLOCK_SIZE (sizeof(struct pvclock_vsyscall_time_info) * NR_CPUS) @@ -228,6 +237,24 @@ static void kvm_shutdown(void) native_machine_shutdown(); } +static int __init kvm_setup_vsyscall_timeinfo(void) +{ +#ifdef CONFIG_X86_64 + u8 flags; + + if (!hv_clock || 
!kvmclock_vsyscall) + return 0; + + flags = pvclock_read_flags(&hv_clock[0].pvti); + if (!(flags & PVCLOCK_TSC_STABLE_BIT)) + return 1; + + kvm_clock.archdata.vclock_mode = VCLOCK_PVCLOCK; +#endif + return 0; +} +early_initcall(kvm_setup_vsyscall_timeinfo); + void __init kvmclock_init(void) { u8 flags; @@ -272,20 +299,3 @@ void __init kvmclock_init(void) clocksource_register_hz(&kvm_clock, NSEC_PER_SEC); pv_info.name = "KVM"; } - -int __init kvm_setup_vsyscall_timeinfo(void) -{ -#ifdef CONFIG_X86_64 - u8 flags; - - if (!hv_clock) - return 0; - - flags = pvclock_read_flags(&hv_clock[0].pvti); - if (!(flags & PVCLOCK_TSC_STABLE_BIT)) - return 1; - - kvm_clock.archdata.vclock_mode = VCLOCK_PVCLOCK; -#endif - return 0; -} -- cgit v1.2.3 From 95a3d4454bb1cf5bfd666c27fdd2dc188e17c14d Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 19 Jul 2018 16:55:26 -0400 Subject: x86/kvmclock: Switch kvmclock data to a PER_CPU variable The previous removal of the memblock dependency from kvmclock introduced a static data array sized 64bytes * CONFIG_NR_CPUS. That's wasteful on large systems when kvmclock is not used. Replace it with: - A static page sized array of pvclock data. It's page sized because the pvclock data of the boot cpu is mapped into the VDSO so otherwise random other data would be exposed to the vDSO - A PER_CPU variable of pvclock data pointers. This is used to access the pcvlock data storage on each CPU. The setup is done in two stages: - Early boot stores the pointer to the static page for the boot CPU in the per cpu data. - In the preparatory stage of CPU hotplug assign either an element of the static array (when the CPU number is in that range) or allocate memory and initialize the per cpu pointer. Signed-off-by: Thomas Gleixner Signed-off-by: Pavel Tatashin Acked-by: Paolo Bonzini Cc: steven.sistare@oracle.com Cc: daniel.m.jordan@oracle.com Cc: linux@armlinux.org.uk Cc: schwidefsky@de.ibm.com Cc: heiko.carstens@de.ibm.com Cc: john.stultz@linaro.org Cc: sboyd@codeaurora.org Cc: hpa@zytor.com Cc: douly.fnst@cn.fujitsu.com Cc: peterz@infradead.org Cc: prarit@redhat.com Cc: feng.tang@intel.com Cc: pmladek@suse.com Cc: gnomes@lxorguk.ukuu.org.uk Cc: linux-s390@vger.kernel.org Cc: boris.ostrovsky@oracle.com Cc: jgross@suse.com Link: https://lkml.kernel.org/r/20180719205545.16512-8-pasha.tatashin@oracle.com --- arch/x86/kernel/kvmclock.c | 99 +++++++++++++++++++++++++++++----------------- 1 file changed, 62 insertions(+), 37 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c index 7d690d2238f8..91b94c0ae4e3 100644 --- a/arch/x86/kernel/kvmclock.c +++ b/arch/x86/kernel/kvmclock.c @@ -23,6 +23,7 @@ #include #include #include +#include #include #include #include @@ -55,12 +56,23 @@ early_param("no-kvmclock-vsyscall", parse_no_kvmclock_vsyscall); /* Aligned to page sizes to match whats mapped via vsyscalls to userspace */ #define HV_CLOCK_SIZE (sizeof(struct pvclock_vsyscall_time_info) * NR_CPUS) +#define HVC_BOOT_ARRAY_SIZE \ + (PAGE_SIZE / sizeof(struct pvclock_vsyscall_time_info)) -static u8 hv_clock_mem[PAGE_ALIGN(HV_CLOCK_SIZE)] __aligned(PAGE_SIZE); - -/* The hypervisor will put information about time periodically here */ -static struct pvclock_vsyscall_time_info *hv_clock __ro_after_init; +static struct pvclock_vsyscall_time_info + hv_clock_boot[HVC_BOOT_ARRAY_SIZE] __aligned(PAGE_SIZE); static struct pvclock_wall_clock wall_clock; +static DEFINE_PER_CPU(struct pvclock_vsyscall_time_info *, hv_clock_per_cpu); + +static 
inline struct pvclock_vcpu_time_info *this_cpu_pvti(void) +{ + return &this_cpu_read(hv_clock_per_cpu)->pvti; +} + +static inline struct pvclock_vsyscall_time_info *this_cpu_hvclock(void) +{ + return this_cpu_read(hv_clock_per_cpu); +} /* * The wallclock is the time of day when we booted. Since then, some time may @@ -69,17 +81,10 @@ static struct pvclock_wall_clock wall_clock; */ static void kvm_get_wallclock(struct timespec64 *now) { - struct pvclock_vcpu_time_info *vcpu_time; - int cpu; - wrmsrl(msr_kvm_wall_clock, slow_virt_to_phys(&wall_clock)); - - cpu = get_cpu(); - - vcpu_time = &hv_clock[cpu].pvti; - pvclock_read_wallclock(&wall_clock, vcpu_time, now); - - put_cpu(); + preempt_disable(); + pvclock_read_wallclock(&wall_clock, this_cpu_pvti(), now); + preempt_enable(); } static int kvm_set_wallclock(const struct timespec64 *now) @@ -89,14 +94,10 @@ static int kvm_set_wallclock(const struct timespec64 *now) static u64 kvm_clock_read(void) { - struct pvclock_vcpu_time_info *src; u64 ret; - int cpu; preempt_disable_notrace(); - cpu = smp_processor_id(); - src = &hv_clock[cpu].pvti; - ret = pvclock_clocksource_read(src); + ret = pvclock_clocksource_read(this_cpu_pvti()); preempt_enable_notrace(); return ret; } @@ -141,7 +142,7 @@ static inline void kvm_sched_clock_init(bool stable) static unsigned long kvm_get_tsc_khz(void) { setup_force_cpu_cap(X86_FEATURE_TSC_KNOWN_FREQ); - return pvclock_tsc_khz(&hv_clock[0].pvti); + return pvclock_tsc_khz(this_cpu_pvti()); } static void kvm_get_preset_lpj(void) @@ -158,15 +159,14 @@ static void kvm_get_preset_lpj(void) bool kvm_check_and_clear_guest_paused(void) { - struct pvclock_vcpu_time_info *src; + struct pvclock_vsyscall_time_info *src = this_cpu_hvclock(); bool ret = false; - if (!hv_clock) + if (!src) return ret; - src = &hv_clock[smp_processor_id()].pvti; - if ((src->flags & PVCLOCK_GUEST_STOPPED) != 0) { - src->flags &= ~PVCLOCK_GUEST_STOPPED; + if ((src->pvti.flags & PVCLOCK_GUEST_STOPPED) != 0) { + src->pvti.flags &= ~PVCLOCK_GUEST_STOPPED; pvclock_touch_watchdogs(); ret = true; } @@ -184,17 +184,15 @@ EXPORT_SYMBOL_GPL(kvm_clock); static void kvm_register_clock(char *txt) { - struct pvclock_vcpu_time_info *src; - int cpu = smp_processor_id(); + struct pvclock_vsyscall_time_info *src = this_cpu_hvclock(); u64 pa; - if (!hv_clock) + if (!src) return; - src = &hv_clock[cpu].pvti; - pa = slow_virt_to_phys(src) | 0x01ULL; + pa = slow_virt_to_phys(&src->pvti) | 0x01ULL; wrmsrl(msr_kvm_system_time, pa); - pr_info("kvm-clock: cpu %d, msr %llx, %s", cpu, pa, txt); + pr_info("kvm-clock: cpu %d, msr %llx, %s", smp_processor_id(), pa, txt); } static void kvm_save_sched_clock_state(void) @@ -242,12 +240,12 @@ static int __init kvm_setup_vsyscall_timeinfo(void) #ifdef CONFIG_X86_64 u8 flags; - if (!hv_clock || !kvmclock_vsyscall) + if (!per_cpu(hv_clock_per_cpu, 0) || !kvmclock_vsyscall) return 0; - flags = pvclock_read_flags(&hv_clock[0].pvti); + flags = pvclock_read_flags(&hv_clock_boot[0].pvti); if (!(flags & PVCLOCK_TSC_STABLE_BIT)) - return 1; + return 0; kvm_clock.archdata.vclock_mode = VCLOCK_PVCLOCK; #endif @@ -255,6 +253,28 @@ static int __init kvm_setup_vsyscall_timeinfo(void) } early_initcall(kvm_setup_vsyscall_timeinfo); +static int kvmclock_setup_percpu(unsigned int cpu) +{ + struct pvclock_vsyscall_time_info *p = per_cpu(hv_clock_per_cpu, cpu); + + /* + * The per cpu area setup replicates CPU0 data to all cpu + * pointers. So carefully check. CPU0 has been set up in init + * already. 
+ */ + if (!cpu || (p && p != per_cpu(hv_clock_per_cpu, 0))) + return 0; + + /* Use the static page for the first CPUs, allocate otherwise */ + if (cpu < HVC_BOOT_ARRAY_SIZE) + p = &hv_clock_boot[cpu]; + else + p = kzalloc(sizeof(*p), GFP_KERNEL); + + per_cpu(hv_clock_per_cpu, cpu) = p; + return p ? 0 : -ENOMEM; +} + void __init kvmclock_init(void) { u8 flags; @@ -269,17 +289,22 @@ void __init kvmclock_init(void) return; } + if (cpuhp_setup_state(CPUHP_BP_PREPARE_DYN, "kvmclock:setup_percpu", + kvmclock_setup_percpu, NULL) < 0) { + return; + } + pr_info("kvm-clock: Using msrs %x and %x", msr_kvm_system_time, msr_kvm_wall_clock); - hv_clock = (struct pvclock_vsyscall_time_info *)hv_clock_mem; + this_cpu_write(hv_clock_per_cpu, &hv_clock_boot[0]); kvm_register_clock("primary cpu clock"); - pvclock_set_pvti_cpu0_va(hv_clock); + pvclock_set_pvti_cpu0_va(hv_clock_boot); if (kvm_para_has_feature(KVM_FEATURE_CLOCKSOURCE_STABLE_BIT)) pvclock_set_flags(PVCLOCK_TSC_STABLE_BIT); - flags = pvclock_read_flags(&hv_clock[0].pvti); + flags = pvclock_read_flags(&hv_clock_boot[0].pvti); kvm_sched_clock_init(flags & PVCLOCK_TSC_STABLE_BIT); x86_platform.calibrate_tsc = kvm_get_tsc_khz; -- cgit v1.2.3 From 6fffacb30349e0903602d664f7ab6fc87e85162e Mon Sep 17 00:00:00 2001 From: Pavel Tatashin Date: Thu, 19 Jul 2018 16:55:27 -0400 Subject: x86/alternatives, jumplabel: Use text_poke_early() before mm_init() It supposed to be safe to modify static branches after jump_label_init(). But, because static key modifying code eventually calls text_poke() it can end up accessing a struct page which has not been initialized yet. Here is how to quickly reproduce the problem. Insert code like this into init/main.c: | +static DEFINE_STATIC_KEY_FALSE(__test); | asmlinkage __visible void __init start_kernel(void) | { | char *command_line; |@@ -587,6 +609,10 @@ asmlinkage __visible void __init start_kernel(void) | vfs_caches_init_early(); | sort_main_extable(); | trap_init(); |+ { |+ static_branch_enable(&__test); |+ WARN_ON(!static_branch_likely(&__test)); |+ } | mm_init(); The following warnings show-up: WARNING: CPU: 0 PID: 0 at arch/x86/kernel/alternative.c:701 text_poke+0x20d/0x230 RIP: 0010:text_poke+0x20d/0x230 Call Trace: ? text_poke_bp+0x50/0xda ? arch_jump_label_transform+0x89/0xe0 ? __jump_label_update+0x78/0xb0 ? static_key_enable_cpuslocked+0x4d/0x80 ? static_key_enable+0x11/0x20 ? start_kernel+0x23e/0x4c8 ? secondary_startup_64+0xa5/0xb0 ---[ end trace abdc99c031b8a90a ]--- If the code above is moved after mm_init(), no warning is shown, as struct pages are initialized during handover from memblock. Use text_poke_early() in static branching until early boot IRQs are enabled and from there switch to text_poke. Also, ensure text_poke() is never invoked when unitialized memory access may happen by using adding a !after_bootmem assertion. 
Signed-off-by: Pavel Tatashin Cc: steven.sistare@oracle.com Cc: daniel.m.jordan@oracle.com Cc: linux@armlinux.org.uk Cc: schwidefsky@de.ibm.com Cc: heiko.carstens@de.ibm.com Cc: john.stultz@linaro.org Cc: sboyd@codeaurora.org Cc: hpa@zytor.com Cc: douly.fnst@cn.fujitsu.com Cc: peterz@infradead.org Cc: prarit@redhat.com Cc: feng.tang@intel.com Cc: pmladek@suse.com Cc: gnomes@lxorguk.ukuu.org.uk Cc: linux-s390@vger.kernel.org Cc: boris.ostrovsky@oracle.com Cc: jgross@suse.com Cc: pbonzini@redhat.com Link: https://lkml.kernel.org/r/20180719205545.16512-9-pasha.tatashin@oracle.com --- arch/x86/include/asm/text-patching.h | 1 + arch/x86/kernel/alternative.c | 7 +++++++ arch/x86/kernel/jump_label.c | 11 +++++++---- 3 files changed, 15 insertions(+), 4 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/include/asm/text-patching.h b/arch/x86/include/asm/text-patching.h index 2ecd34e2d46c..e85ff65c43c3 100644 --- a/arch/x86/include/asm/text-patching.h +++ b/arch/x86/include/asm/text-patching.h @@ -37,5 +37,6 @@ extern void *text_poke_early(void *addr, const void *opcode, size_t len); extern void *text_poke(void *addr, const void *opcode, size_t len); extern int poke_int3_handler(struct pt_regs *regs); extern void *text_poke_bp(void *addr, const void *opcode, size_t len, void *handler); +extern int after_bootmem; #endif /* _ASM_X86_TEXT_PATCHING_H */ diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c index a481763a3776..014f214da581 100644 --- a/arch/x86/kernel/alternative.c +++ b/arch/x86/kernel/alternative.c @@ -668,6 +668,7 @@ void *__init_or_module text_poke_early(void *addr, const void *opcode, local_irq_save(flags); memcpy(addr, opcode, len); local_irq_restore(flags); + sync_core(); /* Could also do a CLFLUSH here to speed up CPU recovery; but that causes hangs on some VIA CPUs. */ return addr; @@ -693,6 +694,12 @@ void *text_poke(void *addr, const void *opcode, size_t len) struct page *pages[2]; int i; + /* + * While boot memory allocator is runnig we cannot use struct + * pages as they are not yet initialized. + */ + BUG_ON(!after_bootmem); + if (!core_kernel_text((unsigned long)addr)) { pages[0] = vmalloc_to_page(addr); pages[1] = vmalloc_to_page(addr + PAGE_SIZE); diff --git a/arch/x86/kernel/jump_label.c b/arch/x86/kernel/jump_label.c index e56c95be2808..eeea935e9bb5 100644 --- a/arch/x86/kernel/jump_label.c +++ b/arch/x86/kernel/jump_label.c @@ -37,15 +37,18 @@ static void bug_at(unsigned char *ip, int line) BUG(); } -static void __jump_label_transform(struct jump_entry *entry, - enum jump_label_type type, - void *(*poker)(void *, const void *, size_t), - int init) +static void __ref __jump_label_transform(struct jump_entry *entry, + enum jump_label_type type, + void *(*poker)(void *, const void *, size_t), + int init) { union jump_code_union code; const unsigned char default_nop[] = { STATIC_KEY_INIT_NOP }; const unsigned char *ideal_nop = ideal_nops[NOP_ATOMIC5]; + if (early_boot_irqs_disabled) + poker = text_poke_early; + if (type == JUMP_LABEL_JMP) { if (init) { /* -- cgit v1.2.3 From 8990cac6e5ea7fa57607736019fe8dca961b998f Mon Sep 17 00:00:00 2001 From: Pavel Tatashin Date: Thu, 19 Jul 2018 16:55:28 -0400 Subject: x86/jump_label: Initialize static branching early Static branching is useful to runtime patch branches that are used in hot path, but are infrequently changed. The x86 clock framework is one example that uses static branches to setup the best clock during boot and never changes it again. 
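For readers unfamiliar with the mechanism, here is a minimal sketch of the static-key pattern being referred to. It uses the generic <linux/jump_label.h> API, but the key name, the sched-clock function and the fallback computation are invented for illustration and are not code from this series:

#include <linux/jiffies.h>
#include <linux/jump_label.h>
#include <linux/types.h>
#include <asm/msr.h>

static DEFINE_STATIC_KEY_FALSE(use_tsc_sched_clock);	/* branch starts disabled */

u64 example_sched_clock(void)
{
	/* Hot path: the condition is a NOP/JMP patched into the kernel text, not a load-and-test */
	if (static_branch_likely(&use_tsc_sched_clock))
		return rdtsc_ordered();		/* fast TSC-based path */
	/* Coarse jiffies-based fallback until the key is flipped */
	return (u64)(jiffies - INITIAL_JIFFIES) * (NSEC_PER_SEC / HZ);
}

void pick_sched_clock_once(void)
{
	/* Done once during boot; rewrites every branch site of the key in place */
	static_branch_enable(&use_tsc_sched_clock);
}

Flipping the key rewrites kernel text, which is why the jump-label machinery must be initialized before the first static_branch_enable() call can happen.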
It is desired to enable the TSC based sched clock early to allow fine grained boot time analysis early on. That requires the static branching functionality to be functional early as well. Static branching requires patching nop instructions, thus, arch_init_ideal_nops() must be called prior to jump_label_init(). Do all the necessary steps to call arch_init_ideal_nops() right after early_cpu_init(), which also allows to insert a call to jump_label_init() right after that. jump_label_init() will be called again from the generic init code, but the code is protected against reinitialization already. [ tglx: Massaged changelog ] Suggested-by: Peter Zijlstra Signed-off-by: Pavel Tatashin Signed-off-by: Thomas Gleixner Reviewed-by: Borislav Petkov Cc: steven.sistare@oracle.com Cc: daniel.m.jordan@oracle.com Cc: linux@armlinux.org.uk Cc: schwidefsky@de.ibm.com Cc: heiko.carstens@de.ibm.com Cc: john.stultz@linaro.org Cc: sboyd@codeaurora.org Cc: hpa@zytor.com Cc: douly.fnst@cn.fujitsu.com Cc: prarit@redhat.com Cc: feng.tang@intel.com Cc: pmladek@suse.com Cc: gnomes@lxorguk.ukuu.org.uk Cc: linux-s390@vger.kernel.org Cc: boris.ostrovsky@oracle.com Cc: jgross@suse.com Cc: pbonzini@redhat.com Link: https://lkml.kernel.org/r/20180719205545.16512-10-pasha.tatashin@oracle.com --- arch/x86/kernel/cpu/amd.c | 13 ++++++++----- arch/x86/kernel/cpu/common.c | 38 ++++++++++++++++++++------------------ arch/x86/kernel/setup.c | 4 ++-- 3 files changed, 30 insertions(+), 25 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c index 38915fbfae73..b732438c1a1e 100644 --- a/arch/x86/kernel/cpu/amd.c +++ b/arch/x86/kernel/cpu/amd.c @@ -232,8 +232,6 @@ static void init_amd_k7(struct cpuinfo_x86 *c) } } - set_cpu_cap(c, X86_FEATURE_K7); - /* calling is from identify_secondary_cpu() ? */ if (!c->cpu_index) return; @@ -617,6 +615,14 @@ static void early_init_amd(struct cpuinfo_x86 *c) early_init_amd_mc(c); +#ifdef CONFIG_X86_32 + if (c->x86 == 6) + set_cpu_cap(c, X86_FEATURE_K7); +#endif + + if (c->x86 >= 0xf) + set_cpu_cap(c, X86_FEATURE_K8); + rdmsr_safe(MSR_AMD64_PATCH_LEVEL, &c->microcode, &dummy); /* @@ -863,9 +869,6 @@ static void init_amd(struct cpuinfo_x86 *c) init_amd_cacheinfo(c); - if (c->x86 >= 0xf) - set_cpu_cap(c, X86_FEATURE_K8); - if (cpu_has(c, X86_FEATURE_XMM2)) { unsigned long long val; int ret; diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index eb4cb3efd20e..71281ac43b15 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -1015,6 +1015,24 @@ static void __init cpu_set_bug_bits(struct cpuinfo_x86 *c) setup_force_cpu_bug(X86_BUG_CPU_MELTDOWN); } +/* + * The NOPL instruction is supposed to exist on all CPUs of family >= 6; + * unfortunately, that's not true in practice because of early VIA + * chips and (more importantly) broken virtualizers that are not easy + * to detect. In the latter case it doesn't even *fail* reliably, so + * probing for it doesn't even work. Disable it completely on 32-bit + * unless we can find a reliable way to detect all the broken cases. + * Enable it explicitly on 64-bit for non-constant inputs of cpu_has(). + */ +static void detect_nopl(struct cpuinfo_x86 *c) +{ +#ifdef CONFIG_X86_32 + clear_cpu_cap(c, X86_FEATURE_NOPL); +#else + set_cpu_cap(c, X86_FEATURE_NOPL); +#endif +} + /* * Do minimum CPU detection early. 
* Fields really needed: vendor, cpuid_level, family, model, mask, @@ -1089,6 +1107,8 @@ static void __init early_identify_cpu(struct cpuinfo_x86 *c) */ if (!pgtable_l5_enabled()) setup_clear_cpu_cap(X86_FEATURE_LA57); + + detect_nopl(c); } void __init early_cpu_init(void) @@ -1124,24 +1144,6 @@ void __init early_cpu_init(void) early_identify_cpu(&boot_cpu_data); } -/* - * The NOPL instruction is supposed to exist on all CPUs of family >= 6; - * unfortunately, that's not true in practice because of early VIA - * chips and (more importantly) broken virtualizers that are not easy - * to detect. In the latter case it doesn't even *fail* reliably, so - * probing for it doesn't even work. Disable it completely on 32-bit - * unless we can find a reliable way to detect all the broken cases. - * Enable it explicitly on 64-bit for non-constant inputs of cpu_has(). - */ -static void detect_nopl(struct cpuinfo_x86 *c) -{ -#ifdef CONFIG_X86_32 - clear_cpu_cap(c, X86_FEATURE_NOPL); -#else - set_cpu_cap(c, X86_FEATURE_NOPL); -#endif -} - static void detect_null_seg_behavior(struct cpuinfo_x86 *c) { #ifdef CONFIG_X86_64 diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index da1dbd99cb6e..7490de925a81 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -866,6 +866,8 @@ void __init setup_arch(char **cmdline_p) idt_setup_early_traps(); early_cpu_init(); + arch_init_ideal_nops(); + jump_label_init(); early_ioremap_init(); setup_olpc_ofw_pgd(); @@ -1268,8 +1270,6 @@ void __init setup_arch(char **cmdline_p) mcheck_init(); - arch_init_ideal_nops(); - register_refined_jiffies(CLOCK_TICK_RATE); #ifdef CONFIG_EFI -- cgit v1.2.3 From 9b3661cd7e5400689ed168a7275e75af333177e6 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Thu, 19 Jul 2018 16:55:29 -0400 Subject: x86/CPU: Call detect_nopl() only on the BSP Make it use the setup_* variants and have it be called only on the BSP and drop the call in generic_identify() - X86_FEATURE_NOPL will be replicated to the APs through the forced caps. Helps to keep the mess at a manageable level. Signed-off-by: Borislav Petkov Signed-off-by: Pavel Tatashin Signed-off-by: Thomas Gleixner Cc: steven.sistare@oracle.com Cc: daniel.m.jordan@oracle.com Cc: linux@armlinux.org.uk Cc: schwidefsky@de.ibm.com Cc: heiko.carstens@de.ibm.com Cc: john.stultz@linaro.org Cc: sboyd@codeaurora.org Cc: hpa@zytor.com Cc: douly.fnst@cn.fujitsu.com Cc: peterz@infradead.org Cc: prarit@redhat.com Cc: feng.tang@intel.com Cc: pmladek@suse.com Cc: gnomes@lxorguk.ukuu.org.uk Cc: linux-s390@vger.kernel.org Cc: boris.ostrovsky@oracle.com Cc: jgross@suse.com Cc: pbonzini@redhat.com Link: https://lkml.kernel.org/r/20180719205545.16512-11-pasha.tatashin@oracle.com --- arch/x86/kernel/cpu/common.c | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 71281ac43b15..46408a8cdf62 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -1024,12 +1024,12 @@ static void __init cpu_set_bug_bits(struct cpuinfo_x86 *c) * unless we can find a reliable way to detect all the broken cases. * Enable it explicitly on 64-bit for non-constant inputs of cpu_has(). 
*/ -static void detect_nopl(struct cpuinfo_x86 *c) +static void detect_nopl(void) { #ifdef CONFIG_X86_32 - clear_cpu_cap(c, X86_FEATURE_NOPL); + setup_clear_cpu_cap(X86_FEATURE_NOPL); #else - set_cpu_cap(c, X86_FEATURE_NOPL); + setup_force_cpu_cap(X86_FEATURE_NOPL); #endif } @@ -1108,7 +1108,7 @@ static void __init early_identify_cpu(struct cpuinfo_x86 *c) if (!pgtable_l5_enabled()) setup_clear_cpu_cap(X86_FEATURE_LA57); - detect_nopl(c); + detect_nopl(); } void __init early_cpu_init(void) @@ -1206,8 +1206,6 @@ static void generic_identify(struct cpuinfo_x86 *c) get_model_name(c); /* Default name */ - detect_nopl(c); - detect_null_seg_behavior(c); /* -- cgit v1.2.3 From fe9af81e524e8a86bdd59c0cc0d9e2b0ccaf840f Mon Sep 17 00:00:00 2001 From: Pavel Tatashin Date: Thu, 19 Jul 2018 16:55:30 -0400 Subject: x86/tsc: Redefine notsc to behave as tsc=unstable Currently, the notsc kernel parameter disables the use of the TSC by sched_clock(). However, this parameter does not prevent the kernel from accessing tsc in other places. The only rationale to boot with notsc is to avoid timing discrepancies on multi-socket systems where TSC are not properly synchronized, and thus exclude TSC from being used for time keeping. But that prevents using TSC as sched_clock() as well, which is not necessary as the core sched_clock() implementation can handle non synchronized TSC based sched clocks just fine. However, there is another method to solve the above problem: booting with tsc=unstable parameter. This parameter allows sched_clock() to use TSC and just excludes it from timekeeping. So there is no real reason to keep notsc, but for compatibility reasons the parameter has to stay. Make it behave like 'tsc=unstable' instead. [ tglx: Massaged changelog ] Signed-off-by: Pavel Tatashin Signed-off-by: Thomas Gleixner Reviewed-by: Dou Liyang Reviewed-by: Thomas Gleixner Cc: steven.sistare@oracle.com Cc: daniel.m.jordan@oracle.com Cc: linux@armlinux.org.uk Cc: schwidefsky@de.ibm.com Cc: heiko.carstens@de.ibm.com Cc: john.stultz@linaro.org Cc: sboyd@codeaurora.org Cc: hpa@zytor.com Cc: peterz@infradead.org Cc: prarit@redhat.com Cc: feng.tang@intel.com Cc: pmladek@suse.com Cc: gnomes@lxorguk.ukuu.org.uk Cc: linux-s390@vger.kernel.org Cc: boris.ostrovsky@oracle.com Cc: jgross@suse.com Cc: pbonzini@redhat.com Link: https://lkml.kernel.org/r/20180719205545.16512-12-pasha.tatashin@oracle.com --- Documentation/admin-guide/kernel-parameters.txt | 2 -- Documentation/x86/x86_64/boot-options.txt | 4 +--- arch/x86/kernel/tsc.c | 18 +++--------------- 3 files changed, 4 insertions(+), 20 deletions(-) (limited to 'arch/x86/kernel') diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index 533ff5c68970..5aed30cd0350 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -2835,8 +2835,6 @@ nosync [HW,M68K] Disables sync negotiation for all devices. - notsc [BUGS=X86-32] Disable Time Stamp Counter - nowatchdog [KNL] Disable both lockup detectors, i.e. soft-lockup and NMI watchdog (hard-lockup). diff --git a/Documentation/x86/x86_64/boot-options.txt b/Documentation/x86/x86_64/boot-options.txt index 8d109ef67ab6..66114ab4f9fe 100644 --- a/Documentation/x86/x86_64/boot-options.txt +++ b/Documentation/x86/x86_64/boot-options.txt @@ -92,9 +92,7 @@ APICs Timing notsc - Don't use the CPU time stamp counter to read the wall time. 
- This can be used to work around timing problems on multiprocessor systems - with not properly synchronized CPUs. + Deprecated, use tsc=unstable instead. nohpet Don't use the HPET timer. diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c index 74392d9d51e0..186395041725 100644 --- a/arch/x86/kernel/tsc.c +++ b/arch/x86/kernel/tsc.c @@ -38,11 +38,6 @@ EXPORT_SYMBOL(tsc_khz); */ static int __read_mostly tsc_unstable; -/* native_sched_clock() is called before tsc_init(), so - we must start with the TSC soft disabled to prevent - erroneous rdtsc usage on !boot_cpu_has(X86_FEATURE_TSC) processors */ -static int __read_mostly tsc_disabled = -1; - static DEFINE_STATIC_KEY_FALSE(__use_tsc); int tsc_clocksource_reliable; @@ -248,8 +243,7 @@ EXPORT_SYMBOL_GPL(check_tsc_unstable); #ifdef CONFIG_X86_TSC int __init notsc_setup(char *str) { - pr_warn("Kernel compiled with CONFIG_X86_TSC, cannot disable TSC completely\n"); - tsc_disabled = 1; + mark_tsc_unstable("boot parameter notsc"); return 1; } #else @@ -1307,7 +1301,7 @@ unreg: static int __init init_tsc_clocksource(void) { - if (!boot_cpu_has(X86_FEATURE_TSC) || tsc_disabled > 0 || !tsc_khz) + if (!boot_cpu_has(X86_FEATURE_TSC) || !tsc_khz) return 0; if (tsc_unstable) @@ -1414,12 +1408,6 @@ void __init tsc_init(void) set_cyc2ns_scale(tsc_khz, cpu, cyc); } - if (tsc_disabled > 0) - return; - - /* now allow native_sched_clock() to use rdtsc */ - - tsc_disabled = 0; static_branch_enable(&__use_tsc); if (!no_sched_irq_time) @@ -1455,7 +1443,7 @@ unsigned long calibrate_delay_is_known(void) int constant_tsc = cpu_has(&cpu_data(cpu), X86_FEATURE_CONSTANT_TSC); const struct cpumask *mask = topology_core_cpumask(cpu); - if (tsc_disabled || !constant_tsc || !mask) + if (!constant_tsc || !mask) return 0; sibling = cpumask_any_but(mask, cpu); -- cgit v1.2.3 From cf7a63ef4e0203f6f33284c69e8188d91422de83 Mon Sep 17 00:00:00 2001 From: Pavel Tatashin Date: Thu, 19 Jul 2018 16:55:38 -0400 Subject: x86/tsc: Calibrate tsc only once During boot tsc is calibrated twice: once in tsc_early_delay_calibrate(), and the second time in tsc_init(). Rename tsc_early_delay_calibrate() to tsc_early_init(), and rework it so the calibration is done only early, and make tsc_init() to use the values already determined in tsc_early_init(). Sometimes it is not possible to determine tsc early, as the subsystem that is required is not yet initialized, in such case try again later in tsc_init(). 
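(Editorial illustration, not part of the patch: a minimal standalone C sketch of the calibration flow this change aims for. The stub frequency values and printf() reporting are placeholders for the real MSR/CPUID/PIT calibration and kernel logging; only the function names mirror the patch below.)

#include <stdbool.h>
#include <stdio.h>

static unsigned long cpu_khz, tsc_khz;

/* Stand-in for the real calibration; may fail if run too early. */
static bool determine_cpu_tsc_frequencies(void)
{
        cpu_khz = 2400000;      /* placeholder for x86_platform.calibrate_cpu() */
        tsc_khz = 2400000;      /* placeholder for x86_platform.calibrate_tsc() */
        return tsc_khz != 0;
}

static void tsc_early_init(void)
{
        /* Early attempt; bail out if calibration is not possible yet. */
        if (!determine_cpu_tsc_frequencies())
                return;
        printf("early calibration: cpu %lu kHz, tsc %lu kHz\n", cpu_khz, tsc_khz);
}

static void tsc_init(void)
{
        /* Reuse the early result; only recalibrate if the early attempt failed. */
        if (!tsc_khz && !determine_cpu_tsc_frequencies()) {
                printf("could not calculate TSC khz\n");
                return;
        }
        printf("final: tsc %lu kHz\n", tsc_khz);
}

int main(void)
{
        tsc_early_init();
        tsc_init();
        return 0;
}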
Suggested-by: Thomas Gleixner Signed-off-by: Pavel Tatashin Signed-off-by: Thomas Gleixner Cc: steven.sistare@oracle.com Cc: daniel.m.jordan@oracle.com Cc: linux@armlinux.org.uk Cc: schwidefsky@de.ibm.com Cc: heiko.carstens@de.ibm.com Cc: john.stultz@linaro.org Cc: sboyd@codeaurora.org Cc: hpa@zytor.com Cc: douly.fnst@cn.fujitsu.com Cc: peterz@infradead.org Cc: prarit@redhat.com Cc: feng.tang@intel.com Cc: pmladek@suse.com Cc: gnomes@lxorguk.ukuu.org.uk Cc: linux-s390@vger.kernel.org Cc: boris.ostrovsky@oracle.com Cc: jgross@suse.com Cc: pbonzini@redhat.com Link: https://lkml.kernel.org/r/20180719205545.16512-20-pasha.tatashin@oracle.com --- arch/x86/include/asm/tsc.h | 2 +- arch/x86/kernel/setup.c | 2 +- arch/x86/kernel/tsc.c | 87 +++++++++++++++++++++++++--------------------- 3 files changed, 49 insertions(+), 42 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/include/asm/tsc.h b/arch/x86/include/asm/tsc.h index 2701d221583a..c4368ff73652 100644 --- a/arch/x86/include/asm/tsc.h +++ b/arch/x86/include/asm/tsc.h @@ -33,7 +33,7 @@ static inline cycles_t get_cycles(void) extern struct system_counterval_t convert_art_to_tsc(u64 art); extern struct system_counterval_t convert_art_ns_to_tsc(u64 art_ns); -extern void tsc_early_delay_calibrate(void); +extern void tsc_early_init(void); extern void tsc_init(void); extern void mark_tsc_unstable(char *reason); extern int unsynchronized_tsc(void); diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index 7490de925a81..5d32c55aeb8b 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -1014,6 +1014,7 @@ void __init setup_arch(char **cmdline_p) */ init_hypervisor_platform(); + tsc_early_init(); x86_init.resources.probe_roms(); /* after parse_early_param, so could debug it */ @@ -1199,7 +1200,6 @@ void __init setup_arch(char **cmdline_p) memblock_find_dma_reserve(); - tsc_early_delay_calibrate(); if (!early_xdbc_setup_hardware()) early_xdbc_register_console(); diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c index 186395041725..4cab2236169e 100644 --- a/arch/x86/kernel/tsc.c +++ b/arch/x86/kernel/tsc.c @@ -33,6 +33,8 @@ EXPORT_SYMBOL(cpu_khz); unsigned int __read_mostly tsc_khz; EXPORT_SYMBOL(tsc_khz); +#define KHZ 1000 + /* * TSC can be unstable due to cpufreq or due to unsynced TSCs */ @@ -1335,34 +1337,10 @@ unreg: */ device_initcall(init_tsc_clocksource); -void __init tsc_early_delay_calibrate(void) -{ - unsigned long lpj; - - if (!boot_cpu_has(X86_FEATURE_TSC)) - return; - - cpu_khz = x86_platform.calibrate_cpu(); - tsc_khz = x86_platform.calibrate_tsc(); - - tsc_khz = tsc_khz ? 
: cpu_khz; - if (!tsc_khz) - return; - - lpj = tsc_khz * 1000; - do_div(lpj, HZ); - loops_per_jiffy = lpj; -} - -void __init tsc_init(void) +static bool __init determine_cpu_tsc_frequencies(void) { - u64 lpj, cyc; - int cpu; - - if (!boot_cpu_has(X86_FEATURE_TSC)) { - setup_clear_cpu_cap(X86_FEATURE_TSC_DEADLINE_TIMER); - return; - } + /* Make sure that cpu and tsc are not already calibrated */ + WARN_ON(cpu_khz || tsc_khz); cpu_khz = x86_platform.calibrate_cpu(); tsc_khz = x86_platform.calibrate_tsc(); @@ -1377,20 +1355,52 @@ void __init tsc_init(void) else if (abs(cpu_khz - tsc_khz) * 10 > tsc_khz) cpu_khz = tsc_khz; - if (!tsc_khz) { - mark_tsc_unstable("could not calculate TSC khz"); - setup_clear_cpu_cap(X86_FEATURE_TSC_DEADLINE_TIMER); - return; - } + if (tsc_khz == 0) + return false; pr_info("Detected %lu.%03lu MHz processor\n", - (unsigned long)cpu_khz / 1000, - (unsigned long)cpu_khz % 1000); + (unsigned long)cpu_khz / KHZ, + (unsigned long)cpu_khz % KHZ); if (cpu_khz != tsc_khz) { pr_info("Detected %lu.%03lu MHz TSC", - (unsigned long)tsc_khz / 1000, - (unsigned long)tsc_khz % 1000); + (unsigned long)tsc_khz / KHZ, + (unsigned long)tsc_khz % KHZ); + } + return true; +} + +static unsigned long __init get_loops_per_jiffy(void) +{ + unsigned long lpj = tsc_khz * KHZ; + + do_div(lpj, HZ); + return lpj; +} + +void __init tsc_early_init(void) +{ + if (!boot_cpu_has(X86_FEATURE_TSC)) + return; + if (!determine_cpu_tsc_frequencies()) + return; + loops_per_jiffy = get_loops_per_jiffy(); +} + +void __init tsc_init(void) +{ + if (!boot_cpu_has(X86_FEATURE_TSC)) { + setup_clear_cpu_cap(X86_FEATURE_TSC_DEADLINE_TIMER); + return; + } + + if (!tsc_khz) { + /* We failed to determine frequencies earlier, try again */ + if (!determine_cpu_tsc_frequencies()) { + mark_tsc_unstable("could not calculate TSC khz"); + setup_clear_cpu_cap(X86_FEATURE_TSC_DEADLINE_TIMER); + return; + } } /* Sanitize TSC ADJUST before cyc2ns gets initialized */ @@ -1413,10 +1423,7 @@ void __init tsc_init(void) if (!no_sched_irq_time) enable_sched_clock_irqtime(); - lpj = ((u64)tsc_khz * 1000); - do_div(lpj, HZ); - lpj_fine = lpj; - + lpj_fine = get_loops_per_jiffy(); use_tsc_delay(); check_system_tsc_reliable(); -- cgit v1.2.3 From e2a9ca29b5edc89da2fddeae30e1070b272395c5 Mon Sep 17 00:00:00 2001 From: Pavel Tatashin Date: Thu, 19 Jul 2018 16:55:39 -0400 Subject: x86/tsc: Initialize cyc2ns when tsc frequency is determined cyc2ns converts tsc to nanoseconds, and it is handled in a per-cpu data structure. Currently, the setup code for c2ns data for every possible CPU goes through the same sequence of calculations as for the boot CPU, but is based on the same tsc frequency as the boot CPU, and thus this is not necessary. Initialize the boot cpu when tsc frequency is determined. Copy the calculated data from the boot CPU to the other CPUs in tsc_init(). In addition do the following: - Remove unnecessary zeroing of c2ns data by removing cyc2ns_data_init() - Split set_cyc2ns_scale() into two functions, so set_cyc2ns_scale() can be called when system is up, and wraps around __set_cyc2ns_scale() that can be called directly when system is booting but avoids saving restoring IRQs and going and waking up from idle. 
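(Editorial illustration, not part of the patch: the cyc2ns data copied here encodes a multiply-and-shift conversion, roughly ns = ((cyc * cyc2ns_mul) >> cyc2ns_shift) + cyc2ns_offset. The standalone sketch below checks that arithmetic with an assumed 2.4 GHz TSC and an assumed shift of 10; the kernel derives mul/shift via clocks_calc_mult_shift() and picks values with less rounding error.)

#include <stdint.h>
#include <stdio.h>

int main(void)
{
        uint64_t tsc_khz = 2400000;     /* assumed 2.4 GHz TSC */
        uint32_t shift = 10;            /* assumed; the kernel computes this */
        /* ns per cycle is 10^6 / tsc_khz, kept as a fixed-point multiplier */
        uint32_t mul = (uint32_t)((1000000ULL << shift) / tsc_khz);
        uint64_t cyc = 2400000000ULL;   /* one second worth of cycles */
        uint64_t ns = (cyc * mul) >> shift;

        /* Prints mul=426, ns=998437500: ~1 s, low by ~0.16% due to truncation */
        printf("mul=%u ns=%llu\n", mul, (unsigned long long)ns);
        return 0;
}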
Suggested-by: Thomas Gleixner Signed-off-by: Pavel Tatashin Signed-off-by: Thomas Gleixner Cc: steven.sistare@oracle.com Cc: daniel.m.jordan@oracle.com Cc: linux@armlinux.org.uk Cc: schwidefsky@de.ibm.com Cc: heiko.carstens@de.ibm.com Cc: john.stultz@linaro.org Cc: sboyd@codeaurora.org Cc: hpa@zytor.com Cc: douly.fnst@cn.fujitsu.com Cc: peterz@infradead.org Cc: prarit@redhat.com Cc: feng.tang@intel.com Cc: pmladek@suse.com Cc: gnomes@lxorguk.ukuu.org.uk Cc: linux-s390@vger.kernel.org Cc: boris.ostrovsky@oracle.com Cc: jgross@suse.com Cc: pbonzini@redhat.com Link: https://lkml.kernel.org/r/20180719205545.16512-21-pasha.tatashin@oracle.com --- arch/x86/kernel/tsc.c | 94 +++++++++++++++++++++++++++++---------------------- 1 file changed, 53 insertions(+), 41 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c index 4cab2236169e..7ea0718a4c75 100644 --- a/arch/x86/kernel/tsc.c +++ b/arch/x86/kernel/tsc.c @@ -103,23 +103,6 @@ void cyc2ns_read_end(void) * -johnstul@us.ibm.com "math is hard, lets go shopping!" */ -static void cyc2ns_data_init(struct cyc2ns_data *data) -{ - data->cyc2ns_mul = 0; - data->cyc2ns_shift = 0; - data->cyc2ns_offset = 0; -} - -static void __init cyc2ns_init(int cpu) -{ - struct cyc2ns *c2n = &per_cpu(cyc2ns, cpu); - - cyc2ns_data_init(&c2n->data[0]); - cyc2ns_data_init(&c2n->data[1]); - - seqcount_init(&c2n->seq); -} - static inline unsigned long long cycles_2_ns(unsigned long long cyc) { struct cyc2ns_data data; @@ -135,18 +118,11 @@ static inline unsigned long long cycles_2_ns(unsigned long long cyc) return ns; } -static void set_cyc2ns_scale(unsigned long khz, int cpu, unsigned long long tsc_now) +static void __set_cyc2ns_scale(unsigned long khz, int cpu, unsigned long long tsc_now) { unsigned long long ns_now; struct cyc2ns_data data; struct cyc2ns *c2n; - unsigned long flags; - - local_irq_save(flags); - sched_clock_idle_sleep_event(); - - if (!khz) - goto done; ns_now = cycles_2_ns(tsc_now); @@ -178,12 +154,55 @@ static void set_cyc2ns_scale(unsigned long khz, int cpu, unsigned long long tsc_ c2n->data[0] = data; raw_write_seqcount_latch(&c2n->seq); c2n->data[1] = data; +} + +static void set_cyc2ns_scale(unsigned long khz, int cpu, unsigned long long tsc_now) +{ + unsigned long flags; + + local_irq_save(flags); + sched_clock_idle_sleep_event(); + + if (khz) + __set_cyc2ns_scale(khz, cpu, tsc_now); -done: sched_clock_idle_wakeup_event(); local_irq_restore(flags); } +/* + * Initialize cyc2ns for boot cpu + */ +static void __init cyc2ns_init_boot_cpu(void) +{ + struct cyc2ns *c2n = this_cpu_ptr(&cyc2ns); + + seqcount_init(&c2n->seq); + __set_cyc2ns_scale(tsc_khz, smp_processor_id(), rdtsc()); +} + +/* + * Secondary CPUs do not run through cyc2ns_init(), so set up + * all the scale factors for all CPUs, assuming the same + * speed as the bootup CPU. (cpufreq notifiers will fix this + * up if their speed diverges) + */ +static void __init cyc2ns_init_secondary_cpus(void) +{ + unsigned int cpu, this_cpu = smp_processor_id(); + struct cyc2ns *c2n = this_cpu_ptr(&cyc2ns); + struct cyc2ns_data *data = c2n->data; + + for_each_possible_cpu(cpu) { + if (cpu != this_cpu) { + seqcount_init(&c2n->seq); + c2n = per_cpu_ptr(&cyc2ns, cpu); + c2n->data[0] = data[0]; + c2n->data[1] = data[1]; + } + } +} + /* * Scheduler clock - returns current time in nanosec units. 
*/ @@ -1385,6 +1404,10 @@ void __init tsc_early_init(void) if (!determine_cpu_tsc_frequencies()) return; loops_per_jiffy = get_loops_per_jiffy(); + + /* Sanitize TSC ADJUST before cyc2ns gets initialized */ + tsc_store_and_check_tsc_adjust(true); + cyc2ns_init_boot_cpu(); } void __init tsc_init(void) @@ -1401,23 +1424,12 @@ void __init tsc_init(void) setup_clear_cpu_cap(X86_FEATURE_TSC_DEADLINE_TIMER); return; } + /* Sanitize TSC ADJUST before cyc2ns gets initialized */ + tsc_store_and_check_tsc_adjust(true); + cyc2ns_init_boot_cpu(); } - /* Sanitize TSC ADJUST before cyc2ns gets initialized */ - tsc_store_and_check_tsc_adjust(true); - - /* - * Secondary CPUs do not run through tsc_init(), so set up - * all the scale factors for all CPUs, assuming the same - * speed as the bootup CPU. (cpufreq notifiers will fix this - * up if their speed diverges) - */ - cyc = rdtsc(); - for_each_possible_cpu(cpu) { - cyc2ns_init(cpu); - set_cyc2ns_scale(tsc_khz, cpu, cyc); - } - + cyc2ns_init_secondary_cpus(); static_branch_enable(&__use_tsc); if (!no_sched_irq_time) -- cgit v1.2.3 From 4763f03d3d186ce8a1125844790152d76804ad60 Mon Sep 17 00:00:00 2001 From: Pavel Tatashin Date: Thu, 19 Jul 2018 16:55:40 -0400 Subject: x86/tsc: Use TSC as sched clock early All prerequesites for enabling TSC as sched clock early in the boot process are available now: - Early attempt of TSC calibration - Early availablity of static branch patching If TSC frequency can be established in the early calibration, enable the static key which switches sched clock to use TSC. [ tglx: Massaged changelog ] Signed-off-by: Pavel Tatashin Signed-off-by: Thomas Gleixner Cc: steven.sistare@oracle.com Cc: daniel.m.jordan@oracle.com Cc: linux@armlinux.org.uk Cc: schwidefsky@de.ibm.com Cc: heiko.carstens@de.ibm.com Cc: john.stultz@linaro.org Cc: sboyd@codeaurora.org Cc: hpa@zytor.com Cc: douly.fnst@cn.fujitsu.com Cc: peterz@infradead.org Cc: prarit@redhat.com Cc: feng.tang@intel.com Cc: pmladek@suse.com Cc: gnomes@lxorguk.ukuu.org.uk Cc: linux-s390@vger.kernel.org Cc: boris.ostrovsky@oracle.com Cc: jgross@suse.com Cc: pbonzini@redhat.com Link: https://lkml.kernel.org/r/20180719205545.16512-22-pasha.tatashin@oracle.com --- arch/x86/kernel/tsc.c | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c index 7ea0718a4c75..9277ae9b68b3 100644 --- a/arch/x86/kernel/tsc.c +++ b/arch/x86/kernel/tsc.c @@ -1408,6 +1408,7 @@ void __init tsc_early_init(void) /* Sanitize TSC ADJUST before cyc2ns gets initialized */ tsc_store_and_check_tsc_adjust(true); cyc2ns_init_boot_cpu(); + static_branch_enable(&__use_tsc); } void __init tsc_init(void) -- cgit v1.2.3 From 03821f451d2d2d7599061244734245be139014ea Mon Sep 17 00:00:00 2001 From: Pavel Tatashin Date: Thu, 19 Jul 2018 16:55:44 -0400 Subject: x86/tsc: Split native_calibrate_cpu() into early and late parts During early boot TSC and CPU frequency can be calibrated using MSR, CPUID, and quick PIT calibration methods. The other methods PIT/HPET/PMTIMER are available only after ACPI is initialized. Split native_calibrate_cpu() into early and late parts so they can be called separately during early and late tsc calibration. 
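(Editorial illustration, not part of the patch: a condensed, standalone view of the call structure the split produces. The stub return values are placeholders; the real routines in arch/x86/kernel/tsc.c probe CPUID, MSR, PIT, HPET and the PM timer.)

#include <stdio.h>

/* Placeholder stubs for the real calibration helpers. */
static unsigned long cpu_khz_from_cpuid(void)             { return 0; }
static unsigned long cpu_khz_from_msr(void)               { return 0; }
static unsigned long quick_pit_calibrate(void)            { return 0; }
static unsigned long pit_hpet_ptimer_calibrate_cpu(void)  { return 2400000; }

/* Early boot: only methods that work before ACPI is initialized. */
static unsigned long native_calibrate_cpu_early(void)
{
        unsigned long khz = cpu_khz_from_cpuid();

        if (!khz)
                khz = cpu_khz_from_msr();
        if (!khz)
                khz = quick_pit_calibrate();
        return khz;
}

/* Late boot: fall back to the slow PIT/HPET/PM-timer calibration. */
static unsigned long native_calibrate_cpu(void)
{
        unsigned long khz = native_calibrate_cpu_early();

        return khz ? khz : pit_hpet_ptimer_calibrate_cpu();
}

int main(void)
{
        printf("calibrated: %lu kHz\n", native_calibrate_cpu());
        return 0;
}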
Signed-off-by: Pavel Tatashin Signed-off-by: Thomas Gleixner Cc: steven.sistare@oracle.com Cc: daniel.m.jordan@oracle.com Cc: linux@armlinux.org.uk Cc: schwidefsky@de.ibm.com Cc: heiko.carstens@de.ibm.com Cc: john.stultz@linaro.org Cc: sboyd@codeaurora.org Cc: hpa@zytor.com Cc: douly.fnst@cn.fujitsu.com Cc: peterz@infradead.org Cc: prarit@redhat.com Cc: feng.tang@intel.com Cc: pmladek@suse.com Cc: gnomes@lxorguk.ukuu.org.uk Cc: linux-s390@vger.kernel.org Cc: boris.ostrovsky@oracle.com Cc: jgross@suse.com Cc: pbonzini@redhat.com Link: https://lkml.kernel.org/r/20180719205545.16512-26-pasha.tatashin@oracle.com --- arch/x86/include/asm/tsc.h | 1 + arch/x86/kernel/tsc.c | 54 ++++++++++++++++++++++++++++++---------------- 2 files changed, 37 insertions(+), 18 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/include/asm/tsc.h b/arch/x86/include/asm/tsc.h index c4368ff73652..88140e4f2292 100644 --- a/arch/x86/include/asm/tsc.h +++ b/arch/x86/include/asm/tsc.h @@ -40,6 +40,7 @@ extern int unsynchronized_tsc(void); extern int check_tsc_unstable(void); extern void mark_tsc_async_resets(char *reason); extern unsigned long native_calibrate_cpu(void); +extern unsigned long native_calibrate_cpu_early(void); extern unsigned long native_calibrate_tsc(void); extern unsigned long long native_sched_clock_from_tsc(u64 tsc); diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c index 9277ae9b68b3..60586779b02c 100644 --- a/arch/x86/kernel/tsc.c +++ b/arch/x86/kernel/tsc.c @@ -680,30 +680,17 @@ static unsigned long cpu_khz_from_cpuid(void) return eax_base_mhz * 1000; } -/** - * native_calibrate_cpu - calibrate the cpu on boot +/* + * calibrate cpu using pit, hpet, and ptimer methods. They are available + * later in boot after acpi is initialized. */ -unsigned long native_calibrate_cpu(void) +static unsigned long pit_hpet_ptimer_calibrate_cpu(void) { u64 tsc1, tsc2, delta, ref1, ref2; unsigned long tsc_pit_min = ULONG_MAX, tsc_ref_min = ULONG_MAX; - unsigned long flags, latch, ms, fast_calibrate; + unsigned long flags, latch, ms; int hpet = is_hpet_enabled(), i, loopmin; - fast_calibrate = cpu_khz_from_cpuid(); - if (fast_calibrate) - return fast_calibrate; - - fast_calibrate = cpu_khz_from_msr(); - if (fast_calibrate) - return fast_calibrate; - - local_irq_save(flags); - fast_calibrate = quick_pit_calibrate(); - local_irq_restore(flags); - if (fast_calibrate) - return fast_calibrate; - /* * Run 5 calibration loops to get the lowest frequency value * (the best estimate). 
We use two different calibration modes @@ -846,6 +833,37 @@ unsigned long native_calibrate_cpu(void) return tsc_pit_min; } +/** + * native_calibrate_cpu_early - can calibrate the cpu early in boot + */ +unsigned long native_calibrate_cpu_early(void) +{ + unsigned long flags, fast_calibrate = cpu_khz_from_cpuid(); + + if (!fast_calibrate) + fast_calibrate = cpu_khz_from_msr(); + if (!fast_calibrate) { + local_irq_save(flags); + fast_calibrate = quick_pit_calibrate(); + local_irq_restore(flags); + } + return fast_calibrate; +} + + +/** + * native_calibrate_cpu - calibrate the cpu + */ +unsigned long native_calibrate_cpu(void) +{ + unsigned long tsc_freq = native_calibrate_cpu_early(); + + if (!tsc_freq) + tsc_freq = pit_hpet_ptimer_calibrate_cpu(); + + return tsc_freq; +} + void recalibrate_cpu_khz(void) { #ifndef CONFIG_SMP -- cgit v1.2.3 From 8dbe438589f373544a1af8b4a859e4da853c0f90 Mon Sep 17 00:00:00 2001 From: Pavel Tatashin Date: Thu, 19 Jul 2018 16:55:45 -0400 Subject: x86/tsc: Make use of tsc_calibrate_cpu_early() During early boot enable tsc_calibrate_cpu_early() and switch to tsc_calibrate_cpu() only later. Do this unconditionally, because it is unknown what methods other cpus will use to calibrate once they are onlined. If by the time tsc_init() is called tsc frequency is still unknown do only pit_hpet_ptimer_calibrate_cpu() to calibrate, as this function contains the only methods wich have not been called and tried earlier. Signed-off-by: Pavel Tatashin Signed-off-by: Thomas Gleixner Cc: steven.sistare@oracle.com Cc: daniel.m.jordan@oracle.com Cc: linux@armlinux.org.uk Cc: schwidefsky@de.ibm.com Cc: heiko.carstens@de.ibm.com Cc: john.stultz@linaro.org Cc: sboyd@codeaurora.org Cc: hpa@zytor.com Cc: douly.fnst@cn.fujitsu.com Cc: peterz@infradead.org Cc: prarit@redhat.com Cc: feng.tang@intel.com Cc: pmladek@suse.com Cc: gnomes@lxorguk.ukuu.org.uk Cc: linux-s390@vger.kernel.org Cc: boris.ostrovsky@oracle.com Cc: jgross@suse.com Cc: pbonzini@redhat.com Link: https://lkml.kernel.org/r/20180719205545.16512-27-pasha.tatashin@oracle.com --- arch/x86/include/asm/tsc.h | 1 - arch/x86/kernel/tsc.c | 25 +++++++++++++++++++------ arch/x86/kernel/x86_init.c | 2 +- 3 files changed, 20 insertions(+), 8 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/include/asm/tsc.h b/arch/x86/include/asm/tsc.h index 88140e4f2292..eb5bbfeccb66 100644 --- a/arch/x86/include/asm/tsc.h +++ b/arch/x86/include/asm/tsc.h @@ -39,7 +39,6 @@ extern void mark_tsc_unstable(char *reason); extern int unsynchronized_tsc(void); extern int check_tsc_unstable(void); extern void mark_tsc_async_resets(char *reason); -extern unsigned long native_calibrate_cpu(void); extern unsigned long native_calibrate_cpu_early(void); extern unsigned long native_calibrate_tsc(void); extern unsigned long long native_sched_clock_from_tsc(u64 tsc); diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c index 60586779b02c..02e416b87ac1 100644 --- a/arch/x86/kernel/tsc.c +++ b/arch/x86/kernel/tsc.c @@ -854,7 +854,7 @@ unsigned long native_calibrate_cpu_early(void) /** * native_calibrate_cpu - calibrate the cpu */ -unsigned long native_calibrate_cpu(void) +static unsigned long native_calibrate_cpu(void) { unsigned long tsc_freq = native_calibrate_cpu_early(); @@ -1374,13 +1374,19 @@ unreg: */ device_initcall(init_tsc_clocksource); -static bool __init determine_cpu_tsc_frequencies(void) +static bool __init determine_cpu_tsc_frequencies(bool early) { /* Make sure that cpu and tsc are not already calibrated */ WARN_ON(cpu_khz || 
tsc_khz); - cpu_khz = x86_platform.calibrate_cpu(); - tsc_khz = x86_platform.calibrate_tsc(); + if (early) { + cpu_khz = x86_platform.calibrate_cpu(); + tsc_khz = x86_platform.calibrate_tsc(); + } else { + /* We should not be here with non-native cpu calibration */ + WARN_ON(x86_platform.calibrate_cpu != native_calibrate_cpu); + cpu_khz = pit_hpet_ptimer_calibrate_cpu(); + } /* * Trust non-zero tsc_khz as authorative, @@ -1419,7 +1425,7 @@ void __init tsc_early_init(void) { if (!boot_cpu_has(X86_FEATURE_TSC)) return; - if (!determine_cpu_tsc_frequencies()) + if (!determine_cpu_tsc_frequencies(true)) return; loops_per_jiffy = get_loops_per_jiffy(); @@ -1431,6 +1437,13 @@ void __init tsc_early_init(void) void __init tsc_init(void) { + /* + * native_calibrate_cpu_early can only calibrate using methods that are + * available early in boot. + */ + if (x86_platform.calibrate_cpu == native_calibrate_cpu_early) + x86_platform.calibrate_cpu = native_calibrate_cpu; + if (!boot_cpu_has(X86_FEATURE_TSC)) { setup_clear_cpu_cap(X86_FEATURE_TSC_DEADLINE_TIMER); return; @@ -1438,7 +1451,7 @@ void __init tsc_init(void) if (!tsc_khz) { /* We failed to determine frequencies earlier, try again */ - if (!determine_cpu_tsc_frequencies()) { + if (!determine_cpu_tsc_frequencies(false)) { mark_tsc_unstable("could not calculate TSC khz"); setup_clear_cpu_cap(X86_FEATURE_TSC_DEADLINE_TIMER); return; diff --git a/arch/x86/kernel/x86_init.c b/arch/x86/kernel/x86_init.c index 3ab867603e81..2792b5573818 100644 --- a/arch/x86/kernel/x86_init.c +++ b/arch/x86/kernel/x86_init.c @@ -109,7 +109,7 @@ struct x86_cpuinit_ops x86_cpuinit = { static void default_nmi_init(void) { }; struct x86_platform_ops x86_platform __ro_after_init = { - .calibrate_cpu = native_calibrate_cpu, + .calibrate_cpu = native_calibrate_cpu_early, .calibrate_tsc = native_calibrate_tsc, .get_wallclock = mach_get_cmos_time, .set_wallclock = mach_set_rtc_mmss, -- cgit v1.2.3 From 608008a45798fe9e2aee04f99b5270ea57c1376f Mon Sep 17 00:00:00 2001 From: Dou Liyang Date: Mon, 30 Jul 2018 15:54:20 +0800 Subject: x86/tsc: Consolidate init code Split out suplicated code from tsc_early_init() and tsc_init() into a common helper and fixup some comment typos. [ tglx: Massaged changelog and renamed function ] Signed-off-by: Dou Liyang Signed-off-by: Thomas Gleixner Reviewed-by: Pavel Tatashin Cc: Cc: Peter Zijlstra Cc: "H. Peter Anvin" Link: https://lkml.kernel.org/r/20180730075421.22830-2-douly.fnst@cn.fujitsu.com --- arch/x86/kernel/tsc.c | 22 ++++++++++++---------- 1 file changed, 12 insertions(+), 10 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c index 02e416b87ac1..1463468ba9a0 100644 --- a/arch/x86/kernel/tsc.c +++ b/arch/x86/kernel/tsc.c @@ -182,7 +182,7 @@ static void __init cyc2ns_init_boot_cpu(void) } /* - * Secondary CPUs do not run through cyc2ns_init(), so set up + * Secondary CPUs do not run through tsc_init(), so set up * all the scale factors for all CPUs, assuming the same * speed as the bootup CPU. (cpufreq notifiers will fix this * up if their speed diverges) @@ -1389,7 +1389,7 @@ static bool __init determine_cpu_tsc_frequencies(bool early) } /* - * Trust non-zero tsc_khz as authorative, + * Trust non-zero tsc_khz as authoritative, * and use it to sanity check cpu_khz, * which will be off if system timer is off. 
*/ @@ -1421,6 +1421,14 @@ static unsigned long __init get_loops_per_jiffy(void) return lpj; } +static void __init tsc_enable_sched_clock(void) +{ + /* Sanitize TSC ADJUST before cyc2ns gets initialized */ + tsc_store_and_check_tsc_adjust(true); + cyc2ns_init_boot_cpu(); + static_branch_enable(&__use_tsc); +} + void __init tsc_early_init(void) { if (!boot_cpu_has(X86_FEATURE_TSC)) @@ -1429,10 +1437,7 @@ void __init tsc_early_init(void) return; loops_per_jiffy = get_loops_per_jiffy(); - /* Sanitize TSC ADJUST before cyc2ns gets initialized */ - tsc_store_and_check_tsc_adjust(true); - cyc2ns_init_boot_cpu(); - static_branch_enable(&__use_tsc); + tsc_enable_sched_clock(); } void __init tsc_init(void) @@ -1456,13 +1461,10 @@ void __init tsc_init(void) setup_clear_cpu_cap(X86_FEATURE_TSC_DEADLINE_TIMER); return; } - /* Sanitize TSC ADJUST before cyc2ns gets initialized */ - tsc_store_and_check_tsc_adjust(true); - cyc2ns_init_boot_cpu(); + tsc_enable_sched_clock(); } cyc2ns_init_secondary_cpus(); - static_branch_enable(&__use_tsc); if (!no_sched_irq_time) enable_sched_clock_irqtime(); -- cgit v1.2.3 From 1088c6eef261939bda8346ec35b513790a2111d5 Mon Sep 17 00:00:00 2001 From: Dou Liyang Date: Mon, 30 Jul 2018 15:54:21 +0800 Subject: x86/kvmclock: Mark kvm_get_preset_lpj() as __init MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit kvm_get_preset_lpj() is only called from kvmclock_init(), so mark it __init as well. Signed-off-by: Dou Liyang Signed-off-by: Thomas Gleixner Reviewed-by: Pavel Tatashin Cc: Cc: Paolo Bonzini Cc: Radim Krčmář Cc: Cc: "H. Peter Anvin" Cc: "Radim Krčmář" Cc: kvm@vger.kernel.org Link: https://lkml.kernel.org/r/20180730075421.22830-3-douly.fnst@cn.fujitsu.com --- arch/x86/kernel/kvmclock.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c index 91b94c0ae4e3..d2edd7e6c294 100644 --- a/arch/x86/kernel/kvmclock.c +++ b/arch/x86/kernel/kvmclock.c @@ -145,7 +145,7 @@ static unsigned long kvm_get_tsc_khz(void) return pvclock_tsc_khz(this_cpu_pvti()); } -static void kvm_get_preset_lpj(void) +static void __init kvm_get_preset_lpj(void) { unsigned long khz; u64 lpj; -- cgit v1.2.3
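(Editorial footnote, not part of any patch above: both get_loops_per_jiffy() in the TSC rework and kvm_get_preset_lpj() preset loops_per_jiffy as the number of TSC cycles per timer tick, i.e. tsc_khz * 1000 / HZ. A quick standalone check of that arithmetic, with an assumed 2.4 GHz TSC and an assumed CONFIG_HZ of 250:)

#include <stdio.h>

int main(void)
{
        unsigned long long tsc_khz = 2400000;  /* assumed 2.4 GHz */
        unsigned long long hz = 250;           /* assumed CONFIG_HZ */
        unsigned long long lpj = tsc_khz * 1000 / hz;

        printf("loops_per_jiffy = %llu\n", lpj);        /* 9600000 */
        return 0;
}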