diff options
author | Linus Torvalds <torvalds@linux-foundation.org> | 2018-08-14 04:28:19 +0300 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2018-08-14 04:28:19 +0300 |
commit | 13e091b6dd0e78a518a7d8756607d3acb8215768 (patch) | |
tree | ab7b6eef8d53008602be8dd5966655816abfeda6 /arch/x86/kernel | |
parent | eac341194426ba7ead3444923b9eba491ae4feeb (diff) | |
parent | 1088c6eef261939bda8346ec35b513790a2111d5 (diff) | |
download | linux-13e091b6dd0e78a518a7d8756607d3acb8215768.tar.xz |
Merge branch 'x86-timers-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip
Pull x86 timer updates from Thomas Gleixner:
"Early TSC based time stamping to allow better boot time analysis.
This comes with a general cleanup of the TSC calibration code which
grew warts and duct taping over the years and removes 250 lines of
code. Initiated and mostly implemented by Pavel with help from various
folks"
* 'x86-timers-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: (37 commits)
x86/kvmclock: Mark kvm_get_preset_lpj() as __init
x86/tsc: Consolidate init code
sched/clock: Disable interrupts when calling generic_sched_clock_init()
timekeeping: Prevent false warning when persistent clock is not available
sched/clock: Close a hole in sched_clock_init()
x86/tsc: Make use of tsc_calibrate_cpu_early()
x86/tsc: Split native_calibrate_cpu() into early and late parts
sched/clock: Use static key for sched_clock_running
sched/clock: Enable sched clock early
sched/clock: Move sched clock initialization and merge with generic clock
x86/tsc: Use TSC as sched clock early
x86/tsc: Initialize cyc2ns when tsc frequency is determined
x86/tsc: Calibrate tsc only once
ARM/time: Remove read_boot_clock64()
s390/time: Remove read_boot_clock64()
timekeeping: Default boot time offset to local_clock()
timekeeping: Replace read_boot_clock64() with read_persistent_wall_and_boot_offset()
s390/time: Add read_persistent_wall_and_boot_offset()
x86/xen/time: Output xen sched_clock time from 0
x86/xen/time: Initialize pv xen time in init_hypervisor_platform()
...
Diffstat (limited to 'arch/x86/kernel')
-rw-r--r-- | arch/x86/kernel/alternative.c | 7 | ||||
-rw-r--r-- | arch/x86/kernel/cpu/amd.c | 13 | ||||
-rw-r--r-- | arch/x86/kernel/cpu/common.c | 40 | ||||
-rw-r--r-- | arch/x86/kernel/jump_label.c | 11 | ||||
-rw-r--r-- | arch/x86/kernel/kvm.c | 14 | ||||
-rw-r--r-- | arch/x86/kernel/kvmclock.c | 258 | ||||
-rw-r--r-- | arch/x86/kernel/setup.c | 10 | ||||
-rw-r--r-- | arch/x86/kernel/tsc.c | 259 | ||||
-rw-r--r-- | arch/x86/kernel/tsc_msr.c | 96 | ||||
-rw-r--r-- | arch/x86/kernel/x86_init.c | 2 |
10 files changed, 350 insertions, 360 deletions
diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c index a481763a3776..014f214da581 100644 --- a/arch/x86/kernel/alternative.c +++ b/arch/x86/kernel/alternative.c @@ -668,6 +668,7 @@ void *__init_or_module text_poke_early(void *addr, const void *opcode, local_irq_save(flags); memcpy(addr, opcode, len); local_irq_restore(flags); + sync_core(); /* Could also do a CLFLUSH here to speed up CPU recovery; but that causes hangs on some VIA CPUs. */ return addr; @@ -693,6 +694,12 @@ void *text_poke(void *addr, const void *opcode, size_t len) struct page *pages[2]; int i; + /* + * While boot memory allocator is running we cannot use struct + * pages as they are not yet initialized. + */ + BUG_ON(!after_bootmem); + if (!core_kernel_text((unsigned long)addr)) { pages[0] = vmalloc_to_page(addr); pages[1] = vmalloc_to_page(addr + PAGE_SIZE); diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c index 38915fbfae73..b732438c1a1e 100644 --- a/arch/x86/kernel/cpu/amd.c +++ b/arch/x86/kernel/cpu/amd.c @@ -232,8 +232,6 @@ static void init_amd_k7(struct cpuinfo_x86 *c) } } - set_cpu_cap(c, X86_FEATURE_K7); - /* calling is from identify_secondary_cpu() ? 
*/ if (!c->cpu_index) return; @@ -617,6 +615,14 @@ static void early_init_amd(struct cpuinfo_x86 *c) early_init_amd_mc(c); +#ifdef CONFIG_X86_32 + if (c->x86 == 6) + set_cpu_cap(c, X86_FEATURE_K7); +#endif + + if (c->x86 >= 0xf) + set_cpu_cap(c, X86_FEATURE_K8); + rdmsr_safe(MSR_AMD64_PATCH_LEVEL, &c->microcode, &dummy); /* @@ -863,9 +869,6 @@ static void init_amd(struct cpuinfo_x86 *c) init_amd_cacheinfo(c); - if (c->x86 >= 0xf) - set_cpu_cap(c, X86_FEATURE_K8); - if (cpu_has(c, X86_FEATURE_XMM2)) { unsigned long long val; int ret; diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index df28e931d732..ba6b8bb1c036 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -1019,6 +1019,24 @@ static void __init cpu_set_bug_bits(struct cpuinfo_x86 *c) } /* + * The NOPL instruction is supposed to exist on all CPUs of family >= 6; + * unfortunately, that's not true in practice because of early VIA + * chips and (more importantly) broken virtualizers that are not easy + * to detect. In the latter case it doesn't even *fail* reliably, so + * probing for it doesn't even work. Disable it completely on 32-bit + * unless we can find a reliable way to detect all the broken cases. + * Enable it explicitly on 64-bit for non-constant inputs of cpu_has(). + */ +static void detect_nopl(void) +{ +#ifdef CONFIG_X86_32 + setup_clear_cpu_cap(X86_FEATURE_NOPL); +#else + setup_force_cpu_cap(X86_FEATURE_NOPL); +#endif +} + +/* * Do minimum CPU detection early. * Fields really needed: vendor, cpuid_level, family, model, mask, * cache alignment. 
@@ -1092,6 +1110,8 @@ static void __init early_identify_cpu(struct cpuinfo_x86 *c) */ if (!pgtable_l5_enabled()) setup_clear_cpu_cap(X86_FEATURE_LA57); + + detect_nopl(); } void __init early_cpu_init(void) @@ -1127,24 +1147,6 @@ void __init early_cpu_init(void) early_identify_cpu(&boot_cpu_data); } -/* - * The NOPL instruction is supposed to exist on all CPUs of family >= 6; - * unfortunately, that's not true in practice because of early VIA - * chips and (more importantly) broken virtualizers that are not easy - * to detect. In the latter case it doesn't even *fail* reliably, so - * probing for it doesn't even work. Disable it completely on 32-bit - * unless we can find a reliable way to detect all the broken cases. - * Enable it explicitly on 64-bit for non-constant inputs of cpu_has(). - */ -static void detect_nopl(struct cpuinfo_x86 *c) -{ -#ifdef CONFIG_X86_32 - clear_cpu_cap(c, X86_FEATURE_NOPL); -#else - set_cpu_cap(c, X86_FEATURE_NOPL); -#endif -} - static void detect_null_seg_behavior(struct cpuinfo_x86 *c) { #ifdef CONFIG_X86_64 @@ -1207,8 +1209,6 @@ static void generic_identify(struct cpuinfo_x86 *c) get_model_name(c); /* Default name */ - detect_nopl(c); - detect_null_seg_behavior(c); /* diff --git a/arch/x86/kernel/jump_label.c b/arch/x86/kernel/jump_label.c index e56c95be2808..eeea935e9bb5 100644 --- a/arch/x86/kernel/jump_label.c +++ b/arch/x86/kernel/jump_label.c @@ -37,15 +37,18 @@ static void bug_at(unsigned char *ip, int line) BUG(); } -static void __jump_label_transform(struct jump_entry *entry, - enum jump_label_type type, - void *(*poker)(void *, const void *, size_t), - int init) +static void __ref __jump_label_transform(struct jump_entry *entry, + enum jump_label_type type, + void *(*poker)(void *, const void *, size_t), + int init) { union jump_code_union code; const unsigned char default_nop[] = { STATIC_KEY_INIT_NOP }; const unsigned char *ideal_nop = ideal_nops[NOP_ATOMIC5]; + if (early_boot_irqs_disabled) + poker = text_poke_early; + if 
(type == JUMP_LABEL_JMP) { if (init) { /* diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c index a37bda38d205..09aaabb2bbf1 100644 --- a/arch/x86/kernel/kvm.c +++ b/arch/x86/kernel/kvm.c @@ -45,7 +45,6 @@ #include <asm/apic.h> #include <asm/apicdef.h> #include <asm/hypervisor.h> -#include <asm/kvm_guest.h> static int kvmapf = 1; @@ -66,15 +65,6 @@ static int __init parse_no_stealacc(char *arg) early_param("no-steal-acc", parse_no_stealacc); -static int kvmclock_vsyscall = 1; -static int __init parse_no_kvmclock_vsyscall(char *arg) -{ - kvmclock_vsyscall = 0; - return 0; -} - -early_param("no-kvmclock-vsyscall", parse_no_kvmclock_vsyscall); - static DEFINE_PER_CPU_DECRYPTED(struct kvm_vcpu_pv_apf_data, apf_reason) __aligned(64); static DEFINE_PER_CPU_DECRYPTED(struct kvm_steal_time, steal_time) __aligned(64); static int has_steal_clock = 0; @@ -560,9 +550,6 @@ static void __init kvm_guest_init(void) if (kvm_para_has_feature(KVM_FEATURE_PV_EOI)) apic_set_eoi_write(kvm_guest_apic_eoi_write); - if (kvmclock_vsyscall) - kvm_setup_vsyscall_timeinfo(); - #ifdef CONFIG_SMP smp_ops.smp_prepare_cpus = kvm_smp_prepare_cpus; smp_ops.smp_prepare_boot_cpu = kvm_smp_prepare_boot_cpu; @@ -628,6 +615,7 @@ const __initconst struct hypervisor_x86 x86_hyper_kvm = { .name = "KVM", .detect = kvm_detect, .type = X86_HYPER_KVM, + .init.init_platform = kvmclock_init, .init.guest_late_init = kvm_guest_init, .init.x2apic_available = kvm_para_available, }; diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c index 3b8e7c13c614..d2edd7e6c294 100644 --- a/arch/x86/kernel/kvmclock.c +++ b/arch/x86/kernel/kvmclock.c @@ -23,30 +23,56 @@ #include <asm/apic.h> #include <linux/percpu.h> #include <linux/hardirq.h> -#include <linux/memblock.h> +#include <linux/cpuhotplug.h> #include <linux/sched.h> #include <linux/sched/clock.h> +#include <linux/mm.h> +#include <asm/hypervisor.h> #include <asm/mem_encrypt.h> #include <asm/x86_init.h> #include <asm/reboot.h> #include 
<asm/kvmclock.h> -static int kvmclock __ro_after_init = 1; -static int msr_kvm_system_time = MSR_KVM_SYSTEM_TIME; -static int msr_kvm_wall_clock = MSR_KVM_WALL_CLOCK; -static u64 kvm_sched_clock_offset; +static int kvmclock __initdata = 1; +static int kvmclock_vsyscall __initdata = 1; +static int msr_kvm_system_time __ro_after_init = MSR_KVM_SYSTEM_TIME; +static int msr_kvm_wall_clock __ro_after_init = MSR_KVM_WALL_CLOCK; +static u64 kvm_sched_clock_offset __ro_after_init; -static int parse_no_kvmclock(char *arg) +static int __init parse_no_kvmclock(char *arg) { kvmclock = 0; return 0; } early_param("no-kvmclock", parse_no_kvmclock); -/* The hypervisor will put information about time periodically here */ -static struct pvclock_vsyscall_time_info *hv_clock; -static struct pvclock_wall_clock *wall_clock; +static int __init parse_no_kvmclock_vsyscall(char *arg) +{ + kvmclock_vsyscall = 0; + return 0; +} +early_param("no-kvmclock-vsyscall", parse_no_kvmclock_vsyscall); + +/* Aligned to page sizes to match whats mapped via vsyscalls to userspace */ +#define HV_CLOCK_SIZE (sizeof(struct pvclock_vsyscall_time_info) * NR_CPUS) +#define HVC_BOOT_ARRAY_SIZE \ + (PAGE_SIZE / sizeof(struct pvclock_vsyscall_time_info)) + +static struct pvclock_vsyscall_time_info + hv_clock_boot[HVC_BOOT_ARRAY_SIZE] __aligned(PAGE_SIZE); +static struct pvclock_wall_clock wall_clock; +static DEFINE_PER_CPU(struct pvclock_vsyscall_time_info *, hv_clock_per_cpu); + +static inline struct pvclock_vcpu_time_info *this_cpu_pvti(void) +{ + return &this_cpu_read(hv_clock_per_cpu)->pvti; +} + +static inline struct pvclock_vsyscall_time_info *this_cpu_hvclock(void) +{ + return this_cpu_read(hv_clock_per_cpu); +} /* * The wallclock is the time of day when we booted. 
Since then, some time may @@ -55,21 +81,10 @@ static struct pvclock_wall_clock *wall_clock; */ static void kvm_get_wallclock(struct timespec64 *now) { - struct pvclock_vcpu_time_info *vcpu_time; - int low, high; - int cpu; - - low = (int)slow_virt_to_phys(wall_clock); - high = ((u64)slow_virt_to_phys(wall_clock) >> 32); - - native_write_msr(msr_kvm_wall_clock, low, high); - - cpu = get_cpu(); - - vcpu_time = &hv_clock[cpu].pvti; - pvclock_read_wallclock(wall_clock, vcpu_time, now); - - put_cpu(); + wrmsrl(msr_kvm_wall_clock, slow_virt_to_phys(&wall_clock)); + preempt_disable(); + pvclock_read_wallclock(&wall_clock, this_cpu_pvti(), now); + preempt_enable(); } static int kvm_set_wallclock(const struct timespec64 *now) @@ -79,14 +94,10 @@ static int kvm_set_wallclock(const struct timespec64 *now) static u64 kvm_clock_read(void) { - struct pvclock_vcpu_time_info *src; u64 ret; - int cpu; preempt_disable_notrace(); - cpu = smp_processor_id(); - src = &hv_clock[cpu].pvti; - ret = pvclock_clocksource_read(src); + ret = pvclock_clocksource_read(this_cpu_pvti()); preempt_enable_notrace(); return ret; } @@ -112,11 +123,11 @@ static inline void kvm_sched_clock_init(bool stable) kvm_sched_clock_offset = kvm_clock_read(); pv_time_ops.sched_clock = kvm_sched_clock_read; - printk(KERN_INFO "kvm-clock: using sched offset of %llu cycles\n", - kvm_sched_clock_offset); + pr_info("kvm-clock: using sched offset of %llu cycles", + kvm_sched_clock_offset); BUILD_BUG_ON(sizeof(kvm_sched_clock_offset) > - sizeof(((struct pvclock_vcpu_time_info *)NULL)->system_time)); + sizeof(((struct pvclock_vcpu_time_info *)NULL)->system_time)); } /* @@ -130,19 +141,11 @@ static inline void kvm_sched_clock_init(bool stable) */ static unsigned long kvm_get_tsc_khz(void) { - struct pvclock_vcpu_time_info *src; - int cpu; - unsigned long tsc_khz; - - cpu = get_cpu(); - src = &hv_clock[cpu].pvti; - tsc_khz = pvclock_tsc_khz(src); - put_cpu(); setup_force_cpu_cap(X86_FEATURE_TSC_KNOWN_FREQ); - return 
tsc_khz; + return pvclock_tsc_khz(this_cpu_pvti()); } -static void kvm_get_preset_lpj(void) +static void __init kvm_get_preset_lpj(void) { unsigned long khz; u64 lpj; @@ -156,49 +159,40 @@ static void kvm_get_preset_lpj(void) bool kvm_check_and_clear_guest_paused(void) { + struct pvclock_vsyscall_time_info *src = this_cpu_hvclock(); bool ret = false; - struct pvclock_vcpu_time_info *src; - int cpu = smp_processor_id(); - if (!hv_clock) + if (!src) return ret; - src = &hv_clock[cpu].pvti; - if ((src->flags & PVCLOCK_GUEST_STOPPED) != 0) { - src->flags &= ~PVCLOCK_GUEST_STOPPED; + if ((src->pvti.flags & PVCLOCK_GUEST_STOPPED) != 0) { + src->pvti.flags &= ~PVCLOCK_GUEST_STOPPED; pvclock_touch_watchdogs(); ret = true; } - return ret; } struct clocksource kvm_clock = { - .name = "kvm-clock", - .read = kvm_clock_get_cycles, - .rating = 400, - .mask = CLOCKSOURCE_MASK(64), - .flags = CLOCK_SOURCE_IS_CONTINUOUS, + .name = "kvm-clock", + .read = kvm_clock_get_cycles, + .rating = 400, + .mask = CLOCKSOURCE_MASK(64), + .flags = CLOCK_SOURCE_IS_CONTINUOUS, }; EXPORT_SYMBOL_GPL(kvm_clock); -int kvm_register_clock(char *txt) +static void kvm_register_clock(char *txt) { - int cpu = smp_processor_id(); - int low, high, ret; - struct pvclock_vcpu_time_info *src; - - if (!hv_clock) - return 0; + struct pvclock_vsyscall_time_info *src = this_cpu_hvclock(); + u64 pa; - src = &hv_clock[cpu].pvti; - low = (int)slow_virt_to_phys(src) | 1; - high = ((u64)slow_virt_to_phys(src) >> 32); - ret = native_write_msr_safe(msr_kvm_system_time, low, high); - printk(KERN_INFO "kvm-clock: cpu %d, msr %x:%x, %s\n", - cpu, high, low, txt); + if (!src) + return; - return ret; + pa = slow_virt_to_phys(&src->pvti) | 0x01ULL; + wrmsrl(msr_kvm_system_time, pa); + pr_info("kvm-clock: cpu %d, msr %llx, %s", smp_processor_id(), pa, txt); } static void kvm_save_sched_clock_state(void) @@ -213,11 +207,7 @@ static void kvm_restore_sched_clock_state(void) #ifdef CONFIG_X86_LOCAL_APIC static void 
kvm_setup_secondary_clock(void) { - /* - * Now that the first cpu already had this clocksource initialized, - * we shouldn't fail. - */ - WARN_ON(kvm_register_clock("secondary cpu clock")); + kvm_register_clock("secondary cpu clock"); } #endif @@ -245,100 +235,84 @@ static void kvm_shutdown(void) native_machine_shutdown(); } -static phys_addr_t __init kvm_memblock_alloc(phys_addr_t size, - phys_addr_t align) +static int __init kvm_setup_vsyscall_timeinfo(void) { - phys_addr_t mem; +#ifdef CONFIG_X86_64 + u8 flags; - mem = memblock_alloc(size, align); - if (!mem) + if (!per_cpu(hv_clock_per_cpu, 0) || !kvmclock_vsyscall) return 0; - if (sev_active()) { - if (early_set_memory_decrypted((unsigned long)__va(mem), size)) - goto e_free; - } + flags = pvclock_read_flags(&hv_clock_boot[0].pvti); + if (!(flags & PVCLOCK_TSC_STABLE_BIT)) + return 0; - return mem; -e_free: - memblock_free(mem, size); + kvm_clock.archdata.vclock_mode = VCLOCK_PVCLOCK; +#endif return 0; } +early_initcall(kvm_setup_vsyscall_timeinfo); -static void __init kvm_memblock_free(phys_addr_t addr, phys_addr_t size) +static int kvmclock_setup_percpu(unsigned int cpu) { - if (sev_active()) - early_set_memory_encrypted((unsigned long)__va(addr), size); + struct pvclock_vsyscall_time_info *p = per_cpu(hv_clock_per_cpu, cpu); - memblock_free(addr, size); + /* + * The per cpu area setup replicates CPU0 data to all cpu + * pointers. So carefully check. CPU0 has been set up in init + * already. + */ + if (!cpu || (p && p != per_cpu(hv_clock_per_cpu, 0))) + return 0; + + /* Use the static page for the first CPUs, allocate otherwise */ + if (cpu < HVC_BOOT_ARRAY_SIZE) + p = &hv_clock_boot[cpu]; + else + p = kzalloc(sizeof(*p), GFP_KERNEL); + + per_cpu(hv_clock_per_cpu, cpu) = p; + return p ? 
0 : -ENOMEM; } void __init kvmclock_init(void) { - struct pvclock_vcpu_time_info *vcpu_time; - unsigned long mem, mem_wall_clock; - int size, cpu, wall_clock_size; u8 flags; - size = PAGE_ALIGN(sizeof(struct pvclock_vsyscall_time_info)*NR_CPUS); - - if (!kvm_para_available()) + if (!kvm_para_available() || !kvmclock) return; - if (kvmclock && kvm_para_has_feature(KVM_FEATURE_CLOCKSOURCE2)) { + if (kvm_para_has_feature(KVM_FEATURE_CLOCKSOURCE2)) { msr_kvm_system_time = MSR_KVM_SYSTEM_TIME_NEW; msr_kvm_wall_clock = MSR_KVM_WALL_CLOCK_NEW; - } else if (!(kvmclock && kvm_para_has_feature(KVM_FEATURE_CLOCKSOURCE))) - return; - - wall_clock_size = PAGE_ALIGN(sizeof(struct pvclock_wall_clock)); - mem_wall_clock = kvm_memblock_alloc(wall_clock_size, PAGE_SIZE); - if (!mem_wall_clock) - return; - - wall_clock = __va(mem_wall_clock); - memset(wall_clock, 0, wall_clock_size); - - mem = kvm_memblock_alloc(size, PAGE_SIZE); - if (!mem) { - kvm_memblock_free(mem_wall_clock, wall_clock_size); - wall_clock = NULL; + } else if (!kvm_para_has_feature(KVM_FEATURE_CLOCKSOURCE)) { return; } - hv_clock = __va(mem); - memset(hv_clock, 0, size); - - if (kvm_register_clock("primary cpu clock")) { - hv_clock = NULL; - kvm_memblock_free(mem, size); - kvm_memblock_free(mem_wall_clock, wall_clock_size); - wall_clock = NULL; + if (cpuhp_setup_state(CPUHP_BP_PREPARE_DYN, "kvmclock:setup_percpu", + kvmclock_setup_percpu, NULL) < 0) { return; } - printk(KERN_INFO "kvm-clock: Using msrs %x and %x", + pr_info("kvm-clock: Using msrs %x and %x", msr_kvm_system_time, msr_kvm_wall_clock); - pvclock_set_pvti_cpu0_va(hv_clock); + this_cpu_write(hv_clock_per_cpu, &hv_clock_boot[0]); + kvm_register_clock("primary cpu clock"); + pvclock_set_pvti_cpu0_va(hv_clock_boot); if (kvm_para_has_feature(KVM_FEATURE_CLOCKSOURCE_STABLE_BIT)) pvclock_set_flags(PVCLOCK_TSC_STABLE_BIT); - cpu = get_cpu(); - vcpu_time = &hv_clock[cpu].pvti; - flags = pvclock_read_flags(vcpu_time); - + flags = 
pvclock_read_flags(&hv_clock_boot[0].pvti); kvm_sched_clock_init(flags & PVCLOCK_TSC_STABLE_BIT); - put_cpu(); x86_platform.calibrate_tsc = kvm_get_tsc_khz; x86_platform.calibrate_cpu = kvm_get_tsc_khz; x86_platform.get_wallclock = kvm_get_wallclock; x86_platform.set_wallclock = kvm_set_wallclock; #ifdef CONFIG_X86_LOCAL_APIC - x86_cpuinit.early_percpu_clock_init = - kvm_setup_secondary_clock; + x86_cpuinit.early_percpu_clock_init = kvm_setup_secondary_clock; #endif x86_platform.save_sched_clock_state = kvm_save_sched_clock_state; x86_platform.restore_sched_clock_state = kvm_restore_sched_clock_state; @@ -350,31 +324,3 @@ void __init kvmclock_init(void) clocksource_register_hz(&kvm_clock, NSEC_PER_SEC); pv_info.name = "KVM"; } - -int __init kvm_setup_vsyscall_timeinfo(void) -{ -#ifdef CONFIG_X86_64 - int cpu; - u8 flags; - struct pvclock_vcpu_time_info *vcpu_time; - unsigned int size; - - if (!hv_clock) - return 0; - - size = PAGE_ALIGN(sizeof(struct pvclock_vsyscall_time_info)*NR_CPUS); - - cpu = get_cpu(); - - vcpu_time = &hv_clock[cpu].pvti; - flags = pvclock_read_flags(vcpu_time); - - put_cpu(); - - if (!(flags & PVCLOCK_TSC_STABLE_BIT)) - return 1; - - kvm_clock.archdata.vclock_mode = VCLOCK_PVCLOCK; -#endif - return 0; -} diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index 2f86d883dd95..5d32c55aeb8b 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -866,6 +866,8 @@ void __init setup_arch(char **cmdline_p) idt_setup_early_traps(); early_cpu_init(); + arch_init_ideal_nops(); + jump_label_init(); early_ioremap_init(); setup_olpc_ofw_pgd(); @@ -1012,6 +1014,7 @@ void __init setup_arch(char **cmdline_p) */ init_hypervisor_platform(); + tsc_early_init(); x86_init.resources.probe_roms(); /* after parse_early_param, so could debug it */ @@ -1197,11 +1200,6 @@ void __init setup_arch(char **cmdline_p) memblock_find_dma_reserve(); -#ifdef CONFIG_KVM_GUEST - kvmclock_init(); -#endif - - tsc_early_delay_calibrate(); if 
(!early_xdbc_setup_hardware()) early_xdbc_register_console(); @@ -1272,8 +1270,6 @@ void __init setup_arch(char **cmdline_p) mcheck_init(); - arch_init_ideal_nops(); - register_refined_jiffies(CLOCK_TICK_RATE); #ifdef CONFIG_EFI diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c index 74392d9d51e0..1463468ba9a0 100644 --- a/arch/x86/kernel/tsc.c +++ b/arch/x86/kernel/tsc.c @@ -33,16 +33,13 @@ EXPORT_SYMBOL(cpu_khz); unsigned int __read_mostly tsc_khz; EXPORT_SYMBOL(tsc_khz); +#define KHZ 1000 + /* * TSC can be unstable due to cpufreq or due to unsynced TSCs */ static int __read_mostly tsc_unstable; -/* native_sched_clock() is called before tsc_init(), so - we must start with the TSC soft disabled to prevent - erroneous rdtsc usage on !boot_cpu_has(X86_FEATURE_TSC) processors */ -static int __read_mostly tsc_disabled = -1; - static DEFINE_STATIC_KEY_FALSE(__use_tsc); int tsc_clocksource_reliable; @@ -106,23 +103,6 @@ void cyc2ns_read_end(void) * -johnstul@us.ibm.com "math is hard, lets go shopping!" 
*/ -static void cyc2ns_data_init(struct cyc2ns_data *data) -{ - data->cyc2ns_mul = 0; - data->cyc2ns_shift = 0; - data->cyc2ns_offset = 0; -} - -static void __init cyc2ns_init(int cpu) -{ - struct cyc2ns *c2n = &per_cpu(cyc2ns, cpu); - - cyc2ns_data_init(&c2n->data[0]); - cyc2ns_data_init(&c2n->data[1]); - - seqcount_init(&c2n->seq); -} - static inline unsigned long long cycles_2_ns(unsigned long long cyc) { struct cyc2ns_data data; @@ -138,18 +118,11 @@ static inline unsigned long long cycles_2_ns(unsigned long long cyc) return ns; } -static void set_cyc2ns_scale(unsigned long khz, int cpu, unsigned long long tsc_now) +static void __set_cyc2ns_scale(unsigned long khz, int cpu, unsigned long long tsc_now) { unsigned long long ns_now; struct cyc2ns_data data; struct cyc2ns *c2n; - unsigned long flags; - - local_irq_save(flags); - sched_clock_idle_sleep_event(); - - if (!khz) - goto done; ns_now = cycles_2_ns(tsc_now); @@ -181,13 +154,56 @@ static void set_cyc2ns_scale(unsigned long khz, int cpu, unsigned long long tsc_ c2n->data[0] = data; raw_write_seqcount_latch(&c2n->seq); c2n->data[1] = data; +} + +static void set_cyc2ns_scale(unsigned long khz, int cpu, unsigned long long tsc_now) +{ + unsigned long flags; + + local_irq_save(flags); + sched_clock_idle_sleep_event(); + + if (khz) + __set_cyc2ns_scale(khz, cpu, tsc_now); -done: sched_clock_idle_wakeup_event(); local_irq_restore(flags); } /* + * Initialize cyc2ns for boot cpu + */ +static void __init cyc2ns_init_boot_cpu(void) +{ + struct cyc2ns *c2n = this_cpu_ptr(&cyc2ns); + + seqcount_init(&c2n->seq); + __set_cyc2ns_scale(tsc_khz, smp_processor_id(), rdtsc()); +} + +/* + * Secondary CPUs do not run through tsc_init(), so set up + * all the scale factors for all CPUs, assuming the same + * speed as the bootup CPU. 
(cpufreq notifiers will fix this + * up if their speed diverges) + */ +static void __init cyc2ns_init_secondary_cpus(void) +{ + unsigned int cpu, this_cpu = smp_processor_id(); + struct cyc2ns *c2n = this_cpu_ptr(&cyc2ns); + struct cyc2ns_data *data = c2n->data; + + for_each_possible_cpu(cpu) { + if (cpu != this_cpu) { + seqcount_init(&c2n->seq); + c2n = per_cpu_ptr(&cyc2ns, cpu); + c2n->data[0] = data[0]; + c2n->data[1] = data[1]; + } + } +} + +/* * Scheduler clock - returns current time in nanosec units. */ u64 native_sched_clock(void) @@ -248,8 +264,7 @@ EXPORT_SYMBOL_GPL(check_tsc_unstable); #ifdef CONFIG_X86_TSC int __init notsc_setup(char *str) { - pr_warn("Kernel compiled with CONFIG_X86_TSC, cannot disable TSC completely\n"); - tsc_disabled = 1; + mark_tsc_unstable("boot parameter notsc"); return 1; } #else @@ -665,30 +680,17 @@ static unsigned long cpu_khz_from_cpuid(void) return eax_base_mhz * 1000; } -/** - * native_calibrate_cpu - calibrate the cpu on boot +/* + * calibrate cpu using pit, hpet, and ptimer methods. They are available + * later in boot after acpi is initialized. */ -unsigned long native_calibrate_cpu(void) +static unsigned long pit_hpet_ptimer_calibrate_cpu(void) { u64 tsc1, tsc2, delta, ref1, ref2; unsigned long tsc_pit_min = ULONG_MAX, tsc_ref_min = ULONG_MAX; - unsigned long flags, latch, ms, fast_calibrate; + unsigned long flags, latch, ms; int hpet = is_hpet_enabled(), i, loopmin; - fast_calibrate = cpu_khz_from_cpuid(); - if (fast_calibrate) - return fast_calibrate; - - fast_calibrate = cpu_khz_from_msr(); - if (fast_calibrate) - return fast_calibrate; - - local_irq_save(flags); - fast_calibrate = quick_pit_calibrate(); - local_irq_restore(flags); - if (fast_calibrate) - return fast_calibrate; - /* * Run 5 calibration loops to get the lowest frequency value * (the best estimate). 
We use two different calibration modes @@ -831,6 +833,37 @@ unsigned long native_calibrate_cpu(void) return tsc_pit_min; } +/** + * native_calibrate_cpu_early - can calibrate the cpu early in boot + */ +unsigned long native_calibrate_cpu_early(void) +{ + unsigned long flags, fast_calibrate = cpu_khz_from_cpuid(); + + if (!fast_calibrate) + fast_calibrate = cpu_khz_from_msr(); + if (!fast_calibrate) { + local_irq_save(flags); + fast_calibrate = quick_pit_calibrate(); + local_irq_restore(flags); + } + return fast_calibrate; +} + + +/** + * native_calibrate_cpu - calibrate the cpu + */ +static unsigned long native_calibrate_cpu(void) +{ + unsigned long tsc_freq = native_calibrate_cpu_early(); + + if (!tsc_freq) + tsc_freq = pit_hpet_ptimer_calibrate_cpu(); + + return tsc_freq; +} + void recalibrate_cpu_khz(void) { #ifndef CONFIG_SMP @@ -1307,7 +1340,7 @@ unreg: static int __init init_tsc_clocksource(void) { - if (!boot_cpu_has(X86_FEATURE_TSC) || tsc_disabled > 0 || !tsc_khz) + if (!boot_cpu_has(X86_FEATURE_TSC) || !tsc_khz) return 0; if (tsc_unstable) @@ -1341,40 +1374,22 @@ unreg: */ device_initcall(init_tsc_clocksource); -void __init tsc_early_delay_calibrate(void) +static bool __init determine_cpu_tsc_frequencies(bool early) { - unsigned long lpj; - - if (!boot_cpu_has(X86_FEATURE_TSC)) - return; - - cpu_khz = x86_platform.calibrate_cpu(); - tsc_khz = x86_platform.calibrate_tsc(); - - tsc_khz = tsc_khz ? 
: cpu_khz; - if (!tsc_khz) - return; - - lpj = tsc_khz * 1000; - do_div(lpj, HZ); - loops_per_jiffy = lpj; -} - -void __init tsc_init(void) -{ - u64 lpj, cyc; - int cpu; - - if (!boot_cpu_has(X86_FEATURE_TSC)) { - setup_clear_cpu_cap(X86_FEATURE_TSC_DEADLINE_TIMER); - return; + /* Make sure that cpu and tsc are not already calibrated */ + WARN_ON(cpu_khz || tsc_khz); + + if (early) { + cpu_khz = x86_platform.calibrate_cpu(); + tsc_khz = x86_platform.calibrate_tsc(); + } else { + /* We should not be here with non-native cpu calibration */ + WARN_ON(x86_platform.calibrate_cpu != native_calibrate_cpu); + cpu_khz = pit_hpet_ptimer_calibrate_cpu(); } - cpu_khz = x86_platform.calibrate_cpu(); - tsc_khz = x86_platform.calibrate_tsc(); - /* - * Trust non-zero tsc_khz as authorative, + * Trust non-zero tsc_khz as authoritative, * and use it to sanity check cpu_khz, * which will be off if system timer is off. */ @@ -1383,52 +1398,78 @@ void __init tsc_init(void) else if (abs(cpu_khz - tsc_khz) * 10 > tsc_khz) cpu_khz = tsc_khz; - if (!tsc_khz) { - mark_tsc_unstable("could not calculate TSC khz"); - setup_clear_cpu_cap(X86_FEATURE_TSC_DEADLINE_TIMER); - return; - } + if (tsc_khz == 0) + return false; pr_info("Detected %lu.%03lu MHz processor\n", - (unsigned long)cpu_khz / 1000, - (unsigned long)cpu_khz % 1000); + (unsigned long)cpu_khz / KHZ, + (unsigned long)cpu_khz % KHZ); if (cpu_khz != tsc_khz) { pr_info("Detected %lu.%03lu MHz TSC", - (unsigned long)tsc_khz / 1000, - (unsigned long)tsc_khz % 1000); + (unsigned long)tsc_khz / KHZ, + (unsigned long)tsc_khz % KHZ); } + return true; +} + +static unsigned long __init get_loops_per_jiffy(void) +{ + unsigned long lpj = tsc_khz * KHZ; + do_div(lpj, HZ); + return lpj; +} + +static void __init tsc_enable_sched_clock(void) +{ /* Sanitize TSC ADJUST before cyc2ns gets initialized */ tsc_store_and_check_tsc_adjust(true); + cyc2ns_init_boot_cpu(); + static_branch_enable(&__use_tsc); +} + +void __init tsc_early_init(void) +{ + if 
(!boot_cpu_has(X86_FEATURE_TSC)) + return; + if (!determine_cpu_tsc_frequencies(true)) + return; + loops_per_jiffy = get_loops_per_jiffy(); + tsc_enable_sched_clock(); +} + +void __init tsc_init(void) +{ /* - * Secondary CPUs do not run through tsc_init(), so set up - * all the scale factors for all CPUs, assuming the same - * speed as the bootup CPU. (cpufreq notifiers will fix this - * up if their speed diverges) + * native_calibrate_cpu_early can only calibrate using methods that are + * available early in boot. */ - cyc = rdtsc(); - for_each_possible_cpu(cpu) { - cyc2ns_init(cpu); - set_cyc2ns_scale(tsc_khz, cpu, cyc); - } + if (x86_platform.calibrate_cpu == native_calibrate_cpu_early) + x86_platform.calibrate_cpu = native_calibrate_cpu; - if (tsc_disabled > 0) + if (!boot_cpu_has(X86_FEATURE_TSC)) { + setup_clear_cpu_cap(X86_FEATURE_TSC_DEADLINE_TIMER); return; + } - /* now allow native_sched_clock() to use rdtsc */ + if (!tsc_khz) { + /* We failed to determine frequencies earlier, try again */ + if (!determine_cpu_tsc_frequencies(false)) { + mark_tsc_unstable("could not calculate TSC khz"); + setup_clear_cpu_cap(X86_FEATURE_TSC_DEADLINE_TIMER); + return; + } + tsc_enable_sched_clock(); + } - tsc_disabled = 0; - static_branch_enable(&__use_tsc); + cyc2ns_init_secondary_cpus(); if (!no_sched_irq_time) enable_sched_clock_irqtime(); - lpj = ((u64)tsc_khz * 1000); - do_div(lpj, HZ); - lpj_fine = lpj; - + lpj_fine = get_loops_per_jiffy(); use_tsc_delay(); check_system_tsc_reliable(); @@ -1455,7 +1496,7 @@ unsigned long calibrate_delay_is_known(void) int constant_tsc = cpu_has(&cpu_data(cpu), X86_FEATURE_CONSTANT_TSC); const struct cpumask *mask = topology_core_cpumask(cpu); - if (tsc_disabled || !constant_tsc || !mask) + if (!constant_tsc || !mask) return 0; sibling = cpumask_any_but(mask, cpu); diff --git a/arch/x86/kernel/tsc_msr.c b/arch/x86/kernel/tsc_msr.c index 19afdbd7d0a7..27ef714d886c 100644 --- a/arch/x86/kernel/tsc_msr.c +++ b/arch/x86/kernel/tsc_msr.c 
@@ -1,17 +1,19 @@ +// SPDX-License-Identifier: GPL-2.0 /* - * tsc_msr.c - TSC frequency enumeration via MSR + * TSC frequency enumeration via MSR * - * Copyright (C) 2013 Intel Corporation + * Copyright (C) 2013, 2018 Intel Corporation * Author: Bin Gao <bin.gao@intel.com> - * - * This file is released under the GPLv2. */ #include <linux/kernel.h> -#include <asm/processor.h> -#include <asm/setup.h> + #include <asm/apic.h> +#include <asm/cpu_device_id.h> +#include <asm/intel-family.h> +#include <asm/msr.h> #include <asm/param.h> +#include <asm/tsc.h> #define MAX_NUM_FREQS 9 @@ -23,44 +25,48 @@ * field msr_plat does. */ struct freq_desc { - u8 x86_family; /* CPU family */ - u8 x86_model; /* model */ u8 msr_plat; /* 1: use MSR_PLATFORM_INFO, 0: MSR_IA32_PERF_STATUS */ u32 freqs[MAX_NUM_FREQS]; }; -static struct freq_desc freq_desc_tables[] = { - /* PNW */ - { 6, 0x27, 0, { 0, 0, 0, 0, 0, 99840, 0, 83200 } }, - /* CLV+ */ - { 6, 0x35, 0, { 0, 133200, 0, 0, 0, 99840, 0, 83200 } }, - /* TNG - Intel Atom processor Z3400 series */ - { 6, 0x4a, 1, { 0, 100000, 133300, 0, 0, 0, 0, 0 } }, - /* VLV2 - Intel Atom processor E3000, Z3600, Z3700 series */ - { 6, 0x37, 1, { 83300, 100000, 133300, 116700, 80000, 0, 0, 0 } }, - /* ANN - Intel Atom processor Z3500 series */ - { 6, 0x5a, 1, { 83300, 100000, 133300, 100000, 0, 0, 0, 0 } }, - /* AMT - Intel Atom processor X7-Z8000 and X5-Z8000 series */ - { 6, 0x4c, 1, { 83300, 100000, 133300, 116700, - 80000, 93300, 90000, 88900, 87500 } }, +/* + * Penwell and Clovertrail use spread spectrum clock, + * so the freq number is not exactly the same as reported + * by MSR based on SDM. 
+ */ +static const struct freq_desc freq_desc_pnw = { + 0, { 0, 0, 0, 0, 0, 99840, 0, 83200 } }; -static int match_cpu(u8 family, u8 model) -{ - int i; +static const struct freq_desc freq_desc_clv = { + 0, { 0, 133200, 0, 0, 0, 99840, 0, 83200 } +}; - for (i = 0; i < ARRAY_SIZE(freq_desc_tables); i++) { - if ((family == freq_desc_tables[i].x86_family) && - (model == freq_desc_tables[i].x86_model)) - return i; - } +static const struct freq_desc freq_desc_byt = { + 1, { 83300, 100000, 133300, 116700, 80000, 0, 0, 0 } +}; - return -1; -} +static const struct freq_desc freq_desc_cht = { + 1, { 83300, 100000, 133300, 116700, 80000, 93300, 90000, 88900, 87500 } +}; -/* Map CPU reference clock freq ID(0-7) to CPU reference clock freq(KHz) */ -#define id_to_freq(cpu_index, freq_id) \ - (freq_desc_tables[cpu_index].freqs[freq_id]) +static const struct freq_desc freq_desc_tng = { + 1, { 0, 100000, 133300, 0, 0, 0, 0, 0 } +}; + +static const struct freq_desc freq_desc_ann = { + 1, { 83300, 100000, 133300, 100000, 0, 0, 0, 0 } +}; + +static const struct x86_cpu_id tsc_msr_cpu_ids[] = { + INTEL_CPU_FAM6(ATOM_PENWELL, freq_desc_pnw), + INTEL_CPU_FAM6(ATOM_CLOVERVIEW, freq_desc_clv), + INTEL_CPU_FAM6(ATOM_SILVERMONT1, freq_desc_byt), + INTEL_CPU_FAM6(ATOM_AIRMONT, freq_desc_cht), + INTEL_CPU_FAM6(ATOM_MERRIFIELD, freq_desc_tng), + INTEL_CPU_FAM6(ATOM_MOOREFIELD, freq_desc_ann), + {} +}; /* * MSR-based CPU/TSC frequency discovery for certain CPUs. 
@@ -70,18 +76,17 @@ static int match_cpu(u8 family, u8 model) */ unsigned long cpu_khz_from_msr(void) { - u32 lo, hi, ratio, freq_id, freq; + u32 lo, hi, ratio, freq; + const struct freq_desc *freq_desc; + const struct x86_cpu_id *id; unsigned long res; - int cpu_index; - - if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL) - return 0; - cpu_index = match_cpu(boot_cpu_data.x86, boot_cpu_data.x86_model); - if (cpu_index < 0) + id = x86_match_cpu(tsc_msr_cpu_ids); + if (!id) return 0; - if (freq_desc_tables[cpu_index].msr_plat) { + freq_desc = (struct freq_desc *)id->driver_data; + if (freq_desc->msr_plat) { rdmsr(MSR_PLATFORM_INFO, lo, hi); ratio = (lo >> 8) & 0xff; } else { @@ -91,8 +96,9 @@ unsigned long cpu_khz_from_msr(void) /* Get FSB FREQ ID */ rdmsr(MSR_FSB_FREQ, lo, hi); - freq_id = lo & 0x7; - freq = id_to_freq(cpu_index, freq_id); + + /* Map CPU reference clock freq ID(0-7) to CPU reference clock freq(KHz) */ + freq = freq_desc->freqs[lo & 0x7]; /* TSC frequency = maximum resolved freq * maximum resolved bus ratio */ res = freq * ratio; diff --git a/arch/x86/kernel/x86_init.c b/arch/x86/kernel/x86_init.c index 3ab867603e81..2792b5573818 100644 --- a/arch/x86/kernel/x86_init.c +++ b/arch/x86/kernel/x86_init.c @@ -109,7 +109,7 @@ struct x86_cpuinit_ops x86_cpuinit = { static void default_nmi_init(void) { }; struct x86_platform_ops x86_platform __ro_after_init = { - .calibrate_cpu = native_calibrate_cpu, + .calibrate_cpu = native_calibrate_cpu_early, .calibrate_tsc = native_calibrate_tsc, .get_wallclock = mach_get_cmos_time, .set_wallclock = mach_set_rtc_mmss, |