Diffstat (limited to 'arch/x86')
28 files changed, 351 insertions, 268 deletions
diff --git a/arch/x86/include/asm/i8259.h b/arch/x86/include/asm/i8259.h index ccffa53750a8..39bcefc20de7 100644 --- a/arch/x86/include/asm/i8259.h +++ b/arch/x86/include/asm/i8259.h @@ -60,6 +60,7 @@ struct legacy_pic { void (*mask_all)(void); void (*restore_mask)(void); void (*init)(int auto_eoi); + int (*probe)(void); int (*irq_pending)(unsigned int irq); void (*make_irq)(unsigned int irq); }; diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 9265196e877f..30cfd64295a0 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -505,6 +505,7 @@ struct kvm_vcpu_arch { u32 virtual_tsc_mult; u32 virtual_tsc_khz; s64 ia32_tsc_adjust_msr; + u64 tsc_scaling_ratio; atomic_t nmi_queued; /* unprocessed asynchronous NMIs */ unsigned nmi_pending; /* NMI queued after currently running handler */ @@ -777,7 +778,7 @@ struct kvm_x86_ops { void (*vcpu_load)(struct kvm_vcpu *vcpu, int cpu); void (*vcpu_put)(struct kvm_vcpu *vcpu); - void (*update_db_bp_intercept)(struct kvm_vcpu *vcpu); + void (*update_bp_intercept)(struct kvm_vcpu *vcpu); int (*get_msr)(struct kvm_vcpu *vcpu, struct msr_data *msr); int (*set_msr)(struct kvm_vcpu *vcpu, struct msr_data *msr); u64 (*get_segment_base)(struct kvm_vcpu *vcpu, int seg); @@ -844,7 +845,7 @@ struct kvm_x86_ops { int (*get_lpage_level)(void); bool (*rdtscp_supported)(void); bool (*invpcid_supported)(void); - void (*adjust_tsc_offset)(struct kvm_vcpu *vcpu, s64 adjustment, bool host); + void (*adjust_tsc_offset_guest)(struct kvm_vcpu *vcpu, s64 adjustment); void (*set_tdp_cr3)(struct kvm_vcpu *vcpu, unsigned long cr3); @@ -852,11 +853,9 @@ struct kvm_x86_ops { bool (*has_wbinvd_exit)(void); - void (*set_tsc_khz)(struct kvm_vcpu *vcpu, u32 user_tsc_khz, bool scale); u64 (*read_tsc_offset)(struct kvm_vcpu *vcpu); void (*write_tsc_offset)(struct kvm_vcpu *vcpu, u64 offset); - u64 (*compute_tsc_offset)(struct kvm_vcpu *vcpu, u64 target_tsc); u64 (*read_l1_tsc)(struct kvm_vcpu *vcpu, u64 host_tsc); void (*get_exit_info)(struct kvm_vcpu *vcpu, u64 *info1, u64 *info2); @@ -923,17 +922,6 @@ struct kvm_arch_async_pf { extern struct kvm_x86_ops *kvm_x86_ops; -static inline void adjust_tsc_offset_guest(struct kvm_vcpu *vcpu, - s64 adjustment) -{ - kvm_x86_ops->adjust_tsc_offset(vcpu, adjustment, false); -} - -static inline void adjust_tsc_offset_host(struct kvm_vcpu *vcpu, s64 adjustment) -{ - kvm_x86_ops->adjust_tsc_offset(vcpu, adjustment, true); -} - int kvm_mmu_module_init(void); void kvm_mmu_module_exit(void); @@ -986,10 +974,12 @@ u64 vcpu_tsc_khz(struct kvm_vcpu *vcpu); /* control of guest tsc rate supported? 
*/ extern bool kvm_has_tsc_control; -/* minimum supported tsc_khz for guests */ -extern u32 kvm_min_guest_tsc_khz; /* maximum supported tsc_khz for guests */ extern u32 kvm_max_guest_tsc_khz; +/* number of bits of the fractional part of the TSC scaling ratio */ +extern u8 kvm_tsc_scaling_ratio_frac_bits; +/* maximum allowed value of TSC scaling ratio */ +extern u64 kvm_max_tsc_scaling_ratio; enum emulation_result { EMULATE_DONE, /* no further processing */ @@ -1235,6 +1225,9 @@ void kvm_arch_mmu_notifier_invalidate_page(struct kvm *kvm, void kvm_define_shared_msr(unsigned index, u32 msr); int kvm_set_shared_msr(unsigned index, u64 val, u64 mask); +u64 kvm_scale_tsc(struct kvm_vcpu *vcpu, u64 tsc); +u64 kvm_read_l1_tsc(struct kvm_vcpu *vcpu, u64 host_tsc); + unsigned long kvm_get_linear_rip(struct kvm_vcpu *vcpu); bool kvm_is_linear_rip(struct kvm_vcpu *vcpu, unsigned long linear_rip); diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h index 9f3905697f12..690b4027e17c 100644 --- a/arch/x86/include/asm/msr-index.h +++ b/arch/x86/include/asm/msr-index.h @@ -35,7 +35,7 @@ #define MSR_IA32_PERFCTR0 0x000000c1 #define MSR_IA32_PERFCTR1 0x000000c2 #define MSR_FSB_FREQ 0x000000cd -#define MSR_NHM_PLATFORM_INFO 0x000000ce +#define MSR_PLATFORM_INFO 0x000000ce #define MSR_NHM_SNB_PKG_CST_CFG_CTL 0x000000e2 #define NHM_C3_AUTO_DEMOTE (1UL << 25) @@ -44,7 +44,6 @@ #define SNB_C1_AUTO_UNDEMOTE (1UL << 27) #define SNB_C3_AUTO_UNDEMOTE (1UL << 28) -#define MSR_PLATFORM_INFO 0x000000ce #define MSR_MTRRcap 0x000000fe #define MSR_IA32_BBL_CR_CTL 0x00000119 #define MSR_IA32_BBL_CR_CTL3 0x0000011e diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h index aa336ff3e03e..14c63c7e8337 100644 --- a/arch/x86/include/asm/vmx.h +++ b/arch/x86/include/asm/vmx.h @@ -73,6 +73,7 @@ #define SECONDARY_EXEC_ENABLE_PML 0x00020000 #define SECONDARY_EXEC_XSAVES 0x00100000 #define SECONDARY_EXEC_PCOMMIT 0x00200000 +#define SECONDARY_EXEC_TSC_SCALING 0x02000000 #define PIN_BASED_EXT_INTR_MASK 0x00000001 #define PIN_BASED_NMI_EXITING 0x00000008 @@ -167,6 +168,8 @@ enum vmcs_field { VMWRITE_BITMAP = 0x00002028, XSS_EXIT_BITMAP = 0x0000202C, XSS_EXIT_BITMAP_HIGH = 0x0000202D, + TSC_MULTIPLIER = 0x00002032, + TSC_MULTIPLIER_HIGH = 0x00002033, GUEST_PHYSICAL_ADDRESS = 0x00002400, GUEST_PHYSICAL_ADDRESS_HIGH = 0x00002401, VMCS_LINK_POINTER = 0x00002800, diff --git a/arch/x86/include/uapi/asm/svm.h b/arch/x86/include/uapi/asm/svm.h index b5d7640abc5d..8a4add8e4639 100644 --- a/arch/x86/include/uapi/asm/svm.h +++ b/arch/x86/include/uapi/asm/svm.h @@ -100,6 +100,7 @@ { SVM_EXIT_EXCP_BASE + UD_VECTOR, "UD excp" }, \ { SVM_EXIT_EXCP_BASE + PF_VECTOR, "PF excp" }, \ { SVM_EXIT_EXCP_BASE + NM_VECTOR, "NM excp" }, \ + { SVM_EXIT_EXCP_BASE + AC_VECTOR, "AC excp" }, \ { SVM_EXIT_EXCP_BASE + MC_VECTOR, "MC excp" }, \ { SVM_EXIT_INTR, "interrupt" }, \ { SVM_EXIT_NMI, "nmi" }, \ diff --git a/arch/x86/kernel/apic/vector.c b/arch/x86/kernel/apic/vector.c index 836d11b92811..861bc59c8f25 100644 --- a/arch/x86/kernel/apic/vector.c +++ b/arch/x86/kernel/apic/vector.c @@ -361,7 +361,11 @@ int __init arch_probe_nr_irqs(void) if (nr < nr_irqs) nr_irqs = nr; - return nr_legacy_irqs(); + /* + * We don't know if PIC is present at this point so we need to do + * probe() to get the right number of legacy IRQs. 
+ */ + return legacy_pic->probe(); } #ifdef CONFIG_X86_IO_APIC diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c index 4a70fc6d400a..a8816b325162 100644 --- a/arch/x86/kernel/cpu/amd.c +++ b/arch/x86/kernel/cpu/amd.c @@ -352,6 +352,7 @@ static void amd_detect_cmp(struct cpuinfo_x86 *c) #ifdef CONFIG_SMP unsigned bits; int cpu = smp_processor_id(); + unsigned int socket_id, core_complex_id; bits = c->x86_coreid_bits; /* Low order bits define the core id (index of core in socket) */ @@ -361,6 +362,18 @@ static void amd_detect_cmp(struct cpuinfo_x86 *c) /* use socket ID also for last level cache */ per_cpu(cpu_llc_id, cpu) = c->phys_proc_id; amd_get_topology(c); + + /* + * Fix percpu cpu_llc_id here as LLC topology is different + * for Fam17h systems. + */ + if (c->x86 != 0x17 || !cpuid_edx(0x80000006)) + return; + + socket_id = (c->apicid >> bits) - 1; + core_complex_id = (c->apicid & ((1 << bits) - 1)) >> 3; + + per_cpu(cpu_llc_id, cpu) = (socket_id << 3) | core_complex_id; #endif } diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 4ddd780aeac9..c2b7522cbf35 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -273,10 +273,9 @@ __setup("nosmap", setup_disable_smap); static __always_inline void setup_smap(struct cpuinfo_x86 *c) { - unsigned long eflags; + unsigned long eflags = native_save_fl(); /* This should have been cleared long ago */ - raw_local_save_flags(eflags); BUG_ON(eflags & X86_EFLAGS_AC); if (cpu_has(c, X86_FEATURE_SMAP)) { diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c index 98a13db5f4be..209ac1e7d1f0 100644 --- a/arch/x86/kernel/cpu/intel.c +++ b/arch/x86/kernel/cpu/intel.c @@ -97,6 +97,7 @@ static void early_init_intel(struct cpuinfo_x86 *c) switch (c->x86_model) { case 0x27: /* Penwell */ case 0x35: /* Cloverview */ + case 0x4a: /* Merrifield */ set_cpu_cap(c, X86_FEATURE_NONSTOP_TSC_S3); break; default: diff --git a/arch/x86/kernel/cpu/perf_event_intel_rapl.c b/arch/x86/kernel/cpu/perf_event_intel_rapl.c index 81431c0f0614..ed446bdcbf31 100644 --- a/arch/x86/kernel/cpu/perf_event_intel_rapl.c +++ b/arch/x86/kernel/cpu/perf_event_intel_rapl.c @@ -107,12 +107,6 @@ static ssize_t __rapl_##_var##_show(struct kobject *kobj, \ static struct kobj_attribute format_attr_##_var = \ __ATTR(_name, 0444, __rapl_##_var##_show, NULL) -#define RAPL_EVENT_DESC(_name, _config) \ -{ \ - .attr = __ATTR(_name, 0444, rapl_event_show, NULL), \ - .config = _config, \ -} - #define RAPL_CNTR_WIDTH 32 /* 32-bit rapl counters */ #define RAPL_EVENT_ATTR_STR(_name, v, str) \ diff --git a/arch/x86/kernel/fpu/signal.c b/arch/x86/kernel/fpu/signal.c index ef29b742cea7..31c6a60505e6 100644 --- a/arch/x86/kernel/fpu/signal.c +++ b/arch/x86/kernel/fpu/signal.c @@ -385,20 +385,19 @@ fpu__alloc_mathframe(unsigned long sp, int ia32_frame, */ void fpu__init_prepare_fx_sw_frame(void) { - int fsave_header_size = sizeof(struct fregs_state); int size = xstate_size + FP_XSTATE_MAGIC2_SIZE; - if (config_enabled(CONFIG_X86_32)) - size += fsave_header_size; - fx_sw_reserved.magic1 = FP_XSTATE_MAGIC1; fx_sw_reserved.extended_size = size; fx_sw_reserved.xfeatures = xfeatures_mask; fx_sw_reserved.xstate_size = xstate_size; - if (config_enabled(CONFIG_IA32_EMULATION)) { + if (config_enabled(CONFIG_IA32_EMULATION) || + config_enabled(CONFIG_X86_32)) { + int fsave_header_size = sizeof(struct fregs_state); + fx_sw_reserved_ia32 = fx_sw_reserved; - fx_sw_reserved_ia32.extended_size += fsave_header_size; + 
fx_sw_reserved_ia32.extended_size = size + fsave_header_size; } } diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c index 6454f2731b56..70fc312221fc 100644 --- a/arch/x86/kernel/fpu/xstate.c +++ b/arch/x86/kernel/fpu/xstate.c @@ -694,7 +694,6 @@ void *get_xsave_addr(struct xregs_state *xsave, int xstate_feature) if (!boot_cpu_has(X86_FEATURE_XSAVE)) return NULL; - xsave = ¤t->thread.fpu.state.xsave; /* * We should not ever be requesting features that we * have not enabled. Remember that pcntxt_mask is diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S index 1d40ca8a73f2..ffdc0e860390 100644 --- a/arch/x86/kernel/head_64.S +++ b/arch/x86/kernel/head_64.S @@ -65,6 +65,9 @@ startup_64: * tables and then reload them. */ + /* Sanitize CPU configuration */ + call verify_cpu + /* * Compute the delta between the address I am compiled to run at and the * address I am actually running at. @@ -174,6 +177,9 @@ ENTRY(secondary_startup_64) * after the boot processor executes this code. */ + /* Sanitize CPU configuration */ + call verify_cpu + movq $(init_level4_pgt - __START_KERNEL_map), %rax 1: @@ -288,6 +294,8 @@ ENTRY(secondary_startup_64) pushq %rax # target address in negative space lretq +#include "verify_cpu.S" + #ifdef CONFIG_HOTPLUG_CPU /* * Boot CPU0 entry point. It's called from play_dead(). Everything has been set diff --git a/arch/x86/kernel/i8259.c b/arch/x86/kernel/i8259.c index 16cb827a5b27..be22f5a2192e 100644 --- a/arch/x86/kernel/i8259.c +++ b/arch/x86/kernel/i8259.c @@ -295,16 +295,11 @@ static void unmask_8259A(void) raw_spin_unlock_irqrestore(&i8259A_lock, flags); } -static void init_8259A(int auto_eoi) +static int probe_8259A(void) { unsigned long flags; unsigned char probe_val = ~(1 << PIC_CASCADE_IR); unsigned char new_val; - - i8259A_auto_eoi = auto_eoi; - - raw_spin_lock_irqsave(&i8259A_lock, flags); - /* * Check to see if we have a PIC. * Mask all except the cascade and read @@ -312,16 +307,28 @@ static void init_8259A(int auto_eoi) * have a PIC, we will read 0xff as opposed to the * value we wrote. 
*/ + raw_spin_lock_irqsave(&i8259A_lock, flags); + outb(0xff, PIC_SLAVE_IMR); /* mask all of 8259A-2 */ outb(probe_val, PIC_MASTER_IMR); new_val = inb(PIC_MASTER_IMR); if (new_val != probe_val) { printk(KERN_INFO "Using NULL legacy PIC\n"); legacy_pic = &null_legacy_pic; - raw_spin_unlock_irqrestore(&i8259A_lock, flags); - return; } + raw_spin_unlock_irqrestore(&i8259A_lock, flags); + return nr_legacy_irqs(); +} + +static void init_8259A(int auto_eoi) +{ + unsigned long flags; + + i8259A_auto_eoi = auto_eoi; + + raw_spin_lock_irqsave(&i8259A_lock, flags); + outb(0xff, PIC_MASTER_IMR); /* mask all of 8259A-1 */ /* @@ -379,6 +386,10 @@ static int legacy_pic_irq_pending_noop(unsigned int irq) { return 0; } +static int legacy_pic_probe(void) +{ + return 0; +} struct legacy_pic null_legacy_pic = { .nr_legacy_irqs = 0, @@ -388,6 +399,7 @@ struct legacy_pic null_legacy_pic = { .mask_all = legacy_pic_noop, .restore_mask = legacy_pic_noop, .init = legacy_pic_int_noop, + .probe = legacy_pic_probe, .irq_pending = legacy_pic_irq_pending_noop, .make_irq = legacy_pic_uint_noop, }; @@ -400,6 +412,7 @@ struct legacy_pic default_legacy_pic = { .mask_all = mask_8259A, .restore_mask = unmask_8259A, .init = init_8259A, + .probe = probe_8259A, .irq_pending = i8259A_irq_pending, .make_irq = make_8259A_irq, }; diff --git a/arch/x86/kernel/mcount_64.S b/arch/x86/kernel/mcount_64.S index 94ea120fa21f..87e1762e2bca 100644 --- a/arch/x86/kernel/mcount_64.S +++ b/arch/x86/kernel/mcount_64.S @@ -278,6 +278,12 @@ trace: /* save_mcount_regs fills in first two parameters */ save_mcount_regs + /* + * When DYNAMIC_FTRACE is not defined, ARCH_SUPPORTS_FTRACE_OPS is not + * set (see include/asm/ftrace.h and include/linux/ftrace.h). Only the + * ip and parent ip are used and the list function is called when + * function tracing is enabled. 
+ */ call *ftrace_trace_function restore_mcount_regs diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index a1e4da98c8f0..29db25f9a745 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -1188,7 +1188,7 @@ void __init setup_arch(char **cmdline_p) */ clone_pgd_range(initial_page_table, swapper_pg_dir + KERNEL_PGD_BOUNDARY, - KERNEL_PGD_PTRS); + min(KERNEL_PGD_PTRS, KERNEL_PGD_BOUNDARY)); #endif tboot_probe(); diff --git a/arch/x86/kernel/verify_cpu.S b/arch/x86/kernel/verify_cpu.S index b9242bacbe59..4cf401f581e7 100644 --- a/arch/x86/kernel/verify_cpu.S +++ b/arch/x86/kernel/verify_cpu.S @@ -34,10 +34,11 @@ #include <asm/msr-index.h> verify_cpu: - pushfl # Save caller passed flags - pushl $0 # Kill any dangerous flags - popfl + pushf # Save caller passed flags + push $0 # Kill any dangerous flags + popf +#ifndef __x86_64__ pushfl # standard way to check for cpuid popl %eax movl %eax,%ebx @@ -48,6 +49,7 @@ verify_cpu: popl %eax cmpl %eax,%ebx jz verify_cpu_no_longmode # cpu has no cpuid +#endif movl $0x0,%eax # See if cpuid 1 is implemented cpuid @@ -130,10 +132,10 @@ verify_cpu_sse_test: jmp verify_cpu_sse_test # try again verify_cpu_no_longmode: - popfl # Restore caller passed flags + popf # Restore caller passed flags movl $1,%eax ret verify_cpu_sse_ok: - popfl # Restore caller passed flags + popf # Restore caller passed flags xorl %eax, %eax ret diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index ecd4ea1d28a8..4d30b865be30 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -1250,7 +1250,7 @@ void wait_lapic_expire(struct kvm_vcpu *vcpu) tsc_deadline = apic->lapic_timer.expired_tscdeadline; apic->lapic_timer.expired_tscdeadline = 0; - guest_tsc = kvm_x86_ops->read_l1_tsc(vcpu, rdtsc()); + guest_tsc = kvm_read_l1_tsc(vcpu, rdtsc()); trace_kvm_wait_lapic_expire(vcpu->vcpu_id, guest_tsc - tsc_deadline); /* __delay is delay_tsc whenever the hardware has TSC, thus always. 
*/ @@ -1318,7 +1318,7 @@ static void start_apic_timer(struct kvm_lapic *apic) local_irq_save(flags); now = apic->lapic_timer.timer.base->get_time(); - guest_tsc = kvm_x86_ops->read_l1_tsc(vcpu, rdtsc()); + guest_tsc = kvm_read_l1_tsc(vcpu, rdtsc()); if (likely(tscdeadline > guest_tsc)) { ns = (tscdeadline - guest_tsc) * 1000000ULL; do_div(ns, this_tsc_khz); diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 7d85bcae3332..e7c2c1428a69 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -3359,7 +3359,7 @@ exit: return reserved; } -int handle_mmio_page_fault_common(struct kvm_vcpu *vcpu, u64 addr, bool direct) +int handle_mmio_page_fault(struct kvm_vcpu *vcpu, u64 addr, bool direct) { u64 spte; bool reserved; @@ -3368,7 +3368,7 @@ int handle_mmio_page_fault_common(struct kvm_vcpu *vcpu, u64 addr, bool direct) return RET_MMIO_PF_EMULATE; reserved = walk_shadow_page_get_mmio_spte(vcpu, addr, &spte); - if (unlikely(reserved)) + if (WARN_ON(reserved)) return RET_MMIO_PF_BUG; if (is_mmio_spte(spte)) { @@ -3392,17 +3392,7 @@ int handle_mmio_page_fault_common(struct kvm_vcpu *vcpu, u64 addr, bool direct) */ return RET_MMIO_PF_RETRY; } -EXPORT_SYMBOL_GPL(handle_mmio_page_fault_common); - -static int handle_mmio_page_fault(struct kvm_vcpu *vcpu, u64 addr, - u32 error_code, bool direct) -{ - int ret; - - ret = handle_mmio_page_fault_common(vcpu, addr, direct); - WARN_ON(ret == RET_MMIO_PF_BUG); - return ret; -} +EXPORT_SYMBOL_GPL(handle_mmio_page_fault); static int nonpaging_page_fault(struct kvm_vcpu *vcpu, gva_t gva, u32 error_code, bool prefault) @@ -3413,7 +3403,7 @@ static int nonpaging_page_fault(struct kvm_vcpu *vcpu, gva_t gva, pgprintk("%s: gva %lx error %x\n", __func__, gva, error_code); if (unlikely(error_code & PFERR_RSVD_MASK)) { - r = handle_mmio_page_fault(vcpu, gva, error_code, true); + r = handle_mmio_page_fault(vcpu, gva, true); if (likely(r != RET_MMIO_PF_INVALID)) return r; @@ -3503,7 +3493,7 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, u32 error_code, MMU_WARN_ON(!VALID_PAGE(vcpu->arch.mmu.root_hpa)); if (unlikely(error_code & PFERR_RSVD_MASK)) { - r = handle_mmio_page_fault(vcpu, gpa, error_code, true); + r = handle_mmio_page_fault(vcpu, gpa, true); if (likely(r != RET_MMIO_PF_INVALID)) return r; diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h index e4202e41d535..55ffb7b0f95e 100644 --- a/arch/x86/kvm/mmu.h +++ b/arch/x86/kvm/mmu.h @@ -56,13 +56,13 @@ void reset_shadow_zero_bits_mask(struct kvm_vcpu *vcpu, struct kvm_mmu *context); /* - * Return values of handle_mmio_page_fault_common: + * Return values of handle_mmio_page_fault: * RET_MMIO_PF_EMULATE: it is a real mmio page fault, emulate the instruction * directly. * RET_MMIO_PF_INVALID: invalid spte is detected then let the real page * fault path update the mmio spte. * RET_MMIO_PF_RETRY: let CPU fault again on the address. - * RET_MMIO_PF_BUG: bug is detected. + * RET_MMIO_PF_BUG: a bug was detected (and a WARN was printed). 
*/ enum { RET_MMIO_PF_EMULATE = 1, @@ -71,7 +71,7 @@ enum { RET_MMIO_PF_BUG = -1 }; -int handle_mmio_page_fault_common(struct kvm_vcpu *vcpu, u64 addr, bool direct); +int handle_mmio_page_fault(struct kvm_vcpu *vcpu, u64 addr, bool direct); void kvm_init_shadow_mmu(struct kvm_vcpu *vcpu); void kvm_init_shadow_ept_mmu(struct kvm_vcpu *vcpu, bool execonly); diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h index b41faa91a6f9..3058a22a658d 100644 --- a/arch/x86/kvm/paging_tmpl.h +++ b/arch/x86/kvm/paging_tmpl.h @@ -705,8 +705,7 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, u32 error_code, pgprintk("%s: addr %lx err %x\n", __func__, addr, error_code); if (unlikely(error_code & PFERR_RSVD_MASK)) { - r = handle_mmio_page_fault(vcpu, addr, error_code, - mmu_is_nested(vcpu)); + r = handle_mmio_page_fault(vcpu, addr, mmu_is_nested(vcpu)); if (likely(r != RET_MMIO_PF_INVALID)) return r; diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index f2c8e4917688..83a1c643f9a5 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -158,8 +158,6 @@ struct vcpu_svm { unsigned long int3_rip; u32 apf_reason; - u64 tsc_ratio; - /* cached guest cpuid flags for faster access */ bool nrips_enabled : 1; }; @@ -214,7 +212,6 @@ static int nested_svm_intercept(struct vcpu_svm *svm); static int nested_svm_vmexit(struct vcpu_svm *svm); static int nested_svm_check_exception(struct vcpu_svm *svm, unsigned nr, bool has_error_code, u32 error_code); -static u64 __scale_tsc(u64 ratio, u64 tsc); enum { VMCB_INTERCEPTS, /* Intercept vectors, TSC offset, @@ -894,20 +891,9 @@ static __init int svm_hardware_setup(void) kvm_enable_efer_bits(EFER_FFXSR); if (boot_cpu_has(X86_FEATURE_TSCRATEMSR)) { - u64 max; - kvm_has_tsc_control = true; - - /* - * Make sure the user can only configure tsc_khz values that - * fit into a signed integer. - * A min value is not calculated needed because it will always - * be 1 on all machines and a value of 0 is used to disable - * tsc-scaling for the vcpu. - */ - max = min(0x7fffffffULL, __scale_tsc(tsc_khz, TSC_RATIO_MAX)); - - kvm_max_guest_tsc_khz = max; + kvm_max_tsc_scaling_ratio = TSC_RATIO_MAX; + kvm_tsc_scaling_ratio_frac_bits = 32; } if (nested) { @@ -971,68 +957,6 @@ static void init_sys_seg(struct vmcb_seg *seg, uint32_t type) seg->base = 0; } -static u64 __scale_tsc(u64 ratio, u64 tsc) -{ - u64 mult, frac, _tsc; - - mult = ratio >> 32; - frac = ratio & ((1ULL << 32) - 1); - - _tsc = tsc; - _tsc *= mult; - _tsc += (tsc >> 32) * frac; - _tsc += ((tsc & ((1ULL << 32) - 1)) * frac) >> 32; - - return _tsc; -} - -static u64 svm_scale_tsc(struct kvm_vcpu *vcpu, u64 tsc) -{ - struct vcpu_svm *svm = to_svm(vcpu); - u64 _tsc = tsc; - - if (svm->tsc_ratio != TSC_RATIO_DEFAULT) - _tsc = __scale_tsc(svm->tsc_ratio, tsc); - - return _tsc; -} - -static void svm_set_tsc_khz(struct kvm_vcpu *vcpu, u32 user_tsc_khz, bool scale) -{ - struct vcpu_svm *svm = to_svm(vcpu); - u64 ratio; - u64 khz; - - /* Guest TSC same frequency as host TSC? */ - if (!scale) { - svm->tsc_ratio = TSC_RATIO_DEFAULT; - return; - } - - /* TSC scaling supported? 
*/ - if (!boot_cpu_has(X86_FEATURE_TSCRATEMSR)) { - if (user_tsc_khz > tsc_khz) { - vcpu->arch.tsc_catchup = 1; - vcpu->arch.tsc_always_catchup = 1; - } else - WARN(1, "user requested TSC rate below hardware speed\n"); - return; - } - - khz = user_tsc_khz; - - /* TSC scaling required - calculate ratio */ - ratio = khz << 32; - do_div(ratio, tsc_khz); - - if (ratio == 0 || ratio & TSC_RATIO_RSVD) { - WARN_ONCE(1, "Invalid TSC ratio - virtual-tsc-khz=%u\n", - user_tsc_khz); - return; - } - svm->tsc_ratio = ratio; -} - static u64 svm_read_tsc_offset(struct kvm_vcpu *vcpu) { struct vcpu_svm *svm = to_svm(vcpu); @@ -1059,16 +983,10 @@ static void svm_write_tsc_offset(struct kvm_vcpu *vcpu, u64 offset) mark_dirty(svm->vmcb, VMCB_INTERCEPTS); } -static void svm_adjust_tsc_offset(struct kvm_vcpu *vcpu, s64 adjustment, bool host) +static void svm_adjust_tsc_offset_guest(struct kvm_vcpu *vcpu, s64 adjustment) { struct vcpu_svm *svm = to_svm(vcpu); - if (host) { - if (svm->tsc_ratio != TSC_RATIO_DEFAULT) - WARN_ON(adjustment < 0); - adjustment = svm_scale_tsc(vcpu, (u64)adjustment); - } - svm->vmcb->control.tsc_offset += adjustment; if (is_guest_mode(vcpu)) svm->nested.hsave->control.tsc_offset += adjustment; @@ -1080,15 +998,6 @@ static void svm_adjust_tsc_offset(struct kvm_vcpu *vcpu, s64 adjustment, bool ho mark_dirty(svm->vmcb, VMCB_INTERCEPTS); } -static u64 svm_compute_tsc_offset(struct kvm_vcpu *vcpu, u64 target_tsc) -{ - u64 tsc; - - tsc = svm_scale_tsc(vcpu, rdtsc()); - - return target_tsc - tsc; -} - static void init_vmcb(struct vcpu_svm *svm) { struct vmcb_control_area *control = &svm->vmcb->control; @@ -1110,6 +1019,8 @@ static void init_vmcb(struct vcpu_svm *svm) set_exception_intercept(svm, PF_VECTOR); set_exception_intercept(svm, UD_VECTOR); set_exception_intercept(svm, MC_VECTOR); + set_exception_intercept(svm, AC_VECTOR); + set_exception_intercept(svm, DB_VECTOR); set_intercept(svm, INTERCEPT_INTR); set_intercept(svm, INTERCEPT_NMI); @@ -1235,8 +1146,6 @@ static struct kvm_vcpu *svm_create_vcpu(struct kvm *kvm, unsigned int id) goto out; } - svm->tsc_ratio = TSC_RATIO_DEFAULT; - err = kvm_vcpu_init(&svm->vcpu, kvm, id); if (err) goto free_svm; @@ -1322,10 +1231,12 @@ static void svm_vcpu_load(struct kvm_vcpu *vcpu, int cpu) for (i = 0; i < NR_HOST_SAVE_USER_MSRS; i++) rdmsrl(host_save_user_msrs[i], svm->host_user_msrs[i]); - if (static_cpu_has(X86_FEATURE_TSCRATEMSR) && - svm->tsc_ratio != __this_cpu_read(current_tsc_ratio)) { - __this_cpu_write(current_tsc_ratio, svm->tsc_ratio); - wrmsrl(MSR_AMD64_TSC_RATIO, svm->tsc_ratio); + if (static_cpu_has(X86_FEATURE_TSCRATEMSR)) { + u64 tsc_ratio = vcpu->arch.tsc_scaling_ratio; + if (tsc_ratio != __this_cpu_read(current_tsc_ratio)) { + __this_cpu_write(current_tsc_ratio, tsc_ratio); + wrmsrl(MSR_AMD64_TSC_RATIO, tsc_ratio); + } } } @@ -1644,20 +1555,13 @@ static void svm_set_segment(struct kvm_vcpu *vcpu, mark_dirty(svm->vmcb, VMCB_SEG); } -static void update_db_bp_intercept(struct kvm_vcpu *vcpu) +static void update_bp_intercept(struct kvm_vcpu *vcpu) { struct vcpu_svm *svm = to_svm(vcpu); - clr_exception_intercept(svm, DB_VECTOR); clr_exception_intercept(svm, BP_VECTOR); - if (svm->nmi_singlestep) - set_exception_intercept(svm, DB_VECTOR); - if (vcpu->guest_debug & KVM_GUESTDBG_ENABLE) { - if (vcpu->guest_debug & - (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP)) - set_exception_intercept(svm, DB_VECTOR); if (vcpu->guest_debug & KVM_GUESTDBG_USE_SW_BP) set_exception_intercept(svm, BP_VECTOR); } else @@ -1763,7 +1667,6 @@ static int 
db_interception(struct vcpu_svm *svm) if (!(svm->vcpu.guest_debug & KVM_GUESTDBG_SINGLESTEP)) svm->vmcb->save.rflags &= ~(X86_EFLAGS_TF | X86_EFLAGS_RF); - update_db_bp_intercept(&svm->vcpu); } if (svm->vcpu.guest_debug & @@ -1798,6 +1701,12 @@ static int ud_interception(struct vcpu_svm *svm) return 1; } +static int ac_interception(struct vcpu_svm *svm) +{ + kvm_queue_exception_e(&svm->vcpu, AC_VECTOR, 0); + return 1; +} + static void svm_fpu_activate(struct kvm_vcpu *vcpu) { struct vcpu_svm *svm = to_svm(vcpu); @@ -3075,8 +2984,7 @@ static int cr8_write_interception(struct vcpu_svm *svm) static u64 svm_read_l1_tsc(struct kvm_vcpu *vcpu, u64 host_tsc) { struct vmcb *vmcb = get_host_vmcb(to_svm(vcpu)); - return vmcb->control.tsc_offset + - svm_scale_tsc(vcpu, host_tsc); + return vmcb->control.tsc_offset + host_tsc; } static int svm_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) @@ -3086,7 +2994,7 @@ static int svm_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) switch (msr_info->index) { case MSR_IA32_TSC: { msr_info->data = svm->vmcb->control.tsc_offset + - svm_scale_tsc(vcpu, rdtsc()); + kvm_scale_tsc(vcpu, rdtsc()); break; } @@ -3362,6 +3270,7 @@ static int (*const svm_exit_handlers[])(struct vcpu_svm *svm) = { [SVM_EXIT_EXCP_BASE + PF_VECTOR] = pf_interception, [SVM_EXIT_EXCP_BASE + NM_VECTOR] = nm_interception, [SVM_EXIT_EXCP_BASE + MC_VECTOR] = mc_interception, + [SVM_EXIT_EXCP_BASE + AC_VECTOR] = ac_interception, [SVM_EXIT_INTR] = intr_interception, [SVM_EXIT_NMI] = nmi_interception, [SVM_EXIT_SMI] = nop_on_interception, @@ -3745,7 +3654,6 @@ static void enable_nmi_window(struct kvm_vcpu *vcpu) */ svm->nmi_singlestep = true; svm->vmcb->save.rflags |= (X86_EFLAGS_TF | X86_EFLAGS_RF); - update_db_bp_intercept(vcpu); } static int svm_set_tss_addr(struct kvm *kvm, unsigned int addr) @@ -4371,7 +4279,7 @@ static struct kvm_x86_ops svm_x86_ops = { .vcpu_load = svm_vcpu_load, .vcpu_put = svm_vcpu_put, - .update_db_bp_intercept = update_db_bp_intercept, + .update_bp_intercept = update_bp_intercept, .get_msr = svm_get_msr, .set_msr = svm_set_msr, .get_segment_base = svm_get_segment_base, @@ -4443,11 +4351,9 @@ static struct kvm_x86_ops svm_x86_ops = { .has_wbinvd_exit = svm_has_wbinvd_exit, - .set_tsc_khz = svm_set_tsc_khz, .read_tsc_offset = svm_read_tsc_offset, .write_tsc_offset = svm_write_tsc_offset, - .adjust_tsc_offset = svm_adjust_tsc_offset, - .compute_tsc_offset = svm_compute_tsc_offset, + .adjust_tsc_offset_guest = svm_adjust_tsc_offset_guest, .read_l1_tsc = svm_read_l1_tsc, .set_tdp_cr3 = set_tdp_cr3, diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 5eb56ed77c1f..87acc5221740 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -107,6 +107,8 @@ static u64 __read_mostly host_xss; static bool __read_mostly enable_pml = 1; module_param_named(pml, enable_pml, bool, S_IRUGO); +#define KVM_VMX_TSC_MULTIPLIER_MAX 0xffffffffffffffffULL + #define KVM_GUEST_CR0_MASK (X86_CR0_NW | X86_CR0_CD) #define KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST (X86_CR0_WP | X86_CR0_NE) #define KVM_VM_CR0_ALWAYS_ON \ @@ -1172,6 +1174,12 @@ static inline bool cpu_has_vmx_pml(void) return vmcs_config.cpu_based_2nd_exec_ctrl & SECONDARY_EXEC_ENABLE_PML; } +static inline bool cpu_has_vmx_tsc_scaling(void) +{ + return vmcs_config.cpu_based_2nd_exec_ctrl & + SECONDARY_EXEC_TSC_SCALING; +} + static inline bool report_flexpriority(void) { return flexpriority_enabled; @@ -1631,7 +1639,7 @@ static void update_exception_bitmap(struct kvm_vcpu *vcpu) u32 eb; eb = (1u << PF_VECTOR) | (1u 
<< UD_VECTOR) | (1u << MC_VECTOR) | - (1u << NM_VECTOR) | (1u << DB_VECTOR); + (1u << NM_VECTOR) | (1u << DB_VECTOR) | (1u << AC_VECTOR); if ((vcpu->guest_debug & (KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_USE_SW_BP)) == (KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_USE_SW_BP)) @@ -2053,6 +2061,12 @@ static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu) rdmsrl(MSR_IA32_SYSENTER_ESP, sysenter_esp); vmcs_writel(HOST_IA32_SYSENTER_ESP, sysenter_esp); /* 22.2.3 */ + + /* Setup TSC multiplier */ + if (cpu_has_vmx_tsc_scaling()) + vmcs_write64(TSC_MULTIPLIER, + vcpu->arch.tsc_scaling_ratio); + vmx->loaded_vmcs->cpu = cpu; } @@ -2357,15 +2371,16 @@ static void setup_msrs(struct vcpu_vmx *vmx) /* * reads and returns guest's timestamp counter "register" - * guest_tsc = host_tsc + tsc_offset -- 21.3 + * guest_tsc = (host_tsc * tsc multiplier) >> 48 + tsc_offset + * -- Intel TSC Scaling for Virtualization White Paper, sec 1.3 */ -static u64 guest_read_tsc(void) +static u64 guest_read_tsc(struct kvm_vcpu *vcpu) { u64 host_tsc, tsc_offset; host_tsc = rdtsc(); tsc_offset = vmcs_read64(TSC_OFFSET); - return host_tsc + tsc_offset; + return kvm_scale_tsc(vcpu, host_tsc) + tsc_offset; } /* @@ -2382,22 +2397,6 @@ static u64 vmx_read_l1_tsc(struct kvm_vcpu *vcpu, u64 host_tsc) return host_tsc + tsc_offset; } -/* - * Engage any workarounds for mis-matched TSC rates. Currently limited to - * software catchup for faster rates on slower CPUs. - */ -static void vmx_set_tsc_khz(struct kvm_vcpu *vcpu, u32 user_tsc_khz, bool scale) -{ - if (!scale) - return; - - if (user_tsc_khz > tsc_khz) { - vcpu->arch.tsc_catchup = 1; - vcpu->arch.tsc_always_catchup = 1; - } else - WARN(1, "user requested TSC rate below hardware speed\n"); -} - static u64 vmx_read_tsc_offset(struct kvm_vcpu *vcpu) { return vmcs_read64(TSC_OFFSET); @@ -2429,7 +2428,7 @@ static void vmx_write_tsc_offset(struct kvm_vcpu *vcpu, u64 offset) } } -static void vmx_adjust_tsc_offset(struct kvm_vcpu *vcpu, s64 adjustment, bool host) +static void vmx_adjust_tsc_offset_guest(struct kvm_vcpu *vcpu, s64 adjustment) { u64 offset = vmcs_read64(TSC_OFFSET); @@ -2442,11 +2441,6 @@ static void vmx_adjust_tsc_offset(struct kvm_vcpu *vcpu, s64 adjustment, bool ho offset + adjustment); } -static u64 vmx_compute_tsc_offset(struct kvm_vcpu *vcpu, u64 target_tsc) -{ - return target_tsc - rdtsc(); -} - static bool guest_cpuid_has_vmx(struct kvm_vcpu *vcpu) { struct kvm_cpuid_entry2 *best = kvm_find_cpuid_entry(vcpu, 1, 0); @@ -2778,7 +2772,7 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) case MSR_EFER: return kvm_get_msr_common(vcpu, msr_info); case MSR_IA32_TSC: - msr_info->data = guest_read_tsc(); + msr_info->data = guest_read_tsc(vcpu); break; case MSR_IA32_SYSENTER_CS: msr_info->data = vmcs_read32(GUEST_SYSENTER_CS); @@ -3154,7 +3148,8 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf) SECONDARY_EXEC_SHADOW_VMCS | SECONDARY_EXEC_XSAVES | SECONDARY_EXEC_ENABLE_PML | - SECONDARY_EXEC_PCOMMIT; + SECONDARY_EXEC_PCOMMIT | + SECONDARY_EXEC_TSC_SCALING; if (adjust_vmx_controls(min2, opt2, MSR_IA32_VMX_PROCBASED_CTLS2, &_cpu_based_2nd_exec_control) < 0) @@ -5266,6 +5261,9 @@ static int handle_exception(struct kvm_vcpu *vcpu) return handle_rmode_exception(vcpu, ex_no, error_code); switch (ex_no) { + case AC_VECTOR: + kvm_queue_exception_e(vcpu, AC_VECTOR, error_code); + return 1; case DB_VECTOR: dr6 = vmcs_readl(EXIT_QUALIFICATION); if (!(vcpu->guest_debug & @@ -5908,7 +5906,7 @@ static int handle_ept_misconfig(struct kvm_vcpu *vcpu) return 1; 
} - ret = handle_mmio_page_fault_common(vcpu, gpa, true); + ret = handle_mmio_page_fault(vcpu, gpa, true); if (likely(ret == RET_MMIO_PF_EMULATE)) return x86_emulate_instruction(vcpu, gpa, 0, NULL, 0) == EMULATE_DONE; @@ -6199,6 +6197,12 @@ static __init int hardware_setup(void) if (!cpu_has_vmx_apicv()) enable_apicv = 0; + if (cpu_has_vmx_tsc_scaling()) { + kvm_has_tsc_control = true; + kvm_max_tsc_scaling_ratio = KVM_VMX_TSC_MULTIPLIER_MAX; + kvm_tsc_scaling_ratio_frac_bits = 48; + } + if (enable_apicv) kvm_x86_ops->update_cr8_intercept = NULL; else { @@ -8008,6 +8012,9 @@ static void dump_vmcs(void) vmcs_read32(IDT_VECTORING_INFO_FIELD), vmcs_read32(IDT_VECTORING_ERROR_CODE)); pr_err("TSC Offset = 0x%016lx\n", vmcs_readl(TSC_OFFSET)); + if (secondary_exec_control & SECONDARY_EXEC_TSC_SCALING) + pr_err("TSC Multiplier = 0x%016lx\n", + vmcs_readl(TSC_MULTIPLIER)); if (cpu_based_exec_ctrl & CPU_BASED_TPR_SHADOW) pr_err("TPR Threshold = 0x%02x\n", vmcs_read32(TPR_THRESHOLD)); if (pin_based_exec_ctrl & PIN_BASED_POSTED_INTR) @@ -10752,7 +10759,7 @@ static struct kvm_x86_ops vmx_x86_ops = { .vcpu_load = vmx_vcpu_load, .vcpu_put = vmx_vcpu_put, - .update_db_bp_intercept = update_exception_bitmap, + .update_bp_intercept = update_exception_bitmap, .get_msr = vmx_get_msr, .set_msr = vmx_set_msr, .get_segment_base = vmx_get_segment_base, @@ -10826,11 +10833,9 @@ static struct kvm_x86_ops vmx_x86_ops = { .has_wbinvd_exit = cpu_has_vmx_wbinvd_exit, - .set_tsc_khz = vmx_set_tsc_khz, .read_tsc_offset = vmx_read_tsc_offset, .write_tsc_offset = vmx_write_tsc_offset, - .adjust_tsc_offset = vmx_adjust_tsc_offset, - .compute_tsc_offset = vmx_compute_tsc_offset, + .adjust_tsc_offset_guest = vmx_adjust_tsc_offset_guest, .read_l1_tsc = vmx_read_l1_tsc, .set_tdp_cr3 = vmx_set_cr3, diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 4a6eff166fc6..00462bd63129 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -93,10 +93,10 @@ static void update_cr8_intercept(struct kvm_vcpu *vcpu); static void process_nmi(struct kvm_vcpu *vcpu); static void __kvm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags); -struct kvm_x86_ops *kvm_x86_ops; +struct kvm_x86_ops *kvm_x86_ops __read_mostly; EXPORT_SYMBOL_GPL(kvm_x86_ops); -static bool ignore_msrs = 0; +static bool __read_mostly ignore_msrs = 0; module_param(ignore_msrs, bool, S_IRUGO | S_IWUSR); unsigned int min_timer_period_us = 500; @@ -105,20 +105,25 @@ module_param(min_timer_period_us, uint, S_IRUGO | S_IWUSR); static bool __read_mostly kvmclock_periodic_sync = true; module_param(kvmclock_periodic_sync, bool, S_IRUGO); -bool kvm_has_tsc_control; +bool __read_mostly kvm_has_tsc_control; EXPORT_SYMBOL_GPL(kvm_has_tsc_control); -u32 kvm_max_guest_tsc_khz; +u32 __read_mostly kvm_max_guest_tsc_khz; EXPORT_SYMBOL_GPL(kvm_max_guest_tsc_khz); +u8 __read_mostly kvm_tsc_scaling_ratio_frac_bits; +EXPORT_SYMBOL_GPL(kvm_tsc_scaling_ratio_frac_bits); +u64 __read_mostly kvm_max_tsc_scaling_ratio; +EXPORT_SYMBOL_GPL(kvm_max_tsc_scaling_ratio); +static u64 __read_mostly kvm_default_tsc_scaling_ratio; /* tsc tolerance in parts per million - default to 1/2 of the NTP threshold */ -static u32 tsc_tolerance_ppm = 250; +static u32 __read_mostly tsc_tolerance_ppm = 250; module_param(tsc_tolerance_ppm, uint, S_IRUGO | S_IWUSR); /* lapic timer advance (tscdeadline mode only) in nanoseconds */ -unsigned int lapic_timer_advance_ns = 0; +unsigned int __read_mostly lapic_timer_advance_ns = 0; module_param(lapic_timer_advance_ns, uint, S_IRUGO | S_IWUSR); -static bool 
backwards_tsc_observed = false; +static bool __read_mostly backwards_tsc_observed = false; #define KVM_NR_SHARED_MSRS 16 @@ -1249,14 +1254,53 @@ static u32 adjust_tsc_khz(u32 khz, s32 ppm) return v; } -static void kvm_set_tsc_khz(struct kvm_vcpu *vcpu, u32 this_tsc_khz) +static int set_tsc_khz(struct kvm_vcpu *vcpu, u32 user_tsc_khz, bool scale) +{ + u64 ratio; + + /* Guest TSC same frequency as host TSC? */ + if (!scale) { + vcpu->arch.tsc_scaling_ratio = kvm_default_tsc_scaling_ratio; + return 0; + } + + /* TSC scaling supported? */ + if (!kvm_has_tsc_control) { + if (user_tsc_khz > tsc_khz) { + vcpu->arch.tsc_catchup = 1; + vcpu->arch.tsc_always_catchup = 1; + return 0; + } else { + WARN(1, "user requested TSC rate below hardware speed\n"); + return -1; + } + } + + /* TSC scaling required - calculate ratio */ + ratio = mul_u64_u32_div(1ULL << kvm_tsc_scaling_ratio_frac_bits, + user_tsc_khz, tsc_khz); + + if (ratio == 0 || ratio >= kvm_max_tsc_scaling_ratio) { + WARN_ONCE(1, "Invalid TSC scaling ratio - virtual-tsc-khz=%u\n", + user_tsc_khz); + return -1; + } + + vcpu->arch.tsc_scaling_ratio = ratio; + return 0; +} + +static int kvm_set_tsc_khz(struct kvm_vcpu *vcpu, u32 this_tsc_khz) { u32 thresh_lo, thresh_hi; int use_scaling = 0; /* tsc_khz can be zero if TSC calibration fails */ - if (this_tsc_khz == 0) - return; + if (this_tsc_khz == 0) { + /* set tsc_scaling_ratio to a safe value */ + vcpu->arch.tsc_scaling_ratio = kvm_default_tsc_scaling_ratio; + return -1; + } /* Compute a scale to convert nanoseconds in TSC cycles */ kvm_get_time_scale(this_tsc_khz, NSEC_PER_SEC / 1000, @@ -1276,7 +1320,7 @@ static void kvm_set_tsc_khz(struct kvm_vcpu *vcpu, u32 this_tsc_khz) pr_debug("kvm: requested TSC rate %u falls outside tolerance [%u,%u]\n", this_tsc_khz, thresh_lo, thresh_hi); use_scaling = 1; } - kvm_x86_ops->set_tsc_khz(vcpu, this_tsc_khz, use_scaling); + return set_tsc_khz(vcpu, this_tsc_khz, use_scaling); } static u64 compute_guest_tsc(struct kvm_vcpu *vcpu, s64 kernel_ns) @@ -1322,6 +1366,48 @@ static void update_ia32_tsc_adjust_msr(struct kvm_vcpu *vcpu, s64 offset) vcpu->arch.ia32_tsc_adjust_msr += offset - curr_offset; } +/* + * Multiply tsc by a fixed point number represented by ratio. + * + * The most significant 64-N bits (mult) of ratio represent the + * integral part of the fixed point number; the remaining N bits + * (frac) represent the fractional part, ie. ratio represents a fixed + * point number (mult + frac * 2^(-N)). + * + * N equals to kvm_tsc_scaling_ratio_frac_bits. 
+ */ +static inline u64 __scale_tsc(u64 ratio, u64 tsc) +{ + return mul_u64_u64_shr(tsc, ratio, kvm_tsc_scaling_ratio_frac_bits); +} + +u64 kvm_scale_tsc(struct kvm_vcpu *vcpu, u64 tsc) +{ + u64 _tsc = tsc; + u64 ratio = vcpu->arch.tsc_scaling_ratio; + + if (ratio != kvm_default_tsc_scaling_ratio) + _tsc = __scale_tsc(ratio, tsc); + + return _tsc; +} +EXPORT_SYMBOL_GPL(kvm_scale_tsc); + +static u64 kvm_compute_tsc_offset(struct kvm_vcpu *vcpu, u64 target_tsc) +{ + u64 tsc; + + tsc = kvm_scale_tsc(vcpu, rdtsc()); + + return target_tsc - tsc; +} + +u64 kvm_read_l1_tsc(struct kvm_vcpu *vcpu, u64 host_tsc) +{ + return kvm_x86_ops->read_l1_tsc(vcpu, kvm_scale_tsc(vcpu, host_tsc)); +} +EXPORT_SYMBOL_GPL(kvm_read_l1_tsc); + void kvm_write_tsc(struct kvm_vcpu *vcpu, struct msr_data *msr) { struct kvm *kvm = vcpu->kvm; @@ -1333,7 +1419,7 @@ void kvm_write_tsc(struct kvm_vcpu *vcpu, struct msr_data *msr) u64 data = msr->data; raw_spin_lock_irqsave(&kvm->arch.tsc_write_lock, flags); - offset = kvm_x86_ops->compute_tsc_offset(vcpu, data); + offset = kvm_compute_tsc_offset(vcpu, data); ns = get_kernel_ns(); elapsed = ns - kvm->arch.last_tsc_nsec; @@ -1390,7 +1476,7 @@ void kvm_write_tsc(struct kvm_vcpu *vcpu, struct msr_data *msr) } else { u64 delta = nsec_to_cycles(vcpu, elapsed); data += delta; - offset = kvm_x86_ops->compute_tsc_offset(vcpu, data); + offset = kvm_compute_tsc_offset(vcpu, data); pr_debug("kvm: adjusted tsc offset by %llu\n", delta); } matched = true; @@ -1447,6 +1533,20 @@ void kvm_write_tsc(struct kvm_vcpu *vcpu, struct msr_data *msr) EXPORT_SYMBOL_GPL(kvm_write_tsc); +static inline void adjust_tsc_offset_guest(struct kvm_vcpu *vcpu, + s64 adjustment) +{ + kvm_x86_ops->adjust_tsc_offset_guest(vcpu, adjustment); +} + +static inline void adjust_tsc_offset_host(struct kvm_vcpu *vcpu, s64 adjustment) +{ + if (vcpu->arch.tsc_scaling_ratio != kvm_default_tsc_scaling_ratio) + WARN_ON(adjustment < 0); + adjustment = kvm_scale_tsc(vcpu, (u64) adjustment); + kvm_x86_ops->adjust_tsc_offset_guest(vcpu, adjustment); +} + #ifdef CONFIG_X86_64 static cycle_t read_tsc(void) @@ -1608,7 +1708,7 @@ static void kvm_gen_update_masterclock(struct kvm *kvm) static int kvm_guest_time_update(struct kvm_vcpu *v) { - unsigned long flags, this_tsc_khz; + unsigned long flags, this_tsc_khz, tgt_tsc_khz; struct kvm_vcpu_arch *vcpu = &v->arch; struct kvm_arch *ka = &v->kvm->arch; s64 kernel_ns; @@ -1645,7 +1745,7 @@ static int kvm_guest_time_update(struct kvm_vcpu *v) kernel_ns = get_kernel_ns(); } - tsc_timestamp = kvm_x86_ops->read_l1_tsc(v, host_tsc); + tsc_timestamp = kvm_read_l1_tsc(v, host_tsc); /* * We may have to catch up the TSC to match elapsed wall clock @@ -1671,7 +1771,9 @@ static int kvm_guest_time_update(struct kvm_vcpu *v) return 0; if (unlikely(vcpu->hw_tsc_khz != this_tsc_khz)) { - kvm_get_time_scale(NSEC_PER_SEC / 1000, this_tsc_khz, + tgt_tsc_khz = kvm_has_tsc_control ? 
+ vcpu->virtual_tsc_khz : this_tsc_khz; + kvm_get_time_scale(NSEC_PER_SEC / 1000, tgt_tsc_khz, &vcpu->hv_clock.tsc_shift, &vcpu->hv_clock.tsc_to_system_mul); vcpu->hw_tsc_khz = this_tsc_khz; @@ -2617,7 +2719,7 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu) if (tsc_delta < 0) mark_tsc_unstable("KVM discovered backwards TSC"); if (check_tsc_unstable()) { - u64 offset = kvm_x86_ops->compute_tsc_offset(vcpu, + u64 offset = kvm_compute_tsc_offset(vcpu, vcpu->arch.last_guest_tsc); kvm_x86_ops->write_tsc_offset(vcpu, offset); vcpu->arch.tsc_catchup = 1; @@ -3319,9 +3421,9 @@ long kvm_arch_vcpu_ioctl(struct file *filp, if (user_tsc_khz == 0) user_tsc_khz = tsc_khz; - kvm_set_tsc_khz(vcpu, user_tsc_khz); + if (!kvm_set_tsc_khz(vcpu, user_tsc_khz)) + r = 0; - r = 0; goto out; } case KVM_GET_TSC_KHZ: { @@ -6452,8 +6554,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) if (hw_breakpoint_active()) hw_breakpoint_restore(); - vcpu->arch.last_guest_tsc = kvm_x86_ops->read_l1_tsc(vcpu, - rdtsc()); + vcpu->arch.last_guest_tsc = kvm_read_l1_tsc(vcpu, rdtsc()); vcpu->mode = OUTSIDE_GUEST_MODE; smp_wmb(); @@ -7015,7 +7116,7 @@ int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu, */ kvm_set_rflags(vcpu, rflags); - kvm_x86_ops->update_db_bp_intercept(vcpu); + kvm_x86_ops->update_bp_intercept(vcpu); r = 0; @@ -7364,6 +7465,20 @@ int kvm_arch_hardware_setup(void) if (r != 0) return r; + if (kvm_has_tsc_control) { + /* + * Make sure the user can only configure tsc_khz values that + * fit into a signed integer. + * A min value is not calculated needed because it will always + * be 1 on all machines. + */ + u64 max = min(0x7fffffffULL, + __scale_tsc(kvm_max_tsc_scaling_ratio, tsc_khz)); + kvm_max_guest_tsc_khz = max; + + kvm_default_tsc_scaling_ratio = 1ULL << kvm_tsc_scaling_ratio_frac_bits; + } + kvm_init_msr_list(); return 0; } diff --git a/arch/x86/mm/dump_pagetables.c b/arch/x86/mm/dump_pagetables.c index 1bf417e9cc13..a035c2aa7801 100644 --- a/arch/x86/mm/dump_pagetables.c +++ b/arch/x86/mm/dump_pagetables.c @@ -358,6 +358,21 @@ static void walk_pud_level(struct seq_file *m, struct pg_state *st, pgd_t addr, #define pgd_none(a) pud_none(__pud(pgd_val(a))) #endif +#ifdef CONFIG_X86_64 +static inline bool is_hypervisor_range(int idx) +{ + /* + * ffff800000000000 - ffff87ffffffffff is reserved for + * the hypervisor. + */ + return paravirt_enabled() && + (idx >= pgd_index(__PAGE_OFFSET) - 16) && + (idx < pgd_index(__PAGE_OFFSET)); +} +#else +static inline bool is_hypervisor_range(int idx) { return false; } +#endif + static void ptdump_walk_pgd_level_core(struct seq_file *m, pgd_t *pgd, bool checkwx) { @@ -381,7 +396,7 @@ static void ptdump_walk_pgd_level_core(struct seq_file *m, pgd_t *pgd, for (i = 0; i < PTRS_PER_PGD; i++) { st.current_address = normalize_addr(i * PGD_LEVEL_MULT); - if (!pgd_none(*start)) { + if (!pgd_none(*start) && !is_hypervisor_range(i)) { if (pgd_large(*start) || !pgd_present(*start)) { prot = pgd_flags(*start); note_page(m, &st, __pgprot(prot), 1); diff --git a/arch/x86/mm/mpx.c b/arch/x86/mm/mpx.c index b0ae85f90f10..1202d5ca2fb5 100644 --- a/arch/x86/mm/mpx.c +++ b/arch/x86/mm/mpx.c @@ -586,6 +586,29 @@ static unsigned long mpx_bd_entry_to_bt_addr(struct mm_struct *mm, } /* + * We only want to do a 4-byte get_user() on 32-bit. Otherwise, + * we might run off the end of the bounds table if we are on + * a 64-bit kernel and try to get 8 bytes. 
+ */ +int get_user_bd_entry(struct mm_struct *mm, unsigned long *bd_entry_ret, + long __user *bd_entry_ptr) +{ + u32 bd_entry_32; + int ret; + + if (is_64bit_mm(mm)) + return get_user(*bd_entry_ret, bd_entry_ptr); + + /* + * Note that get_user() uses the type of the *pointer* to + * establish the size of the get, not the destination. + */ + ret = get_user(bd_entry_32, (u32 __user *)bd_entry_ptr); + *bd_entry_ret = bd_entry_32; + return ret; +} + +/* * Get the base of bounds tables pointed by specific bounds * directory entry. */ @@ -605,7 +628,7 @@ static int get_bt_addr(struct mm_struct *mm, int need_write = 0; pagefault_disable(); - ret = get_user(bd_entry, bd_entry_ptr); + ret = get_user_bd_entry(mm, &bd_entry, bd_entry_ptr); pagefault_enable(); if (!ret) break; @@ -700,11 +723,23 @@ static unsigned long mpx_get_bt_entry_offset_bytes(struct mm_struct *mm, */ static inline unsigned long bd_entry_virt_space(struct mm_struct *mm) { - unsigned long long virt_space = (1ULL << boot_cpu_data.x86_virt_bits); - if (is_64bit_mm(mm)) - return virt_space / MPX_BD_NR_ENTRIES_64; - else - return virt_space / MPX_BD_NR_ENTRIES_32; + unsigned long long virt_space; + unsigned long long GB = (1ULL << 30); + + /* + * This covers 32-bit emulation as well as 32-bit kernels + * running on 64-bit harware. + */ + if (!is_64bit_mm(mm)) + return (4ULL * GB) / MPX_BD_NR_ENTRIES_32; + + /* + * 'x86_virt_bits' returns what the hardware is capable + * of, and returns the full >32-bit adddress space when + * running 32-bit kernels on 64-bit hardware. + */ + virt_space = (1ULL << boot_cpu_data.x86_virt_bits); + return virt_space / MPX_BD_NR_ENTRIES_64; } /* diff --git a/arch/x86/um/stub_32.S b/arch/x86/um/stub_32.S index b972649d3a18..98816804e131 100644 --- a/arch/x86/um/stub_32.S +++ b/arch/x86/um/stub_32.S @@ -1,6 +1,5 @@ #include <as-layout.h> - .globl syscall_stub .section .__syscall_stub, "ax" .globl batch_syscall_stub diff --git a/arch/x86/um/stub_64.S b/arch/x86/um/stub_64.S index 7160b20172d0..ba914b3b8cc4 100644 --- a/arch/x86/um/stub_64.S +++ b/arch/x86/um/stub_64.S @@ -1,25 +1,9 @@ #include <as-layout.h> - .globl syscall_stub .section .__syscall_stub, "ax" -syscall_stub: - syscall - /* We don't have 64-bit constants, so this constructs the address - * we need. - */ - movq $(STUB_DATA >> 32), %rbx - salq $32, %rbx - movq $(STUB_DATA & 0xffffffff), %rcx - or %rcx, %rbx - movq %rax, (%rbx) - int3 - .globl batch_syscall_stub batch_syscall_stub: - mov $(STUB_DATA >> 32), %rbx - sal $32, %rbx - mov $(STUB_DATA & 0xffffffff), %rax - or %rax, %rbx + mov $(STUB_DATA), %rbx /* load pointer to first operation */ mov %rbx, %rsp add $0x10, %rsp |
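A minimal user-space sketch of the fixed-point TSC scaling math that the KVM hunks above introduce. The fraction-bit widths (48 for VMX's TSC_MULTIPLIER, 32 for SVM's TSC ratio MSR) and the ratio formula mirror set_tsc_khz() and __scale_tsc() in arch/x86/kvm/x86.c; the helper names, the GCC/Clang unsigned __int128 arithmetic (standing in for the kernel's mul_u64_u32_div()/mul_u64_u64_shr() helpers), and the example frequencies are illustrative assumptions, not part of the patch.

/*
 * Sketch only: reproduces the ratio computed by set_tsc_khz() and applied
 * by __scale_tsc()/kvm_scale_tsc() in the hunks above.  unsigned __int128
 * is a GCC/Clang extension used here in place of the kernel's
 * mul_u64_u32_div()/mul_u64_u64_shr() helpers.
 */
#include <stdint.h>
#include <stdio.h>

/* ratio = user_tsc_khz / tsc_khz as a fixed-point value with frac_bits fraction bits */
static uint64_t make_tsc_ratio(uint32_t user_tsc_khz, uint32_t tsc_khz,
			       unsigned int frac_bits)
{
	return (uint64_t)(((unsigned __int128)user_tsc_khz << frac_bits) / tsc_khz);
}

/* guest_tsc = (host_tsc * ratio) >> frac_bits, as in __scale_tsc() */
static uint64_t scale_tsc(uint64_t host_tsc, uint64_t ratio, unsigned int frac_bits)
{
	return (uint64_t)(((unsigned __int128)host_tsc * ratio) >> frac_bits);
}

int main(void)
{
	unsigned int frac_bits = 48;	/* VMX TSC_MULTIPLIER; SVM uses 32 */
	uint32_t host_khz  = 2600000;	/* hypothetical 2.6 GHz host TSC */
	uint32_t guest_khz = 1300000;	/* guest configured for 1.3 GHz */

	uint64_t ratio = make_tsc_ratio(guest_khz, host_khz, frac_bits);

	/* 10,000,000,000 host cycles should read as 5,000,000,000 guest cycles */
	printf("ratio  = 0x%llx\n", (unsigned long long)ratio);
	printf("scaled = %llu\n",
	       (unsigned long long)scale_tsc(10000000000ULL, ratio, frac_bits));
	return 0;
}

As the updated guest_read_tsc() comment in vmx.c states, the guest then observes scale(host_tsc) + tsc_offset, so adjustments passed to the ->adjust_tsc_offset_guest() hook are already in guest cycles; that is why the new adjust_tsc_offset_host() wrapper in x86.c scales host-side adjustments with kvm_scale_tsc() before applying them.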