Diffstat (limited to 'arch/x86')
 arch/x86/Kconfig                      |   3
 arch/x86/hyperv/mmu.c                 |   2
 arch/x86/include/asm/irq_remapping.h  |   5
 arch/x86/include/asm/paravirt.h       |   5
 arch/x86/include/asm/paravirt_types.h |   3
 arch/x86/include/asm/tlbflush.h       |  24
 arch/x86/include/asm/xen/hypercall.h  | 118
 arch/x86/kernel/kvm.c                 |   5
 arch/x86/kernel/paravirt.c            |   2
 arch/x86/kernel/pci-dma.c             |   8
 arch/x86/mm/init.c                    |  17
 arch/x86/mm/pgtable.c                 |   8
 arch/x86/mm/tlb.c                     | 205
 arch/x86/xen/enlighten_pv.c           |  13
 arch/x86/xen/mmu_pv.c                 |   4
 arch/x86/xen/setup.c                  |  31
 arch/x86/xen/xen-ops.h                |   1
17 files changed, 132 insertions, 322 deletions
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 512003f16889..c5ff296bc5d1 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -180,7 +180,8 @@ config X86
 	select HAVE_HARDLOCKUP_DETECTOR_PERF	if PERF_EVENTS && HAVE_PERF_EVENTS_NMI
 	select HAVE_PERF_REGS
 	select HAVE_PERF_USER_STACK_DUMP
-	select HAVE_RCU_TABLE_FREE
+	select HAVE_RCU_TABLE_FREE		if PARAVIRT
+	select HAVE_RCU_TABLE_INVALIDATE	if HAVE_RCU_TABLE_FREE
 	select HAVE_REGS_AND_STACK_ACCESS_API
 	select HAVE_RELIABLE_STACKTRACE		if X86_64 && (UNWINDER_FRAME_POINTER || UNWINDER_ORC) && STACK_VALIDATION
 	select HAVE_STACKPROTECTOR		if CC_HAS_SANE_STACKPROTECTOR
diff --git a/arch/x86/hyperv/mmu.c b/arch/x86/hyperv/mmu.c
index 1147e1fed7ff..ef5f29f913d7 100644
--- a/arch/x86/hyperv/mmu.c
+++ b/arch/x86/hyperv/mmu.c
@@ -9,6 +9,7 @@
 #include <asm/mshyperv.h>
 #include <asm/msr.h>
 #include <asm/tlbflush.h>
+#include <asm/tlb.h>
 
 #define CREATE_TRACE_POINTS
 #include <asm/trace/hyperv.h>
@@ -231,4 +232,5 @@ void hyperv_setup_mmu_ops(void)
 
 	pr_info("Using hypercall for remote TLB flush\n");
 	pv_mmu_ops.flush_tlb_others = hyperv_flush_tlb_others;
+	pv_mmu_ops.tlb_remove_table = tlb_remove_table;
 }
diff --git a/arch/x86/include/asm/irq_remapping.h b/arch/x86/include/asm/irq_remapping.h
index 023b4a9fc846..5f26962eff42 100644
--- a/arch/x86/include/asm/irq_remapping.h
+++ b/arch/x86/include/asm/irq_remapping.h
@@ -33,6 +33,11 @@ enum irq_remap_cap {
 	IRQ_POSTING_CAP = 0,
 };
 
+enum {
+	IRQ_REMAP_XAPIC_MODE,
+	IRQ_REMAP_X2APIC_MODE,
+};
+
 struct vcpu_data {
 	u64 pi_desc_addr;	/* Physical address of PI Descriptor */
 	u32 vector;		/* Guest vector of the interrupt */
diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h
index d49bbf4bb5c8..e375d4266b53 100644
--- a/arch/x86/include/asm/paravirt.h
+++ b/arch/x86/include/asm/paravirt.h
@@ -309,6 +309,11 @@ static inline void flush_tlb_others(const struct cpumask *cpumask,
 	PVOP_VCALL2(pv_mmu_ops.flush_tlb_others, cpumask, info);
 }
 
+static inline void paravirt_tlb_remove_table(struct mmu_gather *tlb, void *table)
+{
+	PVOP_VCALL2(pv_mmu_ops.tlb_remove_table, tlb, table);
+}
+
 static inline int paravirt_pgd_alloc(struct mm_struct *mm)
 {
 	return PVOP_CALL1(int, pv_mmu_ops.pgd_alloc, mm);
diff --git a/arch/x86/include/asm/paravirt_types.h b/arch/x86/include/asm/paravirt_types.h
index 180bc0bff0fb..4b75acc23b30 100644
--- a/arch/x86/include/asm/paravirt_types.h
+++ b/arch/x86/include/asm/paravirt_types.h
@@ -54,6 +54,7 @@ struct desc_struct;
 struct task_struct;
 struct cpumask;
 struct flush_tlb_info;
+struct mmu_gather;
 
 /*
  * Wrapper type for pointers to code which uses the non-standard
@@ -222,6 +223,8 @@ struct pv_mmu_ops {
 	void (*flush_tlb_others)(const struct cpumask *cpus,
 				 const struct flush_tlb_info *info);
 
+	void (*tlb_remove_table)(struct mmu_gather *tlb, void *table);
+
 	/* Hooks for allocating and freeing a pagetable top-level */
 	int  (*pgd_alloc)(struct mm_struct *mm);
 	void (*pgd_free)(struct mm_struct *mm, pgd_t *pgd);
diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h
index 511bf5fae8b8..29c9da6c62fc 100644
--- a/arch/x86/include/asm/tlbflush.h
+++ b/arch/x86/include/asm/tlbflush.h
@@ -148,6 +148,22 @@ static inline unsigned long build_cr3_noflush(pgd_t *pgd, u16 asid)
 #define __flush_tlb_one_user(addr) __native_flush_tlb_one_user(addr)
 #endif
 
+static inline bool tlb_defer_switch_to_init_mm(void)
+{
+	/*
+	 * If we have PCID, then switching to init_mm is reasonably
+	 * fast.  If we don't have PCID, then switching to init_mm is
+	 * quite slow, so we try to defer it in the hopes that we can
+	 * avoid it entirely.  The latter approach runs the risk of
+	 * receiving otherwise unnecessary IPIs.
+	 *
+	 * This choice is just a heuristic.  The tlb code can handle this
+	 * function returning true or false regardless of whether we have
+	 * PCID.
+	 */
+	return !static_cpu_has(X86_FEATURE_PCID);
+}
+
 struct tlb_context {
 	u64 ctx_id;
 	u64 tlb_gen;
@@ -536,11 +552,9 @@ extern void arch_tlbbatch_flush(struct arch_tlbflush_unmap_batch *batch);
 #ifndef CONFIG_PARAVIRT
 #define flush_tlb_others(mask, info)	\
 	native_flush_tlb_others(mask, info)
-#endif
 
-extern void tlb_flush_remove_tables(struct mm_struct *mm);
-extern void tlb_flush_remove_tables_local(void *arg);
-
-#define HAVE_TLB_FLUSH_REMOVE_TABLES
+#define paravirt_tlb_remove_table(tlb, page) \
+	tlb_remove_page(tlb, (void *)(page))
+#endif
 
 #endif /* _ASM_X86_TLBFLUSH_H */
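Note: with the tlbflush.h change above, a kernel built without CONFIG_PARAVIRT compiles paravirt_tlb_remove_table() straight down to tlb_remove_page(), with no indirect call, while paravirt kernels dispatch through pv_mmu_ops. A minimal user-space sketch of that compile-time vs. run-time dispatch pattern (names and types here are illustrative, not the kernel's):

/* Sketch: compile-time vs. run-time dispatch for a "remove table" hook.
 * Build with -DCONFIG_PARAVIRT to get the indirect-call variant.
 */
#include <stdio.h>

struct mmu_gather { int dummy; };

static void tlb_remove_page(struct mmu_gather *tlb, void *page)
{
	printf("freeing page %p immediately\n", page);
}

#ifdef CONFIG_PARAVIRT
/* With paravirt, the op is a function pointer a hypervisor can override. */
static void (*tlb_remove_table_op)(struct mmu_gather *, void *) = tlb_remove_page;
#define paravirt_tlb_remove_table(tlb, page) tlb_remove_table_op(tlb, page)
#else
/* Without paravirt, the macro collapses to a direct call. */
#define paravirt_tlb_remove_table(tlb, page) tlb_remove_page(tlb, (void *)(page))
#endif

int main(void)
{
	struct mmu_gather tlb;
	int fake_page;

	paravirt_tlb_remove_table(&tlb, &fake_page);
	return 0;
}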
diff --git a/arch/x86/include/asm/xen/hypercall.h b/arch/x86/include/asm/xen/hypercall.h
index 6b2f90a0b149..ef05bea7010d 100644
--- a/arch/x86/include/asm/xen/hypercall.h
+++ b/arch/x86/include/asm/xen/hypercall.h
@@ -197,17 +197,6 @@ extern struct { char _entry[32]; } hypercall_page[];
 	(type)__res;						\
 })
 
-#define _hypercall5(type, name, a1, a2, a3, a4, a5)		\
-({								\
-	__HYPERCALL_DECLS;					\
-	__HYPERCALL_5ARG(a1, a2, a3, a4, a5);			\
-	asm volatile (__HYPERCALL				\
-		      : __HYPERCALL_5PARAM			\
-		      : __HYPERCALL_ENTRY(name)			\
-		      : __HYPERCALL_CLOBBER5);			\
-	(type)__res;						\
-})
-
 static inline long
 xen_single_call(unsigned int call,
 		unsigned long a1, unsigned long a2,
@@ -267,47 +256,12 @@ HYPERVISOR_set_gdt(unsigned long *frame_list, int entries)
 }
 
 static inline int
-HYPERVISOR_stack_switch(unsigned long ss, unsigned long esp)
-{
-	return _hypercall2(int, stack_switch, ss, esp);
-}
-
-#ifdef CONFIG_X86_32
-static inline int
-HYPERVISOR_set_callbacks(unsigned long event_selector,
-			 unsigned long event_address,
-			 unsigned long failsafe_selector,
-			 unsigned long failsafe_address)
-{
-	return _hypercall4(int, set_callbacks,
-			   event_selector, event_address,
-			   failsafe_selector, failsafe_address);
-}
-#else  /* CONFIG_X86_64 */
-static inline int
-HYPERVISOR_set_callbacks(unsigned long event_address,
-			 unsigned long failsafe_address,
-			 unsigned long syscall_address)
-{
-	return _hypercall3(int, set_callbacks,
-			   event_address, failsafe_address,
-			   syscall_address);
-}
-#endif  /* CONFIG_X86_{32,64} */
-
-static inline int
 HYPERVISOR_callback_op(int cmd, void *arg)
 {
 	return _hypercall2(int, callback_op, cmd, arg);
 }
 
 static inline int
-HYPERVISOR_fpu_taskswitch(int set)
-{
-	return _hypercall1(int, fpu_taskswitch, set);
-}
-
-static inline int
 HYPERVISOR_sched_op(int cmd, void *arg)
 {
 	return _hypercall2(int, sched_op, cmd, arg);
@@ -419,19 +373,6 @@ HYPERVISOR_grant_table_op(unsigned int cmd, void *uop, unsigned int count)
 }
 
 static inline int
-HYPERVISOR_update_va_mapping_otherdomain(unsigned long va, pte_t new_val,
-					 unsigned long flags, domid_t domid)
-{
-	if (sizeof(new_val) == sizeof(long))
-		return _hypercall4(int, update_va_mapping_otherdomain, va,
-				   new_val.pte, flags, domid);
-	else
-		return _hypercall5(int, update_va_mapping_otherdomain, va,
-				   new_val.pte, new_val.pte >> 32,
-				   flags, domid);
-}
-
-static inline int
 HYPERVISOR_vm_assist(unsigned int cmd, unsigned int type)
 {
 	return _hypercall2(int, vm_assist, cmd, type);
@@ -465,12 +406,6 @@ HYPERVISOR_suspend(unsigned long start_info_mfn)
 	return _hypercall3(int, sched_op, SCHEDOP_shutdown, &r, start_info_mfn);
 }
 
-static inline int
-HYPERVISOR_nmi_op(unsigned long op, unsigned long arg)
-{
-	return _hypercall2(int, nmi_op, op, arg);
-}
-
 static inline unsigned long __must_check
 HYPERVISOR_hvm_op(int op, void *arg)
 {
@@ -529,39 +464,6 @@ MULTI_update_va_mapping(struct multicall_entry *mcl, unsigned long va,
 }
 
 static inline void
-MULTI_grant_table_op(struct multicall_entry *mcl, unsigned int cmd,
-		     void *uop, unsigned int count)
-{
-	mcl->op = __HYPERVISOR_grant_table_op;
-	mcl->args[0] = cmd;
-	mcl->args[1] = (unsigned long)uop;
-	mcl->args[2] = count;
-
-	trace_xen_mc_entry(mcl, 3);
-}
-
-static inline void
-MULTI_update_va_mapping_otherdomain(struct multicall_entry *mcl, unsigned long va,
-				    pte_t new_val, unsigned long flags,
-				    domid_t domid)
-{
-	mcl->op = __HYPERVISOR_update_va_mapping_otherdomain;
-	mcl->args[0] = va;
-	if (sizeof(new_val) == sizeof(long)) {
-		mcl->args[1] = new_val.pte;
-		mcl->args[2] = flags;
-		mcl->args[3] = domid;
-	} else {
-		mcl->args[1] = new_val.pte;
-		mcl->args[2] = new_val.pte >> 32;
-		mcl->args[3] = flags;
-		mcl->args[4] = domid;
-	}
-
-	trace_xen_mc_entry(mcl, sizeof(new_val) == sizeof(long) ? 4 : 5);
-}
-
-static inline void
 MULTI_update_descriptor(struct multicall_entry *mcl, u64 maddr,
 			struct desc_struct desc)
 {
@@ -582,16 +484,6 @@ MULTI_update_descriptor(struct multicall_entry *mcl, u64 maddr,
 }
 
 static inline void
-MULTI_memory_op(struct multicall_entry *mcl, unsigned int cmd, void *arg)
-{
-	mcl->op = __HYPERVISOR_memory_op;
-	mcl->args[0] = cmd;
-	mcl->args[1] = (unsigned long)arg;
-
-	trace_xen_mc_entry(mcl, 2);
-}
-
-static inline void
 MULTI_mmu_update(struct multicall_entry *mcl, struct mmu_update *req,
 		 int count, int *success_count, domid_t domid)
 {
@@ -618,16 +510,6 @@ MULTI_mmuext_op(struct multicall_entry *mcl, struct mmuext_op *op, int count,
 }
 
 static inline void
-MULTI_set_gdt(struct multicall_entry *mcl, unsigned long *frames, int entries)
-{
-	mcl->op = __HYPERVISOR_set_gdt;
-	mcl->args[0] = (unsigned long)frames;
-	mcl->args[1] = entries;
-
-	trace_xen_mc_entry(mcl, 2);
-}
-
-static inline void
 MULTI_stack_switch(struct multicall_entry *mcl,
 		   unsigned long ss, unsigned long esp)
 {
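The removed HYPERVISOR_update_va_mapping_otherdomain() wrapper used the classic 32-bit PV pattern: a 64-bit PTE does not fit in one register-sized argument, so it is split into low and high halves and the hypercall takes one extra argument (hence the now-unused _hypercall5). A hedged user-space illustration of that argument-splitting test (the argument-array layout is made up for the demo):

/* Sketch: packing a 64-bit PTE into register-sized hypercall arguments,
 * mirroring the sizeof(new_val) == sizeof(long) test in the removed wrappers.
 */
#include <stdint.h>
#include <stdio.h>

static void pack_pte_args(uint64_t pte, unsigned long args[], int *nargs)
{
	if (sizeof(pte) == sizeof(unsigned long)) {
		/* 64-bit build: the PTE fits in a single argument. */
		args[0] = (unsigned long)pte;
		*nargs = 1;
	} else {
		/* 32-bit build: split into low and high 32-bit halves. */
		args[0] = (unsigned long)(pte & 0xffffffffUL);
		args[1] = (unsigned long)(pte >> 32);
		*nargs = 2;
	}
}

int main(void)
{
	unsigned long args[2];
	int nargs;

	pack_pte_args(0x8000000012345067ULL, args, &nargs);
	printf("packed into %d arg(s), args[0]=%#lx\n", nargs, args[0]);
	return 0;
}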
diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c
index 0f471bd93417..d9b71924c23c 100644
--- a/arch/x86/kernel/kvm.c
+++ b/arch/x86/kernel/kvm.c
@@ -45,6 +45,7 @@
 #include <asm/apic.h>
 #include <asm/apicdef.h>
 #include <asm/hypervisor.h>
+#include <asm/tlb.h>
 
 static int kvmapf = 1;
 
@@ -636,8 +637,10 @@ static void __init kvm_guest_init(void)
 
 	if (kvm_para_has_feature(KVM_FEATURE_PV_TLB_FLUSH) &&
 	    !kvm_para_has_hint(KVM_HINTS_REALTIME) &&
-	    kvm_para_has_feature(KVM_FEATURE_STEAL_TIME))
+	    kvm_para_has_feature(KVM_FEATURE_STEAL_TIME)) {
 		pv_mmu_ops.flush_tlb_others = kvm_flush_tlb_others;
+		pv_mmu_ops.tlb_remove_table = tlb_remove_table;
+	}
 
 	if (kvm_para_has_feature(KVM_FEATURE_PV_EOI))
 		apic_set_eoi_write(kvm_guest_apic_eoi_write);
diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c
index 930c88341e4e..afdb303285f8 100644
--- a/arch/x86/kernel/paravirt.c
+++ b/arch/x86/kernel/paravirt.c
@@ -41,6 +41,7 @@
 #include <asm/tlbflush.h>
 #include <asm/timer.h>
 #include <asm/special_insns.h>
+#include <asm/tlb.h>
 
 /*
  * nop stub, which must not clobber anything *including the stack* to
@@ -409,6 +410,7 @@ struct pv_mmu_ops pv_mmu_ops __ro_after_init = {
 	.flush_tlb_kernel = native_flush_tlb_global,
 	.flush_tlb_one_user = native_flush_tlb_one_user,
 	.flush_tlb_others = native_flush_tlb_others,
+	.tlb_remove_table = (void (*)(struct mmu_gather *, void *))tlb_remove_page,
 
 	.pgd_alloc = __paravirt_pgd_alloc,
 	.pgd_free = paravirt_nop,
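The native default installed above needs a function-pointer cast because tlb_remove_page() takes a struct page * where the op is typed void (*)(struct mmu_gather *, void *); the kernel relies on the two signatures being ABI-compatible. A small stand-alone model of the pv_mmu_ops override pattern (illustrative names; a portable program uses a correctly typed wrapper instead of the cast, which is undefined behavior in standard C):

/* Sketch: a pv_mmu_ops-style table with a native default that a
 * hypervisor-specific init routine can override at boot.
 */
#include <stdio.h>

struct mmu_gather { int dummy; };

struct pv_mmu_ops {
	void (*tlb_remove_table)(struct mmu_gather *tlb, void *table);
};

static void native_remove(struct mmu_gather *tlb, void *table)
{
	printf("native: free %p immediately\n", table);
}

static void rcu_deferred_remove(struct mmu_gather *tlb, void *table)
{
	printf("guest: queue %p for RCU-deferred free\n", table);
}

/* Native default; a typed wrapper stands in for the kernel's cast. */
static struct pv_mmu_ops pv_mmu_ops = { .tlb_remove_table = native_remove };

static void guest_init(void)
{
	/* What the kvm/xen/hyperv setup code does for this hook. */
	pv_mmu_ops.tlb_remove_table = rcu_deferred_remove;
}

int main(void)
{
	struct mmu_gather tlb;
	int table;

	pv_mmu_ops.tlb_remove_table(&tlb, &table);	/* native path */
	guest_init();
	pv_mmu_ops.tlb_remove_table(&tlb, &table);	/* overridden path */
	return 0;
}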
diff --git a/arch/x86/kernel/pci-dma.c b/arch/x86/kernel/pci-dma.c
index acfd04121da3..7ba73fe0d917 100644
--- a/arch/x86/kernel/pci-dma.c
+++ b/arch/x86/kernel/pci-dma.c
@@ -40,8 +40,14 @@ int iommu_detected __read_mostly = 0;
  * devices and allow every device to access to whole physical memory. This is
  * useful if a user wants to use an IOMMU only for KVM device assignment to
  * guests and not for driver dma translation.
+ * It is also possible to disable by default in kernel config, and enable with
+ * iommu=nopt at boot time.
  */
+#ifdef CONFIG_IOMMU_DEFAULT_PASSTHROUGH
+int iommu_pass_through __read_mostly = 1;
+#else
 int iommu_pass_through __read_mostly;
+#endif
 
 extern struct iommu_table_entry __iommu_table[], __iommu_table_end[];
 
@@ -135,6 +141,8 @@ static __init int iommu_setup(char *p)
 #endif
 		if (!strncmp(p, "pt", 2))
 			iommu_pass_through = 1;
+		if (!strncmp(p, "nopt", 4))
+			iommu_pass_through = 0;
 
 		gart_parse_options(p);
diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c
index acfab322fbe0..5c32a7665492 100644
--- a/arch/x86/mm/init.c
+++ b/arch/x86/mm/init.c
@@ -99,15 +99,22 @@ __ref void *alloc_low_pages(unsigned int num)
 	}
 
 	if ((pgt_buf_end + num) > pgt_buf_top || !can_use_brk_pgt) {
-		unsigned long ret;
-		if (min_pfn_mapped >= max_pfn_mapped)
-			panic("alloc_low_pages: ran out of memory");
-		ret = memblock_find_in_range(min_pfn_mapped << PAGE_SHIFT,
+		unsigned long ret = 0;
+
+		if (min_pfn_mapped < max_pfn_mapped) {
+			ret = memblock_find_in_range(
+					min_pfn_mapped << PAGE_SHIFT,
 					max_pfn_mapped << PAGE_SHIFT,
 					PAGE_SIZE * num , PAGE_SIZE);
+		}
+		if (ret)
+			memblock_reserve(ret, PAGE_SIZE * num);
+		else if (can_use_brk_pgt)
+			ret = __pa(extend_brk(PAGE_SIZE * num, PAGE_SIZE));
+
 		if (!ret)
 			panic("alloc_low_pages: can not alloc memory");
-		memblock_reserve(ret, PAGE_SIZE * num);
+
 		pfn = ret >> PAGE_SHIFT;
 	} else {
 		pfn = pgt_buf_end;
diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c
index 3ef095c70ae3..e848a4811785 100644
--- a/arch/x86/mm/pgtable.c
+++ b/arch/x86/mm/pgtable.c
@@ -63,7 +63,7 @@ void ___pte_free_tlb(struct mmu_gather *tlb, struct page *pte)
 {
 	pgtable_page_dtor(pte);
 	paravirt_release_pte(page_to_pfn(pte));
-	tlb_remove_table(tlb, pte);
+	paravirt_tlb_remove_table(tlb, pte);
 }
 
 #if CONFIG_PGTABLE_LEVELS > 2
@@ -79,21 +79,21 @@ void ___pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmd)
 	tlb->need_flush_all = 1;
 #endif
 	pgtable_pmd_page_dtor(page);
-	tlb_remove_table(tlb, page);
+	paravirt_tlb_remove_table(tlb, page);
 }
 
 #if CONFIG_PGTABLE_LEVELS > 3
 void ___pud_free_tlb(struct mmu_gather *tlb, pud_t *pud)
 {
 	paravirt_release_pud(__pa(pud) >> PAGE_SHIFT);
-	tlb_remove_table(tlb, virt_to_page(pud));
+	paravirt_tlb_remove_table(tlb, virt_to_page(pud));
 }
 
 #if CONFIG_PGTABLE_LEVELS > 4
 void ___p4d_free_tlb(struct mmu_gather *tlb, p4d_t *p4d)
 {
 	paravirt_release_p4d(__pa(p4d) >> PAGE_SHIFT);
-	tlb_remove_table(tlb, virt_to_page(p4d));
+	paravirt_tlb_remove_table(tlb, virt_to_page(p4d));
 }
 #endif	/* CONFIG_PGTABLE_LEVELS > 4 */
#endif	/* CONFIG_PGTABLE_LEVELS > 3 */
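The reworked alloc_low_pages() above no longer panics the moment min_pfn_mapped >= max_pfn_mapped: it tries memblock first, falls back to extending the brk area, and only panics when both fail. A user-space sketch of that fallback ordering (the stub allocators stand in for memblock_find_in_range() and extend_brk()):

/* Sketch: try the primary allocator, fall back to a secondary one,
 * and fail hard only when neither can satisfy the request.
 */
#include <stdio.h>
#include <stdlib.h>

/* Stubs: return 0 to simulate an allocator that cannot help. */
static unsigned long memblock_stub(unsigned long bytes) { return 0; }
static unsigned long brk_stub(unsigned long bytes) { return 0x1000; }

static unsigned long alloc_low_pages_model(unsigned long bytes,
					   int range_mapped, int can_use_brk)
{
	unsigned long ret = 0;

	if (range_mapped)		/* min_pfn_mapped < max_pfn_mapped */
		ret = memblock_stub(bytes);
	if (!ret && can_use_brk)	/* new fallback instead of a panic */
		ret = brk_stub(bytes);
	if (!ret) {
		fprintf(stderr, "alloc_low_pages: can not alloc memory\n");
		exit(1);
	}
	return ret;
}

int main(void)
{
	printf("got %#lx\n", alloc_low_pages_model(4096, 0, 1));
	return 0;
}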
diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c
index 752dbf4e0e50..9517d1b2a281 100644
--- a/arch/x86/mm/tlb.c
+++ b/arch/x86/mm/tlb.c
@@ -7,7 +7,6 @@
 #include <linux/export.h>
 #include <linux/cpu.h>
 #include <linux/debugfs.h>
-#include <linux/gfp.h>
 
 #include <asm/tlbflush.h>
 #include <asm/mmu_context.h>
@@ -186,11 +185,8 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
 {
 	struct mm_struct *real_prev = this_cpu_read(cpu_tlbstate.loaded_mm);
 	u16 prev_asid = this_cpu_read(cpu_tlbstate.loaded_mm_asid);
-	bool was_lazy = this_cpu_read(cpu_tlbstate.is_lazy);
 	unsigned cpu = smp_processor_id();
 	u64 next_tlb_gen;
-	bool need_flush;
-	u16 new_asid;
 
 	/*
 	 * NB: The scheduler will call us with prev == next when switching
@@ -244,41 +240,20 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
 			   next->context.ctx_id);
 
 		/*
-		 * Even in lazy TLB mode, the CPU should stay set in the
-		 * mm_cpumask. The TLB shootdown code can figure out from
-		 * from cpu_tlbstate.is_lazy whether or not to send an IPI.
+		 * We don't currently support having a real mm loaded without
+		 * our cpu set in mm_cpumask().  We have all the bookkeeping
+		 * in place to figure out whether we would need to flush
+		 * if our cpu were cleared in mm_cpumask(), but we don't
+		 * currently use it.
 		 */
 		if (WARN_ON_ONCE(real_prev != &init_mm &&
 				 !cpumask_test_cpu(cpu, mm_cpumask(next))))
 			cpumask_set_cpu(cpu, mm_cpumask(next));
 
-		/*
-		 * If the CPU is not in lazy TLB mode, we are just switching
-		 * from one thread in a process to another thread in the same
-		 * process. No TLB flush required.
-		 */
-		if (!was_lazy)
-			return;
-
-		/*
-		 * Read the tlb_gen to check whether a flush is needed.
-		 * If the TLB is up to date, just use it.
-		 * The barrier synchronizes with the tlb_gen increment in
-		 * the TLB shootdown code.
-		 */
-		smp_mb();
-		next_tlb_gen = atomic64_read(&next->context.tlb_gen);
-		if (this_cpu_read(cpu_tlbstate.ctxs[prev_asid].tlb_gen) ==
-				next_tlb_gen)
-			return;
-
-		/*
-		 * TLB contents went out of date while we were in lazy
-		 * mode. Fall through to the TLB switching code below.
-		 */
-		new_asid = prev_asid;
-		need_flush = true;
+		return;
 	} else {
+		u16 new_asid;
+		bool need_flush;
 		u64 last_ctx_id = this_cpu_read(cpu_tlbstate.last_ctx_id);
 
 		/*
@@ -329,41 +304,41 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
 		next_tlb_gen = atomic64_read(&next->context.tlb_gen);
 
 		choose_new_asid(next, next_tlb_gen, &new_asid, &need_flush);
-	}
 
-	if (need_flush) {
-		this_cpu_write(cpu_tlbstate.ctxs[new_asid].ctx_id, next->context.ctx_id);
-		this_cpu_write(cpu_tlbstate.ctxs[new_asid].tlb_gen, next_tlb_gen);
-		load_new_mm_cr3(next->pgd, new_asid, true);
+		if (need_flush) {
+			this_cpu_write(cpu_tlbstate.ctxs[new_asid].ctx_id, next->context.ctx_id);
+			this_cpu_write(cpu_tlbstate.ctxs[new_asid].tlb_gen, next_tlb_gen);
+			load_new_mm_cr3(next->pgd, new_asid, true);
+
+			/*
+			 * NB: This gets called via leave_mm() in the idle path
+			 * where RCU functions differently.  Tracing normally
+			 * uses RCU, so we need to use the _rcuidle variant.
+			 *
+			 * (There is no good reason for this.  The idle code should
+			 *  be rearranged to call this before rcu_idle_enter().)
+			 */
+			trace_tlb_flush_rcuidle(TLB_FLUSH_ON_TASK_SWITCH, TLB_FLUSH_ALL);
+		} else {
+			/* The new ASID is already up to date. */
+			load_new_mm_cr3(next->pgd, new_asid, false);
+
+			/* See above wrt _rcuidle. */
+			trace_tlb_flush_rcuidle(TLB_FLUSH_ON_TASK_SWITCH, 0);
+		}
 
 		/*
-		 * NB: This gets called via leave_mm() in the idle path
-		 * where RCU functions differently.  Tracing normally
-		 * uses RCU, so we need to use the _rcuidle variant.
-		 *
-		 * (There is no good reason for this.  The idle code should
-		 *  be rearranged to call this before rcu_idle_enter().)
+		 * Record last user mm's context id, so we can avoid
+		 * flushing branch buffer with IBPB if we switch back
+		 * to the same user.
 		 */
-		trace_tlb_flush_rcuidle(TLB_FLUSH_ON_TASK_SWITCH, TLB_FLUSH_ALL);
-	} else {
-		/* The new ASID is already up to date. */
-		load_new_mm_cr3(next->pgd, new_asid, false);
+		if (next != &init_mm)
+			this_cpu_write(cpu_tlbstate.last_ctx_id, next->context.ctx_id);
 
-		/* See above wrt _rcuidle. */
-		trace_tlb_flush_rcuidle(TLB_FLUSH_ON_TASK_SWITCH, 0);
+		this_cpu_write(cpu_tlbstate.loaded_mm, next);
+		this_cpu_write(cpu_tlbstate.loaded_mm_asid, new_asid);
 	}
 
-	/*
-	 * Record last user mm's context id, so we can avoid
-	 * flushing branch buffer with IBPB if we switch back
-	 * to the same user.
-	 */
-	if (next != &init_mm)
-		this_cpu_write(cpu_tlbstate.last_ctx_id, next->context.ctx_id);
-
-	this_cpu_write(cpu_tlbstate.loaded_mm, next);
-	this_cpu_write(cpu_tlbstate.loaded_mm_asid, new_asid);
-
 	load_mm_cr4(next);
 	switch_ldt(real_prev, next);
 }
@@ -386,7 +361,20 @@ void enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk)
 	if (this_cpu_read(cpu_tlbstate.loaded_mm) == &init_mm)
 		return;
 
-	this_cpu_write(cpu_tlbstate.is_lazy, true);
+	if (tlb_defer_switch_to_init_mm()) {
+		/*
+		 * There's a significant optimization that may be possible
+		 * here.  We have accurate enough TLB flush tracking that we
+		 * don't need to maintain coherence of TLB per se when we're
+		 * lazy.  We do, however, need to maintain coherence of
+		 * paging-structure caches.  We could, in principle, leave our
+		 * old mm loaded and only switch to init_mm when
+		 * tlb_remove_page() happens.
+		 */
+		this_cpu_write(cpu_tlbstate.is_lazy, true);
+	} else {
+		switch_mm(NULL, &init_mm, NULL);
+	}
 }
 
 /*
@@ -473,9 +461,6 @@ static void flush_tlb_func_common(const struct flush_tlb_info *f,
 		 * paging-structure cache to avoid speculatively reading
 		 * garbage into our TLB.  Since switching to init_mm is barely
 		 * slower than a minimal flush, just switch to init_mm.
-		 *
-		 * This should be rare, with native_flush_tlb_others skipping
-		 * IPIs to lazy TLB mode CPUs.
 		 */
 		switch_mm_irqs_off(NULL, &init_mm, NULL);
 		return;
@@ -582,9 +567,6 @@ static void flush_tlb_func_remote(void *info)
 void native_flush_tlb_others(const struct cpumask *cpumask,
 			     const struct flush_tlb_info *info)
 {
-	cpumask_var_t lazymask;
-	unsigned int cpu;
-
 	count_vm_tlb_event(NR_TLB_REMOTE_FLUSH);
 	if (info->end == TLB_FLUSH_ALL)
 		trace_tlb_flush(TLB_REMOTE_SEND_IPI, TLB_FLUSH_ALL);
@@ -608,6 +590,8 @@ void native_flush_tlb_others(const struct cpumask *cpumask,
 		 * that UV should be updated so that smp_call_function_many(),
 		 * etc, are optimal on UV.
 		 */
+		unsigned int cpu;
+
 		cpu = smp_processor_id();
 		cpumask = uv_flush_tlb_others(cpumask, info);
 		if (cpumask)
 			smp_call_function_many(cpumask, flush_tlb_func_remote,
 					       (void *)info, 1);
 		return;
 	}
@@ -615,29 +599,8 @@ void native_flush_tlb_others(const struct cpumask *cpumask,
 				       (void *)info, 1);
 		return;
 	}
-
-	/*
-	 * A temporary cpumask is used in order to skip sending IPIs
-	 * to CPUs in lazy TLB state, while keeping them in mm_cpumask(mm).
-	 * If the allocation fails, simply IPI every CPU in mm_cpumask.
-	 */
-	if (!alloc_cpumask_var(&lazymask, GFP_ATOMIC)) {
-		smp_call_function_many(cpumask, flush_tlb_func_remote,
-				       (void *)info, 1);
-		return;
-	}
-
-	cpumask_copy(lazymask, cpumask);
-
-	for_each_cpu(cpu, lazymask) {
-		if (per_cpu(cpu_tlbstate.is_lazy, cpu))
-			cpumask_clear_cpu(cpu, lazymask);
-	}
-
-	smp_call_function_many(lazymask, flush_tlb_func_remote,
+	smp_call_function_many(cpumask, flush_tlb_func_remote,
 			       (void *)info, 1);
-
-	free_cpumask_var(lazymask);
 }
 
@@ -690,68 +653,6 @@ void flush_tlb_mm_range(struct mm_struct *mm, unsigned long start,
 	put_cpu();
 }
 
-void tlb_flush_remove_tables_local(void *arg)
-{
-	struct mm_struct *mm = arg;
-
-	if (this_cpu_read(cpu_tlbstate.loaded_mm) == mm &&
-			this_cpu_read(cpu_tlbstate.is_lazy)) {
-		/*
-		 * We're in lazy mode.  We need to at least flush our
-		 * paging-structure cache to avoid speculatively reading
-		 * garbage into our TLB.  Since switching to init_mm is barely
-		 * slower than a minimal flush, just switch to init_mm.
-		 */
-		switch_mm_irqs_off(NULL, &init_mm, NULL);
-	}
-}
-
-static void mm_fill_lazy_tlb_cpu_mask(struct mm_struct *mm,
-				      struct cpumask *lazy_cpus)
-{
-	int cpu;
-
-	for_each_cpu(cpu, mm_cpumask(mm)) {
-		if (!per_cpu(cpu_tlbstate.is_lazy, cpu))
-			cpumask_set_cpu(cpu, lazy_cpus);
-	}
-}
-
-void tlb_flush_remove_tables(struct mm_struct *mm)
-{
-	int cpu = get_cpu();
-	cpumask_var_t lazy_cpus;
-
-	if (cpumask_any_but(mm_cpumask(mm), cpu) >= nr_cpu_ids) {
-		put_cpu();
-		return;
-	}
-
-	if (!zalloc_cpumask_var(&lazy_cpus, GFP_ATOMIC)) {
-		/*
-		 * If the cpumask allocation fails, do a brute force flush
-		 * on all the CPUs that have this mm loaded.
-		 */
-		smp_call_function_many(mm_cpumask(mm),
-				       tlb_flush_remove_tables_local, (void *)mm, 1);
-		put_cpu();
-		return;
-	}
-
-	/*
-	 * CPUs with !is_lazy either received a TLB flush IPI while the user
-	 * pages in this address range were unmapped, or have context switched
-	 * and reloaded %CR3 since then.
-	 *
-	 * Shootdown IPIs at page table freeing time only need to be sent to
-	 * CPUs that may have out of date TLB contents.
-	 */
-	mm_fill_lazy_tlb_cpu_mask(mm, lazy_cpus);
-	smp_call_function_many(lazy_cpus,
-			       tlb_flush_remove_tables_local, (void *)mm, 1);
-	free_cpumask_var(lazy_cpus);
-	put_cpu();
-}
-
 static void do_flush_tlb_all(void *info)
 {
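The deleted lazy-TLB fast path in switch_mm_irqs_off() decided whether a flush was needed by comparing this CPU's cached TLB generation against the mm's tlb_gen. The revert drops it, but the generation-counter pattern is worth seeing in isolation; a hedged C11 model (counter names follow the kernel's, everything else is illustrative):

/* Sketch: generation-counter check. Flush only if this CPU's view of
 * the mm's TLB generation is stale, as the removed lazy path did.
 */
#include <stdatomic.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

static atomic_uint_fast64_t mm_tlb_gen;	/* bumped by each shootdown */
static uint_fast64_t cpu_tlb_gen;	/* this CPU's last-flushed gen */

static bool need_flush(void)
{
	/* Pairs with the increment in the shootdown path. */
	uint_fast64_t next_tlb_gen = atomic_load(&mm_tlb_gen);

	if (cpu_tlb_gen == next_tlb_gen)
		return false;		/* TLB already up to date */
	cpu_tlb_gen = next_tlb_gen;
	return true;
}

int main(void)
{
	printf("flush? %d\n", need_flush());	/* 0: up to date */
	atomic_fetch_add(&mm_tlb_gen, 1);	/* someone unmapped pages */
	printf("flush? %d\n", need_flush());	/* 1: stale, must flush */
	return 0;
}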
diff --git a/arch/x86/xen/enlighten_pv.c b/arch/x86/xen/enlighten_pv.c
index ee3b00c7acda..52a7c3faee0c 100644
--- a/arch/x86/xen/enlighten_pv.c
+++ b/arch/x86/xen/enlighten_pv.c
@@ -122,6 +122,8 @@ static void __init xen_banner(void)
 
 static void __init xen_pv_init_platform(void)
 {
+	populate_extra_pte(fix_to_virt(FIX_PARAVIRT_BOOTMAP));
+
 	set_fixmap(FIX_PARAVIRT_BOOTMAP, xen_start_info->shared_info);
 	HYPERVISOR_shared_info = (void *)fix_to_virt(FIX_PARAVIRT_BOOTMAP);
 
@@ -1170,13 +1172,13 @@ static void __init xen_boot_params_init_edd(void)
  * we do this, we have to be careful not to call any stack-protected
  * function, which is most of the kernel.
  */
-static void xen_setup_gdt(int cpu)
+static void __init xen_setup_gdt(int cpu)
 {
 	pv_cpu_ops.write_gdt_entry = xen_write_gdt_entry_boot;
 	pv_cpu_ops.load_gdt = xen_load_gdt_boot;
 
-	setup_stack_canary_segment(0);
-	switch_to_new_gdt(0);
+	setup_stack_canary_segment(cpu);
+	switch_to_new_gdt(cpu);
 
 	pv_cpu_ops.write_gdt_entry = xen_write_gdt_entry;
 	pv_cpu_ops.load_gdt = xen_load_gdt;
@@ -1385,8 +1387,11 @@ asmlinkage __visible void __init xen_start_kernel(void)
 		xen_boot_params_init_edd();
 	}
 
-	add_preferred_console("tty", 0, NULL);
+	if (!boot_params.screen_info.orig_video_isVGA)
+		add_preferred_console("tty", 0, NULL);
 	add_preferred_console("hvc", 0, NULL);
+	if (boot_params.screen_info.orig_video_isVGA)
+		add_preferred_console("tty", 0, NULL);
 
 #ifdef CONFIG_PCI
 	/* PCI BIOS service won't work from a PV guest. */
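The console reordering above works because add_preferred_console() makes the most recently registered console the preferred one: when the boot parameters report a VGA console, tty is now registered after hvc and wins; otherwise hvc stays preferred. A tiny last-registered-wins model (the registration bookkeeping is simplified to a single pointer):

/* Sketch: the last add_preferred_console() call wins, so registration
 * order decides which console becomes /dev/console.
 */
#include <stdio.h>

static const char *preferred;

static void add_preferred_console_model(const char *name)
{
	preferred = name;	/* later calls override earlier ones */
}

static void pick_consoles(int vga_present)
{
	if (!vga_present)
		add_preferred_console_model("tty");
	add_preferred_console_model("hvc");
	if (vga_present)
		add_preferred_console_model("tty");
}

int main(void)
{
	pick_consoles(0);
	printf("no VGA: preferred = %s\n", preferred);	/* hvc */
	pick_consoles(1);
	printf("VGA:    preferred = %s\n", preferred);	/* tty */
	return 0;
}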
diff --git a/arch/x86/xen/mmu_pv.c b/arch/x86/xen/mmu_pv.c
index 52206ad81e4b..45b700ac5fe7 100644
--- a/arch/x86/xen/mmu_pv.c
+++ b/arch/x86/xen/mmu_pv.c
@@ -67,6 +67,7 @@
 #include <asm/init.h>
 #include <asm/pat.h>
 #include <asm/smp.h>
+#include <asm/tlb.h>
 
 #include <asm/xen/hypercall.h>
 #include <asm/xen/hypervisor.h>
@@ -2171,6 +2172,8 @@ void __init xen_relocate_p2m(void)
 #else	/* !CONFIG_X86_64 */
 static RESERVE_BRK_ARRAY(pmd_t, initial_kernel_pmd, PTRS_PER_PMD);
 static RESERVE_BRK_ARRAY(pmd_t, swapper_kernel_pmd, PTRS_PER_PMD);
+RESERVE_BRK(fixup_kernel_pmd, PAGE_SIZE);
+RESERVE_BRK(fixup_kernel_pte, PAGE_SIZE);
 
 static void __init xen_write_cr3_init(unsigned long cr3)
 {
@@ -2397,6 +2400,7 @@ static const struct pv_mmu_ops xen_mmu_ops __initconst = {
 	.flush_tlb_kernel = xen_flush_tlb,
 	.flush_tlb_one_user = xen_flush_tlb_one_user,
 	.flush_tlb_others = xen_flush_tlb_others,
+	.tlb_remove_table = tlb_remove_table,
 
 	.pgd_alloc = xen_pgd_alloc,
 	.pgd_free = xen_pgd_free,
diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c
index 6e0d2086eacb..1163e33121fb 100644
--- a/arch/x86/xen/setup.c
+++ b/arch/x86/xen/setup.c
@@ -906,37 +906,6 @@ char * __init xen_memory_setup(void)
 }
 
 /*
- * Machine specific memory setup for auto-translated guests.
- */
-char * __init xen_auto_xlated_memory_setup(void)
-{
-	struct xen_memory_map memmap;
-	int i;
-	int rc;
-
-	memmap.nr_entries = ARRAY_SIZE(xen_e820_table.entries);
-	set_xen_guest_handle(memmap.buffer, xen_e820_table.entries);
-
-	rc = HYPERVISOR_memory_op(XENMEM_memory_map, &memmap);
-	if (rc < 0)
-		panic("No memory map (%d)\n", rc);
-
-	xen_e820_table.nr_entries = memmap.nr_entries;
-
-	e820__update_table(&xen_e820_table);
-
-	for (i = 0; i < xen_e820_table.nr_entries; i++)
-		e820__range_add(xen_e820_table.entries[i].addr, xen_e820_table.entries[i].size, xen_e820_table.entries[i].type);
-
-	/* Remove p2m info, it is not needed. */
-	xen_start_info->mfn_list = 0;
-	xen_start_info->first_p2m_pfn = 0;
-	xen_start_info->nr_p2m_frames = 0;
-
-	return "Xen";
-}
-
-/*
  * Set the bit indicating "nosegneg" library variants should be used.
  * We only need to bother in pure 32-bit mode; compat 32-bit processes
  * can have un-truncated segments, so wrapping around is allowed.
diff --git a/arch/x86/xen/xen-ops.h b/arch/x86/xen/xen-ops.h
index e78684597f57..0e60bd918695 100644
--- a/arch/x86/xen/xen-ops.h
+++ b/arch/x86/xen/xen-ops.h
@@ -50,7 +50,6 @@ void __init xen_inv_extra_mem(void);
 void __init xen_remap_memory(void);
 phys_addr_t __init xen_find_free_area(phys_addr_t size);
 char * __init xen_memory_setup(void);
-char * xen_auto_xlated_memory_setup(void);
 void __init xen_arch_setup(void);
 void xen_enable_sysenter(void);
 void xen_enable_syscall(void);
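Taken together, the Kconfig, paravirt, and guest changes address one problem: a paravirtualized flush_tlb_others() sends no IPIs, so freeing a page-table page immediately after a flush could pull it out from under a concurrent lockless walker (such as fast GUP) on another vCPU. Routing frees through tlb_remove_table() defers them past an RCU grace period. A hedged sketch of the two strategies (the grace period is simulated by a stub):

/* Sketch: immediate vs. RCU-deferred freeing of a page-table page.
 * synchronize_rcu_model() stands in for a real grace period; in the
 * kernel, tlb_remove_table() batches pages and frees them via RCU.
 */
#include <stdio.h>
#include <stdlib.h>

static void synchronize_rcu_model(void)
{
	/* Placeholder: wait until no lockless walker can still hold a
	 * pointer to the table (a real RCU grace period in the kernel). */
	printf("  ...grace period elapses...\n");
}

static void tlb_remove_page_model(void *table)
{
	/* Native: remote CPUs were just IPI'd, so nobody can be mid-walk
	 * on this table. Free it immediately. */
	printf("free %p now\n", table);
	free(table);
}

static void tlb_remove_table_model(void *table)
{
	/* Paravirt guest: the hypervisor flushed remote TLBs without
	 * IPIs, so a walker may still be traversing the table. */
	synchronize_rcu_model();
	printf("free %p after grace period\n", table);
	free(table);
}

int main(void)
{
	tlb_remove_page_model(malloc(4096));
	tlb_remove_table_model(malloc(4096));
	return 0;
}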