Diffstat (limited to 'arch/x86')
-rw-r--r--  arch/x86/Kconfig                      |   3
-rw-r--r--  arch/x86/hyperv/mmu.c                 |   2
-rw-r--r--  arch/x86/include/asm/irq_remapping.h  |   5
-rw-r--r--  arch/x86/include/asm/paravirt.h       |   5
-rw-r--r--  arch/x86/include/asm/paravirt_types.h |   3
-rw-r--r--  arch/x86/include/asm/tlbflush.h       |  24
-rw-r--r--  arch/x86/include/asm/xen/hypercall.h  | 118
-rw-r--r--  arch/x86/kernel/kvm.c                 |   5
-rw-r--r--  arch/x86/kernel/paravirt.c            |   2
-rw-r--r--  arch/x86/kernel/pci-dma.c             |   8
-rw-r--r--  arch/x86/mm/init.c                    |  17
-rw-r--r--  arch/x86/mm/pgtable.c                 |   8
-rw-r--r--  arch/x86/mm/tlb.c                     | 205
-rw-r--r--  arch/x86/xen/enlighten_pv.c           |  13
-rw-r--r--  arch/x86/xen/mmu_pv.c                 |   4
-rw-r--r--  arch/x86/xen/setup.c                  |  31
-rw-r--r--  arch/x86/xen/xen-ops.h                |   1
17 files changed, 132 insertions, 322 deletions
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 512003f16889..c5ff296bc5d1 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -180,7 +180,8 @@ config X86
select HAVE_HARDLOCKUP_DETECTOR_PERF if PERF_EVENTS && HAVE_PERF_EVENTS_NMI
select HAVE_PERF_REGS
select HAVE_PERF_USER_STACK_DUMP
- select HAVE_RCU_TABLE_FREE
+ select HAVE_RCU_TABLE_FREE if PARAVIRT
+ select HAVE_RCU_TABLE_INVALIDATE if HAVE_RCU_TABLE_FREE
select HAVE_REGS_AND_STACK_ACCESS_API
select HAVE_RELIABLE_STACKTRACE if X86_64 && (UNWINDER_FRAME_POINTER || UNWINDER_ORC) && STACK_VALIDATION
select HAVE_STACKPROTECTOR if CC_HAS_SANE_STACKPROTECTOR
diff --git a/arch/x86/hyperv/mmu.c b/arch/x86/hyperv/mmu.c
index 1147e1fed7ff..ef5f29f913d7 100644
--- a/arch/x86/hyperv/mmu.c
+++ b/arch/x86/hyperv/mmu.c
@@ -9,6 +9,7 @@
#include <asm/mshyperv.h>
#include <asm/msr.h>
#include <asm/tlbflush.h>
+#include <asm/tlb.h>
#define CREATE_TRACE_POINTS
#include <asm/trace/hyperv.h>
@@ -231,4 +232,5 @@ void hyperv_setup_mmu_ops(void)
pr_info("Using hypercall for remote TLB flush\n");
pv_mmu_ops.flush_tlb_others = hyperv_flush_tlb_others;
+ pv_mmu_ops.tlb_remove_table = tlb_remove_table;
}
diff --git a/arch/x86/include/asm/irq_remapping.h b/arch/x86/include/asm/irq_remapping.h
index 023b4a9fc846..5f26962eff42 100644
--- a/arch/x86/include/asm/irq_remapping.h
+++ b/arch/x86/include/asm/irq_remapping.h
@@ -33,6 +33,11 @@ enum irq_remap_cap {
IRQ_POSTING_CAP = 0,
};
+enum {
+ IRQ_REMAP_XAPIC_MODE,
+ IRQ_REMAP_X2APIC_MODE,
+};
+
struct vcpu_data {
u64 pi_desc_addr; /* Physical address of PI Descriptor */
u32 vector; /* Guest vector of the interrupt */
diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h
index d49bbf4bb5c8..e375d4266b53 100644
--- a/arch/x86/include/asm/paravirt.h
+++ b/arch/x86/include/asm/paravirt.h
@@ -309,6 +309,11 @@ static inline void flush_tlb_others(const struct cpumask *cpumask,
PVOP_VCALL2(pv_mmu_ops.flush_tlb_others, cpumask, info);
}
+static inline void paravirt_tlb_remove_table(struct mmu_gather *tlb, void *table)
+{
+ PVOP_VCALL2(pv_mmu_ops.tlb_remove_table, tlb, table);
+}
+
static inline int paravirt_pgd_alloc(struct mm_struct *mm)
{
return PVOP_CALL1(int, pv_mmu_ops.pgd_alloc, mm);
diff --git a/arch/x86/include/asm/paravirt_types.h b/arch/x86/include/asm/paravirt_types.h
index 180bc0bff0fb..4b75acc23b30 100644
--- a/arch/x86/include/asm/paravirt_types.h
+++ b/arch/x86/include/asm/paravirt_types.h
@@ -54,6 +54,7 @@ struct desc_struct;
struct task_struct;
struct cpumask;
struct flush_tlb_info;
+struct mmu_gather;
/*
* Wrapper type for pointers to code which uses the non-standard
@@ -222,6 +223,8 @@ struct pv_mmu_ops {
void (*flush_tlb_others)(const struct cpumask *cpus,
const struct flush_tlb_info *info);
+ void (*tlb_remove_table)(struct mmu_gather *tlb, void *table);
+
/* Hooks for allocating and freeing a pagetable top-level */
int (*pgd_alloc)(struct mm_struct *mm);
void (*pgd_free)(struct mm_struct *mm, pgd_t *pgd);
diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h
index 511bf5fae8b8..29c9da6c62fc 100644
--- a/arch/x86/include/asm/tlbflush.h
+++ b/arch/x86/include/asm/tlbflush.h
@@ -148,6 +148,22 @@ static inline unsigned long build_cr3_noflush(pgd_t *pgd, u16 asid)
#define __flush_tlb_one_user(addr) __native_flush_tlb_one_user(addr)
#endif
+static inline bool tlb_defer_switch_to_init_mm(void)
+{
+ /*
+ * If we have PCID, then switching to init_mm is reasonably
+ * fast. If we don't have PCID, then switching to init_mm is
+ * quite slow, so we try to defer it in the hopes that we can
+ * avoid it entirely. The latter approach runs the risk of
+ * receiving otherwise unnecessary IPIs.
+ *
+ * This choice is just a heuristic. The tlb code can handle this
+ * function returning true or false regardless of whether we have
+ * PCID.
+ */
+ return !static_cpu_has(X86_FEATURE_PCID);
+}
+
struct tlb_context {
u64 ctx_id;
u64 tlb_gen;
@@ -536,11 +552,9 @@ extern void arch_tlbbatch_flush(struct arch_tlbflush_unmap_batch *batch);
#ifndef CONFIG_PARAVIRT
#define flush_tlb_others(mask, info) \
native_flush_tlb_others(mask, info)
-#endif
-extern void tlb_flush_remove_tables(struct mm_struct *mm);
-extern void tlb_flush_remove_tables_local(void *arg);
-
-#define HAVE_TLB_FLUSH_REMOVE_TABLES
+#define paravirt_tlb_remove_table(tlb, page) \
+ tlb_remove_page(tlb, (void *)(page))
+#endif
#endif /* _ASM_X86_TLBFLUSH_H */
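To see how the new hook resolves end to end, here is a stand-alone user-space model (the names mirror the kernel's, but the bodies and the main() driver are invented purely for illustration): with CONFIG_PARAVIRT the page-table free path dispatches through pv_mmu_ops.tlb_remove_table, whose native default behaves like tlb_remove_page, and a guest that installs a remote-flush hypercall also installs the deferred tlb_remove_table, much as the hyperv, kvm and xen hunks in this commit do.

#include <stdio.h>

struct mmu_gather { int dummy; };

/* Default: free the table page immediately (models tlb_remove_page). */
static void model_tlb_remove_page(struct mmu_gather *tlb, void *table)
{
	(void)tlb;
	printf("free table %p immediately\n", table);
}

/* Deferred free (models tlb_remove_table from mm/memory.c). */
static void model_tlb_remove_table(struct mmu_gather *tlb, void *table)
{
	(void)tlb;
	printf("defer freeing table %p until no CPU can still walk it\n", table);
}

/* Models pv_mmu_ops: native default, overridden by a hypervisor guest. */
struct model_pv_mmu_ops {
	void (*tlb_remove_table)(struct mmu_gather *tlb, void *table);
};

static struct model_pv_mmu_ops pv_mmu_ops = {
	.tlb_remove_table = model_tlb_remove_page,	/* native default */
};

/* Models what the hyperv/kvm/xen init code does when it takes over
 * flush_tlb_others: it must also switch to the deferred table free. */
static void guest_init(void)
{
	pv_mmu_ops.tlb_remove_table = model_tlb_remove_table;
}

int main(void)
{
	struct mmu_gather tlb;
	int table;

	pv_mmu_ops.tlb_remove_table(&tlb, &table);	/* native path */
	guest_init();
	pv_mmu_ops.tlb_remove_table(&tlb, &table);	/* paravirt path */
	return 0;
}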
diff --git a/arch/x86/include/asm/xen/hypercall.h b/arch/x86/include/asm/xen/hypercall.h
index 6b2f90a0b149..ef05bea7010d 100644
--- a/arch/x86/include/asm/xen/hypercall.h
+++ b/arch/x86/include/asm/xen/hypercall.h
@@ -197,17 +197,6 @@ extern struct { char _entry[32]; } hypercall_page[];
(type)__res; \
})
-#define _hypercall5(type, name, a1, a2, a3, a4, a5) \
-({ \
- __HYPERCALL_DECLS; \
- __HYPERCALL_5ARG(a1, a2, a3, a4, a5); \
- asm volatile (__HYPERCALL \
- : __HYPERCALL_5PARAM \
- : __HYPERCALL_ENTRY(name) \
- : __HYPERCALL_CLOBBER5); \
- (type)__res; \
-})
-
static inline long
xen_single_call(unsigned int call,
unsigned long a1, unsigned long a2,
@@ -267,47 +256,12 @@ HYPERVISOR_set_gdt(unsigned long *frame_list, int entries)
}
static inline int
-HYPERVISOR_stack_switch(unsigned long ss, unsigned long esp)
-{
- return _hypercall2(int, stack_switch, ss, esp);
-}
-
-#ifdef CONFIG_X86_32
-static inline int
-HYPERVISOR_set_callbacks(unsigned long event_selector,
- unsigned long event_address,
- unsigned long failsafe_selector,
- unsigned long failsafe_address)
-{
- return _hypercall4(int, set_callbacks,
- event_selector, event_address,
- failsafe_selector, failsafe_address);
-}
-#else /* CONFIG_X86_64 */
-static inline int
-HYPERVISOR_set_callbacks(unsigned long event_address,
- unsigned long failsafe_address,
- unsigned long syscall_address)
-{
- return _hypercall3(int, set_callbacks,
- event_address, failsafe_address,
- syscall_address);
-}
-#endif /* CONFIG_X86_{32,64} */
-
-static inline int
HYPERVISOR_callback_op(int cmd, void *arg)
{
return _hypercall2(int, callback_op, cmd, arg);
}
static inline int
-HYPERVISOR_fpu_taskswitch(int set)
-{
- return _hypercall1(int, fpu_taskswitch, set);
-}
-
-static inline int
HYPERVISOR_sched_op(int cmd, void *arg)
{
return _hypercall2(int, sched_op, cmd, arg);
@@ -419,19 +373,6 @@ HYPERVISOR_grant_table_op(unsigned int cmd, void *uop, unsigned int count)
}
static inline int
-HYPERVISOR_update_va_mapping_otherdomain(unsigned long va, pte_t new_val,
- unsigned long flags, domid_t domid)
-{
- if (sizeof(new_val) == sizeof(long))
- return _hypercall4(int, update_va_mapping_otherdomain, va,
- new_val.pte, flags, domid);
- else
- return _hypercall5(int, update_va_mapping_otherdomain, va,
- new_val.pte, new_val.pte >> 32,
- flags, domid);
-}
-
-static inline int
HYPERVISOR_vm_assist(unsigned int cmd, unsigned int type)
{
return _hypercall2(int, vm_assist, cmd, type);
@@ -465,12 +406,6 @@ HYPERVISOR_suspend(unsigned long start_info_mfn)
return _hypercall3(int, sched_op, SCHEDOP_shutdown, &r, start_info_mfn);
}
-static inline int
-HYPERVISOR_nmi_op(unsigned long op, unsigned long arg)
-{
- return _hypercall2(int, nmi_op, op, arg);
-}
-
static inline unsigned long __must_check
HYPERVISOR_hvm_op(int op, void *arg)
{
@@ -529,39 +464,6 @@ MULTI_update_va_mapping(struct multicall_entry *mcl, unsigned long va,
}
static inline void
-MULTI_grant_table_op(struct multicall_entry *mcl, unsigned int cmd,
- void *uop, unsigned int count)
-{
- mcl->op = __HYPERVISOR_grant_table_op;
- mcl->args[0] = cmd;
- mcl->args[1] = (unsigned long)uop;
- mcl->args[2] = count;
-
- trace_xen_mc_entry(mcl, 3);
-}
-
-static inline void
-MULTI_update_va_mapping_otherdomain(struct multicall_entry *mcl, unsigned long va,
- pte_t new_val, unsigned long flags,
- domid_t domid)
-{
- mcl->op = __HYPERVISOR_update_va_mapping_otherdomain;
- mcl->args[0] = va;
- if (sizeof(new_val) == sizeof(long)) {
- mcl->args[1] = new_val.pte;
- mcl->args[2] = flags;
- mcl->args[3] = domid;
- } else {
- mcl->args[1] = new_val.pte;
- mcl->args[2] = new_val.pte >> 32;
- mcl->args[3] = flags;
- mcl->args[4] = domid;
- }
-
- trace_xen_mc_entry(mcl, sizeof(new_val) == sizeof(long) ? 4 : 5);
-}
-
-static inline void
MULTI_update_descriptor(struct multicall_entry *mcl, u64 maddr,
struct desc_struct desc)
{
@@ -582,16 +484,6 @@ MULTI_update_descriptor(struct multicall_entry *mcl, u64 maddr,
}
static inline void
-MULTI_memory_op(struct multicall_entry *mcl, unsigned int cmd, void *arg)
-{
- mcl->op = __HYPERVISOR_memory_op;
- mcl->args[0] = cmd;
- mcl->args[1] = (unsigned long)arg;
-
- trace_xen_mc_entry(mcl, 2);
-}
-
-static inline void
MULTI_mmu_update(struct multicall_entry *mcl, struct mmu_update *req,
int count, int *success_count, domid_t domid)
{
@@ -618,16 +510,6 @@ MULTI_mmuext_op(struct multicall_entry *mcl, struct mmuext_op *op, int count,
}
static inline void
-MULTI_set_gdt(struct multicall_entry *mcl, unsigned long *frames, int entries)
-{
- mcl->op = __HYPERVISOR_set_gdt;
- mcl->args[0] = (unsigned long)frames;
- mcl->args[1] = entries;
-
- trace_xen_mc_entry(mcl, 2);
-}
-
-static inline void
MULTI_stack_switch(struct multicall_entry *mcl,
unsigned long ss, unsigned long esp)
{
diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c
index 0f471bd93417..d9b71924c23c 100644
--- a/arch/x86/kernel/kvm.c
+++ b/arch/x86/kernel/kvm.c
@@ -45,6 +45,7 @@
#include <asm/apic.h>
#include <asm/apicdef.h>
#include <asm/hypervisor.h>
+#include <asm/tlb.h>
static int kvmapf = 1;
@@ -636,8 +637,10 @@ static void __init kvm_guest_init(void)
if (kvm_para_has_feature(KVM_FEATURE_PV_TLB_FLUSH) &&
!kvm_para_has_hint(KVM_HINTS_REALTIME) &&
- kvm_para_has_feature(KVM_FEATURE_STEAL_TIME))
+ kvm_para_has_feature(KVM_FEATURE_STEAL_TIME)) {
pv_mmu_ops.flush_tlb_others = kvm_flush_tlb_others;
+ pv_mmu_ops.tlb_remove_table = tlb_remove_table;
+ }
if (kvm_para_has_feature(KVM_FEATURE_PV_EOI))
apic_set_eoi_write(kvm_guest_apic_eoi_write);
diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c
index 930c88341e4e..afdb303285f8 100644
--- a/arch/x86/kernel/paravirt.c
+++ b/arch/x86/kernel/paravirt.c
@@ -41,6 +41,7 @@
#include <asm/tlbflush.h>
#include <asm/timer.h>
#include <asm/special_insns.h>
+#include <asm/tlb.h>
/*
* nop stub, which must not clobber anything *including the stack* to
@@ -409,6 +410,7 @@ struct pv_mmu_ops pv_mmu_ops __ro_after_init = {
.flush_tlb_kernel = native_flush_tlb_global,
.flush_tlb_one_user = native_flush_tlb_one_user,
.flush_tlb_others = native_flush_tlb_others,
+ .tlb_remove_table = (void (*)(struct mmu_gather *, void *))tlb_remove_page,
.pgd_alloc = __paravirt_pgd_alloc,
.pgd_free = paravirt_nop,
diff --git a/arch/x86/kernel/pci-dma.c b/arch/x86/kernel/pci-dma.c
index acfd04121da3..7ba73fe0d917 100644
--- a/arch/x86/kernel/pci-dma.c
+++ b/arch/x86/kernel/pci-dma.c
@@ -40,8 +40,14 @@ int iommu_detected __read_mostly = 0;
* devices and allow every device to access to whole physical memory. This is
* useful if a user wants to use an IOMMU only for KVM device assignment to
* guests and not for driver dma translation.
+ * It is also possible to disable by default in kernel config, and enable with
+ * iommu=nopt at boot time.
*/
+#ifdef CONFIG_IOMMU_DEFAULT_PASSTHROUGH
+int iommu_pass_through __read_mostly = 1;
+#else
int iommu_pass_through __read_mostly;
+#endif
extern struct iommu_table_entry __iommu_table[], __iommu_table_end[];
@@ -135,6 +141,8 @@ static __init int iommu_setup(char *p)
#endif
if (!strncmp(p, "pt", 2))
iommu_pass_through = 1;
+ if (!strncmp(p, "nopt", 4))
+ iommu_pass_through = 0;
gart_parse_options(p);
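The precedence this hunk establishes can be summarised with a small stand-alone model (the option matching mirrors iommu_setup(); the macro name and the rest of the program are invented for illustration): the config option only picks the initial value of iommu_pass_through, and "pt"/"nopt" on the command line override it in either direction.

#include <stdio.h>
#include <string.h>

/* Models CONFIG_IOMMU_DEFAULT_PASSTHROUGH: compile with
 * -DIOMMU_DEFAULT_PASSTHROUGH to start with passthrough enabled,
 * as the #ifdef in pci-dma.c does. */
#ifdef IOMMU_DEFAULT_PASSTHROUGH
static int iommu_pass_through = 1;
#else
static int iommu_pass_through;
#endif

/* Mirrors the strncmp()-based matching in iommu_setup(). */
static void parse_iommu_option(const char *p)
{
	if (!strncmp(p, "pt", 2))
		iommu_pass_through = 1;
	if (!strncmp(p, "nopt", 4))
		iommu_pass_through = 0;
}

int main(int argc, char **argv)
{
	if (argc > 1)
		parse_iommu_option(argv[1]);	/* e.g. "pt" or "nopt" */
	printf("iommu_pass_through = %d\n", iommu_pass_through);
	return 0;
}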
diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c
index acfab322fbe0..5c32a7665492 100644
--- a/arch/x86/mm/init.c
+++ b/arch/x86/mm/init.c
@@ -99,15 +99,22 @@ __ref void *alloc_low_pages(unsigned int num)
}
if ((pgt_buf_end + num) > pgt_buf_top || !can_use_brk_pgt) {
- unsigned long ret;
- if (min_pfn_mapped >= max_pfn_mapped)
- panic("alloc_low_pages: ran out of memory");
- ret = memblock_find_in_range(min_pfn_mapped << PAGE_SHIFT,
+ unsigned long ret = 0;
+
+ if (min_pfn_mapped < max_pfn_mapped) {
+ ret = memblock_find_in_range(
+ min_pfn_mapped << PAGE_SHIFT,
max_pfn_mapped << PAGE_SHIFT,
PAGE_SIZE * num , PAGE_SIZE);
+ }
+ if (ret)
+ memblock_reserve(ret, PAGE_SIZE * num);
+ else if (can_use_brk_pgt)
+ ret = __pa(extend_brk(PAGE_SIZE * num, PAGE_SIZE));
+
if (!ret)
panic("alloc_low_pages: can not alloc memory");
- memblock_reserve(ret, PAGE_SIZE * num);
+
pfn = ret >> PAGE_SHIFT;
} else {
pfn = pgt_buf_end;
diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c
index 3ef095c70ae3..e848a4811785 100644
--- a/arch/x86/mm/pgtable.c
+++ b/arch/x86/mm/pgtable.c
@@ -63,7 +63,7 @@ void ___pte_free_tlb(struct mmu_gather *tlb, struct page *pte)
{
pgtable_page_dtor(pte);
paravirt_release_pte(page_to_pfn(pte));
- tlb_remove_table(tlb, pte);
+ paravirt_tlb_remove_table(tlb, pte);
}
#if CONFIG_PGTABLE_LEVELS > 2
@@ -79,21 +79,21 @@ void ___pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmd)
tlb->need_flush_all = 1;
#endif
pgtable_pmd_page_dtor(page);
- tlb_remove_table(tlb, page);
+ paravirt_tlb_remove_table(tlb, page);
}
#if CONFIG_PGTABLE_LEVELS > 3
void ___pud_free_tlb(struct mmu_gather *tlb, pud_t *pud)
{
paravirt_release_pud(__pa(pud) >> PAGE_SHIFT);
- tlb_remove_table(tlb, virt_to_page(pud));
+ paravirt_tlb_remove_table(tlb, virt_to_page(pud));
}
#if CONFIG_PGTABLE_LEVELS > 4
void ___p4d_free_tlb(struct mmu_gather *tlb, p4d_t *p4d)
{
paravirt_release_p4d(__pa(p4d) >> PAGE_SHIFT);
- tlb_remove_table(tlb, virt_to_page(p4d));
+ paravirt_tlb_remove_table(tlb, virt_to_page(p4d));
}
#endif /* CONFIG_PGTABLE_LEVELS > 4 */
#endif /* CONFIG_PGTABLE_LEVELS > 3 */
diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c
index 752dbf4e0e50..9517d1b2a281 100644
--- a/arch/x86/mm/tlb.c
+++ b/arch/x86/mm/tlb.c
@@ -7,7 +7,6 @@
#include <linux/export.h>
#include <linux/cpu.h>
#include <linux/debugfs.h>
-#include <linux/gfp.h>
#include <asm/tlbflush.h>
#include <asm/mmu_context.h>
@@ -186,11 +185,8 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
{
struct mm_struct *real_prev = this_cpu_read(cpu_tlbstate.loaded_mm);
u16 prev_asid = this_cpu_read(cpu_tlbstate.loaded_mm_asid);
- bool was_lazy = this_cpu_read(cpu_tlbstate.is_lazy);
unsigned cpu = smp_processor_id();
u64 next_tlb_gen;
- bool need_flush;
- u16 new_asid;
/*
* NB: The scheduler will call us with prev == next when switching
@@ -244,41 +240,20 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
next->context.ctx_id);
/*
- * Even in lazy TLB mode, the CPU should stay set in the
- * mm_cpumask. The TLB shootdown code can figure out from
- * from cpu_tlbstate.is_lazy whether or not to send an IPI.
+ * We don't currently support having a real mm loaded without
+ * our cpu set in mm_cpumask(). We have all the bookkeeping
+ * in place to figure out whether we would need to flush
+ * if our cpu were cleared in mm_cpumask(), but we don't
+ * currently use it.
*/
if (WARN_ON_ONCE(real_prev != &init_mm &&
!cpumask_test_cpu(cpu, mm_cpumask(next))))
cpumask_set_cpu(cpu, mm_cpumask(next));
- /*
- * If the CPU is not in lazy TLB mode, we are just switching
- * from one thread in a process to another thread in the same
- * process. No TLB flush required.
- */
- if (!was_lazy)
- return;
-
- /*
- * Read the tlb_gen to check whether a flush is needed.
- * If the TLB is up to date, just use it.
- * The barrier synchronizes with the tlb_gen increment in
- * the TLB shootdown code.
- */
- smp_mb();
- next_tlb_gen = atomic64_read(&next->context.tlb_gen);
- if (this_cpu_read(cpu_tlbstate.ctxs[prev_asid].tlb_gen) ==
- next_tlb_gen)
- return;
-
- /*
- * TLB contents went out of date while we were in lazy
- * mode. Fall through to the TLB switching code below.
- */
- new_asid = prev_asid;
- need_flush = true;
+ return;
} else {
+ u16 new_asid;
+ bool need_flush;
u64 last_ctx_id = this_cpu_read(cpu_tlbstate.last_ctx_id);
/*
@@ -329,41 +304,41 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
next_tlb_gen = atomic64_read(&next->context.tlb_gen);
choose_new_asid(next, next_tlb_gen, &new_asid, &need_flush);
- }
- if (need_flush) {
- this_cpu_write(cpu_tlbstate.ctxs[new_asid].ctx_id, next->context.ctx_id);
- this_cpu_write(cpu_tlbstate.ctxs[new_asid].tlb_gen, next_tlb_gen);
- load_new_mm_cr3(next->pgd, new_asid, true);
+ if (need_flush) {
+ this_cpu_write(cpu_tlbstate.ctxs[new_asid].ctx_id, next->context.ctx_id);
+ this_cpu_write(cpu_tlbstate.ctxs[new_asid].tlb_gen, next_tlb_gen);
+ load_new_mm_cr3(next->pgd, new_asid, true);
+
+ /*
+ * NB: This gets called via leave_mm() in the idle path
+ * where RCU functions differently. Tracing normally
+ * uses RCU, so we need to use the _rcuidle variant.
+ *
+ * (There is no good reason for this. The idle code should
+ * be rearranged to call this before rcu_idle_enter().)
+ */
+ trace_tlb_flush_rcuidle(TLB_FLUSH_ON_TASK_SWITCH, TLB_FLUSH_ALL);
+ } else {
+ /* The new ASID is already up to date. */
+ load_new_mm_cr3(next->pgd, new_asid, false);
+
+ /* See above wrt _rcuidle. */
+ trace_tlb_flush_rcuidle(TLB_FLUSH_ON_TASK_SWITCH, 0);
+ }
/*
- * NB: This gets called via leave_mm() in the idle path
- * where RCU functions differently. Tracing normally
- * uses RCU, so we need to use the _rcuidle variant.
- *
- * (There is no good reason for this. The idle code should
- * be rearranged to call this before rcu_idle_enter().)
+ * Record last user mm's context id, so we can avoid
+ * flushing branch buffer with IBPB if we switch back
+ * to the same user.
*/
- trace_tlb_flush_rcuidle(TLB_FLUSH_ON_TASK_SWITCH, TLB_FLUSH_ALL);
- } else {
- /* The new ASID is already up to date. */
- load_new_mm_cr3(next->pgd, new_asid, false);
+ if (next != &init_mm)
+ this_cpu_write(cpu_tlbstate.last_ctx_id, next->context.ctx_id);
- /* See above wrt _rcuidle. */
- trace_tlb_flush_rcuidle(TLB_FLUSH_ON_TASK_SWITCH, 0);
+ this_cpu_write(cpu_tlbstate.loaded_mm, next);
+ this_cpu_write(cpu_tlbstate.loaded_mm_asid, new_asid);
}
- /*
- * Record last user mm's context id, so we can avoid
- * flushing branch buffer with IBPB if we switch back
- * to the same user.
- */
- if (next != &init_mm)
- this_cpu_write(cpu_tlbstate.last_ctx_id, next->context.ctx_id);
-
- this_cpu_write(cpu_tlbstate.loaded_mm, next);
- this_cpu_write(cpu_tlbstate.loaded_mm_asid, new_asid);
-
load_mm_cr4(next);
switch_ldt(real_prev, next);
}
@@ -386,7 +361,20 @@ void enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk)
if (this_cpu_read(cpu_tlbstate.loaded_mm) == &init_mm)
return;
- this_cpu_write(cpu_tlbstate.is_lazy, true);
+ if (tlb_defer_switch_to_init_mm()) {
+ /*
+ * There's a significant optimization that may be possible
+ * here. We have accurate enough TLB flush tracking that we
+ * don't need to maintain coherence of TLB per se when we're
+ * lazy. We do, however, need to maintain coherence of
+ * paging-structure caches. We could, in principle, leave our
+ * old mm loaded and only switch to init_mm when
+ * tlb_remove_page() happens.
+ */
+ this_cpu_write(cpu_tlbstate.is_lazy, true);
+ } else {
+ switch_mm(NULL, &init_mm, NULL);
+ }
}
/*
@@ -473,9 +461,6 @@ static void flush_tlb_func_common(const struct flush_tlb_info *f,
* paging-structure cache to avoid speculatively reading
* garbage into our TLB. Since switching to init_mm is barely
* slower than a minimal flush, just switch to init_mm.
- *
- * This should be rare, with native_flush_tlb_others skipping
- * IPIs to lazy TLB mode CPUs.
*/
switch_mm_irqs_off(NULL, &init_mm, NULL);
return;
@@ -582,9 +567,6 @@ static void flush_tlb_func_remote(void *info)
void native_flush_tlb_others(const struct cpumask *cpumask,
const struct flush_tlb_info *info)
{
- cpumask_var_t lazymask;
- unsigned int cpu;
-
count_vm_tlb_event(NR_TLB_REMOTE_FLUSH);
if (info->end == TLB_FLUSH_ALL)
trace_tlb_flush(TLB_REMOTE_SEND_IPI, TLB_FLUSH_ALL);
@@ -608,6 +590,8 @@ void native_flush_tlb_others(const struct cpumask *cpumask,
* that UV should be updated so that smp_call_function_many(),
* etc, are optimal on UV.
*/
+ unsigned int cpu;
+
cpu = smp_processor_id();
cpumask = uv_flush_tlb_others(cpumask, info);
if (cpumask)
@@ -615,29 +599,8 @@ void native_flush_tlb_others(const struct cpumask *cpumask,
(void *)info, 1);
return;
}
-
- /*
- * A temporary cpumask is used in order to skip sending IPIs
- * to CPUs in lazy TLB state, while keeping them in mm_cpumask(mm).
- * If the allocation fails, simply IPI every CPU in mm_cpumask.
- */
- if (!alloc_cpumask_var(&lazymask, GFP_ATOMIC)) {
- smp_call_function_many(cpumask, flush_tlb_func_remote,
- (void *)info, 1);
- return;
- }
-
- cpumask_copy(lazymask, cpumask);
-
- for_each_cpu(cpu, lazymask) {
- if (per_cpu(cpu_tlbstate.is_lazy, cpu))
- cpumask_clear_cpu(cpu, lazymask);
- }
-
- smp_call_function_many(lazymask, flush_tlb_func_remote,
+ smp_call_function_many(cpumask, flush_tlb_func_remote,
(void *)info, 1);
-
- free_cpumask_var(lazymask);
}
/*
@@ -690,68 +653,6 @@ void flush_tlb_mm_range(struct mm_struct *mm, unsigned long start,
put_cpu();
}
-void tlb_flush_remove_tables_local(void *arg)
-{
- struct mm_struct *mm = arg;
-
- if (this_cpu_read(cpu_tlbstate.loaded_mm) == mm &&
- this_cpu_read(cpu_tlbstate.is_lazy)) {
- /*
- * We're in lazy mode. We need to at least flush our
- * paging-structure cache to avoid speculatively reading
- * garbage into our TLB. Since switching to init_mm is barely
- * slower than a minimal flush, just switch to init_mm.
- */
- switch_mm_irqs_off(NULL, &init_mm, NULL);
- }
-}
-
-static void mm_fill_lazy_tlb_cpu_mask(struct mm_struct *mm,
- struct cpumask *lazy_cpus)
-{
- int cpu;
-
- for_each_cpu(cpu, mm_cpumask(mm)) {
- if (!per_cpu(cpu_tlbstate.is_lazy, cpu))
- cpumask_set_cpu(cpu, lazy_cpus);
- }
-}
-
-void tlb_flush_remove_tables(struct mm_struct *mm)
-{
- int cpu = get_cpu();
- cpumask_var_t lazy_cpus;
-
- if (cpumask_any_but(mm_cpumask(mm), cpu) >= nr_cpu_ids) {
- put_cpu();
- return;
- }
-
- if (!zalloc_cpumask_var(&lazy_cpus, GFP_ATOMIC)) {
- /*
- * If the cpumask allocation fails, do a brute force flush
- * on all the CPUs that have this mm loaded.
- */
- smp_call_function_many(mm_cpumask(mm),
- tlb_flush_remove_tables_local, (void *)mm, 1);
- put_cpu();
- return;
- }
-
- /*
- * CPUs with !is_lazy either received a TLB flush IPI while the user
- * pages in this address range were unmapped, or have context switched
- * and reloaded %CR3 since then.
- *
- * Shootdown IPIs at page table freeing time only need to be sent to
- * CPUs that may have out of date TLB contents.
- */
- mm_fill_lazy_tlb_cpu_mask(mm, lazy_cpus);
- smp_call_function_many(lazy_cpus,
- tlb_flush_remove_tables_local, (void *)mm, 1);
- free_cpumask_var(lazy_cpus);
- put_cpu();
-}
static void do_flush_tlb_all(void *info)
{
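The lazy-TLB policy that tlb.c returns to here is compact enough to model in user space (invented names and types; only the decision logic follows the kernel code above): without PCID a CPU entering lazy mode keeps its old mm loaded and is switched to init_mm only if a flush actually reaches it, while with PCID it switches to init_mm immediately because the CR3 write is cheap.

#include <stdbool.h>
#include <stdio.h>

/* Stand-in for static_cpu_has(X86_FEATURE_PCID). */
static bool cpu_has_pcid;

/* Mirrors tlb_defer_switch_to_init_mm(): without PCID a CR3 switch is
 * expensive, so prefer staying lazy and risking an IPI later. */
static bool tlb_defer_switch_to_init_mm(void)
{
	return !cpu_has_pcid;
}

struct cpu_state {
	const char *loaded_mm;
	bool is_lazy;
};

/* Models enter_lazy_tlb(): either mark the CPU lazy or switch eagerly. */
static void enter_lazy_tlb_model(struct cpu_state *cpu)
{
	if (tlb_defer_switch_to_init_mm())
		cpu->is_lazy = true;		/* defer, may get a flush later */
	else
		cpu->loaded_mm = "init_mm";	/* PCID makes this cheap */
}

/* Models the lazy branch of flush_tlb_func_common(): a lazy CPU that
 * receives a flush simply switches to init_mm instead of flushing. */
static void remote_flush_model(struct cpu_state *cpu)
{
	if (cpu->is_lazy) {
		cpu->loaded_mm = "init_mm";
		cpu->is_lazy = false;
	}
}

int main(void)
{
	struct cpu_state cpu = { .loaded_mm = "user_mm" };

	cpu_has_pcid = false;
	enter_lazy_tlb_model(&cpu);
	printf("no PCID, lazy:        loaded_mm=%s is_lazy=%d\n",
	       cpu.loaded_mm, cpu.is_lazy);
	remote_flush_model(&cpu);
	printf("no PCID, after flush: loaded_mm=%s\n", cpu.loaded_mm);

	cpu_has_pcid = true;
	cpu = (struct cpu_state){ .loaded_mm = "user_mm" };
	enter_lazy_tlb_model(&cpu);
	printf("PCID, lazy:           loaded_mm=%s\n", cpu.loaded_mm);
	return 0;
}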
diff --git a/arch/x86/xen/enlighten_pv.c b/arch/x86/xen/enlighten_pv.c
index ee3b00c7acda..52a7c3faee0c 100644
--- a/arch/x86/xen/enlighten_pv.c
+++ b/arch/x86/xen/enlighten_pv.c
@@ -122,6 +122,8 @@ static void __init xen_banner(void)
static void __init xen_pv_init_platform(void)
{
+ populate_extra_pte(fix_to_virt(FIX_PARAVIRT_BOOTMAP));
+
set_fixmap(FIX_PARAVIRT_BOOTMAP, xen_start_info->shared_info);
HYPERVISOR_shared_info = (void *)fix_to_virt(FIX_PARAVIRT_BOOTMAP);
@@ -1170,13 +1172,13 @@ static void __init xen_boot_params_init_edd(void)
* we do this, we have to be careful not to call any stack-protected
* function, which is most of the kernel.
*/
-static void xen_setup_gdt(int cpu)
+static void __init xen_setup_gdt(int cpu)
{
pv_cpu_ops.write_gdt_entry = xen_write_gdt_entry_boot;
pv_cpu_ops.load_gdt = xen_load_gdt_boot;
- setup_stack_canary_segment(0);
- switch_to_new_gdt(0);
+ setup_stack_canary_segment(cpu);
+ switch_to_new_gdt(cpu);
pv_cpu_ops.write_gdt_entry = xen_write_gdt_entry;
pv_cpu_ops.load_gdt = xen_load_gdt;
@@ -1385,8 +1387,11 @@ asmlinkage __visible void __init xen_start_kernel(void)
xen_boot_params_init_edd();
}
- add_preferred_console("tty", 0, NULL);
+ if (!boot_params.screen_info.orig_video_isVGA)
+ add_preferred_console("tty", 0, NULL);
add_preferred_console("hvc", 0, NULL);
+ if (boot_params.screen_info.orig_video_isVGA)
+ add_preferred_console("tty", 0, NULL);
#ifdef CONFIG_PCI
/* PCI BIOS service won't work from a PV guest. */
diff --git a/arch/x86/xen/mmu_pv.c b/arch/x86/xen/mmu_pv.c
index 52206ad81e4b..45b700ac5fe7 100644
--- a/arch/x86/xen/mmu_pv.c
+++ b/arch/x86/xen/mmu_pv.c
@@ -67,6 +67,7 @@
#include <asm/init.h>
#include <asm/pat.h>
#include <asm/smp.h>
+#include <asm/tlb.h>
#include <asm/xen/hypercall.h>
#include <asm/xen/hypervisor.h>
@@ -2171,6 +2172,8 @@ void __init xen_relocate_p2m(void)
#else /* !CONFIG_X86_64 */
static RESERVE_BRK_ARRAY(pmd_t, initial_kernel_pmd, PTRS_PER_PMD);
static RESERVE_BRK_ARRAY(pmd_t, swapper_kernel_pmd, PTRS_PER_PMD);
+RESERVE_BRK(fixup_kernel_pmd, PAGE_SIZE);
+RESERVE_BRK(fixup_kernel_pte, PAGE_SIZE);
static void __init xen_write_cr3_init(unsigned long cr3)
{
@@ -2397,6 +2400,7 @@ static const struct pv_mmu_ops xen_mmu_ops __initconst = {
.flush_tlb_kernel = xen_flush_tlb,
.flush_tlb_one_user = xen_flush_tlb_one_user,
.flush_tlb_others = xen_flush_tlb_others,
+ .tlb_remove_table = tlb_remove_table,
.pgd_alloc = xen_pgd_alloc,
.pgd_free = xen_pgd_free,
diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c
index 6e0d2086eacb..1163e33121fb 100644
--- a/arch/x86/xen/setup.c
+++ b/arch/x86/xen/setup.c
@@ -906,37 +906,6 @@ char * __init xen_memory_setup(void)
}
/*
- * Machine specific memory setup for auto-translated guests.
- */
-char * __init xen_auto_xlated_memory_setup(void)
-{
- struct xen_memory_map memmap;
- int i;
- int rc;
-
- memmap.nr_entries = ARRAY_SIZE(xen_e820_table.entries);
- set_xen_guest_handle(memmap.buffer, xen_e820_table.entries);
-
- rc = HYPERVISOR_memory_op(XENMEM_memory_map, &memmap);
- if (rc < 0)
- panic("No memory map (%d)\n", rc);
-
- xen_e820_table.nr_entries = memmap.nr_entries;
-
- e820__update_table(&xen_e820_table);
-
- for (i = 0; i < xen_e820_table.nr_entries; i++)
- e820__range_add(xen_e820_table.entries[i].addr, xen_e820_table.entries[i].size, xen_e820_table.entries[i].type);
-
- /* Remove p2m info, it is not needed. */
- xen_start_info->mfn_list = 0;
- xen_start_info->first_p2m_pfn = 0;
- xen_start_info->nr_p2m_frames = 0;
-
- return "Xen";
-}
-
-/*
* Set the bit indicating "nosegneg" library variants should be used.
* We only need to bother in pure 32-bit mode; compat 32-bit processes
* can have un-truncated segments, so wrapping around is allowed.
diff --git a/arch/x86/xen/xen-ops.h b/arch/x86/xen/xen-ops.h
index e78684597f57..0e60bd918695 100644
--- a/arch/x86/xen/xen-ops.h
+++ b/arch/x86/xen/xen-ops.h
@@ -50,7 +50,6 @@ void __init xen_inv_extra_mem(void);
void __init xen_remap_memory(void);
phys_addr_t __init xen_find_free_area(phys_addr_t size);
char * __init xen_memory_setup(void);
-char * xen_auto_xlated_memory_setup(void);
void __init xen_arch_setup(void);
void xen_enable_sysenter(void);
void xen_enable_syscall(void);