summaryrefslogtreecommitdiff
path: root/arch/x86/xen/enlighten.c
diff options
context:
space:
mode:
Diffstat (limited to 'arch/x86/xen/enlighten.c')
-rw-r--r--arch/x86/xen/enlighten.c290
1 files changed, 235 insertions, 55 deletions
diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c
index a8f8844b8d32..bf4bda6d3e9a 100644
--- a/arch/x86/xen/enlighten.c
+++ b/arch/x86/xen/enlighten.c
@@ -31,6 +31,7 @@
#include <linux/pci.h>
#include <linux/gfp.h>
#include <linux/memblock.h>
+#include <linux/syscore_ops.h>
#include <xen/xen.h>
#include <xen/interface/xen.h>
@@ -38,10 +39,12 @@
#include <xen/interface/physdev.h>
#include <xen/interface/vcpu.h>
#include <xen/interface/memory.h>
+#include <xen/interface/xen-mca.h>
#include <xen/features.h>
#include <xen/page.h>
#include <xen/hvm.h>
#include <xen/hvc-console.h>
+#include <xen/acpi.h>
#include <asm/paravirt.h>
#include <asm/apic.h>
@@ -63,6 +66,7 @@
#include <asm/stackprotector.h>
#include <asm/hypervisor.h>
#include <asm/mwait.h>
+#include <asm/pci_x86.h>
#ifdef CONFIG_ACPI
#include <linux/acpi.h>
@@ -74,6 +78,7 @@
#include "xen-ops.h"
#include "mmu.h"
+#include "smp.h"
#include "multicalls.h"
EXPORT_SYMBOL_GPL(hypercall_page);
@@ -104,7 +109,7 @@ EXPORT_SYMBOL_GPL(xen_have_vector_callback);
* Point at some empty memory to start with. We map the real shared_info
* page as soon as fixmap is up and running.
*/
-struct shared_info *HYPERVISOR_shared_info = (void *)&xen_dummy_shared_info;
+struct shared_info *HYPERVISOR_shared_info = &xen_dummy_shared_info;
/*
* Flag to determine whether vcpu info placement is available on all
@@ -121,6 +126,19 @@ struct shared_info *HYPERVISOR_shared_info = (void *)&xen_dummy_shared_info;
*/
static int have_vcpu_info_placement = 1;
+struct tls_descs {
+ struct desc_struct desc[3];
+};
+
+/*
+ * Updating the 3 TLS descriptors in the GDT on every task switch is
+ * surprisingly expensive so we avoid updating them if they haven't
+ * changed. Since Xen writes different descriptors than the one
+ * passed in the update_descriptor hypercall we keep shadow copies to
+ * compare against.
+ */
+static DEFINE_PER_CPU(struct tls_descs, shadow_tls_desc);
+
static void clamp_max_cpus(void)
{
#ifdef CONFIG_SMP
@@ -206,6 +224,9 @@ static void __init xen_banner(void)
xen_feature(XENFEAT_mmu_pt_update_preserve_ad) ? " (preserve-AD)" : "");
}
+#define CPUID_THERM_POWER_LEAF 6
+#define APERFMPERF_PRESENT 0
+
static __read_mostly unsigned int cpuid_leaf1_edx_mask = ~0;
static __read_mostly unsigned int cpuid_leaf1_ecx_mask = ~0;
@@ -239,6 +260,11 @@ static void xen_cpuid(unsigned int *ax, unsigned int *bx,
*dx = cpuid_leaf5_edx_val;
return;
+ case CPUID_THERM_POWER_LEAF:
+ /* Disabling APERFMPERF for kernel usage */
+ maskecx = ~(1 << APERFMPERF_PRESENT);
+ break;
+
case 0xb:
/* Suppress extended topology stuff */
maskebx = 0;
@@ -330,9 +356,7 @@ static void __init xen_init_cpuid_mask(void)
unsigned int xsave_mask;
cpuid_leaf1_edx_mask =
- ~((1 << X86_FEATURE_MCE) | /* disable MCE */
- (1 << X86_FEATURE_MCA) | /* disable MCA */
- (1 << X86_FEATURE_MTRR) | /* disable MTRR */
+ ~((1 << X86_FEATURE_MTRR) | /* disable MTRR */
(1 << X86_FEATURE_ACC)); /* thermal monitoring */
if (!xen_initial_domain())
@@ -529,12 +553,28 @@ static void __init xen_load_gdt_boot(const struct desc_ptr *dtr)
BUG();
}
+static inline bool desc_equal(const struct desc_struct *d1,
+ const struct desc_struct *d2)
+{
+ return d1->a == d2->a && d1->b == d2->b;
+}
+
static void load_TLS_descriptor(struct thread_struct *t,
unsigned int cpu, unsigned int i)
{
- struct desc_struct *gdt = get_cpu_gdt_table(cpu);
- xmaddr_t maddr = arbitrary_virt_to_machine(&gdt[GDT_ENTRY_TLS_MIN+i]);
- struct multicall_space mc = __xen_mc_entry(0);
+ struct desc_struct *shadow = &per_cpu(shadow_tls_desc, cpu).desc[i];
+ struct desc_struct *gdt;
+ xmaddr_t maddr;
+ struct multicall_space mc;
+
+ if (desc_equal(shadow, &t->tls_array[i]))
+ return;
+
+ *shadow = t->tls_array[i];
+
+ gdt = get_cpu_gdt_table(cpu);
+ maddr = arbitrary_virt_to_machine(&gdt[GDT_ENTRY_TLS_MIN+i]);
+ mc = __xen_mc_entry(0);
MULTI_update_descriptor(mc.mc, maddr.maddr, t->tls_array[i]);
}
@@ -616,8 +656,8 @@ static int cvt_gate_to_trap(int vector, const gate_desc *val,
/*
* Look for known traps using IST, and substitute them
* appropriately. The debugger ones are the only ones we care
- * about. Xen will handle faults like double_fault and
- * machine_check, so we should never see them. Warn if
+ * about. Xen will handle faults like double_fault,
+ * so we should never see them. Warn if
* there's an unexpected IST-using fault handler.
*/
if (addr == (unsigned long)debug)
@@ -632,7 +672,11 @@ static int cvt_gate_to_trap(int vector, const gate_desc *val,
return 0;
#ifdef CONFIG_X86_MCE
} else if (addr == (unsigned long)machine_check) {
- return 0;
+ /*
+ * when xen hypervisor inject vMCE to guest,
+ * use native mce handler to handle it
+ */
+ ;
#endif
} else {
/* Some other trap using IST? */
@@ -809,9 +853,40 @@ static void xen_io_delay(void)
}
#ifdef CONFIG_X86_LOCAL_APIC
+static unsigned long xen_set_apic_id(unsigned int x)
+{
+ WARN_ON(1);
+ return x;
+}
+static unsigned int xen_get_apic_id(unsigned long x)
+{
+ return ((x)>>24) & 0xFFu;
+}
static u32 xen_apic_read(u32 reg)
{
- return 0;
+ struct xen_platform_op op = {
+ .cmd = XENPF_get_cpuinfo,
+ .interface_version = XENPF_INTERFACE_VERSION,
+ .u.pcpu_info.xen_cpuid = 0,
+ };
+ int ret = 0;
+
+ /* Shouldn't need this as APIC is turned off for PV, and we only
+ * get called on the bootup processor. But just in case. */
+ if (!xen_initial_domain() || smp_processor_id())
+ return 0;
+
+ if (reg == APIC_LVR)
+ return 0x10;
+
+ if (reg != APIC_ID)
+ return 0;
+
+ ret = HYPERVISOR_dom0_op(&op);
+ if (ret)
+ return 0;
+
+ return op.u.pcpu_info.apic_id << 24;
}
static void xen_apic_write(u32 reg, u32 val)
@@ -849,6 +924,16 @@ static void set_xen_basic_apic_ops(void)
apic->icr_write = xen_apic_icr_write;
apic->wait_icr_idle = xen_apic_wait_icr_idle;
apic->safe_wait_icr_idle = xen_safe_apic_wait_icr_idle;
+ apic->set_apic_id = xen_set_apic_id;
+ apic->get_apic_id = xen_get_apic_id;
+
+#ifdef CONFIG_SMP
+ apic->send_IPI_allbutself = xen_send_IPI_allbutself;
+ apic->send_IPI_mask_allbutself = xen_send_IPI_mask_allbutself;
+ apic->send_IPI_mask = xen_send_IPI_mask;
+ apic->send_IPI_all = xen_send_IPI_all;
+ apic->send_IPI_self = xen_send_IPI_self;
+#endif
}
#endif
@@ -1073,6 +1158,7 @@ static const struct pv_cpu_ops xen_cpu_ops __initconst = {
.read_msr = native_read_msr_safe,
.write_msr = xen_write_msr_safe,
+
.read_tsc = native_read_tsc,
.read_pmc = native_read_pmc,
@@ -1306,7 +1392,6 @@ asmlinkage void __init xen_start_kernel(void)
xen_raw_console_write("mapping kernel into physical memory\n");
pgd = xen_setup_kernel_pagetable(pgd, xen_start_info->nr_pages);
- xen_ident_map_ISA();
/* Allocate and initialize top and mid mfn levels for p2m structure */
xen_build_mfn_list_list();
@@ -1362,11 +1447,17 @@ asmlinkage void __init xen_start_kernel(void)
xen_start_info->console.domU.mfn = 0;
xen_start_info->console.domU.evtchn = 0;
+ xen_init_apic();
+
/* Make sure ACS will be enabled */
pci_request_acs();
- }
-
+ xen_acpi_sleep_register();
+ }
+#ifdef CONFIG_PCI
+ /* PCI BIOS service won't work from a PV guest. */
+ pci_probe &= ~PCI_PROBE_BIOS;
+#endif
xen_raw_console_write("about to get started...\n");
xen_setup_runstate_info(0);
@@ -1379,64 +1470,155 @@ asmlinkage void __init xen_start_kernel(void)
#endif
}
-static int init_hvm_pv_info(int *major, int *minor)
-{
- uint32_t eax, ebx, ecx, edx, pages, msr, base;
- u64 pfn;
-
- base = xen_cpuid_base();
- cpuid(base + 1, &eax, &ebx, &ecx, &edx);
-
- *major = eax >> 16;
- *minor = eax & 0xffff;
- printk(KERN_INFO "Xen version %d.%d.\n", *major, *minor);
-
- cpuid(base + 2, &pages, &msr, &ecx, &edx);
-
- pfn = __pa(hypercall_page);
- wrmsr_safe(msr, (u32)pfn, (u32)(pfn >> 32));
-
- xen_setup_features();
-
- pv_info.name = "Xen HVM";
-
- xen_domain_type = XEN_HVM_DOMAIN;
+#ifdef CONFIG_XEN_PVHVM
+/*
+ * The pfn containing the shared_info is located somewhere in RAM. This
+ * will cause trouble if the current kernel is doing a kexec boot into a
+ * new kernel. The new kernel (and its startup code) can not know where
+ * the pfn is, so it can not reserve the page. The hypervisor will
+ * continue to update the pfn, and as a result memory corruption occours
+ * in the new kernel.
+ *
+ * One way to work around this issue is to allocate a page in the
+ * xen-platform pci device's BAR memory range. But pci init is done very
+ * late and the shared_info page is already in use very early to read
+ * the pvclock. So moving the pfn from RAM to MMIO is racy because some
+ * code paths on other vcpus could access the pfn during the small
+ * window when the old pfn is moved to the new pfn. There is even a
+ * small window were the old pfn is not backed by a mfn, and during that
+ * time all reads return -1.
+ *
+ * Because it is not known upfront where the MMIO region is located it
+ * can not be used right from the start in xen_hvm_init_shared_info.
+ *
+ * To minimise trouble the move of the pfn is done shortly before kexec.
+ * This does not eliminate the race because all vcpus are still online
+ * when the syscore_ops will be called. But hopefully there is no work
+ * pending at this point in time. Also the syscore_op is run last which
+ * reduces the risk further.
+ */
- return 0;
-}
+static struct shared_info *xen_hvm_shared_info;
-void __ref xen_hvm_init_shared_info(void)
+static void xen_hvm_connect_shared_info(unsigned long pfn)
{
- int cpu;
struct xen_add_to_physmap xatp;
- static struct shared_info *shared_info_page = 0;
- if (!shared_info_page)
- shared_info_page = (struct shared_info *)
- extend_brk(PAGE_SIZE, PAGE_SIZE);
xatp.domid = DOMID_SELF;
xatp.idx = 0;
xatp.space = XENMAPSPACE_shared_info;
- xatp.gpfn = __pa(shared_info_page) >> PAGE_SHIFT;
+ xatp.gpfn = pfn;
if (HYPERVISOR_memory_op(XENMEM_add_to_physmap, &xatp))
BUG();
- HYPERVISOR_shared_info = (struct shared_info *)shared_info_page;
+}
+static void xen_hvm_set_shared_info(struct shared_info *sip)
+{
+ int cpu;
+
+ HYPERVISOR_shared_info = sip;
/* xen_vcpu is a pointer to the vcpu_info struct in the shared_info
* page, we use it in the event channel upcall and in some pvclock
* related functions. We don't need the vcpu_info placement
* optimizations because we don't use any pv_mmu or pv_irq op on
* HVM.
- * When xen_hvm_init_shared_info is run at boot time only vcpu 0 is
- * online but xen_hvm_init_shared_info is run at resume time too and
+ * When xen_hvm_set_shared_info is run at boot time only vcpu 0 is
+ * online but xen_hvm_set_shared_info is run at resume time too and
* in that case multiple vcpus might be online. */
for_each_online_cpu(cpu) {
per_cpu(xen_vcpu, cpu) = &HYPERVISOR_shared_info->vcpu_info[cpu];
}
}
-#ifdef CONFIG_XEN_PVHVM
+/* Reconnect the shared_info pfn to a mfn */
+void xen_hvm_resume_shared_info(void)
+{
+ xen_hvm_connect_shared_info(__pa(xen_hvm_shared_info) >> PAGE_SHIFT);
+}
+
+#ifdef CONFIG_KEXEC
+static struct shared_info *xen_hvm_shared_info_kexec;
+static unsigned long xen_hvm_shared_info_pfn_kexec;
+
+/* Remember a pfn in MMIO space for kexec reboot */
+void __devinit xen_hvm_prepare_kexec(struct shared_info *sip, unsigned long pfn)
+{
+ xen_hvm_shared_info_kexec = sip;
+ xen_hvm_shared_info_pfn_kexec = pfn;
+}
+
+static void xen_hvm_syscore_shutdown(void)
+{
+ struct xen_memory_reservation reservation = {
+ .domid = DOMID_SELF,
+ .nr_extents = 1,
+ };
+ unsigned long prev_pfn;
+ int rc;
+
+ if (!xen_hvm_shared_info_kexec)
+ return;
+
+ prev_pfn = __pa(xen_hvm_shared_info) >> PAGE_SHIFT;
+ set_xen_guest_handle(reservation.extent_start, &prev_pfn);
+
+ /* Move pfn to MMIO, disconnects previous pfn from mfn */
+ xen_hvm_connect_shared_info(xen_hvm_shared_info_pfn_kexec);
+
+ /* Update pointers, following hypercall is also a memory barrier */
+ xen_hvm_set_shared_info(xen_hvm_shared_info_kexec);
+
+ /* Allocate new mfn for previous pfn */
+ do {
+ rc = HYPERVISOR_memory_op(XENMEM_populate_physmap, &reservation);
+ if (rc == 0)
+ msleep(123);
+ } while (rc == 0);
+
+ /* Make sure the previous pfn is really connected to a (new) mfn */
+ BUG_ON(rc != 1);
+}
+
+static struct syscore_ops xen_hvm_syscore_ops = {
+ .shutdown = xen_hvm_syscore_shutdown,
+};
+#endif
+
+/* Use a pfn in RAM, may move to MMIO before kexec. */
+static void __init xen_hvm_init_shared_info(void)
+{
+ /* Remember pointer for resume */
+ xen_hvm_shared_info = extend_brk(PAGE_SIZE, PAGE_SIZE);
+ xen_hvm_connect_shared_info(__pa(xen_hvm_shared_info) >> PAGE_SHIFT);
+ xen_hvm_set_shared_info(xen_hvm_shared_info);
+}
+
+static void __init init_hvm_pv_info(void)
+{
+ int major, minor;
+ uint32_t eax, ebx, ecx, edx, pages, msr, base;
+ u64 pfn;
+
+ base = xen_cpuid_base();
+ cpuid(base + 1, &eax, &ebx, &ecx, &edx);
+
+ major = eax >> 16;
+ minor = eax & 0xffff;
+ printk(KERN_INFO "Xen version %d.%d.\n", major, minor);
+
+ cpuid(base + 2, &pages, &msr, &ecx, &edx);
+
+ pfn = __pa(hypercall_page);
+ wrmsr_safe(msr, (u32)pfn, (u32)(pfn >> 32));
+
+ xen_setup_features();
+
+ pv_info.name = "Xen HVM";
+
+ xen_domain_type = XEN_HVM_DOMAIN;
+}
+
static int __cpuinit xen_hvm_cpu_notify(struct notifier_block *self,
unsigned long action, void *hcpu)
{
@@ -1459,14 +1641,12 @@ static struct notifier_block xen_hvm_cpu_notifier __cpuinitdata = {
static void __init xen_hvm_guest_init(void)
{
- int r;
- int major, minor;
-
- r = init_hvm_pv_info(&major, &minor);
- if (r < 0)
- return;
+ init_hvm_pv_info();
xen_hvm_init_shared_info();
+#ifdef CONFIG_KEXEC
+ register_syscore_ops(&xen_hvm_syscore_ops);
+#endif
if (xen_feature(XENFEAT_hvm_callback_vector))
xen_have_vector_callback = 1;