summaryrefslogtreecommitdiff
path: root/arch/x86/kernel
diff options
context:
space:
mode:
Diffstat (limited to 'arch/x86/kernel')
-rw-r--r--arch/x86/kernel/Makefile3
-rw-r--r--arch/x86/kernel/acpi/boot.c156
-rw-r--r--arch/x86/kernel/acpi/cstate.c9
-rw-r--r--arch/x86/kernel/acpi/realmode/wakeup.S2
-rw-r--r--arch/x86/kernel/acpi/sleep.c13
-rw-r--r--arch/x86/kernel/acpi/wakeup_32.S2
-rw-r--r--arch/x86/kernel/alternative.c48
-rw-r--r--arch/x86/kernel/amd_iommu.c221
-rw-r--r--arch/x86/kernel/amd_iommu_init.c26
-rw-r--r--arch/x86/kernel/apb_timer.c37
-rw-r--r--arch/x86/kernel/aperture_64.c4
-rw-r--r--arch/x86/kernel/apic/Makefile7
-rw-r--r--arch/x86/kernel/apic/apic.c45
-rw-r--r--arch/x86/kernel/apic/es7000_32.c20
-rw-r--r--arch/x86/kernel/apic/hw_nmi.c107
-rw-r--r--arch/x86/kernel/apic/io_apic.c101
-rw-r--r--arch/x86/kernel/apic/nmi.c7
-rw-r--r--arch/x86/kernel/apic/x2apic_uv_x.c3
-rw-r--r--arch/x86/kernel/apm_32.c6
-rw-r--r--arch/x86/kernel/cpu/Makefile6
-rw-r--r--arch/x86/kernel/cpu/amd.c77
-rw-r--r--arch/x86/kernel/cpu/bugs.c2
-rw-r--r--arch/x86/kernel/cpu/cmpxchg.c72
-rw-r--r--arch/x86/kernel/cpu/common.c68
-rw-r--r--arch/x86/kernel/cpu/cpufreq/Makefile4
-rw-r--r--arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c51
-rw-r--r--arch/x86/kernel/cpu/cpufreq/gx-suspmod.c11
-rw-r--r--arch/x86/kernel/cpu/cpufreq/longhaul.c6
-rw-r--r--arch/x86/kernel/cpu/cpufreq/longhaul.h26
-rw-r--r--arch/x86/kernel/cpu/cpufreq/longrun.c6
-rw-r--r--arch/x86/kernel/cpu/cpufreq/mperf.c51
-rw-r--r--arch/x86/kernel/cpu/cpufreq/mperf.h9
-rw-r--r--arch/x86/kernel/cpu/cpufreq/p4-clockmod.c7
-rw-r--r--arch/x86/kernel/cpu/cpufreq/pcc-cpufreq.c41
-rw-r--r--arch/x86/kernel/cpu/cpufreq/powernow-k7.c8
-rw-r--r--arch/x86/kernel/cpu/cpufreq/powernow-k8.c188
-rw-r--r--arch/x86/kernel/cpu/cpufreq/powernow-k8.h2
-rw-r--r--arch/x86/kernel/cpu/hypervisor.c55
-rw-r--r--arch/x86/kernel/cpu/intel.c8
-rw-r--r--arch/x86/kernel/cpu/intel_cacheinfo.c263
-rw-r--r--arch/x86/kernel/cpu/mcheck/Makefile2
-rw-r--r--arch/x86/kernel/cpu/mcheck/mce-apei.c138
-rw-r--r--arch/x86/kernel/cpu/mcheck/mce-internal.h23
-rw-r--r--arch/x86/kernel/cpu/mcheck/mce.c128
-rw-r--r--arch/x86/kernel/cpu/mcheck/mce_intel.c9
-rw-r--r--arch/x86/kernel/cpu/mcheck/therm_throt.c208
-rw-r--r--arch/x86/kernel/cpu/mshyperv.c56
-rw-r--r--arch/x86/kernel/cpu/mtrr/cleanup.c6
-rw-r--r--arch/x86/kernel/cpu/mtrr/generic.c3
-rw-r--r--arch/x86/kernel/cpu/mtrr/main.c56
-rw-r--r--arch/x86/kernel/cpu/perf_event.c869
-rw-r--r--arch/x86/kernel/cpu/perf_event_amd.c50
-rw-r--r--arch/x86/kernel/cpu/perf_event_intel.c358
-rw-r--r--arch/x86/kernel/cpu/perf_event_intel_ds.c641
-rw-r--r--arch/x86/kernel/cpu/perf_event_intel_lbr.c218
-rw-r--r--arch/x86/kernel/cpu/perf_event_p4.c942
-rw-r--r--arch/x86/kernel/cpu/perf_event_p6.c31
-rw-r--r--arch/x86/kernel/cpu/scattered.c63
-rw-r--r--arch/x86/kernel/cpu/topology.c (renamed from arch/x86/kernel/cpu/addon_cpuid_features.c)56
-rw-r--r--arch/x86/kernel/cpu/vmware.c47
-rw-r--r--arch/x86/kernel/cpuid.c2
-rw-r--r--arch/x86/kernel/ds.c1437
-rw-r--r--arch/x86/kernel/ds_selftest.c408
-rw-r--r--arch/x86/kernel/ds_selftest.h15
-rw-r--r--arch/x86/kernel/dumpstack.c6
-rw-r--r--arch/x86/kernel/dumpstack.h56
-rw-r--r--arch/x86/kernel/dumpstack_32.c2
-rw-r--r--arch/x86/kernel/dumpstack_64.c1
-rw-r--r--arch/x86/kernel/e820.c2
-rw-r--r--arch/x86/kernel/early-quirks.c18
-rw-r--r--arch/x86/kernel/early_printk.c8
-rw-r--r--arch/x86/kernel/entry_32.S33
-rw-r--r--arch/x86/kernel/entry_64.S13
-rw-r--r--arch/x86/kernel/head32.c2
-rw-r--r--arch/x86/kernel/head_32.S6
-rw-r--r--arch/x86/kernel/head_64.S5
-rw-r--r--arch/x86/kernel/hpet.c44
-rw-r--r--arch/x86/kernel/hw_breakpoint.c92
-rw-r--r--arch/x86/kernel/i387.c143
-rw-r--r--arch/x86/kernel/i8253.c14
-rw-r--r--arch/x86/kernel/i8259.c25
-rw-r--r--arch/x86/kernel/init_task.c2
-rw-r--r--arch/x86/kernel/irqinit.c2
-rw-r--r--arch/x86/kernel/kgdb.c297
-rw-r--r--arch/x86/kernel/kprobes.c51
-rw-r--r--arch/x86/kernel/kvmclock.c56
-rw-r--r--arch/x86/kernel/microcode_core.c5
-rw-r--r--arch/x86/kernel/microcode_intel.c22
-rw-r--r--arch/x86/kernel/mpparse.c25
-rw-r--r--arch/x86/kernel/mrst.c117
-rw-r--r--arch/x86/kernel/msr.c2
-rw-r--r--arch/x86/kernel/olpc.c20
-rw-r--r--arch/x86/kernel/olpc_ofw.c106
-rw-r--r--arch/x86/kernel/pci-calgary_64.c17
-rw-r--r--arch/x86/kernel/pci-swiotlb.c2
-rw-r--r--arch/x86/kernel/process.c92
-rw-r--r--arch/x86/kernel/process_32.c14
-rw-r--r--arch/x86/kernel/process_64.c15
-rw-r--r--arch/x86/kernel/ptrace.c384
-rw-r--r--arch/x86/kernel/pvclock.c37
-rw-r--r--arch/x86/kernel/quirks.c3
-rw-r--r--arch/x86/kernel/reboot.c8
-rw-r--r--arch/x86/kernel/setup.c18
-rw-r--r--arch/x86/kernel/setup_percpu.c25
-rw-r--r--arch/x86/kernel/sfi.c4
-rw-r--r--arch/x86/kernel/smpboot.c43
-rw-r--r--arch/x86/kernel/stacktrace.c31
-rw-r--r--arch/x86/kernel/step.c46
-rw-r--r--arch/x86/kernel/syscall_table_32.S3
-rw-r--r--arch/x86/kernel/tboot.c21
-rw-r--r--arch/x86/kernel/tlb_uv.c1280
-rw-r--r--arch/x86/kernel/traps.c215
-rw-r--r--arch/x86/kernel/tsc.c5
-rw-r--r--arch/x86/kernel/uv_irq.c12
-rw-r--r--arch/x86/kernel/verify_cpu_64.S3
-rw-r--r--arch/x86/kernel/vmlinux.lds.S4
-rw-r--r--arch/x86/kernel/vsyscall_64.c17
-rw-r--r--arch/x86/kernel/x8664_ksyms_64.c1
-rw-r--r--arch/x86/kernel/x86_init.c7
-rw-r--r--arch/x86/kernel/xsave.c203
120 files changed, 6334 insertions, 4895 deletions
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
index 4c58352209e0..0925676266bd 100644
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -47,8 +47,6 @@ obj-$(CONFIG_X86_TRAMPOLINE) += trampoline.o
obj-y += process.o
obj-y += i387.o xsave.o
obj-y += ptrace.o
-obj-$(CONFIG_X86_DS) += ds.o
-obj-$(CONFIG_X86_DS_SELFTEST) += ds_selftest.o
obj-$(CONFIG_X86_32) += tls.o
obj-$(CONFIG_IA32_EMULATION) += tls.o
obj-y += step.o
@@ -106,6 +104,7 @@ obj-$(CONFIG_SCx200) += scx200.o
scx200-y += scx200_32.o
obj-$(CONFIG_OLPC) += olpc.o
+obj-$(CONFIG_OLPC_OPENFIRMWARE) += olpc_ofw.o
obj-$(CONFIG_X86_MRST) += mrst.o
microcode-y := microcode_core.o
diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c
index cd40aba6aa95..c05872aa3ce0 100644
--- a/arch/x86/kernel/acpi/boot.c
+++ b/arch/x86/kernel/acpi/boot.c
@@ -63,7 +63,6 @@ EXPORT_SYMBOL(acpi_disabled);
int acpi_noirq; /* skip ACPI IRQ initialization */
int acpi_pci_disabled; /* skip ACPI PCI scan and IRQ initialization */
EXPORT_SYMBOL(acpi_pci_disabled);
-int acpi_ht __initdata = 1; /* enable HT */
int acpi_lapic;
int acpi_ioapic;
@@ -94,6 +93,53 @@ enum acpi_irq_model_id acpi_irq_model = ACPI_IRQ_MODEL_PIC;
/*
+ * ISA irqs by default are the first 16 gsis but can be
+ * any gsi as specified by an interrupt source override.
+ */
+static u32 isa_irq_to_gsi[NR_IRQS_LEGACY] __read_mostly = {
+ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
+};
+
+static unsigned int gsi_to_irq(unsigned int gsi)
+{
+ unsigned int irq = gsi + NR_IRQS_LEGACY;
+ unsigned int i;
+
+ for (i = 0; i < NR_IRQS_LEGACY; i++) {
+ if (isa_irq_to_gsi[i] == gsi) {
+ return i;
+ }
+ }
+
+ /* Provide an identity mapping of gsi == irq
+ * except on truly weird platforms that have
+ * non isa irqs in the first 16 gsis.
+ */
+ if (gsi >= NR_IRQS_LEGACY)
+ irq = gsi;
+ else
+ irq = gsi_top + gsi;
+
+ return irq;
+}
+
+static u32 irq_to_gsi(int irq)
+{
+ unsigned int gsi;
+
+ if (irq < NR_IRQS_LEGACY)
+ gsi = isa_irq_to_gsi[irq];
+ else if (irq < gsi_top)
+ gsi = irq;
+ else if (irq < (gsi_top + NR_IRQS_LEGACY))
+ gsi = irq - gsi_top;
+ else
+ gsi = 0xffffffff;
+
+ return gsi;
+}
+
+/*
* Temporarily use the virtual area starting from FIX_IO_APIC_BASE_END,
* to map the target physical address. The problem is that set_fixmap()
* provides a single page, and it is possible that the page is not
@@ -313,7 +359,7 @@ acpi_parse_ioapic(struct acpi_subtable_header * header, const unsigned long end)
/*
* Parse Interrupt Source Override for the ACPI SCI
*/
-static void __init acpi_sci_ioapic_setup(u32 gsi, u16 polarity, u16 trigger)
+static void __init acpi_sci_ioapic_setup(u8 bus_irq, u16 polarity, u16 trigger, u32 gsi)
{
if (trigger == 0) /* compatible SCI trigger is level */
trigger = 3;
@@ -333,7 +379,7 @@ static void __init acpi_sci_ioapic_setup(u32 gsi, u16 polarity, u16 trigger)
* If GSI is < 16, this will update its flags,
* else it will create a new mp_irqs[] entry.
*/
- mp_override_legacy_irq(gsi, polarity, trigger, gsi);
+ mp_override_legacy_irq(bus_irq, polarity, trigger, gsi);
/*
* stash over-ride to indicate we've been here
@@ -357,9 +403,10 @@ acpi_parse_int_src_ovr(struct acpi_subtable_header * header,
acpi_table_print_madt_entry(header);
if (intsrc->source_irq == acpi_gbl_FADT.sci_interrupt) {
- acpi_sci_ioapic_setup(intsrc->global_irq,
+ acpi_sci_ioapic_setup(intsrc->source_irq,
intsrc->inti_flags & ACPI_MADT_POLARITY_MASK,
- (intsrc->inti_flags & ACPI_MADT_TRIGGER_MASK) >> 2);
+ (intsrc->inti_flags & ACPI_MADT_TRIGGER_MASK) >> 2,
+ intsrc->global_irq);
return 0;
}
@@ -448,7 +495,7 @@ void __init acpi_pic_sci_set_trigger(unsigned int irq, u16 trigger)
int acpi_gsi_to_irq(u32 gsi, unsigned int *irq)
{
- *irq = gsi;
+ *irq = gsi_to_irq(gsi);
#ifdef CONFIG_X86_IO_APIC
if (acpi_irq_model == ACPI_IRQ_MODEL_IOAPIC)
@@ -458,6 +505,14 @@ int acpi_gsi_to_irq(u32 gsi, unsigned int *irq)
return 0;
}
+int acpi_isa_irq_to_gsi(unsigned isa_irq, u32 *gsi)
+{
+ if (isa_irq >= 16)
+ return -1;
+ *gsi = irq_to_gsi(isa_irq);
+ return 0;
+}
+
/*
* success: return IRQ number (>=0)
* failure: return < 0
@@ -482,7 +537,7 @@ int acpi_register_gsi(struct device *dev, u32 gsi, int trigger, int polarity)
plat_gsi = mp_register_gsi(dev, gsi, trigger, polarity);
}
#endif
- irq = plat_gsi;
+ irq = gsi_to_irq(plat_gsi);
return irq;
}
@@ -867,29 +922,6 @@ static int __init acpi_parse_madt_lapic_entries(void)
extern int es7000_plat;
#endif
-int __init acpi_probe_gsi(void)
-{
- int idx;
- int gsi;
- int max_gsi = 0;
-
- if (acpi_disabled)
- return 0;
-
- if (!acpi_ioapic)
- return 0;
-
- max_gsi = 0;
- for (idx = 0; idx < nr_ioapics; idx++) {
- gsi = mp_gsi_routing[idx].gsi_end;
-
- if (gsi > max_gsi)
- max_gsi = gsi;
- }
-
- return max_gsi + 1;
-}
-
static void assign_to_mp_irq(struct mpc_intsrc *m,
struct mpc_intsrc *mp_irq)
{
@@ -947,13 +979,13 @@ void __init mp_override_legacy_irq(u8 bus_irq, u8 polarity, u8 trigger, u32 gsi)
mp_irq.dstirq = pin; /* INTIN# */
save_mp_irq(&mp_irq);
+
+ isa_irq_to_gsi[bus_irq] = gsi;
}
void __init mp_config_acpi_legacy_irqs(void)
{
int i;
- int ioapic;
- unsigned int dstapic;
struct mpc_intsrc mp_irq;
#if defined (CONFIG_MCA) || defined (CONFIG_EISA)
@@ -974,19 +1006,27 @@ void __init mp_config_acpi_legacy_irqs(void)
#endif
/*
- * Locate the IOAPIC that manages the ISA IRQs (0-15).
- */
- ioapic = mp_find_ioapic(0);
- if (ioapic < 0)
- return;
- dstapic = mp_ioapics[ioapic].apicid;
-
- /*
* Use the default configuration for the IRQs 0-15. Unless
* overridden by (MADT) interrupt source override entries.
*/
for (i = 0; i < 16; i++) {
+ int ioapic, pin;
+ unsigned int dstapic;
int idx;
+ u32 gsi;
+
+ /* Locate the gsi that irq i maps to. */
+ if (acpi_isa_irq_to_gsi(i, &gsi))
+ continue;
+
+ /*
+ * Locate the IOAPIC that manages the ISA IRQ.
+ */
+ ioapic = mp_find_ioapic(gsi);
+ if (ioapic < 0)
+ continue;
+ pin = mp_find_ioapic_pin(ioapic, gsi);
+ dstapic = mp_ioapics[ioapic].apicid;
for (idx = 0; idx < mp_irq_entries; idx++) {
struct mpc_intsrc *irq = mp_irqs + idx;
@@ -996,7 +1036,7 @@ void __init mp_config_acpi_legacy_irqs(void)
break;
/* Do we already have a mapping for this IOAPIC pin */
- if (irq->dstapic == dstapic && irq->dstirq == i)
+ if (irq->dstapic == dstapic && irq->dstirq == pin)
break;
}
@@ -1011,7 +1051,7 @@ void __init mp_config_acpi_legacy_irqs(void)
mp_irq.dstapic = dstapic;
mp_irq.irqtype = mp_INT;
mp_irq.srcbusirq = i; /* Identity mapped */
- mp_irq.dstirq = i;
+ mp_irq.dstirq = pin;
save_mp_irq(&mp_irq);
}
@@ -1076,11 +1116,6 @@ int mp_register_gsi(struct device *dev, u32 gsi, int trigger, int polarity)
ioapic_pin = mp_find_ioapic_pin(ioapic, gsi);
-#ifdef CONFIG_X86_32
- if (ioapic_renumber_irq)
- gsi = ioapic_renumber_irq(ioapic, gsi);
-#endif
-
if (ioapic_pin > MP_MAX_IOAPIC_PIN) {
printk(KERN_ERR "Invalid reference to IOAPIC pin "
"%d-%d\n", mp_ioapics[ioapic].apicid,
@@ -1094,7 +1129,7 @@ int mp_register_gsi(struct device *dev, u32 gsi, int trigger, int polarity)
set_io_apic_irq_attr(&irq_attr, ioapic, ioapic_pin,
trigger == ACPI_EDGE_SENSITIVE ? 0 : 1,
polarity == ACPI_ACTIVE_HIGH ? 0 : 1);
- io_apic_set_pci_routing(dev, gsi, &irq_attr);
+ io_apic_set_pci_routing(dev, gsi_to_irq(gsi), &irq_attr);
return gsi;
}
@@ -1154,7 +1189,8 @@ static int __init acpi_parse_madt_ioapic_entries(void)
* pretend we got one so we can set the SCI flags.
*/
if (!acpi_sci_override_gsi)
- acpi_sci_ioapic_setup(acpi_gbl_FADT.sci_interrupt, 0, 0);
+ acpi_sci_ioapic_setup(acpi_gbl_FADT.sci_interrupt, 0, 0,
+ acpi_gbl_FADT.sci_interrupt);
/* Fill in identity legacy mappings where no override */
mp_config_acpi_legacy_irqs();
@@ -1464,9 +1500,8 @@ void __init acpi_boot_table_init(void)
/*
* If acpi_disabled, bail out
- * One exception: acpi=ht continues far enough to enumerate LAPICs
*/
- if (acpi_disabled && !acpi_ht)
+ if (acpi_disabled)
return;
/*
@@ -1497,9 +1532,8 @@ int __init early_acpi_boot_init(void)
{
/*
* If acpi_disabled, bail out
- * One exception: acpi=ht continues far enough to enumerate LAPICs
*/
- if (acpi_disabled && !acpi_ht)
+ if (acpi_disabled)
return 1;
/*
@@ -1517,9 +1551,8 @@ int __init acpi_boot_init(void)
/*
* If acpi_disabled, bail out
- * One exception: acpi=ht continues far enough to enumerate LAPICs
*/
- if (acpi_disabled && !acpi_ht)
+ if (acpi_disabled)
return 1;
acpi_table_parse(ACPI_SIG_BOOT, acpi_parse_sbf);
@@ -1554,21 +1587,12 @@ static int __init parse_acpi(char *arg)
/* acpi=force to over-ride black-list */
else if (strcmp(arg, "force") == 0) {
acpi_force = 1;
- acpi_ht = 1;
acpi_disabled = 0;
}
/* acpi=strict disables out-of-spec workarounds */
else if (strcmp(arg, "strict") == 0) {
acpi_strict = 1;
}
- /* Limit ACPI just to boot-time to enable HT */
- else if (strcmp(arg, "ht") == 0) {
- if (!acpi_force) {
- printk(KERN_WARNING "acpi=ht will be removed in Linux-2.6.35\n");
- disable_acpi();
- }
- acpi_ht = 1;
- }
/* acpi=rsdt use RSDT instead of XSDT */
else if (strcmp(arg, "rsdt") == 0) {
acpi_rsdt_forced = 1;
@@ -1576,6 +1600,10 @@ static int __init parse_acpi(char *arg)
/* "acpi=noirq" disables ACPI interrupt routing */
else if (strcmp(arg, "noirq") == 0) {
acpi_noirq_set();
+ }
+ /* "acpi=copy_dsdt" copys DSDT */
+ else if (strcmp(arg, "copy_dsdt") == 0) {
+ acpi_gbl_copy_dsdt_locally = 1;
} else {
/* Core will printk when we return error. */
return -EINVAL;
diff --git a/arch/x86/kernel/acpi/cstate.c b/arch/x86/kernel/acpi/cstate.c
index 2e837f5080fe..fb7a5f052e2b 100644
--- a/arch/x86/kernel/acpi/cstate.c
+++ b/arch/x86/kernel/acpi/cstate.c
@@ -145,6 +145,15 @@ int acpi_processor_ffh_cstate_probe(unsigned int cpu,
percpu_entry->states[cx->index].eax = cx->address;
percpu_entry->states[cx->index].ecx = MWAIT_ECX_INTERRUPT_BREAK;
}
+
+ /*
+ * For _CST FFH on Intel, if GAS.access_size bit 1 is cleared,
+ * then we should skip checking BM_STS for this C-state.
+ * ref: "Intel Processor Vendor-Specific ACPI Interface Specification"
+ */
+ if ((c->x86_vendor == X86_VENDOR_INTEL) && !(reg->access_size & 0x2))
+ cx->bm_sts_skip = 1;
+
return retval;
}
EXPORT_SYMBOL_GPL(acpi_processor_ffh_cstate_probe);
diff --git a/arch/x86/kernel/acpi/realmode/wakeup.S b/arch/x86/kernel/acpi/realmode/wakeup.S
index 580b4e296010..28595d6df47c 100644
--- a/arch/x86/kernel/acpi/realmode/wakeup.S
+++ b/arch/x86/kernel/acpi/realmode/wakeup.S
@@ -104,7 +104,7 @@ _start:
movl %eax, %ecx
orl %edx, %ecx
jz 1f
- movl $0xc0000080, %ecx
+ movl $MSR_EFER, %ecx
wrmsr
1:
diff --git a/arch/x86/kernel/acpi/sleep.c b/arch/x86/kernel/acpi/sleep.c
index f9961034e557..33cec152070d 100644
--- a/arch/x86/kernel/acpi/sleep.c
+++ b/arch/x86/kernel/acpi/sleep.c
@@ -2,7 +2,7 @@
* sleep.c - x86-specific ACPI sleep support.
*
* Copyright (C) 2001-2003 Patrick Mochel
- * Copyright (C) 2001-2003 Pavel Machek <pavel@suse.cz>
+ * Copyright (C) 2001-2003 Pavel Machek <pavel@ucw.cz>
*/
#include <linux/acpi.h>
@@ -157,13 +157,16 @@ static int __init acpi_sleep_setup(char *str)
#ifdef CONFIG_HIBERNATION
if (strncmp(str, "s4_nohwsig", 10) == 0)
acpi_no_s4_hw_signature();
- if (strncmp(str, "s4_nonvs", 8) == 0)
- acpi_s4_no_nvs();
+ if (strncmp(str, "s4_nonvs", 8) == 0) {
+ pr_warning("ACPI: acpi_sleep=s4_nonvs is deprecated, "
+ "please use acpi_sleep=nonvs instead");
+ acpi_nvs_nosave();
+ }
#endif
+ if (strncmp(str, "nonvs", 5) == 0)
+ acpi_nvs_nosave();
if (strncmp(str, "old_ordering", 12) == 0)
acpi_old_suspend_ordering();
- if (strncmp(str, "sci_force_enable", 16) == 0)
- acpi_set_sci_en_on_resume();
str = strchr(str, ',');
if (str != NULL)
str += strspn(str, ", \t");
diff --git a/arch/x86/kernel/acpi/wakeup_32.S b/arch/x86/kernel/acpi/wakeup_32.S
index 8ded418b0593..13ab720573e3 100644
--- a/arch/x86/kernel/acpi/wakeup_32.S
+++ b/arch/x86/kernel/acpi/wakeup_32.S
@@ -1,4 +1,4 @@
- .section .text.page_aligned
+ .section .text..page_aligned
#include <linux/linkage.h>
#include <asm/segment.h>
#include <asm/page_types.h>
diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c
index 1a160d5d44d0..f65ab8b014c4 100644
--- a/arch/x86/kernel/alternative.c
+++ b/arch/x86/kernel/alternative.c
@@ -194,7 +194,7 @@ static void __init_or_module add_nops(void *insns, unsigned int len)
}
extern struct alt_instr __alt_instructions[], __alt_instructions_end[];
-extern u8 *__smp_locks[], *__smp_locks_end[];
+extern s32 __smp_locks[], __smp_locks_end[];
static void *text_poke_early(void *addr, const void *opcode, size_t len);
/* Replace instructions with better alternatives for this CPU type.
@@ -214,6 +214,7 @@ void __init_or_module apply_alternatives(struct alt_instr *start,
u8 *instr = a->instr;
BUG_ON(a->replacementlen > a->instrlen);
BUG_ON(a->instrlen > sizeof(insnbuf));
+ BUG_ON(a->cpuid >= NCAPINTS*32);
if (!boot_cpu_has(a->cpuid))
continue;
#ifdef CONFIG_X86_64
@@ -235,37 +236,41 @@ void __init_or_module apply_alternatives(struct alt_instr *start,
#ifdef CONFIG_SMP
-static void alternatives_smp_lock(u8 **start, u8 **end, u8 *text, u8 *text_end)
+static void alternatives_smp_lock(const s32 *start, const s32 *end,
+ u8 *text, u8 *text_end)
{
- u8 **ptr;
+ const s32 *poff;
mutex_lock(&text_mutex);
- for (ptr = start; ptr < end; ptr++) {
- if (*ptr < text)
- continue;
- if (*ptr > text_end)
+ for (poff = start; poff < end; poff++) {
+ u8 *ptr = (u8 *)poff + *poff;
+
+ if (!*poff || ptr < text || ptr >= text_end)
continue;
/* turn DS segment override prefix into lock prefix */
- text_poke(*ptr, ((unsigned char []){0xf0}), 1);
+ if (*ptr == 0x3e)
+ text_poke(ptr, ((unsigned char []){0xf0}), 1);
};
mutex_unlock(&text_mutex);
}
-static void alternatives_smp_unlock(u8 **start, u8 **end, u8 *text, u8 *text_end)
+static void alternatives_smp_unlock(const s32 *start, const s32 *end,
+ u8 *text, u8 *text_end)
{
- u8 **ptr;
+ const s32 *poff;
if (noreplace_smp)
return;
mutex_lock(&text_mutex);
- for (ptr = start; ptr < end; ptr++) {
- if (*ptr < text)
- continue;
- if (*ptr > text_end)
+ for (poff = start; poff < end; poff++) {
+ u8 *ptr = (u8 *)poff + *poff;
+
+ if (!*poff || ptr < text || ptr >= text_end)
continue;
/* turn lock prefix into DS segment override prefix */
- text_poke(*ptr, ((unsigned char []){0x3E}), 1);
+ if (*ptr == 0xf0)
+ text_poke(ptr, ((unsigned char []){0x3E}), 1);
};
mutex_unlock(&text_mutex);
}
@@ -276,8 +281,8 @@ struct smp_alt_module {
char *name;
/* ptrs to lock prefixes */
- u8 **locks;
- u8 **locks_end;
+ const s32 *locks;
+ const s32 *locks_end;
/* .text segment, needed to avoid patching init code ;) */
u8 *text;
@@ -398,16 +403,19 @@ void alternatives_smp_switch(int smp)
int alternatives_text_reserved(void *start, void *end)
{
struct smp_alt_module *mod;
- u8 **ptr;
+ const s32 *poff;
u8 *text_start = start;
u8 *text_end = end;
list_for_each_entry(mod, &smp_alt_modules, next) {
if (mod->text > text_end || mod->text_end < text_start)
continue;
- for (ptr = mod->locks; ptr < mod->locks_end; ptr++)
- if (text_start <= *ptr && text_end >= *ptr)
+ for (poff = mod->locks; poff < mod->locks_end; poff++) {
+ const u8 *ptr = (const u8 *)poff + *poff;
+
+ if (text_start <= ptr && text_end > ptr)
return 1;
+ }
}
return 0;
diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c
index f854d89b7edf..fa044e1e30a2 100644
--- a/arch/x86/kernel/amd_iommu.c
+++ b/arch/x86/kernel/amd_iommu.c
@@ -731,18 +731,22 @@ static bool increase_address_space(struct protection_domain *domain,
static u64 *alloc_pte(struct protection_domain *domain,
unsigned long address,
- int end_lvl,
+ unsigned long page_size,
u64 **pte_page,
gfp_t gfp)
{
+ int level, end_lvl;
u64 *pte, *page;
- int level;
+
+ BUG_ON(!is_power_of_2(page_size));
while (address > PM_LEVEL_SIZE(domain->mode))
increase_address_space(domain, gfp);
- level = domain->mode - 1;
- pte = &domain->pt_root[PM_LEVEL_INDEX(level, address)];
+ level = domain->mode - 1;
+ pte = &domain->pt_root[PM_LEVEL_INDEX(level, address)];
+ address = PAGE_SIZE_ALIGN(address, page_size);
+ end_lvl = PAGE_SIZE_LEVEL(page_size);
while (level > end_lvl) {
if (!IOMMU_PTE_PRESENT(*pte)) {
@@ -752,6 +756,10 @@ static u64 *alloc_pte(struct protection_domain *domain,
*pte = PM_LEVEL_PDE(level, virt_to_phys(page));
}
+ /* No level skipping support yet */
+ if (PM_PTE_LEVEL(*pte) != level)
+ return NULL;
+
level -= 1;
pte = IOMMU_PTE_PAGE(*pte);
@@ -769,28 +777,47 @@ static u64 *alloc_pte(struct protection_domain *domain,
* This function checks if there is a PTE for a given dma address. If
* there is one, it returns the pointer to it.
*/
-static u64 *fetch_pte(struct protection_domain *domain,
- unsigned long address, int map_size)
+static u64 *fetch_pte(struct protection_domain *domain, unsigned long address)
{
int level;
u64 *pte;
- level = domain->mode - 1;
- pte = &domain->pt_root[PM_LEVEL_INDEX(level, address)];
+ if (address > PM_LEVEL_SIZE(domain->mode))
+ return NULL;
- while (level > map_size) {
+ level = domain->mode - 1;
+ pte = &domain->pt_root[PM_LEVEL_INDEX(level, address)];
+
+ while (level > 0) {
+
+ /* Not Present */
if (!IOMMU_PTE_PRESENT(*pte))
return NULL;
+ /* Large PTE */
+ if (PM_PTE_LEVEL(*pte) == 0x07) {
+ unsigned long pte_mask, __pte;
+
+ /*
+ * If we have a series of large PTEs, make
+ * sure to return a pointer to the first one.
+ */
+ pte_mask = PTE_PAGE_SIZE(*pte);
+ pte_mask = ~((PAGE_SIZE_PTE_COUNT(pte_mask) << 3) - 1);
+ __pte = ((unsigned long)pte) & pte_mask;
+
+ return (u64 *)__pte;
+ }
+
+ /* No level skipping support yet */
+ if (PM_PTE_LEVEL(*pte) != level)
+ return NULL;
+
level -= 1;
+ /* Walk to the next level */
pte = IOMMU_PTE_PAGE(*pte);
pte = &pte[PM_LEVEL_INDEX(level, address)];
-
- if ((PM_PTE_LEVEL(*pte) == 0) && level != map_size) {
- pte = NULL;
- break;
- }
}
return pte;
@@ -807,44 +834,84 @@ static int iommu_map_page(struct protection_domain *dom,
unsigned long bus_addr,
unsigned long phys_addr,
int prot,
- int map_size)
+ unsigned long page_size)
{
u64 __pte, *pte;
-
- bus_addr = PAGE_ALIGN(bus_addr);
- phys_addr = PAGE_ALIGN(phys_addr);
-
- BUG_ON(!PM_ALIGNED(map_size, bus_addr));
- BUG_ON(!PM_ALIGNED(map_size, phys_addr));
+ int i, count;
if (!(prot & IOMMU_PROT_MASK))
return -EINVAL;
- pte = alloc_pte(dom, bus_addr, map_size, NULL, GFP_KERNEL);
+ bus_addr = PAGE_ALIGN(bus_addr);
+ phys_addr = PAGE_ALIGN(phys_addr);
+ count = PAGE_SIZE_PTE_COUNT(page_size);
+ pte = alloc_pte(dom, bus_addr, page_size, NULL, GFP_KERNEL);
+
+ for (i = 0; i < count; ++i)
+ if (IOMMU_PTE_PRESENT(pte[i]))
+ return -EBUSY;
- if (IOMMU_PTE_PRESENT(*pte))
- return -EBUSY;
+ if (page_size > PAGE_SIZE) {
+ __pte = PAGE_SIZE_PTE(phys_addr, page_size);
+ __pte |= PM_LEVEL_ENC(7) | IOMMU_PTE_P | IOMMU_PTE_FC;
+ } else
+ __pte = phys_addr | IOMMU_PTE_P | IOMMU_PTE_FC;
- __pte = phys_addr | IOMMU_PTE_P;
if (prot & IOMMU_PROT_IR)
__pte |= IOMMU_PTE_IR;
if (prot & IOMMU_PROT_IW)
__pte |= IOMMU_PTE_IW;
- *pte = __pte;
+ for (i = 0; i < count; ++i)
+ pte[i] = __pte;
update_domain(dom);
return 0;
}
-static void iommu_unmap_page(struct protection_domain *dom,
- unsigned long bus_addr, int map_size)
+static unsigned long iommu_unmap_page(struct protection_domain *dom,
+ unsigned long bus_addr,
+ unsigned long page_size)
{
- u64 *pte = fetch_pte(dom, bus_addr, map_size);
+ unsigned long long unmap_size, unmapped;
+ u64 *pte;
+
+ BUG_ON(!is_power_of_2(page_size));
+
+ unmapped = 0;
+
+ while (unmapped < page_size) {
+
+ pte = fetch_pte(dom, bus_addr);
+
+ if (!pte) {
+ /*
+ * No PTE for this address
+ * move forward in 4kb steps
+ */
+ unmap_size = PAGE_SIZE;
+ } else if (PM_PTE_LEVEL(*pte) == 0) {
+ /* 4kb PTE found for this address */
+ unmap_size = PAGE_SIZE;
+ *pte = 0ULL;
+ } else {
+ int count, i;
+
+ /* Large PTE found which maps this address */
+ unmap_size = PTE_PAGE_SIZE(*pte);
+ count = PAGE_SIZE_PTE_COUNT(unmap_size);
+ for (i = 0; i < count; i++)
+ pte[i] = 0ULL;
+ }
- if (pte)
- *pte = 0;
+ bus_addr = (bus_addr & ~(unmap_size - 1)) + unmap_size;
+ unmapped += unmap_size;
+ }
+
+ BUG_ON(!is_power_of_2(unmapped));
+
+ return unmapped;
}
/*
@@ -878,7 +945,7 @@ static int dma_ops_unity_map(struct dma_ops_domain *dma_dom,
for (addr = e->address_start; addr < e->address_end;
addr += PAGE_SIZE) {
ret = iommu_map_page(&dma_dom->domain, addr, addr, e->prot,
- PM_MAP_4k);
+ PAGE_SIZE);
if (ret)
return ret;
/*
@@ -1006,7 +1073,7 @@ static int alloc_new_range(struct dma_ops_domain *dma_dom,
u64 *pte, *pte_page;
for (i = 0; i < num_ptes; ++i) {
- pte = alloc_pte(&dma_dom->domain, address, PM_MAP_4k,
+ pte = alloc_pte(&dma_dom->domain, address, PAGE_SIZE,
&pte_page, gfp);
if (!pte)
goto out_free;
@@ -1042,7 +1109,7 @@ static int alloc_new_range(struct dma_ops_domain *dma_dom,
for (i = dma_dom->aperture[index]->offset;
i < dma_dom->aperture_size;
i += PAGE_SIZE) {
- u64 *pte = fetch_pte(&dma_dom->domain, i, PM_MAP_4k);
+ u64 *pte = fetch_pte(&dma_dom->domain, i);
if (!pte || !IOMMU_PTE_PRESENT(*pte))
continue;
@@ -1420,6 +1487,7 @@ static int __attach_device(struct device *dev,
struct protection_domain *domain)
{
struct iommu_dev_data *dev_data, *alias_data;
+ int ret;
dev_data = get_dev_data(dev);
alias_data = get_dev_data(dev_data->alias);
@@ -1431,13 +1499,14 @@ static int __attach_device(struct device *dev,
spin_lock(&domain->lock);
/* Some sanity checks */
+ ret = -EBUSY;
if (alias_data->domain != NULL &&
alias_data->domain != domain)
- return -EBUSY;
+ goto out_unlock;
if (dev_data->domain != NULL &&
dev_data->domain != domain)
- return -EBUSY;
+ goto out_unlock;
/* Do real assignment */
if (dev_data->alias != dev) {
@@ -1453,10 +1522,14 @@ static int __attach_device(struct device *dev,
atomic_inc(&dev_data->bind);
+ ret = 0;
+
+out_unlock:
+
/* ready */
spin_unlock(&domain->lock);
- return 0;
+ return ret;
}
/*
@@ -1712,7 +1785,7 @@ static u64* dma_ops_get_pte(struct dma_ops_domain *dom,
pte = aperture->pte_pages[APERTURE_PAGE_INDEX(address)];
if (!pte) {
- pte = alloc_pte(&dom->domain, address, PM_MAP_4k, &pte_page,
+ pte = alloc_pte(&dom->domain, address, PAGE_SIZE, &pte_page,
GFP_ATOMIC);
aperture->pte_pages[APERTURE_PAGE_INDEX(address)] = pte_page;
} else
@@ -2257,10 +2330,6 @@ int __init amd_iommu_init_dma_ops(void)
iommu_detected = 1;
swiotlb = 0;
-#ifdef CONFIG_GART_IOMMU
- gart_iommu_aperture_disabled = 1;
- gart_iommu_aperture = 0;
-#endif
/* Make the driver finally visible to the drivers */
dma_ops = &amd_iommu_dma_ops;
@@ -2439,12 +2508,11 @@ static int amd_iommu_attach_device(struct iommu_domain *dom,
return ret;
}
-static int amd_iommu_map_range(struct iommu_domain *dom,
- unsigned long iova, phys_addr_t paddr,
- size_t size, int iommu_prot)
+static int amd_iommu_map(struct iommu_domain *dom, unsigned long iova,
+ phys_addr_t paddr, int gfp_order, int iommu_prot)
{
+ unsigned long page_size = 0x1000UL << gfp_order;
struct protection_domain *domain = dom->priv;
- unsigned long i, npages = iommu_num_pages(paddr, size, PAGE_SIZE);
int prot = 0;
int ret;
@@ -2453,61 +2521,50 @@ static int amd_iommu_map_range(struct iommu_domain *dom,
if (iommu_prot & IOMMU_WRITE)
prot |= IOMMU_PROT_IW;
- iova &= PAGE_MASK;
- paddr &= PAGE_MASK;
-
mutex_lock(&domain->api_lock);
-
- for (i = 0; i < npages; ++i) {
- ret = iommu_map_page(domain, iova, paddr, prot, PM_MAP_4k);
- if (ret)
- return ret;
-
- iova += PAGE_SIZE;
- paddr += PAGE_SIZE;
- }
-
+ ret = iommu_map_page(domain, iova, paddr, prot, page_size);
mutex_unlock(&domain->api_lock);
- return 0;
+ return ret;
}
-static void amd_iommu_unmap_range(struct iommu_domain *dom,
- unsigned long iova, size_t size)
+static int amd_iommu_unmap(struct iommu_domain *dom, unsigned long iova,
+ int gfp_order)
{
-
struct protection_domain *domain = dom->priv;
- unsigned long i, npages = iommu_num_pages(iova, size, PAGE_SIZE);
+ unsigned long page_size, unmap_size;
- iova &= PAGE_MASK;
+ page_size = 0x1000UL << gfp_order;
mutex_lock(&domain->api_lock);
-
- for (i = 0; i < npages; ++i) {
- iommu_unmap_page(domain, iova, PM_MAP_4k);
- iova += PAGE_SIZE;
- }
+ unmap_size = iommu_unmap_page(domain, iova, page_size);
+ mutex_unlock(&domain->api_lock);
iommu_flush_tlb_pde(domain);
- mutex_unlock(&domain->api_lock);
+ return get_order(unmap_size);
}
static phys_addr_t amd_iommu_iova_to_phys(struct iommu_domain *dom,
unsigned long iova)
{
struct protection_domain *domain = dom->priv;
- unsigned long offset = iova & ~PAGE_MASK;
+ unsigned long offset_mask;
phys_addr_t paddr;
- u64 *pte;
+ u64 *pte, __pte;
- pte = fetch_pte(domain, iova, PM_MAP_4k);
+ pte = fetch_pte(domain, iova);
if (!pte || !IOMMU_PTE_PRESENT(*pte))
return 0;
- paddr = *pte & IOMMU_PAGE_MASK;
- paddr |= offset;
+ if (PM_PTE_LEVEL(*pte) == 0)
+ offset_mask = PAGE_SIZE - 1;
+ else
+ offset_mask = PTE_PAGE_SIZE(*pte) - 1;
+
+ __pte = *pte & PM_ADDR_MASK;
+ paddr = (__pte & ~offset_mask) | (iova & offset_mask);
return paddr;
}
@@ -2515,6 +2572,11 @@ static phys_addr_t amd_iommu_iova_to_phys(struct iommu_domain *dom,
static int amd_iommu_domain_has_cap(struct iommu_domain *domain,
unsigned long cap)
{
+ switch (cap) {
+ case IOMMU_CAP_CACHE_COHERENCY:
+ return 1;
+ }
+
return 0;
}
@@ -2523,8 +2585,8 @@ static struct iommu_ops amd_iommu_ops = {
.domain_destroy = amd_iommu_domain_destroy,
.attach_dev = amd_iommu_attach_device,
.detach_dev = amd_iommu_detach_device,
- .map = amd_iommu_map_range,
- .unmap = amd_iommu_unmap_range,
+ .map = amd_iommu_map,
+ .unmap = amd_iommu_unmap,
.iova_to_phys = amd_iommu_iova_to_phys,
.domain_has_cap = amd_iommu_domain_has_cap,
};
@@ -2552,8 +2614,7 @@ int __init amd_iommu_init_passthrough(void)
pt_domain->mode |= PAGE_MODE_NONE;
- while ((dev = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, dev)) != NULL) {
-
+ for_each_pci_dev(dev) {
if (!check_device(&dev->dev))
continue;
diff --git a/arch/x86/kernel/amd_iommu_init.c b/arch/x86/kernel/amd_iommu_init.c
index 6360abf993d4..3cc63e2b8dd4 100644
--- a/arch/x86/kernel/amd_iommu_init.c
+++ b/arch/x86/kernel/amd_iommu_init.c
@@ -120,6 +120,7 @@ struct ivmd_header {
bool amd_iommu_dump;
static int __initdata amd_iommu_detected;
+static bool __initdata amd_iommu_disabled;
u16 amd_iommu_last_bdf; /* largest PCI device id we have
to handle */
@@ -286,8 +287,12 @@ static u8 * __init iommu_map_mmio_space(u64 address)
{
u8 *ret;
- if (!request_mem_region(address, MMIO_REGION_LENGTH, "amd_iommu"))
+ if (!request_mem_region(address, MMIO_REGION_LENGTH, "amd_iommu")) {
+ pr_err("AMD-Vi: Can not reserve memory region %llx for mmio\n",
+ address);
+ pr_err("AMD-Vi: This is a BIOS bug. Please contact your hardware vendor\n");
return NULL;
+ }
ret = ioremap_nocache(address, MMIO_REGION_LENGTH);
if (ret != NULL)
@@ -1313,7 +1318,7 @@ static int __init amd_iommu_init(void)
ret = amd_iommu_init_dma_ops();
if (ret)
- goto free;
+ goto free_disable;
amd_iommu_init_api();
@@ -1331,9 +1336,10 @@ static int __init amd_iommu_init(void)
out:
return ret;
-free:
+free_disable:
disable_iommus();
+free:
amd_iommu_uninit_devices();
free_pages((unsigned long)amd_iommu_pd_alloc_bitmap,
@@ -1352,6 +1358,15 @@ free:
free_unity_maps();
+#ifdef CONFIG_GART_IOMMU
+ /*
+ * We failed to initialize the AMD IOMMU - try fallback to GART
+ * if possible.
+ */
+ gart_iommu_init();
+
+#endif
+
goto out;
}
@@ -1372,6 +1387,9 @@ void __init amd_iommu_detect(void)
if (no_iommu || (iommu_detected && !gart_iommu_aperture))
return;
+ if (amd_iommu_disabled)
+ return;
+
if (acpi_table_parse("IVRS", early_amd_iommu_detect) == 0) {
iommu_detected = 1;
amd_iommu_detected = 1;
@@ -1401,6 +1419,8 @@ static int __init parse_amd_iommu_options(char *str)
for (; *str; ++str) {
if (strncmp(str, "fullflush", 9) == 0)
amd_iommu_unmap_flush = true;
+ if (strncmp(str, "off", 3) == 0)
+ amd_iommu_disabled = true;
}
return 1;
diff --git a/arch/x86/kernel/apb_timer.c b/arch/x86/kernel/apb_timer.c
index a35347501d36..8dd77800ff5d 100644
--- a/arch/x86/kernel/apb_timer.c
+++ b/arch/x86/kernel/apb_timer.c
@@ -43,10 +43,11 @@
#include <asm/fixmap.h>
#include <asm/apb_timer.h>
+#include <asm/mrst.h>
#define APBT_MASK CLOCKSOURCE_MASK(32)
#define APBT_SHIFT 22
-#define APBT_CLOCKEVENT_RATING 150
+#define APBT_CLOCKEVENT_RATING 110
#define APBT_CLOCKSOURCE_RATING 250
#define APBT_MIN_DELTA_USEC 200
@@ -83,8 +84,6 @@ struct apbt_dev {
char name[10];
};
-int disable_apbt_percpu __cpuinitdata;
-
static DEFINE_PER_CPU(struct apbt_dev, cpu_apbt_dev);
#ifdef CONFIG_SMP
@@ -195,29 +194,6 @@ static struct clock_event_device apbt_clockevent = {
};
/*
- * if user does not want to use per CPU apb timer, just give it a lower rating
- * than local apic timer and skip the late per cpu timer init.
- */
-static inline int __init setup_x86_mrst_timer(char *arg)
-{
- if (!arg)
- return -EINVAL;
-
- if (strcmp("apbt_only", arg) == 0)
- disable_apbt_percpu = 0;
- else if (strcmp("lapic_and_apbt", arg) == 0)
- disable_apbt_percpu = 1;
- else {
- pr_warning("X86 MRST timer option %s not recognised"
- " use x86_mrst_timer=apbt_only or lapic_and_apbt\n",
- arg);
- return -EINVAL;
- }
- return 0;
-}
-__setup("x86_mrst_timer=", setup_x86_mrst_timer);
-
-/*
* start count down from 0xffff_ffff. this is done by toggling the enable bit
* then load initial load count to ~0.
*/
@@ -335,7 +311,7 @@ static int __init apbt_clockevent_register(void)
adev->num = smp_processor_id();
memcpy(&adev->evt, &apbt_clockevent, sizeof(struct clock_event_device));
- if (disable_apbt_percpu) {
+ if (mrst_timer_options == MRST_TIMER_LAPIC_APBT) {
apbt_clockevent.rating = APBT_CLOCKEVENT_RATING - 100;
global_clock_event = &adev->evt;
printk(KERN_DEBUG "%s clockevent registered as global\n",
@@ -429,7 +405,8 @@ static int apbt_cpuhp_notify(struct notifier_block *n,
static __init int apbt_late_init(void)
{
- if (disable_apbt_percpu || !apb_timer_block_enabled)
+ if (mrst_timer_options == MRST_TIMER_LAPIC_APBT ||
+ !apb_timer_block_enabled)
return 0;
/* This notifier should be called after workqueue is ready */
hotcpu_notifier(apbt_cpuhp_notify, -20);
@@ -450,6 +427,8 @@ static void apbt_set_mode(enum clock_event_mode mode,
int timer_num;
struct apbt_dev *adev = EVT_TO_APBT_DEV(evt);
+ BUG_ON(!apbt_virt_address);
+
timer_num = adev->num;
pr_debug("%s CPU %d timer %d mode=%d\n",
__func__, first_cpu(*evt->cpumask), timer_num, mode);
@@ -676,7 +655,7 @@ void __init apbt_time_init(void)
}
#ifdef CONFIG_SMP
/* kernel cmdline disable apb timer, so we will use lapic timers */
- if (disable_apbt_percpu) {
+ if (mrst_timer_options == MRST_TIMER_LAPIC_APBT) {
printk(KERN_INFO "apbt: disabled per cpu timer\n");
return;
}
diff --git a/arch/x86/kernel/aperture_64.c b/arch/x86/kernel/aperture_64.c
index b5d8b0bcf235..a2e0caf26e17 100644
--- a/arch/x86/kernel/aperture_64.c
+++ b/arch/x86/kernel/aperture_64.c
@@ -280,7 +280,7 @@ void __init early_gart_iommu_check(void)
* or BIOS forget to put that in reserved.
* try to update e820 to make that region as reserved.
*/
- u32 agp_aper_base = 0, agp_aper_order = 0;
+ u32 agp_aper_order = 0;
int i, fix, slot, valid_agp = 0;
u32 ctl;
u32 aper_size = 0, aper_order = 0, last_aper_order = 0;
@@ -291,7 +291,7 @@ void __init early_gart_iommu_check(void)
return;
/* This is mostly duplicate of iommu_hole_init */
- agp_aper_base = search_agp_bridge(&agp_aper_order, &valid_agp);
+ search_agp_bridge(&agp_aper_order, &valid_agp);
fix = 0;
for (i = 0; i < ARRAY_SIZE(bus_dev_ranges); i++) {
diff --git a/arch/x86/kernel/apic/Makefile b/arch/x86/kernel/apic/Makefile
index 565c1bfc507d..910f20b457c4 100644
--- a/arch/x86/kernel/apic/Makefile
+++ b/arch/x86/kernel/apic/Makefile
@@ -2,7 +2,12 @@
# Makefile for local APIC drivers and for the IO-APIC code
#
-obj-$(CONFIG_X86_LOCAL_APIC) += apic.o apic_noop.o probe_$(BITS).o ipi.o nmi.o
+obj-$(CONFIG_X86_LOCAL_APIC) += apic.o apic_noop.o probe_$(BITS).o ipi.o
+ifneq ($(CONFIG_HARDLOCKUP_DETECTOR),y)
+obj-$(CONFIG_X86_LOCAL_APIC) += nmi.o
+endif
+obj-$(CONFIG_HARDLOCKUP_DETECTOR) += hw_nmi.o
+
obj-$(CONFIG_X86_IO_APIC) += io_apic.o
obj-$(CONFIG_SMP) += ipi.o
diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c
index e5a4a1e01618..980508c79082 100644
--- a/arch/x86/kernel/apic/apic.c
+++ b/arch/x86/kernel/apic/apic.c
@@ -51,6 +51,7 @@
#include <asm/smp.h>
#include <asm/mce.h>
#include <asm/kvm_para.h>
+#include <asm/tsc.h>
unsigned int num_processors;
@@ -459,7 +460,7 @@ static void lapic_timer_broadcast(const struct cpumask *mask)
}
/*
- * Setup the local APIC timer for this CPU. Copy the initilized values
+ * Setup the local APIC timer for this CPU. Copy the initialized values
* of the boot CPU and register the clock event in the framework.
*/
static void __cpuinit setup_APIC_timer(void)
@@ -920,7 +921,7 @@ void disable_local_APIC(void)
unsigned int value;
/* APIC hasn't been mapped yet */
- if (!apic_phys)
+ if (!x2apic_mode && !apic_phys)
return;
clear_local_APIC();
@@ -1151,8 +1152,13 @@ static void __cpuinit lapic_setup_esr(void)
*/
void __cpuinit setup_local_APIC(void)
{
- unsigned int value;
- int i, j;
+ unsigned int value, queued;
+ int i, j, acked = 0;
+ unsigned long long tsc = 0, ntsc;
+ long long max_loops = cpu_khz;
+
+ if (cpu_has_tsc)
+ rdtscll(tsc);
if (disable_apic) {
arch_disable_smp_support();
@@ -1204,13 +1210,32 @@ void __cpuinit setup_local_APIC(void)
* the interrupt. Hence a vector might get locked. It was noticed
* for timer irq (vector 0x31). Issue an extra EOI to clear ISR.
*/
- for (i = APIC_ISR_NR - 1; i >= 0; i--) {
- value = apic_read(APIC_ISR + i*0x10);
- for (j = 31; j >= 0; j--) {
- if (value & (1<<j))
- ack_APIC_irq();
+ do {
+ queued = 0;
+ for (i = APIC_ISR_NR - 1; i >= 0; i--)
+ queued |= apic_read(APIC_IRR + i*0x10);
+
+ for (i = APIC_ISR_NR - 1; i >= 0; i--) {
+ value = apic_read(APIC_ISR + i*0x10);
+ for (j = 31; j >= 0; j--) {
+ if (value & (1<<j)) {
+ ack_APIC_irq();
+ acked++;
+ }
+ }
}
- }
+ if (acked > 256) {
+ printk(KERN_ERR "LAPIC pending interrupts after %d EOI\n",
+ acked);
+ break;
+ }
+ if (cpu_has_tsc) {
+ rdtscll(ntsc);
+ max_loops = (cpu_khz << 10) - (ntsc - tsc);
+ } else
+ max_loops--;
+ } while (queued && max_loops > 0);
+ WARN_ON(max_loops <= 0);
/*
* Now that we are all set up, enable the APIC
diff --git a/arch/x86/kernel/apic/es7000_32.c b/arch/x86/kernel/apic/es7000_32.c
index 03ba1b895f5e..8593582d8022 100644
--- a/arch/x86/kernel/apic/es7000_32.c
+++ b/arch/x86/kernel/apic/es7000_32.c
@@ -129,25 +129,6 @@ int es7000_plat;
* GSI override for ES7000 platforms.
*/
-static unsigned int base;
-
-static int
-es7000_rename_gsi(int ioapic, int gsi)
-{
- if (es7000_plat == ES7000_ZORRO)
- return gsi;
-
- if (!base) {
- int i;
- for (i = 0; i < nr_ioapics; i++)
- base += nr_ioapic_registers[i];
- }
-
- if (!ioapic && (gsi < 16))
- gsi += base;
-
- return gsi;
-}
static int __cpuinit wakeup_secondary_cpu_via_mip(int cpu, unsigned long eip)
{
@@ -190,7 +171,6 @@ static void setup_unisys(void)
es7000_plat = ES7000_ZORRO;
else
es7000_plat = ES7000_CLASSIC;
- ioapic_renumber_irq = es7000_rename_gsi;
}
/*
diff --git a/arch/x86/kernel/apic/hw_nmi.c b/arch/x86/kernel/apic/hw_nmi.c
new file mode 100644
index 000000000000..cefd6942f0e9
--- /dev/null
+++ b/arch/x86/kernel/apic/hw_nmi.c
@@ -0,0 +1,107 @@
+/*
+ * HW NMI watchdog support
+ *
+ * started by Don Zickus, Copyright (C) 2010 Red Hat, Inc.
+ *
+ * Arch specific calls to support NMI watchdog
+ *
+ * Bits copied from original nmi.c file
+ *
+ */
+#include <asm/apic.h>
+
+#include <linux/cpumask.h>
+#include <linux/kdebug.h>
+#include <linux/notifier.h>
+#include <linux/kprobes.h>
+#include <linux/nmi.h>
+#include <linux/module.h>
+
+/* For reliability, we're prepared to waste bits here. */
+static DECLARE_BITMAP(backtrace_mask, NR_CPUS) __read_mostly;
+
+u64 hw_nmi_get_sample_period(void)
+{
+ return (u64)(cpu_khz) * 1000 * 60;
+}
+
+#ifdef ARCH_HAS_NMI_WATCHDOG
+void arch_trigger_all_cpu_backtrace(void)
+{
+ int i;
+
+ cpumask_copy(to_cpumask(backtrace_mask), cpu_online_mask);
+
+ printk(KERN_INFO "sending NMI to all CPUs:\n");
+ apic->send_IPI_all(NMI_VECTOR);
+
+ /* Wait for up to 10 seconds for all CPUs to do the backtrace */
+ for (i = 0; i < 10 * 1000; i++) {
+ if (cpumask_empty(to_cpumask(backtrace_mask)))
+ break;
+ mdelay(1);
+ }
+}
+
+static int __kprobes
+arch_trigger_all_cpu_backtrace_handler(struct notifier_block *self,
+ unsigned long cmd, void *__args)
+{
+ struct die_args *args = __args;
+ struct pt_regs *regs;
+ int cpu = smp_processor_id();
+
+ switch (cmd) {
+ case DIE_NMI:
+ case DIE_NMI_IPI:
+ break;
+
+ default:
+ return NOTIFY_DONE;
+ }
+
+ regs = args->regs;
+
+ if (cpumask_test_cpu(cpu, to_cpumask(backtrace_mask))) {
+ static arch_spinlock_t lock = __ARCH_SPIN_LOCK_UNLOCKED;
+
+ arch_spin_lock(&lock);
+ printk(KERN_WARNING "NMI backtrace for cpu %d\n", cpu);
+ show_regs(regs);
+ dump_stack();
+ arch_spin_unlock(&lock);
+ cpumask_clear_cpu(cpu, to_cpumask(backtrace_mask));
+ return NOTIFY_STOP;
+ }
+
+ return NOTIFY_DONE;
+}
+
+static __read_mostly struct notifier_block backtrace_notifier = {
+ .notifier_call = arch_trigger_all_cpu_backtrace_handler,
+ .next = NULL,
+ .priority = 1
+};
+
+static int __init register_trigger_all_cpu_backtrace(void)
+{
+ register_die_notifier(&backtrace_notifier);
+ return 0;
+}
+early_initcall(register_trigger_all_cpu_backtrace);
+#endif
+
+/* STUB calls to mimic old nmi_watchdog behaviour */
+#if defined(CONFIG_X86_LOCAL_APIC)
+unsigned int nmi_watchdog = NMI_NONE;
+EXPORT_SYMBOL(nmi_watchdog);
+void acpi_nmi_enable(void) { return; }
+void acpi_nmi_disable(void) { return; }
+#endif
+atomic_t nmi_active = ATOMIC_INIT(0); /* oprofile uses this */
+EXPORT_SYMBOL(nmi_active);
+int unknown_nmi_panic;
+void cpu_nmi_set_wd_enabled(void) { return; }
+void stop_apic_nmi_watchdog(void *unused) { return; }
+void setup_apic_nmi_watchdog(void *unused) { return; }
+int __init check_nmi_watchdog(void) { return 0; }
diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c
index eb2789c3f721..4dc0084ec1b1 100644
--- a/arch/x86/kernel/apic/io_apic.c
+++ b/arch/x86/kernel/apic/io_apic.c
@@ -89,6 +89,9 @@ int nr_ioapics;
/* IO APIC gsi routing info */
struct mp_ioapic_gsi mp_gsi_routing[MAX_IO_APICS];
+/* The one past the highest gsi number used */
+u32 gsi_top;
+
/* MP IRQ source entries */
struct mpc_intsrc mp_irqs[MAX_IRQ_SOURCES];
@@ -1013,10 +1016,9 @@ static inline int irq_trigger(int idx)
return MPBIOS_trigger(idx);
}
-int (*ioapic_renumber_irq)(int ioapic, int irq);
static int pin_2_irq(int idx, int apic, int pin)
{
- int irq, i;
+ int irq;
int bus = mp_irqs[idx].srcbus;
/*
@@ -1028,18 +1030,12 @@ static int pin_2_irq(int idx, int apic, int pin)
if (test_bit(bus, mp_bus_not_pci)) {
irq = mp_irqs[idx].srcbusirq;
} else {
- /*
- * PCI IRQs are mapped in order
- */
- i = irq = 0;
- while (i < apic)
- irq += nr_ioapic_registers[i++];
- irq += pin;
- /*
- * For MPS mode, so far only needed by ES7000 platform
- */
- if (ioapic_renumber_irq)
- irq = ioapic_renumber_irq(apic, irq);
+ u32 gsi = mp_gsi_routing[apic].gsi_base + pin;
+
+ if (gsi >= NR_IRQS_LEGACY)
+ irq = gsi;
+ else
+ irq = gsi_top + gsi;
}
#ifdef CONFIG_X86_32
@@ -1950,20 +1946,8 @@ static struct { int pin, apic; } ioapic_i8259 = { -1, -1 };
void __init enable_IO_APIC(void)
{
- union IO_APIC_reg_01 reg_01;
int i8259_apic, i8259_pin;
int apic;
- unsigned long flags;
-
- /*
- * The number of IO-APIC IRQ registers (== #pins):
- */
- for (apic = 0; apic < nr_ioapics; apic++) {
- raw_spin_lock_irqsave(&ioapic_lock, flags);
- reg_01.raw = io_apic_read(apic, 1);
- raw_spin_unlock_irqrestore(&ioapic_lock, flags);
- nr_ioapic_registers[apic] = reg_01.bits.entries+1;
- }
if (!legacy_pic->nr_legacy_irqs)
return;
@@ -3413,7 +3397,7 @@ static int set_msi_irq_affinity(unsigned int irq, const struct cpumask *mask)
cfg = desc->chip_data;
- read_msi_msg_desc(desc, &msg);
+ get_cached_msi_msg_desc(desc, &msg);
msg.data &= ~MSI_DATA_VECTOR_MASK;
msg.data |= MSI_DATA_VECTOR(cfg->vector);
@@ -3858,27 +3842,20 @@ int __init io_apic_get_redir_entries (int ioapic)
reg_01.raw = io_apic_read(ioapic, 1);
raw_spin_unlock_irqrestore(&ioapic_lock, flags);
- return reg_01.bits.entries;
+ /* The register returns the maximum index redir index
+ * supported, which is one less than the total number of redir
+ * entries.
+ */
+ return reg_01.bits.entries + 1;
}
void __init probe_nr_irqs_gsi(void)
{
- int nr = 0;
+ int nr;
- nr = acpi_probe_gsi();
- if (nr > nr_irqs_gsi) {
+ nr = gsi_top + NR_IRQS_LEGACY;
+ if (nr > nr_irqs_gsi)
nr_irqs_gsi = nr;
- } else {
- /* for acpi=off or acpi is not compiled in */
- int idx;
-
- nr = 0;
- for (idx = 0; idx < nr_ioapics; idx++)
- nr += io_apic_get_redir_entries(idx) + 1;
-
- if (nr > nr_irqs_gsi)
- nr_irqs_gsi = nr;
- }
printk(KERN_DEBUG "nr_irqs_gsi: %d\n", nr_irqs_gsi);
}
@@ -4085,22 +4062,27 @@ int __init io_apic_get_version(int ioapic)
return reg_01.bits.version;
}
-int acpi_get_override_irq(int bus_irq, int *trigger, int *polarity)
+int acpi_get_override_irq(u32 gsi, int *trigger, int *polarity)
{
- int i;
+ int ioapic, pin, idx;
if (skip_ioapic_setup)
return -1;
- for (i = 0; i < mp_irq_entries; i++)
- if (mp_irqs[i].irqtype == mp_INT &&
- mp_irqs[i].srcbusirq == bus_irq)
- break;
- if (i >= mp_irq_entries)
+ ioapic = mp_find_ioapic(gsi);
+ if (ioapic < 0)
return -1;
- *trigger = irq_trigger(i);
- *polarity = irq_polarity(i);
+ pin = mp_find_ioapic_pin(ioapic, gsi);
+ if (pin < 0)
+ return -1;
+
+ idx = find_irq_entry(ioapic, pin, mp_INT);
+ if (idx < 0)
+ return -1;
+
+ *trigger = irq_trigger(idx);
+ *polarity = irq_polarity(idx);
return 0;
}
@@ -4241,7 +4223,7 @@ void __init ioapic_insert_resources(void)
}
}
-int mp_find_ioapic(int gsi)
+int mp_find_ioapic(u32 gsi)
{
int i = 0;
@@ -4256,7 +4238,7 @@ int mp_find_ioapic(int gsi)
return -1;
}
-int mp_find_ioapic_pin(int ioapic, int gsi)
+int mp_find_ioapic_pin(int ioapic, u32 gsi)
{
if (WARN_ON(ioapic == -1))
return -1;
@@ -4284,6 +4266,7 @@ static int bad_ioapic(unsigned long address)
void __init mp_register_ioapic(int id, u32 address, u32 gsi_base)
{
int idx = 0;
+ int entries;
if (bad_ioapic(address))
return;
@@ -4302,9 +4285,17 @@ void __init mp_register_ioapic(int id, u32 address, u32 gsi_base)
* Build basic GSI lookup table to facilitate gsi->io_apic lookups
* and to prevent reprogramming of IOAPIC pins (PCI GSIs).
*/
+ entries = io_apic_get_redir_entries(idx);
mp_gsi_routing[idx].gsi_base = gsi_base;
- mp_gsi_routing[idx].gsi_end = gsi_base +
- io_apic_get_redir_entries(idx);
+ mp_gsi_routing[idx].gsi_end = gsi_base + entries - 1;
+
+ /*
+ * The number of IO-APIC IRQ registers (== #pins):
+ */
+ nr_ioapic_registers[idx] = entries;
+
+ if (mp_gsi_routing[idx].gsi_end >= gsi_top)
+ gsi_top = mp_gsi_routing[idx].gsi_end + 1;
printk(KERN_INFO "IOAPIC[%d]: apic_id %d, version %d, address 0x%x, "
"GSI %d-%d\n", idx, mp_ioapics[idx].apicid,
diff --git a/arch/x86/kernel/apic/nmi.c b/arch/x86/kernel/apic/nmi.c
index 1edaf15c0b8e..a43f71cb30f8 100644
--- a/arch/x86/kernel/apic/nmi.c
+++ b/arch/x86/kernel/apic/nmi.c
@@ -401,13 +401,6 @@ nmi_watchdog_tick(struct pt_regs *regs, unsigned reason)
int cpu = smp_processor_id();
int rc = 0;
- /* check for other users first */
- if (notify_die(DIE_NMI, "nmi", regs, reason, 2, SIGINT)
- == NOTIFY_STOP) {
- rc = 1;
- touched = 1;
- }
-
sum = get_timer_irqs(cpu);
if (__get_cpu_var(nmi_touch)) {
diff --git a/arch/x86/kernel/apic/x2apic_uv_x.c b/arch/x86/kernel/apic/x2apic_uv_x.c
index c085d52dbaf2..e46f98f36e31 100644
--- a/arch/x86/kernel/apic/x2apic_uv_x.c
+++ b/arch/x86/kernel/apic/x2apic_uv_x.c
@@ -735,9 +735,6 @@ void __init uv_system_init(void)
uv_node_to_blade[nid] = blade;
uv_cpu_to_blade[cpu] = blade;
max_pnode = max(pnode, max_pnode);
-
- printk(KERN_DEBUG "UV: cpu %d, apicid 0x%x, pnode %d, nid %d, lcpu %d, blade %d\n",
- cpu, apicid, pnode, nid, lcpu, blade);
}
/* Add blade/pnode info for nodes without cpus */
diff --git a/arch/x86/kernel/apm_32.c b/arch/x86/kernel/apm_32.c
index 031aa887b0eb..4c9c67bf09b7 100644
--- a/arch/x86/kernel/apm_32.c
+++ b/arch/x86/kernel/apm_32.c
@@ -140,7 +140,7 @@
* is now the way life works).
* Fix thinko in suspend() (wrong return).
* Notify drivers on critical suspend.
- * Make kapmd absorb more idle time (Pavel Machek <pavel@suse.cz>
+ * Make kapmd absorb more idle time (Pavel Machek <pavel@ucw.cz>
* modified by sfr).
* Disable interrupts while we are suspended (Andy Henroid
* <andy_henroid@yahoo.com> fixed by sfr).
@@ -1224,7 +1224,7 @@ static void reinit_timer(void)
#ifdef INIT_TIMER_AFTER_SUSPEND
unsigned long flags;
- spin_lock_irqsave(&i8253_lock, flags);
+ raw_spin_lock_irqsave(&i8253_lock, flags);
/* set the clock to HZ */
outb_pit(0x34, PIT_MODE); /* binary, mode 2, LSB/MSB, ch 0 */
udelay(10);
@@ -1232,7 +1232,7 @@ static void reinit_timer(void)
udelay(10);
outb_pit(LATCH >> 8, PIT_CH0); /* MSB */
udelay(10);
- spin_unlock_irqrestore(&i8253_lock, flags);
+ raw_spin_unlock_irqrestore(&i8253_lock, flags);
#endif
}
diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile
index c202b62f3671..3f0ebe429a01 100644
--- a/arch/x86/kernel/cpu/Makefile
+++ b/arch/x86/kernel/cpu/Makefile
@@ -12,11 +12,11 @@ endif
nostackp := $(call cc-option, -fno-stack-protector)
CFLAGS_common.o := $(nostackp)
-obj-y := intel_cacheinfo.o addon_cpuid_features.o
+obj-y := intel_cacheinfo.o scattered.o topology.o
obj-y += proc.o capflags.o powerflags.o common.o
-obj-y += vmware.o hypervisor.o sched.o
+obj-y += vmware.o hypervisor.o sched.o mshyperv.o
-obj-$(CONFIG_X86_32) += bugs.o cmpxchg.o
+obj-$(CONFIG_X86_32) += bugs.o
obj-$(CONFIG_X86_64) += bugs_64.o
obj-$(CONFIG_CPU_SUP_INTEL) += intel.o
diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c
index e485825130d2..60a57b13082d 100644
--- a/arch/x86/kernel/cpu/amd.c
+++ b/arch/x86/kernel/cpu/amd.c
@@ -466,7 +466,7 @@ static void __cpuinit init_amd(struct cpuinfo_x86 *c)
}
}
- if (c->x86 == 0x10 || c->x86 == 0x11)
+ if (c->x86 >= 0x10)
set_cpu_cap(c, X86_FEATURE_REP_GOOD);
/* get apicid instead of initial apic id from cpuid */
@@ -529,7 +529,7 @@ static void __cpuinit init_amd(struct cpuinfo_x86 *c)
num_cache_leaves = 3;
}
- if (c->x86 >= 0xf && c->x86 <= 0x11)
+ if (c->x86 >= 0xf)
set_cpu_cap(c, X86_FEATURE_K8);
if (cpu_has_xmm2) {
@@ -546,7 +546,7 @@ static void __cpuinit init_amd(struct cpuinfo_x86 *c)
fam10h_check_enable_mmcfg();
}
- if (c == &boot_cpu_data && c->x86 >= 0xf && c->x86 <= 0x11) {
+ if (c == &boot_cpu_data && c->x86 >= 0xf) {
unsigned long long tseg;
/*
@@ -609,3 +609,74 @@ static const struct cpu_dev __cpuinitconst amd_cpu_dev = {
};
cpu_dev_register(amd_cpu_dev);
+
+/*
+ * AMD errata checking
+ *
+ * Errata are defined as arrays of ints using the AMD_LEGACY_ERRATUM() or
+ * AMD_OSVW_ERRATUM() macros. The latter is intended for newer errata that
+ * have an OSVW id assigned, which it takes as first argument. Both take a
+ * variable number of family-specific model-stepping ranges created by
+ * AMD_MODEL_RANGE(). Each erratum also has to be declared as extern const
+ * int[] in arch/x86/include/asm/processor.h.
+ *
+ * Example:
+ *
+ * const int amd_erratum_319[] =
+ * AMD_LEGACY_ERRATUM(AMD_MODEL_RANGE(0x10, 0x2, 0x1, 0x4, 0x2),
+ * AMD_MODEL_RANGE(0x10, 0x8, 0x0, 0x8, 0x0),
+ * AMD_MODEL_RANGE(0x10, 0x9, 0x0, 0x9, 0x0));
+ */
+
+const int amd_erratum_400[] =
+ AMD_OSVW_ERRATUM(1, AMD_MODEL_RANGE(0xf, 0x41, 0x2, 0xff, 0xf),
+ AMD_MODEL_RANGE(0x10, 0x2, 0x1, 0xff, 0xf));
+EXPORT_SYMBOL_GPL(amd_erratum_400);
+
+const int amd_erratum_383[] =
+ AMD_OSVW_ERRATUM(3, AMD_MODEL_RANGE(0x10, 0, 0, 0xff, 0xf));
+EXPORT_SYMBOL_GPL(amd_erratum_383);
+
+bool cpu_has_amd_erratum(const int *erratum)
+{
+ struct cpuinfo_x86 *cpu = &current_cpu_data;
+ int osvw_id = *erratum++;
+ u32 range;
+ u32 ms;
+
+ /*
+ * If called early enough that current_cpu_data hasn't been initialized
+ * yet, fall back to boot_cpu_data.
+ */
+ if (cpu->x86 == 0)
+ cpu = &boot_cpu_data;
+
+ if (cpu->x86_vendor != X86_VENDOR_AMD)
+ return false;
+
+ if (osvw_id >= 0 && osvw_id < 65536 &&
+ cpu_has(cpu, X86_FEATURE_OSVW)) {
+ u64 osvw_len;
+
+ rdmsrl(MSR_AMD64_OSVW_ID_LENGTH, osvw_len);
+ if (osvw_id < osvw_len) {
+ u64 osvw_bits;
+
+ rdmsrl(MSR_AMD64_OSVW_STATUS + (osvw_id >> 6),
+ osvw_bits);
+ return osvw_bits & (1ULL << (osvw_id & 0x3f));
+ }
+ }
+
+ /* OSVW unavailable or ID unknown, match family-model-stepping range */
+ ms = (cpu->x86_model << 8) | cpu->x86_mask;
+ while ((range = *erratum++))
+ if ((cpu->x86 == AMD_MODEL_RANGE_FAMILY(range)) &&
+ (ms >= AMD_MODEL_RANGE_START(range)) &&
+ (ms <= AMD_MODEL_RANGE_END(range)))
+ return true;
+
+ return false;
+}
+
+EXPORT_SYMBOL_GPL(cpu_has_amd_erratum);
diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c
index 01a265212395..c39576cb3018 100644
--- a/arch/x86/kernel/cpu/bugs.c
+++ b/arch/x86/kernel/cpu/bugs.c
@@ -86,7 +86,7 @@ static void __init check_fpu(void)
static void __init check_hlt(void)
{
- if (paravirt_enabled())
+ if (boot_cpu_data.x86 >= 5 || paravirt_enabled())
return;
printk(KERN_INFO "Checking 'hlt' instruction... ");
diff --git a/arch/x86/kernel/cpu/cmpxchg.c b/arch/x86/kernel/cpu/cmpxchg.c
deleted file mode 100644
index 2056ccf572cc..000000000000
--- a/arch/x86/kernel/cpu/cmpxchg.c
+++ /dev/null
@@ -1,72 +0,0 @@
-/*
- * cmpxchg*() fallbacks for CPU not supporting these instructions
- */
-
-#include <linux/kernel.h>
-#include <linux/smp.h>
-#include <linux/module.h>
-
-#ifndef CONFIG_X86_CMPXCHG
-unsigned long cmpxchg_386_u8(volatile void *ptr, u8 old, u8 new)
-{
- u8 prev;
- unsigned long flags;
-
- /* Poor man's cmpxchg for 386. Unsuitable for SMP */
- local_irq_save(flags);
- prev = *(u8 *)ptr;
- if (prev == old)
- *(u8 *)ptr = new;
- local_irq_restore(flags);
- return prev;
-}
-EXPORT_SYMBOL(cmpxchg_386_u8);
-
-unsigned long cmpxchg_386_u16(volatile void *ptr, u16 old, u16 new)
-{
- u16 prev;
- unsigned long flags;
-
- /* Poor man's cmpxchg for 386. Unsuitable for SMP */
- local_irq_save(flags);
- prev = *(u16 *)ptr;
- if (prev == old)
- *(u16 *)ptr = new;
- local_irq_restore(flags);
- return prev;
-}
-EXPORT_SYMBOL(cmpxchg_386_u16);
-
-unsigned long cmpxchg_386_u32(volatile void *ptr, u32 old, u32 new)
-{
- u32 prev;
- unsigned long flags;
-
- /* Poor man's cmpxchg for 386. Unsuitable for SMP */
- local_irq_save(flags);
- prev = *(u32 *)ptr;
- if (prev == old)
- *(u32 *)ptr = new;
- local_irq_restore(flags);
- return prev;
-}
-EXPORT_SYMBOL(cmpxchg_386_u32);
-#endif
-
-#ifndef CONFIG_X86_CMPXCHG64
-unsigned long long cmpxchg_486_u64(volatile void *ptr, u64 old, u64 new)
-{
- u64 prev;
- unsigned long flags;
-
- /* Poor man's cmpxchg8b for 386 and 486. Unsuitable for SMP */
- local_irq_save(flags);
- prev = *(u64 *)ptr;
- if (prev == old)
- *(u64 *)ptr = new;
- local_irq_restore(flags);
- return prev;
-}
-EXPORT_SYMBOL(cmpxchg_486_u64);
-#endif
-
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index 4868e4a951ee..490dac63c2d2 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -140,10 +140,18 @@ EXPORT_PER_CPU_SYMBOL_GPL(gdt_page);
static int __init x86_xsave_setup(char *s)
{
setup_clear_cpu_cap(X86_FEATURE_XSAVE);
+ setup_clear_cpu_cap(X86_FEATURE_XSAVEOPT);
return 1;
}
__setup("noxsave", x86_xsave_setup);
+static int __init x86_xsaveopt_setup(char *s)
+{
+ setup_clear_cpu_cap(X86_FEATURE_XSAVEOPT);
+ return 1;
+}
+__setup("noxsaveopt", x86_xsaveopt_setup);
+
#ifdef CONFIG_X86_32
static int cachesize_override __cpuinitdata = -1;
static int disable_x86_serial_nr __cpuinitdata = 1;
@@ -551,6 +559,16 @@ static void __cpuinit get_cpu_cap(struct cpuinfo_x86 *c)
c->x86_capability[4] = excap;
}
+ /* Additional Intel-defined flags: level 0x00000007 */
+ if (c->cpuid_level >= 0x00000007) {
+ u32 eax, ebx, ecx, edx;
+
+ cpuid_count(0x00000007, 0, &eax, &ebx, &ecx, &edx);
+
+ if (eax > 0)
+ c->x86_capability[9] = ebx;
+ }
+
/* AMD-defined flags: level 0x80000001 */
xlvl = cpuid_eax(0x80000000);
c->extended_cpuid_level = xlvl;
@@ -576,6 +594,7 @@ static void __cpuinit get_cpu_cap(struct cpuinfo_x86 *c)
if (c->extended_cpuid_level >= 0x80000007)
c->x86_power = cpuid_edx(0x80000007);
+ init_scattered_cpuid_features(c);
}
static void __cpuinit identify_cpu_without_cpuid(struct cpuinfo_x86 *c)
@@ -731,7 +750,6 @@ static void __cpuinit generic_identify(struct cpuinfo_x86 *c)
get_model_name(c); /* Default name */
- init_scattered_cpuid_features(c);
detect_nopl(c);
}
@@ -1084,6 +1102,20 @@ static void clear_all_debug_regs(void)
}
}
+#ifdef CONFIG_KGDB
+/*
+ * Restore debug regs if using kgdbwait and you have a kernel debugger
+ * connection established.
+ */
+static void dbg_restore_debug_regs(void)
+{
+ if (unlikely(kgdb_connected && arch_kgdb_ops.correct_hw_break))
+ arch_kgdb_ops.correct_hw_break();
+}
+#else /* ! CONFIG_KGDB */
+#define dbg_restore_debug_regs()
+#endif /* ! CONFIG_KGDB */
+
/*
* cpu_init() initializes state that is per-CPU. Some data is already
* initialized (naturally) in the bootstrap process, such as the GDT
@@ -1107,9 +1139,9 @@ void __cpuinit cpu_init(void)
oist = &per_cpu(orig_ist, cpu);
#ifdef CONFIG_NUMA
- if (cpu != 0 && percpu_read(node_number) == 0 &&
- cpu_to_node(cpu) != NUMA_NO_NODE)
- percpu_write(node_number, cpu_to_node(cpu));
+ if (cpu != 0 && percpu_read(numa_node) == 0 &&
+ early_cpu_to_node(cpu) != NUMA_NO_NODE)
+ set_numa_node(early_cpu_to_node(cpu));
#endif
me = current;
@@ -1174,20 +1206,11 @@ void __cpuinit cpu_init(void)
load_TR_desc();
load_LDT(&init_mm.context);
-#ifdef CONFIG_KGDB
- /*
- * If the kgdb is connected no debug regs should be altered. This
- * is only applicable when KGDB and a KGDB I/O module are built
- * into the kernel and you are using early debugging with
- * kgdbwait. KGDB will control the kernel HW breakpoint registers.
- */
- if (kgdb_connected && arch_kgdb_ops.correct_hw_break)
- arch_kgdb_ops.correct_hw_break();
- else
-#endif
- clear_all_debug_regs();
+ clear_all_debug_regs();
+ dbg_restore_debug_regs();
fpu_init();
+ xsave_init();
raw_local_save_flags(kernel_eflags);
@@ -1239,23 +1262,16 @@ void __cpuinit cpu_init(void)
#endif
clear_all_debug_regs();
+ dbg_restore_debug_regs();
/*
* Force FPU initialization:
*/
- if (cpu_has_xsave)
- current_thread_info()->status = TS_XSAVE;
- else
- current_thread_info()->status = 0;
+ current_thread_info()->status = 0;
clear_used_math();
mxcsr_feature_mask_init();
- /*
- * Boot processor to setup the FP and extended state context info.
- */
- if (smp_processor_id() == boot_cpu_id)
- init_thread_xstate();
-
+ fpu_init();
xsave_init();
}
#endif
diff --git a/arch/x86/kernel/cpu/cpufreq/Makefile b/arch/x86/kernel/cpu/cpufreq/Makefile
index 1840c0a5170b..bd54bf67e6fb 100644
--- a/arch/x86/kernel/cpu/cpufreq/Makefile
+++ b/arch/x86/kernel/cpu/cpufreq/Makefile
@@ -2,8 +2,8 @@
# K8 systems. ACPI is preferred to all other hardware-specific drivers.
# speedstep-* is preferred over p4-clockmod.
-obj-$(CONFIG_X86_POWERNOW_K8) += powernow-k8.o
-obj-$(CONFIG_X86_ACPI_CPUFREQ) += acpi-cpufreq.o
+obj-$(CONFIG_X86_POWERNOW_K8) += powernow-k8.o mperf.o
+obj-$(CONFIG_X86_ACPI_CPUFREQ) += acpi-cpufreq.o mperf.o
obj-$(CONFIG_X86_PCC_CPUFREQ) += pcc-cpufreq.o
obj-$(CONFIG_X86_POWERNOW_K6) += powernow-k6.o
obj-$(CONFIG_X86_POWERNOW_K7) += powernow-k7.o
diff --git a/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c b/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c
index 459168083b77..246cd3afbb5f 100644
--- a/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c
+++ b/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c
@@ -34,7 +34,6 @@
#include <linux/compiler.h>
#include <linux/dmi.h>
#include <linux/slab.h>
-#include <trace/events/power.h>
#include <linux/acpi.h>
#include <linux/io.h>
@@ -46,6 +45,7 @@
#include <asm/msr.h>
#include <asm/processor.h>
#include <asm/cpufeature.h>
+#include "mperf.h"
#define dprintk(msg...) cpufreq_debug_printk(CPUFREQ_DEBUG_DRIVER, \
"acpi-cpufreq", msg)
@@ -71,8 +71,6 @@ struct acpi_cpufreq_data {
static DEFINE_PER_CPU(struct acpi_cpufreq_data *, acfreq_data);
-static DEFINE_PER_CPU(struct aperfmperf, acfreq_old_perf);
-
/* acpi_perf_data is a pointer to percpu data. */
static struct acpi_processor_performance *acpi_perf_data;
@@ -240,45 +238,6 @@ static u32 get_cur_val(const struct cpumask *mask)
return cmd.val;
}
-/* Called via smp_call_function_single(), on the target CPU */
-static void read_measured_perf_ctrs(void *_cur)
-{
- struct aperfmperf *am = _cur;
-
- get_aperfmperf(am);
-}
-
-/*
- * Return the measured active (C0) frequency on this CPU since last call
- * to this function.
- * Input: cpu number
- * Return: Average CPU frequency in terms of max frequency (zero on error)
- *
- * We use IA32_MPERF and IA32_APERF MSRs to get the measured performance
- * over a period of time, while CPU is in C0 state.
- * IA32_MPERF counts at the rate of max advertised frequency
- * IA32_APERF counts at the rate of actual CPU frequency
- * Only IA32_APERF/IA32_MPERF ratio is architecturally defined and
- * no meaning should be associated with absolute values of these MSRs.
- */
-static unsigned int get_measured_perf(struct cpufreq_policy *policy,
- unsigned int cpu)
-{
- struct aperfmperf perf;
- unsigned long ratio;
- unsigned int retval;
-
- if (smp_call_function_single(cpu, read_measured_perf_ctrs, &perf, 1))
- return 0;
-
- ratio = calc_aperfmperf_ratio(&per_cpu(acfreq_old_perf, cpu), &perf);
- per_cpu(acfreq_old_perf, cpu) = perf;
-
- retval = (policy->cpuinfo.max_freq * ratio) >> APERFMPERF_SHIFT;
-
- return retval;
-}
-
static unsigned int get_cur_freq_on_cpu(unsigned int cpu)
{
struct acpi_cpufreq_data *data = per_cpu(acfreq_data, cpu);
@@ -364,8 +323,6 @@ static int acpi_cpufreq_target(struct cpufreq_policy *policy,
}
}
- trace_power_frequency(POWER_PSTATE, data->freq_table[next_state].frequency);
-
switch (data->cpu_feature) {
case SYSTEM_INTEL_MSR_CAPABLE:
cmd.type = SYSTEM_INTEL_MSR_CAPABLE;
@@ -391,7 +348,7 @@ static int acpi_cpufreq_target(struct cpufreq_policy *policy,
freqs.old = perf->states[perf->state].core_frequency * 1000;
freqs.new = data->freq_table[next_state].frequency;
- for_each_cpu(i, cmd.mask) {
+ for_each_cpu(i, policy->cpus) {
freqs.cpu = i;
cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE);
}
@@ -407,7 +364,7 @@ static int acpi_cpufreq_target(struct cpufreq_policy *policy,
}
}
- for_each_cpu(i, cmd.mask) {
+ for_each_cpu(i, policy->cpus) {
freqs.cpu = i;
cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE);
}
@@ -702,7 +659,7 @@ static int acpi_cpufreq_cpu_init(struct cpufreq_policy *policy)
/* Check for APERF/MPERF support in hardware */
if (cpu_has(c, X86_FEATURE_APERFMPERF))
- acpi_cpufreq_driver.getavg = get_measured_perf;
+ acpi_cpufreq_driver.getavg = cpufreq_get_measured_perf;
dprintk("CPU%u - ACPI performance management activated.\n", cpu);
for (i = 0; i < perf->state_count; i++)
diff --git a/arch/x86/kernel/cpu/cpufreq/gx-suspmod.c b/arch/x86/kernel/cpu/cpufreq/gx-suspmod.c
index 16e3483be9e3..32974cf84232 100644
--- a/arch/x86/kernel/cpu/cpufreq/gx-suspmod.c
+++ b/arch/x86/kernel/cpu/cpufreq/gx-suspmod.c
@@ -169,12 +169,9 @@ static int gx_freq_mult[16] = {
* Low Level chipset interface *
****************************************************************/
static struct pci_device_id gx_chipset_tbl[] __initdata = {
- { PCI_VENDOR_ID_CYRIX, PCI_DEVICE_ID_CYRIX_5530_LEGACY,
- PCI_ANY_ID, PCI_ANY_ID },
- { PCI_VENDOR_ID_CYRIX, PCI_DEVICE_ID_CYRIX_5520,
- PCI_ANY_ID, PCI_ANY_ID },
- { PCI_VENDOR_ID_CYRIX, PCI_DEVICE_ID_CYRIX_5510,
- PCI_ANY_ID, PCI_ANY_ID },
+ { PCI_VDEVICE(CYRIX, PCI_DEVICE_ID_CYRIX_5530_LEGACY), },
+ { PCI_VDEVICE(CYRIX, PCI_DEVICE_ID_CYRIX_5520), },
+ { PCI_VDEVICE(CYRIX, PCI_DEVICE_ID_CYRIX_5510), },
{ 0, },
};
@@ -199,7 +196,7 @@ static __init struct pci_dev *gx_detect_chipset(void)
}
/* detect which companion chip is used */
- while ((gx_pci = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, gx_pci)) != NULL) {
+ for_each_pci_dev(gx_pci) {
if ((pci_match_id(gx_chipset_tbl, gx_pci)) != NULL)
return gx_pci;
}
diff --git a/arch/x86/kernel/cpu/cpufreq/longhaul.c b/arch/x86/kernel/cpu/cpufreq/longhaul.c
index 7e7eea4f8261..03162dac6271 100644
--- a/arch/x86/kernel/cpu/cpufreq/longhaul.c
+++ b/arch/x86/kernel/cpu/cpufreq/longhaul.c
@@ -426,7 +426,7 @@ static int guess_fsb(int mult)
}
-static int __init longhaul_get_ranges(void)
+static int __cpuinit longhaul_get_ranges(void)
{
unsigned int i, j, k = 0;
unsigned int ratio;
@@ -530,7 +530,7 @@ static int __init longhaul_get_ranges(void)
}
-static void __init longhaul_setup_voltagescaling(void)
+static void __cpuinit longhaul_setup_voltagescaling(void)
{
union msr_longhaul longhaul;
struct mV_pos minvid, maxvid, vid;
@@ -784,7 +784,7 @@ static int longhaul_setup_southbridge(void)
return 0;
}
-static int __init longhaul_cpu_init(struct cpufreq_policy *policy)
+static int __cpuinit longhaul_cpu_init(struct cpufreq_policy *policy)
{
struct cpuinfo_x86 *c = &cpu_data(0);
char *cpuname = NULL;
diff --git a/arch/x86/kernel/cpu/cpufreq/longhaul.h b/arch/x86/kernel/cpu/cpufreq/longhaul.h
index e2360a469f79..cbf48fbca881 100644
--- a/arch/x86/kernel/cpu/cpufreq/longhaul.h
+++ b/arch/x86/kernel/cpu/cpufreq/longhaul.h
@@ -56,7 +56,7 @@ union msr_longhaul {
/*
* VIA C3 Samuel 1 & Samuel 2 (stepping 0)
*/
-static const int __initdata samuel1_mults[16] = {
+static const int __cpuinitdata samuel1_mults[16] = {
-1, /* 0000 -> RESERVED */
30, /* 0001 -> 3.0x */
40, /* 0010 -> 4.0x */
@@ -75,7 +75,7 @@ static const int __initdata samuel1_mults[16] = {
-1, /* 1111 -> RESERVED */
};
-static const int __initdata samuel1_eblcr[16] = {
+static const int __cpuinitdata samuel1_eblcr[16] = {
50, /* 0000 -> RESERVED */
30, /* 0001 -> 3.0x */
40, /* 0010 -> 4.0x */
@@ -97,7 +97,7 @@ static const int __initdata samuel1_eblcr[16] = {
/*
* VIA C3 Samuel2 Stepping 1->15
*/
-static const int __initdata samuel2_eblcr[16] = {
+static const int __cpuinitdata samuel2_eblcr[16] = {
50, /* 0000 -> 5.0x */
30, /* 0001 -> 3.0x */
40, /* 0010 -> 4.0x */
@@ -119,7 +119,7 @@ static const int __initdata samuel2_eblcr[16] = {
/*
* VIA C3 Ezra
*/
-static const int __initdata ezra_mults[16] = {
+static const int __cpuinitdata ezra_mults[16] = {
100, /* 0000 -> 10.0x */
30, /* 0001 -> 3.0x */
40, /* 0010 -> 4.0x */
@@ -138,7 +138,7 @@ static const int __initdata ezra_mults[16] = {
120, /* 1111 -> 12.0x */
};
-static const int __initdata ezra_eblcr[16] = {
+static const int __cpuinitdata ezra_eblcr[16] = {
50, /* 0000 -> 5.0x */
30, /* 0001 -> 3.0x */
40, /* 0010 -> 4.0x */
@@ -160,7 +160,7 @@ static const int __initdata ezra_eblcr[16] = {
/*
* VIA C3 (Ezra-T) [C5M].
*/
-static const int __initdata ezrat_mults[32] = {
+static const int __cpuinitdata ezrat_mults[32] = {
100, /* 0000 -> 10.0x */
30, /* 0001 -> 3.0x */
40, /* 0010 -> 4.0x */
@@ -196,7 +196,7 @@ static const int __initdata ezrat_mults[32] = {
-1, /* 1111 -> RESERVED (12.0x) */
};
-static const int __initdata ezrat_eblcr[32] = {
+static const int __cpuinitdata ezrat_eblcr[32] = {
50, /* 0000 -> 5.0x */
30, /* 0001 -> 3.0x */
40, /* 0010 -> 4.0x */
@@ -235,7 +235,7 @@ static const int __initdata ezrat_eblcr[32] = {
/*
* VIA C3 Nehemiah */
-static const int __initdata nehemiah_mults[32] = {
+static const int __cpuinitdata nehemiah_mults[32] = {
100, /* 0000 -> 10.0x */
-1, /* 0001 -> 16.0x */
40, /* 0010 -> 4.0x */
@@ -270,7 +270,7 @@ static const int __initdata nehemiah_mults[32] = {
-1, /* 1111 -> 12.0x */
};
-static const int __initdata nehemiah_eblcr[32] = {
+static const int __cpuinitdata nehemiah_eblcr[32] = {
50, /* 0000 -> 5.0x */
160, /* 0001 -> 16.0x */
40, /* 0010 -> 4.0x */
@@ -315,7 +315,7 @@ struct mV_pos {
unsigned short pos;
};
-static const struct mV_pos __initdata vrm85_mV[32] = {
+static const struct mV_pos __cpuinitdata vrm85_mV[32] = {
{1250, 8}, {1200, 6}, {1150, 4}, {1100, 2},
{1050, 0}, {1800, 30}, {1750, 28}, {1700, 26},
{1650, 24}, {1600, 22}, {1550, 20}, {1500, 18},
@@ -326,14 +326,14 @@ static const struct mV_pos __initdata vrm85_mV[32] = {
{1475, 17}, {1425, 15}, {1375, 13}, {1325, 11}
};
-static const unsigned char __initdata mV_vrm85[32] = {
+static const unsigned char __cpuinitdata mV_vrm85[32] = {
0x04, 0x14, 0x03, 0x13, 0x02, 0x12, 0x01, 0x11,
0x00, 0x10, 0x0f, 0x1f, 0x0e, 0x1e, 0x0d, 0x1d,
0x0c, 0x1c, 0x0b, 0x1b, 0x0a, 0x1a, 0x09, 0x19,
0x08, 0x18, 0x07, 0x17, 0x06, 0x16, 0x05, 0x15
};
-static const struct mV_pos __initdata mobilevrm_mV[32] = {
+static const struct mV_pos __cpuinitdata mobilevrm_mV[32] = {
{1750, 31}, {1700, 30}, {1650, 29}, {1600, 28},
{1550, 27}, {1500, 26}, {1450, 25}, {1400, 24},
{1350, 23}, {1300, 22}, {1250, 21}, {1200, 20},
@@ -344,7 +344,7 @@ static const struct mV_pos __initdata mobilevrm_mV[32] = {
{675, 3}, {650, 2}, {625, 1}, {600, 0}
};
-static const unsigned char __initdata mV_mobilevrm[32] = {
+static const unsigned char __cpuinitdata mV_mobilevrm[32] = {
0x1f, 0x1e, 0x1d, 0x1c, 0x1b, 0x1a, 0x19, 0x18,
0x17, 0x16, 0x15, 0x14, 0x13, 0x12, 0x11, 0x10,
0x0f, 0x0e, 0x0d, 0x0c, 0x0b, 0x0a, 0x09, 0x08,
diff --git a/arch/x86/kernel/cpu/cpufreq/longrun.c b/arch/x86/kernel/cpu/cpufreq/longrun.c
index e7b559d74c52..fc09f142d94d 100644
--- a/arch/x86/kernel/cpu/cpufreq/longrun.c
+++ b/arch/x86/kernel/cpu/cpufreq/longrun.c
@@ -165,8 +165,8 @@ static unsigned int longrun_get(unsigned int cpu)
* TMTA rules:
* performance_pctg = (target_freq - low_freq)/(high_freq - low_freq)
*/
-static unsigned int __init longrun_determine_freqs(unsigned int *low_freq,
- unsigned int *high_freq)
+static unsigned int __cpuinit longrun_determine_freqs(unsigned int *low_freq,
+ unsigned int *high_freq)
{
u32 msr_lo, msr_hi;
u32 save_lo, save_hi;
@@ -258,7 +258,7 @@ static unsigned int __init longrun_determine_freqs(unsigned int *low_freq,
}
-static int __init longrun_cpu_init(struct cpufreq_policy *policy)
+static int __cpuinit longrun_cpu_init(struct cpufreq_policy *policy)
{
int result = 0;
diff --git a/arch/x86/kernel/cpu/cpufreq/mperf.c b/arch/x86/kernel/cpu/cpufreq/mperf.c
new file mode 100644
index 000000000000..911e193018ae
--- /dev/null
+++ b/arch/x86/kernel/cpu/cpufreq/mperf.c
@@ -0,0 +1,51 @@
+#include <linux/kernel.h>
+#include <linux/smp.h>
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/cpufreq.h>
+#include <linux/slab.h>
+
+#include "mperf.h"
+
+static DEFINE_PER_CPU(struct aperfmperf, acfreq_old_perf);
+
+/* Called via smp_call_function_single(), on the target CPU */
+static void read_measured_perf_ctrs(void *_cur)
+{
+ struct aperfmperf *am = _cur;
+
+ get_aperfmperf(am);
+}
+
+/*
+ * Return the measured active (C0) frequency on this CPU since last call
+ * to this function.
+ * Input: cpu number
+ * Return: Average CPU frequency in terms of max frequency (zero on error)
+ *
+ * We use IA32_MPERF and IA32_APERF MSRs to get the measured performance
+ * over a period of time, while CPU is in C0 state.
+ * IA32_MPERF counts at the rate of max advertised frequency
+ * IA32_APERF counts at the rate of actual CPU frequency
+ * Only IA32_APERF/IA32_MPERF ratio is architecturally defined and
+ * no meaning should be associated with absolute values of these MSRs.
+ */
+unsigned int cpufreq_get_measured_perf(struct cpufreq_policy *policy,
+ unsigned int cpu)
+{
+ struct aperfmperf perf;
+ unsigned long ratio;
+ unsigned int retval;
+
+ if (smp_call_function_single(cpu, read_measured_perf_ctrs, &perf, 1))
+ return 0;
+
+ ratio = calc_aperfmperf_ratio(&per_cpu(acfreq_old_perf, cpu), &perf);
+ per_cpu(acfreq_old_perf, cpu) = perf;
+
+ retval = (policy->cpuinfo.max_freq * ratio) >> APERFMPERF_SHIFT;
+
+ return retval;
+}
+EXPORT_SYMBOL_GPL(cpufreq_get_measured_perf);
+MODULE_LICENSE("GPL");
diff --git a/arch/x86/kernel/cpu/cpufreq/mperf.h b/arch/x86/kernel/cpu/cpufreq/mperf.h
new file mode 100644
index 000000000000..5dbf2950dc22
--- /dev/null
+++ b/arch/x86/kernel/cpu/cpufreq/mperf.h
@@ -0,0 +1,9 @@
+/*
+ * (c) 2010 Advanced Micro Devices, Inc.
+ * Your use of this code is subject to the terms and conditions of the
+ * GNU general public license version 2. See "COPYING" or
+ * http://www.gnu.org/licenses/gpl.html
+ */
+
+unsigned int cpufreq_get_measured_perf(struct cpufreq_policy *policy,
+ unsigned int cpu);
diff --git a/arch/x86/kernel/cpu/cpufreq/p4-clockmod.c b/arch/x86/kernel/cpu/cpufreq/p4-clockmod.c
index 7b8a8ba67b07..bd1cac747f67 100644
--- a/arch/x86/kernel/cpu/cpufreq/p4-clockmod.c
+++ b/arch/x86/kernel/cpu/cpufreq/p4-clockmod.c
@@ -178,13 +178,8 @@ static unsigned int cpufreq_p4_get_frequency(struct cpuinfo_x86 *c)
}
}
- if (c->x86 != 0xF) {
- if (!cpu_has(c, X86_FEATURE_EST))
- printk(KERN_WARNING PFX "Unknown CPU. "
- "Please send an e-mail to "
- "<cpufreq@vger.kernel.org>\n");
+ if (c->x86 != 0xF)
return 0;
- }
/* on P-4s, the TSC runs with constant frequency independent whether
* throttling is active or not. */
diff --git a/arch/x86/kernel/cpu/cpufreq/pcc-cpufreq.c b/arch/x86/kernel/cpu/cpufreq/pcc-cpufreq.c
index ce7cde713e71..a36de5bbb622 100644
--- a/arch/x86/kernel/cpu/cpufreq/pcc-cpufreq.c
+++ b/arch/x86/kernel/cpu/cpufreq/pcc-cpufreq.c
@@ -368,22 +368,16 @@ static int __init pcc_cpufreq_do_osc(acpi_handle *handle)
return -ENODEV;
out_obj = output.pointer;
- if (out_obj->type != ACPI_TYPE_BUFFER) {
- ret = -ENODEV;
- goto out_free;
- }
+ if (out_obj->type != ACPI_TYPE_BUFFER)
+ return -ENODEV;
errors = *((u32 *)out_obj->buffer.pointer) & ~(1 << 0);
- if (errors) {
- ret = -ENODEV;
- goto out_free;
- }
+ if (errors)
+ return -ENODEV;
supported = *((u32 *)(out_obj->buffer.pointer + 4));
- if (!(supported & 0x1)) {
- ret = -ENODEV;
- goto out_free;
- }
+ if (!(supported & 0x1))
+ return -ENODEV;
out_free:
kfree(output.pointer);
@@ -397,13 +391,17 @@ static int __init pcc_cpufreq_probe(void)
struct pcc_memory_resource *mem_resource;
struct pcc_register_resource *reg_resource;
union acpi_object *out_obj, *member;
- acpi_handle handle, osc_handle;
+ acpi_handle handle, osc_handle, pcch_handle;
int ret = 0;
status = acpi_get_handle(NULL, "\\_SB", &handle);
if (ACPI_FAILURE(status))
return -ENODEV;
+ status = acpi_get_handle(handle, "PCCH", &pcch_handle);
+ if (ACPI_FAILURE(status))
+ return -ENODEV;
+
status = acpi_get_handle(handle, "_OSC", &osc_handle);
if (ACPI_SUCCESS(status)) {
ret = pcc_cpufreq_do_osc(&osc_handle);
@@ -543,13 +541,13 @@ static int pcc_cpufreq_cpu_init(struct cpufreq_policy *policy)
if (!pcch_virt_addr) {
result = -1;
- goto pcch_null;
+ goto out;
}
result = pcc_get_offset(cpu);
if (result) {
dprintk("init: PCCP evaluation failed\n");
- goto free;
+ goto out;
}
policy->max = policy->cpuinfo.max_freq =
@@ -558,14 +556,15 @@ static int pcc_cpufreq_cpu_init(struct cpufreq_policy *policy)
ioread32(&pcch_hdr->minimum_frequency) * 1000;
policy->cur = pcc_get_freq(cpu);
+ if (!policy->cur) {
+ dprintk("init: Unable to get current CPU frequency\n");
+ result = -EINVAL;
+ goto out;
+ }
+
dprintk("init: policy->max is %d, policy->min is %d\n",
policy->max, policy->min);
-
- return 0;
-free:
- pcc_clear_mapping();
- free_percpu(pcc_cpu_info);
-pcch_null:
+out:
return result;
}
diff --git a/arch/x86/kernel/cpu/cpufreq/powernow-k7.c b/arch/x86/kernel/cpu/cpufreq/powernow-k7.c
index 9a97116f89e5..4a45fd6e41ba 100644
--- a/arch/x86/kernel/cpu/cpufreq/powernow-k7.c
+++ b/arch/x86/kernel/cpu/cpufreq/powernow-k7.c
@@ -569,7 +569,7 @@ static int powernow_verify(struct cpufreq_policy *policy)
* We will then get the same kind of behaviour already tested under
* the "well-known" other OS.
*/
-static int __init fixup_sgtc(void)
+static int __cpuinit fixup_sgtc(void)
{
unsigned int sgtc;
unsigned int m;
@@ -603,7 +603,7 @@ static unsigned int powernow_get(unsigned int cpu)
}
-static int __init acer_cpufreq_pst(const struct dmi_system_id *d)
+static int __cpuinit acer_cpufreq_pst(const struct dmi_system_id *d)
{
printk(KERN_WARNING PFX
"%s laptop with broken PST tables in BIOS detected.\n",
@@ -621,7 +621,7 @@ static int __init acer_cpufreq_pst(const struct dmi_system_id *d)
* A BIOS update is all that can save them.
* Mention this, and disable cpufreq.
*/
-static struct dmi_system_id __initdata powernow_dmi_table[] = {
+static struct dmi_system_id __cpuinitdata powernow_dmi_table[] = {
{
.callback = acer_cpufreq_pst,
.ident = "Acer Aspire",
@@ -633,7 +633,7 @@ static struct dmi_system_id __initdata powernow_dmi_table[] = {
{ }
};
-static int __init powernow_cpu_init(struct cpufreq_policy *policy)
+static int __cpuinit powernow_cpu_init(struct cpufreq_policy *policy)
{
union msr_fidvidstatus fidvidstatus;
int result;
diff --git a/arch/x86/kernel/cpu/cpufreq/powernow-k8.c b/arch/x86/kernel/cpu/cpufreq/powernow-k8.c
index b6215b9798e2..491977baf6c0 100644
--- a/arch/x86/kernel/cpu/cpufreq/powernow-k8.c
+++ b/arch/x86/kernel/cpu/cpufreq/powernow-k8.c
@@ -1,6 +1,5 @@
-
/*
- * (c) 2003-2006 Advanced Micro Devices, Inc.
+ * (c) 2003-2010 Advanced Micro Devices, Inc.
* Your use of this code is subject to the terms and conditions of the
* GNU general public license version 2. See "COPYING" or
* http://www.gnu.org/licenses/gpl.html
@@ -10,7 +9,7 @@
* Based on the powernow-k7.c module written by Dave Jones.
* (C) 2003 Dave Jones on behalf of SuSE Labs
* (C) 2004 Dominik Brodowski <linux@brodo.de>
- * (C) 2004 Pavel Machek <pavel@suse.cz>
+ * (C) 2004 Pavel Machek <pavel@ucw.cz>
* Licensed under the terms of the GNU GPL License version 2.
* Based upon datasheets & sample CPUs kindly provided by AMD.
*
@@ -46,6 +45,7 @@
#define PFX "powernow-k8: "
#define VERSION "version 2.20.00"
#include "powernow-k8.h"
+#include "mperf.h"
/* serialize freq changes */
static DEFINE_MUTEX(fidvid_mutex);
@@ -54,6 +54,12 @@ static DEFINE_PER_CPU(struct powernow_k8_data *, powernow_data);
static int cpu_family = CPU_OPTERON;
+/* core performance boost */
+static bool cpb_capable, cpb_enabled;
+static struct msr __percpu *msrs;
+
+static struct cpufreq_driver cpufreq_amd64_driver;
+
#ifndef CONFIG_SMP
static inline const struct cpumask *cpu_core_mask(int cpu)
{
@@ -800,6 +806,8 @@ static int find_psb_table(struct powernow_k8_data *data)
* www.amd.com
*/
printk(KERN_ERR FW_BUG PFX "No PSB or ACPI _PSS objects\n");
+ printk(KERN_ERR PFX "Make sure that your BIOS is up to date"
+ " and Cool'N'Quiet support is enabled in BIOS setup\n");
return -ENODEV;
}
@@ -904,8 +912,8 @@ static int fill_powernow_table_pstate(struct powernow_k8_data *data,
{
int i;
u32 hi = 0, lo = 0;
- rdmsr(MSR_PSTATE_CUR_LIMIT, hi, lo);
- data->max_hw_pstate = (hi & HW_PSTATE_MAX_MASK) >> HW_PSTATE_MAX_SHIFT;
+ rdmsr(MSR_PSTATE_CUR_LIMIT, lo, hi);
+ data->max_hw_pstate = (lo & HW_PSTATE_MAX_MASK) >> HW_PSTATE_MAX_SHIFT;
for (i = 0; i < data->acpi_data.state_count; i++) {
u32 index;
@@ -1017,13 +1025,12 @@ static int get_transition_latency(struct powernow_k8_data *data)
}
if (max_latency == 0) {
/*
- * Fam 11h always returns 0 as transition latency.
- * This is intended and means "very fast". While cpufreq core
- * and governors currently can handle that gracefully, better
- * set it to 1 to avoid problems in the future.
- * For all others it's a BIOS bug.
+ * Fam 11h and later may return 0 as transition latency. This
+ * is intended and means "very fast". While cpufreq core and
+ * governors currently can handle that gracefully, better set it
+ * to 1 to avoid problems in the future.
*/
- if (boot_cpu_data.x86 != 0x11)
+ if (boot_cpu_data.x86 < 0x11)
printk(KERN_ERR FW_WARN PFX "Invalid zero transition "
"latency\n");
max_latency = 1;
@@ -1249,6 +1256,7 @@ static int __cpuinit powernowk8_cpu_init(struct cpufreq_policy *pol)
struct powernow_k8_data *data;
struct init_on_cpu init_on_cpu;
int rc;
+ struct cpuinfo_x86 *c = &cpu_data(pol->cpu);
if (!cpu_online(pol->cpu))
return -ENODEV;
@@ -1323,6 +1331,10 @@ static int __cpuinit powernowk8_cpu_init(struct cpufreq_policy *pol)
return -EINVAL;
}
+ /* Check for APERF/MPERF support in hardware */
+ if (cpu_has(c, X86_FEATURE_APERFMPERF))
+ cpufreq_amd64_driver.getavg = cpufreq_get_measured_perf;
+
cpufreq_frequency_table_get_attr(data->powernow_table, pol->cpu);
if (cpu_family == CPU_HW_PSTATE)
@@ -1394,8 +1406,77 @@ out:
return khz;
}
+static void _cpb_toggle_msrs(bool t)
+{
+ int cpu;
+
+ get_online_cpus();
+
+ rdmsr_on_cpus(cpu_online_mask, MSR_K7_HWCR, msrs);
+
+ for_each_cpu(cpu, cpu_online_mask) {
+ struct msr *reg = per_cpu_ptr(msrs, cpu);
+ if (t)
+ reg->l &= ~BIT(25);
+ else
+ reg->l |= BIT(25);
+ }
+ wrmsr_on_cpus(cpu_online_mask, MSR_K7_HWCR, msrs);
+
+ put_online_cpus();
+}
+
+/*
+ * Switch on/off core performance boosting.
+ *
+ * 0=disable
+ * 1=enable.
+ */
+static void cpb_toggle(bool t)
+{
+ if (!cpb_capable)
+ return;
+
+ if (t && !cpb_enabled) {
+ cpb_enabled = true;
+ _cpb_toggle_msrs(t);
+ printk(KERN_INFO PFX "Core Boosting enabled.\n");
+ } else if (!t && cpb_enabled) {
+ cpb_enabled = false;
+ _cpb_toggle_msrs(t);
+ printk(KERN_INFO PFX "Core Boosting disabled.\n");
+ }
+}
+
+static ssize_t store_cpb(struct cpufreq_policy *policy, const char *buf,
+ size_t count)
+{
+ int ret = -EINVAL;
+ unsigned long val = 0;
+
+ ret = strict_strtoul(buf, 10, &val);
+ if (!ret && (val == 0 || val == 1) && cpb_capable)
+ cpb_toggle(val);
+ else
+ return -EINVAL;
+
+ return count;
+}
+
+static ssize_t show_cpb(struct cpufreq_policy *policy, char *buf)
+{
+ return sprintf(buf, "%u\n", cpb_enabled);
+}
+
+#define define_one_rw(_name) \
+static struct freq_attr _name = \
+__ATTR(_name, 0644, show_##_name, store_##_name)
+
+define_one_rw(cpb);
+
static struct freq_attr *powernow_k8_attr[] = {
&cpufreq_freq_attr_scaling_available_freqs,
+ &cpb,
NULL,
};
@@ -1411,10 +1492,51 @@ static struct cpufreq_driver cpufreq_amd64_driver = {
.attr = powernow_k8_attr,
};
+/*
+ * Clear the boost-disable flag on the CPU_DOWN path so that this cpu
+ * cannot block the remaining ones from boosting. On the CPU_UP path we
+ * simply keep the boost-disable flag in sync with the current global
+ * state.
+ */
+static int cpb_notify(struct notifier_block *nb, unsigned long action,
+ void *hcpu)
+{
+ unsigned cpu = (long)hcpu;
+ u32 lo, hi;
+
+ switch (action) {
+ case CPU_UP_PREPARE:
+ case CPU_UP_PREPARE_FROZEN:
+
+ if (!cpb_enabled) {
+ rdmsr_on_cpu(cpu, MSR_K7_HWCR, &lo, &hi);
+ lo |= BIT(25);
+ wrmsr_on_cpu(cpu, MSR_K7_HWCR, lo, hi);
+ }
+ break;
+
+ case CPU_DOWN_PREPARE:
+ case CPU_DOWN_PREPARE_FROZEN:
+ rdmsr_on_cpu(cpu, MSR_K7_HWCR, &lo, &hi);
+ lo &= ~BIT(25);
+ wrmsr_on_cpu(cpu, MSR_K7_HWCR, lo, hi);
+ break;
+
+ default:
+ break;
+ }
+
+ return NOTIFY_OK;
+}
+
+static struct notifier_block cpb_nb = {
+ .notifier_call = cpb_notify,
+};
+
/* driver entry point for init */
static int __cpuinit powernowk8_init(void)
{
- unsigned int i, supported_cpus = 0;
+ unsigned int i, supported_cpus = 0, cpu;
for_each_online_cpu(i) {
int rc;
@@ -1423,15 +1545,36 @@ static int __cpuinit powernowk8_init(void)
supported_cpus++;
}
- if (supported_cpus == num_online_cpus()) {
- printk(KERN_INFO PFX "Found %d %s "
- "processors (%d cpu cores) (" VERSION ")\n",
- num_online_nodes(),
- boot_cpu_data.x86_model_id, supported_cpus);
- return cpufreq_register_driver(&cpufreq_amd64_driver);
+ if (supported_cpus != num_online_cpus())
+ return -ENODEV;
+
+ printk(KERN_INFO PFX "Found %d %s (%d cpu cores) (" VERSION ")\n",
+ num_online_nodes(), boot_cpu_data.x86_model_id, supported_cpus);
+
+ if (boot_cpu_has(X86_FEATURE_CPB)) {
+
+ cpb_capable = true;
+
+ register_cpu_notifier(&cpb_nb);
+
+ msrs = msrs_alloc();
+ if (!msrs) {
+ printk(KERN_ERR "%s: Error allocating msrs!\n", __func__);
+ return -ENOMEM;
+ }
+
+ rdmsr_on_cpus(cpu_online_mask, MSR_K7_HWCR, msrs);
+
+ for_each_cpu(cpu, cpu_online_mask) {
+ struct msr *reg = per_cpu_ptr(msrs, cpu);
+ cpb_enabled |= !(!!(reg->l & BIT(25)));
+ }
+
+ printk(KERN_INFO PFX "Core Performance Boosting: %s.\n",
+ (cpb_enabled ? "on" : "off"));
}
- return -ENODEV;
+ return cpufreq_register_driver(&cpufreq_amd64_driver);
}
/* driver entry point for term */
@@ -1439,6 +1582,13 @@ static void __exit powernowk8_exit(void)
{
dprintk("exit\n");
+ if (boot_cpu_has(X86_FEATURE_CPB)) {
+ msrs_free(msrs);
+ msrs = NULL;
+
+ unregister_cpu_notifier(&cpb_nb);
+ }
+
cpufreq_unregister_driver(&cpufreq_amd64_driver);
}
diff --git a/arch/x86/kernel/cpu/cpufreq/powernow-k8.h b/arch/x86/kernel/cpu/cpufreq/powernow-k8.h
index 02ce824073cb..df3529b1c02d 100644
--- a/arch/x86/kernel/cpu/cpufreq/powernow-k8.h
+++ b/arch/x86/kernel/cpu/cpufreq/powernow-k8.h
@@ -5,7 +5,6 @@
* http://www.gnu.org/licenses/gpl.html
*/
-
enum pstate {
HW_PSTATE_INVALID = 0xff,
HW_PSTATE_0 = 0,
@@ -55,7 +54,6 @@ struct powernow_k8_data {
struct cpumask *available_cores;
};
-
/* processor's cpuid instruction support */
#define CPUID_PROCESSOR_SIGNATURE 1 /* function 1 */
#define CPUID_XFAM 0x0ff00000 /* extended family */
diff --git a/arch/x86/kernel/cpu/hypervisor.c b/arch/x86/kernel/cpu/hypervisor.c
index 08be922de33a..8095f8611f8a 100644
--- a/arch/x86/kernel/cpu/hypervisor.c
+++ b/arch/x86/kernel/cpu/hypervisor.c
@@ -21,37 +21,58 @@
*
*/
+#include <linux/module.h>
#include <asm/processor.h>
-#include <asm/vmware.h>
#include <asm/hypervisor.h>
-static inline void __cpuinit
-detect_hypervisor_vendor(struct cpuinfo_x86 *c)
+/*
+ * Hypervisor detect order. This is specified explicitly here because
+ * some hypervisors might implement compatibility modes for other
+ * hypervisors and therefore need to be detected in specific sequence.
+ */
+static const __initconst struct hypervisor_x86 * const hypervisors[] =
{
- if (vmware_platform())
- c->x86_hyper_vendor = X86_HYPER_VENDOR_VMWARE;
- else
- c->x86_hyper_vendor = X86_HYPER_VENDOR_NONE;
-}
+ &x86_hyper_vmware,
+ &x86_hyper_ms_hyperv,
+#ifdef CONFIG_XEN_PVHVM
+ &x86_hyper_xen_hvm,
+#endif
+};
-static inline void __cpuinit
-hypervisor_set_feature_bits(struct cpuinfo_x86 *c)
+const struct hypervisor_x86 *x86_hyper;
+EXPORT_SYMBOL(x86_hyper);
+
+static inline void __init
+detect_hypervisor_vendor(void)
{
- if (boot_cpu_data.x86_hyper_vendor == X86_HYPER_VENDOR_VMWARE) {
- vmware_set_feature_bits(c);
- return;
+ const struct hypervisor_x86 *h, * const *p;
+
+ for (p = hypervisors; p < hypervisors + ARRAY_SIZE(hypervisors); p++) {
+ h = *p;
+ if (h->detect()) {
+ x86_hyper = h;
+ printk(KERN_INFO "Hypervisor detected: %s\n", h->name);
+ break;
+ }
}
}
void __cpuinit init_hypervisor(struct cpuinfo_x86 *c)
{
- detect_hypervisor_vendor(c);
- hypervisor_set_feature_bits(c);
+ if (x86_hyper && x86_hyper->set_cpu_features)
+ x86_hyper->set_cpu_features(c);
}
void __init init_hypervisor_platform(void)
{
+
+ detect_hypervisor_vendor();
+
+ if (!x86_hyper)
+ return;
+
init_hypervisor(&boot_cpu_data);
- if (boot_cpu_data.x86_hyper_vendor == X86_HYPER_VENDOR_VMWARE)
- vmware_platform_setup();
+
+ if (x86_hyper->init_platform)
+ x86_hyper->init_platform();
}
diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c
index 1366c7cfd483..85f69cdeae10 100644
--- a/arch/x86/kernel/cpu/intel.c
+++ b/arch/x86/kernel/cpu/intel.c
@@ -12,7 +12,6 @@
#include <asm/processor.h>
#include <asm/pgtable.h>
#include <asm/msr.h>
-#include <asm/ds.h>
#include <asm/bugs.h>
#include <asm/cpu.h>
@@ -373,12 +372,6 @@ static void __cpuinit init_intel(struct cpuinfo_x86 *c)
set_cpu_cap(c, X86_FEATURE_ARCH_PERFMON);
}
- if (c->cpuid_level > 6) {
- unsigned ecx = cpuid_ecx(6);
- if (ecx & 0x01)
- set_cpu_cap(c, X86_FEATURE_APERFMPERF);
- }
-
if (cpu_has_xmm2)
set_cpu_cap(c, X86_FEATURE_LFENCE_RDTSC);
if (cpu_has_ds) {
@@ -388,7 +381,6 @@ static void __cpuinit init_intel(struct cpuinfo_x86 *c)
set_cpu_cap(c, X86_FEATURE_BTS);
if (!(l1 & (1<<12)))
set_cpu_cap(c, X86_FEATURE_PEBS);
- ds_init_intel(c);
}
if (c->x86 == 6 && c->x86_model == 29 && cpu_has_clflush)
diff --git a/arch/x86/kernel/cpu/intel_cacheinfo.c b/arch/x86/kernel/cpu/intel_cacheinfo.c
index 95962a93f99a..898c2f4eab88 100644
--- a/arch/x86/kernel/cpu/intel_cacheinfo.c
+++ b/arch/x86/kernel/cpu/intel_cacheinfo.c
@@ -148,13 +148,19 @@ union _cpuid4_leaf_ecx {
u32 full;
};
+struct amd_l3_cache {
+ struct pci_dev *dev;
+ bool can_disable;
+ unsigned indices;
+ u8 subcaches[4];
+};
+
struct _cpuid4_info {
union _cpuid4_leaf_eax eax;
union _cpuid4_leaf_ebx ebx;
union _cpuid4_leaf_ecx ecx;
unsigned long size;
- bool can_disable;
- unsigned int l3_indices;
+ struct amd_l3_cache *l3;
DECLARE_BITMAP(shared_cpu_map, NR_CPUS);
};
@@ -164,8 +170,7 @@ struct _cpuid4_info_regs {
union _cpuid4_leaf_ebx ebx;
union _cpuid4_leaf_ecx ecx;
unsigned long size;
- bool can_disable;
- unsigned int l3_indices;
+ struct amd_l3_cache *l3;
};
unsigned short num_cache_leaves;
@@ -302,124 +307,246 @@ struct _cache_attr {
};
#ifdef CONFIG_CPU_SUP_AMD
-static unsigned int __cpuinit amd_calc_l3_indices(void)
+
+/*
+ * L3 cache descriptors
+ */
+static struct amd_l3_cache **__cpuinitdata l3_caches;
+
+static void __cpuinit amd_calc_l3_indices(struct amd_l3_cache *l3)
{
- /*
- * We're called over smp_call_function_single() and therefore
- * are on the correct cpu.
- */
- int cpu = smp_processor_id();
- int node = cpu_to_node(cpu);
- struct pci_dev *dev = node_to_k8_nb_misc(node);
unsigned int sc0, sc1, sc2, sc3;
u32 val = 0;
- pci_read_config_dword(dev, 0x1C4, &val);
+ pci_read_config_dword(l3->dev, 0x1C4, &val);
/* calculate subcache sizes */
- sc0 = !(val & BIT(0));
- sc1 = !(val & BIT(4));
- sc2 = !(val & BIT(8)) + !(val & BIT(9));
- sc3 = !(val & BIT(12)) + !(val & BIT(13));
+ l3->subcaches[0] = sc0 = !(val & BIT(0));
+ l3->subcaches[1] = sc1 = !(val & BIT(4));
+ l3->subcaches[2] = sc2 = !(val & BIT(8)) + !(val & BIT(9));
+ l3->subcaches[3] = sc3 = !(val & BIT(12)) + !(val & BIT(13));
- return (max(max(max(sc0, sc1), sc2), sc3) << 10) - 1;
+ l3->indices = (max(max(max(sc0, sc1), sc2), sc3) << 10) - 1;
}
-static void __cpuinit
-amd_check_l3_disable(int index, struct _cpuid4_info_regs *this_leaf)
+static struct amd_l3_cache * __cpuinit amd_init_l3_cache(int node)
{
- if (index < 3)
+ struct amd_l3_cache *l3;
+ struct pci_dev *dev = node_to_k8_nb_misc(node);
+
+ l3 = kzalloc(sizeof(struct amd_l3_cache), GFP_ATOMIC);
+ if (!l3) {
+ printk(KERN_WARNING "Error allocating L3 struct\n");
+ return NULL;
+ }
+
+ l3->dev = dev;
+
+ amd_calc_l3_indices(l3);
+
+ return l3;
+}
+
+static void __cpuinit amd_check_l3_disable(struct _cpuid4_info_regs *this_leaf,
+ int index)
+{
+ int node;
+
+ if (boot_cpu_data.x86 != 0x10)
return;
- if (boot_cpu_data.x86 == 0x11)
+ if (index < 3)
return;
/* see errata #382 and #388 */
- if ((boot_cpu_data.x86 == 0x10) &&
- ((boot_cpu_data.x86_model < 0x8) ||
- (boot_cpu_data.x86_mask < 0x1)))
+ if (boot_cpu_data.x86_model < 0x8)
return;
+ if ((boot_cpu_data.x86_model == 0x8 ||
+ boot_cpu_data.x86_model == 0x9)
+ &&
+ boot_cpu_data.x86_mask < 0x1)
+ return;
+
/* not in virtualized environments */
if (num_k8_northbridges == 0)
return;
- this_leaf->can_disable = true;
- this_leaf->l3_indices = amd_calc_l3_indices();
+ /*
+ * Strictly speaking, the amount in @size below is leaked since it is
+ * never freed but this is done only on shutdown so it doesn't matter.
+ */
+ if (!l3_caches) {
+ int size = num_k8_northbridges * sizeof(struct amd_l3_cache *);
+
+ l3_caches = kzalloc(size, GFP_ATOMIC);
+ if (!l3_caches)
+ return;
+ }
+
+ node = amd_get_nb_id(smp_processor_id());
+
+ if (!l3_caches[node]) {
+ l3_caches[node] = amd_init_l3_cache(node);
+ l3_caches[node]->can_disable = true;
+ }
+
+ WARN_ON(!l3_caches[node]);
+
+ this_leaf->l3 = l3_caches[node];
}
-static ssize_t show_cache_disable(struct _cpuid4_info *this_leaf, char *buf,
- unsigned int index)
+/*
+ * check whether a slot used for disabling an L3 index is occupied.
+ * @l3: L3 cache descriptor
+ * @slot: slot number (0..1)
+ *
+ * @returns: the disabled index if used or negative value if slot free.
+ */
+int amd_get_l3_disable_slot(struct amd_l3_cache *l3, unsigned slot)
{
- int cpu = cpumask_first(to_cpumask(this_leaf->shared_cpu_map));
- int node = amd_get_nb_id(cpu);
- struct pci_dev *dev = node_to_k8_nb_misc(node);
unsigned int reg = 0;
- if (!this_leaf->can_disable)
- return -EINVAL;
+ pci_read_config_dword(l3->dev, 0x1BC + slot * 4, &reg);
+
+ /* check whether this slot is activated already */
+ if (reg & (3UL << 30))
+ return reg & 0xfff;
- if (!dev)
+ return -1;
+}
+
+static ssize_t show_cache_disable(struct _cpuid4_info *this_leaf, char *buf,
+ unsigned int slot)
+{
+ int index;
+
+ if (!this_leaf->l3 || !this_leaf->l3->can_disable)
return -EINVAL;
- pci_read_config_dword(dev, 0x1BC + index * 4, &reg);
- return sprintf(buf, "0x%08x\n", reg);
+ index = amd_get_l3_disable_slot(this_leaf->l3, slot);
+ if (index >= 0)
+ return sprintf(buf, "%d\n", index);
+
+ return sprintf(buf, "FREE\n");
}
-#define SHOW_CACHE_DISABLE(index) \
+#define SHOW_CACHE_DISABLE(slot) \
static ssize_t \
-show_cache_disable_##index(struct _cpuid4_info *this_leaf, char *buf) \
+show_cache_disable_##slot(struct _cpuid4_info *this_leaf, char *buf) \
{ \
- return show_cache_disable(this_leaf, buf, index); \
+ return show_cache_disable(this_leaf, buf, slot); \
}
SHOW_CACHE_DISABLE(0)
SHOW_CACHE_DISABLE(1)
-static ssize_t store_cache_disable(struct _cpuid4_info *this_leaf,
- const char *buf, size_t count, unsigned int index)
+static void amd_l3_disable_index(struct amd_l3_cache *l3, int cpu,
+ unsigned slot, unsigned long idx)
{
- int cpu = cpumask_first(to_cpumask(this_leaf->shared_cpu_map));
- int node = amd_get_nb_id(cpu);
- struct pci_dev *dev = node_to_k8_nb_misc(node);
- unsigned long val = 0;
+ int i;
+
+ idx |= BIT(30);
+
+ /*
+ * disable index in all 4 subcaches
+ */
+ for (i = 0; i < 4; i++) {
+ u32 reg = idx | (i << 20);
+
+ if (!l3->subcaches[i])
+ continue;
+
+ pci_write_config_dword(l3->dev, 0x1BC + slot * 4, reg);
+
+ /*
+ * We need to WBINVD on a core on the node containing the L3
+ * cache which indices we disable therefore a simple wbinvd()
+ * is not sufficient.
+ */
+ wbinvd_on_cpu(cpu);
+
+ reg |= BIT(31);
+ pci_write_config_dword(l3->dev, 0x1BC + slot * 4, reg);
+ }
+}
+
+/*
+ * disable a L3 cache index by using a disable-slot
+ *
+ * @l3: L3 cache descriptor
+ * @cpu: A CPU on the node containing the L3 cache
+ * @slot: slot number (0..1)
+ * @index: index to disable
+ *
+ * @return: 0 on success, error status on failure
+ */
+int amd_set_l3_disable_slot(struct amd_l3_cache *l3, int cpu, unsigned slot,
+ unsigned long index)
+{
+ int ret = 0;
#define SUBCACHE_MASK (3UL << 20)
#define SUBCACHE_INDEX 0xfff
- if (!this_leaf->can_disable)
+ /*
+ * check whether this slot is already used or
+ * the index is already disabled
+ */
+ ret = amd_get_l3_disable_slot(l3, slot);
+ if (ret >= 0)
return -EINVAL;
+ /*
+ * check whether the other slot has disabled the
+ * same index already
+ */
+ if (index == amd_get_l3_disable_slot(l3, !slot))
+ return -EINVAL;
+
+ /* do not allow writes outside of allowed bits */
+ if ((index & ~(SUBCACHE_MASK | SUBCACHE_INDEX)) ||
+ ((index & SUBCACHE_INDEX) > l3->indices))
+ return -EINVAL;
+
+ amd_l3_disable_index(l3, cpu, slot, index);
+
+ return 0;
+}
+
+static ssize_t store_cache_disable(struct _cpuid4_info *this_leaf,
+ const char *buf, size_t count,
+ unsigned int slot)
+{
+ unsigned long val = 0;
+ int cpu, err = 0;
+
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
- if (!dev)
+ if (!this_leaf->l3 || !this_leaf->l3->can_disable)
return -EINVAL;
- if (strict_strtoul(buf, 10, &val) < 0)
- return -EINVAL;
+ cpu = cpumask_first(to_cpumask(this_leaf->shared_cpu_map));
- /* do not allow writes outside of allowed bits */
- if ((val & ~(SUBCACHE_MASK | SUBCACHE_INDEX)) ||
- ((val & SUBCACHE_INDEX) > this_leaf->l3_indices))
+ if (strict_strtoul(buf, 10, &val) < 0)
return -EINVAL;
- val |= BIT(30);
- pci_write_config_dword(dev, 0x1BC + index * 4, val);
- /*
- * We need to WBINVD on a core on the node containing the L3 cache which
- * indices we disable therefore a simple wbinvd() is not sufficient.
- */
- wbinvd_on_cpu(cpu);
- pci_write_config_dword(dev, 0x1BC + index * 4, val | BIT(31));
+ err = amd_set_l3_disable_slot(this_leaf->l3, cpu, slot, val);
+ if (err) {
+ if (err == -EEXIST)
+ printk(KERN_WARNING "L3 disable slot %d in use!\n",
+ slot);
+ return err;
+ }
return count;
}
-#define STORE_CACHE_DISABLE(index) \
+#define STORE_CACHE_DISABLE(slot) \
static ssize_t \
-store_cache_disable_##index(struct _cpuid4_info *this_leaf, \
+store_cache_disable_##slot(struct _cpuid4_info *this_leaf, \
const char *buf, size_t count) \
{ \
- return store_cache_disable(this_leaf, buf, count, index); \
+ return store_cache_disable(this_leaf, buf, count, slot); \
}
STORE_CACHE_DISABLE(0)
STORE_CACHE_DISABLE(1)
@@ -431,7 +558,7 @@ static struct _cache_attr cache_disable_1 = __ATTR(cache_disable_1, 0644,
#else /* CONFIG_CPU_SUP_AMD */
static void __cpuinit
-amd_check_l3_disable(int index, struct _cpuid4_info_regs *this_leaf)
+amd_check_l3_disable(struct _cpuid4_info_regs *this_leaf, int index)
{
};
#endif /* CONFIG_CPU_SUP_AMD */
@@ -447,8 +574,7 @@ __cpuinit cpuid4_cache_lookup_regs(int index,
if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD) {
amd_cpuid4(index, &eax, &ebx, &ecx);
- if (boot_cpu_data.x86 >= 0x10)
- amd_check_l3_disable(index, this_leaf);
+ amd_check_l3_disable(this_leaf, index);
} else {
cpuid_count(4, index, &eax.full, &ebx.full, &ecx.full, &edx);
}
@@ -705,6 +831,7 @@ static void __cpuinit free_cache_attributes(unsigned int cpu)
for (i = 0; i < num_cache_leaves; i++)
cache_remove_shared_cpu_map(cpu, i);
+ kfree(per_cpu(ici_cpuid4_info, cpu)->l3);
kfree(per_cpu(ici_cpuid4_info, cpu));
per_cpu(ici_cpuid4_info, cpu) = NULL;
}
@@ -989,7 +1116,7 @@ static int __cpuinit cache_add_dev(struct sys_device * sys_dev)
this_leaf = CPUID4_INFO_IDX(cpu, i);
- if (this_leaf->can_disable)
+ if (this_leaf->l3 && this_leaf->l3->can_disable)
ktype_cache.default_attrs = default_l3_attrs;
else
ktype_cache.default_attrs = default_attrs;
diff --git a/arch/x86/kernel/cpu/mcheck/Makefile b/arch/x86/kernel/cpu/mcheck/Makefile
index 4ac6d48fe11b..bb34b03af252 100644
--- a/arch/x86/kernel/cpu/mcheck/Makefile
+++ b/arch/x86/kernel/cpu/mcheck/Makefile
@@ -7,3 +7,5 @@ obj-$(CONFIG_X86_MCE_THRESHOLD) += threshold.o
obj-$(CONFIG_X86_MCE_INJECT) += mce-inject.o
obj-$(CONFIG_X86_THERMAL_VECTOR) += therm_throt.o
+
+obj-$(CONFIG_ACPI_APEI) += mce-apei.o
diff --git a/arch/x86/kernel/cpu/mcheck/mce-apei.c b/arch/x86/kernel/cpu/mcheck/mce-apei.c
new file mode 100644
index 000000000000..745b54f9be89
--- /dev/null
+++ b/arch/x86/kernel/cpu/mcheck/mce-apei.c
@@ -0,0 +1,138 @@
+/*
+ * Bridge between MCE and APEI
+ *
+ * On some machine, corrected memory errors are reported via APEI
+ * generic hardware error source (GHES) instead of corrected Machine
+ * Check. These corrected memory errors can be reported to user space
+ * through /dev/mcelog via faking a corrected Machine Check, so that
+ * the error memory page can be offlined by /sbin/mcelog if the error
+ * count for one page is beyond the threshold.
+ *
+ * For fatal MCE, save MCE record into persistent storage via ERST, so
+ * that the MCE record can be logged after reboot via ERST.
+ *
+ * Copyright 2010 Intel Corp.
+ * Author: Huang Ying <ying.huang@intel.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License version
+ * 2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+
+#include <linux/kernel.h>
+#include <linux/acpi.h>
+#include <linux/cper.h>
+#include <acpi/apei.h>
+#include <asm/mce.h>
+
+#include "mce-internal.h"
+
+void apei_mce_report_mem_error(int corrected, struct cper_sec_mem_err *mem_err)
+{
+ struct mce m;
+
+ /* Only corrected MC is reported */
+ if (!corrected)
+ return;
+
+ mce_setup(&m);
+ m.bank = 1;
+ /* Fake a memory read corrected error with unknown channel */
+ m.status = MCI_STATUS_VAL | MCI_STATUS_EN | MCI_STATUS_ADDRV | 0x9f;
+ m.addr = mem_err->physical_addr;
+ mce_log(&m);
+ mce_notify_irq();
+}
+EXPORT_SYMBOL_GPL(apei_mce_report_mem_error);
+
+#define CPER_CREATOR_MCE \
+ UUID_LE(0x75a574e3, 0x5052, 0x4b29, 0x8a, 0x8e, 0xbe, 0x2c, \
+ 0x64, 0x90, 0xb8, 0x9d)
+#define CPER_SECTION_TYPE_MCE \
+ UUID_LE(0xfe08ffbe, 0x95e4, 0x4be7, 0xbc, 0x73, 0x40, 0x96, \
+ 0x04, 0x4a, 0x38, 0xfc)
+
+/*
+ * CPER specification (in UEFI specification 2.3 appendix N) requires
+ * byte-packed.
+ */
+struct cper_mce_record {
+ struct cper_record_header hdr;
+ struct cper_section_descriptor sec_hdr;
+ struct mce mce;
+} __packed;
+
+int apei_write_mce(struct mce *m)
+{
+ struct cper_mce_record rcd;
+
+ memset(&rcd, 0, sizeof(rcd));
+ memcpy(rcd.hdr.signature, CPER_SIG_RECORD, CPER_SIG_SIZE);
+ rcd.hdr.revision = CPER_RECORD_REV;
+ rcd.hdr.signature_end = CPER_SIG_END;
+ rcd.hdr.section_count = 1;
+ rcd.hdr.error_severity = CPER_SER_FATAL;
+ /* timestamp, platform_id, partition_id are all invalid */
+ rcd.hdr.validation_bits = 0;
+ rcd.hdr.record_length = sizeof(rcd);
+ rcd.hdr.creator_id = CPER_CREATOR_MCE;
+ rcd.hdr.notification_type = CPER_NOTIFY_MCE;
+ rcd.hdr.record_id = cper_next_record_id();
+ rcd.hdr.flags = CPER_HW_ERROR_FLAGS_PREVERR;
+
+ rcd.sec_hdr.section_offset = (void *)&rcd.mce - (void *)&rcd;
+ rcd.sec_hdr.section_length = sizeof(rcd.mce);
+ rcd.sec_hdr.revision = CPER_SEC_REV;
+ /* fru_id and fru_text is invalid */
+ rcd.sec_hdr.validation_bits = 0;
+ rcd.sec_hdr.flags = CPER_SEC_PRIMARY;
+ rcd.sec_hdr.section_type = CPER_SECTION_TYPE_MCE;
+ rcd.sec_hdr.section_severity = CPER_SER_FATAL;
+
+ memcpy(&rcd.mce, m, sizeof(*m));
+
+ return erst_write(&rcd.hdr);
+}
+
+ssize_t apei_read_mce(struct mce *m, u64 *record_id)
+{
+ struct cper_mce_record rcd;
+ ssize_t len;
+
+ len = erst_read_next(&rcd.hdr, sizeof(rcd));
+ if (len <= 0)
+ return len;
+ /* Can not skip other records in storage via ERST unless clear them */
+ else if (len != sizeof(rcd) ||
+ uuid_le_cmp(rcd.hdr.creator_id, CPER_CREATOR_MCE)) {
+ if (printk_ratelimit())
+ pr_warning(
+ "MCE-APEI: Can not skip the unknown record in ERST");
+ return -EIO;
+ }
+
+ memcpy(m, &rcd.mce, sizeof(*m));
+ *record_id = rcd.hdr.record_id;
+
+ return sizeof(*m);
+}
+
+/* Check whether there is record in ERST */
+int apei_check_mce(void)
+{
+ return erst_get_record_count();
+}
+
+int apei_clear_mce(u64 record_id)
+{
+ return erst_clear(record_id);
+}
diff --git a/arch/x86/kernel/cpu/mcheck/mce-internal.h b/arch/x86/kernel/cpu/mcheck/mce-internal.h
index 32996f9fab67..fefcc69ee8b5 100644
--- a/arch/x86/kernel/cpu/mcheck/mce-internal.h
+++ b/arch/x86/kernel/cpu/mcheck/mce-internal.h
@@ -28,3 +28,26 @@ extern int mce_ser;
extern struct mce_bank *mce_banks;
+#ifdef CONFIG_ACPI_APEI
+int apei_write_mce(struct mce *m);
+ssize_t apei_read_mce(struct mce *m, u64 *record_id);
+int apei_check_mce(void);
+int apei_clear_mce(u64 record_id);
+#else
+static inline int apei_write_mce(struct mce *m)
+{
+ return -EINVAL;
+}
+static inline ssize_t apei_read_mce(struct mce *m, u64 *record_id)
+{
+ return 0;
+}
+static inline int apei_check_mce(void)
+{
+ return 0;
+}
+static inline int apei_clear_mce(u64 record_id)
+{
+ return -EINVAL;
+}
+#endif
diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c
index 8a6f0afa767e..ed41562909fe 100644
--- a/arch/x86/kernel/cpu/mcheck/mce.c
+++ b/arch/x86/kernel/cpu/mcheck/mce.c
@@ -36,6 +36,7 @@
#include <linux/fs.h>
#include <linux/mm.h>
#include <linux/debugfs.h>
+#include <linux/edac_mce.h>
#include <asm/processor.h>
#include <asm/hw_irq.h>
@@ -50,7 +51,7 @@
static DEFINE_MUTEX(mce_read_mutex);
#define rcu_dereference_check_mce(p) \
- rcu_dereference_check((p), \
+ rcu_dereference_index_check((p), \
rcu_read_lock_sched_held() || \
lockdep_is_held(&mce_read_mutex))
@@ -106,8 +107,8 @@ EXPORT_SYMBOL_GPL(x86_mce_decoder_chain);
static int default_decode_mce(struct notifier_block *nb, unsigned long val,
void *data)
{
- pr_emerg("No human readable MCE decoding support on this CPU type.\n");
- pr_emerg("Run the message through 'mcelog --ascii' to decode.\n");
+ pr_emerg(HW_ERR "No human readable MCE decoding support on this CPU type.\n");
+ pr_emerg(HW_ERR "Run the message through 'mcelog --ascii' to decode.\n");
return NOTIFY_STOP;
}
@@ -169,6 +170,15 @@ void mce_log(struct mce *mce)
entry = rcu_dereference_check_mce(mcelog.next);
for (;;) {
/*
+ * If edac_mce is enabled, it will check the error type
+ * and will process it, if it is a known error.
+ * Otherwise, the error will be sent through mcelog
+ * interface
+ */
+ if (edac_mce_parse(mce))
+ return;
+
+ /*
* When the buffer fills up discard new entries.
* Assume that the earlier errors are the more
* interesting ones:
@@ -201,11 +211,11 @@ void mce_log(struct mce *mce)
static void print_mce(struct mce *m)
{
- pr_emerg("CPU %d: Machine Check Exception: %16Lx Bank %d: %016Lx\n",
+ pr_emerg(HW_ERR "CPU %d: Machine Check Exception: %Lx Bank %d: %016Lx\n",
m->extcpu, m->mcgstatus, m->bank, m->status);
if (m->ip) {
- pr_emerg("RIP%s %02x:<%016Lx> ",
+ pr_emerg(HW_ERR "RIP%s %02x:<%016Lx> ",
!(m->mcgstatus & MCG_STATUS_EIPV) ? " !INEXACT!" : "",
m->cs, m->ip);
@@ -214,14 +224,14 @@ static void print_mce(struct mce *m)
pr_cont("\n");
}
- pr_emerg("TSC %llx ", m->tsc);
+ pr_emerg(HW_ERR "TSC %llx ", m->tsc);
if (m->addr)
pr_cont("ADDR %llx ", m->addr);
if (m->misc)
pr_cont("MISC %llx ", m->misc);
pr_cont("\n");
- pr_emerg("PROCESSOR %u:%x TIME %llu SOCKET %u APIC %x\n",
+ pr_emerg(HW_ERR "PROCESSOR %u:%x TIME %llu SOCKET %u APIC %x\n",
m->cpuvendor, m->cpuid, m->time, m->socketid, m->apicid);
/*
@@ -231,16 +241,6 @@ static void print_mce(struct mce *m)
atomic_notifier_call_chain(&x86_mce_decoder_chain, 0, m);
}
-static void print_mce_head(void)
-{
- pr_emerg("\nHARDWARE ERROR\n");
-}
-
-static void print_mce_tail(void)
-{
- pr_emerg("This is not a software problem!\n");
-}
-
#define PANIC_TIMEOUT 5 /* 5 seconds */
static atomic_t mce_paniced;
@@ -264,7 +264,7 @@ static void wait_for_panic(void)
static void mce_panic(char *msg, struct mce *final, char *exp)
{
- int i;
+ int i, apei_err = 0;
if (!fake_panic) {
/*
@@ -281,14 +281,16 @@ static void mce_panic(char *msg, struct mce *final, char *exp)
if (atomic_inc_return(&mce_fake_paniced) > 1)
return;
}
- print_mce_head();
/* First print corrected ones that are still unlogged */
for (i = 0; i < MCE_LOG_LEN; i++) {
struct mce *m = &mcelog.entry[i];
if (!(m->status & MCI_STATUS_VAL))
continue;
- if (!(m->status & MCI_STATUS_UC))
+ if (!(m->status & MCI_STATUS_UC)) {
print_mce(m);
+ if (!apei_err)
+ apei_err = apei_write_mce(m);
+ }
}
/* Now print uncorrected but with the final one last */
for (i = 0; i < MCE_LOG_LEN; i++) {
@@ -297,22 +299,27 @@ static void mce_panic(char *msg, struct mce *final, char *exp)
continue;
if (!(m->status & MCI_STATUS_UC))
continue;
- if (!final || memcmp(m, final, sizeof(struct mce)))
+ if (!final || memcmp(m, final, sizeof(struct mce))) {
print_mce(m);
+ if (!apei_err)
+ apei_err = apei_write_mce(m);
+ }
}
- if (final)
+ if (final) {
print_mce(final);
+ if (!apei_err)
+ apei_err = apei_write_mce(final);
+ }
if (cpu_missing)
- printk(KERN_EMERG "Some CPUs didn't answer in synchronization\n");
- print_mce_tail();
+ pr_emerg(HW_ERR "Some CPUs didn't answer in synchronization\n");
if (exp)
- printk(KERN_EMERG "Machine check: %s\n", exp);
+ pr_emerg(HW_ERR "Machine check: %s\n", exp);
if (!fake_panic) {
if (panic_timeout == 0)
panic_timeout = mce_panic_timeout;
panic(msg);
} else
- printk(KERN_EMERG "Fake kernel panic: %s\n", msg);
+ pr_emerg(HW_ERR "Fake kernel panic: %s\n", msg);
}
/* Support code for software error injection */
@@ -539,7 +546,7 @@ void machine_check_poll(enum mcp_flags flags, mce_banks_t *b)
struct mce m;
int i;
- __get_cpu_var(mce_poll_count)++;
+ percpu_inc(mce_poll_count);
mce_setup(&m);
@@ -581,6 +588,7 @@ void machine_check_poll(enum mcp_flags flags, mce_banks_t *b)
*/
if (!(flags & MCP_DONTLOG) && !mce_dont_log_ce) {
mce_log(&m);
+ atomic_notifier_call_chain(&x86_mce_decoder_chain, 0, &m);
add_taint(TAINT_MACHINE_CHECK);
}
@@ -934,7 +942,7 @@ void do_machine_check(struct pt_regs *regs, long error_code)
atomic_inc(&mce_entry);
- __get_cpu_var(mce_exception_count)++;
+ percpu_inc(mce_exception_count);
if (notify_die(DIE_NMI, "machine check", regs, error_code,
18, SIGKILL) == NOTIFY_STOP)
@@ -1201,7 +1209,7 @@ int mce_notify_irq(void)
schedule_work(&mce_trigger_work);
if (__ratelimit(&ratelimit))
- printk(KERN_INFO "Machine check events logged\n");
+ pr_info(HW_ERR "Machine check events logged\n");
return 1;
}
@@ -1493,6 +1501,43 @@ static void collect_tscs(void *data)
rdtscll(cpu_tsc[smp_processor_id()]);
}
+static int mce_apei_read_done;
+
+/* Collect MCE record of previous boot in persistent storage via APEI ERST. */
+static int __mce_read_apei(char __user **ubuf, size_t usize)
+{
+ int rc;
+ u64 record_id;
+ struct mce m;
+
+ if (usize < sizeof(struct mce))
+ return -EINVAL;
+
+ rc = apei_read_mce(&m, &record_id);
+ /* Error or no more MCE record */
+ if (rc <= 0) {
+ mce_apei_read_done = 1;
+ return rc;
+ }
+ rc = -EFAULT;
+ if (copy_to_user(*ubuf, &m, sizeof(struct mce)))
+ return rc;
+ /*
+ * In fact, we should have cleared the record after that has
+ * been flushed to the disk or sent to network in
+ * /sbin/mcelog, but we have no interface to support that now,
+ * so just clear it to avoid duplication.
+ */
+ rc = apei_clear_mce(record_id);
+ if (rc) {
+ mce_apei_read_done = 1;
+ return rc;
+ }
+ *ubuf += sizeof(struct mce);
+
+ return 0;
+}
+
static ssize_t mce_read(struct file *filp, char __user *ubuf, size_t usize,
loff_t *off)
{
@@ -1506,15 +1551,19 @@ static ssize_t mce_read(struct file *filp, char __user *ubuf, size_t usize,
return -ENOMEM;
mutex_lock(&mce_read_mutex);
+
+ if (!mce_apei_read_done) {
+ err = __mce_read_apei(&buf, usize);
+ if (err || buf != ubuf)
+ goto out;
+ }
+
next = rcu_dereference_check_mce(mcelog.next);
/* Only supports full reads right now */
- if (*off != 0 || usize < MCE_LOG_LEN*sizeof(struct mce)) {
- mutex_unlock(&mce_read_mutex);
- kfree(cpu_tsc);
-
- return -EINVAL;
- }
+ err = -EINVAL;
+ if (*off != 0 || usize < MCE_LOG_LEN*sizeof(struct mce))
+ goto out;
err = 0;
prev = 0;
@@ -1562,10 +1611,15 @@ timeout:
memset(&mcelog.entry[i], 0, sizeof(struct mce));
}
}
+
+ if (err)
+ err = -EFAULT;
+
+out:
mutex_unlock(&mce_read_mutex);
kfree(cpu_tsc);
- return err ? -EFAULT : buf - ubuf;
+ return err ? err : buf - ubuf;
}
static unsigned int mce_poll(struct file *file, poll_table *wait)
@@ -1573,6 +1627,8 @@ static unsigned int mce_poll(struct file *file, poll_table *wait)
poll_wait(file, &mce_wait, wait);
if (rcu_dereference_check_mce(mcelog.next))
return POLLIN | POLLRDNORM;
+ if (!mce_apei_read_done && apei_check_mce())
+ return POLLIN | POLLRDNORM;
return 0;
}
diff --git a/arch/x86/kernel/cpu/mcheck/mce_intel.c b/arch/x86/kernel/cpu/mcheck/mce_intel.c
index 62b48e40920a..6fcd0936194f 100644
--- a/arch/x86/kernel/cpu/mcheck/mce_intel.c
+++ b/arch/x86/kernel/cpu/mcheck/mce_intel.c
@@ -95,19 +95,20 @@ static void cmci_discover(int banks, int boot)
rdmsrl(MSR_IA32_MCx_CTL2(i), val);
/* Already owned by someone else? */
- if (val & CMCI_EN) {
+ if (val & MCI_CTL2_CMCI_EN) {
if (test_and_clear_bit(i, owned) && !boot)
print_update("SHD", &hdr, i);
__clear_bit(i, __get_cpu_var(mce_poll_banks));
continue;
}
- val |= CMCI_EN | CMCI_THRESHOLD;
+ val &= ~MCI_CTL2_CMCI_THRESHOLD_MASK;
+ val |= MCI_CTL2_CMCI_EN | CMCI_THRESHOLD;
wrmsrl(MSR_IA32_MCx_CTL2(i), val);
rdmsrl(MSR_IA32_MCx_CTL2(i), val);
/* Did the enable bit stick? -- the bank supports CMCI */
- if (val & CMCI_EN) {
+ if (val & MCI_CTL2_CMCI_EN) {
if (!test_and_set_bit(i, owned) && !boot)
print_update("CMCI", &hdr, i);
__clear_bit(i, __get_cpu_var(mce_poll_banks));
@@ -155,7 +156,7 @@ void cmci_clear(void)
continue;
/* Disable CMCI */
rdmsrl(MSR_IA32_MCx_CTL2(i), val);
- val &= ~(CMCI_EN|CMCI_THRESHOLD_MASK);
+ val &= ~(MCI_CTL2_CMCI_EN|MCI_CTL2_CMCI_THRESHOLD_MASK);
wrmsrl(MSR_IA32_MCx_CTL2(i), val);
__clear_bit(i, __get_cpu_var(mce_banks_owned));
}
diff --git a/arch/x86/kernel/cpu/mcheck/therm_throt.c b/arch/x86/kernel/cpu/mcheck/therm_throt.c
index 81c499eceb21..c2a8b26d4fea 100644
--- a/arch/x86/kernel/cpu/mcheck/therm_throt.c
+++ b/arch/x86/kernel/cpu/mcheck/therm_throt.c
@@ -34,15 +34,25 @@
/* How long to wait between reporting thermal events */
#define CHECK_INTERVAL (300 * HZ)
+#define THERMAL_THROTTLING_EVENT 0
+#define POWER_LIMIT_EVENT 1
+
/*
- * Current thermal throttling state:
+ * Current thermal event state:
*/
-struct thermal_state {
- bool is_throttled;
-
+struct _thermal_state {
+ bool new_event;
+ int event;
u64 next_check;
- unsigned long throttle_count;
- unsigned long last_throttle_count;
+ unsigned long count;
+ unsigned long last_count;
+};
+
+struct thermal_state {
+ struct _thermal_state core_throttle;
+ struct _thermal_state core_power_limit;
+ struct _thermal_state package_throttle;
+ struct _thermal_state package_power_limit;
};
static DEFINE_PER_CPU(struct thermal_state, thermal_state);
@@ -53,11 +63,13 @@ static u32 lvtthmr_init __read_mostly;
#ifdef CONFIG_SYSFS
#define define_therm_throt_sysdev_one_ro(_name) \
- static SYSDEV_ATTR(_name, 0444, therm_throt_sysdev_show_##_name, NULL)
+ static SYSDEV_ATTR(_name, 0444, \
+ therm_throt_sysdev_show_##_name, \
+ NULL) \
-#define define_therm_throt_sysdev_show_func(name) \
+#define define_therm_throt_sysdev_show_func(event, name) \
\
-static ssize_t therm_throt_sysdev_show_##name( \
+static ssize_t therm_throt_sysdev_show_##event##_##name( \
struct sys_device *dev, \
struct sysdev_attribute *attr, \
char *buf) \
@@ -66,30 +78,42 @@ static ssize_t therm_throt_sysdev_show_##name( \
ssize_t ret; \
\
preempt_disable(); /* CPU hotplug */ \
- if (cpu_online(cpu)) \
+ if (cpu_online(cpu)) { \
ret = sprintf(buf, "%lu\n", \
- per_cpu(thermal_state, cpu).name); \
- else \
+ per_cpu(thermal_state, cpu).event.name); \
+ } else \
ret = 0; \
preempt_enable(); \
\
return ret; \
}
-define_therm_throt_sysdev_show_func(throttle_count);
-define_therm_throt_sysdev_one_ro(throttle_count);
+define_therm_throt_sysdev_show_func(core_throttle, count);
+define_therm_throt_sysdev_one_ro(core_throttle_count);
+
+define_therm_throt_sysdev_show_func(core_power_limit, count);
+define_therm_throt_sysdev_one_ro(core_power_limit_count);
+
+define_therm_throt_sysdev_show_func(package_throttle, count);
+define_therm_throt_sysdev_one_ro(package_throttle_count);
+
+define_therm_throt_sysdev_show_func(package_power_limit, count);
+define_therm_throt_sysdev_one_ro(package_power_limit_count);
static struct attribute *thermal_throttle_attrs[] = {
- &attr_throttle_count.attr,
+ &attr_core_throttle_count.attr,
NULL
};
-static struct attribute_group thermal_throttle_attr_group = {
+static struct attribute_group thermal_attr_group = {
.attrs = thermal_throttle_attrs,
.name = "thermal_throttle"
};
#endif /* CONFIG_SYSFS */
+#define CORE_LEVEL 0
+#define PACKAGE_LEVEL 1
+
/***
* therm_throt_process - Process thermal throttling event from interrupt
* @curr: Whether the condition is current or not (boolean), since the
@@ -106,39 +130,70 @@ static struct attribute_group thermal_throttle_attr_group = {
* 1 : Event should be logged further, and a message has been
* printed to the syslog.
*/
-static int therm_throt_process(bool is_throttled)
+static int therm_throt_process(bool new_event, int event, int level)
{
- struct thermal_state *state;
- unsigned int this_cpu;
- bool was_throttled;
+ struct _thermal_state *state;
+ unsigned int this_cpu = smp_processor_id();
+ bool old_event;
u64 now;
+ struct thermal_state *pstate = &per_cpu(thermal_state, this_cpu);
- this_cpu = smp_processor_id();
now = get_jiffies_64();
- state = &per_cpu(thermal_state, this_cpu);
+ if (level == CORE_LEVEL) {
+ if (event == THERMAL_THROTTLING_EVENT)
+ state = &pstate->core_throttle;
+ else if (event == POWER_LIMIT_EVENT)
+ state = &pstate->core_power_limit;
+ else
+ return 0;
+ } else if (level == PACKAGE_LEVEL) {
+ if (event == THERMAL_THROTTLING_EVENT)
+ state = &pstate->package_throttle;
+ else if (event == POWER_LIMIT_EVENT)
+ state = &pstate->package_power_limit;
+ else
+ return 0;
+ } else
+ return 0;
- was_throttled = state->is_throttled;
- state->is_throttled = is_throttled;
+ old_event = state->new_event;
+ state->new_event = new_event;
- if (is_throttled)
- state->throttle_count++;
+ if (new_event)
+ state->count++;
if (time_before64(now, state->next_check) &&
- state->throttle_count != state->last_throttle_count)
+ state->count != state->last_count)
return 0;
state->next_check = now + CHECK_INTERVAL;
- state->last_throttle_count = state->throttle_count;
+ state->last_count = state->count;
/* if we just entered the thermal event */
- if (is_throttled) {
- printk(KERN_CRIT "CPU%d: Temperature above threshold, cpu clock throttled (total events = %lu)\n", this_cpu, state->throttle_count);
+ if (new_event) {
+ if (event == THERMAL_THROTTLING_EVENT)
+ printk(KERN_CRIT "CPU%d: %s temperature above threshold, cpu clock throttled (total events = %lu)\n",
+ this_cpu,
+ level == CORE_LEVEL ? "Core" : "Package",
+ state->count);
+ else
+ printk(KERN_CRIT "CPU%d: %s power limit notification (total events = %lu)\n",
+ this_cpu,
+ level == CORE_LEVEL ? "Core" : "Package",
+ state->count);
add_taint(TAINT_MACHINE_CHECK);
return 1;
}
- if (was_throttled) {
- printk(KERN_INFO "CPU%d: Temperature/speed normal\n", this_cpu);
+ if (old_event) {
+ if (event == THERMAL_THROTTLING_EVENT)
+ printk(KERN_INFO "CPU%d: %s temperature/speed normal\n",
+ this_cpu,
+ level == CORE_LEVEL ? "Core" : "Package");
+ else
+ printk(KERN_INFO "CPU%d: %s power limit normal\n",
+ this_cpu,
+ level == CORE_LEVEL ? "Core" : "Package");
return 1;
}
@@ -149,13 +204,32 @@ static int therm_throt_process(bool is_throttled)
/* Add/Remove thermal_throttle interface for CPU device: */
static __cpuinit int thermal_throttle_add_dev(struct sys_device *sys_dev)
{
- return sysfs_create_group(&sys_dev->kobj,
- &thermal_throttle_attr_group);
+ int err;
+ struct cpuinfo_x86 *c = &cpu_data(smp_processor_id());
+
+ err = sysfs_create_group(&sys_dev->kobj, &thermal_attr_group);
+ if (err)
+ return err;
+
+ if (cpu_has(c, X86_FEATURE_PLN))
+ err = sysfs_add_file_to_group(&sys_dev->kobj,
+ &attr_core_power_limit_count.attr,
+ thermal_attr_group.name);
+ if (cpu_has(c, X86_FEATURE_PTS))
+ err = sysfs_add_file_to_group(&sys_dev->kobj,
+ &attr_package_throttle_count.attr,
+ thermal_attr_group.name);
+ if (cpu_has(c, X86_FEATURE_PLN))
+ err = sysfs_add_file_to_group(&sys_dev->kobj,
+ &attr_package_power_limit_count.attr,
+ thermal_attr_group.name);
+
+ return err;
}
static __cpuinit void thermal_throttle_remove_dev(struct sys_device *sys_dev)
{
- sysfs_remove_group(&sys_dev->kobj, &thermal_throttle_attr_group);
+ sysfs_remove_group(&sys_dev->kobj, &thermal_attr_group);
}
/* Mutex protecting device creation against CPU hotplug: */
@@ -190,7 +264,7 @@ thermal_throttle_cpu_callback(struct notifier_block *nfb,
mutex_unlock(&therm_cpu_lock);
break;
}
- return err ? NOTIFY_BAD : NOTIFY_OK;
+ return notifier_from_errno(err);
}
static struct notifier_block thermal_throttle_cpu_notifier __cpuinitdata =
@@ -226,14 +300,50 @@ device_initcall(thermal_throttle_init_device);
#endif /* CONFIG_SYSFS */
+/*
+ * Set up the most two significant bit to notify mce log that this thermal
+ * event type.
+ * This is a temp solution. May be changed in the future with mce log
+ * infrasture.
+ */
+#define CORE_THROTTLED (0)
+#define CORE_POWER_LIMIT ((__u64)1 << 62)
+#define PACKAGE_THROTTLED ((__u64)2 << 62)
+#define PACKAGE_POWER_LIMIT ((__u64)3 << 62)
+
/* Thermal transition interrupt handler */
static void intel_thermal_interrupt(void)
{
__u64 msr_val;
+ struct cpuinfo_x86 *c = &cpu_data(smp_processor_id());
rdmsrl(MSR_IA32_THERM_STATUS, msr_val);
- if (therm_throt_process((msr_val & THERM_STATUS_PROCHOT) != 0))
- mce_log_therm_throt_event(msr_val);
+
+ if (therm_throt_process(msr_val & THERM_STATUS_PROCHOT,
+ THERMAL_THROTTLING_EVENT,
+ CORE_LEVEL) != 0)
+ mce_log_therm_throt_event(CORE_THROTTLED | msr_val);
+
+ if (cpu_has(c, X86_FEATURE_PLN))
+ if (therm_throt_process(msr_val & THERM_STATUS_POWER_LIMIT,
+ POWER_LIMIT_EVENT,
+ CORE_LEVEL) != 0)
+ mce_log_therm_throt_event(CORE_POWER_LIMIT | msr_val);
+
+ if (cpu_has(c, X86_FEATURE_PTS)) {
+ rdmsrl(MSR_IA32_PACKAGE_THERM_STATUS, msr_val);
+ if (therm_throt_process(msr_val & PACKAGE_THERM_STATUS_PROCHOT,
+ THERMAL_THROTTLING_EVENT,
+ PACKAGE_LEVEL) != 0)
+ mce_log_therm_throt_event(PACKAGE_THROTTLED | msr_val);
+ if (cpu_has(c, X86_FEATURE_PLN))
+ if (therm_throt_process(msr_val &
+ PACKAGE_THERM_STATUS_POWER_LIMIT,
+ POWER_LIMIT_EVENT,
+ PACKAGE_LEVEL) != 0)
+ mce_log_therm_throt_event(PACKAGE_POWER_LIMIT
+ | msr_val);
+ }
}
static void unexpected_thermal_interrupt(void)
@@ -335,8 +445,26 @@ void intel_init_thermal(struct cpuinfo_x86 *c)
apic_write(APIC_LVTTHMR, h);
rdmsr(MSR_IA32_THERM_INTERRUPT, l, h);
- wrmsr(MSR_IA32_THERM_INTERRUPT,
- l | (THERM_INT_LOW_ENABLE | THERM_INT_HIGH_ENABLE), h);
+ if (cpu_has(c, X86_FEATURE_PLN))
+ wrmsr(MSR_IA32_THERM_INTERRUPT,
+ l | (THERM_INT_LOW_ENABLE
+ | THERM_INT_HIGH_ENABLE | THERM_INT_PLN_ENABLE), h);
+ else
+ wrmsr(MSR_IA32_THERM_INTERRUPT,
+ l | (THERM_INT_LOW_ENABLE | THERM_INT_HIGH_ENABLE), h);
+
+ if (cpu_has(c, X86_FEATURE_PTS)) {
+ rdmsr(MSR_IA32_PACKAGE_THERM_INTERRUPT, l, h);
+ if (cpu_has(c, X86_FEATURE_PLN))
+ wrmsr(MSR_IA32_PACKAGE_THERM_INTERRUPT,
+ l | (PACKAGE_THERM_INT_LOW_ENABLE
+ | PACKAGE_THERM_INT_HIGH_ENABLE
+ | PACKAGE_THERM_INT_PLN_ENABLE), h);
+ else
+ wrmsr(MSR_IA32_PACKAGE_THERM_INTERRUPT,
+ l | (PACKAGE_THERM_INT_LOW_ENABLE
+ | PACKAGE_THERM_INT_HIGH_ENABLE), h);
+ }
smp_thermal_vector = intel_thermal_interrupt;
diff --git a/arch/x86/kernel/cpu/mshyperv.c b/arch/x86/kernel/cpu/mshyperv.c
new file mode 100644
index 000000000000..d944bf6c50e9
--- /dev/null
+++ b/arch/x86/kernel/cpu/mshyperv.c
@@ -0,0 +1,56 @@
+/*
+ * HyperV Detection code.
+ *
+ * Copyright (C) 2010, Novell, Inc.
+ * Author : K. Y. Srinivasan <ksrinivasan@novell.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; version 2 of the License.
+ *
+ */
+
+#include <linux/types.h>
+#include <linux/module.h>
+#include <asm/processor.h>
+#include <asm/hypervisor.h>
+#include <asm/hyperv.h>
+#include <asm/mshyperv.h>
+
+struct ms_hyperv_info ms_hyperv;
+EXPORT_SYMBOL_GPL(ms_hyperv);
+
+static bool __init ms_hyperv_platform(void)
+{
+ u32 eax;
+ u32 hyp_signature[3];
+
+ if (!boot_cpu_has(X86_FEATURE_HYPERVISOR))
+ return false;
+
+ cpuid(HYPERV_CPUID_VENDOR_AND_MAX_FUNCTIONS,
+ &eax, &hyp_signature[0], &hyp_signature[1], &hyp_signature[2]);
+
+ return eax >= HYPERV_CPUID_MIN &&
+ eax <= HYPERV_CPUID_MAX &&
+ !memcmp("Microsoft Hv", hyp_signature, 12);
+}
+
+static void __init ms_hyperv_init_platform(void)
+{
+ /*
+ * Extract the features and hints
+ */
+ ms_hyperv.features = cpuid_eax(HYPERV_CPUID_FEATURES);
+ ms_hyperv.hints = cpuid_eax(HYPERV_CPUID_ENLIGHTMENT_INFO);
+
+ printk(KERN_INFO "HyperV: features 0x%x, hints 0x%x\n",
+ ms_hyperv.features, ms_hyperv.hints);
+}
+
+const __refconst struct hypervisor_x86 x86_hyper_ms_hyperv = {
+ .name = "Microsoft HyperV",
+ .detect = ms_hyperv_platform,
+ .init_platform = ms_hyperv_init_platform,
+};
+EXPORT_SYMBOL(x86_hyper_ms_hyperv);
diff --git a/arch/x86/kernel/cpu/mtrr/cleanup.c b/arch/x86/kernel/cpu/mtrr/cleanup.c
index 06130b52f012..c5f59d071425 100644
--- a/arch/x86/kernel/cpu/mtrr/cleanup.c
+++ b/arch/x86/kernel/cpu/mtrr/cleanup.c
@@ -632,9 +632,9 @@ static void __init mtrr_print_out_one_result(int i)
unsigned long gran_base, chunk_base, lose_base;
char gran_factor, chunk_factor, lose_factor;
- gran_base = to_size_factor(result[i].gran_sizek, &gran_factor),
- chunk_base = to_size_factor(result[i].chunk_sizek, &chunk_factor),
- lose_base = to_size_factor(result[i].lose_cover_sizek, &lose_factor),
+ gran_base = to_size_factor(result[i].gran_sizek, &gran_factor);
+ chunk_base = to_size_factor(result[i].chunk_sizek, &chunk_factor);
+ lose_base = to_size_factor(result[i].lose_cover_sizek, &lose_factor);
pr_info("%sgran_size: %ld%c \tchunk_size: %ld%c \t",
result[i].bad ? "*BAD*" : " ",
diff --git a/arch/x86/kernel/cpu/mtrr/generic.c b/arch/x86/kernel/cpu/mtrr/generic.c
index fd31a441c61c..7d28d7d03885 100644
--- a/arch/x86/kernel/cpu/mtrr/generic.c
+++ b/arch/x86/kernel/cpu/mtrr/generic.c
@@ -433,13 +433,12 @@ static void generic_get_mtrr(unsigned int reg, unsigned long *base,
{
unsigned int mask_lo, mask_hi, base_lo, base_hi;
unsigned int tmp, hi;
- int cpu;
/*
* get_mtrr doesn't need to update mtrr_state, also it could be called
* from any cpu, so try to print it out directly.
*/
- cpu = get_cpu();
+ get_cpu();
rdmsr(MTRRphysMask_MSR(reg), mask_lo, mask_hi);
diff --git a/arch/x86/kernel/cpu/mtrr/main.c b/arch/x86/kernel/cpu/mtrr/main.c
index 79556bd9b602..01c0f3ee6cc3 100644
--- a/arch/x86/kernel/cpu/mtrr/main.c
+++ b/arch/x86/kernel/cpu/mtrr/main.c
@@ -35,6 +35,7 @@
#include <linux/types.h> /* FIXME: kvm_para.h needs this */
+#include <linux/stop_machine.h>
#include <linux/kvm_para.h>
#include <linux/uaccess.h>
#include <linux/module.h>
@@ -143,22 +144,28 @@ struct set_mtrr_data {
mtrr_type smp_type;
};
+static DEFINE_PER_CPU(struct cpu_stop_work, mtrr_work);
+
/**
- * ipi_handler - Synchronisation handler. Executed by "other" CPUs.
+ * mtrr_work_handler - Synchronisation handler. Executed by "other" CPUs.
* @info: pointer to mtrr configuration data
*
* Returns nothing.
*/
-static void ipi_handler(void *info)
+static int mtrr_work_handler(void *info)
{
#ifdef CONFIG_SMP
struct set_mtrr_data *data = info;
unsigned long flags;
+ atomic_dec(&data->count);
+ while (!atomic_read(&data->gate))
+ cpu_relax();
+
local_irq_save(flags);
atomic_dec(&data->count);
- while (!atomic_read(&data->gate))
+ while (atomic_read(&data->gate))
cpu_relax();
/* The master has cleared me to execute */
@@ -173,12 +180,13 @@ static void ipi_handler(void *info)
}
atomic_dec(&data->count);
- while (atomic_read(&data->gate))
+ while (!atomic_read(&data->gate))
cpu_relax();
atomic_dec(&data->count);
local_irq_restore(flags);
#endif
+ return 0;
}
static inline int types_compatible(mtrr_type type1, mtrr_type type2)
@@ -198,7 +206,7 @@ static inline int types_compatible(mtrr_type type1, mtrr_type type2)
*
* This is kinda tricky, but fortunately, Intel spelled it out for us cleanly:
*
- * 1. Send IPI to do the following:
+ * 1. Queue work to do the following on all processors:
* 2. Disable Interrupts
* 3. Wait for all procs to do so
* 4. Enter no-fill cache mode
@@ -215,14 +223,17 @@ static inline int types_compatible(mtrr_type type1, mtrr_type type2)
* 15. Enable interrupts.
*
* What does that mean for us? Well, first we set data.count to the number
- * of CPUs. As each CPU disables interrupts, it'll decrement it once. We wait
- * until it hits 0 and proceed. We set the data.gate flag and reset data.count.
- * Meanwhile, they are waiting for that flag to be set. Once it's set, each
+ * of CPUs. As each CPU announces that it started the rendezvous handler by
+ * decrementing the count, We reset data.count and set the data.gate flag
+ * allowing all the cpu's to proceed with the work. As each cpu disables
+ * interrupts, it'll decrement data.count once. We wait until it hits 0 and
+ * proceed. We clear the data.gate flag and reset data.count. Meanwhile, they
+ * are waiting for that flag to be cleared. Once it's cleared, each
* CPU goes through the transition of updating MTRRs.
* The CPU vendors may each do it differently,
* so we call mtrr_if->set() callback and let them take care of it.
* When they're done, they again decrement data->count and wait for data.gate
- * to be reset.
+ * to be set.
* When we finish, we wait for data.count to hit 0 and toggle the data.gate flag
* Everyone then enables interrupts and we all continue on.
*
@@ -234,6 +245,9 @@ set_mtrr(unsigned int reg, unsigned long base, unsigned long size, mtrr_type typ
{
struct set_mtrr_data data;
unsigned long flags;
+ int cpu;
+
+ preempt_disable();
data.smp_reg = reg;
data.smp_base = base;
@@ -246,10 +260,15 @@ set_mtrr(unsigned int reg, unsigned long base, unsigned long size, mtrr_type typ
atomic_set(&data.gate, 0);
/* Start the ball rolling on other CPUs */
- if (smp_call_function(ipi_handler, &data, 0) != 0)
- panic("mtrr: timed out waiting for other CPUs\n");
+ for_each_online_cpu(cpu) {
+ struct cpu_stop_work *work = &per_cpu(mtrr_work, cpu);
+
+ if (cpu == smp_processor_id())
+ continue;
+
+ stop_one_cpu_nowait(cpu, mtrr_work_handler, &data, work);
+ }
- local_irq_save(flags);
while (atomic_read(&data.count))
cpu_relax();
@@ -259,6 +278,16 @@ set_mtrr(unsigned int reg, unsigned long base, unsigned long size, mtrr_type typ
smp_wmb();
atomic_set(&data.gate, 1);
+ local_irq_save(flags);
+
+ while (atomic_read(&data.count))
+ cpu_relax();
+
+ /* Ok, reset count and toggle gate */
+ atomic_set(&data.count, num_booting_cpus() - 1);
+ smp_wmb();
+ atomic_set(&data.gate, 0);
+
/* Do our MTRR business */
/*
@@ -279,7 +308,7 @@ set_mtrr(unsigned int reg, unsigned long base, unsigned long size, mtrr_type typ
atomic_set(&data.count, num_booting_cpus() - 1);
smp_wmb();
- atomic_set(&data.gate, 0);
+ atomic_set(&data.gate, 1);
/*
* Wait here for everyone to have seen the gate change
@@ -289,6 +318,7 @@ set_mtrr(unsigned int reg, unsigned long base, unsigned long size, mtrr_type typ
cpu_relax();
local_irq_restore(flags);
+ preempt_enable();
}
/**
diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c
index db5bdc8addf8..f2da20fda02d 100644
--- a/arch/x86/kernel/cpu/perf_event.c
+++ b/arch/x86/kernel/cpu/perf_event.c
@@ -31,46 +31,51 @@
#include <asm/nmi.h>
#include <asm/compat.h>
-static u64 perf_event_mask __read_mostly;
+#if 0
+#undef wrmsrl
+#define wrmsrl(msr, val) \
+do { \
+ trace_printk("wrmsrl(%lx, %lx)\n", (unsigned long)(msr),\
+ (unsigned long)(val)); \
+ native_write_msr((msr), (u32)((u64)(val)), \
+ (u32)((u64)(val) >> 32)); \
+} while (0)
+#endif
-/* The maximal number of PEBS events: */
-#define MAX_PEBS_EVENTS 4
+/*
+ * best effort, GUP based copy_from_user() that assumes IRQ or NMI context
+ */
+static unsigned long
+copy_from_user_nmi(void *to, const void __user *from, unsigned long n)
+{
+ unsigned long offset, addr = (unsigned long)from;
+ int type = in_nmi() ? KM_NMI : KM_IRQ0;
+ unsigned long size, len = 0;
+ struct page *page;
+ void *map;
+ int ret;
-/* The size of a BTS record in bytes: */
-#define BTS_RECORD_SIZE 24
+ do {
+ ret = __get_user_pages_fast(addr, 1, 0, &page);
+ if (!ret)
+ break;
-/* The size of a per-cpu BTS buffer in bytes: */
-#define BTS_BUFFER_SIZE (BTS_RECORD_SIZE * 2048)
+ offset = addr & (PAGE_SIZE - 1);
+ size = min(PAGE_SIZE - offset, n - len);
-/* The BTS overflow threshold in bytes from the end of the buffer: */
-#define BTS_OVFL_TH (BTS_RECORD_SIZE * 128)
+ map = kmap_atomic(page, type);
+ memcpy(to, map+offset, size);
+ kunmap_atomic(map, type);
+ put_page(page);
+ len += size;
+ to += size;
+ addr += size;
-/*
- * Bits in the debugctlmsr controlling branch tracing.
- */
-#define X86_DEBUGCTL_TR (1 << 6)
-#define X86_DEBUGCTL_BTS (1 << 7)
-#define X86_DEBUGCTL_BTINT (1 << 8)
-#define X86_DEBUGCTL_BTS_OFF_OS (1 << 9)
-#define X86_DEBUGCTL_BTS_OFF_USR (1 << 10)
+ } while (len < n);
-/*
- * A debug store configuration.
- *
- * We only support architectures that use 64bit fields.
- */
-struct debug_store {
- u64 bts_buffer_base;
- u64 bts_index;
- u64 bts_absolute_maximum;
- u64 bts_interrupt_threshold;
- u64 pebs_buffer_base;
- u64 pebs_index;
- u64 pebs_absolute_maximum;
- u64 pebs_interrupt_threshold;
- u64 pebs_event_reset[MAX_PEBS_EVENTS];
-};
+ return len;
+}
struct event_constraint {
union {
@@ -89,18 +94,42 @@ struct amd_nb {
struct event_constraint event_constraints[X86_PMC_IDX_MAX];
};
+#define MAX_LBR_ENTRIES 16
+
struct cpu_hw_events {
+ /*
+ * Generic x86 PMC bits
+ */
struct perf_event *events[X86_PMC_IDX_MAX]; /* in counter order */
unsigned long active_mask[BITS_TO_LONGS(X86_PMC_IDX_MAX)];
- unsigned long interrupts;
int enabled;
- struct debug_store *ds;
int n_events;
int n_added;
+ int n_txn;
int assign[X86_PMC_IDX_MAX]; /* event to counter assignment */
u64 tags[X86_PMC_IDX_MAX];
struct perf_event *event_list[X86_PMC_IDX_MAX]; /* in enabled order */
+
+ unsigned int group_flag;
+
+ /*
+ * Intel DebugStore bits
+ */
+ struct debug_store *ds;
+ u64 pebs_enabled;
+
+ /*
+ * Intel LBR bits
+ */
+ int lbr_users;
+ void *lbr_context;
+ struct perf_branch_stack lbr_stack;
+ struct perf_branch_entry lbr_entries[MAX_LBR_ENTRIES];
+
+ /*
+ * AMD specific bits
+ */
struct amd_nb *amd_nb;
};
@@ -114,44 +143,75 @@ struct cpu_hw_events {
#define EVENT_CONSTRAINT(c, n, m) \
__EVENT_CONSTRAINT(c, n, m, HWEIGHT(n))
+/*
+ * Constraint on the Event code.
+ */
#define INTEL_EVENT_CONSTRAINT(c, n) \
- EVENT_CONSTRAINT(c, n, INTEL_ARCH_EVTSEL_MASK)
+ EVENT_CONSTRAINT(c, n, ARCH_PERFMON_EVENTSEL_EVENT)
+/*
+ * Constraint on the Event code + UMask + fixed-mask
+ *
+ * filter mask to validate fixed counter events.
+ * the following filters disqualify for fixed counters:
+ * - inv
+ * - edge
+ * - cnt-mask
+ * The other filters are supported by fixed counters.
+ * The any-thread option is supported starting with v3.
+ */
#define FIXED_EVENT_CONSTRAINT(c, n) \
- EVENT_CONSTRAINT(c, (1ULL << (32+n)), INTEL_ARCH_FIXED_MASK)
+ EVENT_CONSTRAINT(c, (1ULL << (32+n)), X86_RAW_EVENT_MASK)
+
+/*
+ * Constraint on the Event code + UMask
+ */
+#define PEBS_EVENT_CONSTRAINT(c, n) \
+ EVENT_CONSTRAINT(c, n, INTEL_ARCH_EVENT_MASK)
#define EVENT_CONSTRAINT_END \
EVENT_CONSTRAINT(0, 0, 0)
#define for_each_event_constraint(e, c) \
- for ((e) = (c); (e)->cmask; (e)++)
+ for ((e) = (c); (e)->weight; (e)++)
+
+union perf_capabilities {
+ struct {
+ u64 lbr_format : 6;
+ u64 pebs_trap : 1;
+ u64 pebs_arch_reg : 1;
+ u64 pebs_format : 4;
+ u64 smm_freeze : 1;
+ };
+ u64 capabilities;
+};
/*
* struct x86_pmu - generic x86 pmu
*/
struct x86_pmu {
+ /*
+ * Generic x86 PMC bits
+ */
const char *name;
int version;
int (*handle_irq)(struct pt_regs *);
void (*disable_all)(void);
- void (*enable_all)(void);
+ void (*enable_all)(int added);
void (*enable)(struct perf_event *);
void (*disable)(struct perf_event *);
+ int (*hw_config)(struct perf_event *event);
+ int (*schedule_events)(struct cpu_hw_events *cpuc, int n, int *assign);
unsigned eventsel;
unsigned perfctr;
u64 (*event_map)(int);
- u64 (*raw_event)(u64);
int max_events;
- int num_events;
- int num_events_fixed;
- int event_bits;
- u64 event_mask;
+ int num_counters;
+ int num_counters_fixed;
+ int cntval_bits;
+ u64 cntval_mask;
int apic;
u64 max_period;
- u64 intel_ctrl;
- void (*enable_bts)(u64 config);
- void (*disable_bts)(void);
-
struct event_constraint *
(*get_event_constraints)(struct cpu_hw_events *cpuc,
struct perf_event *event);
@@ -159,11 +219,33 @@ struct x86_pmu {
void (*put_event_constraints)(struct cpu_hw_events *cpuc,
struct perf_event *event);
struct event_constraint *event_constraints;
+ void (*quirks)(void);
+ int perfctr_second_write;
int (*cpu_prepare)(int cpu);
void (*cpu_starting)(int cpu);
void (*cpu_dying)(int cpu);
void (*cpu_dead)(int cpu);
+
+ /*
+ * Intel Arch Perfmon v2+
+ */
+ u64 intel_ctrl;
+ union perf_capabilities intel_cap;
+
+ /*
+ * Intel DebugStore bits
+ */
+ int bts, pebs;
+ int pebs_record_size;
+ void (*drain_pebs)(struct pt_regs *regs);
+ struct event_constraint *pebs_constraints;
+
+ /*
+ * Intel LBR
+ */
+ unsigned long lbr_tos, lbr_from, lbr_to; /* MSR base regs */
+ int lbr_nr; /* hardware stack size */
};
static struct x86_pmu x86_pmu __read_mostly;
@@ -198,7 +280,7 @@ static u64
x86_perf_event_update(struct perf_event *event)
{
struct hw_perf_event *hwc = &event->hw;
- int shift = 64 - x86_pmu.event_bits;
+ int shift = 64 - x86_pmu.cntval_bits;
u64 prev_raw_count, new_raw_count;
int idx = hwc->idx;
s64 delta;
@@ -214,10 +296,10 @@ x86_perf_event_update(struct perf_event *event)
* count to the generic event atomically:
*/
again:
- prev_raw_count = atomic64_read(&hwc->prev_count);
+ prev_raw_count = local64_read(&hwc->prev_count);
rdmsrl(hwc->event_base + idx, new_raw_count);
- if (atomic64_cmpxchg(&hwc->prev_count, prev_raw_count,
+ if (local64_cmpxchg(&hwc->prev_count, prev_raw_count,
new_raw_count) != prev_raw_count)
goto again;
@@ -232,8 +314,8 @@ again:
delta = (new_raw_count << shift) - (prev_raw_count << shift);
delta >>= shift;
- atomic64_add(delta, &event->count);
- atomic64_sub(delta, &hwc->period_left);
+ local64_add(delta, &event->count);
+ local64_sub(delta, &hwc->period_left);
return new_raw_count;
}
@@ -241,33 +323,32 @@ again:
static atomic_t active_events;
static DEFINE_MUTEX(pmc_reserve_mutex);
+#ifdef CONFIG_X86_LOCAL_APIC
+
static bool reserve_pmc_hardware(void)
{
-#ifdef CONFIG_X86_LOCAL_APIC
int i;
if (nmi_watchdog == NMI_LOCAL_APIC)
disable_lapic_nmi_watchdog();
- for (i = 0; i < x86_pmu.num_events; i++) {
+ for (i = 0; i < x86_pmu.num_counters; i++) {
if (!reserve_perfctr_nmi(x86_pmu.perfctr + i))
goto perfctr_fail;
}
- for (i = 0; i < x86_pmu.num_events; i++) {
+ for (i = 0; i < x86_pmu.num_counters; i++) {
if (!reserve_evntsel_nmi(x86_pmu.eventsel + i))
goto eventsel_fail;
}
-#endif
return true;
-#ifdef CONFIG_X86_LOCAL_APIC
eventsel_fail:
for (i--; i >= 0; i--)
release_evntsel_nmi(x86_pmu.eventsel + i);
- i = x86_pmu.num_events;
+ i = x86_pmu.num_counters;
perfctr_fail:
for (i--; i >= 0; i--)
@@ -277,128 +358,36 @@ perfctr_fail:
enable_lapic_nmi_watchdog();
return false;
-#endif
}
static void release_pmc_hardware(void)
{
-#ifdef CONFIG_X86_LOCAL_APIC
int i;
- for (i = 0; i < x86_pmu.num_events; i++) {
+ for (i = 0; i < x86_pmu.num_counters; i++) {
release_perfctr_nmi(x86_pmu.perfctr + i);
release_evntsel_nmi(x86_pmu.eventsel + i);
}
if (nmi_watchdog == NMI_LOCAL_APIC)
enable_lapic_nmi_watchdog();
-#endif
-}
-
-static inline bool bts_available(void)
-{
- return x86_pmu.enable_bts != NULL;
-}
-
-static void init_debug_store_on_cpu(int cpu)
-{
- struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds;
-
- if (!ds)
- return;
-
- wrmsr_on_cpu(cpu, MSR_IA32_DS_AREA,
- (u32)((u64)(unsigned long)ds),
- (u32)((u64)(unsigned long)ds >> 32));
-}
-
-static void fini_debug_store_on_cpu(int cpu)
-{
- if (!per_cpu(cpu_hw_events, cpu).ds)
- return;
-
- wrmsr_on_cpu(cpu, MSR_IA32_DS_AREA, 0, 0);
-}
-
-static void release_bts_hardware(void)
-{
- int cpu;
-
- if (!bts_available())
- return;
-
- get_online_cpus();
-
- for_each_online_cpu(cpu)
- fini_debug_store_on_cpu(cpu);
-
- for_each_possible_cpu(cpu) {
- struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds;
-
- if (!ds)
- continue;
-
- per_cpu(cpu_hw_events, cpu).ds = NULL;
-
- kfree((void *)(unsigned long)ds->bts_buffer_base);
- kfree(ds);
- }
-
- put_online_cpus();
}
-static int reserve_bts_hardware(void)
-{
- int cpu, err = 0;
-
- if (!bts_available())
- return 0;
-
- get_online_cpus();
-
- for_each_possible_cpu(cpu) {
- struct debug_store *ds;
- void *buffer;
-
- err = -ENOMEM;
- buffer = kzalloc(BTS_BUFFER_SIZE, GFP_KERNEL);
- if (unlikely(!buffer))
- break;
-
- ds = kzalloc(sizeof(*ds), GFP_KERNEL);
- if (unlikely(!ds)) {
- kfree(buffer);
- break;
- }
-
- ds->bts_buffer_base = (u64)(unsigned long)buffer;
- ds->bts_index = ds->bts_buffer_base;
- ds->bts_absolute_maximum =
- ds->bts_buffer_base + BTS_BUFFER_SIZE;
- ds->bts_interrupt_threshold =
- ds->bts_absolute_maximum - BTS_OVFL_TH;
+#else
- per_cpu(cpu_hw_events, cpu).ds = ds;
- err = 0;
- }
+static bool reserve_pmc_hardware(void) { return true; }
+static void release_pmc_hardware(void) {}
- if (err)
- release_bts_hardware();
- else {
- for_each_online_cpu(cpu)
- init_debug_store_on_cpu(cpu);
- }
-
- put_online_cpus();
+#endif
- return err;
-}
+static int reserve_ds_buffers(void);
+static void release_ds_buffers(void);
static void hw_perf_event_destroy(struct perf_event *event)
{
if (atomic_dec_and_mutex_lock(&active_events, &pmc_reserve_mutex)) {
release_pmc_hardware();
- release_bts_hardware();
+ release_ds_buffers();
mutex_unlock(&pmc_reserve_mutex);
}
}
@@ -441,59 +430,16 @@ set_ext_hw_attr(struct hw_perf_event *hwc, struct perf_event_attr *attr)
return 0;
}
-/*
- * Setup the hardware configuration for a given attr_type
- */
-static int __hw_perf_event_init(struct perf_event *event)
+static int x86_setup_perfctr(struct perf_event *event)
{
struct perf_event_attr *attr = &event->attr;
struct hw_perf_event *hwc = &event->hw;
u64 config;
- int err;
-
- if (!x86_pmu_initialized())
- return -ENODEV;
-
- err = 0;
- if (!atomic_inc_not_zero(&active_events)) {
- mutex_lock(&pmc_reserve_mutex);
- if (atomic_read(&active_events) == 0) {
- if (!reserve_pmc_hardware())
- err = -EBUSY;
- else
- err = reserve_bts_hardware();
- }
- if (!err)
- atomic_inc(&active_events);
- mutex_unlock(&pmc_reserve_mutex);
- }
- if (err)
- return err;
-
- event->destroy = hw_perf_event_destroy;
-
- /*
- * Generate PMC IRQs:
- * (keep 'enabled' bit clear for now)
- */
- hwc->config = ARCH_PERFMON_EVENTSEL_INT;
-
- hwc->idx = -1;
- hwc->last_cpu = -1;
- hwc->last_tag = ~0ULL;
-
- /*
- * Count user and OS events unless requested not to.
- */
- if (!attr->exclude_user)
- hwc->config |= ARCH_PERFMON_EVENTSEL_USR;
- if (!attr->exclude_kernel)
- hwc->config |= ARCH_PERFMON_EVENTSEL_OS;
if (!hwc->sample_period) {
hwc->sample_period = x86_pmu.max_period;
hwc->last_period = hwc->sample_period;
- atomic64_set(&hwc->period_left, hwc->sample_period);
+ local64_set(&hwc->period_left, hwc->sample_period);
} else {
/*
* If we have a PMU initialized but no APIC
@@ -505,16 +451,8 @@ static int __hw_perf_event_init(struct perf_event *event)
return -EOPNOTSUPP;
}
- /*
- * Raw hw_event type provide the config in the hw_event structure
- */
- if (attr->type == PERF_TYPE_RAW) {
- hwc->config |= x86_pmu.raw_event(attr->config);
- if ((hwc->config & ARCH_PERFMON_EVENTSEL_ANY) &&
- perf_paranoid_cpu() && !capable(CAP_SYS_ADMIN))
- return -EACCES;
+ if (attr->type == PERF_TYPE_RAW)
return 0;
- }
if (attr->type == PERF_TYPE_HW_CACHE)
return set_ext_hw_attr(hwc, attr);
@@ -539,11 +477,11 @@ static int __hw_perf_event_init(struct perf_event *event)
if ((attr->config == PERF_COUNT_HW_BRANCH_INSTRUCTIONS) &&
(hwc->sample_period == 1)) {
/* BTS is not supported by this architecture. */
- if (!bts_available())
+ if (!x86_pmu.bts)
return -EOPNOTSUPP;
/* BTS is currently only allowed for user-mode. */
- if (hwc->config & ARCH_PERFMON_EVENTSEL_OS)
+ if (!attr->exclude_kernel)
return -EOPNOTSUPP;
}
@@ -552,12 +490,87 @@ static int __hw_perf_event_init(struct perf_event *event)
return 0;
}
+static int x86_pmu_hw_config(struct perf_event *event)
+{
+ if (event->attr.precise_ip) {
+ int precise = 0;
+
+ /* Support for constant skid */
+ if (x86_pmu.pebs)
+ precise++;
+
+ /* Support for IP fixup */
+ if (x86_pmu.lbr_nr)
+ precise++;
+
+ if (event->attr.precise_ip > precise)
+ return -EOPNOTSUPP;
+ }
+
+ /*
+ * Generate PMC IRQs:
+ * (keep 'enabled' bit clear for now)
+ */
+ event->hw.config = ARCH_PERFMON_EVENTSEL_INT;
+
+ /*
+ * Count user and OS events unless requested not to
+ */
+ if (!event->attr.exclude_user)
+ event->hw.config |= ARCH_PERFMON_EVENTSEL_USR;
+ if (!event->attr.exclude_kernel)
+ event->hw.config |= ARCH_PERFMON_EVENTSEL_OS;
+
+ if (event->attr.type == PERF_TYPE_RAW)
+ event->hw.config |= event->attr.config & X86_RAW_EVENT_MASK;
+
+ return x86_setup_perfctr(event);
+}
+
+/*
+ * Setup the hardware configuration for a given attr_type
+ */
+static int __hw_perf_event_init(struct perf_event *event)
+{
+ int err;
+
+ if (!x86_pmu_initialized())
+ return -ENODEV;
+
+ err = 0;
+ if (!atomic_inc_not_zero(&active_events)) {
+ mutex_lock(&pmc_reserve_mutex);
+ if (atomic_read(&active_events) == 0) {
+ if (!reserve_pmc_hardware())
+ err = -EBUSY;
+ else {
+ err = reserve_ds_buffers();
+ if (err)
+ release_pmc_hardware();
+ }
+ }
+ if (!err)
+ atomic_inc(&active_events);
+ mutex_unlock(&pmc_reserve_mutex);
+ }
+ if (err)
+ return err;
+
+ event->destroy = hw_perf_event_destroy;
+
+ event->hw.idx = -1;
+ event->hw.last_cpu = -1;
+ event->hw.last_tag = ~0ULL;
+
+ return x86_pmu.hw_config(event);
+}
+
static void x86_pmu_disable_all(void)
{
struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
int idx;
- for (idx = 0; idx < x86_pmu.num_events; idx++) {
+ for (idx = 0; idx < x86_pmu.num_counters; idx++) {
u64 val;
if (!test_bit(idx, cpuc->active_mask))
@@ -587,12 +600,12 @@ void hw_perf_disable(void)
x86_pmu.disable_all();
}
-static void x86_pmu_enable_all(void)
+static void x86_pmu_enable_all(int added)
{
struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
int idx;
- for (idx = 0; idx < x86_pmu.num_events; idx++) {
+ for (idx = 0; idx < x86_pmu.num_counters; idx++) {
struct perf_event *event = cpuc->events[idx];
u64 val;
@@ -667,14 +680,14 @@ static int x86_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign)
* assign events to counters starting with most
* constrained events.
*/
- wmax = x86_pmu.num_events;
+ wmax = x86_pmu.num_counters;
/*
* when fixed event counters are present,
* wmax is incremented by 1 to account
* for one more choice
*/
- if (x86_pmu.num_events_fixed)
+ if (x86_pmu.num_counters_fixed)
wmax++;
for (w = 1, num = n; num && w <= wmax; w++) {
@@ -724,7 +737,7 @@ static int collect_events(struct cpu_hw_events *cpuc, struct perf_event *leader,
struct perf_event *event;
int n, max_count;
- max_count = x86_pmu.num_events + x86_pmu.num_events_fixed;
+ max_count = x86_pmu.num_counters + x86_pmu.num_counters_fixed;
/* current number of events already accepted */
n = cpuc->n_events;
@@ -795,7 +808,7 @@ void hw_perf_enable(void)
struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
struct perf_event *event;
struct hw_perf_event *hwc;
- int i;
+ int i, added = cpuc->n_added;
if (!x86_pmu_initialized())
return;
@@ -847,19 +860,20 @@ void hw_perf_enable(void)
cpuc->enabled = 1;
barrier();
- x86_pmu.enable_all();
+ x86_pmu.enable_all(added);
}
-static inline void __x86_pmu_enable_event(struct hw_perf_event *hwc)
+static inline void __x86_pmu_enable_event(struct hw_perf_event *hwc,
+ u64 enable_mask)
{
- (void)checking_wrmsrl(hwc->config_base + hwc->idx,
- hwc->config | ARCH_PERFMON_EVENTSEL_ENABLE);
+ wrmsrl(hwc->config_base + hwc->idx, hwc->config | enable_mask);
}
static inline void x86_pmu_disable_event(struct perf_event *event)
{
struct hw_perf_event *hwc = &event->hw;
- (void)checking_wrmsrl(hwc->config_base + hwc->idx, hwc->config);
+
+ wrmsrl(hwc->config_base + hwc->idx, hwc->config);
}
static DEFINE_PER_CPU(u64 [X86_PMC_IDX_MAX], pmc_prev_left);
@@ -872,9 +886,9 @@ static int
x86_perf_event_set_period(struct perf_event *event)
{
struct hw_perf_event *hwc = &event->hw;
- s64 left = atomic64_read(&hwc->period_left);
+ s64 left = local64_read(&hwc->period_left);
s64 period = hwc->sample_period;
- int err, ret = 0, idx = hwc->idx;
+ int ret = 0, idx = hwc->idx;
if (idx == X86_PMC_IDX_FIXED_BTS)
return 0;
@@ -884,14 +898,14 @@ x86_perf_event_set_period(struct perf_event *event)
*/
if (unlikely(left <= -period)) {
left = period;
- atomic64_set(&hwc->period_left, left);
+ local64_set(&hwc->period_left, left);
hwc->last_period = period;
ret = 1;
}
if (unlikely(left <= 0)) {
left += period;
- atomic64_set(&hwc->period_left, left);
+ local64_set(&hwc->period_left, left);
hwc->last_period = period;
ret = 1;
}
@@ -910,10 +924,19 @@ x86_perf_event_set_period(struct perf_event *event)
* The hw event starts counting from this event offset,
* mark it to be able to extra future deltas:
*/
- atomic64_set(&hwc->prev_count, (u64)-left);
+ local64_set(&hwc->prev_count, (u64)-left);
+
+ wrmsrl(hwc->event_base + idx, (u64)(-left) & x86_pmu.cntval_mask);
- err = checking_wrmsrl(hwc->event_base + idx,
- (u64)(-left) & x86_pmu.event_mask);
+ /*
+ * Due to erratum on certan cpu we need
+ * a second write to be sure the register
+ * is updated properly
+ */
+ if (x86_pmu.perfctr_second_write) {
+ wrmsrl(hwc->event_base + idx,
+ (u64)(-left) & x86_pmu.cntval_mask);
+ }
perf_event_update_userpage(event);
@@ -924,7 +947,8 @@ static void x86_pmu_enable_event(struct perf_event *event)
{
struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
if (cpuc->enabled)
- __x86_pmu_enable_event(&event->hw);
+ __x86_pmu_enable_event(&event->hw,
+ ARCH_PERFMON_EVENTSEL_ENABLE);
}
/*
@@ -950,7 +974,15 @@ static int x86_pmu_enable(struct perf_event *event)
if (n < 0)
return n;
- ret = x86_schedule_events(cpuc, n, assign);
+ /*
+ * If group events scheduling transaction was started,
+ * skip the schedulability test here, it will be peformed
+ * at commit time(->commit_txn) as a whole
+ */
+ if (cpuc->group_flag & PERF_EVENT_TXN)
+ goto out;
+
+ ret = x86_pmu.schedule_events(cpuc, n, assign);
if (ret)
return ret;
/*
@@ -959,8 +991,10 @@ static int x86_pmu_enable(struct perf_event *event)
*/
memcpy(cpuc->assign, assign, n*sizeof(int));
+out:
cpuc->n_events = n;
cpuc->n_added += n - n0;
+ cpuc->n_txn += n - n0;
return 0;
}
@@ -991,11 +1025,12 @@ static void x86_pmu_unthrottle(struct perf_event *event)
void perf_event_print_debug(void)
{
u64 ctrl, status, overflow, pmc_ctrl, pmc_count, prev_left, fixed;
+ u64 pebs;
struct cpu_hw_events *cpuc;
unsigned long flags;
int cpu, idx;
- if (!x86_pmu.num_events)
+ if (!x86_pmu.num_counters)
return;
local_irq_save(flags);
@@ -1008,16 +1043,18 @@ void perf_event_print_debug(void)
rdmsrl(MSR_CORE_PERF_GLOBAL_STATUS, status);
rdmsrl(MSR_CORE_PERF_GLOBAL_OVF_CTRL, overflow);
rdmsrl(MSR_ARCH_PERFMON_FIXED_CTR_CTRL, fixed);
+ rdmsrl(MSR_IA32_PEBS_ENABLE, pebs);
pr_info("\n");
pr_info("CPU#%d: ctrl: %016llx\n", cpu, ctrl);
pr_info("CPU#%d: status: %016llx\n", cpu, status);
pr_info("CPU#%d: overflow: %016llx\n", cpu, overflow);
pr_info("CPU#%d: fixed: %016llx\n", cpu, fixed);
+ pr_info("CPU#%d: pebs: %016llx\n", cpu, pebs);
}
- pr_info("CPU#%d: active: %016llx\n", cpu, *(u64 *)cpuc->active_mask);
+ pr_info("CPU#%d: active: %016llx\n", cpu, *(u64 *)cpuc->active_mask);
- for (idx = 0; idx < x86_pmu.num_events; idx++) {
+ for (idx = 0; idx < x86_pmu.num_counters; idx++) {
rdmsrl(x86_pmu.eventsel + idx, pmc_ctrl);
rdmsrl(x86_pmu.perfctr + idx, pmc_count);
@@ -1030,7 +1067,7 @@ void perf_event_print_debug(void)
pr_info("CPU#%d: gen-PMC%d left: %016llx\n",
cpu, idx, prev_left);
}
- for (idx = 0; idx < x86_pmu.num_events_fixed; idx++) {
+ for (idx = 0; idx < x86_pmu.num_counters_fixed; idx++) {
rdmsrl(MSR_ARCH_PERFMON_FIXED_CTR0 + idx, pmc_count);
pr_info("CPU#%d: fixed-PMC%d count: %016llx\n",
@@ -1064,6 +1101,14 @@ static void x86_pmu_disable(struct perf_event *event)
struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
int i;
+ /*
+ * If we're called during a txn, we don't need to do anything.
+ * The events never got scheduled and ->cancel_txn will truncate
+ * the event_list.
+ */
+ if (cpuc->group_flag & PERF_EVENT_TXN)
+ return;
+
x86_pmu_stop(event);
for (i = 0; i < cpuc->n_events; i++) {
@@ -1095,7 +1140,7 @@ static int x86_pmu_handle_irq(struct pt_regs *regs)
cpuc = &__get_cpu_var(cpu_hw_events);
- for (idx = 0; idx < x86_pmu.num_events; idx++) {
+ for (idx = 0; idx < x86_pmu.num_counters; idx++) {
if (!test_bit(idx, cpuc->active_mask))
continue;
@@ -1103,7 +1148,7 @@ static int x86_pmu_handle_irq(struct pt_regs *regs)
hwc = &event->hw;
val = x86_perf_event_update(event);
- if (val & (1ULL << (x86_pmu.event_bits - 1)))
+ if (val & (1ULL << (x86_pmu.cntval_bits - 1)))
continue;
/*
@@ -1146,7 +1191,6 @@ void set_perf_event_pending(void)
void perf_events_lapic_init(void)
{
-#ifdef CONFIG_X86_LOCAL_APIC
if (!x86_pmu.apic || !x86_pmu_initialized())
return;
@@ -1154,7 +1198,6 @@ void perf_events_lapic_init(void)
* Always use NMI for PMU
*/
apic_write(APIC_LVTPC, APIC_DM_NMI);
-#endif
}
static int __kprobes
@@ -1178,9 +1221,7 @@ perf_event_nmi_handler(struct notifier_block *self,
regs = args->regs;
-#ifdef CONFIG_X86_LOCAL_APIC
apic_write(APIC_LVTPC, APIC_DM_NMI);
-#endif
/*
* Can't rely on the handled return value to say it was our NMI, two
* events could trigger 'simultaneously' raising two back-to-back NMIs.
@@ -1217,118 +1258,11 @@ x86_get_event_constraints(struct cpu_hw_events *cpuc, struct perf_event *event)
return &unconstrained;
}
-static int x86_event_sched_in(struct perf_event *event,
- struct perf_cpu_context *cpuctx)
-{
- int ret = 0;
-
- event->state = PERF_EVENT_STATE_ACTIVE;
- event->oncpu = smp_processor_id();
- event->tstamp_running += event->ctx->time - event->tstamp_stopped;
-
- if (!is_x86_event(event))
- ret = event->pmu->enable(event);
-
- if (!ret && !is_software_event(event))
- cpuctx->active_oncpu++;
-
- if (!ret && event->attr.exclusive)
- cpuctx->exclusive = 1;
-
- return ret;
-}
-
-static void x86_event_sched_out(struct perf_event *event,
- struct perf_cpu_context *cpuctx)
-{
- event->state = PERF_EVENT_STATE_INACTIVE;
- event->oncpu = -1;
-
- if (!is_x86_event(event))
- event->pmu->disable(event);
-
- event->tstamp_running -= event->ctx->time - event->tstamp_stopped;
-
- if (!is_software_event(event))
- cpuctx->active_oncpu--;
-
- if (event->attr.exclusive || !cpuctx->active_oncpu)
- cpuctx->exclusive = 0;
-}
-
-/*
- * Called to enable a whole group of events.
- * Returns 1 if the group was enabled, or -EAGAIN if it could not be.
- * Assumes the caller has disabled interrupts and has
- * frozen the PMU with hw_perf_save_disable.
- *
- * called with PMU disabled. If successful and return value 1,
- * then guaranteed to call perf_enable() and hw_perf_enable()
- */
-int hw_perf_group_sched_in(struct perf_event *leader,
- struct perf_cpu_context *cpuctx,
- struct perf_event_context *ctx)
-{
- struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
- struct perf_event *sub;
- int assign[X86_PMC_IDX_MAX];
- int n0, n1, ret;
-
- /* n0 = total number of events */
- n0 = collect_events(cpuc, leader, true);
- if (n0 < 0)
- return n0;
-
- ret = x86_schedule_events(cpuc, n0, assign);
- if (ret)
- return ret;
-
- ret = x86_event_sched_in(leader, cpuctx);
- if (ret)
- return ret;
-
- n1 = 1;
- list_for_each_entry(sub, &leader->sibling_list, group_entry) {
- if (sub->state > PERF_EVENT_STATE_OFF) {
- ret = x86_event_sched_in(sub, cpuctx);
- if (ret)
- goto undo;
- ++n1;
- }
- }
- /*
- * copy new assignment, now we know it is possible
- * will be used by hw_perf_enable()
- */
- memcpy(cpuc->assign, assign, n0*sizeof(int));
-
- cpuc->n_events = n0;
- cpuc->n_added += n1;
- ctx->nr_active += n1;
-
- /*
- * 1 means successful and events are active
- * This is not quite true because we defer
- * actual activation until hw_perf_enable() but
- * this way we* ensure caller won't try to enable
- * individual events
- */
- return 1;
-undo:
- x86_event_sched_out(leader, cpuctx);
- n0 = 1;
- list_for_each_entry(sub, &leader->sibling_list, group_entry) {
- if (sub->state == PERF_EVENT_STATE_ACTIVE) {
- x86_event_sched_out(sub, cpuctx);
- if (++n0 == n1)
- break;
- }
- }
- return ret;
-}
-
#include "perf_event_amd.c"
#include "perf_event_p6.c"
+#include "perf_event_p4.c"
+#include "perf_event_intel_lbr.c"
+#include "perf_event_intel_ds.c"
#include "perf_event_intel.c"
static int __cpuinit
@@ -1402,48 +1336,50 @@ void __init init_hw_perf_events(void)
pr_cont("%s PMU driver.\n", x86_pmu.name);
- if (x86_pmu.num_events > X86_PMC_MAX_GENERIC) {
+ if (x86_pmu.quirks)
+ x86_pmu.quirks();
+
+ if (x86_pmu.num_counters > X86_PMC_MAX_GENERIC) {
WARN(1, KERN_ERR "hw perf events %d > max(%d), clipping!",
- x86_pmu.num_events, X86_PMC_MAX_GENERIC);
- x86_pmu.num_events = X86_PMC_MAX_GENERIC;
+ x86_pmu.num_counters, X86_PMC_MAX_GENERIC);
+ x86_pmu.num_counters = X86_PMC_MAX_GENERIC;
}
- perf_event_mask = (1 << x86_pmu.num_events) - 1;
- perf_max_events = x86_pmu.num_events;
+ x86_pmu.intel_ctrl = (1 << x86_pmu.num_counters) - 1;
+ perf_max_events = x86_pmu.num_counters;
- if (x86_pmu.num_events_fixed > X86_PMC_MAX_FIXED) {
+ if (x86_pmu.num_counters_fixed > X86_PMC_MAX_FIXED) {
WARN(1, KERN_ERR "hw perf events fixed %d > max(%d), clipping!",
- x86_pmu.num_events_fixed, X86_PMC_MAX_FIXED);
- x86_pmu.num_events_fixed = X86_PMC_MAX_FIXED;
+ x86_pmu.num_counters_fixed, X86_PMC_MAX_FIXED);
+ x86_pmu.num_counters_fixed = X86_PMC_MAX_FIXED;
}
- perf_event_mask |=
- ((1LL << x86_pmu.num_events_fixed)-1) << X86_PMC_IDX_FIXED;
- x86_pmu.intel_ctrl = perf_event_mask;
+ x86_pmu.intel_ctrl |=
+ ((1LL << x86_pmu.num_counters_fixed)-1) << X86_PMC_IDX_FIXED;
perf_events_lapic_init();
register_die_notifier(&perf_event_nmi_notifier);
unconstrained = (struct event_constraint)
- __EVENT_CONSTRAINT(0, (1ULL << x86_pmu.num_events) - 1,
- 0, x86_pmu.num_events);
+ __EVENT_CONSTRAINT(0, (1ULL << x86_pmu.num_counters) - 1,
+ 0, x86_pmu.num_counters);
if (x86_pmu.event_constraints) {
for_each_event_constraint(c, x86_pmu.event_constraints) {
- if (c->cmask != INTEL_ARCH_FIXED_MASK)
+ if (c->cmask != X86_RAW_EVENT_MASK)
continue;
- c->idxmsk64 |= (1ULL << x86_pmu.num_events) - 1;
- c->weight += x86_pmu.num_events;
+ c->idxmsk64 |= (1ULL << x86_pmu.num_counters) - 1;
+ c->weight += x86_pmu.num_counters;
}
}
pr_info("... version: %d\n", x86_pmu.version);
- pr_info("... bit width: %d\n", x86_pmu.event_bits);
- pr_info("... generic registers: %d\n", x86_pmu.num_events);
- pr_info("... value mask: %016Lx\n", x86_pmu.event_mask);
+ pr_info("... bit width: %d\n", x86_pmu.cntval_bits);
+ pr_info("... generic registers: %d\n", x86_pmu.num_counters);
+ pr_info("... value mask: %016Lx\n", x86_pmu.cntval_mask);
pr_info("... max period: %016Lx\n", x86_pmu.max_period);
- pr_info("... fixed-purpose events: %d\n", x86_pmu.num_events_fixed);
- pr_info("... event mask: %016Lx\n", perf_event_mask);
+ pr_info("... fixed-purpose events: %d\n", x86_pmu.num_counters_fixed);
+ pr_info("... event mask: %016Lx\n", x86_pmu.intel_ctrl);
perf_cpu_notifier(x86_pmu_notifier);
}
@@ -1453,6 +1389,67 @@ static inline void x86_pmu_read(struct perf_event *event)
x86_perf_event_update(event);
}
+/*
+ * Start group events scheduling transaction
+ * Set the flag to make pmu::enable() not perform the
+ * schedulability test, it will be performed at commit time
+ */
+static void x86_pmu_start_txn(const struct pmu *pmu)
+{
+ struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
+
+ cpuc->group_flag |= PERF_EVENT_TXN;
+ cpuc->n_txn = 0;
+}
+
+/*
+ * Stop group events scheduling transaction
+ * Clear the flag and pmu::enable() will perform the
+ * schedulability test.
+ */
+static void x86_pmu_cancel_txn(const struct pmu *pmu)
+{
+ struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
+
+ cpuc->group_flag &= ~PERF_EVENT_TXN;
+ /*
+ * Truncate the collected events.
+ */
+ cpuc->n_added -= cpuc->n_txn;
+ cpuc->n_events -= cpuc->n_txn;
+}
+
+/*
+ * Commit group events scheduling transaction
+ * Perform the group schedulability test as a whole
+ * Return 0 if success
+ */
+static int x86_pmu_commit_txn(const struct pmu *pmu)
+{
+ struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
+ int assign[X86_PMC_IDX_MAX];
+ int n, ret;
+
+ n = cpuc->n_events;
+
+ if (!x86_pmu_initialized())
+ return -EAGAIN;
+
+ ret = x86_pmu.schedule_events(cpuc, n, assign);
+ if (ret)
+ return ret;
+
+ /*
+ * copy new assignment, now we know it is possible
+ * will be used by hw_perf_enable()
+ */
+ memcpy(cpuc->assign, assign, n*sizeof(int));
+
+ cpuc->group_flag &= ~PERF_EVENT_TXN;
+
+ return 0;
+}
+
static const struct pmu pmu = {
.enable = x86_pmu_enable,
.disable = x86_pmu_disable,
@@ -1460,9 +1457,38 @@ static const struct pmu pmu = {
.stop = x86_pmu_stop,
.read = x86_pmu_read,
.unthrottle = x86_pmu_unthrottle,
+ .start_txn = x86_pmu_start_txn,
+ .cancel_txn = x86_pmu_cancel_txn,
+ .commit_txn = x86_pmu_commit_txn,
};
/*
+ * validate that we can schedule this event
+ */
+static int validate_event(struct perf_event *event)
+{
+ struct cpu_hw_events *fake_cpuc;
+ struct event_constraint *c;
+ int ret = 0;
+
+ fake_cpuc = kmalloc(sizeof(*fake_cpuc), GFP_KERNEL | __GFP_ZERO);
+ if (!fake_cpuc)
+ return -ENOMEM;
+
+ c = x86_pmu.get_event_constraints(fake_cpuc, event);
+
+ if (!c || !c->weight)
+ ret = -ENOSPC;
+
+ if (x86_pmu.put_event_constraints)
+ x86_pmu.put_event_constraints(fake_cpuc, event);
+
+ kfree(fake_cpuc);
+
+ return ret;
+}
+
+/*
* validate a single event group
*
* validation include:
@@ -1502,7 +1528,7 @@ static int validate_group(struct perf_event *event)
fake_cpuc->n_events = n;
- ret = x86_schedule_events(fake_cpuc, n, NULL);
+ ret = x86_pmu.schedule_events(fake_cpuc, n, NULL);
out_free:
kfree(fake_cpuc);
@@ -1527,6 +1553,8 @@ const struct pmu *hw_perf_event_init(struct perf_event *event)
if (event->group_leader != event)
err = validate_group(event);
+ else
+ err = validate_event(event);
event->pmu = tmp;
}
@@ -1574,8 +1602,7 @@ static void backtrace_address(void *data, unsigned long addr, int reliable)
{
struct perf_callchain_entry *entry = data;
- if (reliable)
- callchain_store(entry, addr);
+ callchain_store(entry, addr);
}
static const struct stacktrace_ops backtrace_ops = {
@@ -1586,8 +1613,6 @@ static const struct stacktrace_ops backtrace_ops = {
.walk_stack = print_context_stack_bp,
};
-#include "../dumpstack.h"
-
static void
perf_callchain_kernel(struct pt_regs *regs, struct perf_callchain_entry *entry)
{
@@ -1597,41 +1622,6 @@ perf_callchain_kernel(struct pt_regs *regs, struct perf_callchain_entry *entry)
dump_trace(NULL, regs, NULL, regs->bp, &backtrace_ops, entry);
}
-/*
- * best effort, GUP based copy_from_user() that assumes IRQ or NMI context
- */
-static unsigned long
-copy_from_user_nmi(void *to, const void __user *from, unsigned long n)
-{
- unsigned long offset, addr = (unsigned long)from;
- int type = in_nmi() ? KM_NMI : KM_IRQ0;
- unsigned long size, len = 0;
- struct page *page;
- void *map;
- int ret;
-
- do {
- ret = __get_user_pages_fast(addr, 1, 0, &page);
- if (!ret)
- break;
-
- offset = addr & (PAGE_SIZE - 1);
- size = min(PAGE_SIZE - offset, n - len);
-
- map = kmap_atomic(page, type);
- memcpy(to, map+offset, size);
- kunmap_atomic(map, type);
- put_page(page);
-
- len += size;
- to += size;
- addr += size;
-
- } while (len < n);
-
- return len;
-}
-
#ifdef CONFIG_COMPAT
static inline int
perf_callchain_user32(struct pt_regs *regs, struct perf_callchain_entry *entry)
@@ -1727,6 +1717,11 @@ struct perf_callchain_entry *perf_callchain(struct pt_regs *regs)
{
struct perf_callchain_entry *entry;
+ if (perf_guest_cbs && perf_guest_cbs->is_in_guest()) {
+ /* TODO: We don't support guest os callchain now */
+ return NULL;
+ }
+
if (in_nmi())
entry = &__get_cpu_var(pmc_nmi_entry);
else
@@ -1739,14 +1734,36 @@ struct perf_callchain_entry *perf_callchain(struct pt_regs *regs)
return entry;
}
-void perf_arch_fetch_caller_regs(struct pt_regs *regs, unsigned long ip, int skip)
+unsigned long perf_instruction_pointer(struct pt_regs *regs)
{
- regs->ip = ip;
- /*
- * perf_arch_fetch_caller_regs adds another call, we need to increment
- * the skip level
- */
- regs->bp = rewind_frame_pointer(skip + 1);
- regs->cs = __KERNEL_CS;
- local_save_flags(regs->flags);
+ unsigned long ip;
+
+ if (perf_guest_cbs && perf_guest_cbs->is_in_guest())
+ ip = perf_guest_cbs->get_guest_ip();
+ else
+ ip = instruction_pointer(regs);
+
+ return ip;
+}
+
+unsigned long perf_misc_flags(struct pt_regs *regs)
+{
+ int misc = 0;
+
+ if (perf_guest_cbs && perf_guest_cbs->is_in_guest()) {
+ if (perf_guest_cbs->is_user_mode())
+ misc |= PERF_RECORD_MISC_GUEST_USER;
+ else
+ misc |= PERF_RECORD_MISC_GUEST_KERNEL;
+ } else {
+ if (user_mode(regs))
+ misc |= PERF_RECORD_MISC_USER;
+ else
+ misc |= PERF_RECORD_MISC_KERNEL;
+ }
+
+ if (regs->flags & PERF_EFLAGS_EXACT)
+ misc |= PERF_RECORD_MISC_EXACT_IP;
+
+ return misc;
}
diff --git a/arch/x86/kernel/cpu/perf_event_amd.c b/arch/x86/kernel/cpu/perf_event_amd.c
index db6f7d4056e1..c2897b7b4a3b 100644
--- a/arch/x86/kernel/cpu/perf_event_amd.c
+++ b/arch/x86/kernel/cpu/perf_event_amd.c
@@ -2,7 +2,7 @@
static DEFINE_RAW_SPINLOCK(amd_nb_lock);
-static __initconst u64 amd_hw_cache_event_ids
+static __initconst const u64 amd_hw_cache_event_ids
[PERF_COUNT_HW_CACHE_MAX]
[PERF_COUNT_HW_CACHE_OP_MAX]
[PERF_COUNT_HW_CACHE_RESULT_MAX] =
@@ -102,8 +102,8 @@ static const u64 amd_perfmon_event_map[] =
[PERF_COUNT_HW_INSTRUCTIONS] = 0x00c0,
[PERF_COUNT_HW_CACHE_REFERENCES] = 0x0080,
[PERF_COUNT_HW_CACHE_MISSES] = 0x0081,
- [PERF_COUNT_HW_BRANCH_INSTRUCTIONS] = 0x00c4,
- [PERF_COUNT_HW_BRANCH_MISSES] = 0x00c5,
+ [PERF_COUNT_HW_BRANCH_INSTRUCTIONS] = 0x00c2,
+ [PERF_COUNT_HW_BRANCH_MISSES] = 0x00c3,
};
static u64 amd_pmu_event_map(int hw_event)
@@ -111,22 +111,19 @@ static u64 amd_pmu_event_map(int hw_event)
return amd_perfmon_event_map[hw_event];
}
-static u64 amd_pmu_raw_event(u64 hw_event)
+static int amd_pmu_hw_config(struct perf_event *event)
{
-#define K7_EVNTSEL_EVENT_MASK 0xF000000FFULL
-#define K7_EVNTSEL_UNIT_MASK 0x00000FF00ULL
-#define K7_EVNTSEL_EDGE_MASK 0x000040000ULL
-#define K7_EVNTSEL_INV_MASK 0x000800000ULL
-#define K7_EVNTSEL_REG_MASK 0x0FF000000ULL
-
-#define K7_EVNTSEL_MASK \
- (K7_EVNTSEL_EVENT_MASK | \
- K7_EVNTSEL_UNIT_MASK | \
- K7_EVNTSEL_EDGE_MASK | \
- K7_EVNTSEL_INV_MASK | \
- K7_EVNTSEL_REG_MASK)
-
- return hw_event & K7_EVNTSEL_MASK;
+ int ret = x86_pmu_hw_config(event);
+
+ if (ret)
+ return ret;
+
+ if (event->attr.type != PERF_TYPE_RAW)
+ return 0;
+
+ event->hw.config |= event->attr.config & AMD64_RAW_EVENT_MASK;
+
+ return 0;
}
/*
@@ -165,7 +162,7 @@ static void amd_put_event_constraints(struct cpu_hw_events *cpuc,
* be removed on one CPU at a time AND PMU is disabled
* when we come here
*/
- for (i = 0; i < x86_pmu.num_events; i++) {
+ for (i = 0; i < x86_pmu.num_counters; i++) {
if (nb->owners[i] == event) {
cmpxchg(nb->owners+i, event, NULL);
break;
@@ -215,7 +212,7 @@ amd_get_event_constraints(struct cpu_hw_events *cpuc, struct perf_event *event)
struct hw_perf_event *hwc = &event->hw;
struct amd_nb *nb = cpuc->amd_nb;
struct perf_event *old = NULL;
- int max = x86_pmu.num_events;
+ int max = x86_pmu.num_counters;
int i, j, k = -1;
/*
@@ -293,7 +290,7 @@ static struct amd_nb *amd_alloc_nb(int cpu, int nb_id)
/*
* initialize all possible NB constraints
*/
- for (i = 0; i < x86_pmu.num_events; i++) {
+ for (i = 0; i < x86_pmu.num_counters; i++) {
__set_bit(i, nb->event_constraints[i].idxmsk);
nb->event_constraints[i].weight = 1;
}
@@ -371,21 +368,22 @@ static void amd_pmu_cpu_dead(int cpu)
raw_spin_unlock(&amd_nb_lock);
}
-static __initconst struct x86_pmu amd_pmu = {
+static __initconst const struct x86_pmu amd_pmu = {
.name = "AMD",
.handle_irq = x86_pmu_handle_irq,
.disable_all = x86_pmu_disable_all,
.enable_all = x86_pmu_enable_all,
.enable = x86_pmu_enable_event,
.disable = x86_pmu_disable_event,
+ .hw_config = amd_pmu_hw_config,
+ .schedule_events = x86_schedule_events,
.eventsel = MSR_K7_EVNTSEL0,
.perfctr = MSR_K7_PERFCTR0,
.event_map = amd_pmu_event_map,
- .raw_event = amd_pmu_raw_event,
.max_events = ARRAY_SIZE(amd_perfmon_event_map),
- .num_events = 4,
- .event_bits = 48,
- .event_mask = (1ULL << 48) - 1,
+ .num_counters = 4,
+ .cntval_bits = 48,
+ .cntval_mask = (1ULL << 48) - 1,
.apic = 1,
/* use highest bit to detect overflow */
.max_period = (1ULL << 47) - 1,
diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c
index 9c794ac87837..214ac860ebe0 100644
--- a/arch/x86/kernel/cpu/perf_event_intel.c
+++ b/arch/x86/kernel/cpu/perf_event_intel.c
@@ -72,6 +72,7 @@ static struct event_constraint intel_westmere_event_constraints[] =
INTEL_EVENT_CONSTRAINT(0x51, 0x3), /* L1D */
INTEL_EVENT_CONSTRAINT(0x60, 0x1), /* OFFCORE_REQUESTS_OUTSTANDING */
INTEL_EVENT_CONSTRAINT(0x63, 0x3), /* CACHE_LOCK_CYCLES */
+ INTEL_EVENT_CONSTRAINT(0xb3, 0x1), /* SNOOPQ_REQUEST_OUTSTANDING */
EVENT_CONSTRAINT_END
};
@@ -88,7 +89,7 @@ static u64 intel_pmu_event_map(int hw_event)
return intel_perfmon_event_map[hw_event];
}
-static __initconst u64 westmere_hw_cache_event_ids
+static __initconst const u64 westmere_hw_cache_event_ids
[PERF_COUNT_HW_CACHE_MAX]
[PERF_COUNT_HW_CACHE_OP_MAX]
[PERF_COUNT_HW_CACHE_RESULT_MAX] =
@@ -179,7 +180,7 @@ static __initconst u64 westmere_hw_cache_event_ids
},
};
-static __initconst u64 nehalem_hw_cache_event_ids
+static __initconst const u64 nehalem_hw_cache_event_ids
[PERF_COUNT_HW_CACHE_MAX]
[PERF_COUNT_HW_CACHE_OP_MAX]
[PERF_COUNT_HW_CACHE_RESULT_MAX] =
@@ -270,7 +271,7 @@ static __initconst u64 nehalem_hw_cache_event_ids
},
};
-static __initconst u64 core2_hw_cache_event_ids
+static __initconst const u64 core2_hw_cache_event_ids
[PERF_COUNT_HW_CACHE_MAX]
[PERF_COUNT_HW_CACHE_OP_MAX]
[PERF_COUNT_HW_CACHE_RESULT_MAX] =
@@ -361,7 +362,7 @@ static __initconst u64 core2_hw_cache_event_ids
},
};
-static __initconst u64 atom_hw_cache_event_ids
+static __initconst const u64 atom_hw_cache_event_ids
[PERF_COUNT_HW_CACHE_MAX]
[PERF_COUNT_HW_CACHE_OP_MAX]
[PERF_COUNT_HW_CACHE_RESULT_MAX] =
@@ -452,60 +453,6 @@ static __initconst u64 atom_hw_cache_event_ids
},
};
-static u64 intel_pmu_raw_event(u64 hw_event)
-{
-#define CORE_EVNTSEL_EVENT_MASK 0x000000FFULL
-#define CORE_EVNTSEL_UNIT_MASK 0x0000FF00ULL
-#define CORE_EVNTSEL_EDGE_MASK 0x00040000ULL
-#define CORE_EVNTSEL_INV_MASK 0x00800000ULL
-#define CORE_EVNTSEL_REG_MASK 0xFF000000ULL
-
-#define CORE_EVNTSEL_MASK \
- (INTEL_ARCH_EVTSEL_MASK | \
- INTEL_ARCH_UNIT_MASK | \
- INTEL_ARCH_EDGE_MASK | \
- INTEL_ARCH_INV_MASK | \
- INTEL_ARCH_CNT_MASK)
-
- return hw_event & CORE_EVNTSEL_MASK;
-}
-
-static void intel_pmu_enable_bts(u64 config)
-{
- unsigned long debugctlmsr;
-
- debugctlmsr = get_debugctlmsr();
-
- debugctlmsr |= X86_DEBUGCTL_TR;
- debugctlmsr |= X86_DEBUGCTL_BTS;
- debugctlmsr |= X86_DEBUGCTL_BTINT;
-
- if (!(config & ARCH_PERFMON_EVENTSEL_OS))
- debugctlmsr |= X86_DEBUGCTL_BTS_OFF_OS;
-
- if (!(config & ARCH_PERFMON_EVENTSEL_USR))
- debugctlmsr |= X86_DEBUGCTL_BTS_OFF_USR;
-
- update_debugctlmsr(debugctlmsr);
-}
-
-static void intel_pmu_disable_bts(void)
-{
- struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
- unsigned long debugctlmsr;
-
- if (!cpuc->ds)
- return;
-
- debugctlmsr = get_debugctlmsr();
-
- debugctlmsr &=
- ~(X86_DEBUGCTL_TR | X86_DEBUGCTL_BTS | X86_DEBUGCTL_BTINT |
- X86_DEBUGCTL_BTS_OFF_OS | X86_DEBUGCTL_BTS_OFF_USR);
-
- update_debugctlmsr(debugctlmsr);
-}
-
static void intel_pmu_disable_all(void)
{
struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
@@ -514,12 +461,17 @@ static void intel_pmu_disable_all(void)
if (test_bit(X86_PMC_IDX_FIXED_BTS, cpuc->active_mask))
intel_pmu_disable_bts();
+
+ intel_pmu_pebs_disable_all();
+ intel_pmu_lbr_disable_all();
}
-static void intel_pmu_enable_all(void)
+static void intel_pmu_enable_all(int added)
{
struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
+ intel_pmu_pebs_enable_all();
+ intel_pmu_lbr_enable_all();
wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, x86_pmu.intel_ctrl);
if (test_bit(X86_PMC_IDX_FIXED_BTS, cpuc->active_mask)) {
@@ -533,6 +485,42 @@ static void intel_pmu_enable_all(void)
}
}
+/*
+ * Workaround for:
+ * Intel Errata AAK100 (model 26)
+ * Intel Errata AAP53 (model 30)
+ * Intel Errata BD53 (model 44)
+ *
+ * These chips need to be 'reset' when adding counters by programming
+ * the magic three (non counting) events 0x4300D2, 0x4300B1 and 0x4300B5
+ * either in sequence on the same PMC or on different PMCs.
+ */
+static void intel_pmu_nhm_enable_all(int added)
+{
+ if (added) {
+ struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
+ int i;
+
+ wrmsrl(MSR_ARCH_PERFMON_EVENTSEL0 + 0, 0x4300D2);
+ wrmsrl(MSR_ARCH_PERFMON_EVENTSEL0 + 1, 0x4300B1);
+ wrmsrl(MSR_ARCH_PERFMON_EVENTSEL0 + 2, 0x4300B5);
+
+ wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, 0x3);
+ wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, 0x0);
+
+ for (i = 0; i < 3; i++) {
+ struct perf_event *event = cpuc->events[i];
+
+ if (!event)
+ continue;
+
+ __x86_pmu_enable_event(&event->hw,
+ ARCH_PERFMON_EVENTSEL_ENABLE);
+ }
+ }
+ intel_pmu_enable_all(added);
+}
+
static inline u64 intel_pmu_get_status(void)
{
u64 status;
@@ -547,8 +535,7 @@ static inline void intel_pmu_ack_status(u64 ack)
wrmsrl(MSR_CORE_PERF_GLOBAL_OVF_CTRL, ack);
}
-static inline void
-intel_pmu_disable_fixed(struct hw_perf_event *hwc)
+static void intel_pmu_disable_fixed(struct hw_perf_event *hwc)
{
int idx = hwc->idx - X86_PMC_IDX_FIXED;
u64 ctrl_val, mask;
@@ -557,71 +544,10 @@ intel_pmu_disable_fixed(struct hw_perf_event *hwc)
rdmsrl(hwc->config_base, ctrl_val);
ctrl_val &= ~mask;
- (void)checking_wrmsrl(hwc->config_base, ctrl_val);
-}
-
-static void intel_pmu_drain_bts_buffer(void)
-{
- struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
- struct debug_store *ds = cpuc->ds;
- struct bts_record {
- u64 from;
- u64 to;
- u64 flags;
- };
- struct perf_event *event = cpuc->events[X86_PMC_IDX_FIXED_BTS];
- struct bts_record *at, *top;
- struct perf_output_handle handle;
- struct perf_event_header header;
- struct perf_sample_data data;
- struct pt_regs regs;
-
- if (!event)
- return;
-
- if (!ds)
- return;
-
- at = (struct bts_record *)(unsigned long)ds->bts_buffer_base;
- top = (struct bts_record *)(unsigned long)ds->bts_index;
-
- if (top <= at)
- return;
-
- ds->bts_index = ds->bts_buffer_base;
-
- perf_sample_data_init(&data, 0);
-
- data.period = event->hw.last_period;
- regs.ip = 0;
-
- /*
- * Prepare a generic sample, i.e. fill in the invariant fields.
- * We will overwrite the from and to address before we output
- * the sample.
- */
- perf_prepare_sample(&header, &data, event, &regs);
-
- if (perf_output_begin(&handle, event,
- header.size * (top - at), 1, 1))
- return;
-
- for (; at < top; at++) {
- data.ip = at->from;
- data.addr = at->to;
-
- perf_output_sample(&handle, &header, &data, event);
- }
-
- perf_output_end(&handle);
-
- /* There's new data available. */
- event->hw.interrupts++;
- event->pending_kill = POLL_IN;
+ wrmsrl(hwc->config_base, ctrl_val);
}
-static inline void
-intel_pmu_disable_event(struct perf_event *event)
+static void intel_pmu_disable_event(struct perf_event *event)
{
struct hw_perf_event *hwc = &event->hw;
@@ -637,14 +563,15 @@ intel_pmu_disable_event(struct perf_event *event)
}
x86_pmu_disable_event(event);
+
+ if (unlikely(event->attr.precise_ip))
+ intel_pmu_pebs_disable(event);
}
-static inline void
-intel_pmu_enable_fixed(struct hw_perf_event *hwc)
+static void intel_pmu_enable_fixed(struct hw_perf_event *hwc)
{
int idx = hwc->idx - X86_PMC_IDX_FIXED;
u64 ctrl_val, bits, mask;
- int err;
/*
* Enable IRQ generation (0x8),
@@ -669,7 +596,7 @@ intel_pmu_enable_fixed(struct hw_perf_event *hwc)
rdmsrl(hwc->config_base, ctrl_val);
ctrl_val &= ~mask;
ctrl_val |= bits;
- err = checking_wrmsrl(hwc->config_base, ctrl_val);
+ wrmsrl(hwc->config_base, ctrl_val);
}
static void intel_pmu_enable_event(struct perf_event *event)
@@ -689,7 +616,10 @@ static void intel_pmu_enable_event(struct perf_event *event)
return;
}
- __x86_pmu_enable_event(hwc);
+ if (unlikely(event->attr.precise_ip))
+ intel_pmu_pebs_enable(event);
+
+ __x86_pmu_enable_event(hwc, ARCH_PERFMON_EVENTSEL_ENABLE);
}
/*
@@ -708,20 +638,20 @@ static void intel_pmu_reset(void)
unsigned long flags;
int idx;
- if (!x86_pmu.num_events)
+ if (!x86_pmu.num_counters)
return;
local_irq_save(flags);
printk("clearing PMU state on CPU#%d\n", smp_processor_id());
- for (idx = 0; idx < x86_pmu.num_events; idx++) {
+ for (idx = 0; idx < x86_pmu.num_counters; idx++) {
checking_wrmsrl(x86_pmu.eventsel + idx, 0ull);
checking_wrmsrl(x86_pmu.perfctr + idx, 0ull);
}
- for (idx = 0; idx < x86_pmu.num_events_fixed; idx++) {
+ for (idx = 0; idx < x86_pmu.num_counters_fixed; idx++)
checking_wrmsrl(MSR_ARCH_PERFMON_FIXED_CTR0 + idx, 0ull);
- }
+
if (ds)
ds->bts_index = ds->bts_buffer_base;
@@ -747,7 +677,7 @@ static int intel_pmu_handle_irq(struct pt_regs *regs)
intel_pmu_drain_bts_buffer();
status = intel_pmu_get_status();
if (!status) {
- intel_pmu_enable_all();
+ intel_pmu_enable_all(0);
return 0;
}
@@ -762,6 +692,15 @@ again:
inc_irq_stat(apic_perf_irqs);
ack = status;
+
+ intel_pmu_lbr_read();
+
+ /*
+ * PEBS overflow sets bit 62 in the global status register
+ */
+ if (__test_and_clear_bit(62, (unsigned long *)&status))
+ x86_pmu.drain_pebs(regs);
+
for_each_set_bit(bit, (unsigned long *)&status, X86_PMC_IDX_MAX) {
struct perf_event *event = cpuc->events[bit];
@@ -787,26 +726,22 @@ again:
goto again;
done:
- intel_pmu_enable_all();
+ intel_pmu_enable_all(0);
return 1;
}
-static struct event_constraint bts_constraint =
- EVENT_CONSTRAINT(0, 1ULL << X86_PMC_IDX_FIXED_BTS, 0);
-
static struct event_constraint *
-intel_special_constraints(struct perf_event *event)
+intel_bts_constraints(struct perf_event *event)
{
- unsigned int hw_event;
-
- hw_event = event->hw.config & INTEL_ARCH_EVENT_MASK;
+ struct hw_perf_event *hwc = &event->hw;
+ unsigned int hw_event, bts_event;
- if (unlikely((hw_event ==
- x86_pmu.event_map(PERF_COUNT_HW_BRANCH_INSTRUCTIONS)) &&
- (event->hw.sample_period == 1))) {
+ hw_event = hwc->config & INTEL_ARCH_EVENT_MASK;
+ bts_event = x86_pmu.event_map(PERF_COUNT_HW_BRANCH_INSTRUCTIONS);
+ if (unlikely(hw_event == bts_event && hwc->sample_period == 1))
return &bts_constraint;
- }
+
return NULL;
}
@@ -815,24 +750,53 @@ intel_get_event_constraints(struct cpu_hw_events *cpuc, struct perf_event *event
{
struct event_constraint *c;
- c = intel_special_constraints(event);
+ c = intel_bts_constraints(event);
+ if (c)
+ return c;
+
+ c = intel_pebs_constraints(event);
if (c)
return c;
return x86_get_event_constraints(cpuc, event);
}
-static __initconst struct x86_pmu core_pmu = {
+static int intel_pmu_hw_config(struct perf_event *event)
+{
+ int ret = x86_pmu_hw_config(event);
+
+ if (ret)
+ return ret;
+
+ if (event->attr.type != PERF_TYPE_RAW)
+ return 0;
+
+ if (!(event->attr.config & ARCH_PERFMON_EVENTSEL_ANY))
+ return 0;
+
+ if (x86_pmu.version < 3)
+ return -EINVAL;
+
+ if (perf_paranoid_cpu() && !capable(CAP_SYS_ADMIN))
+ return -EACCES;
+
+ event->hw.config |= ARCH_PERFMON_EVENTSEL_ANY;
+
+ return 0;
+}
+
+static __initconst const struct x86_pmu core_pmu = {
.name = "core",
.handle_irq = x86_pmu_handle_irq,
.disable_all = x86_pmu_disable_all,
.enable_all = x86_pmu_enable_all,
.enable = x86_pmu_enable_event,
.disable = x86_pmu_disable_event,
+ .hw_config = x86_pmu_hw_config,
+ .schedule_events = x86_schedule_events,
.eventsel = MSR_ARCH_PERFMON_EVENTSEL0,
.perfctr = MSR_ARCH_PERFMON_PERFCTR0,
.event_map = intel_pmu_event_map,
- .raw_event = intel_pmu_raw_event,
.max_events = ARRAY_SIZE(intel_perfmon_event_map),
.apic = 1,
/*
@@ -845,17 +809,32 @@ static __initconst struct x86_pmu core_pmu = {
.event_constraints = intel_core_event_constraints,
};
-static __initconst struct x86_pmu intel_pmu = {
+static void intel_pmu_cpu_starting(int cpu)
+{
+ init_debug_store_on_cpu(cpu);
+ /*
+ * Deal with CPUs that don't clear their LBRs on power-up.
+ */
+ intel_pmu_lbr_reset();
+}
+
+static void intel_pmu_cpu_dying(int cpu)
+{
+ fini_debug_store_on_cpu(cpu);
+}
+
+static __initconst const struct x86_pmu intel_pmu = {
.name = "Intel",
.handle_irq = intel_pmu_handle_irq,
.disable_all = intel_pmu_disable_all,
.enable_all = intel_pmu_enable_all,
.enable = intel_pmu_enable_event,
.disable = intel_pmu_disable_event,
+ .hw_config = intel_pmu_hw_config,
+ .schedule_events = x86_schedule_events,
.eventsel = MSR_ARCH_PERFMON_EVENTSEL0,
.perfctr = MSR_ARCH_PERFMON_PERFCTR0,
.event_map = intel_pmu_event_map,
- .raw_event = intel_pmu_raw_event,
.max_events = ARRAY_SIZE(intel_perfmon_event_map),
.apic = 1,
/*
@@ -864,14 +843,38 @@ static __initconst struct x86_pmu intel_pmu = {
* the generic event period:
*/
.max_period = (1ULL << 31) - 1,
- .enable_bts = intel_pmu_enable_bts,
- .disable_bts = intel_pmu_disable_bts,
.get_event_constraints = intel_get_event_constraints,
- .cpu_starting = init_debug_store_on_cpu,
- .cpu_dying = fini_debug_store_on_cpu,
+ .cpu_starting = intel_pmu_cpu_starting,
+ .cpu_dying = intel_pmu_cpu_dying,
};
+static void intel_clovertown_quirks(void)
+{
+ /*
+ * PEBS is unreliable due to:
+ *
+ * AJ67 - PEBS may experience CPL leaks
+ * AJ68 - PEBS PMI may be delayed by one event
+ * AJ69 - GLOBAL_STATUS[62] will only be set when DEBUGCTL[12]
+ * AJ106 - FREEZE_LBRS_ON_PMI doesn't work in combination with PEBS
+ *
+ * AJ67 could be worked around by restricting the OS/USR flags.
+ * AJ69 could be worked around by setting PMU_FREEZE_ON_PMI.
+ *
+ * AJ106 could possibly be worked around by not allowing LBR
+ * usage from PEBS, including the fixup.
+ * AJ68 could possibly be worked around by always programming
+ * a pebs_event_reset[0] value and coping with the lost events.
+ *
+ * But taken together it might just make sense to not enable PEBS on
+ * these chips.
+ */
+ printk(KERN_WARNING "PEBS disabled due to CPU errata.\n");
+ x86_pmu.pebs = 0;
+ x86_pmu.pebs_constraints = NULL;
+}
+
static __init int intel_pmu_init(void)
{
union cpuid10_edx edx;
@@ -881,12 +884,13 @@ static __init int intel_pmu_init(void)
int version;
if (!cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON)) {
- /* check for P6 processor family */
- if (boot_cpu_data.x86 == 6) {
- return p6_pmu_init();
- } else {
+ switch (boot_cpu_data.x86) {
+ case 0x6:
+ return p6_pmu_init();
+ case 0xf:
+ return p4_pmu_init();
+ }
return -ENODEV;
- }
}
/*
@@ -904,16 +908,28 @@ static __init int intel_pmu_init(void)
x86_pmu = intel_pmu;
x86_pmu.version = version;
- x86_pmu.num_events = eax.split.num_events;
- x86_pmu.event_bits = eax.split.bit_width;
- x86_pmu.event_mask = (1ULL << eax.split.bit_width) - 1;
+ x86_pmu.num_counters = eax.split.num_counters;
+ x86_pmu.cntval_bits = eax.split.bit_width;
+ x86_pmu.cntval_mask = (1ULL << eax.split.bit_width) - 1;
/*
* Quirk: v2 perfmon does not report fixed-purpose events, so
* assume at least 3 events:
*/
if (version > 1)
- x86_pmu.num_events_fixed = max((int)edx.split.num_events_fixed, 3);
+ x86_pmu.num_counters_fixed = max((int)edx.split.num_counters_fixed, 3);
+
+ /*
+ * v2 and above have a perf capabilities MSR
+ */
+ if (version > 1) {
+ u64 capabilities;
+
+ rdmsrl(MSR_IA32_PERF_CAPABILITIES, capabilities);
+ x86_pmu.intel_cap.capabilities = capabilities;
+ }
+
+ intel_ds_init();
/*
* Install the hw-cache-events table:
@@ -924,12 +940,15 @@ static __init int intel_pmu_init(void)
break;
case 15: /* original 65 nm celeron/pentium/core2/xeon, "Merom"/"Conroe" */
+ x86_pmu.quirks = intel_clovertown_quirks;
case 22: /* single-core 65 nm celeron/core2solo "Merom-L"/"Conroe-L" */
case 23: /* current 45 nm celeron/core2/xeon "Penryn"/"Wolfdale" */
case 29: /* six-core 45 nm xeon "Dunnington" */
memcpy(hw_cache_event_ids, core2_hw_cache_event_ids,
sizeof(hw_cache_event_ids));
+ intel_pmu_lbr_init_core();
+
x86_pmu.event_constraints = intel_core2_event_constraints;
pr_cont("Core2 events, ");
break;
@@ -940,13 +959,19 @@ static __init int intel_pmu_init(void)
memcpy(hw_cache_event_ids, nehalem_hw_cache_event_ids,
sizeof(hw_cache_event_ids));
+ intel_pmu_lbr_init_nhm();
+
x86_pmu.event_constraints = intel_nehalem_event_constraints;
- pr_cont("Nehalem/Corei7 events, ");
+ x86_pmu.enable_all = intel_pmu_nhm_enable_all;
+ pr_cont("Nehalem events, ");
break;
+
case 28: /* Atom */
memcpy(hw_cache_event_ids, atom_hw_cache_event_ids,
sizeof(hw_cache_event_ids));
+ intel_pmu_lbr_init_atom();
+
x86_pmu.event_constraints = intel_gen_event_constraints;
pr_cont("Atom events, ");
break;
@@ -956,7 +981,10 @@ static __init int intel_pmu_init(void)
memcpy(hw_cache_event_ids, westmere_hw_cache_event_ids,
sizeof(hw_cache_event_ids));
+ intel_pmu_lbr_init_nhm();
+
x86_pmu.event_constraints = intel_westmere_event_constraints;
+ x86_pmu.enable_all = intel_pmu_nhm_enable_all;
pr_cont("Westmere events, ");
break;
diff --git a/arch/x86/kernel/cpu/perf_event_intel_ds.c b/arch/x86/kernel/cpu/perf_event_intel_ds.c
new file mode 100644
index 000000000000..18018d1311cd
--- /dev/null
+++ b/arch/x86/kernel/cpu/perf_event_intel_ds.c
@@ -0,0 +1,641 @@
+#ifdef CONFIG_CPU_SUP_INTEL
+
+/* The maximal number of PEBS events: */
+#define MAX_PEBS_EVENTS 4
+
+/* The size of a BTS record in bytes: */
+#define BTS_RECORD_SIZE 24
+
+#define BTS_BUFFER_SIZE (PAGE_SIZE << 4)
+#define PEBS_BUFFER_SIZE PAGE_SIZE
+
+/*
+ * pebs_record_32 for p4 and core not supported
+
+struct pebs_record_32 {
+ u32 flags, ip;
+ u32 ax, bc, cx, dx;
+ u32 si, di, bp, sp;
+};
+
+ */
+
+struct pebs_record_core {
+ u64 flags, ip;
+ u64 ax, bx, cx, dx;
+ u64 si, di, bp, sp;
+ u64 r8, r9, r10, r11;
+ u64 r12, r13, r14, r15;
+};
+
+struct pebs_record_nhm {
+ u64 flags, ip;
+ u64 ax, bx, cx, dx;
+ u64 si, di, bp, sp;
+ u64 r8, r9, r10, r11;
+ u64 r12, r13, r14, r15;
+ u64 status, dla, dse, lat;
+};
+
+/*
+ * A debug store configuration.
+ *
+ * We only support architectures that use 64bit fields.
+ */
+struct debug_store {
+ u64 bts_buffer_base;
+ u64 bts_index;
+ u64 bts_absolute_maximum;
+ u64 bts_interrupt_threshold;
+ u64 pebs_buffer_base;
+ u64 pebs_index;
+ u64 pebs_absolute_maximum;
+ u64 pebs_interrupt_threshold;
+ u64 pebs_event_reset[MAX_PEBS_EVENTS];
+};
+
+static void init_debug_store_on_cpu(int cpu)
+{
+ struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds;
+
+ if (!ds)
+ return;
+
+ wrmsr_on_cpu(cpu, MSR_IA32_DS_AREA,
+ (u32)((u64)(unsigned long)ds),
+ (u32)((u64)(unsigned long)ds >> 32));
+}
+
+static void fini_debug_store_on_cpu(int cpu)
+{
+ if (!per_cpu(cpu_hw_events, cpu).ds)
+ return;
+
+ wrmsr_on_cpu(cpu, MSR_IA32_DS_AREA, 0, 0);
+}
+
+static void release_ds_buffers(void)
+{
+ int cpu;
+
+ if (!x86_pmu.bts && !x86_pmu.pebs)
+ return;
+
+ get_online_cpus();
+
+ for_each_online_cpu(cpu)
+ fini_debug_store_on_cpu(cpu);
+
+ for_each_possible_cpu(cpu) {
+ struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds;
+
+ if (!ds)
+ continue;
+
+ per_cpu(cpu_hw_events, cpu).ds = NULL;
+
+ kfree((void *)(unsigned long)ds->pebs_buffer_base);
+ kfree((void *)(unsigned long)ds->bts_buffer_base);
+ kfree(ds);
+ }
+
+ put_online_cpus();
+}
+
+static int reserve_ds_buffers(void)
+{
+ int cpu, err = 0;
+
+ if (!x86_pmu.bts && !x86_pmu.pebs)
+ return 0;
+
+ get_online_cpus();
+
+ for_each_possible_cpu(cpu) {
+ struct debug_store *ds;
+ void *buffer;
+ int max, thresh;
+
+ err = -ENOMEM;
+ ds = kzalloc(sizeof(*ds), GFP_KERNEL);
+ if (unlikely(!ds))
+ break;
+ per_cpu(cpu_hw_events, cpu).ds = ds;
+
+ if (x86_pmu.bts) {
+ buffer = kzalloc(BTS_BUFFER_SIZE, GFP_KERNEL);
+ if (unlikely(!buffer))
+ break;
+
+ max = BTS_BUFFER_SIZE / BTS_RECORD_SIZE;
+ thresh = max / 16;
+
+ ds->bts_buffer_base = (u64)(unsigned long)buffer;
+ ds->bts_index = ds->bts_buffer_base;
+ ds->bts_absolute_maximum = ds->bts_buffer_base +
+ max * BTS_RECORD_SIZE;
+ ds->bts_interrupt_threshold = ds->bts_absolute_maximum -
+ thresh * BTS_RECORD_SIZE;
+ }
+
+ if (x86_pmu.pebs) {
+ buffer = kzalloc(PEBS_BUFFER_SIZE, GFP_KERNEL);
+ if (unlikely(!buffer))
+ break;
+
+ max = PEBS_BUFFER_SIZE / x86_pmu.pebs_record_size;
+
+ ds->pebs_buffer_base = (u64)(unsigned long)buffer;
+ ds->pebs_index = ds->pebs_buffer_base;
+ ds->pebs_absolute_maximum = ds->pebs_buffer_base +
+ max * x86_pmu.pebs_record_size;
+ /*
+ * Always use single record PEBS
+ */
+ ds->pebs_interrupt_threshold = ds->pebs_buffer_base +
+ x86_pmu.pebs_record_size;
+ }
+
+ err = 0;
+ }
+
+ if (err)
+ release_ds_buffers();
+ else {
+ for_each_online_cpu(cpu)
+ init_debug_store_on_cpu(cpu);
+ }
+
+ put_online_cpus();
+
+ return err;
+}
+
+/*
+ * BTS
+ */
+
+static struct event_constraint bts_constraint =
+ EVENT_CONSTRAINT(0, 1ULL << X86_PMC_IDX_FIXED_BTS, 0);
+
+static void intel_pmu_enable_bts(u64 config)
+{
+ unsigned long debugctlmsr;
+
+ debugctlmsr = get_debugctlmsr();
+
+ debugctlmsr |= DEBUGCTLMSR_TR;
+ debugctlmsr |= DEBUGCTLMSR_BTS;
+ debugctlmsr |= DEBUGCTLMSR_BTINT;
+
+ if (!(config & ARCH_PERFMON_EVENTSEL_OS))
+ debugctlmsr |= DEBUGCTLMSR_BTS_OFF_OS;
+
+ if (!(config & ARCH_PERFMON_EVENTSEL_USR))
+ debugctlmsr |= DEBUGCTLMSR_BTS_OFF_USR;
+
+ update_debugctlmsr(debugctlmsr);
+}
+
+static void intel_pmu_disable_bts(void)
+{
+ struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
+ unsigned long debugctlmsr;
+
+ if (!cpuc->ds)
+ return;
+
+ debugctlmsr = get_debugctlmsr();
+
+ debugctlmsr &=
+ ~(DEBUGCTLMSR_TR | DEBUGCTLMSR_BTS | DEBUGCTLMSR_BTINT |
+ DEBUGCTLMSR_BTS_OFF_OS | DEBUGCTLMSR_BTS_OFF_USR);
+
+ update_debugctlmsr(debugctlmsr);
+}
+
+static void intel_pmu_drain_bts_buffer(void)
+{
+ struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
+ struct debug_store *ds = cpuc->ds;
+ struct bts_record {
+ u64 from;
+ u64 to;
+ u64 flags;
+ };
+ struct perf_event *event = cpuc->events[X86_PMC_IDX_FIXED_BTS];
+ struct bts_record *at, *top;
+ struct perf_output_handle handle;
+ struct perf_event_header header;
+ struct perf_sample_data data;
+ struct pt_regs regs;
+
+ if (!event)
+ return;
+
+ if (!ds)
+ return;
+
+ at = (struct bts_record *)(unsigned long)ds->bts_buffer_base;
+ top = (struct bts_record *)(unsigned long)ds->bts_index;
+
+ if (top <= at)
+ return;
+
+ ds->bts_index = ds->bts_buffer_base;
+
+ perf_sample_data_init(&data, 0);
+ data.period = event->hw.last_period;
+ regs.ip = 0;
+
+ /*
+ * Prepare a generic sample, i.e. fill in the invariant fields.
+ * We will overwrite the from and to address before we output
+ * the sample.
+ */
+ perf_prepare_sample(&header, &data, event, &regs);
+
+ if (perf_output_begin(&handle, event, header.size * (top - at), 1, 1))
+ return;
+
+ for (; at < top; at++) {
+ data.ip = at->from;
+ data.addr = at->to;
+
+ perf_output_sample(&handle, &header, &data, event);
+ }
+
+ perf_output_end(&handle);
+
+ /* There's new data available. */
+ event->hw.interrupts++;
+ event->pending_kill = POLL_IN;
+}
+
+/*
+ * PEBS
+ */
+
+static struct event_constraint intel_core_pebs_events[] = {
+ PEBS_EVENT_CONSTRAINT(0x00c0, 0x1), /* INSTR_RETIRED.ANY */
+ PEBS_EVENT_CONSTRAINT(0xfec1, 0x1), /* X87_OPS_RETIRED.ANY */
+ PEBS_EVENT_CONSTRAINT(0x00c5, 0x1), /* BR_INST_RETIRED.MISPRED */
+ PEBS_EVENT_CONSTRAINT(0x1fc7, 0x1), /* SIMD_INST_RETURED.ANY */
+ PEBS_EVENT_CONSTRAINT(0x01cb, 0x1), /* MEM_LOAD_RETIRED.L1D_MISS */
+ PEBS_EVENT_CONSTRAINT(0x02cb, 0x1), /* MEM_LOAD_RETIRED.L1D_LINE_MISS */
+ PEBS_EVENT_CONSTRAINT(0x04cb, 0x1), /* MEM_LOAD_RETIRED.L2_MISS */
+ PEBS_EVENT_CONSTRAINT(0x08cb, 0x1), /* MEM_LOAD_RETIRED.L2_LINE_MISS */
+ PEBS_EVENT_CONSTRAINT(0x10cb, 0x1), /* MEM_LOAD_RETIRED.DTLB_MISS */
+ EVENT_CONSTRAINT_END
+};
+
+static struct event_constraint intel_nehalem_pebs_events[] = {
+ PEBS_EVENT_CONSTRAINT(0x00c0, 0xf), /* INSTR_RETIRED.ANY */
+ PEBS_EVENT_CONSTRAINT(0xfec1, 0xf), /* X87_OPS_RETIRED.ANY */
+ PEBS_EVENT_CONSTRAINT(0x00c5, 0xf), /* BR_INST_RETIRED.MISPRED */
+ PEBS_EVENT_CONSTRAINT(0x1fc7, 0xf), /* SIMD_INST_RETURED.ANY */
+ PEBS_EVENT_CONSTRAINT(0x01cb, 0xf), /* MEM_LOAD_RETIRED.L1D_MISS */
+ PEBS_EVENT_CONSTRAINT(0x02cb, 0xf), /* MEM_LOAD_RETIRED.L1D_LINE_MISS */
+ PEBS_EVENT_CONSTRAINT(0x04cb, 0xf), /* MEM_LOAD_RETIRED.L2_MISS */
+ PEBS_EVENT_CONSTRAINT(0x08cb, 0xf), /* MEM_LOAD_RETIRED.L2_LINE_MISS */
+ PEBS_EVENT_CONSTRAINT(0x10cb, 0xf), /* MEM_LOAD_RETIRED.DTLB_MISS */
+ EVENT_CONSTRAINT_END
+};
+
+static struct event_constraint *
+intel_pebs_constraints(struct perf_event *event)
+{
+ struct event_constraint *c;
+
+ if (!event->attr.precise_ip)
+ return NULL;
+
+ if (x86_pmu.pebs_constraints) {
+ for_each_event_constraint(c, x86_pmu.pebs_constraints) {
+ if ((event->hw.config & c->cmask) == c->code)
+ return c;
+ }
+ }
+
+ return &emptyconstraint;
+}
+
+static void intel_pmu_pebs_enable(struct perf_event *event)
+{
+ struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
+ struct hw_perf_event *hwc = &event->hw;
+
+ hwc->config &= ~ARCH_PERFMON_EVENTSEL_INT;
+
+ cpuc->pebs_enabled |= 1ULL << hwc->idx;
+ WARN_ON_ONCE(cpuc->enabled);
+
+ if (x86_pmu.intel_cap.pebs_trap && event->attr.precise_ip > 1)
+ intel_pmu_lbr_enable(event);
+}
+
+static void intel_pmu_pebs_disable(struct perf_event *event)
+{
+ struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
+ struct hw_perf_event *hwc = &event->hw;
+
+ cpuc->pebs_enabled &= ~(1ULL << hwc->idx);
+ if (cpuc->enabled)
+ wrmsrl(MSR_IA32_PEBS_ENABLE, cpuc->pebs_enabled);
+
+ hwc->config |= ARCH_PERFMON_EVENTSEL_INT;
+
+ if (x86_pmu.intel_cap.pebs_trap && event->attr.precise_ip > 1)
+ intel_pmu_lbr_disable(event);
+}
+
+static void intel_pmu_pebs_enable_all(void)
+{
+ struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
+
+ if (cpuc->pebs_enabled)
+ wrmsrl(MSR_IA32_PEBS_ENABLE, cpuc->pebs_enabled);
+}
+
+static void intel_pmu_pebs_disable_all(void)
+{
+ struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
+
+ if (cpuc->pebs_enabled)
+ wrmsrl(MSR_IA32_PEBS_ENABLE, 0);
+}
+
+#include <asm/insn.h>
+
+static inline bool kernel_ip(unsigned long ip)
+{
+#ifdef CONFIG_X86_32
+ return ip > PAGE_OFFSET;
+#else
+ return (long)ip < 0;
+#endif
+}
+
+static int intel_pmu_pebs_fixup_ip(struct pt_regs *regs)
+{
+ struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
+ unsigned long from = cpuc->lbr_entries[0].from;
+ unsigned long old_to, to = cpuc->lbr_entries[0].to;
+ unsigned long ip = regs->ip;
+
+ /*
+ * We don't need to fixup if the PEBS assist is fault like
+ */
+ if (!x86_pmu.intel_cap.pebs_trap)
+ return 1;
+
+ /*
+ * No LBR entry, no basic block, no rewinding
+ */
+ if (!cpuc->lbr_stack.nr || !from || !to)
+ return 0;
+
+ /*
+ * Basic blocks should never cross user/kernel boundaries
+ */
+ if (kernel_ip(ip) != kernel_ip(to))
+ return 0;
+
+ /*
+ * unsigned math, either ip is before the start (impossible) or
+ * the basic block is larger than 1 page (sanity)
+ */
+ if ((ip - to) > PAGE_SIZE)
+ return 0;
+
+ /*
+ * We sampled a branch insn, rewind using the LBR stack
+ */
+ if (ip == to) {
+ regs->ip = from;
+ return 1;
+ }
+
+ do {
+ struct insn insn;
+ u8 buf[MAX_INSN_SIZE];
+ void *kaddr;
+
+ old_to = to;
+ if (!kernel_ip(ip)) {
+ int bytes, size = MAX_INSN_SIZE;
+
+ bytes = copy_from_user_nmi(buf, (void __user *)to, size);
+ if (bytes != size)
+ return 0;
+
+ kaddr = buf;
+ } else
+ kaddr = (void *)to;
+
+ kernel_insn_init(&insn, kaddr);
+ insn_get_length(&insn);
+ to += insn.length;
+ } while (to < ip);
+
+ if (to == ip) {
+ regs->ip = old_to;
+ return 1;
+ }
+
+ /*
+ * Even though we decoded the basic block, the instruction stream
+ * never matched the given IP, either the TO or the IP got corrupted.
+ */
+ return 0;
+}
+
+static int intel_pmu_save_and_restart(struct perf_event *event);
+
+static void __intel_pmu_pebs_event(struct perf_event *event,
+ struct pt_regs *iregs, void *__pebs)
+{
+ /*
+ * We cast to pebs_record_core since that is a subset of
+ * both formats and we don't use the other fields in this
+ * routine.
+ */
+ struct pebs_record_core *pebs = __pebs;
+ struct perf_sample_data data;
+ struct pt_regs regs;
+
+ if (!intel_pmu_save_and_restart(event))
+ return;
+
+ perf_sample_data_init(&data, 0);
+ data.period = event->hw.last_period;
+
+ /*
+ * We use the interrupt regs as a base because the PEBS record
+ * does not contain a full regs set, specifically it seems to
+ * lack segment descriptors, which get used by things like
+ * user_mode().
+ *
+ * In the simple case fix up only the IP and BP,SP regs, for
+ * PERF_SAMPLE_IP and PERF_SAMPLE_CALLCHAIN to function properly.
+ * A possible PERF_SAMPLE_REGS will have to transfer all regs.
+ */
+ regs = *iregs;
+ regs.ip = pebs->ip;
+ regs.bp = pebs->bp;
+ regs.sp = pebs->sp;
+
+ if (event->attr.precise_ip > 1 && intel_pmu_pebs_fixup_ip(&regs))
+ regs.flags |= PERF_EFLAGS_EXACT;
+ else
+ regs.flags &= ~PERF_EFLAGS_EXACT;
+
+ if (perf_event_overflow(event, 1, &data, &regs))
+ x86_pmu_stop(event);
+}
+
+static void intel_pmu_drain_pebs_core(struct pt_regs *iregs)
+{
+ struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
+ struct debug_store *ds = cpuc->ds;
+ struct perf_event *event = cpuc->events[0]; /* PMC0 only */
+ struct pebs_record_core *at, *top;
+ int n;
+
+ if (!ds || !x86_pmu.pebs)
+ return;
+
+ at = (struct pebs_record_core *)(unsigned long)ds->pebs_buffer_base;
+ top = (struct pebs_record_core *)(unsigned long)ds->pebs_index;
+
+ /*
+ * Whatever else happens, drain the thing
+ */
+ ds->pebs_index = ds->pebs_buffer_base;
+
+ if (!test_bit(0, cpuc->active_mask))
+ return;
+
+ WARN_ON_ONCE(!event);
+
+ if (!event->attr.precise_ip)
+ return;
+
+ n = top - at;
+ if (n <= 0)
+ return;
+
+ /*
+ * Should not happen, we program the threshold at 1 and do not
+ * set a reset value.
+ */
+ WARN_ON_ONCE(n > 1);
+ at += n - 1;
+
+ __intel_pmu_pebs_event(event, iregs, at);
+}
+
+static void intel_pmu_drain_pebs_nhm(struct pt_regs *iregs)
+{
+ struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
+ struct debug_store *ds = cpuc->ds;
+ struct pebs_record_nhm *at, *top;
+ struct perf_event *event = NULL;
+ u64 status = 0;
+ int bit, n;
+
+ if (!ds || !x86_pmu.pebs)
+ return;
+
+ at = (struct pebs_record_nhm *)(unsigned long)ds->pebs_buffer_base;
+ top = (struct pebs_record_nhm *)(unsigned long)ds->pebs_index;
+
+ ds->pebs_index = ds->pebs_buffer_base;
+
+ n = top - at;
+ if (n <= 0)
+ return;
+
+ /*
+ * Should not happen, we program the threshold at 1 and do not
+ * set a reset value.
+ */
+ WARN_ON_ONCE(n > MAX_PEBS_EVENTS);
+
+ for ( ; at < top; at++) {
+ for_each_set_bit(bit, (unsigned long *)&at->status, MAX_PEBS_EVENTS) {
+ event = cpuc->events[bit];
+ if (!test_bit(bit, cpuc->active_mask))
+ continue;
+
+ WARN_ON_ONCE(!event);
+
+ if (!event->attr.precise_ip)
+ continue;
+
+ if (__test_and_set_bit(bit, (unsigned long *)&status))
+ continue;
+
+ break;
+ }
+
+ if (!event || bit >= MAX_PEBS_EVENTS)
+ continue;
+
+ __intel_pmu_pebs_event(event, iregs, at);
+ }
+}
+
+/*
+ * BTS, PEBS probe and setup
+ */
+
+static void intel_ds_init(void)
+{
+ /*
+ * No support for 32bit formats
+ */
+ if (!boot_cpu_has(X86_FEATURE_DTES64))
+ return;
+
+ x86_pmu.bts = boot_cpu_has(X86_FEATURE_BTS);
+ x86_pmu.pebs = boot_cpu_has(X86_FEATURE_PEBS);
+ if (x86_pmu.pebs) {
+ char pebs_type = x86_pmu.intel_cap.pebs_trap ? '+' : '-';
+ int format = x86_pmu.intel_cap.pebs_format;
+
+ switch (format) {
+ case 0:
+ printk(KERN_CONT "PEBS fmt0%c, ", pebs_type);
+ x86_pmu.pebs_record_size = sizeof(struct pebs_record_core);
+ x86_pmu.drain_pebs = intel_pmu_drain_pebs_core;
+ x86_pmu.pebs_constraints = intel_core_pebs_events;
+ break;
+
+ case 1:
+ printk(KERN_CONT "PEBS fmt1%c, ", pebs_type);
+ x86_pmu.pebs_record_size = sizeof(struct pebs_record_nhm);
+ x86_pmu.drain_pebs = intel_pmu_drain_pebs_nhm;
+ x86_pmu.pebs_constraints = intel_nehalem_pebs_events;
+ break;
+
+ default:
+ printk(KERN_CONT "no PEBS fmt%d%c, ", format, pebs_type);
+ x86_pmu.pebs = 0;
+ break;
+ }
+ }
+}
+
+#else /* CONFIG_CPU_SUP_INTEL */
+
+static int reserve_ds_buffers(void)
+{
+ return 0;
+}
+
+static void release_ds_buffers(void)
+{
+}
+
+#endif /* CONFIG_CPU_SUP_INTEL */
diff --git a/arch/x86/kernel/cpu/perf_event_intel_lbr.c b/arch/x86/kernel/cpu/perf_event_intel_lbr.c
new file mode 100644
index 000000000000..d202c1bece1a
--- /dev/null
+++ b/arch/x86/kernel/cpu/perf_event_intel_lbr.c
@@ -0,0 +1,218 @@
+#ifdef CONFIG_CPU_SUP_INTEL
+
+enum {
+ LBR_FORMAT_32 = 0x00,
+ LBR_FORMAT_LIP = 0x01,
+ LBR_FORMAT_EIP = 0x02,
+ LBR_FORMAT_EIP_FLAGS = 0x03,
+};
+
+/*
+ * We only support LBR implementations that have FREEZE_LBRS_ON_PMI
+ * otherwise it becomes near impossible to get a reliable stack.
+ */
+
+static void __intel_pmu_lbr_enable(void)
+{
+ u64 debugctl;
+
+ rdmsrl(MSR_IA32_DEBUGCTLMSR, debugctl);
+ debugctl |= (DEBUGCTLMSR_LBR | DEBUGCTLMSR_FREEZE_LBRS_ON_PMI);
+ wrmsrl(MSR_IA32_DEBUGCTLMSR, debugctl);
+}
+
+static void __intel_pmu_lbr_disable(void)
+{
+ u64 debugctl;
+
+ rdmsrl(MSR_IA32_DEBUGCTLMSR, debugctl);
+ debugctl &= ~(DEBUGCTLMSR_LBR | DEBUGCTLMSR_FREEZE_LBRS_ON_PMI);
+ wrmsrl(MSR_IA32_DEBUGCTLMSR, debugctl);
+}
+
+static void intel_pmu_lbr_reset_32(void)
+{
+ int i;
+
+ for (i = 0; i < x86_pmu.lbr_nr; i++)
+ wrmsrl(x86_pmu.lbr_from + i, 0);
+}
+
+static void intel_pmu_lbr_reset_64(void)
+{
+ int i;
+
+ for (i = 0; i < x86_pmu.lbr_nr; i++) {
+ wrmsrl(x86_pmu.lbr_from + i, 0);
+ wrmsrl(x86_pmu.lbr_to + i, 0);
+ }
+}
+
+static void intel_pmu_lbr_reset(void)
+{
+ if (!x86_pmu.lbr_nr)
+ return;
+
+ if (x86_pmu.intel_cap.lbr_format == LBR_FORMAT_32)
+ intel_pmu_lbr_reset_32();
+ else
+ intel_pmu_lbr_reset_64();
+}
+
+static void intel_pmu_lbr_enable(struct perf_event *event)
+{
+ struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
+
+ if (!x86_pmu.lbr_nr)
+ return;
+
+ WARN_ON_ONCE(cpuc->enabled);
+
+ /*
+ * Reset the LBR stack if we changed task context to
+ * avoid data leaks.
+ */
+
+ if (event->ctx->task && cpuc->lbr_context != event->ctx) {
+ intel_pmu_lbr_reset();
+ cpuc->lbr_context = event->ctx;
+ }
+
+ cpuc->lbr_users++;
+}
+
+static void intel_pmu_lbr_disable(struct perf_event *event)
+{
+ struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
+
+ if (!x86_pmu.lbr_nr)
+ return;
+
+ cpuc->lbr_users--;
+ WARN_ON_ONCE(cpuc->lbr_users < 0);
+
+ if (cpuc->enabled && !cpuc->lbr_users)
+ __intel_pmu_lbr_disable();
+}
+
+static void intel_pmu_lbr_enable_all(void)
+{
+ struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
+
+ if (cpuc->lbr_users)
+ __intel_pmu_lbr_enable();
+}
+
+static void intel_pmu_lbr_disable_all(void)
+{
+ struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
+
+ if (cpuc->lbr_users)
+ __intel_pmu_lbr_disable();
+}
+
+static inline u64 intel_pmu_lbr_tos(void)
+{
+ u64 tos;
+
+ rdmsrl(x86_pmu.lbr_tos, tos);
+
+ return tos;
+}
+
+static void intel_pmu_lbr_read_32(struct cpu_hw_events *cpuc)
+{
+ unsigned long mask = x86_pmu.lbr_nr - 1;
+ u64 tos = intel_pmu_lbr_tos();
+ int i;
+
+ for (i = 0; i < x86_pmu.lbr_nr; i++) {
+ unsigned long lbr_idx = (tos - i) & mask;
+ union {
+ struct {
+ u32 from;
+ u32 to;
+ };
+ u64 lbr;
+ } msr_lastbranch;
+
+ rdmsrl(x86_pmu.lbr_from + lbr_idx, msr_lastbranch.lbr);
+
+ cpuc->lbr_entries[i].from = msr_lastbranch.from;
+ cpuc->lbr_entries[i].to = msr_lastbranch.to;
+ cpuc->lbr_entries[i].flags = 0;
+ }
+ cpuc->lbr_stack.nr = i;
+}
+
+#define LBR_FROM_FLAG_MISPRED (1ULL << 63)
+
+/*
+ * Due to lack of segmentation in Linux the effective address (offset)
+ * is the same as the linear address, allowing us to merge the LIP and EIP
+ * LBR formats.
+ */
+static void intel_pmu_lbr_read_64(struct cpu_hw_events *cpuc)
+{
+ unsigned long mask = x86_pmu.lbr_nr - 1;
+ int lbr_format = x86_pmu.intel_cap.lbr_format;
+ u64 tos = intel_pmu_lbr_tos();
+ int i;
+
+ for (i = 0; i < x86_pmu.lbr_nr; i++) {
+ unsigned long lbr_idx = (tos - i) & mask;
+ u64 from, to, flags = 0;
+
+ rdmsrl(x86_pmu.lbr_from + lbr_idx, from);
+ rdmsrl(x86_pmu.lbr_to + lbr_idx, to);
+
+ if (lbr_format == LBR_FORMAT_EIP_FLAGS) {
+ flags = !!(from & LBR_FROM_FLAG_MISPRED);
+ from = (u64)((((s64)from) << 1) >> 1);
+ }
+
+ cpuc->lbr_entries[i].from = from;
+ cpuc->lbr_entries[i].to = to;
+ cpuc->lbr_entries[i].flags = flags;
+ }
+ cpuc->lbr_stack.nr = i;
+}
+
+static void intel_pmu_lbr_read(void)
+{
+ struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
+
+ if (!cpuc->lbr_users)
+ return;
+
+ if (x86_pmu.intel_cap.lbr_format == LBR_FORMAT_32)
+ intel_pmu_lbr_read_32(cpuc);
+ else
+ intel_pmu_lbr_read_64(cpuc);
+}
+
+static void intel_pmu_lbr_init_core(void)
+{
+ x86_pmu.lbr_nr = 4;
+ x86_pmu.lbr_tos = 0x01c9;
+ x86_pmu.lbr_from = 0x40;
+ x86_pmu.lbr_to = 0x60;
+}
+
+static void intel_pmu_lbr_init_nhm(void)
+{
+ x86_pmu.lbr_nr = 16;
+ x86_pmu.lbr_tos = 0x01c9;
+ x86_pmu.lbr_from = 0x680;
+ x86_pmu.lbr_to = 0x6c0;
+}
+
+static void intel_pmu_lbr_init_atom(void)
+{
+ x86_pmu.lbr_nr = 8;
+ x86_pmu.lbr_tos = 0x01c9;
+ x86_pmu.lbr_from = 0x40;
+ x86_pmu.lbr_to = 0x60;
+}
+
+#endif /* CONFIG_CPU_SUP_INTEL */
diff --git a/arch/x86/kernel/cpu/perf_event_p4.c b/arch/x86/kernel/cpu/perf_event_p4.c
new file mode 100644
index 000000000000..107711bf0ee8
--- /dev/null
+++ b/arch/x86/kernel/cpu/perf_event_p4.c
@@ -0,0 +1,942 @@
+/*
+ * Netburst Perfomance Events (P4, old Xeon)
+ *
+ * Copyright (C) 2010 Parallels, Inc., Cyrill Gorcunov <gorcunov@openvz.org>
+ * Copyright (C) 2010 Intel Corporation, Lin Ming <ming.m.lin@intel.com>
+ *
+ * For licencing details see kernel-base/COPYING
+ */
+
+#ifdef CONFIG_CPU_SUP_INTEL
+
+#include <asm/perf_event_p4.h>
+
+#define P4_CNTR_LIMIT 3
+/*
+ * array indices: 0,1 - HT threads, used with HT enabled cpu
+ */
+struct p4_event_bind {
+ unsigned int opcode; /* Event code and ESCR selector */
+ unsigned int escr_msr[2]; /* ESCR MSR for this event */
+ char cntr[2][P4_CNTR_LIMIT]; /* counter index (offset), -1 on abscence */
+};
+
+struct p4_pebs_bind {
+ unsigned int metric_pebs;
+ unsigned int metric_vert;
+};
+
+/* it sets P4_PEBS_ENABLE_UOP_TAG as well */
+#define P4_GEN_PEBS_BIND(name, pebs, vert) \
+ [P4_PEBS_METRIC__##name] = { \
+ .metric_pebs = pebs | P4_PEBS_ENABLE_UOP_TAG, \
+ .metric_vert = vert, \
+ }
+
+/*
+ * note we have P4_PEBS_ENABLE_UOP_TAG always set here
+ *
+ * it's needed for mapping P4_PEBS_CONFIG_METRIC_MASK bits of
+ * event configuration to find out which values are to be
+ * written into MSR_IA32_PEBS_ENABLE and MSR_P4_PEBS_MATRIX_VERT
+ * resgisters
+ */
+static struct p4_pebs_bind p4_pebs_bind_map[] = {
+ P4_GEN_PEBS_BIND(1stl_cache_load_miss_retired, 0x0000001, 0x0000001),
+ P4_GEN_PEBS_BIND(2ndl_cache_load_miss_retired, 0x0000002, 0x0000001),
+ P4_GEN_PEBS_BIND(dtlb_load_miss_retired, 0x0000004, 0x0000001),
+ P4_GEN_PEBS_BIND(dtlb_store_miss_retired, 0x0000004, 0x0000002),
+ P4_GEN_PEBS_BIND(dtlb_all_miss_retired, 0x0000004, 0x0000003),
+ P4_GEN_PEBS_BIND(tagged_mispred_branch, 0x0018000, 0x0000010),
+ P4_GEN_PEBS_BIND(mob_load_replay_retired, 0x0000200, 0x0000001),
+ P4_GEN_PEBS_BIND(split_load_retired, 0x0000400, 0x0000001),
+ P4_GEN_PEBS_BIND(split_store_retired, 0x0000400, 0x0000002),
+};
+
+/*
+ * Note that we don't use CCCR1 here, there is an
+ * exception for P4_BSQ_ALLOCATION but we just have
+ * no workaround
+ *
+ * consider this binding as resources which particular
+ * event may borrow, it doesn't contain EventMask,
+ * Tags and friends -- they are left to a caller
+ */
+static struct p4_event_bind p4_event_bind_map[] = {
+ [P4_EVENT_TC_DELIVER_MODE] = {
+ .opcode = P4_OPCODE(P4_EVENT_TC_DELIVER_MODE),
+ .escr_msr = { MSR_P4_TC_ESCR0, MSR_P4_TC_ESCR1 },
+ .cntr = { {4, 5, -1}, {6, 7, -1} },
+ },
+ [P4_EVENT_BPU_FETCH_REQUEST] = {
+ .opcode = P4_OPCODE(P4_EVENT_BPU_FETCH_REQUEST),
+ .escr_msr = { MSR_P4_BPU_ESCR0, MSR_P4_BPU_ESCR1 },
+ .cntr = { {0, -1, -1}, {2, -1, -1} },
+ },
+ [P4_EVENT_ITLB_REFERENCE] = {
+ .opcode = P4_OPCODE(P4_EVENT_ITLB_REFERENCE),
+ .escr_msr = { MSR_P4_ITLB_ESCR0, MSR_P4_ITLB_ESCR1 },
+ .cntr = { {0, -1, -1}, {2, -1, -1} },
+ },
+ [P4_EVENT_MEMORY_CANCEL] = {
+ .opcode = P4_OPCODE(P4_EVENT_MEMORY_CANCEL),
+ .escr_msr = { MSR_P4_DAC_ESCR0, MSR_P4_DAC_ESCR1 },
+ .cntr = { {8, 9, -1}, {10, 11, -1} },
+ },
+ [P4_EVENT_MEMORY_COMPLETE] = {
+ .opcode = P4_OPCODE(P4_EVENT_MEMORY_COMPLETE),
+ .escr_msr = { MSR_P4_SAAT_ESCR0 , MSR_P4_SAAT_ESCR1 },
+ .cntr = { {8, 9, -1}, {10, 11, -1} },
+ },
+ [P4_EVENT_LOAD_PORT_REPLAY] = {
+ .opcode = P4_OPCODE(P4_EVENT_LOAD_PORT_REPLAY),
+ .escr_msr = { MSR_P4_SAAT_ESCR0, MSR_P4_SAAT_ESCR1 },
+ .cntr = { {8, 9, -1}, {10, 11, -1} },
+ },
+ [P4_EVENT_STORE_PORT_REPLAY] = {
+ .opcode = P4_OPCODE(P4_EVENT_STORE_PORT_REPLAY),
+ .escr_msr = { MSR_P4_SAAT_ESCR0 , MSR_P4_SAAT_ESCR1 },
+ .cntr = { {8, 9, -1}, {10, 11, -1} },
+ },
+ [P4_EVENT_MOB_LOAD_REPLAY] = {
+ .opcode = P4_OPCODE(P4_EVENT_MOB_LOAD_REPLAY),
+ .escr_msr = { MSR_P4_MOB_ESCR0, MSR_P4_MOB_ESCR1 },
+ .cntr = { {0, -1, -1}, {2, -1, -1} },
+ },
+ [P4_EVENT_PAGE_WALK_TYPE] = {
+ .opcode = P4_OPCODE(P4_EVENT_PAGE_WALK_TYPE),
+ .escr_msr = { MSR_P4_PMH_ESCR0, MSR_P4_PMH_ESCR1 },
+ .cntr = { {0, -1, -1}, {2, -1, -1} },
+ },
+ [P4_EVENT_BSQ_CACHE_REFERENCE] = {
+ .opcode = P4_OPCODE(P4_EVENT_BSQ_CACHE_REFERENCE),
+ .escr_msr = { MSR_P4_BSU_ESCR0, MSR_P4_BSU_ESCR1 },
+ .cntr = { {0, -1, -1}, {2, -1, -1} },
+ },
+ [P4_EVENT_IOQ_ALLOCATION] = {
+ .opcode = P4_OPCODE(P4_EVENT_IOQ_ALLOCATION),
+ .escr_msr = { MSR_P4_FSB_ESCR0, MSR_P4_FSB_ESCR1 },
+ .cntr = { {0, -1, -1}, {2, -1, -1} },
+ },
+ [P4_EVENT_IOQ_ACTIVE_ENTRIES] = { /* shared ESCR */
+ .opcode = P4_OPCODE(P4_EVENT_IOQ_ACTIVE_ENTRIES),
+ .escr_msr = { MSR_P4_FSB_ESCR1, MSR_P4_FSB_ESCR1 },
+ .cntr = { {2, -1, -1}, {3, -1, -1} },
+ },
+ [P4_EVENT_FSB_DATA_ACTIVITY] = {
+ .opcode = P4_OPCODE(P4_EVENT_FSB_DATA_ACTIVITY),
+ .escr_msr = { MSR_P4_FSB_ESCR0, MSR_P4_FSB_ESCR1 },
+ .cntr = { {0, -1, -1}, {2, -1, -1} },
+ },
+ [P4_EVENT_BSQ_ALLOCATION] = { /* shared ESCR, broken CCCR1 */
+ .opcode = P4_OPCODE(P4_EVENT_BSQ_ALLOCATION),
+ .escr_msr = { MSR_P4_BSU_ESCR0, MSR_P4_BSU_ESCR0 },
+ .cntr = { {0, -1, -1}, {1, -1, -1} },
+ },
+ [P4_EVENT_BSQ_ACTIVE_ENTRIES] = { /* shared ESCR */
+ .opcode = P4_OPCODE(P4_EVENT_BSQ_ACTIVE_ENTRIES),
+ .escr_msr = { MSR_P4_BSU_ESCR1 , MSR_P4_BSU_ESCR1 },
+ .cntr = { {2, -1, -1}, {3, -1, -1} },
+ },
+ [P4_EVENT_SSE_INPUT_ASSIST] = {
+ .opcode = P4_OPCODE(P4_EVENT_SSE_INPUT_ASSIST),
+ .escr_msr = { MSR_P4_FIRM_ESCR0, MSR_P4_FIRM_ESCR1 },
+ .cntr = { {8, 9, -1}, {10, 11, -1} },
+ },
+ [P4_EVENT_PACKED_SP_UOP] = {
+ .opcode = P4_OPCODE(P4_EVENT_PACKED_SP_UOP),
+ .escr_msr = { MSR_P4_FIRM_ESCR0, MSR_P4_FIRM_ESCR1 },
+ .cntr = { {8, 9, -1}, {10, 11, -1} },
+ },
+ [P4_EVENT_PACKED_DP_UOP] = {
+ .opcode = P4_OPCODE(P4_EVENT_PACKED_DP_UOP),
+ .escr_msr = { MSR_P4_FIRM_ESCR0, MSR_P4_FIRM_ESCR1 },
+ .cntr = { {8, 9, -1}, {10, 11, -1} },
+ },
+ [P4_EVENT_SCALAR_SP_UOP] = {
+ .opcode = P4_OPCODE(P4_EVENT_SCALAR_SP_UOP),
+ .escr_msr = { MSR_P4_FIRM_ESCR0, MSR_P4_FIRM_ESCR1 },
+ .cntr = { {8, 9, -1}, {10, 11, -1} },
+ },
+ [P4_EVENT_SCALAR_DP_UOP] = {
+ .opcode = P4_OPCODE(P4_EVENT_SCALAR_DP_UOP),
+ .escr_msr = { MSR_P4_FIRM_ESCR0, MSR_P4_FIRM_ESCR1 },
+ .cntr = { {8, 9, -1}, {10, 11, -1} },
+ },
+ [P4_EVENT_64BIT_MMX_UOP] = {
+ .opcode = P4_OPCODE(P4_EVENT_64BIT_MMX_UOP),
+ .escr_msr = { MSR_P4_FIRM_ESCR0, MSR_P4_FIRM_ESCR1 },
+ .cntr = { {8, 9, -1}, {10, 11, -1} },
+ },
+ [P4_EVENT_128BIT_MMX_UOP] = {
+ .opcode = P4_OPCODE(P4_EVENT_128BIT_MMX_UOP),
+ .escr_msr = { MSR_P4_FIRM_ESCR0, MSR_P4_FIRM_ESCR1 },
+ .cntr = { {8, 9, -1}, {10, 11, -1} },
+ },
+ [P4_EVENT_X87_FP_UOP] = {
+ .opcode = P4_OPCODE(P4_EVENT_X87_FP_UOP),
+ .escr_msr = { MSR_P4_FIRM_ESCR0, MSR_P4_FIRM_ESCR1 },
+ .cntr = { {8, 9, -1}, {10, 11, -1} },
+ },
+ [P4_EVENT_TC_MISC] = {
+ .opcode = P4_OPCODE(P4_EVENT_TC_MISC),
+ .escr_msr = { MSR_P4_TC_ESCR0, MSR_P4_TC_ESCR1 },
+ .cntr = { {4, 5, -1}, {6, 7, -1} },
+ },
+ [P4_EVENT_GLOBAL_POWER_EVENTS] = {
+ .opcode = P4_OPCODE(P4_EVENT_GLOBAL_POWER_EVENTS),
+ .escr_msr = { MSR_P4_FSB_ESCR0, MSR_P4_FSB_ESCR1 },
+ .cntr = { {0, -1, -1}, {2, -1, -1} },
+ },
+ [P4_EVENT_TC_MS_XFER] = {
+ .opcode = P4_OPCODE(P4_EVENT_TC_MS_XFER),
+ .escr_msr = { MSR_P4_MS_ESCR0, MSR_P4_MS_ESCR1 },
+ .cntr = { {4, 5, -1}, {6, 7, -1} },
+ },
+ [P4_EVENT_UOP_QUEUE_WRITES] = {
+ .opcode = P4_OPCODE(P4_EVENT_UOP_QUEUE_WRITES),
+ .escr_msr = { MSR_P4_MS_ESCR0, MSR_P4_MS_ESCR1 },
+ .cntr = { {4, 5, -1}, {6, 7, -1} },
+ },
+ [P4_EVENT_RETIRED_MISPRED_BRANCH_TYPE] = {
+ .opcode = P4_OPCODE(P4_EVENT_RETIRED_MISPRED_BRANCH_TYPE),
+ .escr_msr = { MSR_P4_TBPU_ESCR0 , MSR_P4_TBPU_ESCR0 },
+ .cntr = { {4, 5, -1}, {6, 7, -1} },
+ },
+ [P4_EVENT_RETIRED_BRANCH_TYPE] = {
+ .opcode = P4_OPCODE(P4_EVENT_RETIRED_BRANCH_TYPE),
+ .escr_msr = { MSR_P4_TBPU_ESCR0 , MSR_P4_TBPU_ESCR1 },
+ .cntr = { {4, 5, -1}, {6, 7, -1} },
+ },
+ [P4_EVENT_RESOURCE_STALL] = {
+ .opcode = P4_OPCODE(P4_EVENT_RESOURCE_STALL),
+ .escr_msr = { MSR_P4_ALF_ESCR0, MSR_P4_ALF_ESCR1 },
+ .cntr = { {12, 13, 16}, {14, 15, 17} },
+ },
+ [P4_EVENT_WC_BUFFER] = {
+ .opcode = P4_OPCODE(P4_EVENT_WC_BUFFER),
+ .escr_msr = { MSR_P4_DAC_ESCR0, MSR_P4_DAC_ESCR1 },
+ .cntr = { {8, 9, -1}, {10, 11, -1} },
+ },
+ [P4_EVENT_B2B_CYCLES] = {
+ .opcode = P4_OPCODE(P4_EVENT_B2B_CYCLES),
+ .escr_msr = { MSR_P4_FSB_ESCR0, MSR_P4_FSB_ESCR1 },
+ .cntr = { {0, -1, -1}, {2, -1, -1} },
+ },
+ [P4_EVENT_BNR] = {
+ .opcode = P4_OPCODE(P4_EVENT_BNR),
+ .escr_msr = { MSR_P4_FSB_ESCR0, MSR_P4_FSB_ESCR1 },
+ .cntr = { {0, -1, -1}, {2, -1, -1} },
+ },
+ [P4_EVENT_SNOOP] = {
+ .opcode = P4_OPCODE(P4_EVENT_SNOOP),
+ .escr_msr = { MSR_P4_FSB_ESCR0, MSR_P4_FSB_ESCR1 },
+ .cntr = { {0, -1, -1}, {2, -1, -1} },
+ },
+ [P4_EVENT_RESPONSE] = {
+ .opcode = P4_OPCODE(P4_EVENT_RESPONSE),
+ .escr_msr = { MSR_P4_FSB_ESCR0, MSR_P4_FSB_ESCR1 },
+ .cntr = { {0, -1, -1}, {2, -1, -1} },
+ },
+ [P4_EVENT_FRONT_END_EVENT] = {
+ .opcode = P4_OPCODE(P4_EVENT_FRONT_END_EVENT),
+ .escr_msr = { MSR_P4_CRU_ESCR2, MSR_P4_CRU_ESCR3 },
+ .cntr = { {12, 13, 16}, {14, 15, 17} },
+ },
+ [P4_EVENT_EXECUTION_EVENT] = {
+ .opcode = P4_OPCODE(P4_EVENT_EXECUTION_EVENT),
+ .escr_msr = { MSR_P4_CRU_ESCR2, MSR_P4_CRU_ESCR3 },
+ .cntr = { {12, 13, 16}, {14, 15, 17} },
+ },
+ [P4_EVENT_REPLAY_EVENT] = {
+ .opcode = P4_OPCODE(P4_EVENT_REPLAY_EVENT),
+ .escr_msr = { MSR_P4_CRU_ESCR2, MSR_P4_CRU_ESCR3 },
+ .cntr = { {12, 13, 16}, {14, 15, 17} },
+ },
+ [P4_EVENT_INSTR_RETIRED] = {
+ .opcode = P4_OPCODE(P4_EVENT_INSTR_RETIRED),
+ .escr_msr = { MSR_P4_CRU_ESCR0, MSR_P4_CRU_ESCR1 },
+ .cntr = { {12, 13, 16}, {14, 15, 17} },
+ },
+ [P4_EVENT_UOPS_RETIRED] = {
+ .opcode = P4_OPCODE(P4_EVENT_UOPS_RETIRED),
+ .escr_msr = { MSR_P4_CRU_ESCR0, MSR_P4_CRU_ESCR1 },
+ .cntr = { {12, 13, 16}, {14, 15, 17} },
+ },
+ [P4_EVENT_UOP_TYPE] = {
+ .opcode = P4_OPCODE(P4_EVENT_UOP_TYPE),
+ .escr_msr = { MSR_P4_RAT_ESCR0, MSR_P4_RAT_ESCR1 },
+ .cntr = { {12, 13, 16}, {14, 15, 17} },
+ },
+ [P4_EVENT_BRANCH_RETIRED] = {
+ .opcode = P4_OPCODE(P4_EVENT_BRANCH_RETIRED),
+ .escr_msr = { MSR_P4_CRU_ESCR2, MSR_P4_CRU_ESCR3 },
+ .cntr = { {12, 13, 16}, {14, 15, 17} },
+ },
+ [P4_EVENT_MISPRED_BRANCH_RETIRED] = {
+ .opcode = P4_OPCODE(P4_EVENT_MISPRED_BRANCH_RETIRED),
+ .escr_msr = { MSR_P4_CRU_ESCR0, MSR_P4_CRU_ESCR1 },
+ .cntr = { {12, 13, 16}, {14, 15, 17} },
+ },
+ [P4_EVENT_X87_ASSIST] = {
+ .opcode = P4_OPCODE(P4_EVENT_X87_ASSIST),
+ .escr_msr = { MSR_P4_CRU_ESCR2, MSR_P4_CRU_ESCR3 },
+ .cntr = { {12, 13, 16}, {14, 15, 17} },
+ },
+ [P4_EVENT_MACHINE_CLEAR] = {
+ .opcode = P4_OPCODE(P4_EVENT_MACHINE_CLEAR),
+ .escr_msr = { MSR_P4_CRU_ESCR2, MSR_P4_CRU_ESCR3 },
+ .cntr = { {12, 13, 16}, {14, 15, 17} },
+ },
+ [P4_EVENT_INSTR_COMPLETED] = {
+ .opcode = P4_OPCODE(P4_EVENT_INSTR_COMPLETED),
+ .escr_msr = { MSR_P4_CRU_ESCR0, MSR_P4_CRU_ESCR1 },
+ .cntr = { {12, 13, 16}, {14, 15, 17} },
+ },
+};
+
+#define P4_GEN_CACHE_EVENT(event, bit, metric) \
+ p4_config_pack_escr(P4_ESCR_EVENT(event) | \
+ P4_ESCR_EMASK_BIT(event, bit)) | \
+ p4_config_pack_cccr(metric | \
+ P4_CCCR_ESEL(P4_OPCODE_ESEL(P4_OPCODE(event))))
+
+static __initconst const u64 p4_hw_cache_event_ids
+ [PERF_COUNT_HW_CACHE_MAX]
+ [PERF_COUNT_HW_CACHE_OP_MAX]
+ [PERF_COUNT_HW_CACHE_RESULT_MAX] =
+{
+ [ C(L1D ) ] = {
+ [ C(OP_READ) ] = {
+ [ C(RESULT_ACCESS) ] = 0x0,
+ [ C(RESULT_MISS) ] = P4_GEN_CACHE_EVENT(P4_EVENT_REPLAY_EVENT, NBOGUS,
+ P4_PEBS_METRIC__1stl_cache_load_miss_retired),
+ },
+ },
+ [ C(LL ) ] = {
+ [ C(OP_READ) ] = {
+ [ C(RESULT_ACCESS) ] = 0x0,
+ [ C(RESULT_MISS) ] = P4_GEN_CACHE_EVENT(P4_EVENT_REPLAY_EVENT, NBOGUS,
+ P4_PEBS_METRIC__2ndl_cache_load_miss_retired),
+ },
+},
+ [ C(DTLB) ] = {
+ [ C(OP_READ) ] = {
+ [ C(RESULT_ACCESS) ] = 0x0,
+ [ C(RESULT_MISS) ] = P4_GEN_CACHE_EVENT(P4_EVENT_REPLAY_EVENT, NBOGUS,
+ P4_PEBS_METRIC__dtlb_load_miss_retired),
+ },
+ [ C(OP_WRITE) ] = {
+ [ C(RESULT_ACCESS) ] = 0x0,
+ [ C(RESULT_MISS) ] = P4_GEN_CACHE_EVENT(P4_EVENT_REPLAY_EVENT, NBOGUS,
+ P4_PEBS_METRIC__dtlb_store_miss_retired),
+ },
+ },
+ [ C(ITLB) ] = {
+ [ C(OP_READ) ] = {
+ [ C(RESULT_ACCESS) ] = P4_GEN_CACHE_EVENT(P4_EVENT_ITLB_REFERENCE, HIT,
+ P4_PEBS_METRIC__none),
+ [ C(RESULT_MISS) ] = P4_GEN_CACHE_EVENT(P4_EVENT_ITLB_REFERENCE, MISS,
+ P4_PEBS_METRIC__none),
+ },
+ [ C(OP_WRITE) ] = {
+ [ C(RESULT_ACCESS) ] = -1,
+ [ C(RESULT_MISS) ] = -1,
+ },
+ [ C(OP_PREFETCH) ] = {
+ [ C(RESULT_ACCESS) ] = -1,
+ [ C(RESULT_MISS) ] = -1,
+ },
+ },
+};
+
+static u64 p4_general_events[PERF_COUNT_HW_MAX] = {
+ /* non-halted CPU clocks */
+ [PERF_COUNT_HW_CPU_CYCLES] =
+ p4_config_pack_escr(P4_ESCR_EVENT(P4_EVENT_GLOBAL_POWER_EVENTS) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_GLOBAL_POWER_EVENTS, RUNNING)),
+
+ /*
+ * retired instructions
+ * in a sake of simplicity we don't use the FSB tagging
+ */
+ [PERF_COUNT_HW_INSTRUCTIONS] =
+ p4_config_pack_escr(P4_ESCR_EVENT(P4_EVENT_INSTR_RETIRED) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_INSTR_RETIRED, NBOGUSNTAG) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_INSTR_RETIRED, BOGUSNTAG)),
+
+ /* cache hits */
+ [PERF_COUNT_HW_CACHE_REFERENCES] =
+ p4_config_pack_escr(P4_ESCR_EVENT(P4_EVENT_BSQ_CACHE_REFERENCE) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_CACHE_REFERENCE, RD_2ndL_HITS) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_CACHE_REFERENCE, RD_2ndL_HITE) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_CACHE_REFERENCE, RD_2ndL_HITM) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_CACHE_REFERENCE, RD_3rdL_HITS) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_CACHE_REFERENCE, RD_3rdL_HITE) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_CACHE_REFERENCE, RD_3rdL_HITM)),
+
+ /* cache misses */
+ [PERF_COUNT_HW_CACHE_MISSES] =
+ p4_config_pack_escr(P4_ESCR_EVENT(P4_EVENT_BSQ_CACHE_REFERENCE) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_CACHE_REFERENCE, RD_2ndL_MISS) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_CACHE_REFERENCE, RD_3rdL_MISS) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_CACHE_REFERENCE, WR_2ndL_MISS)),
+
+ /* branch instructions retired */
+ [PERF_COUNT_HW_BRANCH_INSTRUCTIONS] =
+ p4_config_pack_escr(P4_ESCR_EVENT(P4_EVENT_RETIRED_BRANCH_TYPE) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_RETIRED_BRANCH_TYPE, CONDITIONAL) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_RETIRED_BRANCH_TYPE, CALL) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_RETIRED_BRANCH_TYPE, RETURN) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_RETIRED_BRANCH_TYPE, INDIRECT)),
+
+ /* mispredicted branches retired */
+ [PERF_COUNT_HW_BRANCH_MISSES] =
+ p4_config_pack_escr(P4_ESCR_EVENT(P4_EVENT_MISPRED_BRANCH_RETIRED) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_MISPRED_BRANCH_RETIRED, NBOGUS)),
+
+ /* bus ready clocks (cpu is driving #DRDY_DRV\#DRDY_OWN): */
+ [PERF_COUNT_HW_BUS_CYCLES] =
+ p4_config_pack_escr(P4_ESCR_EVENT(P4_EVENT_FSB_DATA_ACTIVITY) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_FSB_DATA_ACTIVITY, DRDY_DRV) |
+ P4_ESCR_EMASK_BIT(P4_EVENT_FSB_DATA_ACTIVITY, DRDY_OWN)) |
+ p4_config_pack_cccr(P4_CCCR_EDGE | P4_CCCR_COMPARE),
+};
+
+static struct p4_event_bind *p4_config_get_bind(u64 config)
+{
+ unsigned int evnt = p4_config_unpack_event(config);
+ struct p4_event_bind *bind = NULL;
+
+ if (evnt < ARRAY_SIZE(p4_event_bind_map))
+ bind = &p4_event_bind_map[evnt];
+
+ return bind;
+}
+
+static u64 p4_pmu_event_map(int hw_event)
+{
+ struct p4_event_bind *bind;
+ unsigned int esel;
+ u64 config;
+
+ config = p4_general_events[hw_event];
+ bind = p4_config_get_bind(config);
+ esel = P4_OPCODE_ESEL(bind->opcode);
+ config |= p4_config_pack_cccr(P4_CCCR_ESEL(esel));
+
+ return config;
+}
+
+static int p4_validate_raw_event(struct perf_event *event)
+{
+ unsigned int v;
+
+ /* user data may have out-of-bound event index */
+ v = p4_config_unpack_event(event->attr.config);
+ if (v >= ARRAY_SIZE(p4_event_bind_map)) {
+ pr_warning("P4 PMU: Unknown event code: %d\n", v);
+ return -EINVAL;
+ }
+
+ /*
+ * it may have some screwed PEBS bits
+ */
+ if (p4_config_pebs_has(event->attr.config, P4_PEBS_CONFIG_ENABLE)) {
+ pr_warning("P4 PMU: PEBS are not supported yet\n");
+ return -EINVAL;
+ }
+ v = p4_config_unpack_metric(event->attr.config);
+ if (v >= ARRAY_SIZE(p4_pebs_bind_map)) {
+ pr_warning("P4 PMU: Unknown metric code: %d\n", v);
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+static int p4_hw_config(struct perf_event *event)
+{
+ int cpu = get_cpu();
+ int rc = 0;
+ u32 escr, cccr;
+
+ /*
+ * the reason we use cpu that early is that: if we get scheduled
+ * first time on the same cpu -- we will not need swap thread
+ * specific flags in config (and will save some cpu cycles)
+ */
+
+ cccr = p4_default_cccr_conf(cpu);
+ escr = p4_default_escr_conf(cpu, event->attr.exclude_kernel,
+ event->attr.exclude_user);
+ event->hw.config = p4_config_pack_escr(escr) |
+ p4_config_pack_cccr(cccr);
+
+ if (p4_ht_active() && p4_ht_thread(cpu))
+ event->hw.config = p4_set_ht_bit(event->hw.config);
+
+ if (event->attr.type == PERF_TYPE_RAW) {
+
+ rc = p4_validate_raw_event(event);
+ if (rc)
+ goto out;
+
+ /*
+ * We don't control raw events so it's up to the caller
+ * to pass sane values (and we don't count the thread number
+ * on HT machine but allow HT-compatible specifics to be
+ * passed on)
+ *
+ * Note that for RAW events we allow user to use P4_CCCR_RESERVED
+ * bits since we keep additional info here (for cache events and etc)
+ *
+ * XXX: HT wide things should check perf_paranoid_cpu() &&
+ * CAP_SYS_ADMIN
+ */
+ event->hw.config |= event->attr.config &
+ (p4_config_pack_escr(P4_ESCR_MASK_HT) |
+ p4_config_pack_cccr(P4_CCCR_MASK_HT | P4_CCCR_RESERVED));
+ }
+
+ rc = x86_setup_perfctr(event);
+out:
+ put_cpu();
+ return rc;
+}
+
+static inline int p4_pmu_clear_cccr_ovf(struct hw_perf_event *hwc)
+{
+ int overflow = 0;
+ u32 low, high;
+
+ rdmsr(hwc->config_base + hwc->idx, low, high);
+
+ /* we need to check high bit for unflagged overflows */
+ if ((low & P4_CCCR_OVF) || !(high & (1 << 31))) {
+ overflow = 1;
+ (void)checking_wrmsrl(hwc->config_base + hwc->idx,
+ ((u64)low) & ~P4_CCCR_OVF);
+ }
+
+ return overflow;
+}
+
+static void p4_pmu_disable_pebs(void)
+{
+ /*
+ * FIXME
+ *
+ * It's still allowed that two threads setup same cache
+ * events so we can't simply clear metrics until we knew
+ * noone is depending on us, so we need kind of counter
+ * for "ReplayEvent" users.
+ *
+ * What is more complex -- RAW events, if user (for some
+ * reason) will pass some cache event metric with improper
+ * event opcode -- it's fine from hardware point of view
+ * but completely nonsence from "meaning" of such action.
+ *
+ * So at moment let leave metrics turned on forever -- it's
+ * ok for now but need to be revisited!
+ *
+ * (void)checking_wrmsrl(MSR_IA32_PEBS_ENABLE, (u64)0);
+ * (void)checking_wrmsrl(MSR_P4_PEBS_MATRIX_VERT, (u64)0);
+ */
+}
+
+static inline void p4_pmu_disable_event(struct perf_event *event)
+{
+ struct hw_perf_event *hwc = &event->hw;
+
+ /*
+ * If event gets disabled while counter is in overflowed
+ * state we need to clear P4_CCCR_OVF, otherwise interrupt get
+ * asserted again and again
+ */
+ (void)checking_wrmsrl(hwc->config_base + hwc->idx,
+ (u64)(p4_config_unpack_cccr(hwc->config)) &
+ ~P4_CCCR_ENABLE & ~P4_CCCR_OVF & ~P4_CCCR_RESERVED);
+}
+
+static void p4_pmu_disable_all(void)
+{
+ struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
+ int idx;
+
+ for (idx = 0; idx < x86_pmu.num_counters; idx++) {
+ struct perf_event *event = cpuc->events[idx];
+ if (!test_bit(idx, cpuc->active_mask))
+ continue;
+ p4_pmu_disable_event(event);
+ }
+
+ p4_pmu_disable_pebs();
+}
+
+/* configuration must be valid */
+static void p4_pmu_enable_pebs(u64 config)
+{
+ struct p4_pebs_bind *bind;
+ unsigned int idx;
+
+ BUILD_BUG_ON(P4_PEBS_METRIC__max > P4_PEBS_CONFIG_METRIC_MASK);
+
+ idx = p4_config_unpack_metric(config);
+ if (idx == P4_PEBS_METRIC__none)
+ return;
+
+ bind = &p4_pebs_bind_map[idx];
+
+ (void)checking_wrmsrl(MSR_IA32_PEBS_ENABLE, (u64)bind->metric_pebs);
+ (void)checking_wrmsrl(MSR_P4_PEBS_MATRIX_VERT, (u64)bind->metric_vert);
+}
+
+static void p4_pmu_enable_event(struct perf_event *event)
+{
+ struct hw_perf_event *hwc = &event->hw;
+ int thread = p4_ht_config_thread(hwc->config);
+ u64 escr_conf = p4_config_unpack_escr(p4_clear_ht_bit(hwc->config));
+ unsigned int idx = p4_config_unpack_event(hwc->config);
+ struct p4_event_bind *bind;
+ u64 escr_addr, cccr;
+
+ bind = &p4_event_bind_map[idx];
+ escr_addr = (u64)bind->escr_msr[thread];
+
+ /*
+ * - we dont support cascaded counters yet
+ * - and counter 1 is broken (erratum)
+ */
+ WARN_ON_ONCE(p4_is_event_cascaded(hwc->config));
+ WARN_ON_ONCE(hwc->idx == 1);
+
+ /* we need a real Event value */
+ escr_conf &= ~P4_ESCR_EVENT_MASK;
+ escr_conf |= P4_ESCR_EVENT(P4_OPCODE_EVNT(bind->opcode));
+
+ cccr = p4_config_unpack_cccr(hwc->config);
+
+ /*
+ * it could be Cache event so we need to write metrics
+ * into additional MSRs
+ */
+ p4_pmu_enable_pebs(hwc->config);
+
+ (void)checking_wrmsrl(escr_addr, escr_conf);
+ (void)checking_wrmsrl(hwc->config_base + hwc->idx,
+ (cccr & ~P4_CCCR_RESERVED) | P4_CCCR_ENABLE);
+}
+
+static void p4_pmu_enable_all(int added)
+{
+ struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
+ int idx;
+
+ for (idx = 0; idx < x86_pmu.num_counters; idx++) {
+ struct perf_event *event = cpuc->events[idx];
+ if (!test_bit(idx, cpuc->active_mask))
+ continue;
+ p4_pmu_enable_event(event);
+ }
+}
+
+static int p4_pmu_handle_irq(struct pt_regs *regs)
+{
+ struct perf_sample_data data;
+ struct cpu_hw_events *cpuc;
+ struct perf_event *event;
+ struct hw_perf_event *hwc;
+ int idx, handled = 0;
+ u64 val;
+
+ data.addr = 0;
+ data.raw = NULL;
+
+ cpuc = &__get_cpu_var(cpu_hw_events);
+
+ for (idx = 0; idx < x86_pmu.num_counters; idx++) {
+
+ if (!test_bit(idx, cpuc->active_mask))
+ continue;
+
+ event = cpuc->events[idx];
+ hwc = &event->hw;
+
+ WARN_ON_ONCE(hwc->idx != idx);
+
+ /* it might be unflagged overflow */
+ handled = p4_pmu_clear_cccr_ovf(hwc);
+
+ val = x86_perf_event_update(event);
+ if (!handled && (val & (1ULL << (x86_pmu.cntval_bits - 1))))
+ continue;
+
+ /* event overflow for sure */
+ data.period = event->hw.last_period;
+
+ if (!x86_perf_event_set_period(event))
+ continue;
+ if (perf_event_overflow(event, 1, &data, regs))
+ p4_pmu_disable_event(event);
+ }
+
+ if (handled) {
+ /* p4 quirk: unmask it again */
+ apic_write(APIC_LVTPC, apic_read(APIC_LVTPC) & ~APIC_LVT_MASKED);
+ inc_irq_stat(apic_perf_irqs);
+ }
+
+ return handled;
+}
+
+/*
+ * swap thread specific fields according to a thread
+ * we are going to run on
+ */
+static void p4_pmu_swap_config_ts(struct hw_perf_event *hwc, int cpu)
+{
+ u32 escr, cccr;
+
+ /*
+ * we either lucky and continue on same cpu or no HT support
+ */
+ if (!p4_should_swap_ts(hwc->config, cpu))
+ return;
+
+ /*
+ * the event is migrated from an another logical
+ * cpu, so we need to swap thread specific flags
+ */
+
+ escr = p4_config_unpack_escr(hwc->config);
+ cccr = p4_config_unpack_cccr(hwc->config);
+
+ if (p4_ht_thread(cpu)) {
+ cccr &= ~P4_CCCR_OVF_PMI_T0;
+ cccr |= P4_CCCR_OVF_PMI_T1;
+ if (escr & P4_ESCR_T0_OS) {
+ escr &= ~P4_ESCR_T0_OS;
+ escr |= P4_ESCR_T1_OS;
+ }
+ if (escr & P4_ESCR_T0_USR) {
+ escr &= ~P4_ESCR_T0_USR;
+ escr |= P4_ESCR_T1_USR;
+ }
+ hwc->config = p4_config_pack_escr(escr);
+ hwc->config |= p4_config_pack_cccr(cccr);
+ hwc->config |= P4_CONFIG_HT;
+ } else {
+ cccr &= ~P4_CCCR_OVF_PMI_T1;
+ cccr |= P4_CCCR_OVF_PMI_T0;
+ if (escr & P4_ESCR_T1_OS) {
+ escr &= ~P4_ESCR_T1_OS;
+ escr |= P4_ESCR_T0_OS;
+ }
+ if (escr & P4_ESCR_T1_USR) {
+ escr &= ~P4_ESCR_T1_USR;
+ escr |= P4_ESCR_T0_USR;
+ }
+ hwc->config = p4_config_pack_escr(escr);
+ hwc->config |= p4_config_pack_cccr(cccr);
+ hwc->config &= ~P4_CONFIG_HT;
+ }
+}
+
+/*
+ * ESCR address hashing is tricky, ESCRs are not sequential
+ * in memory but all starts from MSR_P4_BSU_ESCR0 (0x03a0) and
+ * the metric between any ESCRs is laid in range [0xa0,0xe1]
+ *
+ * so we make ~70% filled hashtable
+ */
+
+#define P4_ESCR_MSR_BASE 0x000003a0
+#define P4_ESCR_MSR_MAX 0x000003e1
+#define P4_ESCR_MSR_TABLE_SIZE (P4_ESCR_MSR_MAX - P4_ESCR_MSR_BASE + 1)
+#define P4_ESCR_MSR_IDX(msr) (msr - P4_ESCR_MSR_BASE)
+#define P4_ESCR_MSR_TABLE_ENTRY(msr) [P4_ESCR_MSR_IDX(msr)] = msr
+
+static const unsigned int p4_escr_table[P4_ESCR_MSR_TABLE_SIZE] = {
+ P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_ALF_ESCR0),
+ P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_ALF_ESCR1),
+ P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_BPU_ESCR0),
+ P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_BPU_ESCR1),
+ P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_BSU_ESCR0),
+ P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_BSU_ESCR1),
+ P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_CRU_ESCR0),
+ P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_CRU_ESCR1),
+ P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_CRU_ESCR2),
+ P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_CRU_ESCR3),
+ P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_CRU_ESCR4),
+ P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_CRU_ESCR5),
+ P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_DAC_ESCR0),
+ P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_DAC_ESCR1),
+ P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_FIRM_ESCR0),
+ P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_FIRM_ESCR1),
+ P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_FLAME_ESCR0),
+ P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_FLAME_ESCR1),
+ P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_FSB_ESCR0),
+ P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_FSB_ESCR1),
+ P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_IQ_ESCR0),
+ P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_IQ_ESCR1),
+ P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_IS_ESCR0),
+ P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_IS_ESCR1),
+ P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_ITLB_ESCR0),
+ P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_ITLB_ESCR1),
+ P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_IX_ESCR0),
+ P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_IX_ESCR1),
+ P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_MOB_ESCR0),
+ P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_MOB_ESCR1),
+ P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_MS_ESCR0),
+ P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_MS_ESCR1),
+ P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_PMH_ESCR0),
+ P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_PMH_ESCR1),
+ P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_RAT_ESCR0),
+ P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_RAT_ESCR1),
+ P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_SAAT_ESCR0),
+ P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_SAAT_ESCR1),
+ P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_SSU_ESCR0),
+ P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_SSU_ESCR1),
+ P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_TBPU_ESCR0),
+ P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_TBPU_ESCR1),
+ P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_TC_ESCR0),
+ P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_TC_ESCR1),
+ P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_U2L_ESCR0),
+ P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_U2L_ESCR1),
+};
+
+static int p4_get_escr_idx(unsigned int addr)
+{
+ unsigned int idx = P4_ESCR_MSR_IDX(addr);
+
+ if (unlikely(idx >= P4_ESCR_MSR_TABLE_SIZE ||
+ !p4_escr_table[idx] ||
+ p4_escr_table[idx] != addr)) {
+ WARN_ONCE(1, "P4 PMU: Wrong address passed: %x\n", addr);
+ return -1;
+ }
+
+ return idx;
+}
+
+static int p4_next_cntr(int thread, unsigned long *used_mask,
+ struct p4_event_bind *bind)
+{
+ int i, j;
+
+ for (i = 0; i < P4_CNTR_LIMIT; i++) {
+ j = bind->cntr[thread][i];
+ if (j != -1 && !test_bit(j, used_mask))
+ return j;
+ }
+
+ return -1;
+}
+
+static int p4_pmu_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign)
+{
+ unsigned long used_mask[BITS_TO_LONGS(X86_PMC_IDX_MAX)];
+ unsigned long escr_mask[BITS_TO_LONGS(P4_ESCR_MSR_TABLE_SIZE)];
+ int cpu = smp_processor_id();
+ struct hw_perf_event *hwc;
+ struct p4_event_bind *bind;
+ unsigned int i, thread, num;
+ int cntr_idx, escr_idx;
+
+ bitmap_zero(used_mask, X86_PMC_IDX_MAX);
+ bitmap_zero(escr_mask, P4_ESCR_MSR_TABLE_SIZE);
+
+ for (i = 0, num = n; i < n; i++, num--) {
+
+ hwc = &cpuc->event_list[i]->hw;
+ thread = p4_ht_thread(cpu);
+ bind = p4_config_get_bind(hwc->config);
+ escr_idx = p4_get_escr_idx(bind->escr_msr[thread]);
+ if (unlikely(escr_idx == -1))
+ goto done;
+
+ if (hwc->idx != -1 && !p4_should_swap_ts(hwc->config, cpu)) {
+ cntr_idx = hwc->idx;
+ if (assign)
+ assign[i] = hwc->idx;
+ goto reserve;
+ }
+
+ cntr_idx = p4_next_cntr(thread, used_mask, bind);
+ if (cntr_idx == -1 || test_bit(escr_idx, escr_mask))
+ goto done;
+
+ p4_pmu_swap_config_ts(hwc, cpu);
+ if (assign)
+ assign[i] = cntr_idx;
+reserve:
+ set_bit(cntr_idx, used_mask);
+ set_bit(escr_idx, escr_mask);
+ }
+
+done:
+ return num ? -ENOSPC : 0;
+}
+
+static __initconst const struct x86_pmu p4_pmu = {
+ .name = "Netburst P4/Xeon",
+ .handle_irq = p4_pmu_handle_irq,
+ .disable_all = p4_pmu_disable_all,
+ .enable_all = p4_pmu_enable_all,
+ .enable = p4_pmu_enable_event,
+ .disable = p4_pmu_disable_event,
+ .eventsel = MSR_P4_BPU_CCCR0,
+ .perfctr = MSR_P4_BPU_PERFCTR0,
+ .event_map = p4_pmu_event_map,
+ .max_events = ARRAY_SIZE(p4_general_events),
+ .get_event_constraints = x86_get_event_constraints,
+ /*
+ * IF HT disabled we may need to use all
+ * ARCH_P4_MAX_CCCR counters simulaneously
+ * though leave it restricted at moment assuming
+ * HT is on
+ */
+ .num_counters = ARCH_P4_MAX_CCCR,
+ .apic = 1,
+ .cntval_bits = 40,
+ .cntval_mask = (1ULL << 40) - 1,
+ .max_period = (1ULL << 39) - 1,
+ .hw_config = p4_hw_config,
+ .schedule_events = p4_pmu_schedule_events,
+ /*
+ * This handles erratum N15 in intel doc 249199-029,
+ * the counter may not be updated correctly on write
+ * so we need a second write operation to do the trick
+ * (the official workaround didn't work)
+ *
+ * the former idea is taken from OProfile code
+ */
+ .perfctr_second_write = 1,
+};
+
+static __init int p4_pmu_init(void)
+{
+ unsigned int low, high;
+
+ /* If we get stripped -- indexig fails */
+ BUILD_BUG_ON(ARCH_P4_MAX_CCCR > X86_PMC_MAX_GENERIC);
+
+ rdmsr(MSR_IA32_MISC_ENABLE, low, high);
+ if (!(low & (1 << 7))) {
+ pr_cont("unsupported Netburst CPU model %d ",
+ boot_cpu_data.x86_model);
+ return -ENODEV;
+ }
+
+ memcpy(hw_cache_event_ids, p4_hw_cache_event_ids,
+ sizeof(hw_cache_event_ids));
+
+ pr_cont("Netburst events, ");
+
+ x86_pmu = p4_pmu;
+
+ return 0;
+}
+
+#endif /* CONFIG_CPU_SUP_INTEL */
diff --git a/arch/x86/kernel/cpu/perf_event_p6.c b/arch/x86/kernel/cpu/perf_event_p6.c
index a330485d14da..34ba07be2cda 100644
--- a/arch/x86/kernel/cpu/perf_event_p6.c
+++ b/arch/x86/kernel/cpu/perf_event_p6.c
@@ -27,24 +27,6 @@ static u64 p6_pmu_event_map(int hw_event)
*/
#define P6_NOP_EVENT 0x0000002EULL
-static u64 p6_pmu_raw_event(u64 hw_event)
-{
-#define P6_EVNTSEL_EVENT_MASK 0x000000FFULL
-#define P6_EVNTSEL_UNIT_MASK 0x0000FF00ULL
-#define P6_EVNTSEL_EDGE_MASK 0x00040000ULL
-#define P6_EVNTSEL_INV_MASK 0x00800000ULL
-#define P6_EVNTSEL_REG_MASK 0xFF000000ULL
-
-#define P6_EVNTSEL_MASK \
- (P6_EVNTSEL_EVENT_MASK | \
- P6_EVNTSEL_UNIT_MASK | \
- P6_EVNTSEL_EDGE_MASK | \
- P6_EVNTSEL_INV_MASK | \
- P6_EVNTSEL_REG_MASK)
-
- return hw_event & P6_EVNTSEL_MASK;
-}
-
static struct event_constraint p6_event_constraints[] =
{
INTEL_EVENT_CONSTRAINT(0xc1, 0x1), /* FLOPS */
@@ -66,7 +48,7 @@ static void p6_pmu_disable_all(void)
wrmsrl(MSR_P6_EVNTSEL0, val);
}
-static void p6_pmu_enable_all(void)
+static void p6_pmu_enable_all(int added)
{
unsigned long val;
@@ -102,22 +84,23 @@ static void p6_pmu_enable_event(struct perf_event *event)
(void)checking_wrmsrl(hwc->config_base + hwc->idx, val);
}
-static __initconst struct x86_pmu p6_pmu = {
+static __initconst const struct x86_pmu p6_pmu = {
.name = "p6",
.handle_irq = x86_pmu_handle_irq,
.disable_all = p6_pmu_disable_all,
.enable_all = p6_pmu_enable_all,
.enable = p6_pmu_enable_event,
.disable = p6_pmu_disable_event,
+ .hw_config = x86_pmu_hw_config,
+ .schedule_events = x86_schedule_events,
.eventsel = MSR_P6_EVNTSEL0,
.perfctr = MSR_P6_PERFCTR0,
.event_map = p6_pmu_event_map,
- .raw_event = p6_pmu_raw_event,
.max_events = ARRAY_SIZE(p6_perfmon_event_map),
.apic = 1,
.max_period = (1ULL << 31) - 1,
.version = 0,
- .num_events = 2,
+ .num_counters = 2,
/*
* Events have 40 bits implemented. However they are designed such
* that bits [32-39] are sign extensions of bit 31. As such the
@@ -125,8 +108,8 @@ static __initconst struct x86_pmu p6_pmu = {
*
* See IA-32 Intel Architecture Software developer manual Vol 3B
*/
- .event_bits = 32,
- .event_mask = (1ULL << 32) - 1,
+ .cntval_bits = 32,
+ .cntval_mask = (1ULL << 32) - 1,
.get_event_constraints = x86_get_event_constraints,
.event_constraints = p6_event_constraints,
};
diff --git a/arch/x86/kernel/cpu/scattered.c b/arch/x86/kernel/cpu/scattered.c
new file mode 100644
index 000000000000..34b4dad6f0b8
--- /dev/null
+++ b/arch/x86/kernel/cpu/scattered.c
@@ -0,0 +1,63 @@
+/*
+ * Routines to indentify additional cpu features that are scattered in
+ * cpuid space.
+ */
+#include <linux/cpu.h>
+
+#include <asm/pat.h>
+#include <asm/processor.h>
+
+#include <asm/apic.h>
+
+struct cpuid_bit {
+ u16 feature;
+ u8 reg;
+ u8 bit;
+ u32 level;
+ u32 sub_leaf;
+};
+
+enum cpuid_regs {
+ CR_EAX = 0,
+ CR_ECX,
+ CR_EDX,
+ CR_EBX
+};
+
+void __cpuinit init_scattered_cpuid_features(struct cpuinfo_x86 *c)
+{
+ u32 max_level;
+ u32 regs[4];
+ const struct cpuid_bit *cb;
+
+ static const struct cpuid_bit __cpuinitconst cpuid_bits[] = {
+ { X86_FEATURE_IDA, CR_EAX, 1, 0x00000006, 0 },
+ { X86_FEATURE_ARAT, CR_EAX, 2, 0x00000006, 0 },
+ { X86_FEATURE_PLN, CR_EAX, 4, 0x00000006, 0 },
+ { X86_FEATURE_PTS, CR_EAX, 6, 0x00000006, 0 },
+ { X86_FEATURE_APERFMPERF, CR_ECX, 0, 0x00000006, 0 },
+ { X86_FEATURE_EPB, CR_ECX, 3, 0x00000006, 0 },
+ { X86_FEATURE_XSAVEOPT, CR_EAX, 0, 0x0000000d, 1 },
+ { X86_FEATURE_CPB, CR_EDX, 9, 0x80000007, 0 },
+ { X86_FEATURE_NPT, CR_EDX, 0, 0x8000000a, 0 },
+ { X86_FEATURE_LBRV, CR_EDX, 1, 0x8000000a, 0 },
+ { X86_FEATURE_SVML, CR_EDX, 2, 0x8000000a, 0 },
+ { X86_FEATURE_NRIPS, CR_EDX, 3, 0x8000000a, 0 },
+ { 0, 0, 0, 0, 0 }
+ };
+
+ for (cb = cpuid_bits; cb->feature; cb++) {
+
+ /* Verify that the level is valid */
+ max_level = cpuid_eax(cb->level & 0xffff0000);
+ if (max_level < cb->level ||
+ max_level > (cb->level | 0xffff))
+ continue;
+
+ cpuid_count(cb->level, cb->sub_leaf, &regs[CR_EAX],
+ &regs[CR_EBX], &regs[CR_ECX], &regs[CR_EDX]);
+
+ if (regs[cb->reg] & (1 << cb->bit))
+ set_cpu_cap(c, cb->feature);
+ }
+}
diff --git a/arch/x86/kernel/cpu/addon_cpuid_features.c b/arch/x86/kernel/cpu/topology.c
index 97ad79cdf688..4397e987a1cf 100644
--- a/arch/x86/kernel/cpu/addon_cpuid_features.c
+++ b/arch/x86/kernel/cpu/topology.c
@@ -1,60 +1,14 @@
/*
- * Routines to indentify additional cpu features that are scattered in
- * cpuid space.
+ * Check for extended topology enumeration cpuid leaf 0xb and if it
+ * exists, use it for populating initial_apicid and cpu topology
+ * detection.
*/
-#include <linux/cpu.h>
+#include <linux/cpu.h>
+#include <asm/apic.h>
#include <asm/pat.h>
#include <asm/processor.h>
-#include <asm/apic.h>
-
-struct cpuid_bit {
- u16 feature;
- u8 reg;
- u8 bit;
- u32 level;
-};
-
-enum cpuid_regs {
- CR_EAX = 0,
- CR_ECX,
- CR_EDX,
- CR_EBX
-};
-
-void __cpuinit init_scattered_cpuid_features(struct cpuinfo_x86 *c)
-{
- u32 max_level;
- u32 regs[4];
- const struct cpuid_bit *cb;
-
- static const struct cpuid_bit __cpuinitconst cpuid_bits[] = {
- { X86_FEATURE_IDA, CR_EAX, 1, 0x00000006 },
- { X86_FEATURE_ARAT, CR_EAX, 2, 0x00000006 },
- { X86_FEATURE_NPT, CR_EDX, 0, 0x8000000a },
- { X86_FEATURE_LBRV, CR_EDX, 1, 0x8000000a },
- { X86_FEATURE_SVML, CR_EDX, 2, 0x8000000a },
- { X86_FEATURE_NRIPS, CR_EDX, 3, 0x8000000a },
- { 0, 0, 0, 0 }
- };
-
- for (cb = cpuid_bits; cb->feature; cb++) {
-
- /* Verify that the level is valid */
- max_level = cpuid_eax(cb->level & 0xffff0000);
- if (max_level < cb->level ||
- max_level > (cb->level | 0xffff))
- continue;
-
- cpuid(cb->level, &regs[CR_EAX], &regs[CR_EBX],
- &regs[CR_ECX], &regs[CR_EDX]);
-
- if (regs[cb->reg] & (1 << cb->bit))
- set_cpu_cap(c, cb->feature);
- }
-}
-
/* leaf 0xb SMT level */
#define SMT_LEVEL 0
diff --git a/arch/x86/kernel/cpu/vmware.c b/arch/x86/kernel/cpu/vmware.c
index dfdb4dba2320..227b0448960d 100644
--- a/arch/x86/kernel/cpu/vmware.c
+++ b/arch/x86/kernel/cpu/vmware.c
@@ -24,8 +24,8 @@
#include <linux/dmi.h>
#include <linux/module.h>
#include <asm/div64.h>
-#include <asm/vmware.h>
#include <asm/x86_init.h>
+#include <asm/hypervisor.h>
#define CPUID_VMWARE_INFO_LEAF 0x40000000
#define VMWARE_HYPERVISOR_MAGIC 0x564D5868
@@ -51,7 +51,7 @@ static inline int __vmware_platform(void)
static unsigned long vmware_get_tsc_khz(void)
{
- uint64_t tsc_hz;
+ uint64_t tsc_hz, lpj;
uint32_t eax, ebx, ecx, edx;
VMWARE_PORT(GETHZ, eax, ebx, ecx, edx);
@@ -62,10 +62,17 @@ static unsigned long vmware_get_tsc_khz(void)
printk(KERN_INFO "TSC freq read from hypervisor : %lu.%03lu MHz\n",
(unsigned long) tsc_hz / 1000,
(unsigned long) tsc_hz % 1000);
+
+ if (!preset_lpj) {
+ lpj = ((u64)tsc_hz * 1000);
+ do_div(lpj, HZ);
+ preset_lpj = lpj;
+ }
+
return tsc_hz;
}
-void __init vmware_platform_setup(void)
+static void __init vmware_platform_setup(void)
{
uint32_t eax, ebx, ecx, edx;
@@ -83,26 +90,22 @@ void __init vmware_platform_setup(void)
* serial key should be enough, as this will always have a VMware
* specific string when running under VMware hypervisor.
*/
-int vmware_platform(void)
+static bool __init vmware_platform(void)
{
if (cpu_has_hypervisor) {
- unsigned int eax, ebx, ecx, edx;
- char hyper_vendor_id[13];
-
- cpuid(CPUID_VMWARE_INFO_LEAF, &eax, &ebx, &ecx, &edx);
- memcpy(hyper_vendor_id + 0, &ebx, 4);
- memcpy(hyper_vendor_id + 4, &ecx, 4);
- memcpy(hyper_vendor_id + 8, &edx, 4);
- hyper_vendor_id[12] = '\0';
- if (!strcmp(hyper_vendor_id, "VMwareVMware"))
- return 1;
+ unsigned int eax;
+ unsigned int hyper_vendor_id[3];
+
+ cpuid(CPUID_VMWARE_INFO_LEAF, &eax, &hyper_vendor_id[0],
+ &hyper_vendor_id[1], &hyper_vendor_id[2]);
+ if (!memcmp(hyper_vendor_id, "VMwareVMware", 12))
+ return true;
} else if (dmi_available && dmi_name_in_serial("VMware") &&
__vmware_platform())
- return 1;
+ return true;
- return 0;
+ return false;
}
-EXPORT_SYMBOL(vmware_platform);
/*
* VMware hypervisor takes care of exporting a reliable TSC to the guest.
@@ -116,8 +119,16 @@ EXPORT_SYMBOL(vmware_platform);
* so that the kernel could just trust the hypervisor with providing a
* reliable virtual TSC that is suitable for timekeeping.
*/
-void __cpuinit vmware_set_feature_bits(struct cpuinfo_x86 *c)
+static void __cpuinit vmware_set_cpu_features(struct cpuinfo_x86 *c)
{
set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC);
set_cpu_cap(c, X86_FEATURE_TSC_RELIABLE);
}
+
+const __refconst struct hypervisor_x86 x86_hyper_vmware = {
+ .name = "VMware",
+ .detect = vmware_platform,
+ .set_cpu_features = vmware_set_cpu_features,
+ .init_platform = vmware_platform_setup,
+};
+EXPORT_SYMBOL(x86_hyper_vmware);
diff --git a/arch/x86/kernel/cpuid.c b/arch/x86/kernel/cpuid.c
index 8b862d5900fe..1b7b31ab7d86 100644
--- a/arch/x86/kernel/cpuid.c
+++ b/arch/x86/kernel/cpuid.c
@@ -170,7 +170,7 @@ static int __cpuinit cpuid_class_cpu_callback(struct notifier_block *nfb,
cpuid_device_destroy(cpu);
break;
}
- return err ? NOTIFY_BAD : NOTIFY_OK;
+ return notifier_from_errno(err);
}
static struct notifier_block __refdata cpuid_class_cpu_notifier =
diff --git a/arch/x86/kernel/ds.c b/arch/x86/kernel/ds.c
deleted file mode 100644
index 1c47390dd0e5..000000000000
--- a/arch/x86/kernel/ds.c
+++ /dev/null
@@ -1,1437 +0,0 @@
-/*
- * Debug Store support
- *
- * This provides a low-level interface to the hardware's Debug Store
- * feature that is used for branch trace store (BTS) and
- * precise-event based sampling (PEBS).
- *
- * It manages:
- * - DS and BTS hardware configuration
- * - buffer overflow handling (to be done)
- * - buffer access
- *
- * It does not do:
- * - security checking (is the caller allowed to trace the task)
- * - buffer allocation (memory accounting)
- *
- *
- * Copyright (C) 2007-2009 Intel Corporation.
- * Markus Metzger <markus.t.metzger@intel.com>, 2007-2009
- */
-
-#include <linux/kernel.h>
-#include <linux/string.h>
-#include <linux/errno.h>
-#include <linux/sched.h>
-#include <linux/slab.h>
-#include <linux/mm.h>
-#include <linux/trace_clock.h>
-
-#include <asm/ds.h>
-
-#include "ds_selftest.h"
-
-/*
- * The configuration for a particular DS hardware implementation:
- */
-struct ds_configuration {
- /* The name of the configuration: */
- const char *name;
-
- /* The size of pointer-typed fields in DS, BTS, and PEBS: */
- unsigned char sizeof_ptr_field;
-
- /* The size of a BTS/PEBS record in bytes: */
- unsigned char sizeof_rec[2];
-
- /* The number of pebs counter reset values in the DS structure. */
- unsigned char nr_counter_reset;
-
- /* Control bit-masks indexed by enum ds_feature: */
- unsigned long ctl[dsf_ctl_max];
-};
-static struct ds_configuration ds_cfg __read_mostly;
-
-
-/* Maximal size of a DS configuration: */
-#define MAX_SIZEOF_DS 0x80
-
-/* Maximal size of a BTS record: */
-#define MAX_SIZEOF_BTS (3 * 8)
-
-/* BTS and PEBS buffer alignment: */
-#define DS_ALIGNMENT (1 << 3)
-
-/* Number of buffer pointers in DS: */
-#define NUM_DS_PTR_FIELDS 8
-
-/* Size of a pebs reset value in DS: */
-#define PEBS_RESET_FIELD_SIZE 8
-
-/* Mask of control bits in the DS MSR register: */
-#define BTS_CONTROL \
- ( ds_cfg.ctl[dsf_bts] | \
- ds_cfg.ctl[dsf_bts_kernel] | \
- ds_cfg.ctl[dsf_bts_user] | \
- ds_cfg.ctl[dsf_bts_overflow] )
-
-/*
- * A BTS or PEBS tracer.
- *
- * This holds the configuration of the tracer and serves as a handle
- * to identify tracers.
- */
-struct ds_tracer {
- /* The DS context (partially) owned by this tracer. */
- struct ds_context *context;
- /* The buffer provided on ds_request() and its size in bytes. */
- void *buffer;
- size_t size;
-};
-
-struct bts_tracer {
- /* The common DS part: */
- struct ds_tracer ds;
-
- /* The trace including the DS configuration: */
- struct bts_trace trace;
-
- /* Buffer overflow notification function: */
- bts_ovfl_callback_t ovfl;
-
- /* Active flags affecting trace collection. */
- unsigned int flags;
-};
-
-struct pebs_tracer {
- /* The common DS part: */
- struct ds_tracer ds;
-
- /* The trace including the DS configuration: */
- struct pebs_trace trace;
-
- /* Buffer overflow notification function: */
- pebs_ovfl_callback_t ovfl;
-};
-
-/*
- * Debug Store (DS) save area configuration (see Intel64 and IA32
- * Architectures Software Developer's Manual, section 18.5)
- *
- * The DS configuration consists of the following fields; different
- * architetures vary in the size of those fields.
- *
- * - double-word aligned base linear address of the BTS buffer
- * - write pointer into the BTS buffer
- * - end linear address of the BTS buffer (one byte beyond the end of
- * the buffer)
- * - interrupt pointer into BTS buffer
- * (interrupt occurs when write pointer passes interrupt pointer)
- * - double-word aligned base linear address of the PEBS buffer
- * - write pointer into the PEBS buffer
- * - end linear address of the PEBS buffer (one byte beyond the end of
- * the buffer)
- * - interrupt pointer into PEBS buffer
- * (interrupt occurs when write pointer passes interrupt pointer)
- * - value to which counter is reset following counter overflow
- *
- * Later architectures use 64bit pointers throughout, whereas earlier
- * architectures use 32bit pointers in 32bit mode.
- *
- *
- * We compute the base address for the first 8 fields based on:
- * - the field size stored in the DS configuration
- * - the relative field position
- * - an offset giving the start of the respective region
- *
- * This offset is further used to index various arrays holding
- * information for BTS and PEBS at the respective index.
- *
- * On later 32bit processors, we only access the lower 32bit of the
- * 64bit pointer fields. The upper halves will be zeroed out.
- */
-
-enum ds_field {
- ds_buffer_base = 0,
- ds_index,
- ds_absolute_maximum,
- ds_interrupt_threshold,
-};
-
-enum ds_qualifier {
- ds_bts = 0,
- ds_pebs
-};
-
-static inline unsigned long
-ds_get(const unsigned char *base, enum ds_qualifier qual, enum ds_field field)
-{
- base += (ds_cfg.sizeof_ptr_field * (field + (4 * qual)));
- return *(unsigned long *)base;
-}
-
-static inline void
-ds_set(unsigned char *base, enum ds_qualifier qual, enum ds_field field,
- unsigned long value)
-{
- base += (ds_cfg.sizeof_ptr_field * (field + (4 * qual)));
- (*(unsigned long *)base) = value;
-}
-
-
-/*
- * Locking is done only for allocating BTS or PEBS resources.
- */
-static DEFINE_SPINLOCK(ds_lock);
-
-/*
- * We either support (system-wide) per-cpu or per-thread allocation.
- * We distinguish the two based on the task_struct pointer, where a
- * NULL pointer indicates per-cpu allocation for the current cpu.
- *
- * Allocations are use-counted. As soon as resources are allocated,
- * further allocations must be of the same type (per-cpu or
- * per-thread). We model this by counting allocations (i.e. the number
- * of tracers of a certain type) for one type negatively:
- * =0 no tracers
- * >0 number of per-thread tracers
- * <0 number of per-cpu tracers
- *
- * Tracers essentially gives the number of ds contexts for a certain
- * type of allocation.
- */
-static atomic_t tracers = ATOMIC_INIT(0);
-
-static inline int get_tracer(struct task_struct *task)
-{
- int error;
-
- spin_lock_irq(&ds_lock);
-
- if (task) {
- error = -EPERM;
- if (atomic_read(&tracers) < 0)
- goto out;
- atomic_inc(&tracers);
- } else {
- error = -EPERM;
- if (atomic_read(&tracers) > 0)
- goto out;
- atomic_dec(&tracers);
- }
-
- error = 0;
-out:
- spin_unlock_irq(&ds_lock);
- return error;
-}
-
-static inline void put_tracer(struct task_struct *task)
-{
- if (task)
- atomic_dec(&tracers);
- else
- atomic_inc(&tracers);
-}
-
-/*
- * The DS context is either attached to a thread or to a cpu:
- * - in the former case, the thread_struct contains a pointer to the
- * attached context.
- * - in the latter case, we use a static array of per-cpu context
- * pointers.
- *
- * Contexts are use-counted. They are allocated on first access and
- * deallocated when the last user puts the context.
- */
-struct ds_context {
- /* The DS configuration; goes into MSR_IA32_DS_AREA: */
- unsigned char ds[MAX_SIZEOF_DS];
-
- /* The owner of the BTS and PEBS configuration, respectively: */
- struct bts_tracer *bts_master;
- struct pebs_tracer *pebs_master;
-
- /* Use count: */
- unsigned long count;
-
- /* Pointer to the context pointer field: */
- struct ds_context **this;
-
- /* The traced task; NULL for cpu tracing: */
- struct task_struct *task;
-
- /* The traced cpu; only valid if task is NULL: */
- int cpu;
-};
-
-static DEFINE_PER_CPU(struct ds_context *, cpu_ds_context);
-
-
-static struct ds_context *ds_get_context(struct task_struct *task, int cpu)
-{
- struct ds_context **p_context =
- (task ? &task->thread.ds_ctx : &per_cpu(cpu_ds_context, cpu));
- struct ds_context *context = NULL;
- struct ds_context *new_context = NULL;
-
- /* Chances are small that we already have a context. */
- new_context = kzalloc(sizeof(*new_context), GFP_KERNEL);
- if (!new_context)
- return NULL;
-
- spin_lock_irq(&ds_lock);
-
- context = *p_context;
- if (likely(!context)) {
- context = new_context;
-
- context->this = p_context;
- context->task = task;
- context->cpu = cpu;
- context->count = 0;
-
- *p_context = context;
- }
-
- context->count++;
-
- spin_unlock_irq(&ds_lock);
-
- if (context != new_context)
- kfree(new_context);
-
- return context;
-}
-
-static void ds_put_context(struct ds_context *context)
-{
- struct task_struct *task;
- unsigned long irq;
-
- if (!context)
- return;
-
- spin_lock_irqsave(&ds_lock, irq);
-
- if (--context->count) {
- spin_unlock_irqrestore(&ds_lock, irq);
- return;
- }
-
- *(context->this) = NULL;
-
- task = context->task;
-
- if (task)
- clear_tsk_thread_flag(task, TIF_DS_AREA_MSR);
-
- /*
- * We leave the (now dangling) pointer to the DS configuration in
- * the DS_AREA msr. This is as good or as bad as replacing it with
- * NULL - the hardware would crash if we enabled tracing.
- *
- * This saves us some problems with having to write an msr on a
- * different cpu while preventing others from doing the same for the
- * next context for that same cpu.
- */
-
- spin_unlock_irqrestore(&ds_lock, irq);
-
- /* The context might still be in use for context switching. */
- if (task && (task != current))
- wait_task_context_switch(task);
-
- kfree(context);
-}
-
-static void ds_install_ds_area(struct ds_context *context)
-{
- unsigned long ds;
-
- ds = (unsigned long)context->ds;
-
- /*
- * There is a race between the bts master and the pebs master.
- *
- * The thread/cpu access is synchronized via get/put_cpu() for
- * task tracing and via wrmsr_on_cpu for cpu tracing.
- *
- * If bts and pebs are collected for the same task or same cpu,
- * the same confiuration is written twice.
- */
- if (context->task) {
- get_cpu();
- if (context->task == current)
- wrmsrl(MSR_IA32_DS_AREA, ds);
- set_tsk_thread_flag(context->task, TIF_DS_AREA_MSR);
- put_cpu();
- } else
- wrmsr_on_cpu(context->cpu, MSR_IA32_DS_AREA,
- (u32)((u64)ds), (u32)((u64)ds >> 32));
-}
-
-/*
- * Call the tracer's callback on a buffer overflow.
- *
- * context: the ds context
- * qual: the buffer type
- */
-static void ds_overflow(struct ds_context *context, enum ds_qualifier qual)
-{
- switch (qual) {
- case ds_bts:
- if (context->bts_master &&
- context->bts_master->ovfl)
- context->bts_master->ovfl(context->bts_master);
- break;
- case ds_pebs:
- if (context->pebs_master &&
- context->pebs_master->ovfl)
- context->pebs_master->ovfl(context->pebs_master);
- break;
- }
-}
-
-
-/*
- * Write raw data into the BTS or PEBS buffer.
- *
- * The remainder of any partially written record is zeroed out.
- *
- * context: the DS context
- * qual: the buffer type
- * record: the data to write
- * size: the size of the data
- */
-static int ds_write(struct ds_context *context, enum ds_qualifier qual,
- const void *record, size_t size)
-{
- int bytes_written = 0;
-
- if (!record)
- return -EINVAL;
-
- while (size) {
- unsigned long base, index, end, write_end, int_th;
- unsigned long write_size, adj_write_size;
-
- /*
- * Write as much as possible without producing an
- * overflow interrupt.
- *
- * Interrupt_threshold must either be
- * - bigger than absolute_maximum or
- * - point to a record between buffer_base and absolute_maximum
- *
- * Index points to a valid record.
- */
- base = ds_get(context->ds, qual, ds_buffer_base);
- index = ds_get(context->ds, qual, ds_index);
- end = ds_get(context->ds, qual, ds_absolute_maximum);
- int_th = ds_get(context->ds, qual, ds_interrupt_threshold);
-
- write_end = min(end, int_th);
-
- /*
- * If we are already beyond the interrupt threshold,
- * we fill the entire buffer.
- */
- if (write_end <= index)
- write_end = end;
-
- if (write_end <= index)
- break;
-
- write_size = min((unsigned long) size, write_end - index);
- memcpy((void *)index, record, write_size);
-
- record = (const char *)record + write_size;
- size -= write_size;
- bytes_written += write_size;
-
- adj_write_size = write_size / ds_cfg.sizeof_rec[qual];
- adj_write_size *= ds_cfg.sizeof_rec[qual];
-
- /* Zero out trailing bytes. */
- memset((char *)index + write_size, 0,
- adj_write_size - write_size);
- index += adj_write_size;
-
- if (index >= end)
- index = base;
- ds_set(context->ds, qual, ds_index, index);
-
- if (index >= int_th)
- ds_overflow(context, qual);
- }
-
- return bytes_written;
-}
-
-
-/*
- * Branch Trace Store (BTS) uses the following format. Different
- * architectures vary in the size of those fields.
- * - source linear address
- * - destination linear address
- * - flags
- *
- * Later architectures use 64bit pointers throughout, whereas earlier
- * architectures use 32bit pointers in 32bit mode.
- *
- * We compute the base address for the fields based on:
- * - the field size stored in the DS configuration
- * - the relative field position
- *
- * In order to store additional information in the BTS buffer, we use
- * a special source address to indicate that the record requires
- * special interpretation.
- *
- * Netburst indicated via a bit in the flags field whether the branch
- * was predicted; this is ignored.
- *
- * We use two levels of abstraction:
- * - the raw data level defined here
- * - an arch-independent level defined in ds.h
- */
-
-enum bts_field {
- bts_from,
- bts_to,
- bts_flags,
-
- bts_qual = bts_from,
- bts_clock = bts_to,
- bts_pid = bts_flags,
-
- bts_qual_mask = (bts_qual_max - 1),
- bts_escape = ((unsigned long)-1 & ~bts_qual_mask)
-};
-
-static inline unsigned long bts_get(const char *base, unsigned long field)
-{
- base += (ds_cfg.sizeof_ptr_field * field);
- return *(unsigned long *)base;
-}
-
-static inline void bts_set(char *base, unsigned long field, unsigned long val)
-{
- base += (ds_cfg.sizeof_ptr_field * field);
- (*(unsigned long *)base) = val;
-}
-
-
-/*
- * The raw BTS data is architecture dependent.
- *
- * For higher-level users, we give an arch-independent view.
- * - ds.h defines struct bts_struct
- * - bts_read translates one raw bts record into a bts_struct
- * - bts_write translates one bts_struct into the raw format and
- * writes it into the top of the parameter tracer's buffer.
- *
- * return: bytes read/written on success; -Eerrno, otherwise
- */
-static int
-bts_read(struct bts_tracer *tracer, const void *at, struct bts_struct *out)
-{
- if (!tracer)
- return -EINVAL;
-
- if (at < tracer->trace.ds.begin)
- return -EINVAL;
-
- if (tracer->trace.ds.end < (at + tracer->trace.ds.size))
- return -EINVAL;
-
- memset(out, 0, sizeof(*out));
- if ((bts_get(at, bts_qual) & ~bts_qual_mask) == bts_escape) {
- out->qualifier = (bts_get(at, bts_qual) & bts_qual_mask);
- out->variant.event.clock = bts_get(at, bts_clock);
- out->variant.event.pid = bts_get(at, bts_pid);
- } else {
- out->qualifier = bts_branch;
- out->variant.lbr.from = bts_get(at, bts_from);
- out->variant.lbr.to = bts_get(at, bts_to);
-
- if (!out->variant.lbr.from && !out->variant.lbr.to)
- out->qualifier = bts_invalid;
- }
-
- return ds_cfg.sizeof_rec[ds_bts];
-}
-
-static int bts_write(struct bts_tracer *tracer, const struct bts_struct *in)
-{
- unsigned char raw[MAX_SIZEOF_BTS];
-
- if (!tracer)
- return -EINVAL;
-
- if (MAX_SIZEOF_BTS < ds_cfg.sizeof_rec[ds_bts])
- return -EOVERFLOW;
-
- switch (in->qualifier) {
- case bts_invalid:
- bts_set(raw, bts_from, 0);
- bts_set(raw, bts_to, 0);
- bts_set(raw, bts_flags, 0);
- break;
- case bts_branch:
- bts_set(raw, bts_from, in->variant.lbr.from);
- bts_set(raw, bts_to, in->variant.lbr.to);
- bts_set(raw, bts_flags, 0);
- break;
- case bts_task_arrives:
- case bts_task_departs:
- bts_set(raw, bts_qual, (bts_escape | in->qualifier));
- bts_set(raw, bts_clock, in->variant.event.clock);
- bts_set(raw, bts_pid, in->variant.event.pid);
- break;
- default:
- return -EINVAL;
- }
-
- return ds_write(tracer->ds.context, ds_bts, raw,
- ds_cfg.sizeof_rec[ds_bts]);
-}
-
-
-static void ds_write_config(struct ds_context *context,
- struct ds_trace *cfg, enum ds_qualifier qual)
-{
- unsigned char *ds = context->ds;
-
- ds_set(ds, qual, ds_buffer_base, (unsigned long)cfg->begin);
- ds_set(ds, qual, ds_index, (unsigned long)cfg->top);
- ds_set(ds, qual, ds_absolute_maximum, (unsigned long)cfg->end);
- ds_set(ds, qual, ds_interrupt_threshold, (unsigned long)cfg->ith);
-}
-
-static void ds_read_config(struct ds_context *context,
- struct ds_trace *cfg, enum ds_qualifier qual)
-{
- unsigned char *ds = context->ds;
-
- cfg->begin = (void *)ds_get(ds, qual, ds_buffer_base);
- cfg->top = (void *)ds_get(ds, qual, ds_index);
- cfg->end = (void *)ds_get(ds, qual, ds_absolute_maximum);
- cfg->ith = (void *)ds_get(ds, qual, ds_interrupt_threshold);
-}
-
-static void ds_init_ds_trace(struct ds_trace *trace, enum ds_qualifier qual,
- void *base, size_t size, size_t ith,
- unsigned int flags) {
- unsigned long buffer, adj;
-
- /*
- * Adjust the buffer address and size to meet alignment
- * constraints:
- * - buffer is double-word aligned
- * - size is multiple of record size
- *
- * We checked the size at the very beginning; we have enough
- * space to do the adjustment.
- */
- buffer = (unsigned long)base;
-
- adj = ALIGN(buffer, DS_ALIGNMENT) - buffer;
- buffer += adj;
- size -= adj;
-
- trace->n = size / ds_cfg.sizeof_rec[qual];
- trace->size = ds_cfg.sizeof_rec[qual];
-
- size = (trace->n * trace->size);
-
- trace->begin = (void *)buffer;
- trace->top = trace->begin;
- trace->end = (void *)(buffer + size);
- /*
- * The value for 'no threshold' is -1, which will set the
- * threshold outside of the buffer, just like we want it.
- */
- ith *= ds_cfg.sizeof_rec[qual];
- trace->ith = (void *)(buffer + size - ith);
-
- trace->flags = flags;
-}
-
-
-static int ds_request(struct ds_tracer *tracer, struct ds_trace *trace,
- enum ds_qualifier qual, struct task_struct *task,
- int cpu, void *base, size_t size, size_t th)
-{
- struct ds_context *context;
- int error;
- size_t req_size;
-
- error = -EOPNOTSUPP;
- if (!ds_cfg.sizeof_rec[qual])
- goto out;
-
- error = -EINVAL;
- if (!base)
- goto out;
-
- req_size = ds_cfg.sizeof_rec[qual];
- /* We might need space for alignment adjustments. */
- if (!IS_ALIGNED((unsigned long)base, DS_ALIGNMENT))
- req_size += DS_ALIGNMENT;
-
- error = -EINVAL;
- if (size < req_size)
- goto out;
-
- if (th != (size_t)-1) {
- th *= ds_cfg.sizeof_rec[qual];
-
- error = -EINVAL;
- if (size <= th)
- goto out;
- }
-
- tracer->buffer = base;
- tracer->size = size;
-
- error = -ENOMEM;
- context = ds_get_context(task, cpu);
- if (!context)
- goto out;
- tracer->context = context;
-
- /*
- * Defer any tracer-specific initialization work for the context until
- * context ownership has been clarified.
- */
-
- error = 0;
- out:
- return error;
-}
-
-static struct bts_tracer *ds_request_bts(struct task_struct *task, int cpu,
- void *base, size_t size,
- bts_ovfl_callback_t ovfl, size_t th,
- unsigned int flags)
-{
- struct bts_tracer *tracer;
- int error;
-
- /* Buffer overflow notification is not yet implemented. */
- error = -EOPNOTSUPP;
- if (ovfl)
- goto out;
-
- error = get_tracer(task);
- if (error < 0)
- goto out;
-
- error = -ENOMEM;
- tracer = kzalloc(sizeof(*tracer), GFP_KERNEL);
- if (!tracer)
- goto out_put_tracer;
- tracer->ovfl = ovfl;
-
- /* Do some more error checking and acquire a tracing context. */
- error = ds_request(&tracer->ds, &tracer->trace.ds,
- ds_bts, task, cpu, base, size, th);
- if (error < 0)
- goto out_tracer;
-
- /* Claim the bts part of the tracing context we acquired above. */
- spin_lock_irq(&ds_lock);
-
- error = -EPERM;
- if (tracer->ds.context->bts_master)
- goto out_unlock;
- tracer->ds.context->bts_master = tracer;
-
- spin_unlock_irq(&ds_lock);
-
- /*
- * Now that we own the bts part of the context, let's complete the
- * initialization for that part.
- */
- ds_init_ds_trace(&tracer->trace.ds, ds_bts, base, size, th, flags);
- ds_write_config(tracer->ds.context, &tracer->trace.ds, ds_bts);
- ds_install_ds_area(tracer->ds.context);
-
- tracer->trace.read = bts_read;
- tracer->trace.write = bts_write;
-
- /* Start tracing. */
- ds_resume_bts(tracer);
-
- return tracer;
-
- out_unlock:
- spin_unlock_irq(&ds_lock);
- ds_put_context(tracer->ds.context);
- out_tracer:
- kfree(tracer);
- out_put_tracer:
- put_tracer(task);
- out:
- return ERR_PTR(error);
-}
-
-struct bts_tracer *ds_request_bts_task(struct task_struct *task,
- void *base, size_t size,
- bts_ovfl_callback_t ovfl,
- size_t th, unsigned int flags)
-{
- return ds_request_bts(task, 0, base, size, ovfl, th, flags);
-}
-
-struct bts_tracer *ds_request_bts_cpu(int cpu, void *base, size_t size,
- bts_ovfl_callback_t ovfl,
- size_t th, unsigned int flags)
-{
- return ds_request_bts(NULL, cpu, base, size, ovfl, th, flags);
-}
-
-static struct pebs_tracer *ds_request_pebs(struct task_struct *task, int cpu,
- void *base, size_t size,
- pebs_ovfl_callback_t ovfl, size_t th,
- unsigned int flags)
-{
- struct pebs_tracer *tracer;
- int error;
-
- /* Buffer overflow notification is not yet implemented. */
- error = -EOPNOTSUPP;
- if (ovfl)
- goto out;
-
- error = get_tracer(task);
- if (error < 0)
- goto out;
-
- error = -ENOMEM;
- tracer = kzalloc(sizeof(*tracer), GFP_KERNEL);
- if (!tracer)
- goto out_put_tracer;
- tracer->ovfl = ovfl;
-
- /* Do some more error checking and acquire a tracing context. */
- error = ds_request(&tracer->ds, &tracer->trace.ds,
- ds_pebs, task, cpu, base, size, th);
- if (error < 0)
- goto out_tracer;
-
- /* Claim the pebs part of the tracing context we acquired above. */
- spin_lock_irq(&ds_lock);
-
- error = -EPERM;
- if (tracer->ds.context->pebs_master)
- goto out_unlock;
- tracer->ds.context->pebs_master = tracer;
-
- spin_unlock_irq(&ds_lock);
-
- /*
- * Now that we own the pebs part of the context, let's complete the
- * initialization for that part.
- */
- ds_init_ds_trace(&tracer->trace.ds, ds_pebs, base, size, th, flags);
- ds_write_config(tracer->ds.context, &tracer->trace.ds, ds_pebs);
- ds_install_ds_area(tracer->ds.context);
-
- /* Start tracing. */
- ds_resume_pebs(tracer);
-
- return tracer;
-
- out_unlock:
- spin_unlock_irq(&ds_lock);
- ds_put_context(tracer->ds.context);
- out_tracer:
- kfree(tracer);
- out_put_tracer:
- put_tracer(task);
- out:
- return ERR_PTR(error);
-}
-
-struct pebs_tracer *ds_request_pebs_task(struct task_struct *task,
- void *base, size_t size,
- pebs_ovfl_callback_t ovfl,
- size_t th, unsigned int flags)
-{
- return ds_request_pebs(task, 0, base, size, ovfl, th, flags);
-}
-
-struct pebs_tracer *ds_request_pebs_cpu(int cpu, void *base, size_t size,
- pebs_ovfl_callback_t ovfl,
- size_t th, unsigned int flags)
-{
- return ds_request_pebs(NULL, cpu, base, size, ovfl, th, flags);
-}
-
-static void ds_free_bts(struct bts_tracer *tracer)
-{
- struct task_struct *task;
-
- task = tracer->ds.context->task;
-
- WARN_ON_ONCE(tracer->ds.context->bts_master != tracer);
- tracer->ds.context->bts_master = NULL;
-
- /* Make sure tracing stopped and the tracer is not in use. */
- if (task && (task != current))
- wait_task_context_switch(task);
-
- ds_put_context(tracer->ds.context);
- put_tracer(task);
-
- kfree(tracer);
-}
-
-void ds_release_bts(struct bts_tracer *tracer)
-{
- might_sleep();
-
- if (!tracer)
- return;
-
- ds_suspend_bts(tracer);
- ds_free_bts(tracer);
-}
-
-int ds_release_bts_noirq(struct bts_tracer *tracer)
-{
- struct task_struct *task;
- unsigned long irq;
- int error;
-
- if (!tracer)
- return 0;
-
- task = tracer->ds.context->task;
-
- local_irq_save(irq);
-
- error = -EPERM;
- if (!task &&
- (tracer->ds.context->cpu != smp_processor_id()))
- goto out;
-
- error = -EPERM;
- if (task && (task != current))
- goto out;
-
- ds_suspend_bts_noirq(tracer);
- ds_free_bts(tracer);
-
- error = 0;
- out:
- local_irq_restore(irq);
- return error;
-}
-
-static void update_task_debugctlmsr(struct task_struct *task,
- unsigned long debugctlmsr)
-{
- task->thread.debugctlmsr = debugctlmsr;
-
- get_cpu();
- if (task == current)
- update_debugctlmsr(debugctlmsr);
- put_cpu();
-}
-
-void ds_suspend_bts(struct bts_tracer *tracer)
-{
- struct task_struct *task;
- unsigned long debugctlmsr;
- int cpu;
-
- if (!tracer)
- return;
-
- tracer->flags = 0;
-
- task = tracer->ds.context->task;
- cpu = tracer->ds.context->cpu;
-
- WARN_ON(!task && irqs_disabled());
-
- debugctlmsr = (task ?
- task->thread.debugctlmsr :
- get_debugctlmsr_on_cpu(cpu));
- debugctlmsr &= ~BTS_CONTROL;
-
- if (task)
- update_task_debugctlmsr(task, debugctlmsr);
- else
- update_debugctlmsr_on_cpu(cpu, debugctlmsr);
-}
-
-int ds_suspend_bts_noirq(struct bts_tracer *tracer)
-{
- struct task_struct *task;
- unsigned long debugctlmsr, irq;
- int cpu, error = 0;
-
- if (!tracer)
- return 0;
-
- tracer->flags = 0;
-
- task = tracer->ds.context->task;
- cpu = tracer->ds.context->cpu;
-
- local_irq_save(irq);
-
- error = -EPERM;
- if (!task && (cpu != smp_processor_id()))
- goto out;
-
- debugctlmsr = (task ?
- task->thread.debugctlmsr :
- get_debugctlmsr());
- debugctlmsr &= ~BTS_CONTROL;
-
- if (task)
- update_task_debugctlmsr(task, debugctlmsr);
- else
- update_debugctlmsr(debugctlmsr);
-
- error = 0;
- out:
- local_irq_restore(irq);
- return error;
-}
-
-static unsigned long ds_bts_control(struct bts_tracer *tracer)
-{
- unsigned long control;
-
- control = ds_cfg.ctl[dsf_bts];
- if (!(tracer->trace.ds.flags & BTS_KERNEL))
- control |= ds_cfg.ctl[dsf_bts_kernel];
- if (!(tracer->trace.ds.flags & BTS_USER))
- control |= ds_cfg.ctl[dsf_bts_user];
-
- return control;
-}
-
-void ds_resume_bts(struct bts_tracer *tracer)
-{
- struct task_struct *task;
- unsigned long debugctlmsr;
- int cpu;
-
- if (!tracer)
- return;
-
- tracer->flags = tracer->trace.ds.flags;
-
- task = tracer->ds.context->task;
- cpu = tracer->ds.context->cpu;
-
- WARN_ON(!task && irqs_disabled());
-
- debugctlmsr = (task ?
- task->thread.debugctlmsr :
- get_debugctlmsr_on_cpu(cpu));
- debugctlmsr |= ds_bts_control(tracer);
-
- if (task)
- update_task_debugctlmsr(task, debugctlmsr);
- else
- update_debugctlmsr_on_cpu(cpu, debugctlmsr);
-}
-
-int ds_resume_bts_noirq(struct bts_tracer *tracer)
-{
- struct task_struct *task;
- unsigned long debugctlmsr, irq;
- int cpu, error = 0;
-
- if (!tracer)
- return 0;
-
- tracer->flags = tracer->trace.ds.flags;
-
- task = tracer->ds.context->task;
- cpu = tracer->ds.context->cpu;
-
- local_irq_save(irq);
-
- error = -EPERM;
- if (!task && (cpu != smp_processor_id()))
- goto out;
-
- debugctlmsr = (task ?
- task->thread.debugctlmsr :
- get_debugctlmsr());
- debugctlmsr |= ds_bts_control(tracer);
-
- if (task)
- update_task_debugctlmsr(task, debugctlmsr);
- else
- update_debugctlmsr(debugctlmsr);
-
- error = 0;
- out:
- local_irq_restore(irq);
- return error;
-}
-
-static void ds_free_pebs(struct pebs_tracer *tracer)
-{
- struct task_struct *task;
-
- task = tracer->ds.context->task;
-
- WARN_ON_ONCE(tracer->ds.context->pebs_master != tracer);
- tracer->ds.context->pebs_master = NULL;
-
- ds_put_context(tracer->ds.context);
- put_tracer(task);
-
- kfree(tracer);
-}
-
-void ds_release_pebs(struct pebs_tracer *tracer)
-{
- might_sleep();
-
- if (!tracer)
- return;
-
- ds_suspend_pebs(tracer);
- ds_free_pebs(tracer);
-}
-
-int ds_release_pebs_noirq(struct pebs_tracer *tracer)
-{
- struct task_struct *task;
- unsigned long irq;
- int error;
-
- if (!tracer)
- return 0;
-
- task = tracer->ds.context->task;
-
- local_irq_save(irq);
-
- error = -EPERM;
- if (!task &&
- (tracer->ds.context->cpu != smp_processor_id()))
- goto out;
-
- error = -EPERM;
- if (task && (task != current))
- goto out;
-
- ds_suspend_pebs_noirq(tracer);
- ds_free_pebs(tracer);
-
- error = 0;
- out:
- local_irq_restore(irq);
- return error;
-}
-
-void ds_suspend_pebs(struct pebs_tracer *tracer)
-{
-
-}
-
-int ds_suspend_pebs_noirq(struct pebs_tracer *tracer)
-{
- return 0;
-}
-
-void ds_resume_pebs(struct pebs_tracer *tracer)
-{
-
-}
-
-int ds_resume_pebs_noirq(struct pebs_tracer *tracer)
-{
- return 0;
-}
-
-const struct bts_trace *ds_read_bts(struct bts_tracer *tracer)
-{
- if (!tracer)
- return NULL;
-
- ds_read_config(tracer->ds.context, &tracer->trace.ds, ds_bts);
- return &tracer->trace;
-}
-
-const struct pebs_trace *ds_read_pebs(struct pebs_tracer *tracer)
-{
- if (!tracer)
- return NULL;
-
- ds_read_config(tracer->ds.context, &tracer->trace.ds, ds_pebs);
-
- tracer->trace.counters = ds_cfg.nr_counter_reset;
- memcpy(tracer->trace.counter_reset,
- tracer->ds.context->ds +
- (NUM_DS_PTR_FIELDS * ds_cfg.sizeof_ptr_field),
- ds_cfg.nr_counter_reset * PEBS_RESET_FIELD_SIZE);
-
- return &tracer->trace;
-}
-
-int ds_reset_bts(struct bts_tracer *tracer)
-{
- if (!tracer)
- return -EINVAL;
-
- tracer->trace.ds.top = tracer->trace.ds.begin;
-
- ds_set(tracer->ds.context->ds, ds_bts, ds_index,
- (unsigned long)tracer->trace.ds.top);
-
- return 0;
-}
-
-int ds_reset_pebs(struct pebs_tracer *tracer)
-{
- if (!tracer)
- return -EINVAL;
-
- tracer->trace.ds.top = tracer->trace.ds.begin;
-
- ds_set(tracer->ds.context->ds, ds_pebs, ds_index,
- (unsigned long)tracer->trace.ds.top);
-
- return 0;
-}
-
-int ds_set_pebs_reset(struct pebs_tracer *tracer,
- unsigned int counter, u64 value)
-{
- if (!tracer)
- return -EINVAL;
-
- if (ds_cfg.nr_counter_reset < counter)
- return -EINVAL;
-
- *(u64 *)(tracer->ds.context->ds +
- (NUM_DS_PTR_FIELDS * ds_cfg.sizeof_ptr_field) +
- (counter * PEBS_RESET_FIELD_SIZE)) = value;
-
- return 0;
-}
-
-static const struct ds_configuration ds_cfg_netburst = {
- .name = "Netburst",
- .ctl[dsf_bts] = (1 << 2) | (1 << 3),
- .ctl[dsf_bts_kernel] = (1 << 5),
- .ctl[dsf_bts_user] = (1 << 6),
- .nr_counter_reset = 1,
-};
-static const struct ds_configuration ds_cfg_pentium_m = {
- .name = "Pentium M",
- .ctl[dsf_bts] = (1 << 6) | (1 << 7),
- .nr_counter_reset = 1,
-};
-static const struct ds_configuration ds_cfg_core2_atom = {
- .name = "Core 2/Atom",
- .ctl[dsf_bts] = (1 << 6) | (1 << 7),
- .ctl[dsf_bts_kernel] = (1 << 9),
- .ctl[dsf_bts_user] = (1 << 10),
- .nr_counter_reset = 1,
-};
-static const struct ds_configuration ds_cfg_core_i7 = {
- .name = "Core i7",
- .ctl[dsf_bts] = (1 << 6) | (1 << 7),
- .ctl[dsf_bts_kernel] = (1 << 9),
- .ctl[dsf_bts_user] = (1 << 10),
- .nr_counter_reset = 4,
-};
-
-static void
-ds_configure(const struct ds_configuration *cfg,
- struct cpuinfo_x86 *cpu)
-{
- unsigned long nr_pebs_fields = 0;
-
- printk(KERN_INFO "[ds] using %s configuration\n", cfg->name);
-
-#ifdef __i386__
- nr_pebs_fields = 10;
-#else
- nr_pebs_fields = 18;
-#endif
-
- /*
- * Starting with version 2, architectural performance
- * monitoring supports a format specifier.
- */
- if ((cpuid_eax(0xa) & 0xff) > 1) {
- unsigned long perf_capabilities, format;
-
- rdmsrl(MSR_IA32_PERF_CAPABILITIES, perf_capabilities);
-
- format = (perf_capabilities >> 8) & 0xf;
-
- switch (format) {
- case 0:
- nr_pebs_fields = 18;
- break;
- case 1:
- nr_pebs_fields = 22;
- break;
- default:
- printk(KERN_INFO
- "[ds] unknown PEBS format: %lu\n", format);
- nr_pebs_fields = 0;
- break;
- }
- }
-
- memset(&ds_cfg, 0, sizeof(ds_cfg));
- ds_cfg = *cfg;
-
- ds_cfg.sizeof_ptr_field =
- (cpu_has(cpu, X86_FEATURE_DTES64) ? 8 : 4);
-
- ds_cfg.sizeof_rec[ds_bts] = ds_cfg.sizeof_ptr_field * 3;
- ds_cfg.sizeof_rec[ds_pebs] = ds_cfg.sizeof_ptr_field * nr_pebs_fields;
-
- if (!cpu_has(cpu, X86_FEATURE_BTS)) {
- ds_cfg.sizeof_rec[ds_bts] = 0;
- printk(KERN_INFO "[ds] bts not available\n");
- }
- if (!cpu_has(cpu, X86_FEATURE_PEBS)) {
- ds_cfg.sizeof_rec[ds_pebs] = 0;
- printk(KERN_INFO "[ds] pebs not available\n");
- }
-
- printk(KERN_INFO "[ds] sizes: address: %u bit, ",
- 8 * ds_cfg.sizeof_ptr_field);
- printk("bts/pebs record: %u/%u bytes\n",
- ds_cfg.sizeof_rec[ds_bts], ds_cfg.sizeof_rec[ds_pebs]);
-
- WARN_ON_ONCE(MAX_PEBS_COUNTERS < ds_cfg.nr_counter_reset);
-}
-
-void __cpuinit ds_init_intel(struct cpuinfo_x86 *c)
-{
- /* Only configure the first cpu. Others are identical. */
- if (ds_cfg.name)
- return;
-
- switch (c->x86) {
- case 0x6:
- switch (c->x86_model) {
- case 0x9:
- case 0xd: /* Pentium M */
- ds_configure(&ds_cfg_pentium_m, c);
- break;
- case 0xf:
- case 0x17: /* Core2 */
- case 0x1c: /* Atom */
- ds_configure(&ds_cfg_core2_atom, c);
- break;
- case 0x1a: /* Core i7 */
- ds_configure(&ds_cfg_core_i7, c);
- break;
- default:
- /* Sorry, don't know about them. */
- break;
- }
- break;
- case 0xf:
- switch (c->x86_model) {
- case 0x0:
- case 0x1:
- case 0x2: /* Netburst */
- ds_configure(&ds_cfg_netburst, c);
- break;
- default:
- /* Sorry, don't know about them. */
- break;
- }
- break;
- default:
- /* Sorry, don't know about them. */
- break;
- }
-}
-
-static inline void ds_take_timestamp(struct ds_context *context,
- enum bts_qualifier qualifier,
- struct task_struct *task)
-{
- struct bts_tracer *tracer = context->bts_master;
- struct bts_struct ts;
-
- /* Prevent compilers from reading the tracer pointer twice. */
- barrier();
-
- if (!tracer || !(tracer->flags & BTS_TIMESTAMPS))
- return;
-
- memset(&ts, 0, sizeof(ts));
- ts.qualifier = qualifier;
- ts.variant.event.clock = trace_clock_global();
- ts.variant.event.pid = task->pid;
-
- bts_write(tracer, &ts);
-}
-
-/*
- * Change the DS configuration from tracing prev to tracing next.
- */
-void ds_switch_to(struct task_struct *prev, struct task_struct *next)
-{
- struct ds_context *prev_ctx = prev->thread.ds_ctx;
- struct ds_context *next_ctx = next->thread.ds_ctx;
- unsigned long debugctlmsr = next->thread.debugctlmsr;
-
- /* Make sure all data is read before we start. */
- barrier();
-
- if (prev_ctx) {
- update_debugctlmsr(0);
-
- ds_take_timestamp(prev_ctx, bts_task_departs, prev);
- }
-
- if (next_ctx) {
- ds_take_timestamp(next_ctx, bts_task_arrives, next);
-
- wrmsrl(MSR_IA32_DS_AREA, (unsigned long)next_ctx->ds);
- }
-
- update_debugctlmsr(debugctlmsr);
-}
-
-static __init int ds_selftest(void)
-{
- if (ds_cfg.sizeof_rec[ds_bts]) {
- int error;
-
- error = ds_selftest_bts();
- if (error) {
- WARN(1, "[ds] selftest failed. disabling bts.\n");
- ds_cfg.sizeof_rec[ds_bts] = 0;
- }
- }
-
- if (ds_cfg.sizeof_rec[ds_pebs]) {
- int error;
-
- error = ds_selftest_pebs();
- if (error) {
- WARN(1, "[ds] selftest failed. disabling pebs.\n");
- ds_cfg.sizeof_rec[ds_pebs] = 0;
- }
- }
-
- return 0;
-}
-device_initcall(ds_selftest);
diff --git a/arch/x86/kernel/ds_selftest.c b/arch/x86/kernel/ds_selftest.c
deleted file mode 100644
index 6bc7c199ab99..000000000000
--- a/arch/x86/kernel/ds_selftest.c
+++ /dev/null
@@ -1,408 +0,0 @@
-/*
- * Debug Store support - selftest
- *
- *
- * Copyright (C) 2009 Intel Corporation.
- * Markus Metzger <markus.t.metzger@intel.com>, 2009
- */
-
-#include "ds_selftest.h"
-
-#include <linux/kernel.h>
-#include <linux/string.h>
-#include <linux/smp.h>
-#include <linux/cpu.h>
-
-#include <asm/ds.h>
-
-
-#define BUFFER_SIZE 521 /* Intentionally chose an odd size. */
-#define SMALL_BUFFER_SIZE 24 /* A single bts entry. */
-
-struct ds_selftest_bts_conf {
- struct bts_tracer *tracer;
- int error;
- int (*suspend)(struct bts_tracer *);
- int (*resume)(struct bts_tracer *);
-};
-
-static int ds_selftest_bts_consistency(const struct bts_trace *trace)
-{
- int error = 0;
-
- if (!trace) {
- printk(KERN_CONT "failed to access trace...");
- /* Bail out. Other tests are pointless. */
- return -1;
- }
-
- if (!trace->read) {
- printk(KERN_CONT "bts read not available...");
- error = -1;
- }
-
- /* Do some sanity checks on the trace configuration. */
- if (!trace->ds.n) {
- printk(KERN_CONT "empty bts buffer...");
- error = -1;
- }
- if (!trace->ds.size) {
- printk(KERN_CONT "bad bts trace setup...");
- error = -1;
- }
- if (trace->ds.end !=
- (char *)trace->ds.begin + (trace->ds.n * trace->ds.size)) {
- printk(KERN_CONT "bad bts buffer setup...");
- error = -1;
- }
- /*
- * We allow top in [begin; end], since its not clear when the
- * overflow adjustment happens: after the increment or before the
- * write.
- */
- if ((trace->ds.top < trace->ds.begin) ||
- (trace->ds.end < trace->ds.top)) {
- printk(KERN_CONT "bts top out of bounds...");
- error = -1;
- }
-
- return error;
-}
-
-static int ds_selftest_bts_read(struct bts_tracer *tracer,
- const struct bts_trace *trace,
- const void *from, const void *to)
-{
- const unsigned char *at;
-
- /*
- * Check a few things which do not belong to this test.
- * They should be covered by other tests.
- */
- if (!trace)
- return -1;
-
- if (!trace->read)
- return -1;
-
- if (to < from)
- return -1;
-
- if (from < trace->ds.begin)
- return -1;
-
- if (trace->ds.end < to)
- return -1;
-
- if (!trace->ds.size)
- return -1;
-
- /* Now to the test itself. */
- for (at = from; (void *)at < to; at += trace->ds.size) {
- struct bts_struct bts;
- unsigned long index;
- int error;
-
- if (((void *)at - trace->ds.begin) % trace->ds.size) {
- printk(KERN_CONT
- "read from non-integer index...");
- return -1;
- }
- index = ((void *)at - trace->ds.begin) / trace->ds.size;
-
- memset(&bts, 0, sizeof(bts));
- error = trace->read(tracer, at, &bts);
- if (error < 0) {
- printk(KERN_CONT
- "error reading bts trace at [%lu] (0x%p)...",
- index, at);
- return error;
- }
-
- switch (bts.qualifier) {
- case BTS_BRANCH:
- break;
- default:
- printk(KERN_CONT
- "unexpected bts entry %llu at [%lu] (0x%p)...",
- bts.qualifier, index, at);
- return -1;
- }
- }
-
- return 0;
-}
-
-static void ds_selftest_bts_cpu(void *arg)
-{
- struct ds_selftest_bts_conf *conf = arg;
- const struct bts_trace *trace;
- void *top;
-
- if (IS_ERR(conf->tracer)) {
- conf->error = PTR_ERR(conf->tracer);
- conf->tracer = NULL;
-
- printk(KERN_CONT
- "initialization failed (err: %d)...", conf->error);
- return;
- }
-
- /* We should meanwhile have enough trace. */
- conf->error = conf->suspend(conf->tracer);
- if (conf->error < 0)
- return;
-
- /* Let's see if we can access the trace. */
- trace = ds_read_bts(conf->tracer);
-
- conf->error = ds_selftest_bts_consistency(trace);
- if (conf->error < 0)
- return;
-
- /* If everything went well, we should have a few trace entries. */
- if (trace->ds.top == trace->ds.begin) {
- /*
- * It is possible but highly unlikely that we got a
- * buffer overflow and end up at exactly the same
- * position we started from.
- * Let's issue a warning, but continue.
- */
- printk(KERN_CONT "no trace/overflow...");
- }
-
- /* Let's try to read the trace we collected. */
- conf->error =
- ds_selftest_bts_read(conf->tracer, trace,
- trace->ds.begin, trace->ds.top);
- if (conf->error < 0)
- return;
-
- /*
- * Let's read the trace again.
- * Since we suspended tracing, we should get the same result.
- */
- top = trace->ds.top;
-
- trace = ds_read_bts(conf->tracer);
- conf->error = ds_selftest_bts_consistency(trace);
- if (conf->error < 0)
- return;
-
- if (top != trace->ds.top) {
- printk(KERN_CONT "suspend not working...");
- conf->error = -1;
- return;
- }
-
- /* Let's collect some more trace - see if resume is working. */
- conf->error = conf->resume(conf->tracer);
- if (conf->error < 0)
- return;
-
- conf->error = conf->suspend(conf->tracer);
- if (conf->error < 0)
- return;
-
- trace = ds_read_bts(conf->tracer);
-
- conf->error = ds_selftest_bts_consistency(trace);
- if (conf->error < 0)
- return;
-
- if (trace->ds.top == top) {
- /*
- * It is possible but highly unlikely that we got a
- * buffer overflow and end up at exactly the same
- * position we started from.
- * Let's issue a warning and check the full trace.
- */
- printk(KERN_CONT
- "no resume progress/overflow...");
-
- conf->error =
- ds_selftest_bts_read(conf->tracer, trace,
- trace->ds.begin, trace->ds.end);
- } else if (trace->ds.top < top) {
- /*
- * We had a buffer overflow - the entire buffer should
- * contain trace records.
- */
- conf->error =
- ds_selftest_bts_read(conf->tracer, trace,
- trace->ds.begin, trace->ds.end);
- } else {
- /*
- * It is quite likely that the buffer did not overflow.
- * Let's just check the delta trace.
- */
- conf->error =
- ds_selftest_bts_read(conf->tracer, trace, top,
- trace->ds.top);
- }
- if (conf->error < 0)
- return;
-
- conf->error = 0;
-}
-
-static int ds_suspend_bts_wrap(struct bts_tracer *tracer)
-{
- ds_suspend_bts(tracer);
- return 0;
-}
-
-static int ds_resume_bts_wrap(struct bts_tracer *tracer)
-{
- ds_resume_bts(tracer);
- return 0;
-}
-
-static void ds_release_bts_noirq_wrap(void *tracer)
-{
- (void)ds_release_bts_noirq(tracer);
-}
-
-static int ds_selftest_bts_bad_release_noirq(int cpu,
- struct bts_tracer *tracer)
-{
- int error = -EPERM;
-
- /* Try to release the tracer on the wrong cpu. */
- get_cpu();
- if (cpu != smp_processor_id()) {
- error = ds_release_bts_noirq(tracer);
- if (error != -EPERM)
- printk(KERN_CONT "release on wrong cpu...");
- }
- put_cpu();
-
- return error ? 0 : -1;
-}
-
-static int ds_selftest_bts_bad_request_cpu(int cpu, void *buffer)
-{
- struct bts_tracer *tracer;
- int error;
-
- /* Try to request cpu tracing while task tracing is active. */
- tracer = ds_request_bts_cpu(cpu, buffer, BUFFER_SIZE, NULL,
- (size_t)-1, BTS_KERNEL);
- error = PTR_ERR(tracer);
- if (!IS_ERR(tracer)) {
- ds_release_bts(tracer);
- error = 0;
- }
-
- if (error != -EPERM)
- printk(KERN_CONT "cpu/task tracing overlap...");
-
- return error ? 0 : -1;
-}
-
-static int ds_selftest_bts_bad_request_task(void *buffer)
-{
- struct bts_tracer *tracer;
- int error;
-
- /* Try to request cpu tracing while task tracing is active. */
- tracer = ds_request_bts_task(current, buffer, BUFFER_SIZE, NULL,
- (size_t)-1, BTS_KERNEL);
- error = PTR_ERR(tracer);
- if (!IS_ERR(tracer)) {
- error = 0;
- ds_release_bts(tracer);
- }
-
- if (error != -EPERM)
- printk(KERN_CONT "task/cpu tracing overlap...");
-
- return error ? 0 : -1;
-}
-
-int ds_selftest_bts(void)
-{
- struct ds_selftest_bts_conf conf;
- unsigned char buffer[BUFFER_SIZE], *small_buffer;
- unsigned long irq;
- int cpu;
-
- printk(KERN_INFO "[ds] bts selftest...");
- conf.error = 0;
-
- small_buffer = (unsigned char *)ALIGN((unsigned long)buffer, 8) + 8;
-
- get_online_cpus();
- for_each_online_cpu(cpu) {
- conf.suspend = ds_suspend_bts_wrap;
- conf.resume = ds_resume_bts_wrap;
- conf.tracer =
- ds_request_bts_cpu(cpu, buffer, BUFFER_SIZE,
- NULL, (size_t)-1, BTS_KERNEL);
- ds_selftest_bts_cpu(&conf);
- if (conf.error >= 0)
- conf.error = ds_selftest_bts_bad_request_task(buffer);
- ds_release_bts(conf.tracer);
- if (conf.error < 0)
- goto out;
-
- conf.suspend = ds_suspend_bts_noirq;
- conf.resume = ds_resume_bts_noirq;
- conf.tracer =
- ds_request_bts_cpu(cpu, buffer, BUFFER_SIZE,
- NULL, (size_t)-1, BTS_KERNEL);
- smp_call_function_single(cpu, ds_selftest_bts_cpu, &conf, 1);
- if (conf.error >= 0) {
- conf.error =
- ds_selftest_bts_bad_release_noirq(cpu,
- conf.tracer);
- /* We must not release the tracer twice. */
- if (conf.error < 0)
- conf.tracer = NULL;
- }
- if (conf.error >= 0)
- conf.error = ds_selftest_bts_bad_request_task(buffer);
- smp_call_function_single(cpu, ds_release_bts_noirq_wrap,
- conf.tracer, 1);
- if (conf.error < 0)
- goto out;
- }
-
- conf.suspend = ds_suspend_bts_wrap;
- conf.resume = ds_resume_bts_wrap;
- conf.tracer =
- ds_request_bts_task(current, buffer, BUFFER_SIZE,
- NULL, (size_t)-1, BTS_KERNEL);
- ds_selftest_bts_cpu(&conf);
- if (conf.error >= 0)
- conf.error = ds_selftest_bts_bad_request_cpu(0, buffer);
- ds_release_bts(conf.tracer);
- if (conf.error < 0)
- goto out;
-
- conf.suspend = ds_suspend_bts_noirq;
- conf.resume = ds_resume_bts_noirq;
- conf.tracer =
- ds_request_bts_task(current, small_buffer, SMALL_BUFFER_SIZE,
- NULL, (size_t)-1, BTS_KERNEL);
- local_irq_save(irq);
- ds_selftest_bts_cpu(&conf);
- if (conf.error >= 0)
- conf.error = ds_selftest_bts_bad_request_cpu(0, buffer);
- ds_release_bts_noirq(conf.tracer);
- local_irq_restore(irq);
- if (conf.error < 0)
- goto out;
-
- conf.error = 0;
- out:
- put_online_cpus();
- printk(KERN_CONT "%s.\n", (conf.error ? "failed" : "passed"));
-
- return conf.error;
-}
-
-int ds_selftest_pebs(void)
-{
- return 0;
-}
diff --git a/arch/x86/kernel/ds_selftest.h b/arch/x86/kernel/ds_selftest.h
deleted file mode 100644
index 2ba8745c6663..000000000000
--- a/arch/x86/kernel/ds_selftest.h
+++ /dev/null
@@ -1,15 +0,0 @@
-/*
- * Debug Store support - selftest
- *
- *
- * Copyright (C) 2009 Intel Corporation.
- * Markus Metzger <markus.t.metzger@intel.com>, 2009
- */
-
-#ifdef CONFIG_X86_DS_SELFTEST
-extern int ds_selftest_bts(void);
-extern int ds_selftest_pebs(void);
-#else
-static inline int ds_selftest_bts(void) { return 0; }
-static inline int ds_selftest_pebs(void) { return 0; }
-#endif
diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c
index 6d817554780a..6e8752c1bd52 100644
--- a/arch/x86/kernel/dumpstack.c
+++ b/arch/x86/kernel/dumpstack.c
@@ -18,7 +18,6 @@
#include <asm/stacktrace.h>
-#include "dumpstack.h"
int panic_on_unrecovered_nmi;
int panic_on_io_nmi;
@@ -224,11 +223,6 @@ unsigned __kprobes long oops_begin(void)
int cpu;
unsigned long flags;
- /* notify the hw-branch tracer so it may disable tracing and
- add the last trace to the trace buffer -
- the earlier this happens, the more useful the trace. */
- trace_hw_branch_oops();
-
oops_enter();
/* racy, but better than risking deadlock. */
diff --git a/arch/x86/kernel/dumpstack.h b/arch/x86/kernel/dumpstack.h
deleted file mode 100644
index e1a93be4fd44..000000000000
--- a/arch/x86/kernel/dumpstack.h
+++ /dev/null
@@ -1,56 +0,0 @@
-/*
- * Copyright (C) 1991, 1992 Linus Torvalds
- * Copyright (C) 2000, 2001, 2002 Andi Kleen, SuSE Labs
- */
-
-#ifndef DUMPSTACK_H
-#define DUMPSTACK_H
-
-#ifdef CONFIG_X86_32
-#define STACKSLOTS_PER_LINE 8
-#define get_bp(bp) asm("movl %%ebp, %0" : "=r" (bp) :)
-#else
-#define STACKSLOTS_PER_LINE 4
-#define get_bp(bp) asm("movq %%rbp, %0" : "=r" (bp) :)
-#endif
-
-#include <linux/uaccess.h>
-
-extern void
-show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs,
- unsigned long *stack, unsigned long bp, char *log_lvl);
-
-extern void
-show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs,
- unsigned long *sp, unsigned long bp, char *log_lvl);
-
-extern unsigned int code_bytes;
-
-/* The form of the top of the frame on the stack */
-struct stack_frame {
- struct stack_frame *next_frame;
- unsigned long return_address;
-};
-
-struct stack_frame_ia32 {
- u32 next_frame;
- u32 return_address;
-};
-
-static inline unsigned long rewind_frame_pointer(int n)
-{
- struct stack_frame *frame;
-
- get_bp(frame);
-
-#ifdef CONFIG_FRAME_POINTER
- while (n--) {
- if (probe_kernel_address(&frame->next_frame, frame))
- break;
- }
-#endif
-
- return (unsigned long)frame;
-}
-
-#endif /* DUMPSTACK_H */
diff --git a/arch/x86/kernel/dumpstack_32.c b/arch/x86/kernel/dumpstack_32.c
index 11540a189d93..0f6376ffa2d9 100644
--- a/arch/x86/kernel/dumpstack_32.c
+++ b/arch/x86/kernel/dumpstack_32.c
@@ -16,8 +16,6 @@
#include <asm/stacktrace.h>
-#include "dumpstack.h"
-
void dump_trace(struct task_struct *task, struct pt_regs *regs,
unsigned long *stack, unsigned long bp,
diff --git a/arch/x86/kernel/dumpstack_64.c b/arch/x86/kernel/dumpstack_64.c
index 272c9f1f05f3..57a21f11c791 100644
--- a/arch/x86/kernel/dumpstack_64.c
+++ b/arch/x86/kernel/dumpstack_64.c
@@ -16,7 +16,6 @@
#include <asm/stacktrace.h>
-#include "dumpstack.h"
#define N_EXCEPTION_STACKS_END \
(N_EXCEPTION_STACKS + DEBUG_STKSZ/EXCEPTION_STKSZ - 2)
diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c
index 7bca3c6a02fb..0d6fc71bedb1 100644
--- a/arch/x86/kernel/e820.c
+++ b/arch/x86/kernel/e820.c
@@ -729,7 +729,7 @@ static int __init e820_mark_nvs_memory(void)
struct e820entry *ei = &e820.map[i];
if (ei->type == E820_NVS)
- hibernate_nvs_register(ei->addr, ei->size);
+ suspend_nvs_register(ei->addr, ei->size);
}
return 0;
diff --git a/arch/x86/kernel/early-quirks.c b/arch/x86/kernel/early-quirks.c
index ebdb85cf2686..e5cc7e82e60d 100644
--- a/arch/x86/kernel/early-quirks.c
+++ b/arch/x86/kernel/early-quirks.c
@@ -18,6 +18,7 @@
#include <asm/apic.h>
#include <asm/iommu.h>
#include <asm/gart.h>
+#include <asm/hpet.h>
static void __init fix_hypertransport_config(int num, int slot, int func)
{
@@ -191,6 +192,21 @@ static void __init ati_bugs_contd(int num, int slot, int func)
}
#endif
+/*
+ * Force the read back of the CMP register in hpet_next_event()
+ * to work around the problem that the CMP register write seems to be
+ * delayed. See hpet_next_event() for details.
+ *
+ * We do this on all SMBUS incarnations for now until we have more
+ * information about the affected chipsets.
+ */
+static void __init ati_hpet_bugs(int num, int slot, int func)
+{
+#ifdef CONFIG_HPET_TIMER
+ hpet_readback_cmp = 1;
+#endif
+}
+
#define QFLAG_APPLY_ONCE 0x1
#define QFLAG_APPLIED 0x2
#define QFLAG_DONE (QFLAG_APPLY_ONCE|QFLAG_APPLIED)
@@ -220,6 +236,8 @@ static struct chipset early_qrk[] __initdata = {
PCI_CLASS_SERIAL_SMBUS, PCI_ANY_ID, 0, ati_bugs },
{ PCI_VENDOR_ID_ATI, PCI_DEVICE_ID_ATI_SBX00_SMBUS,
PCI_CLASS_SERIAL_SMBUS, PCI_ANY_ID, 0, ati_bugs_contd },
+ { PCI_VENDOR_ID_ATI, PCI_ANY_ID,
+ PCI_CLASS_SERIAL_SMBUS, PCI_ANY_ID, 0, ati_hpet_bugs },
{}
};
diff --git a/arch/x86/kernel/early_printk.c b/arch/x86/kernel/early_printk.c
index b9c830c12b4a..fa99bae75ace 100644
--- a/arch/x86/kernel/early_printk.c
+++ b/arch/x86/kernel/early_printk.c
@@ -41,6 +41,14 @@ static void early_vga_write(struct console *con, const char *str, unsigned n)
writew(0x720, VGABASE + 2*(max_xpos*j + i));
current_ypos = max_ypos-1;
}
+#ifdef CONFIG_KGDB_KDB
+ if (c == '\b') {
+ if (current_xpos > 0)
+ current_xpos--;
+ } else if (c == '\r') {
+ current_xpos = 0;
+ } else
+#endif
if (c == '\n') {
current_xpos = 0;
current_ypos++;
diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S
index 44a8e0dc6737..227d00920d2f 100644
--- a/arch/x86/kernel/entry_32.S
+++ b/arch/x86/kernel/entry_32.S
@@ -53,6 +53,7 @@
#include <asm/processor-flags.h>
#include <asm/ftrace.h>
#include <asm/irq_vectors.h>
+#include <asm/cpufeature.h>
/* Avoid __ASSEMBLER__'ifying <linux/audit.h> just for this. */
#include <linux/elf-em.h>
@@ -610,14 +611,14 @@ ldt_ss:
* compensating for the offset by changing to the ESPFIX segment with
* a base address that matches for the difference.
*/
+#define GDT_ESPFIX_SS PER_CPU_VAR(gdt_page) + (GDT_ENTRY_ESPFIX_SS * 8)
mov %esp, %edx /* load kernel esp */
mov PT_OLDESP(%esp), %eax /* load userspace esp */
mov %dx, %ax /* eax: new kernel esp */
sub %eax, %edx /* offset (low word is 0) */
- PER_CPU(gdt_page, %ebx)
shr $16, %edx
- mov %dl, GDT_ENTRY_ESPFIX_SS * 8 + 4(%ebx) /* bits 16..23 */
- mov %dh, GDT_ENTRY_ESPFIX_SS * 8 + 7(%ebx) /* bits 24..31 */
+ mov %dl, GDT_ESPFIX_SS + 4 /* bits 16..23 */
+ mov %dh, GDT_ESPFIX_SS + 7 /* bits 24..31 */
pushl $__ESPFIX_SS
CFI_ADJUST_CFA_OFFSET 4
push %eax /* new kernel esp */
@@ -790,9 +791,8 @@ ptregs_clone:
* normal stack and adjusts ESP with the matching offset.
*/
/* fixup the stack */
- PER_CPU(gdt_page, %ebx)
- mov GDT_ENTRY_ESPFIX_SS * 8 + 4(%ebx), %al /* bits 16..23 */
- mov GDT_ENTRY_ESPFIX_SS * 8 + 7(%ebx), %ah /* bits 24..31 */
+ mov GDT_ESPFIX_SS + 4, %al /* bits 16..23 */
+ mov GDT_ESPFIX_SS + 7, %ah /* bits 24..31 */
shl $16, %eax
addl %esp, %eax /* the adjusted stack pointer */
pushl $__KERNEL_DS
@@ -905,7 +905,25 @@ ENTRY(simd_coprocessor_error)
RING0_INT_FRAME
pushl $0
CFI_ADJUST_CFA_OFFSET 4
+#ifdef CONFIG_X86_INVD_BUG
+ /* AMD 486 bug: invd from userspace calls exception 19 instead of #GP */
+661: pushl $do_general_protection
+662:
+.section .altinstructions,"a"
+ .balign 4
+ .long 661b
+ .long 663f
+ .word X86_FEATURE_XMM
+ .byte 662b-661b
+ .byte 664f-663f
+.previous
+.section .altinstr_replacement,"ax"
+663: pushl $do_simd_coprocessor_error
+664:
+.previous
+#else
pushl $do_simd_coprocessor_error
+#endif
CFI_ADJUST_CFA_OFFSET 4
jmp error_code
CFI_ENDPROC
@@ -1147,6 +1165,9 @@ ENTRY(xen_failsafe_callback)
.previous
ENDPROC(xen_failsafe_callback)
+BUILD_INTERRUPT3(xen_hvm_callback_vector, XEN_HVM_EVTCHN_CALLBACK,
+ xen_evtchn_do_upcall)
+
#endif /* CONFIG_XEN */
#ifdef CONFIG_FUNCTION_TRACER
diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
index 0697ff139837..c5ea5cdbe7b3 100644
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -571,8 +571,8 @@ auditsys:
* masked off.
*/
sysret_audit:
- movq %rax,%rsi /* second arg, syscall return value */
- cmpq $0,%rax /* is it < 0? */
+ movq RAX-ARGOFFSET(%rsp),%rsi /* second arg, syscall return value */
+ cmpq $0,%rsi /* is it < 0? */
setl %al /* 1 if so, 0 if not */
movzbl %al,%edi /* zero-extend that into %edi */
inc %edi /* first arg, 0->1(AUDITSC_SUCCESS), 1->2(AUDITSC_FAILURE) */
@@ -1065,6 +1065,7 @@ ENTRY(\sym)
END(\sym)
.endm
+#define INIT_TSS_IST(x) PER_CPU_VAR(init_tss) + (TSS_ist + ((x) - 1) * 8)
.macro paranoidzeroentry_ist sym do_sym ist
ENTRY(\sym)
INTR_FRAME
@@ -1076,10 +1077,9 @@ ENTRY(\sym)
TRACE_IRQS_OFF
movq %rsp,%rdi /* pt_regs pointer */
xorl %esi,%esi /* no error code */
- PER_CPU(init_tss, %r12)
- subq $EXCEPTION_STKSZ, TSS_ist + (\ist - 1) * 8(%r12)
+ subq $EXCEPTION_STKSZ, INIT_TSS_IST(\ist)
call \do_sym
- addq $EXCEPTION_STKSZ, TSS_ist + (\ist - 1) * 8(%r12)
+ addq $EXCEPTION_STKSZ, INIT_TSS_IST(\ist)
jmp paranoid_exit /* %ebx: no swapgs flag */
CFI_ENDPROC
END(\sym)
@@ -1329,6 +1329,9 @@ ENTRY(xen_failsafe_callback)
CFI_ENDPROC
END(xen_failsafe_callback)
+apicinterrupt XEN_HVM_EVTCHN_CALLBACK \
+ xen_hvm_callback_vector xen_evtchn_do_upcall
+
#endif /* CONFIG_XEN */
/*
diff --git a/arch/x86/kernel/head32.c b/arch/x86/kernel/head32.c
index b2e246037392..784360c0625c 100644
--- a/arch/x86/kernel/head32.c
+++ b/arch/x86/kernel/head32.c
@@ -20,7 +20,7 @@
static void __init i386_default_early_setup(void)
{
- /* Initilize 32bit specific setup functions */
+ /* Initialize 32bit specific setup functions */
x86_init.resources.probe_roms = probe_roms;
x86_init.resources.reserve_resources = i386_reserve_resources;
x86_init.mpparse.setup_ioapic_ids = setup_ioapic_ids_from_mpc;
diff --git a/arch/x86/kernel/head_32.S b/arch/x86/kernel/head_32.S
index 37c3d4b17d85..ff4c453e13f3 100644
--- a/arch/x86/kernel/head_32.S
+++ b/arch/x86/kernel/head_32.S
@@ -131,6 +131,12 @@ ENTRY(startup_32)
movsl
1:
+#ifdef CONFIG_OLPC_OPENFIRMWARE
+ /* save OFW's pgdir table for later use when calling into OFW */
+ movl %cr3, %eax
+ movl %eax, pa(olpc_ofw_pgd)
+#endif
+
#ifdef CONFIG_PARAVIRT
/* This is can only trip for a broken bootloader... */
cmpw $0x207, pa(boot_params + BP_version)
diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S
index 3d1e6f16b7a6..239046bd447f 100644
--- a/arch/x86/kernel/head_64.S
+++ b/arch/x86/kernel/head_64.S
@@ -234,9 +234,8 @@ ENTRY(secondary_startup_64)
* init data section till per cpu areas are set up.
*/
movl $MSR_GS_BASE,%ecx
- movq initial_gs(%rip),%rax
- movq %rax,%rdx
- shrq $32,%rdx
+ movl initial_gs(%rip),%eax
+ movl initial_gs+4(%rip),%edx
wrmsr
/* esi is pointer to real mode structure with interesting info.
diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c
index 23b4ecdffa9b..33dbcc4ec5ff 100644
--- a/arch/x86/kernel/hpet.c
+++ b/arch/x86/kernel/hpet.c
@@ -16,7 +16,6 @@
#include <asm/hpet.h>
#define HPET_MASK CLOCKSOURCE_MASK(32)
-#define HPET_SHIFT 22
/* FSEC = 10^-15
NSEC = 10^-9 */
@@ -36,6 +35,7 @@
unsigned long hpet_address;
u8 hpet_blockid; /* OS timer block num */
u8 hpet_msi_disable;
+u8 hpet_readback_cmp;
#ifdef CONFIG_PCI_MSI
static unsigned long hpet_num_timers;
@@ -395,19 +395,23 @@ static int hpet_next_event(unsigned long delta,
* at that point and we would wait for the next hpet interrupt
* forever. We found out that reading the CMP register back
* forces the transfer so we can rely on the comparison with
- * the counter register below. If the read back from the
- * compare register does not match the value we programmed
- * then we might have a real hardware problem. We can not do
- * much about it here, but at least alert the user/admin with
- * a prominent warning.
- * An erratum on some chipsets (ICH9,..), results in comparator read
- * immediately following a write returning old value. Workaround
- * for this is to read this value second time, when first
- * read returns old value.
+ * the counter register below.
+ *
+ * That works fine on those ATI chipsets, but on newer Intel
+ * chipsets (ICH9...) this triggers due to an erratum: Reading
+ * the comparator immediately following a write is returning
+ * the old value.
+ *
+ * We restrict the read back to the affected ATI chipsets (set
+ * by quirks) and also run it with hpet=verbose for debugging
+ * purposes.
*/
- if (unlikely((u32)hpet_readl(HPET_Tn_CMP(timer)) != cnt)) {
- WARN_ONCE(hpet_readl(HPET_Tn_CMP(timer)) != cnt,
- KERN_WARNING "hpet: compare register read back failed.\n");
+ if (hpet_readback_cmp || hpet_verbose) {
+ u32 cmp = hpet_readl(HPET_Tn_CMP(timer));
+
+ if (cmp != cnt)
+ printk_once(KERN_WARNING
+ "hpet: compare register read back failed.\n");
}
return (s32)(hpet_readl(HPET_COUNTER) - cnt) >= 0 ? -ETIME : 0;
@@ -782,7 +786,6 @@ static struct clocksource clocksource_hpet = {
.rating = 250,
.read = read_hpet,
.mask = HPET_MASK,
- .shift = HPET_SHIFT,
.flags = CLOCK_SOURCE_IS_CONTINUOUS,
.resume = hpet_resume_counter,
#ifdef CONFIG_X86_64
@@ -793,6 +796,7 @@ static struct clocksource clocksource_hpet = {
static int hpet_clocksource_register(void)
{
u64 start, now;
+ u64 hpet_freq;
cycle_t t1;
/* Start the counter */
@@ -827,9 +831,15 @@ static int hpet_clocksource_register(void)
* mult = (hpet_period * 2^shift)/10^6
* mult = (hpet_period << shift)/FSEC_PER_NSEC
*/
- clocksource_hpet.mult = div_sc(hpet_period, FSEC_PER_NSEC, HPET_SHIFT);
- clocksource_register(&clocksource_hpet);
+ /* Need to convert hpet_period (fsec/cyc) to cyc/sec:
+ *
+ * cyc/sec = FSEC_PER_SEC/hpet_period(fsec/cyc)
+ * cyc/sec = (FSEC_PER_NSEC * NSEC_PER_SEC)/hpet_period
+ */
+ hpet_freq = FSEC_PER_NSEC * NSEC_PER_SEC;
+ do_div(hpet_freq, hpet_period);
+ clocksource_register_hz(&clocksource_hpet, (u32)hpet_freq);
return 0;
}
@@ -959,7 +969,7 @@ fs_initcall(hpet_late_init);
void hpet_disable(void)
{
- if (is_hpet_capable()) {
+ if (is_hpet_capable() && hpet_virt_address) {
unsigned int cfg = hpet_readl(HPET_CFG);
if (hpet_legacy_int_enabled) {
diff --git a/arch/x86/kernel/hw_breakpoint.c b/arch/x86/kernel/hw_breakpoint.c
index d6cc065f519f..a474ec37c32f 100644
--- a/arch/x86/kernel/hw_breakpoint.c
+++ b/arch/x86/kernel/hw_breakpoint.c
@@ -189,25 +189,16 @@ static int get_hbp_len(u8 hbp_len)
}
/*
- * Check for virtual address in user space.
- */
-int arch_check_va_in_userspace(unsigned long va, u8 hbp_len)
-{
- unsigned int len;
-
- len = get_hbp_len(hbp_len);
-
- return (va <= TASK_SIZE - len);
-}
-
-/*
* Check for virtual address in kernel space.
*/
-static int arch_check_va_in_kernelspace(unsigned long va, u8 hbp_len)
+int arch_check_bp_in_kernelspace(struct perf_event *bp)
{
unsigned int len;
+ unsigned long va;
+ struct arch_hw_breakpoint *info = counter_arch_bp(bp);
- len = get_hbp_len(hbp_len);
+ va = info->address;
+ len = get_hbp_len(info->len);
return (va >= TASK_SIZE) && ((va + len - 1) >= TASK_SIZE);
}
@@ -217,6 +208,9 @@ int arch_bp_generic_fields(int x86_len, int x86_type,
{
/* Len */
switch (x86_len) {
+ case X86_BREAKPOINT_LEN_X:
+ *gen_len = sizeof(long);
+ break;
case X86_BREAKPOINT_LEN_1:
*gen_len = HW_BREAKPOINT_LEN_1;
break;
@@ -260,6 +254,29 @@ static int arch_build_bp_info(struct perf_event *bp)
info->address = bp->attr.bp_addr;
+ /* Type */
+ switch (bp->attr.bp_type) {
+ case HW_BREAKPOINT_W:
+ info->type = X86_BREAKPOINT_WRITE;
+ break;
+ case HW_BREAKPOINT_W | HW_BREAKPOINT_R:
+ info->type = X86_BREAKPOINT_RW;
+ break;
+ case HW_BREAKPOINT_X:
+ info->type = X86_BREAKPOINT_EXECUTE;
+ /*
+ * x86 inst breakpoints need to have a specific undefined len.
+ * But we still need to check userspace is not trying to setup
+ * an unsupported length, to get a range breakpoint for example.
+ */
+ if (bp->attr.bp_len == sizeof(long)) {
+ info->len = X86_BREAKPOINT_LEN_X;
+ return 0;
+ }
+ default:
+ return -EINVAL;
+ }
+
/* Len */
switch (bp->attr.bp_len) {
case HW_BREAKPOINT_LEN_1:
@@ -280,28 +297,12 @@ static int arch_build_bp_info(struct perf_event *bp)
return -EINVAL;
}
- /* Type */
- switch (bp->attr.bp_type) {
- case HW_BREAKPOINT_W:
- info->type = X86_BREAKPOINT_WRITE;
- break;
- case HW_BREAKPOINT_W | HW_BREAKPOINT_R:
- info->type = X86_BREAKPOINT_RW;
- break;
- case HW_BREAKPOINT_X:
- info->type = X86_BREAKPOINT_EXECUTE;
- break;
- default:
- return -EINVAL;
- }
-
return 0;
}
/*
* Validate the arch-specific HW Breakpoint register settings
*/
-int arch_validate_hwbkpt_settings(struct perf_event *bp,
- struct task_struct *tsk)
+int arch_validate_hwbkpt_settings(struct perf_event *bp)
{
struct arch_hw_breakpoint *info = counter_arch_bp(bp);
unsigned int align;
@@ -314,17 +315,10 @@ int arch_validate_hwbkpt_settings(struct perf_event *bp,
ret = -EINVAL;
- if (info->type == X86_BREAKPOINT_EXECUTE)
- /*
- * Ptrace-refactoring code
- * For now, we'll allow instruction breakpoint only for user-space
- * addresses
- */
- if ((!arch_check_va_in_userspace(info->address, info->len)) &&
- info->len != X86_BREAKPOINT_EXECUTE)
- return ret;
-
switch (info->len) {
+ case X86_BREAKPOINT_LEN_X:
+ align = sizeof(long) -1;
+ break;
case X86_BREAKPOINT_LEN_1:
align = 0;
break;
@@ -350,15 +344,6 @@ int arch_validate_hwbkpt_settings(struct perf_event *bp,
if (info->address & align)
return -EINVAL;
- /* Check that the virtual address is in the proper range */
- if (tsk) {
- if (!arch_check_va_in_userspace(info->address, info->len))
- return -EFAULT;
- } else {
- if (!arch_check_va_in_kernelspace(info->address, info->len))
- return -EFAULT;
- }
-
return 0;
}
@@ -495,6 +480,13 @@ static int __kprobes hw_breakpoint_handler(struct die_args *args)
perf_bp_event(bp, args->regs);
+ /*
+ * Set up resume flag to avoid breakpoint recursion when
+ * returning back to origin.
+ */
+ if (bp->hw.info.type == X86_BREAKPOINT_EXECUTE)
+ args->regs->flags |= X86_EFLAGS_RF;
+
rcu_read_unlock();
}
/*
diff --git a/arch/x86/kernel/i387.c b/arch/x86/kernel/i387.c
index 54c31c285488..1f11f5ce668f 100644
--- a/arch/x86/kernel/i387.c
+++ b/arch/x86/kernel/i387.c
@@ -59,18 +59,18 @@ void __cpuinit mxcsr_feature_mask_init(void)
stts();
}
-void __cpuinit init_thread_xstate(void)
+static void __cpuinit init_thread_xstate(void)
{
+ /*
+ * Note that xstate_size might be overwriten later during
+ * xsave_init().
+ */
+
if (!HAVE_HWFP) {
xstate_size = sizeof(struct i387_soft_struct);
return;
}
- if (cpu_has_xsave) {
- xsave_cntxt_init();
- return;
- }
-
if (cpu_has_fxsr)
xstate_size = sizeof(struct i387_fxsave_struct);
#ifdef CONFIG_X86_32
@@ -84,6 +84,7 @@ void __cpuinit init_thread_xstate(void)
* Called at bootup to set up the initial FPU state that is later cloned
* into all processes.
*/
+
void __cpuinit fpu_init(void)
{
unsigned long oldcr0 = read_cr0();
@@ -93,74 +94,77 @@ void __cpuinit fpu_init(void)
write_cr0(oldcr0 & ~(X86_CR0_TS|X86_CR0_EM)); /* clear TS and EM */
- /*
- * Boot processor to setup the FP and extended state context info.
- */
if (!smp_processor_id())
init_thread_xstate();
- xsave_init();
mxcsr_feature_mask_init();
/* clean state in init */
- if (cpu_has_xsave)
- current_thread_info()->status = TS_XSAVE;
- else
- current_thread_info()->status = 0;
+ current_thread_info()->status = 0;
clear_used_math();
}
-#endif /* CONFIG_X86_64 */
-/*
- * The _current_ task is using the FPU for the first time
- * so initialize it and set the mxcsr to its default
- * value at reset if we support XMM instructions and then
- * remeber the current task has used the FPU.
- */
-int init_fpu(struct task_struct *tsk)
+#else /* CONFIG_X86_64 */
+
+void __cpuinit fpu_init(void)
{
- if (tsk_used_math(tsk)) {
- if (HAVE_HWFP && tsk == current)
- unlazy_fpu(tsk);
- return 0;
- }
+ if (!smp_processor_id())
+ init_thread_xstate();
+}
- /*
- * Memory allocation at the first usage of the FPU and other state.
- */
- if (!tsk->thread.xstate) {
- tsk->thread.xstate = kmem_cache_alloc(task_xstate_cachep,
- GFP_KERNEL);
- if (!tsk->thread.xstate)
- return -ENOMEM;
- }
+#endif /* CONFIG_X86_32 */
+void fpu_finit(struct fpu *fpu)
+{
#ifdef CONFIG_X86_32
if (!HAVE_HWFP) {
- memset(tsk->thread.xstate, 0, xstate_size);
- finit_task(tsk);
- set_stopped_child_used_math(tsk);
- return 0;
+ finit_soft_fpu(&fpu->state->soft);
+ return;
}
#endif
if (cpu_has_fxsr) {
- struct i387_fxsave_struct *fx = &tsk->thread.xstate->fxsave;
+ struct i387_fxsave_struct *fx = &fpu->state->fxsave;
memset(fx, 0, xstate_size);
fx->cwd = 0x37f;
if (cpu_has_xmm)
fx->mxcsr = MXCSR_DEFAULT;
} else {
- struct i387_fsave_struct *fp = &tsk->thread.xstate->fsave;
+ struct i387_fsave_struct *fp = &fpu->state->fsave;
memset(fp, 0, xstate_size);
fp->cwd = 0xffff037fu;
fp->swd = 0xffff0000u;
fp->twd = 0xffffffffu;
fp->fos = 0xffff0000u;
}
+}
+EXPORT_SYMBOL_GPL(fpu_finit);
+
+/*
+ * The _current_ task is using the FPU for the first time
+ * so initialize it and set the mxcsr to its default
+ * value at reset if we support XMM instructions and then
+ * remeber the current task has used the FPU.
+ */
+int init_fpu(struct task_struct *tsk)
+{
+ int ret;
+
+ if (tsk_used_math(tsk)) {
+ if (HAVE_HWFP && tsk == current)
+ unlazy_fpu(tsk);
+ return 0;
+ }
+
/*
- * Only the device not available exception or ptrace can call init_fpu.
+ * Memory allocation at the first usage of the FPU and other state.
*/
+ ret = fpu_alloc(&tsk->thread.fpu);
+ if (ret)
+ return ret;
+
+ fpu_finit(&tsk->thread.fpu);
+
set_stopped_child_used_math(tsk);
return 0;
}
@@ -193,8 +197,10 @@ int xfpregs_get(struct task_struct *target, const struct user_regset *regset,
if (ret)
return ret;
+ sanitize_i387_state(target);
+
return user_regset_copyout(&pos, &count, &kbuf, &ubuf,
- &target->thread.xstate->fxsave, 0, -1);
+ &target->thread.fpu.state->fxsave, 0, -1);
}
int xfpregs_set(struct task_struct *target, const struct user_regset *regset,
@@ -210,20 +216,22 @@ int xfpregs_set(struct task_struct *target, const struct user_regset *regset,
if (ret)
return ret;
+ sanitize_i387_state(target);
+
ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf,
- &target->thread.xstate->fxsave, 0, -1);
+ &target->thread.fpu.state->fxsave, 0, -1);
/*
* mxcsr reserved bits must be masked to zero for security reasons.
*/
- target->thread.xstate->fxsave.mxcsr &= mxcsr_feature_mask;
+ target->thread.fpu.state->fxsave.mxcsr &= mxcsr_feature_mask;
/*
* update the header bits in the xsave header, indicating the
* presence of FP and SSE state.
*/
if (cpu_has_xsave)
- target->thread.xstate->xsave.xsave_hdr.xstate_bv |= XSTATE_FPSSE;
+ target->thread.fpu.state->xsave.xsave_hdr.xstate_bv |= XSTATE_FPSSE;
return ret;
}
@@ -246,14 +254,14 @@ int xstateregs_get(struct task_struct *target, const struct user_regset *regset,
* memory layout in the thread struct, so that we can copy the entire
* xstateregs to the user using one user_regset_copyout().
*/
- memcpy(&target->thread.xstate->fxsave.sw_reserved,
+ memcpy(&target->thread.fpu.state->fxsave.sw_reserved,
xstate_fx_sw_bytes, sizeof(xstate_fx_sw_bytes));
/*
* Copy the xstate memory layout.
*/
ret = user_regset_copyout(&pos, &count, &kbuf, &ubuf,
- &target->thread.xstate->xsave, 0, -1);
+ &target->thread.fpu.state->xsave, 0, -1);
return ret;
}
@@ -272,14 +280,14 @@ int xstateregs_set(struct task_struct *target, const struct user_regset *regset,
return ret;
ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf,
- &target->thread.xstate->xsave, 0, -1);
+ &target->thread.fpu.state->xsave, 0, -1);
/*
* mxcsr reserved bits must be masked to zero for security reasons.
*/
- target->thread.xstate->fxsave.mxcsr &= mxcsr_feature_mask;
+ target->thread.fpu.state->fxsave.mxcsr &= mxcsr_feature_mask;
- xsave_hdr = &target->thread.xstate->xsave.xsave_hdr;
+ xsave_hdr = &target->thread.fpu.state->xsave.xsave_hdr;
xsave_hdr->xstate_bv &= pcntxt_mask;
/*
@@ -365,7 +373,7 @@ static inline u32 twd_fxsr_to_i387(struct i387_fxsave_struct *fxsave)
static void
convert_from_fxsr(struct user_i387_ia32_struct *env, struct task_struct *tsk)
{
- struct i387_fxsave_struct *fxsave = &tsk->thread.xstate->fxsave;
+ struct i387_fxsave_struct *fxsave = &tsk->thread.fpu.state->fxsave;
struct _fpreg *to = (struct _fpreg *) &env->st_space[0];
struct _fpxreg *from = (struct _fpxreg *) &fxsave->st_space[0];
int i;
@@ -405,7 +413,7 @@ static void convert_to_fxsr(struct task_struct *tsk,
const struct user_i387_ia32_struct *env)
{
- struct i387_fxsave_struct *fxsave = &tsk->thread.xstate->fxsave;
+ struct i387_fxsave_struct *fxsave = &tsk->thread.fpu.state->fxsave;
struct _fpreg *from = (struct _fpreg *) &env->st_space[0];
struct _fpxreg *to = (struct _fpxreg *) &fxsave->st_space[0];
int i;
@@ -445,10 +453,12 @@ int fpregs_get(struct task_struct *target, const struct user_regset *regset,
if (!cpu_has_fxsr) {
return user_regset_copyout(&pos, &count, &kbuf, &ubuf,
- &target->thread.xstate->fsave, 0,
+ &target->thread.fpu.state->fsave, 0,
-1);
}
+ sanitize_i387_state(target);
+
if (kbuf && pos == 0 && count == sizeof(env)) {
convert_from_fxsr(kbuf, target);
return 0;
@@ -470,12 +480,14 @@ int fpregs_set(struct task_struct *target, const struct user_regset *regset,
if (ret)
return ret;
+ sanitize_i387_state(target);
+
if (!HAVE_HWFP)
return fpregs_soft_set(target, regset, pos, count, kbuf, ubuf);
if (!cpu_has_fxsr) {
return user_regset_copyin(&pos, &count, &kbuf, &ubuf,
- &target->thread.xstate->fsave, 0, -1);
+ &target->thread.fpu.state->fsave, 0, -1);
}
if (pos > 0 || count < sizeof(env))
@@ -490,7 +502,7 @@ int fpregs_set(struct task_struct *target, const struct user_regset *regset,
* presence of FP.
*/
if (cpu_has_xsave)
- target->thread.xstate->xsave.xsave_hdr.xstate_bv |= XSTATE_FP;
+ target->thread.fpu.state->xsave.xsave_hdr.xstate_bv |= XSTATE_FP;
return ret;
}
@@ -501,7 +513,7 @@ int fpregs_set(struct task_struct *target, const struct user_regset *regset,
static inline int save_i387_fsave(struct _fpstate_ia32 __user *buf)
{
struct task_struct *tsk = current;
- struct i387_fsave_struct *fp = &tsk->thread.xstate->fsave;
+ struct i387_fsave_struct *fp = &tsk->thread.fpu.state->fsave;
fp->status = fp->swd;
if (__copy_to_user(buf, fp, sizeof(struct i387_fsave_struct)))
@@ -512,7 +524,7 @@ static inline int save_i387_fsave(struct _fpstate_ia32 __user *buf)
static int save_i387_fxsave(struct _fpstate_ia32 __user *buf)
{
struct task_struct *tsk = current;
- struct i387_fxsave_struct *fx = &tsk->thread.xstate->fxsave;
+ struct i387_fxsave_struct *fx = &tsk->thread.fpu.state->fxsave;
struct user_i387_ia32_struct env;
int err = 0;
@@ -536,6 +548,9 @@ static int save_i387_xsave(void __user *buf)
struct _fpstate_ia32 __user *fx = buf;
int err = 0;
+
+ sanitize_i387_state(tsk);
+
/*
* For legacy compatible, we always set FP/SSE bits in the bit
* vector while saving the state to the user context.
@@ -547,7 +562,7 @@ static int save_i387_xsave(void __user *buf)
* header as well as change any contents in the memory layout.
* xrestore as part of sigreturn will capture all the changes.
*/
- tsk->thread.xstate->xsave.xsave_hdr.xstate_bv |= XSTATE_FPSSE;
+ tsk->thread.fpu.state->xsave.xsave_hdr.xstate_bv |= XSTATE_FPSSE;
if (save_i387_fxsave(fx) < 0)
return -1;
@@ -599,7 +614,7 @@ static inline int restore_i387_fsave(struct _fpstate_ia32 __user *buf)
{
struct task_struct *tsk = current;
- return __copy_from_user(&tsk->thread.xstate->fsave, buf,
+ return __copy_from_user(&tsk->thread.fpu.state->fsave, buf,
sizeof(struct i387_fsave_struct));
}
@@ -610,10 +625,10 @@ static int restore_i387_fxsave(struct _fpstate_ia32 __user *buf,
struct user_i387_ia32_struct env;
int err;
- err = __copy_from_user(&tsk->thread.xstate->fxsave, &buf->_fxsr_env[0],
+ err = __copy_from_user(&tsk->thread.fpu.state->fxsave, &buf->_fxsr_env[0],
size);
/* mxcsr reserved bits must be masked to zero for security reasons */
- tsk->thread.xstate->fxsave.mxcsr &= mxcsr_feature_mask;
+ tsk->thread.fpu.state->fxsave.mxcsr &= mxcsr_feature_mask;
if (err || __copy_from_user(&env, buf, sizeof(env)))
return 1;
convert_to_fxsr(tsk, &env);
@@ -629,7 +644,7 @@ static int restore_i387_xsave(void __user *buf)
struct i387_fxsave_struct __user *fx =
(struct i387_fxsave_struct __user *) &fx_user->_fxsr_env[0];
struct xsave_hdr_struct *xsave_hdr =
- &current->thread.xstate->xsave.xsave_hdr;
+ &current->thread.fpu.state->xsave.xsave_hdr;
u64 mask;
int err;
diff --git a/arch/x86/kernel/i8253.c b/arch/x86/kernel/i8253.c
index 23c167925a5c..2dfd31597443 100644
--- a/arch/x86/kernel/i8253.c
+++ b/arch/x86/kernel/i8253.c
@@ -16,7 +16,7 @@
#include <asm/hpet.h>
#include <asm/smp.h>
-DEFINE_SPINLOCK(i8253_lock);
+DEFINE_RAW_SPINLOCK(i8253_lock);
EXPORT_SYMBOL(i8253_lock);
/*
@@ -33,7 +33,7 @@ struct clock_event_device *global_clock_event;
static void init_pit_timer(enum clock_event_mode mode,
struct clock_event_device *evt)
{
- spin_lock(&i8253_lock);
+ raw_spin_lock(&i8253_lock);
switch (mode) {
case CLOCK_EVT_MODE_PERIODIC:
@@ -62,7 +62,7 @@ static void init_pit_timer(enum clock_event_mode mode,
/* Nothing to do here */
break;
}
- spin_unlock(&i8253_lock);
+ raw_spin_unlock(&i8253_lock);
}
/*
@@ -72,10 +72,10 @@ static void init_pit_timer(enum clock_event_mode mode,
*/
static int pit_next_event(unsigned long delta, struct clock_event_device *evt)
{
- spin_lock(&i8253_lock);
+ raw_spin_lock(&i8253_lock);
outb_pit(delta & 0xff , PIT_CH0); /* LSB */
outb_pit(delta >> 8 , PIT_CH0); /* MSB */
- spin_unlock(&i8253_lock);
+ raw_spin_unlock(&i8253_lock);
return 0;
}
@@ -130,7 +130,7 @@ static cycle_t pit_read(struct clocksource *cs)
int count;
u32 jifs;
- spin_lock_irqsave(&i8253_lock, flags);
+ raw_spin_lock_irqsave(&i8253_lock, flags);
/*
* Although our caller may have the read side of xtime_lock,
* this is now a seqlock, and we are cheating in this routine
@@ -176,7 +176,7 @@ static cycle_t pit_read(struct clocksource *cs)
old_count = count;
old_jifs = jifs;
- spin_unlock_irqrestore(&i8253_lock, flags);
+ raw_spin_unlock_irqrestore(&i8253_lock, flags);
count = (LATCH - 1) - count;
diff --git a/arch/x86/kernel/i8259.c b/arch/x86/kernel/i8259.c
index 7c9f02c130f3..cafa7c80ac95 100644
--- a/arch/x86/kernel/i8259.c
+++ b/arch/x86/kernel/i8259.c
@@ -276,16 +276,6 @@ static struct sys_device device_i8259A = {
.cls = &i8259_sysdev_class,
};
-static int __init i8259A_init_sysfs(void)
-{
- int error = sysdev_class_register(&i8259_sysdev_class);
- if (!error)
- error = sysdev_register(&device_i8259A);
- return error;
-}
-
-device_initcall(i8259A_init_sysfs);
-
static void mask_8259A(void)
{
unsigned long flags;
@@ -407,3 +397,18 @@ struct legacy_pic default_legacy_pic = {
};
struct legacy_pic *legacy_pic = &default_legacy_pic;
+
+static int __init i8259A_init_sysfs(void)
+{
+ int error;
+
+ if (legacy_pic != &default_legacy_pic)
+ return 0;
+
+ error = sysdev_class_register(&i8259_sysdev_class);
+ if (!error)
+ error = sysdev_register(&device_i8259A);
+ return error;
+}
+
+device_initcall(i8259A_init_sysfs);
diff --git a/arch/x86/kernel/init_task.c b/arch/x86/kernel/init_task.c
index 3a54dcb9cd0e..43e9ccf44947 100644
--- a/arch/x86/kernel/init_task.c
+++ b/arch/x86/kernel/init_task.c
@@ -34,7 +34,7 @@ EXPORT_SYMBOL(init_task);
/*
* per-CPU TSS segments. Threads are completely 'soft' on Linux,
* no more per-task TSS's. The TSS size is kept cacheline-aligned
- * so they are allowed to end up in the .data.cacheline_aligned
+ * so they are allowed to end up in the .data..cacheline_aligned
* section. Since TSS's are completely CPU-local, we want them
* on exact cacheline boundaries, to eliminate cacheline ping-pong.
*/
diff --git a/arch/x86/kernel/irqinit.c b/arch/x86/kernel/irqinit.c
index 0ed2d300cd46..990ae7cfc578 100644
--- a/arch/x86/kernel/irqinit.c
+++ b/arch/x86/kernel/irqinit.c
@@ -60,7 +60,7 @@ static irqreturn_t math_error_irq(int cpl, void *dev_id)
outb(0, 0xF0);
if (ignore_fpu_irq || !boot_cpu_data.hard_math)
return IRQ_NONE;
- math_error((void __user *)get_irq_regs()->ip);
+ math_error(get_irq_regs(), 0, 16);
return IRQ_HANDLED;
}
diff --git a/arch/x86/kernel/kgdb.c b/arch/x86/kernel/kgdb.c
index b2258ca91003..ef10940e1af0 100644
--- a/arch/x86/kernel/kgdb.c
+++ b/arch/x86/kernel/kgdb.c
@@ -47,69 +47,96 @@
#include <asm/debugreg.h>
#include <asm/apicdef.h>
#include <asm/system.h>
-
#include <asm/apic.h>
-/*
- * Put the error code here just in case the user cares:
- */
-static int gdb_x86errcode;
-
-/*
- * Likewise, the vector number here (since GDB only gets the signal
- * number through the usual means, and that's not very specific):
- */
-static int gdb_x86vector = -1;
-
-/**
- * pt_regs_to_gdb_regs - Convert ptrace regs to GDB regs
- * @gdb_regs: A pointer to hold the registers in the order GDB wants.
- * @regs: The &struct pt_regs of the current process.
- *
- * Convert the pt_regs in @regs into the format for registers that
- * GDB expects, stored in @gdb_regs.
- */
-void pt_regs_to_gdb_regs(unsigned long *gdb_regs, struct pt_regs *regs)
+struct dbg_reg_def_t dbg_reg_def[DBG_MAX_REG_NUM] =
{
-#ifndef CONFIG_X86_32
- u32 *gdb_regs32 = (u32 *)gdb_regs;
+#ifdef CONFIG_X86_32
+ { "ax", 4, offsetof(struct pt_regs, ax) },
+ { "cx", 4, offsetof(struct pt_regs, cx) },
+ { "dx", 4, offsetof(struct pt_regs, dx) },
+ { "bx", 4, offsetof(struct pt_regs, bx) },
+ { "sp", 4, offsetof(struct pt_regs, sp) },
+ { "bp", 4, offsetof(struct pt_regs, bp) },
+ { "si", 4, offsetof(struct pt_regs, si) },
+ { "di", 4, offsetof(struct pt_regs, di) },
+ { "ip", 4, offsetof(struct pt_regs, ip) },
+ { "flags", 4, offsetof(struct pt_regs, flags) },
+ { "cs", 4, offsetof(struct pt_regs, cs) },
+ { "ss", 4, offsetof(struct pt_regs, ss) },
+ { "ds", 4, offsetof(struct pt_regs, ds) },
+ { "es", 4, offsetof(struct pt_regs, es) },
+ { "fs", 4, -1 },
+ { "gs", 4, -1 },
+#else
+ { "ax", 8, offsetof(struct pt_regs, ax) },
+ { "bx", 8, offsetof(struct pt_regs, bx) },
+ { "cx", 8, offsetof(struct pt_regs, cx) },
+ { "dx", 8, offsetof(struct pt_regs, dx) },
+ { "si", 8, offsetof(struct pt_regs, dx) },
+ { "di", 8, offsetof(struct pt_regs, di) },
+ { "bp", 8, offsetof(struct pt_regs, bp) },
+ { "sp", 8, offsetof(struct pt_regs, sp) },
+ { "r8", 8, offsetof(struct pt_regs, r8) },
+ { "r9", 8, offsetof(struct pt_regs, r9) },
+ { "r10", 8, offsetof(struct pt_regs, r10) },
+ { "r11", 8, offsetof(struct pt_regs, r11) },
+ { "r12", 8, offsetof(struct pt_regs, r12) },
+ { "r13", 8, offsetof(struct pt_regs, r13) },
+ { "r14", 8, offsetof(struct pt_regs, r14) },
+ { "r15", 8, offsetof(struct pt_regs, r15) },
+ { "ip", 8, offsetof(struct pt_regs, ip) },
+ { "flags", 4, offsetof(struct pt_regs, flags) },
+ { "cs", 4, offsetof(struct pt_regs, cs) },
+ { "ss", 4, offsetof(struct pt_regs, ss) },
#endif
- gdb_regs[GDB_AX] = regs->ax;
- gdb_regs[GDB_BX] = regs->bx;
- gdb_regs[GDB_CX] = regs->cx;
- gdb_regs[GDB_DX] = regs->dx;
- gdb_regs[GDB_SI] = regs->si;
- gdb_regs[GDB_DI] = regs->di;
- gdb_regs[GDB_BP] = regs->bp;
- gdb_regs[GDB_PC] = regs->ip;
+};
+
+int dbg_set_reg(int regno, void *mem, struct pt_regs *regs)
+{
+ if (
#ifdef CONFIG_X86_32
- gdb_regs[GDB_PS] = regs->flags;
- gdb_regs[GDB_DS] = regs->ds;
- gdb_regs[GDB_ES] = regs->es;
- gdb_regs[GDB_CS] = regs->cs;
- gdb_regs[GDB_FS] = 0xFFFF;
- gdb_regs[GDB_GS] = 0xFFFF;
- if (user_mode_vm(regs)) {
- gdb_regs[GDB_SS] = regs->ss;
- gdb_regs[GDB_SP] = regs->sp;
- } else {
- gdb_regs[GDB_SS] = __KERNEL_DS;
- gdb_regs[GDB_SP] = kernel_stack_pointer(regs);
+ regno == GDB_SS || regno == GDB_FS || regno == GDB_GS ||
+#endif
+ regno == GDB_SP || regno == GDB_ORIG_AX)
+ return 0;
+
+ if (dbg_reg_def[regno].offset != -1)
+ memcpy((void *)regs + dbg_reg_def[regno].offset, mem,
+ dbg_reg_def[regno].size);
+ return 0;
+}
+
+char *dbg_get_reg(int regno, void *mem, struct pt_regs *regs)
+{
+ if (regno == GDB_ORIG_AX) {
+ memcpy(mem, &regs->orig_ax, sizeof(regs->orig_ax));
+ return "orig_ax";
}
-#else
- gdb_regs[GDB_R8] = regs->r8;
- gdb_regs[GDB_R9] = regs->r9;
- gdb_regs[GDB_R10] = regs->r10;
- gdb_regs[GDB_R11] = regs->r11;
- gdb_regs[GDB_R12] = regs->r12;
- gdb_regs[GDB_R13] = regs->r13;
- gdb_regs[GDB_R14] = regs->r14;
- gdb_regs[GDB_R15] = regs->r15;
- gdb_regs32[GDB_PS] = regs->flags;
- gdb_regs32[GDB_CS] = regs->cs;
- gdb_regs32[GDB_SS] = regs->ss;
- gdb_regs[GDB_SP] = kernel_stack_pointer(regs);
+ if (regno >= DBG_MAX_REG_NUM || regno < 0)
+ return NULL;
+
+ if (dbg_reg_def[regno].offset != -1)
+ memcpy(mem, (void *)regs + dbg_reg_def[regno].offset,
+ dbg_reg_def[regno].size);
+
+ switch (regno) {
+#ifdef CONFIG_X86_32
+ case GDB_SS:
+ if (!user_mode_vm(regs))
+ *(unsigned long *)mem = __KERNEL_DS;
+ break;
+ case GDB_SP:
+ if (!user_mode_vm(regs))
+ *(unsigned long *)mem = kernel_stack_pointer(regs);
+ break;
+ case GDB_GS:
+ case GDB_FS:
+ *(unsigned long *)mem = 0xFFFF;
+ break;
#endif
+ }
+ return dbg_reg_def[regno].name;
}
/**
@@ -162,66 +189,35 @@ void sleeping_thread_to_gdb_regs(unsigned long *gdb_regs, struct task_struct *p)
gdb_regs[GDB_SP] = p->thread.sp;
}
-/**
- * gdb_regs_to_pt_regs - Convert GDB regs to ptrace regs.
- * @gdb_regs: A pointer to hold the registers we've received from GDB.
- * @regs: A pointer to a &struct pt_regs to hold these values in.
- *
- * Convert the GDB regs in @gdb_regs into the pt_regs, and store them
- * in @regs.
- */
-void gdb_regs_to_pt_regs(unsigned long *gdb_regs, struct pt_regs *regs)
-{
-#ifndef CONFIG_X86_32
- u32 *gdb_regs32 = (u32 *)gdb_regs;
-#endif
- regs->ax = gdb_regs[GDB_AX];
- regs->bx = gdb_regs[GDB_BX];
- regs->cx = gdb_regs[GDB_CX];
- regs->dx = gdb_regs[GDB_DX];
- regs->si = gdb_regs[GDB_SI];
- regs->di = gdb_regs[GDB_DI];
- regs->bp = gdb_regs[GDB_BP];
- regs->ip = gdb_regs[GDB_PC];
-#ifdef CONFIG_X86_32
- regs->flags = gdb_regs[GDB_PS];
- regs->ds = gdb_regs[GDB_DS];
- regs->es = gdb_regs[GDB_ES];
- regs->cs = gdb_regs[GDB_CS];
-#else
- regs->r8 = gdb_regs[GDB_R8];
- regs->r9 = gdb_regs[GDB_R9];
- regs->r10 = gdb_regs[GDB_R10];
- regs->r11 = gdb_regs[GDB_R11];
- regs->r12 = gdb_regs[GDB_R12];
- regs->r13 = gdb_regs[GDB_R13];
- regs->r14 = gdb_regs[GDB_R14];
- regs->r15 = gdb_regs[GDB_R15];
- regs->flags = gdb_regs32[GDB_PS];
- regs->cs = gdb_regs32[GDB_CS];
- regs->ss = gdb_regs32[GDB_SS];
-#endif
-}
-
static struct hw_breakpoint {
unsigned enabled;
unsigned long addr;
int len;
int type;
struct perf_event **pev;
-} breakinfo[4];
+} breakinfo[HBP_NUM];
+
+static unsigned long early_dr7;
static void kgdb_correct_hw_break(void)
{
int breakno;
- for (breakno = 0; breakno < 4; breakno++) {
+ for (breakno = 0; breakno < HBP_NUM; breakno++) {
struct perf_event *bp;
struct arch_hw_breakpoint *info;
int val;
int cpu = raw_smp_processor_id();
if (!breakinfo[breakno].enabled)
continue;
+ if (dbg_is_early) {
+ set_debugreg(breakinfo[breakno].addr, breakno);
+ early_dr7 |= encode_dr7(breakno,
+ breakinfo[breakno].len,
+ breakinfo[breakno].type);
+ set_debugreg(early_dr7, 7);
+ continue;
+ }
bp = *per_cpu_ptr(breakinfo[breakno].pev, cpu);
info = counter_arch_bp(bp);
if (bp->attr.disabled != 1)
@@ -236,7 +232,8 @@ static void kgdb_correct_hw_break(void)
if (!val)
bp->attr.disabled = 0;
}
- hw_breakpoint_restore();
+ if (!dbg_is_early)
+ hw_breakpoint_restore();
}
static int hw_break_reserve_slot(int breakno)
@@ -245,6 +242,9 @@ static int hw_break_reserve_slot(int breakno)
int cnt = 0;
struct perf_event **pevent;
+ if (dbg_is_early)
+ return 0;
+
for_each_online_cpu(cpu) {
cnt++;
pevent = per_cpu_ptr(breakinfo[breakno].pev, cpu);
@@ -270,6 +270,9 @@ static int hw_break_release_slot(int breakno)
struct perf_event **pevent;
int cpu;
+ if (dbg_is_early)
+ return 0;
+
for_each_online_cpu(cpu) {
pevent = per_cpu_ptr(breakinfo[breakno].pev, cpu);
if (dbg_release_bp_slot(*pevent))
@@ -287,10 +290,10 @@ kgdb_remove_hw_break(unsigned long addr, int len, enum kgdb_bptype bptype)
{
int i;
- for (i = 0; i < 4; i++)
+ for (i = 0; i < HBP_NUM; i++)
if (breakinfo[i].addr == addr && breakinfo[i].enabled)
break;
- if (i == 4)
+ if (i == HBP_NUM)
return -1;
if (hw_break_release_slot(i)) {
@@ -308,13 +311,17 @@ static void kgdb_remove_all_hw_break(void)
int cpu = raw_smp_processor_id();
struct perf_event *bp;
- for (i = 0; i < 4; i++) {
+ for (i = 0; i < HBP_NUM; i++) {
if (!breakinfo[i].enabled)
continue;
bp = *per_cpu_ptr(breakinfo[i].pev, cpu);
if (bp->attr.disabled == 1)
continue;
- arch_uninstall_hw_breakpoint(bp);
+ if (dbg_is_early)
+ early_dr7 &= ~encode_dr7(i, breakinfo[i].len,
+ breakinfo[i].type);
+ else
+ arch_uninstall_hw_breakpoint(bp);
bp->attr.disabled = 1;
}
}
@@ -324,10 +331,10 @@ kgdb_set_hw_break(unsigned long addr, int len, enum kgdb_bptype bptype)
{
int i;
- for (i = 0; i < 4; i++)
+ for (i = 0; i < HBP_NUM; i++)
if (!breakinfo[i].enabled)
break;
- if (i == 4)
+ if (i == HBP_NUM)
return -1;
switch (bptype) {
@@ -388,9 +395,14 @@ void kgdb_disable_hw_debug(struct pt_regs *regs)
/* Disable hardware debugging while we are in kgdb: */
set_debugreg(0UL, 7);
- for (i = 0; i < 4; i++) {
+ for (i = 0; i < HBP_NUM; i++) {
if (!breakinfo[i].enabled)
continue;
+ if (dbg_is_early) {
+ early_dr7 &= ~encode_dr7(i, breakinfo[i].len,
+ breakinfo[i].type);
+ continue;
+ }
bp = *per_cpu_ptr(breakinfo[i].pev, cpu);
if (bp->attr.disabled == 1)
continue;
@@ -399,23 +411,6 @@ void kgdb_disable_hw_debug(struct pt_regs *regs)
}
}
-/**
- * kgdb_post_primary_code - Save error vector/code numbers.
- * @regs: Original pt_regs.
- * @e_vector: Original error vector.
- * @err_code: Original error code.
- *
- * This is needed on architectures which support SMP and KGDB.
- * This function is called after all the slave cpus have been put
- * to a know spin state and the primary CPU has control over KGDB.
- */
-void kgdb_post_primary_code(struct pt_regs *regs, int e_vector, int err_code)
-{
- /* primary processor is completely in the debugger */
- gdb_x86vector = e_vector;
- gdb_x86errcode = err_code;
-}
-
#ifdef CONFIG_SMP
/**
* kgdb_roundup_cpus - Get other CPUs into a holding pattern
@@ -461,7 +456,6 @@ int kgdb_arch_handle_exception(int e_vector, int signo, int err_code,
{
unsigned long addr;
char *ptr;
- int newPC;
switch (remcomInBuffer[0]) {
case 'c':
@@ -472,8 +466,6 @@ int kgdb_arch_handle_exception(int e_vector, int signo, int err_code,
linux_regs->ip = addr;
case 'D':
case 'k':
- newPC = linux_regs->ip;
-
/* clear the trace bit */
linux_regs->flags &= ~X86_EFLAGS_TF;
atomic_set(&kgdb_cpu_doing_single_step, -1);
@@ -567,7 +559,7 @@ static int __kgdb_notify(struct die_args *args, unsigned long cmd)
return NOTIFY_DONE;
}
- if (kgdb_handle_exception(args->trapnr, args->signr, args->err, regs))
+ if (kgdb_handle_exception(args->trapnr, args->signr, cmd, regs))
return NOTIFY_DONE;
/* Must touch watchdog before return to normal operation */
@@ -575,6 +567,24 @@ static int __kgdb_notify(struct die_args *args, unsigned long cmd)
return NOTIFY_STOP;
}
+int kgdb_ll_trap(int cmd, const char *str,
+ struct pt_regs *regs, long err, int trap, int sig)
+{
+ struct die_args args = {
+ .regs = regs,
+ .str = str,
+ .err = err,
+ .trapnr = trap,
+ .signr = sig,
+
+ };
+
+ if (!kgdb_io_module_registered)
+ return NOTIFY_DONE;
+
+ return __kgdb_notify(&args, cmd);
+}
+
static int
kgdb_notify(struct notifier_block *self, unsigned long cmd, void *ptr)
{
@@ -605,14 +615,21 @@ static struct notifier_block kgdb_notifier = {
*/
int kgdb_arch_init(void)
{
+ return register_die_notifier(&kgdb_notifier);
+}
+
+static void kgdb_hw_overflow_handler(struct perf_event *event, int nmi,
+ struct perf_sample_data *data, struct pt_regs *regs)
+{
+ kgdb_ll_trap(DIE_DEBUG, "debug", regs, 0, 0, SIGTRAP);
+}
+
+void kgdb_arch_late(void)
+{
int i, cpu;
- int ret;
struct perf_event_attr attr;
struct perf_event **pevent;
- ret = register_die_notifier(&kgdb_notifier);
- if (ret != 0)
- return ret;
/*
* Pre-allocate the hw breakpoint structions in the non-atomic
* portion of kgdb because this operation requires mutexs to
@@ -623,24 +640,27 @@ int kgdb_arch_init(void)
attr.bp_len = HW_BREAKPOINT_LEN_1;
attr.bp_type = HW_BREAKPOINT_W;
attr.disabled = 1;
- for (i = 0; i < 4; i++) {
+ for (i = 0; i < HBP_NUM; i++) {
+ if (breakinfo[i].pev)
+ continue;
breakinfo[i].pev = register_wide_hw_breakpoint(&attr, NULL);
if (IS_ERR(breakinfo[i].pev)) {
- printk(KERN_ERR "kgdb: Could not allocate hw breakpoints\n");
+ printk(KERN_ERR "kgdb: Could not allocate hw"
+ "breakpoints\nDisabling the kernel debugger\n");
breakinfo[i].pev = NULL;
kgdb_arch_exit();
- return -1;
+ return;
}
for_each_online_cpu(cpu) {
pevent = per_cpu_ptr(breakinfo[i].pev, cpu);
pevent[0]->hw.sample_period = 1;
+ pevent[0]->overflow_handler = kgdb_hw_overflow_handler;
if (pevent[0]->destroy != NULL) {
pevent[0]->destroy = NULL;
release_bp_slot(*pevent);
}
}
}
- return ret;
}
/**
@@ -690,6 +710,11 @@ unsigned long kgdb_arch_pc(int exception, struct pt_regs *regs)
return instruction_pointer(regs);
}
+void kgdb_arch_set_pc(struct pt_regs *regs, unsigned long ip)
+{
+ regs->ip = ip;
+}
+
struct kgdb_arch arch_kgdb_ops = {
/* Breakpoint instruction: */
.gdb_bpt_instr = { 0xcc },
diff --git a/arch/x86/kernel/kprobes.c b/arch/x86/kernel/kprobes.c
index 1658efdfb4e5..1bfb6cf4dd55 100644
--- a/arch/x86/kernel/kprobes.c
+++ b/arch/x86/kernel/kprobes.c
@@ -126,16 +126,22 @@ static void __kprobes synthesize_reljump(void *from, void *to)
}
/*
- * Check for the REX prefix which can only exist on X86_64
- * X86_32 always returns 0
+ * Skip the prefixes of the instruction.
*/
-static int __kprobes is_REX_prefix(kprobe_opcode_t *insn)
+static kprobe_opcode_t *__kprobes skip_prefixes(kprobe_opcode_t *insn)
{
+ insn_attr_t attr;
+
+ attr = inat_get_opcode_attribute((insn_byte_t)*insn);
+ while (inat_is_legacy_prefix(attr)) {
+ insn++;
+ attr = inat_get_opcode_attribute((insn_byte_t)*insn);
+ }
#ifdef CONFIG_X86_64
- if ((*insn & 0xf0) == 0x40)
- return 1;
+ if (inat_is_rex_prefix(attr))
+ insn++;
#endif
- return 0;
+ return insn;
}
/*
@@ -272,6 +278,9 @@ static int __kprobes can_probe(unsigned long paddr)
*/
static int __kprobes is_IF_modifier(kprobe_opcode_t *insn)
{
+ /* Skip prefixes */
+ insn = skip_prefixes(insn);
+
switch (*insn) {
case 0xfa: /* cli */
case 0xfb: /* sti */
@@ -280,13 +289,6 @@ static int __kprobes is_IF_modifier(kprobe_opcode_t *insn)
return 1;
}
- /*
- * on X86_64, 0x40-0x4f are REX prefixes so we need to look
- * at the next byte instead.. but of course not recurse infinitely
- */
- if (is_REX_prefix(insn))
- return is_IF_modifier(++insn);
-
return 0;
}
@@ -422,14 +424,22 @@ static void __kprobes set_current_kprobe(struct kprobe *p, struct pt_regs *regs,
static void __kprobes clear_btf(void)
{
- if (test_thread_flag(TIF_DEBUGCTLMSR))
- update_debugctlmsr(0);
+ if (test_thread_flag(TIF_BLOCKSTEP)) {
+ unsigned long debugctl = get_debugctlmsr();
+
+ debugctl &= ~DEBUGCTLMSR_BTF;
+ update_debugctlmsr(debugctl);
+ }
}
static void __kprobes restore_btf(void)
{
- if (test_thread_flag(TIF_DEBUGCTLMSR))
- update_debugctlmsr(current->thread.debugctlmsr);
+ if (test_thread_flag(TIF_BLOCKSTEP)) {
+ unsigned long debugctl = get_debugctlmsr();
+
+ debugctl |= DEBUGCTLMSR_BTF;
+ update_debugctlmsr(debugctl);
+ }
}
void __kprobes arch_prepare_kretprobe(struct kretprobe_instance *ri,
@@ -632,8 +642,8 @@ static int __kprobes kprobe_handler(struct pt_regs *regs)
/* Skip cs, ip, orig_ax and gs. */ \
" subl $16, %esp\n" \
" pushl %fs\n" \
- " pushl %ds\n" \
" pushl %es\n" \
+ " pushl %ds\n" \
" pushl %eax\n" \
" pushl %ebp\n" \
" pushl %edi\n" \
@@ -795,9 +805,8 @@ static void __kprobes resume_execution(struct kprobe *p,
unsigned long orig_ip = (unsigned long)p->addr;
kprobe_opcode_t *insn = p->ainsn.insn;
- /*skip the REX prefix*/
- if (is_REX_prefix(insn))
- insn++;
+ /* Skip prefixes */
+ insn = skip_prefixes(insn);
regs->flags &= ~X86_EFLAGS_TF;
switch (*insn) {
diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c
index feaeb0d3aa4f..eb9b76c716c2 100644
--- a/arch/x86/kernel/kvmclock.c
+++ b/arch/x86/kernel/kvmclock.c
@@ -29,6 +29,8 @@
#define KVM_SCALE 22
static int kvmclock = 1;
+static int msr_kvm_system_time = MSR_KVM_SYSTEM_TIME;
+static int msr_kvm_wall_clock = MSR_KVM_WALL_CLOCK;
static int parse_no_kvmclock(char *arg)
{
@@ -54,7 +56,8 @@ static unsigned long kvm_get_wallclock(void)
low = (int)__pa_symbol(&wall_clock);
high = ((u64)__pa_symbol(&wall_clock) >> 32);
- native_write_msr(MSR_KVM_WALL_CLOCK, low, high);
+
+ native_write_msr(msr_kvm_wall_clock, low, high);
vcpu_time = &get_cpu_var(hv_clock);
pvclock_read_wallclock(&wall_clock, vcpu_time, &ts);
@@ -130,7 +133,8 @@ static int kvm_register_clock(char *txt)
high = ((u64)__pa(&per_cpu(hv_clock, cpu)) >> 32);
printk(KERN_INFO "kvm-clock: cpu %d, msr %x:%x, %s\n",
cpu, high, low, txt);
- return native_write_msr_safe(MSR_KVM_SYSTEM_TIME, low, high);
+
+ return native_write_msr_safe(msr_kvm_system_time, low, high);
}
#ifdef CONFIG_X86_LOCAL_APIC
@@ -165,14 +169,14 @@ static void __init kvm_smp_prepare_boot_cpu(void)
#ifdef CONFIG_KEXEC
static void kvm_crash_shutdown(struct pt_regs *regs)
{
- native_write_msr_safe(MSR_KVM_SYSTEM_TIME, 0, 0);
+ native_write_msr(msr_kvm_system_time, 0, 0);
native_machine_crash_shutdown(regs);
}
#endif
static void kvm_shutdown(void)
{
- native_write_msr_safe(MSR_KVM_SYSTEM_TIME, 0, 0);
+ native_write_msr(msr_kvm_system_time, 0, 0);
native_machine_shutdown();
}
@@ -181,27 +185,37 @@ void __init kvmclock_init(void)
if (!kvm_para_available())
return;
- if (kvmclock && kvm_para_has_feature(KVM_FEATURE_CLOCKSOURCE)) {
- if (kvm_register_clock("boot clock"))
- return;
- pv_time_ops.sched_clock = kvm_clock_read;
- x86_platform.calibrate_tsc = kvm_get_tsc_khz;
- x86_platform.get_wallclock = kvm_get_wallclock;
- x86_platform.set_wallclock = kvm_set_wallclock;
+ if (kvmclock && kvm_para_has_feature(KVM_FEATURE_CLOCKSOURCE2)) {
+ msr_kvm_system_time = MSR_KVM_SYSTEM_TIME_NEW;
+ msr_kvm_wall_clock = MSR_KVM_WALL_CLOCK_NEW;
+ } else if (!(kvmclock && kvm_para_has_feature(KVM_FEATURE_CLOCKSOURCE)))
+ return;
+
+ printk(KERN_INFO "kvm-clock: Using msrs %x and %x",
+ msr_kvm_system_time, msr_kvm_wall_clock);
+
+ if (kvm_register_clock("boot clock"))
+ return;
+ pv_time_ops.sched_clock = kvm_clock_read;
+ x86_platform.calibrate_tsc = kvm_get_tsc_khz;
+ x86_platform.get_wallclock = kvm_get_wallclock;
+ x86_platform.set_wallclock = kvm_set_wallclock;
#ifdef CONFIG_X86_LOCAL_APIC
- x86_cpuinit.setup_percpu_clockev =
- kvm_setup_secondary_clock;
+ x86_cpuinit.setup_percpu_clockev =
+ kvm_setup_secondary_clock;
#endif
#ifdef CONFIG_SMP
- smp_ops.smp_prepare_boot_cpu = kvm_smp_prepare_boot_cpu;
+ smp_ops.smp_prepare_boot_cpu = kvm_smp_prepare_boot_cpu;
#endif
- machine_ops.shutdown = kvm_shutdown;
+ machine_ops.shutdown = kvm_shutdown;
#ifdef CONFIG_KEXEC
- machine_ops.crash_shutdown = kvm_crash_shutdown;
+ machine_ops.crash_shutdown = kvm_crash_shutdown;
#endif
- kvm_get_preset_lpj();
- clocksource_register(&kvm_clock);
- pv_info.paravirt_enabled = 1;
- pv_info.name = "KVM";
- }
+ kvm_get_preset_lpj();
+ clocksource_register(&kvm_clock);
+ pv_info.paravirt_enabled = 1;
+ pv_info.name = "KVM";
+
+ if (kvm_para_has_feature(KVM_FEATURE_CLOCKSOURCE_STABLE_BIT))
+ pvclock_set_flags(PVCLOCK_TSC_STABLE_BIT);
}
diff --git a/arch/x86/kernel/microcode_core.c b/arch/x86/kernel/microcode_core.c
index cceb5bc3c3c2..fa6551d36c10 100644
--- a/arch/x86/kernel/microcode_core.c
+++ b/arch/x86/kernel/microcode_core.c
@@ -201,9 +201,9 @@ static int do_microcode_update(const void __user *buf, size_t size)
return error;
}
-static int microcode_open(struct inode *unused1, struct file *unused2)
+static int microcode_open(struct inode *inode, struct file *file)
{
- return capable(CAP_SYS_RAWIO) ? 0 : -EPERM;
+ return capable(CAP_SYS_RAWIO) ? nonseekable_open(inode, file) : -EPERM;
}
static ssize_t microcode_write(struct file *file, const char __user *buf,
@@ -260,6 +260,7 @@ static void microcode_dev_exit(void)
}
MODULE_ALIAS_MISCDEV(MICROCODE_MINOR);
+MODULE_ALIAS("devname:cpu/microcode");
#else
#define microcode_dev_init() 0
#define microcode_dev_exit() do { } while (0)
diff --git a/arch/x86/kernel/microcode_intel.c b/arch/x86/kernel/microcode_intel.c
index 85a343e28937..356170262a93 100644
--- a/arch/x86/kernel/microcode_intel.c
+++ b/arch/x86/kernel/microcode_intel.c
@@ -343,10 +343,11 @@ static enum ucode_state generic_load_microcode(int cpu, void *data, size_t size,
int (*get_ucode_data)(void *, const void *, size_t))
{
struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
- u8 *ucode_ptr = data, *new_mc = NULL, *mc;
+ u8 *ucode_ptr = data, *new_mc = NULL, *mc = NULL;
int new_rev = uci->cpu_sig.rev;
unsigned int leftover = size;
enum ucode_state state = UCODE_OK;
+ unsigned int curr_mc_size = 0;
while (leftover) {
struct microcode_header_intel mc_header;
@@ -361,9 +362,15 @@ static enum ucode_state generic_load_microcode(int cpu, void *data, size_t size,
break;
}
- mc = vmalloc(mc_size);
- if (!mc)
- break;
+ /* For performance reasons, reuse mc area when possible */
+ if (!mc || mc_size > curr_mc_size) {
+ if (mc)
+ vfree(mc);
+ mc = vmalloc(mc_size);
+ if (!mc)
+ break;
+ curr_mc_size = mc_size;
+ }
if (get_ucode_data(mc, ucode_ptr, mc_size) ||
microcode_sanity_check(mc) < 0) {
@@ -376,13 +383,16 @@ static enum ucode_state generic_load_microcode(int cpu, void *data, size_t size,
vfree(new_mc);
new_rev = mc_header.rev;
new_mc = mc;
- } else
- vfree(mc);
+ mc = NULL; /* trigger new vmalloc */
+ }
ucode_ptr += mc_size;
leftover -= mc_size;
}
+ if (mc)
+ vfree(mc);
+
if (leftover) {
if (new_mc)
vfree(new_mc);
diff --git a/arch/x86/kernel/mpparse.c b/arch/x86/kernel/mpparse.c
index e81030f71a8f..d86dbf7e54be 100644
--- a/arch/x86/kernel/mpparse.c
+++ b/arch/x86/kernel/mpparse.c
@@ -115,21 +115,6 @@ static void __init MP_bus_info(struct mpc_bus *m)
printk(KERN_WARNING "Unknown bustype %s - ignoring\n", str);
}
-static int bad_ioapic(unsigned long address)
-{
- if (nr_ioapics >= MAX_IO_APICS) {
- printk(KERN_ERR "ERROR: Max # of I/O APICs (%d) exceeded "
- "(found %d)\n", MAX_IO_APICS, nr_ioapics);
- panic("Recompile kernel with bigger MAX_IO_APICS!\n");
- }
- if (!address) {
- printk(KERN_ERR "WARNING: Bogus (zero) I/O APIC address"
- " found in table, skipping!\n");
- return 1;
- }
- return 0;
-}
-
static void __init MP_ioapic_info(struct mpc_ioapic *m)
{
if (!(m->flags & MPC_APIC_USABLE))
@@ -138,15 +123,7 @@ static void __init MP_ioapic_info(struct mpc_ioapic *m)
printk(KERN_INFO "I/O APIC #%d Version %d at 0x%X.\n",
m->apicid, m->apicver, m->apicaddr);
- if (bad_ioapic(m->apicaddr))
- return;
-
- mp_ioapics[nr_ioapics].apicaddr = m->apicaddr;
- mp_ioapics[nr_ioapics].apicid = m->apicid;
- mp_ioapics[nr_ioapics].type = m->type;
- mp_ioapics[nr_ioapics].apicver = m->apicver;
- mp_ioapics[nr_ioapics].flags = m->flags;
- nr_ioapics++;
+ mp_register_ioapic(m->apicid, m->apicaddr, gsi_top);
}
static void print_MP_intsrc_info(struct mpc_intsrc *m)
diff --git a/arch/x86/kernel/mrst.c b/arch/x86/kernel/mrst.c
index 0aad8670858e..79ae68154e87 100644
--- a/arch/x86/kernel/mrst.c
+++ b/arch/x86/kernel/mrst.c
@@ -25,8 +25,34 @@
#include <asm/i8259.h>
#include <asm/apb_timer.h>
+/*
+ * the clockevent devices on Moorestown/Medfield can be APBT or LAPIC clock,
+ * cmdline option x86_mrst_timer can be used to override the configuration
+ * to prefer one or the other.
+ * at runtime, there are basically three timer configurations:
+ * 1. per cpu apbt clock only
+ * 2. per cpu always-on lapic clocks only, this is Penwell/Medfield only
+ * 3. per cpu lapic clock (C3STOP) and one apbt clock, with broadcast.
+ *
+ * by default (without cmdline option), platform code first detects cpu type
+ * to see if we are on lincroft or penwell, then set up both lapic or apbt
+ * clocks accordingly.
+ * i.e. by default, medfield uses configuration #2, moorestown uses #1.
+ * config #3 is supported but not recommended on medfield.
+ *
+ * rating and feature summary:
+ * lapic (with C3STOP) --------- 100
+ * apbt (always-on) ------------ 110
+ * lapic (always-on,ARAT) ------ 150
+ */
+
+__cpuinitdata enum mrst_timer_options mrst_timer_options;
+
static u32 sfi_mtimer_usage[SFI_MTMR_MAX_NUM];
static struct sfi_timer_table_entry sfi_mtimer_array[SFI_MTMR_MAX_NUM];
+enum mrst_cpu_type __mrst_cpu_chip;
+EXPORT_SYMBOL_GPL(__mrst_cpu_chip);
+
int sfi_mtimer_num;
struct sfi_rtc_table_entry sfi_mrtc_array[SFI_MRTC_MAX];
@@ -167,18 +193,6 @@ int __init sfi_parse_mrtc(struct sfi_table_header *table)
return 0;
}
-/*
- * the secondary clock in Moorestown can be APBT or LAPIC clock, default to
- * APBT but cmdline option can also override it.
- */
-static void __cpuinit mrst_setup_secondary_clock(void)
-{
- /* restore default lapic clock if disabled by cmdline */
- if (disable_apbt_percpu)
- return setup_secondary_APIC_clock();
- apbt_setup_secondary_clock();
-}
-
static unsigned long __init mrst_calibrate_tsc(void)
{
unsigned long flags, fast_calibrate;
@@ -195,6 +209,21 @@ static unsigned long __init mrst_calibrate_tsc(void)
void __init mrst_time_init(void)
{
+ switch (mrst_timer_options) {
+ case MRST_TIMER_APBT_ONLY:
+ break;
+ case MRST_TIMER_LAPIC_APBT:
+ x86_init.timers.setup_percpu_clockev = setup_boot_APIC_clock;
+ x86_cpuinit.setup_percpu_clockev = setup_secondary_APIC_clock;
+ break;
+ default:
+ if (!boot_cpu_has(X86_FEATURE_ARAT))
+ break;
+ x86_init.timers.setup_percpu_clockev = setup_boot_APIC_clock;
+ x86_cpuinit.setup_percpu_clockev = setup_secondary_APIC_clock;
+ return;
+ }
+ /* we need at least one APB timer */
sfi_table_parse(SFI_SIG_MTMR, NULL, NULL, sfi_parse_mtmr);
pre_init_apic_IRQ0();
apbt_time_init();
@@ -205,16 +234,27 @@ void __init mrst_rtc_init(void)
sfi_table_parse(SFI_SIG_MRTC, NULL, NULL, sfi_parse_mrtc);
}
-/*
- * if we use per cpu apb timer, the bootclock already setup. if we use lapic
- * timer and one apbt timer for broadcast, we need to set up lapic boot clock.
- */
-static void __init mrst_setup_boot_clock(void)
+void __cpuinit mrst_arch_setup(void)
{
- pr_info("%s: per cpu apbt flag %d \n", __func__, disable_apbt_percpu);
- if (disable_apbt_percpu)
- setup_boot_APIC_clock();
-};
+ if (boot_cpu_data.x86 == 6 && boot_cpu_data.x86_model == 0x27)
+ __mrst_cpu_chip = MRST_CPU_CHIP_PENWELL;
+ else if (boot_cpu_data.x86 == 6 && boot_cpu_data.x86_model == 0x26)
+ __mrst_cpu_chip = MRST_CPU_CHIP_LINCROFT;
+ else {
+ pr_err("Unknown Moorestown CPU (%d:%d), default to Lincroft\n",
+ boot_cpu_data.x86, boot_cpu_data.x86_model);
+ __mrst_cpu_chip = MRST_CPU_CHIP_LINCROFT;
+ }
+ pr_debug("Moorestown CPU %s identified\n",
+ (__mrst_cpu_chip == MRST_CPU_CHIP_LINCROFT) ?
+ "Lincroft" : "Penwell");
+}
+
+/* MID systems don't have i8042 controller */
+static int mrst_i8042_detect(void)
+{
+ return 0;
+}
/*
* Moorestown specific x86_init function overrides and early setup
@@ -226,15 +266,46 @@ void __init x86_mrst_early_setup(void)
x86_init.resources.reserve_resources = x86_init_noop;
x86_init.timers.timer_init = mrst_time_init;
- x86_init.timers.setup_percpu_clockev = mrst_setup_boot_clock;
+ x86_init.timers.setup_percpu_clockev = x86_init_noop;
x86_init.irqs.pre_vector_init = x86_init_noop;
- x86_cpuinit.setup_percpu_clockev = mrst_setup_secondary_clock;
+ x86_init.oem.arch_setup = mrst_arch_setup;
+
+ x86_cpuinit.setup_percpu_clockev = apbt_setup_secondary_clock;
x86_platform.calibrate_tsc = mrst_calibrate_tsc;
+ x86_platform.i8042_detect = mrst_i8042_detect;
x86_init.pci.init = pci_mrst_init;
x86_init.pci.fixup_irqs = x86_init_noop;
legacy_pic = &null_legacy_pic;
+
+ /* Avoid searching for BIOS MP tables */
+ x86_init.mpparse.find_smp_config = x86_init_noop;
+ x86_init.mpparse.get_smp_config = x86_init_uint_noop;
+
+}
+
+/*
+ * if user does not want to use per CPU apb timer, just give it a lower rating
+ * than local apic timer and skip the late per cpu timer init.
+ */
+static inline int __init setup_x86_mrst_timer(char *arg)
+{
+ if (!arg)
+ return -EINVAL;
+
+ if (strcmp("apbt_only", arg) == 0)
+ mrst_timer_options = MRST_TIMER_APBT_ONLY;
+ else if (strcmp("lapic_and_apbt", arg) == 0)
+ mrst_timer_options = MRST_TIMER_LAPIC_APBT;
+ else {
+ pr_warning("X86 MRST timer option %s not recognised"
+ " use x86_mrst_timer=apbt_only or lapic_and_apbt\n",
+ arg);
+ return -EINVAL;
+ }
+ return 0;
}
+__setup("x86_mrst_timer=", setup_x86_mrst_timer);
diff --git a/arch/x86/kernel/msr.c b/arch/x86/kernel/msr.c
index 4d4468e9f47c..7bf2dc4c8f70 100644
--- a/arch/x86/kernel/msr.c
+++ b/arch/x86/kernel/msr.c
@@ -230,7 +230,7 @@ static int __cpuinit msr_class_cpu_callback(struct notifier_block *nfb,
msr_device_destroy(cpu);
break;
}
- return err ? NOTIFY_BAD : NOTIFY_OK;
+ return notifier_from_errno(err);
}
static struct notifier_block __refdata msr_class_cpu_notifier = {
diff --git a/arch/x86/kernel/olpc.c b/arch/x86/kernel/olpc.c
index 8297160c41b3..0e0cdde519be 100644
--- a/arch/x86/kernel/olpc.c
+++ b/arch/x86/kernel/olpc.c
@@ -21,10 +21,7 @@
#include <asm/geode.h>
#include <asm/setup.h>
#include <asm/olpc.h>
-
-#ifdef CONFIG_OPEN_FIRMWARE
-#include <asm/ofw.h>
-#endif
+#include <asm/olpc_ofw.h>
struct olpc_platform_t olpc_platform_info;
EXPORT_SYMBOL_GPL(olpc_platform_info);
@@ -145,7 +142,7 @@ restart:
* The OBF flag will sometimes misbehave due to what we believe
* is a hardware quirk..
*/
- printk(KERN_DEBUG "olpc-ec: running cmd 0x%x\n", cmd);
+ pr_devel("olpc-ec: running cmd 0x%x\n", cmd);
outb(cmd, 0x6c);
if (wait_on_ibf(0x6c, 0)) {
@@ -162,8 +159,7 @@ restart:
" EC accept data!\n");
goto err;
}
- printk(KERN_DEBUG "olpc-ec: sending cmd arg 0x%x\n",
- inbuf[i]);
+ pr_devel("olpc-ec: sending cmd arg 0x%x\n", inbuf[i]);
outb(inbuf[i], 0x68);
}
}
@@ -176,8 +172,7 @@ restart:
goto restart;
}
outbuf[i] = inb(0x68);
- printk(KERN_DEBUG "olpc-ec: received 0x%x\n",
- outbuf[i]);
+ pr_devel("olpc-ec: received 0x%x\n", outbuf[i]);
}
}
@@ -188,14 +183,15 @@ err:
}
EXPORT_SYMBOL_GPL(olpc_ec_cmd);
-#ifdef CONFIG_OPEN_FIRMWARE
+#ifdef CONFIG_OLPC_OPENFIRMWARE
static void __init platform_detect(void)
{
size_t propsize;
__be32 rev;
+ const void *args[] = { NULL, "board-revision-int", &rev, (void *)4 };
+ void *res[] = { &propsize };
- if (ofw("getprop", 4, 1, NULL, "board-revision-int", &rev, 4,
- &propsize) || propsize != 4) {
+ if (olpc_ofw("getprop", args, res) || propsize != 4) {
printk(KERN_ERR "ofw: getprop call failed!\n");
rev = cpu_to_be32(0);
}
diff --git a/arch/x86/kernel/olpc_ofw.c b/arch/x86/kernel/olpc_ofw.c
new file mode 100644
index 000000000000..3218aa71ab5e
--- /dev/null
+++ b/arch/x86/kernel/olpc_ofw.c
@@ -0,0 +1,106 @@
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/init.h>
+#include <asm/page.h>
+#include <asm/setup.h>
+#include <asm/io.h>
+#include <asm/pgtable.h>
+#include <asm/olpc_ofw.h>
+
+/* address of OFW callback interface; will be NULL if OFW isn't found */
+static int (*olpc_ofw_cif)(int *);
+
+/* page dir entry containing OFW's pgdir table; filled in by head_32.S */
+u32 olpc_ofw_pgd __initdata;
+
+static DEFINE_SPINLOCK(ofw_lock);
+
+#define MAXARGS 10
+
+void __init setup_olpc_ofw_pgd(void)
+{
+ pgd_t *base, *ofw_pde;
+
+ if (!olpc_ofw_cif)
+ return;
+
+ /* fetch OFW's PDE */
+ base = early_ioremap(olpc_ofw_pgd, sizeof(olpc_ofw_pgd) * PTRS_PER_PGD);
+ if (!base) {
+ printk(KERN_ERR "failed to remap OFW's pgd - disabling OFW!\n");
+ olpc_ofw_cif = NULL;
+ return;
+ }
+ ofw_pde = &base[OLPC_OFW_PDE_NR];
+
+ /* install OFW's PDE permanently into the kernel's pgtable */
+ set_pgd(&swapper_pg_dir[OLPC_OFW_PDE_NR], *ofw_pde);
+ /* implicit optimization barrier here due to uninline function return */
+
+ early_iounmap(base, sizeof(olpc_ofw_pgd) * PTRS_PER_PGD);
+}
+
+int __olpc_ofw(const char *name, int nr_args, const void **args, int nr_res,
+ void **res)
+{
+ int ofw_args[MAXARGS + 3];
+ unsigned long flags;
+ int ret, i, *p;
+
+ BUG_ON(nr_args + nr_res > MAXARGS);
+
+ if (!olpc_ofw_cif)
+ return -EIO;
+
+ ofw_args[0] = (int)name;
+ ofw_args[1] = nr_args;
+ ofw_args[2] = nr_res;
+
+ p = &ofw_args[3];
+ for (i = 0; i < nr_args; i++, p++)
+ *p = (int)args[i];
+
+ /* call into ofw */
+ spin_lock_irqsave(&ofw_lock, flags);
+ ret = olpc_ofw_cif(ofw_args);
+ spin_unlock_irqrestore(&ofw_lock, flags);
+
+ if (!ret) {
+ for (i = 0; i < nr_res; i++, p++)
+ *((int *)res[i]) = *p;
+ }
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(__olpc_ofw);
+
+/* OFW cif _should_ be above this address */
+#define OFW_MIN 0xff000000
+
+/* OFW starts on a 1MB boundary */
+#define OFW_BOUND (1<<20)
+
+void __init olpc_ofw_detect(void)
+{
+ struct olpc_ofw_header *hdr = &boot_params.olpc_ofw_header;
+ unsigned long start;
+
+ /* ensure OFW booted us by checking for "OFW " string */
+ if (hdr->ofw_magic != OLPC_OFW_SIG)
+ return;
+
+ olpc_ofw_cif = (int (*)(int *))hdr->cif_handler;
+
+ if ((unsigned long)olpc_ofw_cif < OFW_MIN) {
+ printk(KERN_ERR "OFW detected, but cif has invalid address 0x%lx - disabling.\n",
+ (unsigned long)olpc_ofw_cif);
+ olpc_ofw_cif = NULL;
+ return;
+ }
+
+ /* determine where OFW starts in memory */
+ start = round_down((unsigned long)olpc_ofw_cif, OFW_BOUND);
+ printk(KERN_INFO "OFW detected in memory, cif @ 0x%lx (reserving top %ldMB)\n",
+ (unsigned long)olpc_ofw_cif, (-start) >> 20);
+ reserve_top_address(-start);
+}
diff --git a/arch/x86/kernel/pci-calgary_64.c b/arch/x86/kernel/pci-calgary_64.c
index fb99f7edb341..078d4ec1a9d9 100644
--- a/arch/x86/kernel/pci-calgary_64.c
+++ b/arch/x86/kernel/pci-calgary_64.c
@@ -103,11 +103,16 @@ int use_calgary __read_mostly = 0;
#define PMR_SOFTSTOPFAULT 0x40000000
#define PMR_HARDSTOP 0x20000000
-#define MAX_NUM_OF_PHBS 8 /* how many PHBs in total? */
-#define MAX_NUM_CHASSIS 8 /* max number of chassis */
-/* MAX_PHB_BUS_NUM is the maximal possible dev->bus->number */
-#define MAX_PHB_BUS_NUM (MAX_NUM_OF_PHBS * MAX_NUM_CHASSIS * 2)
-#define PHBS_PER_CALGARY 4
+/*
+ * The maximum PHB bus number.
+ * x3950M2 (rare): 8 chassis, 48 PHBs per chassis = 384
+ * x3950M2: 4 chassis, 48 PHBs per chassis = 192
+ * x3950 (PCIE): 8 chassis, 32 PHBs per chassis = 256
+ * x3950 (PCIX): 8 chassis, 16 PHBs per chassis = 128
+ */
+#define MAX_PHB_BUS_NUM 256
+
+#define PHBS_PER_CALGARY 4
/* register offsets in Calgary's internal register space */
static const unsigned long tar_offsets[] = {
@@ -1051,8 +1056,6 @@ static int __init calgary_init_one(struct pci_dev *dev)
struct iommu_table *tbl;
int ret;
- BUG_ON(dev->bus->number >= MAX_PHB_BUS_NUM);
-
bbar = busno_to_bbar(dev->bus->number);
ret = calgary_setup_tar(dev, bbar);
if (ret)
diff --git a/arch/x86/kernel/pci-swiotlb.c b/arch/x86/kernel/pci-swiotlb.c
index 7d2829dde20e..a5bc528d4328 100644
--- a/arch/x86/kernel/pci-swiotlb.c
+++ b/arch/x86/kernel/pci-swiotlb.c
@@ -31,8 +31,6 @@ static struct dma_map_ops swiotlb_dma_ops = {
.free_coherent = swiotlb_free_coherent,
.sync_single_for_cpu = swiotlb_sync_single_for_cpu,
.sync_single_for_device = swiotlb_sync_single_for_device,
- .sync_single_range_for_cpu = swiotlb_sync_single_range_for_cpu,
- .sync_single_range_for_device = swiotlb_sync_single_range_for_device,
.sync_sg_for_cpu = swiotlb_sync_sg_for_cpu,
.sync_sg_for_device = swiotlb_sync_sg_for_device,
.map_sg = swiotlb_map_sg_attrs,
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index 0415c3ef91b5..d401f1d2d06e 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -20,7 +20,6 @@
#include <asm/idle.h>
#include <asm/uaccess.h>
#include <asm/i387.h>
-#include <asm/ds.h>
#include <asm/debugreg.h>
unsigned long idle_halt;
@@ -29,29 +28,26 @@ unsigned long idle_nomwait;
EXPORT_SYMBOL(idle_nomwait);
struct kmem_cache *task_xstate_cachep;
+EXPORT_SYMBOL_GPL(task_xstate_cachep);
int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src)
{
+ int ret;
+
*dst = *src;
- if (src->thread.xstate) {
- dst->thread.xstate = kmem_cache_alloc(task_xstate_cachep,
- GFP_KERNEL);
- if (!dst->thread.xstate)
- return -ENOMEM;
- WARN_ON((unsigned long)dst->thread.xstate & 15);
- memcpy(dst->thread.xstate, src->thread.xstate, xstate_size);
+ if (fpu_allocated(&src->thread.fpu)) {
+ memset(&dst->thread.fpu, 0, sizeof(dst->thread.fpu));
+ ret = fpu_alloc(&dst->thread.fpu);
+ if (ret)
+ return ret;
+ fpu_copy(&dst->thread.fpu, &src->thread.fpu);
}
return 0;
}
void free_thread_xstate(struct task_struct *tsk)
{
- if (tsk->thread.xstate) {
- kmem_cache_free(task_xstate_cachep, tsk->thread.xstate);
- tsk->thread.xstate = NULL;
- }
-
- WARN(tsk->thread.ds_ctx, "leaking DS context\n");
+ fpu_free(&tsk->thread.fpu);
}
void free_thread_info(struct thread_info *ti)
@@ -198,11 +194,16 @@ void __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p,
prev = &prev_p->thread;
next = &next_p->thread;
- if (test_tsk_thread_flag(next_p, TIF_DS_AREA_MSR) ||
- test_tsk_thread_flag(prev_p, TIF_DS_AREA_MSR))
- ds_switch_to(prev_p, next_p);
- else if (next->debugctlmsr != prev->debugctlmsr)
- update_debugctlmsr(next->debugctlmsr);
+ if (test_tsk_thread_flag(prev_p, TIF_BLOCKSTEP) ^
+ test_tsk_thread_flag(next_p, TIF_BLOCKSTEP)) {
+ unsigned long debugctl = get_debugctlmsr();
+
+ debugctl &= ~DEBUGCTLMSR_BTF;
+ if (test_tsk_thread_flag(next_p, TIF_BLOCKSTEP))
+ debugctl |= DEBUGCTLMSR_BTF;
+
+ update_debugctlmsr(debugctl);
+ }
if (test_tsk_thread_flag(prev_p, TIF_NOTSC) ^
test_tsk_thread_flag(next_p, TIF_NOTSC)) {
@@ -371,7 +372,7 @@ static inline int hlt_use_halt(void)
void default_idle(void)
{
if (hlt_use_halt()) {
- trace_power_start(POWER_CSTATE, 1);
+ trace_power_start(POWER_CSTATE, 1, smp_processor_id());
current_thread_info()->status &= ~TS_POLLING;
/*
* TS_POLLING-cleared state must be visible before we
@@ -441,7 +442,7 @@ EXPORT_SYMBOL_GPL(cpu_idle_wait);
*/
void mwait_idle_with_hints(unsigned long ax, unsigned long cx)
{
- trace_power_start(POWER_CSTATE, (ax>>4)+1);
+ trace_power_start(POWER_CSTATE, (ax>>4)+1, smp_processor_id());
if (!need_resched()) {
if (cpu_has(&current_cpu_data, X86_FEATURE_CLFLUSH_MONITOR))
clflush((void *)&current_thread_info()->flags);
@@ -457,7 +458,7 @@ void mwait_idle_with_hints(unsigned long ax, unsigned long cx)
static void mwait_idle(void)
{
if (!need_resched()) {
- trace_power_start(POWER_CSTATE, 1);
+ trace_power_start(POWER_CSTATE, 1, smp_processor_id());
if (cpu_has(&current_cpu_data, X86_FEATURE_CLFLUSH_MONITOR))
clflush((void *)&current_thread_info()->flags);
@@ -478,7 +479,7 @@ static void mwait_idle(void)
*/
static void poll_idle(void)
{
- trace_power_start(POWER_CSTATE, 0);
+ trace_power_start(POWER_CSTATE, 0, smp_processor_id());
local_irq_enable();
while (!need_resched())
cpu_relax();
@@ -525,44 +526,10 @@ static int __cpuinit mwait_usable(const struct cpuinfo_x86 *c)
return (edx & MWAIT_EDX_C1);
}
-/*
- * Check for AMD CPUs, where APIC timer interrupt does not wake up CPU from C1e.
- * For more information see
- * - Erratum #400 for NPT family 0xf and family 0x10 CPUs
- * - Erratum #365 for family 0x11 (not affected because C1e not in use)
- */
-static int __cpuinit check_c1e_idle(const struct cpuinfo_x86 *c)
-{
- u64 val;
- if (c->x86_vendor != X86_VENDOR_AMD)
- goto no_c1e_idle;
-
- /* Family 0x0f models < rev F do not have C1E */
- if (c->x86 == 0x0F && c->x86_model >= 0x40)
- return 1;
-
- if (c->x86 == 0x10) {
- /*
- * check OSVW bit for CPUs that are not affected
- * by erratum #400
- */
- if (cpu_has(c, X86_FEATURE_OSVW)) {
- rdmsrl(MSR_AMD64_OSVW_ID_LENGTH, val);
- if (val >= 2) {
- rdmsrl(MSR_AMD64_OSVW_STATUS, val);
- if (!(val & BIT(1)))
- goto no_c1e_idle;
- }
- }
- return 1;
- }
-
-no_c1e_idle:
- return 0;
-}
+bool c1e_detected;
+EXPORT_SYMBOL(c1e_detected);
static cpumask_var_t c1e_mask;
-static int c1e_detected;
void c1e_remove_cpu(int cpu)
{
@@ -584,12 +551,12 @@ static void c1e_idle(void)
u32 lo, hi;
rdmsr(MSR_K8_INT_PENDING_MSG, lo, hi);
+
if (lo & K8_INTP_C1E_ACTIVE_MASK) {
- c1e_detected = 1;
+ c1e_detected = true;
if (!boot_cpu_has(X86_FEATURE_NONSTOP_TSC))
mark_tsc_unstable("TSC halt in AMD C1E");
printk(KERN_INFO "System has AMD C1E enabled\n");
- set_cpu_cap(&boot_cpu_data, X86_FEATURE_AMDC1E);
}
}
@@ -638,7 +605,8 @@ void __cpuinit select_idle_routine(const struct cpuinfo_x86 *c)
*/
printk(KERN_INFO "using mwait in idle threads.\n");
pm_idle = mwait_idle;
- } else if (check_c1e_idle(c)) {
+ } else if (cpu_has_amd_erratum(amd_erratum_400)) {
+ /* E400: APIC timer interrupt does not wake up CPU from C1e */
printk(KERN_INFO "using C1E aware idle routine\n");
pm_idle = c1e_idle;
} else
diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c
index f6c62667e30c..96586c3cbbbf 100644
--- a/arch/x86/kernel/process_32.c
+++ b/arch/x86/kernel/process_32.c
@@ -55,9 +55,10 @@
#include <asm/cpu.h>
#include <asm/idle.h>
#include <asm/syscalls.h>
-#include <asm/ds.h>
#include <asm/debugreg.h>
+#include <trace/events/power.h>
+
asmlinkage void ret_from_fork(void) __asm__("ret_from_fork");
/*
@@ -112,6 +113,8 @@ void cpu_idle(void)
stop_critical_timings();
pm_idle();
start_critical_timings();
+
+ trace_power_end(smp_processor_id());
}
tick_nohz_restart_sched_tick();
preempt_enable_no_resched();
@@ -238,13 +241,6 @@ int copy_thread(unsigned long clone_flags, unsigned long sp,
kfree(p->thread.io_bitmap_ptr);
p->thread.io_bitmap_max = 0;
}
-
- clear_tsk_thread_flag(p, TIF_DS_AREA_MSR);
- p->thread.ds_ctx = NULL;
-
- clear_tsk_thread_flag(p, TIF_DEBUGCTLMSR);
- p->thread.debugctlmsr = 0;
-
return err;
}
@@ -317,7 +313,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
/* we're going to use this soon, after a few expensive things */
if (preload_fpu)
- prefetch(next->xstate);
+ prefetch(next->fpu.state);
/*
* Reload esp0.
diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
index 17cb3295cbf7..3d9ea531ddd1 100644
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -49,9 +49,10 @@
#include <asm/ia32.h>
#include <asm/idle.h>
#include <asm/syscalls.h>
-#include <asm/ds.h>
#include <asm/debugreg.h>
+#include <trace/events/power.h>
+
asmlinkage extern void ret_from_fork(void);
DEFINE_PER_CPU(unsigned long, old_rsp);
@@ -139,6 +140,9 @@ void cpu_idle(void)
stop_critical_timings();
pm_idle();
start_critical_timings();
+
+ trace_power_end(smp_processor_id());
+
/* In many cases the interrupt that ended idle
has already called exit_idle. But some idle
loops can be woken up without interrupt. */
@@ -313,13 +317,6 @@ int copy_thread(unsigned long clone_flags, unsigned long sp,
if (err)
goto out;
}
-
- clear_tsk_thread_flag(p, TIF_DS_AREA_MSR);
- p->thread.ds_ctx = NULL;
-
- clear_tsk_thread_flag(p, TIF_DEBUGCTLMSR);
- p->thread.debugctlmsr = 0;
-
err = 0;
out:
if (err && p->thread.io_bitmap_ptr) {
@@ -396,7 +393,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
/* we're going to use this soon, after a few expensive things */
if (preload_fpu)
- prefetch(next->xstate);
+ prefetch(next->fpu.state);
/*
* Reload esp0, LDT and the page table pointer:
diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c
index 2e9b55027b7e..70c4872cd8aa 100644
--- a/arch/x86/kernel/ptrace.c
+++ b/arch/x86/kernel/ptrace.c
@@ -2,9 +2,6 @@
/*
* Pentium III FXSR, SSE support
* Gareth Hughes <gareth@valinux.com>, May 2000
- *
- * BTS tracing
- * Markus Metzger <markus.t.metzger@intel.com>, Dec 2007
*/
#include <linux/kernel.h>
@@ -22,7 +19,6 @@
#include <linux/audit.h>
#include <linux/seccomp.h>
#include <linux/signal.h>
-#include <linux/workqueue.h>
#include <linux/perf_event.h>
#include <linux/hw_breakpoint.h>
@@ -36,7 +32,6 @@
#include <asm/desc.h>
#include <asm/prctl.h>
#include <asm/proto.h>
-#include <asm/ds.h>
#include <asm/hw_breakpoint.h>
#include "tls.h"
@@ -693,7 +688,7 @@ static int ptrace_set_breakpoint_addr(struct task_struct *tsk, int nr,
struct perf_event_attr attr;
if (!t->ptrace_bps[nr]) {
- hw_breakpoint_init(&attr);
+ ptrace_breakpoint_init(&attr);
/*
* Put stub len and type to register (reserve) an inactive but
* correct bp
@@ -789,342 +784,6 @@ static int ioperm_get(struct task_struct *target,
0, IO_BITMAP_BYTES);
}
-#ifdef CONFIG_X86_PTRACE_BTS
-/*
- * A branch trace store context.
- *
- * Contexts may only be installed by ptrace_bts_config() and only for
- * ptraced tasks.
- *
- * Contexts are destroyed when the tracee is detached from the tracer.
- * The actual destruction work requires interrupts enabled, so the
- * work is deferred and will be scheduled during __ptrace_unlink().
- *
- * Contexts hold an additional task_struct reference on the traced
- * task, as well as a reference on the tracer's mm.
- *
- * Ptrace already holds a task_struct for the duration of ptrace operations,
- * but since destruction is deferred, it may be executed after both
- * tracer and tracee exited.
- */
-struct bts_context {
- /* The branch trace handle. */
- struct bts_tracer *tracer;
-
- /* The buffer used to store the branch trace and its size. */
- void *buffer;
- unsigned int size;
-
- /* The mm that paid for the above buffer. */
- struct mm_struct *mm;
-
- /* The task this context belongs to. */
- struct task_struct *task;
-
- /* The signal to send on a bts buffer overflow. */
- unsigned int bts_ovfl_signal;
-
- /* The work struct to destroy a context. */
- struct work_struct work;
-};
-
-static int alloc_bts_buffer(struct bts_context *context, unsigned int size)
-{
- void *buffer = NULL;
- int err = -ENOMEM;
-
- err = account_locked_memory(current->mm, current->signal->rlim, size);
- if (err < 0)
- return err;
-
- buffer = kzalloc(size, GFP_KERNEL);
- if (!buffer)
- goto out_refund;
-
- context->buffer = buffer;
- context->size = size;
- context->mm = get_task_mm(current);
-
- return 0;
-
- out_refund:
- refund_locked_memory(current->mm, size);
- return err;
-}
-
-static inline void free_bts_buffer(struct bts_context *context)
-{
- if (!context->buffer)
- return;
-
- kfree(context->buffer);
- context->buffer = NULL;
-
- refund_locked_memory(context->mm, context->size);
- context->size = 0;
-
- mmput(context->mm);
- context->mm = NULL;
-}
-
-static void free_bts_context_work(struct work_struct *w)
-{
- struct bts_context *context;
-
- context = container_of(w, struct bts_context, work);
-
- ds_release_bts(context->tracer);
- put_task_struct(context->task);
- free_bts_buffer(context);
- kfree(context);
-}
-
-static inline void free_bts_context(struct bts_context *context)
-{
- INIT_WORK(&context->work, free_bts_context_work);
- schedule_work(&context->work);
-}
-
-static inline struct bts_context *alloc_bts_context(struct task_struct *task)
-{
- struct bts_context *context = kzalloc(sizeof(*context), GFP_KERNEL);
- if (context) {
- context->task = task;
- task->bts = context;
-
- get_task_struct(task);
- }
-
- return context;
-}
-
-static int ptrace_bts_read_record(struct task_struct *child, size_t index,
- struct bts_struct __user *out)
-{
- struct bts_context *context;
- const struct bts_trace *trace;
- struct bts_struct bts;
- const unsigned char *at;
- int error;
-
- context = child->bts;
- if (!context)
- return -ESRCH;
-
- trace = ds_read_bts(context->tracer);
- if (!trace)
- return -ESRCH;
-
- at = trace->ds.top - ((index + 1) * trace->ds.size);
- if ((void *)at < trace->ds.begin)
- at += (trace->ds.n * trace->ds.size);
-
- if (!trace->read)
- return -EOPNOTSUPP;
-
- error = trace->read(context->tracer, at, &bts);
- if (error < 0)
- return error;
-
- if (copy_to_user(out, &bts, sizeof(bts)))
- return -EFAULT;
-
- return sizeof(bts);
-}
-
-static int ptrace_bts_drain(struct task_struct *child,
- long size,
- struct bts_struct __user *out)
-{
- struct bts_context *context;
- const struct bts_trace *trace;
- const unsigned char *at;
- int error, drained = 0;
-
- context = child->bts;
- if (!context)
- return -ESRCH;
-
- trace = ds_read_bts(context->tracer);
- if (!trace)
- return -ESRCH;
-
- if (!trace->read)
- return -EOPNOTSUPP;
-
- if (size < (trace->ds.top - trace->ds.begin))
- return -EIO;
-
- for (at = trace->ds.begin; (void *)at < trace->ds.top;
- out++, drained++, at += trace->ds.size) {
- struct bts_struct bts;
-
- error = trace->read(context->tracer, at, &bts);
- if (error < 0)
- return error;
-
- if (copy_to_user(out, &bts, sizeof(bts)))
- return -EFAULT;
- }
-
- memset(trace->ds.begin, 0, trace->ds.n * trace->ds.size);
-
- error = ds_reset_bts(context->tracer);
- if (error < 0)
- return error;
-
- return drained;
-}
-
-static int ptrace_bts_config(struct task_struct *child,
- long cfg_size,
- const struct ptrace_bts_config __user *ucfg)
-{
- struct bts_context *context;
- struct ptrace_bts_config cfg;
- unsigned int flags = 0;
-
- if (cfg_size < sizeof(cfg))
- return -EIO;
-
- if (copy_from_user(&cfg, ucfg, sizeof(cfg)))
- return -EFAULT;
-
- context = child->bts;
- if (!context)
- context = alloc_bts_context(child);
- if (!context)
- return -ENOMEM;
-
- if (cfg.flags & PTRACE_BTS_O_SIGNAL) {
- if (!cfg.signal)
- return -EINVAL;
-
- return -EOPNOTSUPP;
- context->bts_ovfl_signal = cfg.signal;
- }
-
- ds_release_bts(context->tracer);
- context->tracer = NULL;
-
- if ((cfg.flags & PTRACE_BTS_O_ALLOC) && (cfg.size != context->size)) {
- int err;
-
- free_bts_buffer(context);
- if (!cfg.size)
- return 0;
-
- err = alloc_bts_buffer(context, cfg.size);
- if (err < 0)
- return err;
- }
-
- if (cfg.flags & PTRACE_BTS_O_TRACE)
- flags |= BTS_USER;
-
- if (cfg.flags & PTRACE_BTS_O_SCHED)
- flags |= BTS_TIMESTAMPS;
-
- context->tracer =
- ds_request_bts_task(child, context->buffer, context->size,
- NULL, (size_t)-1, flags);
- if (unlikely(IS_ERR(context->tracer))) {
- int error = PTR_ERR(context->tracer);
-
- free_bts_buffer(context);
- context->tracer = NULL;
- return error;
- }
-
- return sizeof(cfg);
-}
-
-static int ptrace_bts_status(struct task_struct *child,
- long cfg_size,
- struct ptrace_bts_config __user *ucfg)
-{
- struct bts_context *context;
- const struct bts_trace *trace;
- struct ptrace_bts_config cfg;
-
- context = child->bts;
- if (!context)
- return -ESRCH;
-
- if (cfg_size < sizeof(cfg))
- return -EIO;
-
- trace = ds_read_bts(context->tracer);
- if (!trace)
- return -ESRCH;
-
- memset(&cfg, 0, sizeof(cfg));
- cfg.size = trace->ds.end - trace->ds.begin;
- cfg.signal = context->bts_ovfl_signal;
- cfg.bts_size = sizeof(struct bts_struct);
-
- if (cfg.signal)
- cfg.flags |= PTRACE_BTS_O_SIGNAL;
-
- if (trace->ds.flags & BTS_USER)
- cfg.flags |= PTRACE_BTS_O_TRACE;
-
- if (trace->ds.flags & BTS_TIMESTAMPS)
- cfg.flags |= PTRACE_BTS_O_SCHED;
-
- if (copy_to_user(ucfg, &cfg, sizeof(cfg)))
- return -EFAULT;
-
- return sizeof(cfg);
-}
-
-static int ptrace_bts_clear(struct task_struct *child)
-{
- struct bts_context *context;
- const struct bts_trace *trace;
-
- context = child->bts;
- if (!context)
- return -ESRCH;
-
- trace = ds_read_bts(context->tracer);
- if (!trace)
- return -ESRCH;
-
- memset(trace->ds.begin, 0, trace->ds.n * trace->ds.size);
-
- return ds_reset_bts(context->tracer);
-}
-
-static int ptrace_bts_size(struct task_struct *child)
-{
- struct bts_context *context;
- const struct bts_trace *trace;
-
- context = child->bts;
- if (!context)
- return -ESRCH;
-
- trace = ds_read_bts(context->tracer);
- if (!trace)
- return -ESRCH;
-
- return (trace->ds.top - trace->ds.begin) / trace->ds.size;
-}
-
-/*
- * Called from __ptrace_unlink() after the child has been moved back
- * to its original parent.
- */
-void ptrace_bts_untrace(struct task_struct *child)
-{
- if (unlikely(child->bts)) {
- free_bts_context(child->bts);
- child->bts = NULL;
- }
-}
-#endif /* CONFIG_X86_PTRACE_BTS */
-
/*
* Called by kernel/ptrace.c when detaching..
*
@@ -1252,39 +911,6 @@ long arch_ptrace(struct task_struct *child, long request, long addr, long data)
break;
#endif
- /*
- * These bits need more cooking - not enabled yet:
- */
-#ifdef CONFIG_X86_PTRACE_BTS
- case PTRACE_BTS_CONFIG:
- ret = ptrace_bts_config
- (child, data, (struct ptrace_bts_config __user *)addr);
- break;
-
- case PTRACE_BTS_STATUS:
- ret = ptrace_bts_status
- (child, data, (struct ptrace_bts_config __user *)addr);
- break;
-
- case PTRACE_BTS_SIZE:
- ret = ptrace_bts_size(child);
- break;
-
- case PTRACE_BTS_GET:
- ret = ptrace_bts_read_record
- (child, data, (struct bts_struct __user *) addr);
- break;
-
- case PTRACE_BTS_CLEAR:
- ret = ptrace_bts_clear(child);
- break;
-
- case PTRACE_BTS_DRAIN:
- ret = ptrace_bts_drain
- (child, data, (struct bts_struct __user *) addr);
- break;
-#endif /* CONFIG_X86_PTRACE_BTS */
-
default:
ret = ptrace_request(child, request, addr, data);
break;
@@ -1544,14 +1170,6 @@ long compat_arch_ptrace(struct task_struct *child, compat_long_t request,
case PTRACE_GET_THREAD_AREA:
case PTRACE_SET_THREAD_AREA:
-#ifdef CONFIG_X86_PTRACE_BTS
- case PTRACE_BTS_CONFIG:
- case PTRACE_BTS_STATUS:
- case PTRACE_BTS_SIZE:
- case PTRACE_BTS_GET:
- case PTRACE_BTS_CLEAR:
- case PTRACE_BTS_DRAIN:
-#endif /* CONFIG_X86_PTRACE_BTS */
return arch_ptrace(child, request, addr, data);
default:
diff --git a/arch/x86/kernel/pvclock.c b/arch/x86/kernel/pvclock.c
index 03801f2f761f..239427ca02af 100644
--- a/arch/x86/kernel/pvclock.c
+++ b/arch/x86/kernel/pvclock.c
@@ -31,8 +31,16 @@ struct pvclock_shadow_time {
u32 tsc_to_nsec_mul;
int tsc_shift;
u32 version;
+ u8 flags;
};
+static u8 valid_flags __read_mostly = 0;
+
+void pvclock_set_flags(u8 flags)
+{
+ valid_flags = flags;
+}
+
/*
* Scale a 64-bit delta by scaling and multiplying by a 32-bit fraction,
* yielding a 64-bit result.
@@ -91,6 +99,7 @@ static unsigned pvclock_get_time_values(struct pvclock_shadow_time *dst,
dst->system_timestamp = src->system_time;
dst->tsc_to_nsec_mul = src->tsc_to_system_mul;
dst->tsc_shift = src->tsc_shift;
+ dst->flags = src->flags;
rmb(); /* test version after fetching data */
} while ((src->version & 1) || (dst->version != src->version));
@@ -109,11 +118,14 @@ unsigned long pvclock_tsc_khz(struct pvclock_vcpu_time_info *src)
return pv_tsc_khz;
}
+static atomic64_t last_value = ATOMIC64_INIT(0);
+
cycle_t pvclock_clocksource_read(struct pvclock_vcpu_time_info *src)
{
struct pvclock_shadow_time shadow;
unsigned version;
cycle_t ret, offset;
+ u64 last;
do {
version = pvclock_get_time_values(&shadow, src);
@@ -123,6 +135,31 @@ cycle_t pvclock_clocksource_read(struct pvclock_vcpu_time_info *src)
barrier();
} while (version != src->version);
+ if ((valid_flags & PVCLOCK_TSC_STABLE_BIT) &&
+ (shadow.flags & PVCLOCK_TSC_STABLE_BIT))
+ return ret;
+
+ /*
+ * Assumption here is that last_value, a global accumulator, always goes
+ * forward. If we are less than that, we should not be much smaller.
+ * We assume there is an error marging we're inside, and then the correction
+ * does not sacrifice accuracy.
+ *
+ * For reads: global may have changed between test and return,
+ * but this means someone else updated poked the clock at a later time.
+ * We just need to make sure we are not seeing a backwards event.
+ *
+ * For updates: last_value = ret is not enough, since two vcpus could be
+ * updating at the same time, and one of them could be slightly behind,
+ * making the assumption that last_value always go forward fail to hold.
+ */
+ last = atomic64_read(&last_value);
+ do {
+ if (ret < last)
+ return last;
+ last = atomic64_cmpxchg(&last_value, last, ret);
+ } while (unlikely(last != ret));
+
return ret;
}
diff --git a/arch/x86/kernel/quirks.c b/arch/x86/kernel/quirks.c
index 12e9feaa2f7a..939b9e98245f 100644
--- a/arch/x86/kernel/quirks.c
+++ b/arch/x86/kernel/quirks.c
@@ -495,6 +495,9 @@ void force_hpet_resume(void)
/*
* HPET MSI on some boards (ATI SB700/SB800) has side effect on
* floppy DMA. Disable HPET MSI on such platforms.
+ * See erratum #27 (Misinterpreted MSI Requests May Result in
+ * Corrupted LPC DMA Data) in AMD Publication #46837,
+ * "SB700 Family Product Errata", Rev. 1.0, March 2010.
*/
static void force_disable_hpet_msi(struct pci_dev *unused)
{
diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c
index 8e1aac86b50c..e3af342fe83a 100644
--- a/arch/x86/kernel/reboot.c
+++ b/arch/x86/kernel/reboot.c
@@ -228,6 +228,14 @@ static struct dmi_system_id __initdata reboot_dmi_table[] = {
DMI_MATCH(DMI_PRODUCT_NAME, "Precision WorkStation T5400"),
},
},
+ { /* Handle problems with rebooting on Dell T7400's */
+ .callback = set_bios_reboot,
+ .ident = "Dell Precision T7400",
+ .matches = {
+ DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."),
+ DMI_MATCH(DMI_PRODUCT_NAME, "Precision WorkStation T7400"),
+ },
+ },
{ /* Handle problems with rebooting on HP laptops */
.callback = set_bios_reboot,
.ident = "HP Compaq Laptop",
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index c4851eff57b3..b008e7883207 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -102,6 +102,7 @@
#include <asm/paravirt.h>
#include <asm/hypervisor.h>
+#include <asm/olpc_ofw.h>
#include <asm/percpu.h>
#include <asm/topology.h>
@@ -676,6 +677,17 @@ static struct dmi_system_id __initdata bad_bios_dmi_table[] = {
DMI_MATCH(DMI_BOARD_NAME, "DG45FC"),
},
},
+ /*
+ * The Dell Inspiron Mini 1012 has DMI_BIOS_VENDOR = "Dell Inc.", so
+ * match on the product name.
+ */
+ {
+ .callback = dmi_low_memory_corruption,
+ .ident = "Phoenix BIOS",
+ .matches = {
+ DMI_MATCH(DMI_PRODUCT_NAME, "Inspiron 1012"),
+ },
+ },
#endif
{}
};
@@ -725,9 +737,15 @@ void __init setup_arch(char **cmdline_p)
/* VMI may relocate the fixmap; do this before touching ioremap area */
vmi_init();
+ /* OFW also may relocate the fixmap */
+ olpc_ofw_detect();
+
+ early_trap_init();
early_cpu_init();
early_ioremap_init();
+ setup_olpc_ofw_pgd();
+
ROOT_DEV = old_decode_dev(boot_params.hdr.root_dev);
screen_info = boot_params.screen_info;
edid_info = boot_params.edid_info;
diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c
index ef6370b00e70..a60df9ae6454 100644
--- a/arch/x86/kernel/setup_percpu.c
+++ b/arch/x86/kernel/setup_percpu.c
@@ -21,12 +21,6 @@
#include <asm/cpu.h>
#include <asm/stackprotector.h>
-#ifdef CONFIG_DEBUG_PER_CPU_MAPS
-# define DBG(fmt, ...) pr_dbg(fmt, ##__VA_ARGS__)
-#else
-# define DBG(fmt, ...) do { if (0) pr_dbg(fmt, ##__VA_ARGS__); } while (0)
-#endif
-
DEFINE_PER_CPU(int, cpu_number);
EXPORT_PER_CPU_SYMBOL(cpu_number);
@@ -244,10 +238,19 @@ void __init setup_per_cpu_areas(void)
#ifdef CONFIG_NUMA
per_cpu(x86_cpu_to_node_map, cpu) =
early_per_cpu_map(x86_cpu_to_node_map, cpu);
+ /*
+ * Ensure that the boot cpu numa_node is correct when the boot
+ * cpu is on a node that doesn't have memory installed.
+ * Also cpu_up() will call cpu_to_node() for APs when
+ * MEMORY_HOTPLUG is defined, before per_cpu(numa_node) is set
+ * up later with c_init aka intel_init/amd_init.
+ * So set them all (boot cpu and all APs).
+ */
+ set_cpu_numa_node(cpu, early_cpu_to_node(cpu));
#endif
#endif
/*
- * Up to this point, the boot CPU has been using .data.init
+ * Up to this point, the boot CPU has been using .init.data
* area. Reload any changed state for the boot CPU.
*/
if (cpu == boot_cpu_id)
@@ -263,14 +266,6 @@ void __init setup_per_cpu_areas(void)
early_per_cpu_ptr(x86_cpu_to_node_map) = NULL;
#endif
-#if defined(CONFIG_X86_64) && defined(CONFIG_NUMA)
- /*
- * make sure boot cpu node_number is right, when boot cpu is on the
- * node that doesn't have mem installed
- */
- per_cpu(node_number, boot_cpu_id) = cpu_to_node(boot_cpu_id);
-#endif
-
/* Setup node to cpumask map */
setup_node_to_cpumask_map();
diff --git a/arch/x86/kernel/sfi.c b/arch/x86/kernel/sfi.c
index 34e099382651..cb22acf3ed09 100644
--- a/arch/x86/kernel/sfi.c
+++ b/arch/x86/kernel/sfi.c
@@ -81,7 +81,6 @@ static int __init sfi_parse_cpus(struct sfi_table_header *table)
#endif /* CONFIG_X86_LOCAL_APIC */
#ifdef CONFIG_X86_IO_APIC
-static u32 gsi_base;
static int __init sfi_parse_ioapic(struct sfi_table_header *table)
{
@@ -94,8 +93,7 @@ static int __init sfi_parse_ioapic(struct sfi_table_header *table)
pentry = (struct sfi_apic_table_entry *)sb->pentry;
for (i = 0; i < num; i++) {
- mp_register_ioapic(i, pentry->phys_addr, gsi_base);
- gsi_base += io_apic_get_redir_entries(i);
+ mp_register_ioapic(i, pentry->phys_addr, gsi_top);
pentry++;
}
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index 763d815e27a0..a5e928b0cb5f 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -686,7 +686,7 @@ static void __cpuinit do_fork_idle(struct work_struct *work)
static void __cpuinit announce_cpu(int cpu, int apicid)
{
static int current_node = -1;
- int node = cpu_to_node(cpu);
+ int node = early_cpu_to_node(cpu);
if (system_state == SYSTEM_BOOTING) {
if (node != current_node) {
@@ -735,12 +735,8 @@ static int __cpuinit do_boot_cpu(int apicid, int cpu)
goto do_rest;
}
- if (!keventd_up() || current_is_keventd())
- c_idle.work.func(&c_idle.work);
- else {
- schedule_work(&c_idle.work);
- wait_for_completion(&c_idle.done);
- }
+ schedule_work(&c_idle.work);
+ wait_for_completion(&c_idle.done);
if (IS_ERR(c_idle.idle)) {
printk("failed fork for CPU %d\n", cpu);
@@ -816,6 +812,13 @@ do_rest:
if (cpumask_test_cpu(cpu, cpu_callin_mask))
break; /* It has booted */
udelay(100);
+ /*
+ * Allow other tasks to run while we wait for the
+ * AP to come online. This also gives a chance
+ * for the MTRR work(triggered by the AP coming online)
+ * to be completed in the stop machine context.
+ */
+ schedule();
}
if (cpumask_test_cpu(cpu, cpu_callin_mask))
@@ -1215,9 +1218,17 @@ __init void prefill_possible_map(void)
if (!num_processors)
num_processors = 1;
- if (setup_possible_cpus == -1)
- possible = num_processors + disabled_cpus;
- else
+ i = setup_max_cpus ?: 1;
+ if (setup_possible_cpus == -1) {
+ possible = num_processors;
+#ifdef CONFIG_HOTPLUG_CPU
+ if (setup_max_cpus)
+ possible += disabled_cpus;
+#else
+ if (possible > i)
+ possible = i;
+#endif
+ } else
possible = setup_possible_cpus;
total_cpus = max_t(int, possible, num_processors + disabled_cpus);
@@ -1230,11 +1241,23 @@ __init void prefill_possible_map(void)
possible = nr_cpu_ids;
}
+#ifdef CONFIG_HOTPLUG_CPU
+ if (!setup_max_cpus)
+#endif
+ if (possible > i) {
+ printk(KERN_WARNING
+ "%d Processors exceeds max_cpus limit of %u\n",
+ possible, setup_max_cpus);
+ possible = i;
+ }
+
printk(KERN_INFO "SMP: Allowing %d CPUs, %d hotplug CPUs\n",
possible, max_t(int, possible - num_processors, 0));
for (i = 0; i < possible; i++)
set_cpu_possible(i, true);
+ for (; i < NR_CPUS; i++)
+ set_cpu_possible(i, false);
nr_cpu_ids = possible;
}
diff --git a/arch/x86/kernel/stacktrace.c b/arch/x86/kernel/stacktrace.c
index 922eefbb3f6c..b53c525368a7 100644
--- a/arch/x86/kernel/stacktrace.c
+++ b/arch/x86/kernel/stacktrace.c
@@ -23,11 +23,16 @@ static int save_stack_stack(void *data, char *name)
return 0;
}
-static void save_stack_address(void *data, unsigned long addr, int reliable)
+static void
+__save_stack_address(void *data, unsigned long addr, bool reliable, bool nosched)
{
struct stack_trace *trace = data;
+#ifdef CONFIG_FRAME_POINTER
if (!reliable)
return;
+#endif
+ if (nosched && in_sched_functions(addr))
+ return;
if (trace->skip > 0) {
trace->skip--;
return;
@@ -36,20 +41,15 @@ static void save_stack_address(void *data, unsigned long addr, int reliable)
trace->entries[trace->nr_entries++] = addr;
}
+static void save_stack_address(void *data, unsigned long addr, int reliable)
+{
+ return __save_stack_address(data, addr, reliable, false);
+}
+
static void
save_stack_address_nosched(void *data, unsigned long addr, int reliable)
{
- struct stack_trace *trace = (struct stack_trace *)data;
- if (!reliable)
- return;
- if (in_sched_functions(addr))
- return;
- if (trace->skip > 0) {
- trace->skip--;
- return;
- }
- if (trace->nr_entries < trace->max_entries)
- trace->entries[trace->nr_entries++] = addr;
+ return __save_stack_address(data, addr, reliable, true);
}
static const struct stacktrace_ops save_stack_ops = {
@@ -96,12 +96,13 @@ EXPORT_SYMBOL_GPL(save_stack_trace_tsk);
/* Userspace stacktrace - based on kernel/trace/trace_sysprof.c */
-struct stack_frame {
+struct stack_frame_user {
const void __user *next_fp;
unsigned long ret_addr;
};
-static int copy_stack_frame(const void __user *fp, struct stack_frame *frame)
+static int
+copy_stack_frame(const void __user *fp, struct stack_frame_user *frame)
{
int ret;
@@ -126,7 +127,7 @@ static inline void __save_stack_trace_user(struct stack_trace *trace)
trace->entries[trace->nr_entries++] = regs->ip;
while (trace->nr_entries < trace->max_entries) {
- struct stack_frame frame;
+ struct stack_frame_user frame;
frame.next_fp = NULL;
frame.ret_addr = 0;
diff --git a/arch/x86/kernel/step.c b/arch/x86/kernel/step.c
index 3149032ff107..58de45ee08b6 100644
--- a/arch/x86/kernel/step.c
+++ b/arch/x86/kernel/step.c
@@ -158,22 +158,6 @@ static int enable_single_step(struct task_struct *child)
}
/*
- * Install this value in MSR_IA32_DEBUGCTLMSR whenever child is running.
- */
-static void write_debugctlmsr(struct task_struct *child, unsigned long val)
-{
- if (child->thread.debugctlmsr == val)
- return;
-
- child->thread.debugctlmsr = val;
-
- if (child != current)
- return;
-
- update_debugctlmsr(val);
-}
-
-/*
* Enable single or block step.
*/
static void enable_step(struct task_struct *child, bool block)
@@ -186,15 +170,17 @@ static void enable_step(struct task_struct *child, bool block)
* that uses user-mode single stepping itself.
*/
if (enable_single_step(child) && block) {
- set_tsk_thread_flag(child, TIF_DEBUGCTLMSR);
- write_debugctlmsr(child,
- child->thread.debugctlmsr | DEBUGCTLMSR_BTF);
- } else {
- write_debugctlmsr(child,
- child->thread.debugctlmsr & ~DEBUGCTLMSR_BTF);
-
- if (!child->thread.debugctlmsr)
- clear_tsk_thread_flag(child, TIF_DEBUGCTLMSR);
+ unsigned long debugctl = get_debugctlmsr();
+
+ debugctl |= DEBUGCTLMSR_BTF;
+ update_debugctlmsr(debugctl);
+ set_tsk_thread_flag(child, TIF_BLOCKSTEP);
+ } else if (test_tsk_thread_flag(child, TIF_BLOCKSTEP)) {
+ unsigned long debugctl = get_debugctlmsr();
+
+ debugctl &= ~DEBUGCTLMSR_BTF;
+ update_debugctlmsr(debugctl);
+ clear_tsk_thread_flag(child, TIF_BLOCKSTEP);
}
}
@@ -213,11 +199,13 @@ void user_disable_single_step(struct task_struct *child)
/*
* Make sure block stepping (BTF) is disabled.
*/
- write_debugctlmsr(child,
- child->thread.debugctlmsr & ~DEBUGCTLMSR_BTF);
+ if (test_tsk_thread_flag(child, TIF_BLOCKSTEP)) {
+ unsigned long debugctl = get_debugctlmsr();
- if (!child->thread.debugctlmsr)
- clear_tsk_thread_flag(child, TIF_DEBUGCTLMSR);
+ debugctl &= ~DEBUGCTLMSR_BTF;
+ update_debugctlmsr(debugctl);
+ clear_tsk_thread_flag(child, TIF_BLOCKSTEP);
+ }
/* Always clear TIF_SINGLESTEP... */
clear_tsk_thread_flag(child, TIF_SINGLESTEP);
diff --git a/arch/x86/kernel/syscall_table_32.S b/arch/x86/kernel/syscall_table_32.S
index 8b3729341216..b35786dc9b8f 100644
--- a/arch/x86/kernel/syscall_table_32.S
+++ b/arch/x86/kernel/syscall_table_32.S
@@ -337,3 +337,6 @@ ENTRY(sys_call_table)
.long sys_rt_tgsigqueueinfo /* 335 */
.long sys_perf_event_open
.long sys_recvmmsg
+ .long sys_fanotify_init
+ .long sys_fanotify_mark
+ .long sys_prlimit64 /* 340 */
diff --git a/arch/x86/kernel/tboot.c b/arch/x86/kernel/tboot.c
index 86c9f91b48ae..c2f1b26141e2 100644
--- a/arch/x86/kernel/tboot.c
+++ b/arch/x86/kernel/tboot.c
@@ -46,6 +46,7 @@
/* Global pointer to shared data; NULL means no measured launch. */
struct tboot *tboot __read_mostly;
+EXPORT_SYMBOL(tboot);
/* timeout for APs (in secs) to enter wait-for-SIPI state during shutdown */
#define AP_WAIT_TIMEOUT 1
@@ -175,6 +176,9 @@ static void add_mac_region(phys_addr_t start, unsigned long size)
struct tboot_mac_region *mr;
phys_addr_t end = start + size;
+ if (tboot->num_mac_regions >= MAX_TB_MAC_REGIONS)
+ panic("tboot: Too many MAC regions\n");
+
if (start && size) {
mr = &tboot->mac_regions[tboot->num_mac_regions++];
mr->start = round_down(start, PAGE_SIZE);
@@ -184,18 +188,17 @@ static void add_mac_region(phys_addr_t start, unsigned long size)
static int tboot_setup_sleep(void)
{
+ int i;
+
tboot->num_mac_regions = 0;
- /* S3 resume code */
- add_mac_region(acpi_wakeup_address, WAKEUP_SIZE);
+ for (i = 0; i < e820.nr_map; i++) {
+ if ((e820.map[i].type != E820_RAM)
+ && (e820.map[i].type != E820_RESERVED_KERN))
+ continue;
-#ifdef CONFIG_X86_TRAMPOLINE
- /* AP trampoline code */
- add_mac_region(virt_to_phys(trampoline_base), TRAMPOLINE_SIZE);
-#endif
-
- /* kernel code + data + bss */
- add_mac_region(virt_to_phys(_text), _end - _text);
+ add_mac_region(e820.map[i].addr, e820.map[i].size);
+ }
tboot->acpi_sinfo.kernel_s3_resume_vector = acpi_wakeup_address;
diff --git a/arch/x86/kernel/tlb_uv.c b/arch/x86/kernel/tlb_uv.c
index 17b03dd3a6b5..7fea555929e2 100644
--- a/arch/x86/kernel/tlb_uv.c
+++ b/arch/x86/kernel/tlb_uv.c
@@ -1,7 +1,7 @@
/*
* SGI UltraViolet TLB flush routines.
*
- * (c) 2008 Cliff Wickman <cpw@sgi.com>, SGI.
+ * (c) 2008-2010 Cliff Wickman <cpw@sgi.com>, SGI.
*
* This code is released under the GNU General Public License version 2 or
* later.
@@ -20,42 +20,67 @@
#include <asm/idle.h>
#include <asm/tsc.h>
#include <asm/irq_vectors.h>
+#include <asm/timer.h>
-static struct bau_control **uv_bau_table_bases __read_mostly;
-static int uv_bau_retry_limit __read_mostly;
+struct msg_desc {
+ struct bau_payload_queue_entry *msg;
+ int msg_slot;
+ int sw_ack_slot;
+ struct bau_payload_queue_entry *va_queue_first;
+ struct bau_payload_queue_entry *va_queue_last;
+};
-/* base pnode in this partition */
-static int uv_partition_base_pnode __read_mostly;
+#define UV_INTD_SOFT_ACK_TIMEOUT_PERIOD 0x000000000bUL
+
+static int uv_bau_max_concurrent __read_mostly;
+
+static int nobau;
+static int __init setup_nobau(char *arg)
+{
+ nobau = 1;
+ return 0;
+}
+early_param("nobau", setup_nobau);
-static unsigned long uv_mmask __read_mostly;
+/* base pnode in this partition */
+static int uv_partition_base_pnode __read_mostly;
+/* position of pnode (which is nasid>>1): */
+static int uv_nshift __read_mostly;
+static unsigned long uv_mmask __read_mostly;
static DEFINE_PER_CPU(struct ptc_stats, ptcstats);
static DEFINE_PER_CPU(struct bau_control, bau_control);
+static DEFINE_PER_CPU(cpumask_var_t, uv_flush_tlb_mask);
+
+struct reset_args {
+ int sender;
+};
/*
- * Determine the first node on a blade.
+ * Determine the first node on a uvhub. 'Nodes' are used for kernel
+ * memory allocation.
*/
-static int __init blade_to_first_node(int blade)
+static int __init uvhub_to_first_node(int uvhub)
{
int node, b;
for_each_online_node(node) {
b = uv_node_to_blade_id(node);
- if (blade == b)
+ if (uvhub == b)
return node;
}
- return -1; /* shouldn't happen */
+ return -1;
}
/*
- * Determine the apicid of the first cpu on a blade.
+ * Determine the apicid of the first cpu on a uvhub.
*/
-static int __init blade_to_first_apicid(int blade)
+static int __init uvhub_to_first_apicid(int uvhub)
{
int cpu;
for_each_present_cpu(cpu)
- if (blade == uv_cpu_to_blade_id(cpu))
+ if (uvhub == uv_cpu_to_blade_id(cpu))
return per_cpu(x86_cpu_to_apicid, cpu);
return -1;
}
@@ -68,195 +93,459 @@ static int __init blade_to_first_apicid(int blade)
* clear of the Timeout bit (as well) will free the resource. No reply will
* be sent (the hardware will only do one reply per message).
*/
-static void uv_reply_to_message(int resource,
- struct bau_payload_queue_entry *msg,
- struct bau_msg_status *msp)
+static inline void uv_reply_to_message(struct msg_desc *mdp,
+ struct bau_control *bcp)
{
unsigned long dw;
+ struct bau_payload_queue_entry *msg;
- dw = (1 << (resource + UV_SW_ACK_NPENDING)) | (1 << resource);
+ msg = mdp->msg;
+ if (!msg->canceled) {
+ dw = (msg->sw_ack_vector << UV_SW_ACK_NPENDING) |
+ msg->sw_ack_vector;
+ uv_write_local_mmr(
+ UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_ALIAS, dw);
+ }
msg->replied_to = 1;
msg->sw_ack_vector = 0;
- if (msp)
- msp->seen_by.bits = 0;
- uv_write_local_mmr(UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_ALIAS, dw);
}
/*
- * Do all the things a cpu should do for a TLB shootdown message.
- * Other cpu's may come here at the same time for this message.
+ * Process the receipt of a RETRY message
*/
-static void uv_bau_process_message(struct bau_payload_queue_entry *msg,
- int msg_slot, int sw_ack_slot)
+static inline void uv_bau_process_retry_msg(struct msg_desc *mdp,
+ struct bau_control *bcp)
{
- unsigned long this_cpu_mask;
- struct bau_msg_status *msp;
- int cpu;
+ int i;
+ int cancel_count = 0;
+ int slot2;
+ unsigned long msg_res;
+ unsigned long mmr = 0;
+ struct bau_payload_queue_entry *msg;
+ struct bau_payload_queue_entry *msg2;
+ struct ptc_stats *stat;
- msp = __get_cpu_var(bau_control).msg_statuses + msg_slot;
- cpu = uv_blade_processor_id();
- msg->number_of_cpus =
- uv_blade_nr_online_cpus(uv_node_to_blade_id(numa_node_id()));
- this_cpu_mask = 1UL << cpu;
- if (msp->seen_by.bits & this_cpu_mask)
- return;
- atomic_or_long(&msp->seen_by.bits, this_cpu_mask);
+ msg = mdp->msg;
+ stat = &per_cpu(ptcstats, bcp->cpu);
+ stat->d_retries++;
+ /*
+ * cancel any message from msg+1 to the retry itself
+ */
+ for (msg2 = msg+1, i = 0; i < DEST_Q_SIZE; msg2++, i++) {
+ if (msg2 > mdp->va_queue_last)
+ msg2 = mdp->va_queue_first;
+ if (msg2 == msg)
+ break;
+
+ /* same conditions for cancellation as uv_do_reset */
+ if ((msg2->replied_to == 0) && (msg2->canceled == 0) &&
+ (msg2->sw_ack_vector) && ((msg2->sw_ack_vector &
+ msg->sw_ack_vector) == 0) &&
+ (msg2->sending_cpu == msg->sending_cpu) &&
+ (msg2->msg_type != MSG_NOOP)) {
+ slot2 = msg2 - mdp->va_queue_first;
+ mmr = uv_read_local_mmr
+ (UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE);
+ msg_res = ((msg2->sw_ack_vector << 8) |
+ msg2->sw_ack_vector);
+ /*
+ * This is a message retry; clear the resources held
+ * by the previous message only if they timed out.
+ * If it has not timed out we have an unexpected
+ * situation to report.
+ */
+ if (mmr & (msg_res << 8)) {
+ /*
+ * is the resource timed out?
+ * make everyone ignore the cancelled message.
+ */
+ msg2->canceled = 1;
+ stat->d_canceled++;
+ cancel_count++;
+ uv_write_local_mmr(
+ UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_ALIAS,
+ (msg_res << 8) | msg_res);
+ } else
+ printk(KERN_INFO "note bau retry: no effect\n");
+ }
+ }
+ if (!cancel_count)
+ stat->d_nocanceled++;
+}
- if (msg->replied_to == 1)
- return;
+/*
+ * Do all the things a cpu should do for a TLB shootdown message.
+ * Other cpu's may come here at the same time for this message.
+ */
+static void uv_bau_process_message(struct msg_desc *mdp,
+ struct bau_control *bcp)
+{
+ int msg_ack_count;
+ short socket_ack_count = 0;
+ struct ptc_stats *stat;
+ struct bau_payload_queue_entry *msg;
+ struct bau_control *smaster = bcp->socket_master;
+ /*
+ * This must be a normal message, or retry of a normal message
+ */
+ msg = mdp->msg;
+ stat = &per_cpu(ptcstats, bcp->cpu);
if (msg->address == TLB_FLUSH_ALL) {
local_flush_tlb();
- __get_cpu_var(ptcstats).alltlb++;
+ stat->d_alltlb++;
} else {
__flush_tlb_one(msg->address);
- __get_cpu_var(ptcstats).onetlb++;
+ stat->d_onetlb++;
}
+ stat->d_requestee++;
+
+ /*
+ * One cpu on each uvhub has the additional job on a RETRY
+ * of releasing the resource held by the message that is
+ * being retried. That message is identified by sending
+ * cpu number.
+ */
+ if (msg->msg_type == MSG_RETRY && bcp == bcp->uvhub_master)
+ uv_bau_process_retry_msg(mdp, bcp);
- __get_cpu_var(ptcstats).requestee++;
+ /*
+ * This is a sw_ack message, so we have to reply to it.
+ * Count each responding cpu on the socket. This avoids
+ * pinging the count's cache line back and forth between
+ * the sockets.
+ */
+ socket_ack_count = atomic_add_short_return(1, (struct atomic_short *)
+ &smaster->socket_acknowledge_count[mdp->msg_slot]);
+ if (socket_ack_count == bcp->cpus_in_socket) {
+ /*
+ * Both sockets dump their completed count total into
+ * the message's count.
+ */
+ smaster->socket_acknowledge_count[mdp->msg_slot] = 0;
+ msg_ack_count = atomic_add_short_return(socket_ack_count,
+ (struct atomic_short *)&msg->acknowledge_count);
+
+ if (msg_ack_count == bcp->cpus_in_uvhub) {
+ /*
+ * All cpus in uvhub saw it; reply
+ */
+ uv_reply_to_message(mdp, bcp);
+ }
+ }
- atomic_inc_short(&msg->acknowledge_count);
- if (msg->number_of_cpus == msg->acknowledge_count)
- uv_reply_to_message(sw_ack_slot, msg, msp);
+ return;
}
/*
- * Examine the payload queue on one distribution node to see
- * which messages have not been seen, and which cpu(s) have not seen them.
+ * Determine the first cpu on a uvhub.
+ */
+static int uvhub_to_first_cpu(int uvhub)
+{
+ int cpu;
+ for_each_present_cpu(cpu)
+ if (uvhub == uv_cpu_to_blade_id(cpu))
+ return cpu;
+ return -1;
+}
+
+/*
+ * Last resort when we get a large number of destination timeouts is
+ * to clear resources held by a given cpu.
+ * Do this with IPI so that all messages in the BAU message queue
+ * can be identified by their nonzero sw_ack_vector field.
*
- * Returns the number of cpu's that have not responded.
+ * This is entered for a single cpu on the uvhub.
+ * The sender want's this uvhub to free a specific message's
+ * sw_ack resources.
*/
-static int uv_examine_destination(struct bau_control *bau_tablesp, int sender)
+static void
+uv_do_reset(void *ptr)
{
- struct bau_payload_queue_entry *msg;
- struct bau_msg_status *msp;
- int count = 0;
int i;
- int j;
+ int slot;
+ int count = 0;
+ unsigned long mmr;
+ unsigned long msg_res;
+ struct bau_control *bcp;
+ struct reset_args *rap;
+ struct bau_payload_queue_entry *msg;
+ struct ptc_stats *stat;
- for (msg = bau_tablesp->va_queue_first, i = 0; i < DEST_Q_SIZE;
- msg++, i++) {
- if ((msg->sending_cpu == sender) && (!msg->replied_to)) {
- msp = bau_tablesp->msg_statuses + i;
- printk(KERN_DEBUG
- "blade %d: address:%#lx %d of %d, not cpu(s): ",
- i, msg->address, msg->acknowledge_count,
- msg->number_of_cpus);
- for (j = 0; j < msg->number_of_cpus; j++) {
- if (!((1L << j) & msp->seen_by.bits)) {
- count++;
- printk("%d ", j);
- }
+ bcp = &per_cpu(bau_control, smp_processor_id());
+ rap = (struct reset_args *)ptr;
+ stat = &per_cpu(ptcstats, bcp->cpu);
+ stat->d_resets++;
+
+ /*
+ * We're looking for the given sender, and
+ * will free its sw_ack resource.
+ * If all cpu's finally responded after the timeout, its
+ * message 'replied_to' was set.
+ */
+ for (msg = bcp->va_queue_first, i = 0; i < DEST_Q_SIZE; msg++, i++) {
+ /* uv_do_reset: same conditions for cancellation as
+ uv_bau_process_retry_msg() */
+ if ((msg->replied_to == 0) &&
+ (msg->canceled == 0) &&
+ (msg->sending_cpu == rap->sender) &&
+ (msg->sw_ack_vector) &&
+ (msg->msg_type != MSG_NOOP)) {
+ /*
+ * make everyone else ignore this message
+ */
+ msg->canceled = 1;
+ slot = msg - bcp->va_queue_first;
+ count++;
+ /*
+ * only reset the resource if it is still pending
+ */
+ mmr = uv_read_local_mmr
+ (UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE);
+ msg_res = ((msg->sw_ack_vector << 8) |
+ msg->sw_ack_vector);
+ if (mmr & msg_res) {
+ stat->d_rcanceled++;
+ uv_write_local_mmr(
+ UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_ALIAS,
+ msg_res);
}
- printk("\n");
}
}
- return count;
+ return;
}
/*
- * Examine the payload queue on all the distribution nodes to see
- * which messages have not been seen, and which cpu(s) have not seen them.
- *
- * Returns the number of cpu's that have not responded.
+ * Use IPI to get all target uvhubs to release resources held by
+ * a given sending cpu number.
*/
-static int uv_examine_destinations(struct bau_target_nodemask *distribution)
+static void uv_reset_with_ipi(struct bau_target_uvhubmask *distribution,
+ int sender)
{
- int sender;
- int i;
- int count = 0;
+ int uvhub;
+ int cpu;
+ cpumask_t mask;
+ struct reset_args reset_args;
+
+ reset_args.sender = sender;
- sender = smp_processor_id();
- for (i = 0; i < sizeof(struct bau_target_nodemask) * BITSPERBYTE; i++) {
- if (!bau_node_isset(i, distribution))
+ cpus_clear(mask);
+ /* find a single cpu for each uvhub in this distribution mask */
+ for (uvhub = 0;
+ uvhub < sizeof(struct bau_target_uvhubmask) * BITSPERBYTE;
+ uvhub++) {
+ if (!bau_uvhub_isset(uvhub, distribution))
continue;
- count += uv_examine_destination(uv_bau_table_bases[i], sender);
+ /* find a cpu for this uvhub */
+ cpu = uvhub_to_first_cpu(uvhub);
+ cpu_set(cpu, mask);
}
- return count;
+ /* IPI all cpus; Preemption is already disabled */
+ smp_call_function_many(&mask, uv_do_reset, (void *)&reset_args, 1);
+ return;
+}
+
+static inline unsigned long
+cycles_2_us(unsigned long long cyc)
+{
+ unsigned long long ns;
+ unsigned long us;
+ ns = (cyc * per_cpu(cyc2ns, smp_processor_id()))
+ >> CYC2NS_SCALE_FACTOR;
+ us = ns / 1000;
+ return us;
}
/*
- * wait for completion of a broadcast message
- *
- * return COMPLETE, RETRY or GIVEUP
+ * wait for all cpus on this hub to finish their sends and go quiet
+ * leaves uvhub_quiesce set so that no new broadcasts are started by
+ * bau_flush_send_and_wait()
+ */
+static inline void
+quiesce_local_uvhub(struct bau_control *hmaster)
+{
+ atomic_add_short_return(1, (struct atomic_short *)
+ &hmaster->uvhub_quiesce);
+}
+
+/*
+ * mark this quiet-requestor as done
+ */
+static inline void
+end_uvhub_quiesce(struct bau_control *hmaster)
+{
+ atomic_add_short_return(-1, (struct atomic_short *)
+ &hmaster->uvhub_quiesce);
+}
+
+/*
+ * Wait for completion of a broadcast software ack message
+ * return COMPLETE, RETRY(PLUGGED or TIMEOUT) or GIVEUP
*/
static int uv_wait_completion(struct bau_desc *bau_desc,
- unsigned long mmr_offset, int right_shift)
+ unsigned long mmr_offset, int right_shift, int this_cpu,
+ struct bau_control *bcp, struct bau_control *smaster, long try)
{
- int exams = 0;
- long destination_timeouts = 0;
- long source_timeouts = 0;
+ int relaxes = 0;
unsigned long descriptor_status;
+ unsigned long mmr;
+ unsigned long mask;
+ cycles_t ttime;
+ cycles_t timeout_time;
+ struct ptc_stats *stat = &per_cpu(ptcstats, this_cpu);
+ struct bau_control *hmaster;
+
+ hmaster = bcp->uvhub_master;
+ timeout_time = get_cycles() + bcp->timeout_interval;
+ /* spin on the status MMR, waiting for it to go idle */
while ((descriptor_status = (((unsigned long)
uv_read_local_mmr(mmr_offset) >>
right_shift) & UV_ACT_STATUS_MASK)) !=
DESC_STATUS_IDLE) {
- if (descriptor_status == DESC_STATUS_SOURCE_TIMEOUT) {
- source_timeouts++;
- if (source_timeouts > SOURCE_TIMEOUT_LIMIT)
- source_timeouts = 0;
- __get_cpu_var(ptcstats).s_retry++;
- return FLUSH_RETRY;
- }
/*
- * spin here looking for progress at the destinations
+ * Our software ack messages may be blocked because there are
+ * no swack resources available. As long as none of them
+ * has timed out hardware will NACK our message and its
+ * state will stay IDLE.
*/
- if (descriptor_status == DESC_STATUS_DESTINATION_TIMEOUT) {
- destination_timeouts++;
- if (destination_timeouts > DESTINATION_TIMEOUT_LIMIT) {
- /*
- * returns number of cpus not responding
- */
- if (uv_examine_destinations
- (&bau_desc->distribution) == 0) {
- __get_cpu_var(ptcstats).d_retry++;
- return FLUSH_RETRY;
- }
- exams++;
- if (exams >= uv_bau_retry_limit) {
- printk(KERN_DEBUG
- "uv_flush_tlb_others");
- printk("giving up on cpu %d\n",
- smp_processor_id());
+ if (descriptor_status == DESC_STATUS_SOURCE_TIMEOUT) {
+ stat->s_stimeout++;
+ return FLUSH_GIVEUP;
+ } else if (descriptor_status ==
+ DESC_STATUS_DESTINATION_TIMEOUT) {
+ stat->s_dtimeout++;
+ ttime = get_cycles();
+
+ /*
+ * Our retries may be blocked by all destination
+ * swack resources being consumed, and a timeout
+ * pending. In that case hardware returns the
+ * ERROR that looks like a destination timeout.
+ */
+ if (cycles_2_us(ttime - bcp->send_message) < BIOS_TO) {
+ bcp->conseccompletes = 0;
+ return FLUSH_RETRY_PLUGGED;
+ }
+
+ bcp->conseccompletes = 0;
+ return FLUSH_RETRY_TIMEOUT;
+ } else {
+ /*
+ * descriptor_status is still BUSY
+ */
+ cpu_relax();
+ relaxes++;
+ if (relaxes >= 10000) {
+ relaxes = 0;
+ if (get_cycles() > timeout_time) {
+ quiesce_local_uvhub(hmaster);
+
+ /* single-thread the register change */
+ spin_lock(&hmaster->masks_lock);
+ mmr = uv_read_local_mmr(mmr_offset);
+ mask = 0UL;
+ mask |= (3UL < right_shift);
+ mask = ~mask;
+ mmr &= mask;
+ uv_write_local_mmr(mmr_offset, mmr);
+ spin_unlock(&hmaster->masks_lock);
+ end_uvhub_quiesce(hmaster);
+ stat->s_busy++;
return FLUSH_GIVEUP;
}
- /*
- * delays can hang the simulator
- udelay(1000);
- */
- destination_timeouts = 0;
}
}
- cpu_relax();
}
+ bcp->conseccompletes++;
return FLUSH_COMPLETE;
}
+static inline cycles_t
+sec_2_cycles(unsigned long sec)
+{
+ unsigned long ns;
+ cycles_t cyc;
+
+ ns = sec * 1000000000;
+ cyc = (ns << CYC2NS_SCALE_FACTOR)/(per_cpu(cyc2ns, smp_processor_id()));
+ return cyc;
+}
+
+/*
+ * conditionally add 1 to *v, unless *v is >= u
+ * return 0 if we cannot add 1 to *v because it is >= u
+ * return 1 if we can add 1 to *v because it is < u
+ * the add is atomic
+ *
+ * This is close to atomic_add_unless(), but this allows the 'u' value
+ * to be lowered below the current 'v'. atomic_add_unless can only stop
+ * on equal.
+ */
+static inline int atomic_inc_unless_ge(spinlock_t *lock, atomic_t *v, int u)
+{
+ spin_lock(lock);
+ if (atomic_read(v) >= u) {
+ spin_unlock(lock);
+ return 0;
+ }
+ atomic_inc(v);
+ spin_unlock(lock);
+ return 1;
+}
+
/**
* uv_flush_send_and_wait
*
- * Send a broadcast and wait for a broadcast message to complete.
+ * Send a broadcast and wait for it to complete.
*
- * The flush_mask contains the cpus the broadcast was sent to.
+ * The flush_mask contains the cpus the broadcast is to be sent to, plus
+ * cpus that are on the local uvhub.
*
- * Returns NULL if all remote flushing was done. The mask is zeroed.
+ * Returns NULL if all flushing represented in the mask was done. The mask
+ * is zeroed.
* Returns @flush_mask if some remote flushing remains to be done. The
- * mask will have some bits still set.
+ * mask will have some bits still set, representing any cpus on the local
+ * uvhub (not current cpu) and any on remote uvhubs if the broadcast failed.
*/
-const struct cpumask *uv_flush_send_and_wait(int cpu, int this_pnode,
- struct bau_desc *bau_desc,
- struct cpumask *flush_mask)
+const struct cpumask *uv_flush_send_and_wait(struct bau_desc *bau_desc,
+ struct cpumask *flush_mask,
+ struct bau_control *bcp)
{
- int completion_status = 0;
int right_shift;
- int tries = 0;
- int pnode;
+ int uvhub;
int bit;
+ int completion_status = 0;
+ int seq_number = 0;
+ long try = 0;
+ int cpu = bcp->uvhub_cpu;
+ int this_cpu = bcp->cpu;
+ int this_uvhub = bcp->uvhub;
unsigned long mmr_offset;
unsigned long index;
cycles_t time1;
cycles_t time2;
+ struct ptc_stats *stat = &per_cpu(ptcstats, bcp->cpu);
+ struct bau_control *smaster = bcp->socket_master;
+ struct bau_control *hmaster = bcp->uvhub_master;
+
+ /*
+ * Spin here while there are hmaster->max_concurrent or more active
+ * descriptors. This is the per-uvhub 'throttle'.
+ */
+ if (!atomic_inc_unless_ge(&hmaster->uvhub_lock,
+ &hmaster->active_descriptor_count,
+ hmaster->max_concurrent)) {
+ stat->s_throttles++;
+ do {
+ cpu_relax();
+ } while (!atomic_inc_unless_ge(&hmaster->uvhub_lock,
+ &hmaster->active_descriptor_count,
+ hmaster->max_concurrent));
+ }
+
+ while (hmaster->uvhub_quiesce)
+ cpu_relax();
if (cpu < UV_CPUS_PER_ACT_STATUS) {
mmr_offset = UVH_LB_BAU_SB_ACTIVATION_STATUS_0;
@@ -268,24 +557,108 @@ const struct cpumask *uv_flush_send_and_wait(int cpu, int this_pnode,
}
time1 = get_cycles();
do {
- tries++;
+ /*
+ * Every message from any given cpu gets a unique message
+ * sequence number. But retries use that same number.
+ * Our message may have timed out at the destination because
+ * all sw-ack resources are in use and there is a timeout
+ * pending there. In that case, our last send never got
+ * placed into the queue and we need to persist until it
+ * does.
+ *
+ * Make any retry a type MSG_RETRY so that the destination will
+ * free any resource held by a previous message from this cpu.
+ */
+ if (try == 0) {
+ /* use message type set by the caller the first time */
+ seq_number = bcp->message_number++;
+ } else {
+ /* use RETRY type on all the rest; same sequence */
+ bau_desc->header.msg_type = MSG_RETRY;
+ stat->s_retry_messages++;
+ }
+ bau_desc->header.sequence = seq_number;
index = (1UL << UVH_LB_BAU_SB_ACTIVATION_CONTROL_PUSH_SHFT) |
- cpu;
+ bcp->uvhub_cpu;
+ bcp->send_message = get_cycles();
+
uv_write_local_mmr(UVH_LB_BAU_SB_ACTIVATION_CONTROL, index);
+
+ try++;
completion_status = uv_wait_completion(bau_desc, mmr_offset,
- right_shift);
- } while (completion_status == FLUSH_RETRY);
+ right_shift, this_cpu, bcp, smaster, try);
+
+ if (completion_status == FLUSH_RETRY_PLUGGED) {
+ /*
+ * Our retries may be blocked by all destination swack
+ * resources being consumed, and a timeout pending. In
+ * that case hardware immediately returns the ERROR
+ * that looks like a destination timeout.
+ */
+ udelay(TIMEOUT_DELAY);
+ bcp->plugged_tries++;
+ if (bcp->plugged_tries >= PLUGSB4RESET) {
+ bcp->plugged_tries = 0;
+ quiesce_local_uvhub(hmaster);
+ spin_lock(&hmaster->queue_lock);
+ uv_reset_with_ipi(&bau_desc->distribution,
+ this_cpu);
+ spin_unlock(&hmaster->queue_lock);
+ end_uvhub_quiesce(hmaster);
+ bcp->ipi_attempts++;
+ stat->s_resets_plug++;
+ }
+ } else if (completion_status == FLUSH_RETRY_TIMEOUT) {
+ hmaster->max_concurrent = 1;
+ bcp->timeout_tries++;
+ udelay(TIMEOUT_DELAY);
+ if (bcp->timeout_tries >= TIMEOUTSB4RESET) {
+ bcp->timeout_tries = 0;
+ quiesce_local_uvhub(hmaster);
+ spin_lock(&hmaster->queue_lock);
+ uv_reset_with_ipi(&bau_desc->distribution,
+ this_cpu);
+ spin_unlock(&hmaster->queue_lock);
+ end_uvhub_quiesce(hmaster);
+ bcp->ipi_attempts++;
+ stat->s_resets_timeout++;
+ }
+ }
+ if (bcp->ipi_attempts >= 3) {
+ bcp->ipi_attempts = 0;
+ completion_status = FLUSH_GIVEUP;
+ break;
+ }
+ cpu_relax();
+ } while ((completion_status == FLUSH_RETRY_PLUGGED) ||
+ (completion_status == FLUSH_RETRY_TIMEOUT));
time2 = get_cycles();
- __get_cpu_var(ptcstats).sflush += (time2 - time1);
- if (tries > 1)
- __get_cpu_var(ptcstats).retriesok++;
- if (completion_status == FLUSH_GIVEUP) {
+ if ((completion_status == FLUSH_COMPLETE) && (bcp->conseccompletes > 5)
+ && (hmaster->max_concurrent < hmaster->max_concurrent_constant))
+ hmaster->max_concurrent++;
+
+ /*
+ * hold any cpu not timing out here; no other cpu currently held by
+ * the 'throttle' should enter the activation code
+ */
+ while (hmaster->uvhub_quiesce)
+ cpu_relax();
+ atomic_dec(&hmaster->active_descriptor_count);
+
+ /* guard against cycles wrap */
+ if (time2 > time1)
+ stat->s_time += (time2 - time1);
+ else
+ stat->s_requestor--; /* don't count this one */
+ if (completion_status == FLUSH_COMPLETE && try > 1)
+ stat->s_retriesok++;
+ else if (completion_status == FLUSH_GIVEUP) {
/*
* Cause the caller to do an IPI-style TLB shootdown on
- * the cpu's, all of which are still in the mask.
+ * the target cpu's, all of which are still in the mask.
*/
- __get_cpu_var(ptcstats).ptc_i++;
+ stat->s_giveup++;
return flush_mask;
}
@@ -294,18 +667,17 @@ const struct cpumask *uv_flush_send_and_wait(int cpu, int this_pnode,
* use the IPI method of shootdown on them.
*/
for_each_cpu(bit, flush_mask) {
- pnode = uv_cpu_to_pnode(bit);
- if (pnode == this_pnode)
+ uvhub = uv_cpu_to_blade_id(bit);
+ if (uvhub == this_uvhub)
continue;
cpumask_clear_cpu(bit, flush_mask);
}
if (!cpumask_empty(flush_mask))
return flush_mask;
+
return NULL;
}
-static DEFINE_PER_CPU(cpumask_var_t, uv_flush_tlb_mask);
-
/**
* uv_flush_tlb_others - globally purge translation cache of a virtual
* address or all TLB's
@@ -322,8 +694,8 @@ static DEFINE_PER_CPU(cpumask_var_t, uv_flush_tlb_mask);
* The caller has derived the cpumask from the mm_struct. This function
* is called only if there are bits set in the mask. (e.g. flush_tlb_page())
*
- * The cpumask is converted into a nodemask of the nodes containing
- * the cpus.
+ * The cpumask is converted into a uvhubmask of the uvhubs containing
+ * those cpus.
*
* Note that this function should be called with preemption disabled.
*
@@ -335,52 +707,82 @@ const struct cpumask *uv_flush_tlb_others(const struct cpumask *cpumask,
struct mm_struct *mm,
unsigned long va, unsigned int cpu)
{
- struct cpumask *flush_mask = __get_cpu_var(uv_flush_tlb_mask);
- int i;
- int bit;
- int pnode;
- int uv_cpu;
- int this_pnode;
+ int remotes;
+ int tcpu;
+ int uvhub;
int locals = 0;
struct bau_desc *bau_desc;
+ struct cpumask *flush_mask;
+ struct ptc_stats *stat;
+ struct bau_control *bcp;
- cpumask_andnot(flush_mask, cpumask, cpumask_of(cpu));
+ if (nobau)
+ return cpumask;
- uv_cpu = uv_blade_processor_id();
- this_pnode = uv_hub_info->pnode;
- bau_desc = __get_cpu_var(bau_control).descriptor_base;
- bau_desc += UV_ITEMS_PER_DESCRIPTOR * uv_cpu;
+ bcp = &per_cpu(bau_control, cpu);
+ /*
+ * Each sending cpu has a per-cpu mask which it fills from the caller's
+ * cpu mask. Only remote cpus are converted to uvhubs and copied.
+ */
+ flush_mask = (struct cpumask *)per_cpu(uv_flush_tlb_mask, cpu);
+ /*
+ * copy cpumask to flush_mask, removing current cpu
+ * (current cpu should already have been flushed by the caller and
+ * should never be returned if we return flush_mask)
+ */
+ cpumask_andnot(flush_mask, cpumask, cpumask_of(cpu));
+ if (cpu_isset(cpu, *cpumask))
+ locals++; /* current cpu was targeted */
- bau_nodes_clear(&bau_desc->distribution, UV_DISTRIBUTION_SIZE);
+ bau_desc = bcp->descriptor_base;
+ bau_desc += UV_ITEMS_PER_DESCRIPTOR * bcp->uvhub_cpu;
- i = 0;
- for_each_cpu(bit, flush_mask) {
- pnode = uv_cpu_to_pnode(bit);
- BUG_ON(pnode > (UV_DISTRIBUTION_SIZE - 1));
- if (pnode == this_pnode) {
+ bau_uvhubs_clear(&bau_desc->distribution, UV_DISTRIBUTION_SIZE);
+ remotes = 0;
+ for_each_cpu(tcpu, flush_mask) {
+ uvhub = uv_cpu_to_blade_id(tcpu);
+ if (uvhub == bcp->uvhub) {
locals++;
continue;
}
- bau_node_set(pnode - uv_partition_base_pnode,
- &bau_desc->distribution);
- i++;
+ bau_uvhub_set(uvhub, &bau_desc->distribution);
+ remotes++;
}
- if (i == 0) {
+ if (remotes == 0) {
/*
- * no off_node flushing; return status for local node
+ * No off_hub flushing; return status for local hub.
+ * Return the caller's mask if all were local (the current
+ * cpu may be in that mask).
*/
if (locals)
- return flush_mask;
+ return cpumask;
else
return NULL;
}
- __get_cpu_var(ptcstats).requestor++;
- __get_cpu_var(ptcstats).ntargeted += i;
+ stat = &per_cpu(ptcstats, cpu);
+ stat->s_requestor++;
+ stat->s_ntargcpu += remotes;
+ remotes = bau_uvhub_weight(&bau_desc->distribution);
+ stat->s_ntarguvhub += remotes;
+ if (remotes >= 16)
+ stat->s_ntarguvhub16++;
+ else if (remotes >= 8)
+ stat->s_ntarguvhub8++;
+ else if (remotes >= 4)
+ stat->s_ntarguvhub4++;
+ else if (remotes >= 2)
+ stat->s_ntarguvhub2++;
+ else
+ stat->s_ntarguvhub1++;
bau_desc->payload.address = va;
bau_desc->payload.sending_cpu = cpu;
- return uv_flush_send_and_wait(uv_cpu, this_pnode, bau_desc, flush_mask);
+ /*
+ * uv_flush_send_and_wait returns null if all cpu's were messaged, or
+ * the adjusted flush_mask if any cpu's were not messaged.
+ */
+ return uv_flush_send_and_wait(bau_desc, flush_mask, bcp);
}
/*
@@ -389,87 +791,70 @@ const struct cpumask *uv_flush_tlb_others(const struct cpumask *cpumask,
*
* We received a broadcast assist message.
*
- * Interrupts may have been disabled; this interrupt could represent
+ * Interrupts are disabled; this interrupt could represent
* the receipt of several messages.
*
- * All cores/threads on this node get this interrupt.
- * The last one to see it does the s/w ack.
+ * All cores/threads on this hub get this interrupt.
+ * The last one to see it does the software ack.
* (the resource will not be freed until noninterruptable cpus see this
- * interrupt; hardware will timeout the s/w ack and reply ERROR)
+ * interrupt; hardware may timeout the s/w ack and reply ERROR)
*/
void uv_bau_message_interrupt(struct pt_regs *regs)
{
- struct bau_payload_queue_entry *va_queue_first;
- struct bau_payload_queue_entry *va_queue_last;
- struct bau_payload_queue_entry *msg;
- struct pt_regs *old_regs = set_irq_regs(regs);
- cycles_t time1;
- cycles_t time2;
- int msg_slot;
- int sw_ack_slot;
- int fw;
int count = 0;
- unsigned long local_pnode;
-
- ack_APIC_irq();
- exit_idle();
- irq_enter();
-
- time1 = get_cycles();
-
- local_pnode = uv_blade_to_pnode(uv_numa_blade_id());
-
- va_queue_first = __get_cpu_var(bau_control).va_queue_first;
- va_queue_last = __get_cpu_var(bau_control).va_queue_last;
-
- msg = __get_cpu_var(bau_control).bau_msg_head;
+ cycles_t time_start;
+ struct bau_payload_queue_entry *msg;
+ struct bau_control *bcp;
+ struct ptc_stats *stat;
+ struct msg_desc msgdesc;
+
+ time_start = get_cycles();
+ bcp = &per_cpu(bau_control, smp_processor_id());
+ stat = &per_cpu(ptcstats, smp_processor_id());
+ msgdesc.va_queue_first = bcp->va_queue_first;
+ msgdesc.va_queue_last = bcp->va_queue_last;
+ msg = bcp->bau_msg_head;
while (msg->sw_ack_vector) {
count++;
- fw = msg->sw_ack_vector;
- msg_slot = msg - va_queue_first;
- sw_ack_slot = ffs(fw) - 1;
-
- uv_bau_process_message(msg, msg_slot, sw_ack_slot);
-
+ msgdesc.msg_slot = msg - msgdesc.va_queue_first;
+ msgdesc.sw_ack_slot = ffs(msg->sw_ack_vector) - 1;
+ msgdesc.msg = msg;
+ uv_bau_process_message(&msgdesc, bcp);
msg++;
- if (msg > va_queue_last)
- msg = va_queue_first;
- __get_cpu_var(bau_control).bau_msg_head = msg;
+ if (msg > msgdesc.va_queue_last)
+ msg = msgdesc.va_queue_first;
+ bcp->bau_msg_head = msg;
}
+ stat->d_time += (get_cycles() - time_start);
if (!count)
- __get_cpu_var(ptcstats).nomsg++;
+ stat->d_nomsg++;
else if (count > 1)
- __get_cpu_var(ptcstats).multmsg++;
-
- time2 = get_cycles();
- __get_cpu_var(ptcstats).dflush += (time2 - time1);
-
- irq_exit();
- set_irq_regs(old_regs);
+ stat->d_multmsg++;
+ ack_APIC_irq();
}
/*
* uv_enable_timeouts
*
- * Each target blade (i.e. blades that have cpu's) needs to have
+ * Each target uvhub (i.e. a uvhub that has no cpu's) needs to have
* shootdown message timeouts enabled. The timeout does not cause
* an interrupt, but causes an error message to be returned to
* the sender.
*/
static void uv_enable_timeouts(void)
{
- int blade;
- int nblades;
+ int uvhub;
+ int nuvhubs;
int pnode;
unsigned long mmr_image;
- nblades = uv_num_possible_blades();
+ nuvhubs = uv_num_possible_blades();
- for (blade = 0; blade < nblades; blade++) {
- if (!uv_blade_nr_possible_cpus(blade))
+ for (uvhub = 0; uvhub < nuvhubs; uvhub++) {
+ if (!uv_blade_nr_possible_cpus(uvhub))
continue;
- pnode = uv_blade_to_pnode(blade);
+ pnode = uv_blade_to_pnode(uvhub);
mmr_image =
uv_read_global_mmr64(pnode, UVH_LB_BAU_MISC_CONTROL);
/*
@@ -479,16 +864,16 @@ static void uv_enable_timeouts(void)
* To program the period, the SOFT_ACK_MODE must be off.
*/
mmr_image &= ~((unsigned long)1 <<
- UV_ENABLE_INTD_SOFT_ACK_MODE_SHIFT);
+ UVH_LB_BAU_MISC_CONTROL_ENABLE_INTD_SOFT_ACK_MODE_SHFT);
uv_write_global_mmr64
(pnode, UVH_LB_BAU_MISC_CONTROL, mmr_image);
/*
* Set the 4-bit period.
*/
mmr_image &= ~((unsigned long)0xf <<
- UV_INTD_SOFT_ACK_TIMEOUT_PERIOD_SHIFT);
+ UVH_LB_BAU_MISC_CONTROL_INTD_SOFT_ACK_TIMEOUT_PERIOD_SHFT);
mmr_image |= (UV_INTD_SOFT_ACK_TIMEOUT_PERIOD <<
- UV_INTD_SOFT_ACK_TIMEOUT_PERIOD_SHIFT);
+ UVH_LB_BAU_MISC_CONTROL_INTD_SOFT_ACK_TIMEOUT_PERIOD_SHFT);
uv_write_global_mmr64
(pnode, UVH_LB_BAU_MISC_CONTROL, mmr_image);
/*
@@ -497,7 +882,7 @@ static void uv_enable_timeouts(void)
* indicated in bits 2:0 (7 causes all of them to timeout).
*/
mmr_image |= ((unsigned long)1 <<
- UV_ENABLE_INTD_SOFT_ACK_MODE_SHIFT);
+ UVH_LB_BAU_MISC_CONTROL_ENABLE_INTD_SOFT_ACK_MODE_SHFT);
uv_write_global_mmr64
(pnode, UVH_LB_BAU_MISC_CONTROL, mmr_image);
}
@@ -522,9 +907,20 @@ static void uv_ptc_seq_stop(struct seq_file *file, void *data)
{
}
+static inline unsigned long long
+millisec_2_cycles(unsigned long millisec)
+{
+ unsigned long ns;
+ unsigned long long cyc;
+
+ ns = millisec * 1000;
+ cyc = (ns << CYC2NS_SCALE_FACTOR)/(per_cpu(cyc2ns, smp_processor_id()));
+ return cyc;
+}
+
/*
- * Display the statistics thru /proc
- * data points to the cpu number
+ * Display the statistics thru /proc.
+ * 'data' points to the cpu number
*/
static int uv_ptc_seq_show(struct seq_file *file, void *data)
{
@@ -535,78 +931,155 @@ static int uv_ptc_seq_show(struct seq_file *file, void *data)
if (!cpu) {
seq_printf(file,
- "# cpu requestor requestee one all sretry dretry ptc_i ");
+ "# cpu sent stime numuvhubs numuvhubs16 numuvhubs8 ");
seq_printf(file,
- "sw_ack sflush dflush sok dnomsg dmult starget\n");
+ "numuvhubs4 numuvhubs2 numuvhubs1 numcpus dto ");
+ seq_printf(file,
+ "retries rok resetp resett giveup sto bz throt ");
+ seq_printf(file,
+ "sw_ack recv rtime all ");
+ seq_printf(file,
+ "one mult none retry canc nocan reset rcan\n");
}
if (cpu < num_possible_cpus() && cpu_online(cpu)) {
stat = &per_cpu(ptcstats, cpu);
- seq_printf(file, "cpu %d %ld %ld %ld %ld %ld %ld %ld ",
- cpu, stat->requestor,
- stat->requestee, stat->onetlb, stat->alltlb,
- stat->s_retry, stat->d_retry, stat->ptc_i);
- seq_printf(file, "%lx %ld %ld %ld %ld %ld %ld\n",
+ /* source side statistics */
+ seq_printf(file,
+ "cpu %d %ld %ld %ld %ld %ld %ld %ld %ld %ld %ld ",
+ cpu, stat->s_requestor, cycles_2_us(stat->s_time),
+ stat->s_ntarguvhub, stat->s_ntarguvhub16,
+ stat->s_ntarguvhub8, stat->s_ntarguvhub4,
+ stat->s_ntarguvhub2, stat->s_ntarguvhub1,
+ stat->s_ntargcpu, stat->s_dtimeout);
+ seq_printf(file, "%ld %ld %ld %ld %ld %ld %ld %ld ",
+ stat->s_retry_messages, stat->s_retriesok,
+ stat->s_resets_plug, stat->s_resets_timeout,
+ stat->s_giveup, stat->s_stimeout,
+ stat->s_busy, stat->s_throttles);
+ /* destination side statistics */
+ seq_printf(file,
+ "%lx %ld %ld %ld %ld %ld %ld %ld %ld %ld %ld %ld\n",
uv_read_global_mmr64(uv_cpu_to_pnode(cpu),
UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE),
- stat->sflush, stat->dflush,
- stat->retriesok, stat->nomsg,
- stat->multmsg, stat->ntargeted);
+ stat->d_requestee, cycles_2_us(stat->d_time),
+ stat->d_alltlb, stat->d_onetlb, stat->d_multmsg,
+ stat->d_nomsg, stat->d_retries, stat->d_canceled,
+ stat->d_nocanceled, stat->d_resets,
+ stat->d_rcanceled);
}
return 0;
}
/*
+ * -1: resetf the statistics
* 0: display meaning of the statistics
- * >0: retry limit
+ * >0: maximum concurrent active descriptors per uvhub (throttle)
*/
static ssize_t uv_ptc_proc_write(struct file *file, const char __user *user,
size_t count, loff_t *data)
{
- long newmode;
+ int cpu;
+ long input_arg;
char optstr[64];
+ struct ptc_stats *stat;
+ struct bau_control *bcp;
if (count == 0 || count > sizeof(optstr))
return -EINVAL;
if (copy_from_user(optstr, user, count))
return -EFAULT;
optstr[count - 1] = '\0';
- if (strict_strtoul(optstr, 10, &newmode) < 0) {
+ if (strict_strtol(optstr, 10, &input_arg) < 0) {
printk(KERN_DEBUG "%s is invalid\n", optstr);
return -EINVAL;
}
- if (newmode == 0) {
+ if (input_arg == 0) {
printk(KERN_DEBUG "# cpu: cpu number\n");
+ printk(KERN_DEBUG "Sender statistics:\n");
+ printk(KERN_DEBUG
+ "sent: number of shootdown messages sent\n");
+ printk(KERN_DEBUG
+ "stime: time spent sending messages\n");
+ printk(KERN_DEBUG
+ "numuvhubs: number of hubs targeted with shootdown\n");
+ printk(KERN_DEBUG
+ "numuvhubs16: number times 16 or more hubs targeted\n");
+ printk(KERN_DEBUG
+ "numuvhubs8: number times 8 or more hubs targeted\n");
+ printk(KERN_DEBUG
+ "numuvhubs4: number times 4 or more hubs targeted\n");
+ printk(KERN_DEBUG
+ "numuvhubs2: number times 2 or more hubs targeted\n");
+ printk(KERN_DEBUG
+ "numuvhubs1: number times 1 hub targeted\n");
+ printk(KERN_DEBUG
+ "numcpus: number of cpus targeted with shootdown\n");
+ printk(KERN_DEBUG
+ "dto: number of destination timeouts\n");
+ printk(KERN_DEBUG
+ "retries: destination timeout retries sent\n");
+ printk(KERN_DEBUG
+ "rok: : destination timeouts successfully retried\n");
+ printk(KERN_DEBUG
+ "resetp: ipi-style resource resets for plugs\n");
+ printk(KERN_DEBUG
+ "resett: ipi-style resource resets for timeouts\n");
+ printk(KERN_DEBUG
+ "giveup: fall-backs to ipi-style shootdowns\n");
+ printk(KERN_DEBUG
+ "sto: number of source timeouts\n");
+ printk(KERN_DEBUG
+ "bz: number of stay-busy's\n");
+ printk(KERN_DEBUG
+ "throt: number times spun in throttle\n");
+ printk(KERN_DEBUG "Destination side statistics:\n");
printk(KERN_DEBUG
- "requestor: times this cpu was the flush requestor\n");
+ "sw_ack: image of UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE\n");
printk(KERN_DEBUG
- "requestee: times this cpu was requested to flush its TLBs\n");
+ "recv: shootdown messages received\n");
printk(KERN_DEBUG
- "one: times requested to flush a single address\n");
+ "rtime: time spent processing messages\n");
printk(KERN_DEBUG
- "all: times requested to flush all TLB's\n");
+ "all: shootdown all-tlb messages\n");
printk(KERN_DEBUG
- "sretry: number of retries of source-side timeouts\n");
+ "one: shootdown one-tlb messages\n");
printk(KERN_DEBUG
- "dretry: number of retries of destination-side timeouts\n");
+ "mult: interrupts that found multiple messages\n");
printk(KERN_DEBUG
- "ptc_i: times UV fell through to IPI-style flushes\n");
+ "none: interrupts that found no messages\n");
printk(KERN_DEBUG
- "sw_ack: image of UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE\n");
+ "retry: number of retry messages processed\n");
printk(KERN_DEBUG
- "sflush_us: cycles spent in uv_flush_tlb_others()\n");
+ "canc: number messages canceled by retries\n");
printk(KERN_DEBUG
- "dflush_us: cycles spent in handling flush requests\n");
- printk(KERN_DEBUG "sok: successes on retry\n");
- printk(KERN_DEBUG "dnomsg: interrupts with no message\n");
+ "nocan: number retries that found nothing to cancel\n");
printk(KERN_DEBUG
- "dmult: interrupts with multiple messages\n");
- printk(KERN_DEBUG "starget: nodes targeted\n");
+ "reset: number of ipi-style reset requests processed\n");
+ printk(KERN_DEBUG
+ "rcan: number messages canceled by reset requests\n");
+ } else if (input_arg == -1) {
+ for_each_present_cpu(cpu) {
+ stat = &per_cpu(ptcstats, cpu);
+ memset(stat, 0, sizeof(struct ptc_stats));
+ }
} else {
- uv_bau_retry_limit = newmode;
- printk(KERN_DEBUG "timeout retry limit:%d\n",
- uv_bau_retry_limit);
+ uv_bau_max_concurrent = input_arg;
+ bcp = &per_cpu(bau_control, smp_processor_id());
+ if (uv_bau_max_concurrent < 1 ||
+ uv_bau_max_concurrent > bcp->cpus_in_uvhub) {
+ printk(KERN_DEBUG
+ "Error: BAU max concurrent %d; %d is invalid\n",
+ bcp->max_concurrent, uv_bau_max_concurrent);
+ return -EINVAL;
+ }
+ printk(KERN_DEBUG "Set BAU max concurrent:%d\n",
+ uv_bau_max_concurrent);
+ for_each_present_cpu(cpu) {
+ bcp = &per_cpu(bau_control, cpu);
+ bcp->max_concurrent = uv_bau_max_concurrent;
+ }
}
return count;
@@ -650,79 +1123,30 @@ static int __init uv_ptc_init(void)
}
/*
- * begin the initialization of the per-blade control structures
- */
-static struct bau_control * __init uv_table_bases_init(int blade, int node)
-{
- int i;
- struct bau_msg_status *msp;
- struct bau_control *bau_tabp;
-
- bau_tabp =
- kmalloc_node(sizeof(struct bau_control), GFP_KERNEL, node);
- BUG_ON(!bau_tabp);
-
- bau_tabp->msg_statuses =
- kmalloc_node(sizeof(struct bau_msg_status) *
- DEST_Q_SIZE, GFP_KERNEL, node);
- BUG_ON(!bau_tabp->msg_statuses);
-
- for (i = 0, msp = bau_tabp->msg_statuses; i < DEST_Q_SIZE; i++, msp++)
- bau_cpubits_clear(&msp->seen_by, (int)
- uv_blade_nr_possible_cpus(blade));
-
- uv_bau_table_bases[blade] = bau_tabp;
-
- return bau_tabp;
-}
-
-/*
- * finish the initialization of the per-blade control structures
- */
-static void __init
-uv_table_bases_finish(int blade,
- struct bau_control *bau_tablesp,
- struct bau_desc *adp)
-{
- struct bau_control *bcp;
- int cpu;
-
- for_each_present_cpu(cpu) {
- if (blade != uv_cpu_to_blade_id(cpu))
- continue;
-
- bcp = (struct bau_control *)&per_cpu(bau_control, cpu);
- bcp->bau_msg_head = bau_tablesp->va_queue_first;
- bcp->va_queue_first = bau_tablesp->va_queue_first;
- bcp->va_queue_last = bau_tablesp->va_queue_last;
- bcp->msg_statuses = bau_tablesp->msg_statuses;
- bcp->descriptor_base = adp;
- }
-}
-
-/*
* initialize the sending side's sending buffers
*/
-static struct bau_desc * __init
+static void
uv_activation_descriptor_init(int node, int pnode)
{
int i;
+ int cpu;
unsigned long pa;
unsigned long m;
unsigned long n;
- struct bau_desc *adp;
- struct bau_desc *ad2;
+ struct bau_desc *bau_desc;
+ struct bau_desc *bd2;
+ struct bau_control *bcp;
/*
* each bau_desc is 64 bytes; there are 8 (UV_ITEMS_PER_DESCRIPTOR)
- * per cpu; and up to 32 (UV_ADP_SIZE) cpu's per blade
+ * per cpu; and up to 32 (UV_ADP_SIZE) cpu's per uvhub
*/
- adp = (struct bau_desc *)kmalloc_node(sizeof(struct bau_desc)*
+ bau_desc = (struct bau_desc *)kmalloc_node(sizeof(struct bau_desc)*
UV_ADP_SIZE*UV_ITEMS_PER_DESCRIPTOR, GFP_KERNEL, node);
- BUG_ON(!adp);
+ BUG_ON(!bau_desc);
- pa = uv_gpa(adp); /* need the real nasid*/
- n = uv_gpa_to_pnode(pa);
+ pa = uv_gpa(bau_desc); /* need the real nasid*/
+ n = pa >> uv_nshift;
m = pa & uv_mmask;
uv_write_global_mmr64(pnode, UVH_LB_BAU_SB_DESCRIPTOR_BASE,
@@ -731,96 +1155,188 @@ uv_activation_descriptor_init(int node, int pnode)
/*
* initializing all 8 (UV_ITEMS_PER_DESCRIPTOR) descriptors for each
* cpu even though we only use the first one; one descriptor can
- * describe a broadcast to 256 nodes.
+ * describe a broadcast to 256 uv hubs.
*/
- for (i = 0, ad2 = adp; i < (UV_ADP_SIZE*UV_ITEMS_PER_DESCRIPTOR);
- i++, ad2++) {
- memset(ad2, 0, sizeof(struct bau_desc));
- ad2->header.sw_ack_flag = 1;
+ for (i = 0, bd2 = bau_desc; i < (UV_ADP_SIZE*UV_ITEMS_PER_DESCRIPTOR);
+ i++, bd2++) {
+ memset(bd2, 0, sizeof(struct bau_desc));
+ bd2->header.sw_ack_flag = 1;
/*
- * base_dest_nodeid is the first node in the partition, so
- * the bit map will indicate partition-relative node numbers.
- * note that base_dest_nodeid is actually a nasid.
+ * base_dest_nodeid is the nasid (pnode<<1) of the first uvhub
+ * in the partition. The bit map will indicate uvhub numbers,
+ * which are 0-N in a partition. Pnodes are unique system-wide.
*/
- ad2->header.base_dest_nodeid = uv_partition_base_pnode << 1;
- ad2->header.dest_subnodeid = 0x10; /* the LB */
- ad2->header.command = UV_NET_ENDPOINT_INTD;
- ad2->header.int_both = 1;
+ bd2->header.base_dest_nodeid = uv_partition_base_pnode << 1;
+ bd2->header.dest_subnodeid = 0x10; /* the LB */
+ bd2->header.command = UV_NET_ENDPOINT_INTD;
+ bd2->header.int_both = 1;
/*
* all others need to be set to zero:
* fairness chaining multilevel count replied_to
*/
}
- return adp;
+ for_each_present_cpu(cpu) {
+ if (pnode != uv_blade_to_pnode(uv_cpu_to_blade_id(cpu)))
+ continue;
+ bcp = &per_cpu(bau_control, cpu);
+ bcp->descriptor_base = bau_desc;
+ }
}
/*
* initialize the destination side's receiving buffers
+ * entered for each uvhub in the partition
+ * - node is first node (kernel memory notion) on the uvhub
+ * - pnode is the uvhub's physical identifier
*/
-static struct bau_payload_queue_entry * __init
-uv_payload_queue_init(int node, int pnode, struct bau_control *bau_tablesp)
+static void
+uv_payload_queue_init(int node, int pnode)
{
- struct bau_payload_queue_entry *pqp;
- unsigned long pa;
int pn;
+ int cpu;
char *cp;
+ unsigned long pa;
+ struct bau_payload_queue_entry *pqp;
+ struct bau_payload_queue_entry *pqp_malloc;
+ struct bau_control *bcp;
pqp = (struct bau_payload_queue_entry *) kmalloc_node(
(DEST_Q_SIZE + 1) * sizeof(struct bau_payload_queue_entry),
GFP_KERNEL, node);
BUG_ON(!pqp);
+ pqp_malloc = pqp;
cp = (char *)pqp + 31;
pqp = (struct bau_payload_queue_entry *)(((unsigned long)cp >> 5) << 5);
- bau_tablesp->va_queue_first = pqp;
+
+ for_each_present_cpu(cpu) {
+ if (pnode != uv_cpu_to_pnode(cpu))
+ continue;
+ /* for every cpu on this pnode: */
+ bcp = &per_cpu(bau_control, cpu);
+ bcp->va_queue_first = pqp;
+ bcp->bau_msg_head = pqp;
+ bcp->va_queue_last = pqp + (DEST_Q_SIZE - 1);
+ }
/*
* need the pnode of where the memory was really allocated
*/
pa = uv_gpa(pqp);
- pn = uv_gpa_to_pnode(pa);
+ pn = pa >> uv_nshift;
uv_write_global_mmr64(pnode,
UVH_LB_BAU_INTD_PAYLOAD_QUEUE_FIRST,
((unsigned long)pn << UV_PAYLOADQ_PNODE_SHIFT) |
uv_physnodeaddr(pqp));
uv_write_global_mmr64(pnode, UVH_LB_BAU_INTD_PAYLOAD_QUEUE_TAIL,
uv_physnodeaddr(pqp));
- bau_tablesp->va_queue_last = pqp + (DEST_Q_SIZE - 1);
uv_write_global_mmr64(pnode, UVH_LB_BAU_INTD_PAYLOAD_QUEUE_LAST,
(unsigned long)
- uv_physnodeaddr(bau_tablesp->va_queue_last));
+ uv_physnodeaddr(pqp + (DEST_Q_SIZE - 1)));
+ /* in effect, all msg_type's are set to MSG_NOOP */
memset(pqp, 0, sizeof(struct bau_payload_queue_entry) * DEST_Q_SIZE);
-
- return pqp;
}
/*
- * Initialization of each UV blade's structures
+ * Initialization of each UV hub's structures
*/
-static int __init uv_init_blade(int blade)
+static void __init uv_init_uvhub(int uvhub, int vector)
{
int node;
int pnode;
- unsigned long pa;
unsigned long apicid;
- struct bau_desc *adp;
- struct bau_payload_queue_entry *pqp;
- struct bau_control *bau_tablesp;
-
- node = blade_to_first_node(blade);
- bau_tablesp = uv_table_bases_init(blade, node);
- pnode = uv_blade_to_pnode(blade);
- adp = uv_activation_descriptor_init(node, pnode);
- pqp = uv_payload_queue_init(node, pnode, bau_tablesp);
- uv_table_bases_finish(blade, bau_tablesp, adp);
+
+ node = uvhub_to_first_node(uvhub);
+ pnode = uv_blade_to_pnode(uvhub);
+ uv_activation_descriptor_init(node, pnode);
+ uv_payload_queue_init(node, pnode);
/*
* the below initialization can't be in firmware because the
* messaging IRQ will be determined by the OS
*/
- apicid = blade_to_first_apicid(blade);
- pa = uv_read_global_mmr64(pnode, UVH_BAU_DATA_CONFIG);
+ apicid = uvhub_to_first_apicid(uvhub);
uv_write_global_mmr64(pnode, UVH_BAU_DATA_CONFIG,
- ((apicid << 32) | UV_BAU_MESSAGE));
- return 0;
+ ((apicid << 32) | vector));
+}
+
+/*
+ * initialize the bau_control structure for each cpu
+ */
+static void uv_init_per_cpu(int nuvhubs)
+{
+ int i, j, k;
+ int cpu;
+ int pnode;
+ int uvhub;
+ short socket = 0;
+ struct bau_control *bcp;
+ struct uvhub_desc *bdp;
+ struct socket_desc *sdp;
+ struct bau_control *hmaster = NULL;
+ struct bau_control *smaster = NULL;
+ struct socket_desc {
+ short num_cpus;
+ short cpu_number[16];
+ };
+ struct uvhub_desc {
+ short num_sockets;
+ short num_cpus;
+ short uvhub;
+ short pnode;
+ struct socket_desc socket[2];
+ };
+ struct uvhub_desc *uvhub_descs;
+
+ uvhub_descs = (struct uvhub_desc *)
+ kmalloc(nuvhubs * sizeof(struct uvhub_desc), GFP_KERNEL);
+ memset(uvhub_descs, 0, nuvhubs * sizeof(struct uvhub_desc));
+ for_each_present_cpu(cpu) {
+ bcp = &per_cpu(bau_control, cpu);
+ memset(bcp, 0, sizeof(struct bau_control));
+ spin_lock_init(&bcp->masks_lock);
+ bcp->max_concurrent = uv_bau_max_concurrent;
+ pnode = uv_cpu_hub_info(cpu)->pnode;
+ uvhub = uv_cpu_hub_info(cpu)->numa_blade_id;
+ bdp = &uvhub_descs[uvhub];
+ bdp->num_cpus++;
+ bdp->uvhub = uvhub;
+ bdp->pnode = pnode;
+ /* time interval to catch a hardware stay-busy bug */
+ bcp->timeout_interval = millisec_2_cycles(3);
+ /* kludge: assume uv_hub.h is constant */
+ socket = (cpu_physical_id(cpu)>>5)&1;
+ if (socket >= bdp->num_sockets)
+ bdp->num_sockets = socket+1;
+ sdp = &bdp->socket[socket];
+ sdp->cpu_number[sdp->num_cpus] = cpu;
+ sdp->num_cpus++;
+ }
+ socket = 0;
+ for_each_possible_blade(uvhub) {
+ bdp = &uvhub_descs[uvhub];
+ for (i = 0; i < bdp->num_sockets; i++) {
+ sdp = &bdp->socket[i];
+ for (j = 0; j < sdp->num_cpus; j++) {
+ cpu = sdp->cpu_number[j];
+ bcp = &per_cpu(bau_control, cpu);
+ bcp->cpu = cpu;
+ if (j == 0) {
+ smaster = bcp;
+ if (i == 0)
+ hmaster = bcp;
+ }
+ bcp->cpus_in_uvhub = bdp->num_cpus;
+ bcp->cpus_in_socket = sdp->num_cpus;
+ bcp->socket_master = smaster;
+ bcp->uvhub_master = hmaster;
+ for (k = 0; k < DEST_Q_SIZE; k++)
+ bcp->socket_acknowledge_count[k] = 0;
+ bcp->uvhub_cpu =
+ uv_cpu_hub_info(cpu)->blade_processor_id;
+ }
+ socket++;
+ }
+ }
+ kfree(uvhub_descs);
}
/*
@@ -828,38 +1344,54 @@ static int __init uv_init_blade(int blade)
*/
static int __init uv_bau_init(void)
{
- int blade;
- int nblades;
+ int uvhub;
+ int pnode;
+ int nuvhubs;
int cur_cpu;
+ int vector;
+ unsigned long mmr;
if (!is_uv_system())
return 0;
+ if (nobau)
+ return 0;
+
for_each_possible_cpu(cur_cpu)
zalloc_cpumask_var_node(&per_cpu(uv_flush_tlb_mask, cur_cpu),
GFP_KERNEL, cpu_to_node(cur_cpu));
- uv_bau_retry_limit = 1;
+ uv_bau_max_concurrent = MAX_BAU_CONCURRENT;
+ uv_nshift = uv_hub_info->m_val;
uv_mmask = (1UL << uv_hub_info->m_val) - 1;
- nblades = uv_num_possible_blades();
+ nuvhubs = uv_num_possible_blades();
- uv_bau_table_bases = (struct bau_control **)
- kmalloc(nblades * sizeof(struct bau_control *), GFP_KERNEL);
- BUG_ON(!uv_bau_table_bases);
+ uv_init_per_cpu(nuvhubs);
uv_partition_base_pnode = 0x7fffffff;
- for (blade = 0; blade < nblades; blade++)
- if (uv_blade_nr_possible_cpus(blade) &&
- (uv_blade_to_pnode(blade) < uv_partition_base_pnode))
- uv_partition_base_pnode = uv_blade_to_pnode(blade);
- for (blade = 0; blade < nblades; blade++)
- if (uv_blade_nr_possible_cpus(blade))
- uv_init_blade(blade);
-
- alloc_intr_gate(UV_BAU_MESSAGE, uv_bau_message_intr1);
+ for (uvhub = 0; uvhub < nuvhubs; uvhub++)
+ if (uv_blade_nr_possible_cpus(uvhub) &&
+ (uv_blade_to_pnode(uvhub) < uv_partition_base_pnode))
+ uv_partition_base_pnode = uv_blade_to_pnode(uvhub);
+
+ vector = UV_BAU_MESSAGE;
+ for_each_possible_blade(uvhub)
+ if (uv_blade_nr_possible_cpus(uvhub))
+ uv_init_uvhub(uvhub, vector);
+
uv_enable_timeouts();
+ alloc_intr_gate(vector, uv_bau_message_intr1);
+
+ for_each_possible_blade(uvhub) {
+ pnode = uv_blade_to_pnode(uvhub);
+ /* INIT the bau */
+ uv_write_global_mmr64(pnode, UVH_LB_BAU_SB_ACTIVATION_CONTROL,
+ ((unsigned long)1 << 63));
+ mmr = 1; /* should be 1 to broadcast to both sockets */
+ uv_write_global_mmr64(pnode, UVH_BAU_DATA_BROADCAST, mmr);
+ }
return 0;
}
-__initcall(uv_bau_init);
-__initcall(uv_ptc_init);
+core_initcall(uv_bau_init);
+core_initcall(uv_ptc_init);
diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index 1168e4454188..60788dee0f8a 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -15,6 +15,7 @@
#include <linux/kprobes.h>
#include <linux/uaccess.h>
#include <linux/kdebug.h>
+#include <linux/kgdb.h>
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/ptrace.h>
@@ -108,15 +109,6 @@ static inline void preempt_conditional_cli(struct pt_regs *regs)
dec_preempt_count();
}
-#ifdef CONFIG_X86_32
-static inline void
-die_if_kernel(const char *str, struct pt_regs *regs, long err)
-{
- if (!user_mode_vm(regs))
- die(str, regs, err);
-}
-#endif
-
static void __kprobes
do_trap(int trapnr, int signr, char *str, struct pt_regs *regs,
long error_code, siginfo_t *info)
@@ -400,7 +392,13 @@ static notrace __kprobes void default_do_nmi(struct pt_regs *regs)
if (notify_die(DIE_NMI_IPI, "nmi_ipi", regs, reason, 2, SIGINT)
== NOTIFY_STOP)
return;
+
#ifdef CONFIG_X86_LOCAL_APIC
+ if (notify_die(DIE_NMI, "nmi", regs, reason, 2, SIGINT)
+ == NOTIFY_STOP)
+ return;
+
+#ifndef CONFIG_LOCKUP_DETECTOR
/*
* Ok, so this is none of the documented NMI sources,
* so it must be the NMI watchdog.
@@ -408,6 +406,7 @@ static notrace __kprobes void default_do_nmi(struct pt_regs *regs)
if (nmi_watchdog_tick(regs, reason))
return;
if (!do_nmi_callback(regs, cpu))
+#endif /* !CONFIG_LOCKUP_DETECTOR */
unknown_nmi_error(reason, regs);
#else
unknown_nmi_error(reason, regs);
@@ -460,6 +459,11 @@ void restart_nmi(void)
/* May run on IST stack. */
dotraplinkage void __kprobes do_int3(struct pt_regs *regs, long error_code)
{
+#ifdef CONFIG_KGDB_LOW_LEVEL_TRAP
+ if (kgdb_ll_trap(DIE_INT3, "int3", regs, error_code, 3, SIGTRAP)
+ == NOTIFY_STOP)
+ return;
+#endif /* CONFIG_KGDB_LOW_LEVEL_TRAP */
#ifdef CONFIG_KPROBES
if (notify_die(DIE_INT3, "int3", regs, error_code, 3, SIGTRAP)
== NOTIFY_STOP)
@@ -529,6 +533,7 @@ asmlinkage __kprobes struct pt_regs *sync_regs(struct pt_regs *eregs)
dotraplinkage void __kprobes do_debug(struct pt_regs *regs, long error_code)
{
struct task_struct *tsk = current;
+ int user_icebp = 0;
unsigned long dr6;
int si_code;
@@ -537,17 +542,25 @@ dotraplinkage void __kprobes do_debug(struct pt_regs *regs, long error_code)
/* Filter out all the reserved bits which are preset to 1 */
dr6 &= ~DR6_RESERVED;
+ /*
+ * If dr6 has no reason to give us about the origin of this trap,
+ * then it's very likely the result of an icebp/int01 trap.
+ * User wants a sigtrap for that.
+ */
+ if (!dr6 && user_mode(regs))
+ user_icebp = 1;
+
/* Catch kmemcheck conditions first of all! */
if ((dr6 & DR_STEP) && kmemcheck_trap(regs))
return;
/* DR6 may or may not be cleared by the CPU */
set_debugreg(0, 6);
+
/*
* The processor cleared BTF, so don't mark that we need it set.
*/
- clear_tsk_thread_flag(tsk, TIF_DEBUGCTLMSR);
- tsk->thread.debugctlmsr = 0;
+ clear_tsk_thread_flag(tsk, TIF_BLOCKSTEP);
/* Store the virtualized DR6 value */
tsk->thread.debugreg6 = dr6;
@@ -578,62 +591,74 @@ dotraplinkage void __kprobes do_debug(struct pt_regs *regs, long error_code)
regs->flags &= ~X86_EFLAGS_TF;
}
si_code = get_si_code(tsk->thread.debugreg6);
- if (tsk->thread.debugreg6 & (DR_STEP | DR_TRAP_BITS))
+ if (tsk->thread.debugreg6 & (DR_STEP | DR_TRAP_BITS) || user_icebp)
send_sigtrap(tsk, regs, error_code, si_code);
preempt_conditional_cli(regs);
return;
}
-#ifdef CONFIG_X86_64
-static int kernel_math_error(struct pt_regs *regs, const char *str, int trapnr)
-{
- if (fixup_exception(regs))
- return 1;
-
- notify_die(DIE_GPF, str, regs, 0, trapnr, SIGFPE);
- /* Illegal floating point operation in the kernel */
- current->thread.trap_no = trapnr;
- die(str, regs, 0);
- return 0;
-}
-#endif
-
/*
* Note that we play around with the 'TS' bit in an attempt to get
* the correct behaviour even in the presence of the asynchronous
* IRQ13 behaviour
*/
-void math_error(void __user *ip)
+void math_error(struct pt_regs *regs, int error_code, int trapnr)
{
- struct task_struct *task;
+ struct task_struct *task = current;
siginfo_t info;
- unsigned short cwd, swd, err;
+ unsigned short err;
+ char *str = (trapnr == 16) ? "fpu exception" : "simd exception";
+
+ if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, SIGFPE) == NOTIFY_STOP)
+ return;
+ conditional_sti(regs);
+
+ if (!user_mode_vm(regs))
+ {
+ if (!fixup_exception(regs)) {
+ task->thread.error_code = error_code;
+ task->thread.trap_no = trapnr;
+ die(str, regs, error_code);
+ }
+ return;
+ }
/*
* Save the info for the exception handler and clear the error.
*/
- task = current;
save_init_fpu(task);
- task->thread.trap_no = 16;
- task->thread.error_code = 0;
+ task->thread.trap_no = trapnr;
+ task->thread.error_code = error_code;
info.si_signo = SIGFPE;
info.si_errno = 0;
- info.si_addr = ip;
- /*
- * (~cwd & swd) will mask out exceptions that are not set to unmasked
- * status. 0x3f is the exception bits in these regs, 0x200 is the
- * C1 reg you need in case of a stack fault, 0x040 is the stack
- * fault bit. We should only be taking one exception at a time,
- * so if this combination doesn't produce any single exception,
- * then we have a bad program that isn't synchronizing its FPU usage
- * and it will suffer the consequences since we won't be able to
- * fully reproduce the context of the exception
- */
- cwd = get_fpu_cwd(task);
- swd = get_fpu_swd(task);
+ info.si_addr = (void __user *)regs->ip;
+ if (trapnr == 16) {
+ unsigned short cwd, swd;
+ /*
+ * (~cwd & swd) will mask out exceptions that are not set to unmasked
+ * status. 0x3f is the exception bits in these regs, 0x200 is the
+ * C1 reg you need in case of a stack fault, 0x040 is the stack
+ * fault bit. We should only be taking one exception at a time,
+ * so if this combination doesn't produce any single exception,
+ * then we have a bad program that isn't synchronizing its FPU usage
+ * and it will suffer the consequences since we won't be able to
+ * fully reproduce the context of the exception
+ */
+ cwd = get_fpu_cwd(task);
+ swd = get_fpu_swd(task);
- err = swd & ~cwd;
+ err = swd & ~cwd;
+ } else {
+ /*
+ * The SIMD FPU exceptions are handled a little differently, as there
+ * is only a single status/control register. Thus, to determine which
+ * unmasked exception was caught we must mask the exception mask bits
+ * at 0x1f80, and then use these to mask the exception bits at 0x3f.
+ */
+ unsigned short mxcsr = get_fpu_mxcsr(task);
+ err = ~(mxcsr >> 7) & mxcsr;
+ }
if (err & 0x001) { /* Invalid op */
/*
@@ -662,97 +687,17 @@ void math_error(void __user *ip)
dotraplinkage void do_coprocessor_error(struct pt_regs *regs, long error_code)
{
- conditional_sti(regs);
-
#ifdef CONFIG_X86_32
ignore_fpu_irq = 1;
-#else
- if (!user_mode(regs) &&
- kernel_math_error(regs, "kernel x87 math error", 16))
- return;
#endif
- math_error((void __user *)regs->ip);
-}
-
-static void simd_math_error(void __user *ip)
-{
- struct task_struct *task;
- siginfo_t info;
- unsigned short mxcsr;
-
- /*
- * Save the info for the exception handler and clear the error.
- */
- task = current;
- save_init_fpu(task);
- task->thread.trap_no = 19;
- task->thread.error_code = 0;
- info.si_signo = SIGFPE;
- info.si_errno = 0;
- info.si_code = __SI_FAULT;
- info.si_addr = ip;
- /*
- * The SIMD FPU exceptions are handled a little differently, as there
- * is only a single status/control register. Thus, to determine which
- * unmasked exception was caught we must mask the exception mask bits
- * at 0x1f80, and then use these to mask the exception bits at 0x3f.
- */
- mxcsr = get_fpu_mxcsr(task);
- switch (~((mxcsr & 0x1f80) >> 7) & (mxcsr & 0x3f)) {
- case 0x000:
- default:
- break;
- case 0x001: /* Invalid Op */
- info.si_code = FPE_FLTINV;
- break;
- case 0x002: /* Denormalize */
- case 0x010: /* Underflow */
- info.si_code = FPE_FLTUND;
- break;
- case 0x004: /* Zero Divide */
- info.si_code = FPE_FLTDIV;
- break;
- case 0x008: /* Overflow */
- info.si_code = FPE_FLTOVF;
- break;
- case 0x020: /* Precision */
- info.si_code = FPE_FLTRES;
- break;
- }
- force_sig_info(SIGFPE, &info, task);
+ math_error(regs, error_code, 16);
}
dotraplinkage void
do_simd_coprocessor_error(struct pt_regs *regs, long error_code)
{
- conditional_sti(regs);
-
-#ifdef CONFIG_X86_32
- if (cpu_has_xmm) {
- /* Handle SIMD FPU exceptions on PIII+ processors. */
- ignore_fpu_irq = 1;
- simd_math_error((void __user *)regs->ip);
- return;
- }
- /*
- * Handle strange cache flush from user space exception
- * in all other cases. This is undocumented behaviour.
- */
- if (regs->flags & X86_VM_MASK) {
- handle_vm86_fault((struct kernel_vm86_regs *)regs, error_code);
- return;
- }
- current->thread.trap_no = 19;
- current->thread.error_code = error_code;
- die_if_kernel("cache flush denied", regs, error_code);
- force_sig(SIGSEGV, current);
-#else
- if (!user_mode(regs) &&
- kernel_math_error(regs, "kernel simd math error", 19))
- return;
- simd_math_error((void __user *)regs->ip);
-#endif
+ math_error(regs, error_code, 19);
}
dotraplinkage void
@@ -879,6 +824,16 @@ dotraplinkage void do_iret_error(struct pt_regs *regs, long error_code)
}
#endif
+/* Set of traps needed for early debugging. */
+void __init early_trap_init(void)
+{
+ set_intr_gate_ist(1, &debug, DEBUG_STACK);
+ /* int3 can be called from all */
+ set_system_intr_gate_ist(3, &int3, DEBUG_STACK);
+ set_intr_gate(14, &page_fault);
+ load_idt(&idt_descr);
+}
+
void __init trap_init(void)
{
int i;
@@ -892,10 +847,7 @@ void __init trap_init(void)
#endif
set_intr_gate(0, &divide_error);
- set_intr_gate_ist(1, &debug, DEBUG_STACK);
set_intr_gate_ist(2, &nmi, NMI_STACK);
- /* int3 can be called from all */
- set_system_intr_gate_ist(3, &int3, DEBUG_STACK);
/* int4 can be called from all */
set_system_intr_gate(4, &overflow);
set_intr_gate(5, &bounds);
@@ -911,7 +863,6 @@ void __init trap_init(void)
set_intr_gate(11, &segment_not_present);
set_intr_gate_ist(12, &stack_segment, STACKFAULT_STACK);
set_intr_gate(13, &general_protection);
- set_intr_gate(14, &page_fault);
set_intr_gate(15, &spurious_interrupt_bug);
set_intr_gate(16, &coprocessor_error);
set_intr_gate(17, &alignment_check);
diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c
index 9faf91ae1841..ce8e50239332 100644
--- a/arch/x86/kernel/tsc.c
+++ b/arch/x86/kernel/tsc.c
@@ -751,7 +751,6 @@ static struct clocksource clocksource_tsc = {
.read = read_tsc,
.resume = resume_tsc,
.mask = CLOCKSOURCE_MASK(64),
- .shift = 22,
.flags = CLOCK_SOURCE_IS_CONTINUOUS |
CLOCK_SOURCE_MUST_VERIFY,
#ifdef CONFIG_X86_64
@@ -845,8 +844,6 @@ __cpuinit int unsynchronized_tsc(void)
static void __init init_tsc_clocksource(void)
{
- clocksource_tsc.mult = clocksource_khz2mult(tsc_khz,
- clocksource_tsc.shift);
if (tsc_clocksource_reliable)
clocksource_tsc.flags &= ~CLOCK_SOURCE_MUST_VERIFY;
/* lower the rating if we already know its unstable: */
@@ -854,7 +851,7 @@ static void __init init_tsc_clocksource(void)
clocksource_tsc.rating = 0;
clocksource_tsc.flags &= ~CLOCK_SOURCE_IS_CONTINUOUS;
}
- clocksource_register(&clocksource_tsc);
+ clocksource_register_khz(&clocksource_tsc, tsc_khz);
}
#ifdef CONFIG_X86_64
diff --git a/arch/x86/kernel/uv_irq.c b/arch/x86/kernel/uv_irq.c
index 1d40336b030a..1132129db792 100644
--- a/arch/x86/kernel/uv_irq.c
+++ b/arch/x86/kernel/uv_irq.c
@@ -44,7 +44,7 @@ static void uv_ack_apic(unsigned int irq)
ack_APIC_irq();
}
-struct irq_chip uv_irq_chip = {
+static struct irq_chip uv_irq_chip = {
.name = "UV-CORE",
.startup = uv_noop_ret,
.shutdown = uv_noop,
@@ -141,7 +141,7 @@ int uv_irq_2_mmr_info(int irq, unsigned long *offset, int *pnode)
*/
static int
arch_enable_uv_irq(char *irq_name, unsigned int irq, int cpu, int mmr_blade,
- unsigned long mmr_offset, int restrict)
+ unsigned long mmr_offset, int limit)
{
const struct cpumask *eligible_cpu = cpumask_of(cpu);
struct irq_desc *desc = irq_to_desc(irq);
@@ -160,7 +160,7 @@ arch_enable_uv_irq(char *irq_name, unsigned int irq, int cpu, int mmr_blade,
if (err != 0)
return err;
- if (restrict == UV_AFFINITY_CPU)
+ if (limit == UV_AFFINITY_CPU)
desc->status |= IRQ_NO_BALANCING;
else
desc->status |= IRQ_MOVE_PCNTXT;
@@ -214,7 +214,7 @@ static int uv_set_irq_affinity(unsigned int irq, const struct cpumask *mask)
unsigned long mmr_value;
struct uv_IO_APIC_route_entry *entry;
unsigned long mmr_offset;
- unsigned mmr_pnode;
+ int mmr_pnode;
if (set_desc_affinity(desc, mask, &dest))
return -1;
@@ -248,7 +248,7 @@ static int uv_set_irq_affinity(unsigned int irq, const struct cpumask *mask)
* interrupt is raised.
*/
int uv_setup_irq(char *irq_name, int cpu, int mmr_blade,
- unsigned long mmr_offset, int restrict)
+ unsigned long mmr_offset, int limit)
{
int irq, ret;
@@ -258,7 +258,7 @@ int uv_setup_irq(char *irq_name, int cpu, int mmr_blade,
return -EBUSY;
ret = arch_enable_uv_irq(irq_name, irq, cpu, mmr_blade, mmr_offset,
- restrict);
+ limit);
if (ret == irq)
uv_set_irq_2_mmr_info(irq, mmr_offset, mmr_blade);
else
diff --git a/arch/x86/kernel/verify_cpu_64.S b/arch/x86/kernel/verify_cpu_64.S
index 45b6f8a975a1..56a8c2a867d9 100644
--- a/arch/x86/kernel/verify_cpu_64.S
+++ b/arch/x86/kernel/verify_cpu_64.S
@@ -31,6 +31,7 @@
*/
#include <asm/cpufeature.h>
+#include <asm/msr-index.h>
verify_cpu:
pushfl # Save caller passed flags
@@ -88,7 +89,7 @@ verify_cpu_sse_test:
je verify_cpu_sse_ok
test %di,%di
jz verify_cpu_no_longmode # only try to force SSE on AMD
- movl $0xc0010015,%ecx # HWCR
+ movl $MSR_K7_HWCR,%ecx
rdmsr
btr $15,%eax # enable SSE
wrmsr
diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S
index 2cc249718c46..d0bb52296fa3 100644
--- a/arch/x86/kernel/vmlinux.lds.S
+++ b/arch/x86/kernel/vmlinux.lds.S
@@ -97,7 +97,7 @@ SECTIONS
HEAD_TEXT
#ifdef CONFIG_X86_32
. = ALIGN(PAGE_SIZE);
- *(.text.page_aligned)
+ *(.text..page_aligned)
#endif
. = ALIGN(8);
_stext = .;
@@ -305,7 +305,7 @@ SECTIONS
. = ALIGN(PAGE_SIZE);
.bss : AT(ADDR(.bss) - LOAD_OFFSET) {
__bss_start = .;
- *(.bss.page_aligned)
+ *(.bss..page_aligned)
*(.bss)
. = ALIGN(4);
__bss_stop = .;
diff --git a/arch/x86/kernel/vsyscall_64.c b/arch/x86/kernel/vsyscall_64.c
index 1c0c6ab9c60f..dcbb28c4b694 100644
--- a/arch/x86/kernel/vsyscall_64.c
+++ b/arch/x86/kernel/vsyscall_64.c
@@ -73,8 +73,8 @@ void update_vsyscall_tz(void)
write_sequnlock_irqrestore(&vsyscall_gtod_data.lock, flags);
}
-void update_vsyscall(struct timespec *wall_time, struct clocksource *clock,
- u32 mult)
+void update_vsyscall(struct timespec *wall_time, struct timespec *wtm,
+ struct clocksource *clock, u32 mult)
{
unsigned long flags;
@@ -87,7 +87,7 @@ void update_vsyscall(struct timespec *wall_time, struct clocksource *clock,
vsyscall_gtod_data.clock.shift = clock->shift;
vsyscall_gtod_data.wall_time_sec = wall_time->tv_sec;
vsyscall_gtod_data.wall_time_nsec = wall_time->tv_nsec;
- vsyscall_gtod_data.wall_to_monotonic = wall_to_monotonic;
+ vsyscall_gtod_data.wall_to_monotonic = *wtm;
vsyscall_gtod_data.wall_time_coarse = __current_kernel_time();
write_sequnlock_irqrestore(&vsyscall_gtod_data.lock, flags);
}
@@ -169,13 +169,18 @@ int __vsyscall(0) vgettimeofday(struct timeval * tv, struct timezone * tz)
* unlikely */
time_t __vsyscall(1) vtime(time_t *t)
{
- struct timeval tv;
+ unsigned seq;
time_t result;
if (unlikely(!__vsyscall_gtod_data.sysctl_enabled))
return time_syscall(t);
- vgettimeofday(&tv, NULL);
- result = tv.tv_sec;
+ do {
+ seq = read_seqbegin(&__vsyscall_gtod_data.lock);
+
+ result = __vsyscall_gtod_data.wall_time_sec;
+
+ } while (read_seqretry(&__vsyscall_gtod_data.lock, seq));
+
if (t)
*t = result;
return result;
diff --git a/arch/x86/kernel/x8664_ksyms_64.c b/arch/x86/kernel/x8664_ksyms_64.c
index 693920b22496..1b950d151e58 100644
--- a/arch/x86/kernel/x8664_ksyms_64.c
+++ b/arch/x86/kernel/x8664_ksyms_64.c
@@ -54,7 +54,6 @@ EXPORT_SYMBOL(memcpy);
EXPORT_SYMBOL(__memcpy);
EXPORT_SYMBOL(empty_zero_page);
-EXPORT_SYMBOL(init_level4_pgt);
#ifndef CONFIG_PARAVIRT
EXPORT_SYMBOL(native_load_gs_index);
#endif
diff --git a/arch/x86/kernel/x86_init.c b/arch/x86/kernel/x86_init.c
index 61a1e8c7e19f..cd6da6bf3eca 100644
--- a/arch/x86/kernel/x86_init.c
+++ b/arch/x86/kernel/x86_init.c
@@ -5,6 +5,7 @@
*/
#include <linux/init.h>
#include <linux/ioport.h>
+#include <linux/module.h>
#include <asm/bios_ebda.h>
#include <asm/paravirt.h>
@@ -85,6 +86,7 @@ struct x86_cpuinit_ops x86_cpuinit __cpuinitdata = {
};
static void default_nmi_init(void) { };
+static int default_i8042_detect(void) { return 1; };
struct x86_platform_ops x86_platform = {
.calibrate_tsc = native_calibrate_tsc,
@@ -92,5 +94,8 @@ struct x86_platform_ops x86_platform = {
.set_wallclock = mach_set_rtc_mmss,
.iommu_shutdown = iommu_shutdown_noop,
.is_untracked_pat_range = is_ISA_range,
- .nmi_init = default_nmi_init
+ .nmi_init = default_nmi_init,
+ .i8042_detect = default_i8042_detect
};
+
+EXPORT_SYMBOL_GPL(x86_platform);
diff --git a/arch/x86/kernel/xsave.c b/arch/x86/kernel/xsave.c
index 782c3a362ec6..9c253bd65e24 100644
--- a/arch/x86/kernel/xsave.c
+++ b/arch/x86/kernel/xsave.c
@@ -16,11 +16,88 @@
*/
u64 pcntxt_mask;
+/*
+ * Represents init state for the supported extended state.
+ */
+static struct xsave_struct *init_xstate_buf;
+
struct _fpx_sw_bytes fx_sw_reserved;
#ifdef CONFIG_IA32_EMULATION
struct _fpx_sw_bytes fx_sw_reserved_ia32;
#endif
+static unsigned int *xstate_offsets, *xstate_sizes, xstate_features;
+
+/*
+ * If a processor implementation discern that a processor state component is
+ * in its initialized state it may modify the corresponding bit in the
+ * xsave_hdr.xstate_bv as '0', with out modifying the corresponding memory
+ * layout in the case of xsaveopt. While presenting the xstate information to
+ * the user, we always ensure that the memory layout of a feature will be in
+ * the init state if the corresponding header bit is zero. This is to ensure
+ * that the user doesn't see some stale state in the memory layout during
+ * signal handling, debugging etc.
+ */
+void __sanitize_i387_state(struct task_struct *tsk)
+{
+ u64 xstate_bv;
+ int feature_bit = 0x2;
+ struct i387_fxsave_struct *fx = &tsk->thread.fpu.state->fxsave;
+
+ if (!fx)
+ return;
+
+ BUG_ON(task_thread_info(tsk)->status & TS_USEDFPU);
+
+ xstate_bv = tsk->thread.fpu.state->xsave.xsave_hdr.xstate_bv;
+
+ /*
+ * None of the feature bits are in init state. So nothing else
+ * to do for us, as the memory layout is upto date.
+ */
+ if ((xstate_bv & pcntxt_mask) == pcntxt_mask)
+ return;
+
+ /*
+ * FP is in init state
+ */
+ if (!(xstate_bv & XSTATE_FP)) {
+ fx->cwd = 0x37f;
+ fx->swd = 0;
+ fx->twd = 0;
+ fx->fop = 0;
+ fx->rip = 0;
+ fx->rdp = 0;
+ memset(&fx->st_space[0], 0, 128);
+ }
+
+ /*
+ * SSE is in init state
+ */
+ if (!(xstate_bv & XSTATE_SSE))
+ memset(&fx->xmm_space[0], 0, 256);
+
+ xstate_bv = (pcntxt_mask & ~xstate_bv) >> 2;
+
+ /*
+ * Update all the other memory layouts for which the corresponding
+ * header bit is in the init state.
+ */
+ while (xstate_bv) {
+ if (xstate_bv & 0x1) {
+ int offset = xstate_offsets[feature_bit];
+ int size = xstate_sizes[feature_bit];
+
+ memcpy(((void *) fx) + offset,
+ ((void *) init_xstate_buf) + offset,
+ size);
+ }
+
+ xstate_bv >>= 1;
+ feature_bit++;
+ }
+}
+
/*
* Check for the presence of extended state information in the
* user fpstate pointer in the sigcontext.
@@ -36,15 +113,14 @@ int check_for_xstate(struct i387_fxsave_struct __user *buf,
err = __copy_from_user(fx_sw_user, &buf->sw_reserved[0],
sizeof(struct _fpx_sw_bytes));
-
if (err)
- return err;
+ return -EFAULT;
/*
* First Magic check failed.
*/
if (fx_sw_user->magic1 != FP_XSTATE_MAGIC1)
- return -1;
+ return -EINVAL;
/*
* Check for error scenarios.
@@ -52,19 +128,21 @@ int check_for_xstate(struct i387_fxsave_struct __user *buf,
if (fx_sw_user->xstate_size < min_xstate_size ||
fx_sw_user->xstate_size > xstate_size ||
fx_sw_user->xstate_size > fx_sw_user->extended_size)
- return -1;
+ return -EINVAL;
err = __get_user(magic2, (__u32 *) (((void *)fpstate) +
fx_sw_user->extended_size -
FP_XSTATE_MAGIC2_SIZE));
+ if (err)
+ return err;
/*
* Check for the presence of second magic word at the end of memory
* layout. This detects the case where the user just copied the legacy
* fpstate layout with out copying the extended state information
* in the memory layout.
*/
- if (err || magic2 != FP_XSTATE_MAGIC2)
- return -1;
+ if (magic2 != FP_XSTATE_MAGIC2)
+ return -EFAULT;
return 0;
}
@@ -91,15 +169,7 @@ int save_i387_xstate(void __user *buf)
return 0;
if (task_thread_info(tsk)->status & TS_USEDFPU) {
- /*
- * Start with clearing the user buffer. This will present a
- * clean context for the bytes not touched by the fxsave/xsave.
- */
- err = __clear_user(buf, sig_xstate_size);
- if (err)
- return err;
-
- if (task_thread_info(tsk)->status & TS_XSAVE)
+ if (use_xsave())
err = xsave_user(buf);
else
err = fxsave_user(buf);
@@ -109,14 +179,15 @@ int save_i387_xstate(void __user *buf)
task_thread_info(tsk)->status &= ~TS_USEDFPU;
stts();
} else {
- if (__copy_to_user(buf, &tsk->thread.xstate->fxsave,
+ sanitize_i387_state(tsk);
+ if (__copy_to_user(buf, &tsk->thread.fpu.state->fxsave,
xstate_size))
return -1;
}
clear_used_math(); /* trigger finit */
- if (task_thread_info(tsk)->status & TS_XSAVE) {
+ if (use_xsave()) {
struct _fpstate __user *fx = buf;
struct _xstate __user *x = buf;
u64 xstate_bv;
@@ -184,8 +255,8 @@ static int restore_user_xstate(void __user *buf)
* init the state skipped by the user.
*/
mask = pcntxt_mask & ~mask;
-
- xrstor_state(init_xstate_buf, mask);
+ if (unlikely(mask))
+ xrstor_state(init_xstate_buf, mask);
return 0;
@@ -225,7 +296,7 @@ int restore_i387_xstate(void __user *buf)
clts();
task_thread_info(current)->status |= TS_USEDFPU;
}
- if (task_thread_info(tsk)->status & TS_XSAVE)
+ if (use_xsave())
err = restore_user_xstate(buf);
else
err = fxrstor_checking((__force struct i387_fxsave_struct *)
@@ -274,11 +345,6 @@ static void prepare_fx_sw_frame(void)
#endif
}
-/*
- * Represents init state for the supported extended state.
- */
-struct xsave_struct *init_xstate_buf;
-
#ifdef CONFIG_X86_64
unsigned int sig_xstate_size = sizeof(struct _fpstate);
#endif
@@ -286,37 +352,77 @@ unsigned int sig_xstate_size = sizeof(struct _fpstate);
/*
* Enable the extended processor state save/restore feature
*/
-void __cpuinit xsave_init(void)
+static inline void xstate_enable(void)
{
- if (!cpu_has_xsave)
- return;
-
set_in_cr4(X86_CR4_OSXSAVE);
-
- /*
- * Enable all the features that the HW is capable of
- * and the Linux kernel is aware of.
- */
xsetbv(XCR_XFEATURE_ENABLED_MASK, pcntxt_mask);
}
/*
+ * Record the offsets and sizes of different state managed by the xsave
+ * memory layout.
+ */
+static void __init setup_xstate_features(void)
+{
+ int eax, ebx, ecx, edx, leaf = 0x2;
+
+ xstate_features = fls64(pcntxt_mask);
+ xstate_offsets = alloc_bootmem(xstate_features * sizeof(int));
+ xstate_sizes = alloc_bootmem(xstate_features * sizeof(int));
+
+ do {
+ cpuid_count(XSTATE_CPUID, leaf, &eax, &ebx, &ecx, &edx);
+
+ if (eax == 0)
+ break;
+
+ xstate_offsets[leaf] = ebx;
+ xstate_sizes[leaf] = eax;
+
+ leaf++;
+ } while (1);
+}
+
+/*
* setup the xstate image representing the init state
*/
static void __init setup_xstate_init(void)
{
+ setup_xstate_features();
+
+ /*
+ * Setup init_xstate_buf to represent the init state of
+ * all the features managed by the xsave
+ */
init_xstate_buf = alloc_bootmem(xstate_size);
init_xstate_buf->i387.mxcsr = MXCSR_DEFAULT;
+
+ clts();
+ /*
+ * Init all the features state with header_bv being 0x0
+ */
+ xrstor_state(init_xstate_buf, -1);
+ /*
+ * Dump the init state again. This is to identify the init state
+ * of any feature which is not represented by all zero's.
+ */
+ xsave_state(init_xstate_buf, -1);
+ stts();
}
/*
* Enable and initialize the xsave feature.
*/
-void __ref xsave_cntxt_init(void)
+static void __init xstate_enable_boot_cpu(void)
{
unsigned int eax, ebx, ecx, edx;
- cpuid_count(0xd, 0, &eax, &ebx, &ecx, &edx);
+ if (boot_cpu_data.cpuid_level < XSTATE_CPUID) {
+ WARN(1, KERN_ERR "XSTATE_CPUID missing\n");
+ return;
+ }
+
+ cpuid_count(XSTATE_CPUID, 0, &eax, &ebx, &ecx, &edx);
pcntxt_mask = eax + ((u64)edx << 32);
if ((pcntxt_mask & XSTATE_FPSSE) != XSTATE_FPSSE) {
@@ -329,12 +435,13 @@ void __ref xsave_cntxt_init(void)
* Support only the state known to OS.
*/
pcntxt_mask = pcntxt_mask & XCNTXT_MASK;
- xsave_init();
+
+ xstate_enable();
/*
* Recompute the context size for enabled features
*/
- cpuid_count(0xd, 0, &eax, &ebx, &ecx, &edx);
+ cpuid_count(XSTATE_CPUID, 0, &eax, &ebx, &ecx, &edx);
xstate_size = ebx;
update_regset_xstate_info(xstate_size, pcntxt_mask);
@@ -346,3 +453,23 @@ void __ref xsave_cntxt_init(void)
"cntxt size 0x%x\n",
pcntxt_mask, xstate_size);
}
+
+/*
+ * For the very first instance, this calls xstate_enable_boot_cpu();
+ * for all subsequent instances, this calls xstate_enable().
+ *
+ * This is somewhat obfuscated due to the lack of powerful enough
+ * overrides for the section checks.
+ */
+void __cpuinit xsave_init(void)
+{
+ static __refdata void (*next_func)(void) = xstate_enable_boot_cpu;
+ void (*this_func)(void);
+
+ if (!cpu_has_xsave)
+ return;
+
+ this_func = next_func;
+ next_func = xstate_enable;
+ this_func();
+}