diff options
Diffstat (limited to 'arch/x86/kernel')
120 files changed, 6334 insertions, 4895 deletions
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index 4c58352209e0..0925676266bd 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -47,8 +47,6 @@ obj-$(CONFIG_X86_TRAMPOLINE) += trampoline.o obj-y += process.o obj-y += i387.o xsave.o obj-y += ptrace.o -obj-$(CONFIG_X86_DS) += ds.o -obj-$(CONFIG_X86_DS_SELFTEST) += ds_selftest.o obj-$(CONFIG_X86_32) += tls.o obj-$(CONFIG_IA32_EMULATION) += tls.o obj-y += step.o @@ -106,6 +104,7 @@ obj-$(CONFIG_SCx200) += scx200.o scx200-y += scx200_32.o obj-$(CONFIG_OLPC) += olpc.o +obj-$(CONFIG_OLPC_OPENFIRMWARE) += olpc_ofw.o obj-$(CONFIG_X86_MRST) += mrst.o microcode-y := microcode_core.o diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c index cd40aba6aa95..c05872aa3ce0 100644 --- a/arch/x86/kernel/acpi/boot.c +++ b/arch/x86/kernel/acpi/boot.c @@ -63,7 +63,6 @@ EXPORT_SYMBOL(acpi_disabled); int acpi_noirq; /* skip ACPI IRQ initialization */ int acpi_pci_disabled; /* skip ACPI PCI scan and IRQ initialization */ EXPORT_SYMBOL(acpi_pci_disabled); -int acpi_ht __initdata = 1; /* enable HT */ int acpi_lapic; int acpi_ioapic; @@ -94,6 +93,53 @@ enum acpi_irq_model_id acpi_irq_model = ACPI_IRQ_MODEL_PIC; /* + * ISA irqs by default are the first 16 gsis but can be + * any gsi as specified by an interrupt source override. + */ +static u32 isa_irq_to_gsi[NR_IRQS_LEGACY] __read_mostly = { + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 +}; + +static unsigned int gsi_to_irq(unsigned int gsi) +{ + unsigned int irq = gsi + NR_IRQS_LEGACY; + unsigned int i; + + for (i = 0; i < NR_IRQS_LEGACY; i++) { + if (isa_irq_to_gsi[i] == gsi) { + return i; + } + } + + /* Provide an identity mapping of gsi == irq + * except on truly weird platforms that have + * non isa irqs in the first 16 gsis. + */ + if (gsi >= NR_IRQS_LEGACY) + irq = gsi; + else + irq = gsi_top + gsi; + + return irq; +} + +static u32 irq_to_gsi(int irq) +{ + unsigned int gsi; + + if (irq < NR_IRQS_LEGACY) + gsi = isa_irq_to_gsi[irq]; + else if (irq < gsi_top) + gsi = irq; + else if (irq < (gsi_top + NR_IRQS_LEGACY)) + gsi = irq - gsi_top; + else + gsi = 0xffffffff; + + return gsi; +} + +/* * Temporarily use the virtual area starting from FIX_IO_APIC_BASE_END, * to map the target physical address. The problem is that set_fixmap() * provides a single page, and it is possible that the page is not @@ -313,7 +359,7 @@ acpi_parse_ioapic(struct acpi_subtable_header * header, const unsigned long end) /* * Parse Interrupt Source Override for the ACPI SCI */ -static void __init acpi_sci_ioapic_setup(u32 gsi, u16 polarity, u16 trigger) +static void __init acpi_sci_ioapic_setup(u8 bus_irq, u16 polarity, u16 trigger, u32 gsi) { if (trigger == 0) /* compatible SCI trigger is level */ trigger = 3; @@ -333,7 +379,7 @@ static void __init acpi_sci_ioapic_setup(u32 gsi, u16 polarity, u16 trigger) * If GSI is < 16, this will update its flags, * else it will create a new mp_irqs[] entry. */ - mp_override_legacy_irq(gsi, polarity, trigger, gsi); + mp_override_legacy_irq(bus_irq, polarity, trigger, gsi); /* * stash over-ride to indicate we've been here @@ -357,9 +403,10 @@ acpi_parse_int_src_ovr(struct acpi_subtable_header * header, acpi_table_print_madt_entry(header); if (intsrc->source_irq == acpi_gbl_FADT.sci_interrupt) { - acpi_sci_ioapic_setup(intsrc->global_irq, + acpi_sci_ioapic_setup(intsrc->source_irq, intsrc->inti_flags & ACPI_MADT_POLARITY_MASK, - (intsrc->inti_flags & ACPI_MADT_TRIGGER_MASK) >> 2); + (intsrc->inti_flags & ACPI_MADT_TRIGGER_MASK) >> 2, + intsrc->global_irq); return 0; } @@ -448,7 +495,7 @@ void __init acpi_pic_sci_set_trigger(unsigned int irq, u16 trigger) int acpi_gsi_to_irq(u32 gsi, unsigned int *irq) { - *irq = gsi; + *irq = gsi_to_irq(gsi); #ifdef CONFIG_X86_IO_APIC if (acpi_irq_model == ACPI_IRQ_MODEL_IOAPIC) @@ -458,6 +505,14 @@ int acpi_gsi_to_irq(u32 gsi, unsigned int *irq) return 0; } +int acpi_isa_irq_to_gsi(unsigned isa_irq, u32 *gsi) +{ + if (isa_irq >= 16) + return -1; + *gsi = irq_to_gsi(isa_irq); + return 0; +} + /* * success: return IRQ number (>=0) * failure: return < 0 @@ -482,7 +537,7 @@ int acpi_register_gsi(struct device *dev, u32 gsi, int trigger, int polarity) plat_gsi = mp_register_gsi(dev, gsi, trigger, polarity); } #endif - irq = plat_gsi; + irq = gsi_to_irq(plat_gsi); return irq; } @@ -867,29 +922,6 @@ static int __init acpi_parse_madt_lapic_entries(void) extern int es7000_plat; #endif -int __init acpi_probe_gsi(void) -{ - int idx; - int gsi; - int max_gsi = 0; - - if (acpi_disabled) - return 0; - - if (!acpi_ioapic) - return 0; - - max_gsi = 0; - for (idx = 0; idx < nr_ioapics; idx++) { - gsi = mp_gsi_routing[idx].gsi_end; - - if (gsi > max_gsi) - max_gsi = gsi; - } - - return max_gsi + 1; -} - static void assign_to_mp_irq(struct mpc_intsrc *m, struct mpc_intsrc *mp_irq) { @@ -947,13 +979,13 @@ void __init mp_override_legacy_irq(u8 bus_irq, u8 polarity, u8 trigger, u32 gsi) mp_irq.dstirq = pin; /* INTIN# */ save_mp_irq(&mp_irq); + + isa_irq_to_gsi[bus_irq] = gsi; } void __init mp_config_acpi_legacy_irqs(void) { int i; - int ioapic; - unsigned int dstapic; struct mpc_intsrc mp_irq; #if defined (CONFIG_MCA) || defined (CONFIG_EISA) @@ -974,19 +1006,27 @@ void __init mp_config_acpi_legacy_irqs(void) #endif /* - * Locate the IOAPIC that manages the ISA IRQs (0-15). - */ - ioapic = mp_find_ioapic(0); - if (ioapic < 0) - return; - dstapic = mp_ioapics[ioapic].apicid; - - /* * Use the default configuration for the IRQs 0-15. Unless * overridden by (MADT) interrupt source override entries. */ for (i = 0; i < 16; i++) { + int ioapic, pin; + unsigned int dstapic; int idx; + u32 gsi; + + /* Locate the gsi that irq i maps to. */ + if (acpi_isa_irq_to_gsi(i, &gsi)) + continue; + + /* + * Locate the IOAPIC that manages the ISA IRQ. + */ + ioapic = mp_find_ioapic(gsi); + if (ioapic < 0) + continue; + pin = mp_find_ioapic_pin(ioapic, gsi); + dstapic = mp_ioapics[ioapic].apicid; for (idx = 0; idx < mp_irq_entries; idx++) { struct mpc_intsrc *irq = mp_irqs + idx; @@ -996,7 +1036,7 @@ void __init mp_config_acpi_legacy_irqs(void) break; /* Do we already have a mapping for this IOAPIC pin */ - if (irq->dstapic == dstapic && irq->dstirq == i) + if (irq->dstapic == dstapic && irq->dstirq == pin) break; } @@ -1011,7 +1051,7 @@ void __init mp_config_acpi_legacy_irqs(void) mp_irq.dstapic = dstapic; mp_irq.irqtype = mp_INT; mp_irq.srcbusirq = i; /* Identity mapped */ - mp_irq.dstirq = i; + mp_irq.dstirq = pin; save_mp_irq(&mp_irq); } @@ -1076,11 +1116,6 @@ int mp_register_gsi(struct device *dev, u32 gsi, int trigger, int polarity) ioapic_pin = mp_find_ioapic_pin(ioapic, gsi); -#ifdef CONFIG_X86_32 - if (ioapic_renumber_irq) - gsi = ioapic_renumber_irq(ioapic, gsi); -#endif - if (ioapic_pin > MP_MAX_IOAPIC_PIN) { printk(KERN_ERR "Invalid reference to IOAPIC pin " "%d-%d\n", mp_ioapics[ioapic].apicid, @@ -1094,7 +1129,7 @@ int mp_register_gsi(struct device *dev, u32 gsi, int trigger, int polarity) set_io_apic_irq_attr(&irq_attr, ioapic, ioapic_pin, trigger == ACPI_EDGE_SENSITIVE ? 0 : 1, polarity == ACPI_ACTIVE_HIGH ? 0 : 1); - io_apic_set_pci_routing(dev, gsi, &irq_attr); + io_apic_set_pci_routing(dev, gsi_to_irq(gsi), &irq_attr); return gsi; } @@ -1154,7 +1189,8 @@ static int __init acpi_parse_madt_ioapic_entries(void) * pretend we got one so we can set the SCI flags. */ if (!acpi_sci_override_gsi) - acpi_sci_ioapic_setup(acpi_gbl_FADT.sci_interrupt, 0, 0); + acpi_sci_ioapic_setup(acpi_gbl_FADT.sci_interrupt, 0, 0, + acpi_gbl_FADT.sci_interrupt); /* Fill in identity legacy mappings where no override */ mp_config_acpi_legacy_irqs(); @@ -1464,9 +1500,8 @@ void __init acpi_boot_table_init(void) /* * If acpi_disabled, bail out - * One exception: acpi=ht continues far enough to enumerate LAPICs */ - if (acpi_disabled && !acpi_ht) + if (acpi_disabled) return; /* @@ -1497,9 +1532,8 @@ int __init early_acpi_boot_init(void) { /* * If acpi_disabled, bail out - * One exception: acpi=ht continues far enough to enumerate LAPICs */ - if (acpi_disabled && !acpi_ht) + if (acpi_disabled) return 1; /* @@ -1517,9 +1551,8 @@ int __init acpi_boot_init(void) /* * If acpi_disabled, bail out - * One exception: acpi=ht continues far enough to enumerate LAPICs */ - if (acpi_disabled && !acpi_ht) + if (acpi_disabled) return 1; acpi_table_parse(ACPI_SIG_BOOT, acpi_parse_sbf); @@ -1554,21 +1587,12 @@ static int __init parse_acpi(char *arg) /* acpi=force to over-ride black-list */ else if (strcmp(arg, "force") == 0) { acpi_force = 1; - acpi_ht = 1; acpi_disabled = 0; } /* acpi=strict disables out-of-spec workarounds */ else if (strcmp(arg, "strict") == 0) { acpi_strict = 1; } - /* Limit ACPI just to boot-time to enable HT */ - else if (strcmp(arg, "ht") == 0) { - if (!acpi_force) { - printk(KERN_WARNING "acpi=ht will be removed in Linux-2.6.35\n"); - disable_acpi(); - } - acpi_ht = 1; - } /* acpi=rsdt use RSDT instead of XSDT */ else if (strcmp(arg, "rsdt") == 0) { acpi_rsdt_forced = 1; @@ -1576,6 +1600,10 @@ static int __init parse_acpi(char *arg) /* "acpi=noirq" disables ACPI interrupt routing */ else if (strcmp(arg, "noirq") == 0) { acpi_noirq_set(); + } + /* "acpi=copy_dsdt" copys DSDT */ + else if (strcmp(arg, "copy_dsdt") == 0) { + acpi_gbl_copy_dsdt_locally = 1; } else { /* Core will printk when we return error. */ return -EINVAL; diff --git a/arch/x86/kernel/acpi/cstate.c b/arch/x86/kernel/acpi/cstate.c index 2e837f5080fe..fb7a5f052e2b 100644 --- a/arch/x86/kernel/acpi/cstate.c +++ b/arch/x86/kernel/acpi/cstate.c @@ -145,6 +145,15 @@ int acpi_processor_ffh_cstate_probe(unsigned int cpu, percpu_entry->states[cx->index].eax = cx->address; percpu_entry->states[cx->index].ecx = MWAIT_ECX_INTERRUPT_BREAK; } + + /* + * For _CST FFH on Intel, if GAS.access_size bit 1 is cleared, + * then we should skip checking BM_STS for this C-state. + * ref: "Intel Processor Vendor-Specific ACPI Interface Specification" + */ + if ((c->x86_vendor == X86_VENDOR_INTEL) && !(reg->access_size & 0x2)) + cx->bm_sts_skip = 1; + return retval; } EXPORT_SYMBOL_GPL(acpi_processor_ffh_cstate_probe); diff --git a/arch/x86/kernel/acpi/realmode/wakeup.S b/arch/x86/kernel/acpi/realmode/wakeup.S index 580b4e296010..28595d6df47c 100644 --- a/arch/x86/kernel/acpi/realmode/wakeup.S +++ b/arch/x86/kernel/acpi/realmode/wakeup.S @@ -104,7 +104,7 @@ _start: movl %eax, %ecx orl %edx, %ecx jz 1f - movl $0xc0000080, %ecx + movl $MSR_EFER, %ecx wrmsr 1: diff --git a/arch/x86/kernel/acpi/sleep.c b/arch/x86/kernel/acpi/sleep.c index f9961034e557..33cec152070d 100644 --- a/arch/x86/kernel/acpi/sleep.c +++ b/arch/x86/kernel/acpi/sleep.c @@ -2,7 +2,7 @@ * sleep.c - x86-specific ACPI sleep support. * * Copyright (C) 2001-2003 Patrick Mochel - * Copyright (C) 2001-2003 Pavel Machek <pavel@suse.cz> + * Copyright (C) 2001-2003 Pavel Machek <pavel@ucw.cz> */ #include <linux/acpi.h> @@ -157,13 +157,16 @@ static int __init acpi_sleep_setup(char *str) #ifdef CONFIG_HIBERNATION if (strncmp(str, "s4_nohwsig", 10) == 0) acpi_no_s4_hw_signature(); - if (strncmp(str, "s4_nonvs", 8) == 0) - acpi_s4_no_nvs(); + if (strncmp(str, "s4_nonvs", 8) == 0) { + pr_warning("ACPI: acpi_sleep=s4_nonvs is deprecated, " + "please use acpi_sleep=nonvs instead"); + acpi_nvs_nosave(); + } #endif + if (strncmp(str, "nonvs", 5) == 0) + acpi_nvs_nosave(); if (strncmp(str, "old_ordering", 12) == 0) acpi_old_suspend_ordering(); - if (strncmp(str, "sci_force_enable", 16) == 0) - acpi_set_sci_en_on_resume(); str = strchr(str, ','); if (str != NULL) str += strspn(str, ", \t"); diff --git a/arch/x86/kernel/acpi/wakeup_32.S b/arch/x86/kernel/acpi/wakeup_32.S index 8ded418b0593..13ab720573e3 100644 --- a/arch/x86/kernel/acpi/wakeup_32.S +++ b/arch/x86/kernel/acpi/wakeup_32.S @@ -1,4 +1,4 @@ - .section .text.page_aligned + .section .text..page_aligned #include <linux/linkage.h> #include <asm/segment.h> #include <asm/page_types.h> diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c index 1a160d5d44d0..f65ab8b014c4 100644 --- a/arch/x86/kernel/alternative.c +++ b/arch/x86/kernel/alternative.c @@ -194,7 +194,7 @@ static void __init_or_module add_nops(void *insns, unsigned int len) } extern struct alt_instr __alt_instructions[], __alt_instructions_end[]; -extern u8 *__smp_locks[], *__smp_locks_end[]; +extern s32 __smp_locks[], __smp_locks_end[]; static void *text_poke_early(void *addr, const void *opcode, size_t len); /* Replace instructions with better alternatives for this CPU type. @@ -214,6 +214,7 @@ void __init_or_module apply_alternatives(struct alt_instr *start, u8 *instr = a->instr; BUG_ON(a->replacementlen > a->instrlen); BUG_ON(a->instrlen > sizeof(insnbuf)); + BUG_ON(a->cpuid >= NCAPINTS*32); if (!boot_cpu_has(a->cpuid)) continue; #ifdef CONFIG_X86_64 @@ -235,37 +236,41 @@ void __init_or_module apply_alternatives(struct alt_instr *start, #ifdef CONFIG_SMP -static void alternatives_smp_lock(u8 **start, u8 **end, u8 *text, u8 *text_end) +static void alternatives_smp_lock(const s32 *start, const s32 *end, + u8 *text, u8 *text_end) { - u8 **ptr; + const s32 *poff; mutex_lock(&text_mutex); - for (ptr = start; ptr < end; ptr++) { - if (*ptr < text) - continue; - if (*ptr > text_end) + for (poff = start; poff < end; poff++) { + u8 *ptr = (u8 *)poff + *poff; + + if (!*poff || ptr < text || ptr >= text_end) continue; /* turn DS segment override prefix into lock prefix */ - text_poke(*ptr, ((unsigned char []){0xf0}), 1); + if (*ptr == 0x3e) + text_poke(ptr, ((unsigned char []){0xf0}), 1); }; mutex_unlock(&text_mutex); } -static void alternatives_smp_unlock(u8 **start, u8 **end, u8 *text, u8 *text_end) +static void alternatives_smp_unlock(const s32 *start, const s32 *end, + u8 *text, u8 *text_end) { - u8 **ptr; + const s32 *poff; if (noreplace_smp) return; mutex_lock(&text_mutex); - for (ptr = start; ptr < end; ptr++) { - if (*ptr < text) - continue; - if (*ptr > text_end) + for (poff = start; poff < end; poff++) { + u8 *ptr = (u8 *)poff + *poff; + + if (!*poff || ptr < text || ptr >= text_end) continue; /* turn lock prefix into DS segment override prefix */ - text_poke(*ptr, ((unsigned char []){0x3E}), 1); + if (*ptr == 0xf0) + text_poke(ptr, ((unsigned char []){0x3E}), 1); }; mutex_unlock(&text_mutex); } @@ -276,8 +281,8 @@ struct smp_alt_module { char *name; /* ptrs to lock prefixes */ - u8 **locks; - u8 **locks_end; + const s32 *locks; + const s32 *locks_end; /* .text segment, needed to avoid patching init code ;) */ u8 *text; @@ -398,16 +403,19 @@ void alternatives_smp_switch(int smp) int alternatives_text_reserved(void *start, void *end) { struct smp_alt_module *mod; - u8 **ptr; + const s32 *poff; u8 *text_start = start; u8 *text_end = end; list_for_each_entry(mod, &smp_alt_modules, next) { if (mod->text > text_end || mod->text_end < text_start) continue; - for (ptr = mod->locks; ptr < mod->locks_end; ptr++) - if (text_start <= *ptr && text_end >= *ptr) + for (poff = mod->locks; poff < mod->locks_end; poff++) { + const u8 *ptr = (const u8 *)poff + *poff; + + if (text_start <= ptr && text_end > ptr) return 1; + } } return 0; diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c index f854d89b7edf..fa044e1e30a2 100644 --- a/arch/x86/kernel/amd_iommu.c +++ b/arch/x86/kernel/amd_iommu.c @@ -731,18 +731,22 @@ static bool increase_address_space(struct protection_domain *domain, static u64 *alloc_pte(struct protection_domain *domain, unsigned long address, - int end_lvl, + unsigned long page_size, u64 **pte_page, gfp_t gfp) { + int level, end_lvl; u64 *pte, *page; - int level; + + BUG_ON(!is_power_of_2(page_size)); while (address > PM_LEVEL_SIZE(domain->mode)) increase_address_space(domain, gfp); - level = domain->mode - 1; - pte = &domain->pt_root[PM_LEVEL_INDEX(level, address)]; + level = domain->mode - 1; + pte = &domain->pt_root[PM_LEVEL_INDEX(level, address)]; + address = PAGE_SIZE_ALIGN(address, page_size); + end_lvl = PAGE_SIZE_LEVEL(page_size); while (level > end_lvl) { if (!IOMMU_PTE_PRESENT(*pte)) { @@ -752,6 +756,10 @@ static u64 *alloc_pte(struct protection_domain *domain, *pte = PM_LEVEL_PDE(level, virt_to_phys(page)); } + /* No level skipping support yet */ + if (PM_PTE_LEVEL(*pte) != level) + return NULL; + level -= 1; pte = IOMMU_PTE_PAGE(*pte); @@ -769,28 +777,47 @@ static u64 *alloc_pte(struct protection_domain *domain, * This function checks if there is a PTE for a given dma address. If * there is one, it returns the pointer to it. */ -static u64 *fetch_pte(struct protection_domain *domain, - unsigned long address, int map_size) +static u64 *fetch_pte(struct protection_domain *domain, unsigned long address) { int level; u64 *pte; - level = domain->mode - 1; - pte = &domain->pt_root[PM_LEVEL_INDEX(level, address)]; + if (address > PM_LEVEL_SIZE(domain->mode)) + return NULL; - while (level > map_size) { + level = domain->mode - 1; + pte = &domain->pt_root[PM_LEVEL_INDEX(level, address)]; + + while (level > 0) { + + /* Not Present */ if (!IOMMU_PTE_PRESENT(*pte)) return NULL; + /* Large PTE */ + if (PM_PTE_LEVEL(*pte) == 0x07) { + unsigned long pte_mask, __pte; + + /* + * If we have a series of large PTEs, make + * sure to return a pointer to the first one. + */ + pte_mask = PTE_PAGE_SIZE(*pte); + pte_mask = ~((PAGE_SIZE_PTE_COUNT(pte_mask) << 3) - 1); + __pte = ((unsigned long)pte) & pte_mask; + + return (u64 *)__pte; + } + + /* No level skipping support yet */ + if (PM_PTE_LEVEL(*pte) != level) + return NULL; + level -= 1; + /* Walk to the next level */ pte = IOMMU_PTE_PAGE(*pte); pte = &pte[PM_LEVEL_INDEX(level, address)]; - - if ((PM_PTE_LEVEL(*pte) == 0) && level != map_size) { - pte = NULL; - break; - } } return pte; @@ -807,44 +834,84 @@ static int iommu_map_page(struct protection_domain *dom, unsigned long bus_addr, unsigned long phys_addr, int prot, - int map_size) + unsigned long page_size) { u64 __pte, *pte; - - bus_addr = PAGE_ALIGN(bus_addr); - phys_addr = PAGE_ALIGN(phys_addr); - - BUG_ON(!PM_ALIGNED(map_size, bus_addr)); - BUG_ON(!PM_ALIGNED(map_size, phys_addr)); + int i, count; if (!(prot & IOMMU_PROT_MASK)) return -EINVAL; - pte = alloc_pte(dom, bus_addr, map_size, NULL, GFP_KERNEL); + bus_addr = PAGE_ALIGN(bus_addr); + phys_addr = PAGE_ALIGN(phys_addr); + count = PAGE_SIZE_PTE_COUNT(page_size); + pte = alloc_pte(dom, bus_addr, page_size, NULL, GFP_KERNEL); + + for (i = 0; i < count; ++i) + if (IOMMU_PTE_PRESENT(pte[i])) + return -EBUSY; - if (IOMMU_PTE_PRESENT(*pte)) - return -EBUSY; + if (page_size > PAGE_SIZE) { + __pte = PAGE_SIZE_PTE(phys_addr, page_size); + __pte |= PM_LEVEL_ENC(7) | IOMMU_PTE_P | IOMMU_PTE_FC; + } else + __pte = phys_addr | IOMMU_PTE_P | IOMMU_PTE_FC; - __pte = phys_addr | IOMMU_PTE_P; if (prot & IOMMU_PROT_IR) __pte |= IOMMU_PTE_IR; if (prot & IOMMU_PROT_IW) __pte |= IOMMU_PTE_IW; - *pte = __pte; + for (i = 0; i < count; ++i) + pte[i] = __pte; update_domain(dom); return 0; } -static void iommu_unmap_page(struct protection_domain *dom, - unsigned long bus_addr, int map_size) +static unsigned long iommu_unmap_page(struct protection_domain *dom, + unsigned long bus_addr, + unsigned long page_size) { - u64 *pte = fetch_pte(dom, bus_addr, map_size); + unsigned long long unmap_size, unmapped; + u64 *pte; + + BUG_ON(!is_power_of_2(page_size)); + + unmapped = 0; + + while (unmapped < page_size) { + + pte = fetch_pte(dom, bus_addr); + + if (!pte) { + /* + * No PTE for this address + * move forward in 4kb steps + */ + unmap_size = PAGE_SIZE; + } else if (PM_PTE_LEVEL(*pte) == 0) { + /* 4kb PTE found for this address */ + unmap_size = PAGE_SIZE; + *pte = 0ULL; + } else { + int count, i; + + /* Large PTE found which maps this address */ + unmap_size = PTE_PAGE_SIZE(*pte); + count = PAGE_SIZE_PTE_COUNT(unmap_size); + for (i = 0; i < count; i++) + pte[i] = 0ULL; + } - if (pte) - *pte = 0; + bus_addr = (bus_addr & ~(unmap_size - 1)) + unmap_size; + unmapped += unmap_size; + } + + BUG_ON(!is_power_of_2(unmapped)); + + return unmapped; } /* @@ -878,7 +945,7 @@ static int dma_ops_unity_map(struct dma_ops_domain *dma_dom, for (addr = e->address_start; addr < e->address_end; addr += PAGE_SIZE) { ret = iommu_map_page(&dma_dom->domain, addr, addr, e->prot, - PM_MAP_4k); + PAGE_SIZE); if (ret) return ret; /* @@ -1006,7 +1073,7 @@ static int alloc_new_range(struct dma_ops_domain *dma_dom, u64 *pte, *pte_page; for (i = 0; i < num_ptes; ++i) { - pte = alloc_pte(&dma_dom->domain, address, PM_MAP_4k, + pte = alloc_pte(&dma_dom->domain, address, PAGE_SIZE, &pte_page, gfp); if (!pte) goto out_free; @@ -1042,7 +1109,7 @@ static int alloc_new_range(struct dma_ops_domain *dma_dom, for (i = dma_dom->aperture[index]->offset; i < dma_dom->aperture_size; i += PAGE_SIZE) { - u64 *pte = fetch_pte(&dma_dom->domain, i, PM_MAP_4k); + u64 *pte = fetch_pte(&dma_dom->domain, i); if (!pte || !IOMMU_PTE_PRESENT(*pte)) continue; @@ -1420,6 +1487,7 @@ static int __attach_device(struct device *dev, struct protection_domain *domain) { struct iommu_dev_data *dev_data, *alias_data; + int ret; dev_data = get_dev_data(dev); alias_data = get_dev_data(dev_data->alias); @@ -1431,13 +1499,14 @@ static int __attach_device(struct device *dev, spin_lock(&domain->lock); /* Some sanity checks */ + ret = -EBUSY; if (alias_data->domain != NULL && alias_data->domain != domain) - return -EBUSY; + goto out_unlock; if (dev_data->domain != NULL && dev_data->domain != domain) - return -EBUSY; + goto out_unlock; /* Do real assignment */ if (dev_data->alias != dev) { @@ -1453,10 +1522,14 @@ static int __attach_device(struct device *dev, atomic_inc(&dev_data->bind); + ret = 0; + +out_unlock: + /* ready */ spin_unlock(&domain->lock); - return 0; + return ret; } /* @@ -1712,7 +1785,7 @@ static u64* dma_ops_get_pte(struct dma_ops_domain *dom, pte = aperture->pte_pages[APERTURE_PAGE_INDEX(address)]; if (!pte) { - pte = alloc_pte(&dom->domain, address, PM_MAP_4k, &pte_page, + pte = alloc_pte(&dom->domain, address, PAGE_SIZE, &pte_page, GFP_ATOMIC); aperture->pte_pages[APERTURE_PAGE_INDEX(address)] = pte_page; } else @@ -2257,10 +2330,6 @@ int __init amd_iommu_init_dma_ops(void) iommu_detected = 1; swiotlb = 0; -#ifdef CONFIG_GART_IOMMU - gart_iommu_aperture_disabled = 1; - gart_iommu_aperture = 0; -#endif /* Make the driver finally visible to the drivers */ dma_ops = &amd_iommu_dma_ops; @@ -2439,12 +2508,11 @@ static int amd_iommu_attach_device(struct iommu_domain *dom, return ret; } -static int amd_iommu_map_range(struct iommu_domain *dom, - unsigned long iova, phys_addr_t paddr, - size_t size, int iommu_prot) +static int amd_iommu_map(struct iommu_domain *dom, unsigned long iova, + phys_addr_t paddr, int gfp_order, int iommu_prot) { + unsigned long page_size = 0x1000UL << gfp_order; struct protection_domain *domain = dom->priv; - unsigned long i, npages = iommu_num_pages(paddr, size, PAGE_SIZE); int prot = 0; int ret; @@ -2453,61 +2521,50 @@ static int amd_iommu_map_range(struct iommu_domain *dom, if (iommu_prot & IOMMU_WRITE) prot |= IOMMU_PROT_IW; - iova &= PAGE_MASK; - paddr &= PAGE_MASK; - mutex_lock(&domain->api_lock); - - for (i = 0; i < npages; ++i) { - ret = iommu_map_page(domain, iova, paddr, prot, PM_MAP_4k); - if (ret) - return ret; - - iova += PAGE_SIZE; - paddr += PAGE_SIZE; - } - + ret = iommu_map_page(domain, iova, paddr, prot, page_size); mutex_unlock(&domain->api_lock); - return 0; + return ret; } -static void amd_iommu_unmap_range(struct iommu_domain *dom, - unsigned long iova, size_t size) +static int amd_iommu_unmap(struct iommu_domain *dom, unsigned long iova, + int gfp_order) { - struct protection_domain *domain = dom->priv; - unsigned long i, npages = iommu_num_pages(iova, size, PAGE_SIZE); + unsigned long page_size, unmap_size; - iova &= PAGE_MASK; + page_size = 0x1000UL << gfp_order; mutex_lock(&domain->api_lock); - - for (i = 0; i < npages; ++i) { - iommu_unmap_page(domain, iova, PM_MAP_4k); - iova += PAGE_SIZE; - } + unmap_size = iommu_unmap_page(domain, iova, page_size); + mutex_unlock(&domain->api_lock); iommu_flush_tlb_pde(domain); - mutex_unlock(&domain->api_lock); + return get_order(unmap_size); } static phys_addr_t amd_iommu_iova_to_phys(struct iommu_domain *dom, unsigned long iova) { struct protection_domain *domain = dom->priv; - unsigned long offset = iova & ~PAGE_MASK; + unsigned long offset_mask; phys_addr_t paddr; - u64 *pte; + u64 *pte, __pte; - pte = fetch_pte(domain, iova, PM_MAP_4k); + pte = fetch_pte(domain, iova); if (!pte || !IOMMU_PTE_PRESENT(*pte)) return 0; - paddr = *pte & IOMMU_PAGE_MASK; - paddr |= offset; + if (PM_PTE_LEVEL(*pte) == 0) + offset_mask = PAGE_SIZE - 1; + else + offset_mask = PTE_PAGE_SIZE(*pte) - 1; + + __pte = *pte & PM_ADDR_MASK; + paddr = (__pte & ~offset_mask) | (iova & offset_mask); return paddr; } @@ -2515,6 +2572,11 @@ static phys_addr_t amd_iommu_iova_to_phys(struct iommu_domain *dom, static int amd_iommu_domain_has_cap(struct iommu_domain *domain, unsigned long cap) { + switch (cap) { + case IOMMU_CAP_CACHE_COHERENCY: + return 1; + } + return 0; } @@ -2523,8 +2585,8 @@ static struct iommu_ops amd_iommu_ops = { .domain_destroy = amd_iommu_domain_destroy, .attach_dev = amd_iommu_attach_device, .detach_dev = amd_iommu_detach_device, - .map = amd_iommu_map_range, - .unmap = amd_iommu_unmap_range, + .map = amd_iommu_map, + .unmap = amd_iommu_unmap, .iova_to_phys = amd_iommu_iova_to_phys, .domain_has_cap = amd_iommu_domain_has_cap, }; @@ -2552,8 +2614,7 @@ int __init amd_iommu_init_passthrough(void) pt_domain->mode |= PAGE_MODE_NONE; - while ((dev = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, dev)) != NULL) { - + for_each_pci_dev(dev) { if (!check_device(&dev->dev)) continue; diff --git a/arch/x86/kernel/amd_iommu_init.c b/arch/x86/kernel/amd_iommu_init.c index 6360abf993d4..3cc63e2b8dd4 100644 --- a/arch/x86/kernel/amd_iommu_init.c +++ b/arch/x86/kernel/amd_iommu_init.c @@ -120,6 +120,7 @@ struct ivmd_header { bool amd_iommu_dump; static int __initdata amd_iommu_detected; +static bool __initdata amd_iommu_disabled; u16 amd_iommu_last_bdf; /* largest PCI device id we have to handle */ @@ -286,8 +287,12 @@ static u8 * __init iommu_map_mmio_space(u64 address) { u8 *ret; - if (!request_mem_region(address, MMIO_REGION_LENGTH, "amd_iommu")) + if (!request_mem_region(address, MMIO_REGION_LENGTH, "amd_iommu")) { + pr_err("AMD-Vi: Can not reserve memory region %llx for mmio\n", + address); + pr_err("AMD-Vi: This is a BIOS bug. Please contact your hardware vendor\n"); return NULL; + } ret = ioremap_nocache(address, MMIO_REGION_LENGTH); if (ret != NULL) @@ -1313,7 +1318,7 @@ static int __init amd_iommu_init(void) ret = amd_iommu_init_dma_ops(); if (ret) - goto free; + goto free_disable; amd_iommu_init_api(); @@ -1331,9 +1336,10 @@ static int __init amd_iommu_init(void) out: return ret; -free: +free_disable: disable_iommus(); +free: amd_iommu_uninit_devices(); free_pages((unsigned long)amd_iommu_pd_alloc_bitmap, @@ -1352,6 +1358,15 @@ free: free_unity_maps(); +#ifdef CONFIG_GART_IOMMU + /* + * We failed to initialize the AMD IOMMU - try fallback to GART + * if possible. + */ + gart_iommu_init(); + +#endif + goto out; } @@ -1372,6 +1387,9 @@ void __init amd_iommu_detect(void) if (no_iommu || (iommu_detected && !gart_iommu_aperture)) return; + if (amd_iommu_disabled) + return; + if (acpi_table_parse("IVRS", early_amd_iommu_detect) == 0) { iommu_detected = 1; amd_iommu_detected = 1; @@ -1401,6 +1419,8 @@ static int __init parse_amd_iommu_options(char *str) for (; *str; ++str) { if (strncmp(str, "fullflush", 9) == 0) amd_iommu_unmap_flush = true; + if (strncmp(str, "off", 3) == 0) + amd_iommu_disabled = true; } return 1; diff --git a/arch/x86/kernel/apb_timer.c b/arch/x86/kernel/apb_timer.c index a35347501d36..8dd77800ff5d 100644 --- a/arch/x86/kernel/apb_timer.c +++ b/arch/x86/kernel/apb_timer.c @@ -43,10 +43,11 @@ #include <asm/fixmap.h> #include <asm/apb_timer.h> +#include <asm/mrst.h> #define APBT_MASK CLOCKSOURCE_MASK(32) #define APBT_SHIFT 22 -#define APBT_CLOCKEVENT_RATING 150 +#define APBT_CLOCKEVENT_RATING 110 #define APBT_CLOCKSOURCE_RATING 250 #define APBT_MIN_DELTA_USEC 200 @@ -83,8 +84,6 @@ struct apbt_dev { char name[10]; }; -int disable_apbt_percpu __cpuinitdata; - static DEFINE_PER_CPU(struct apbt_dev, cpu_apbt_dev); #ifdef CONFIG_SMP @@ -195,29 +194,6 @@ static struct clock_event_device apbt_clockevent = { }; /* - * if user does not want to use per CPU apb timer, just give it a lower rating - * than local apic timer and skip the late per cpu timer init. - */ -static inline int __init setup_x86_mrst_timer(char *arg) -{ - if (!arg) - return -EINVAL; - - if (strcmp("apbt_only", arg) == 0) - disable_apbt_percpu = 0; - else if (strcmp("lapic_and_apbt", arg) == 0) - disable_apbt_percpu = 1; - else { - pr_warning("X86 MRST timer option %s not recognised" - " use x86_mrst_timer=apbt_only or lapic_and_apbt\n", - arg); - return -EINVAL; - } - return 0; -} -__setup("x86_mrst_timer=", setup_x86_mrst_timer); - -/* * start count down from 0xffff_ffff. this is done by toggling the enable bit * then load initial load count to ~0. */ @@ -335,7 +311,7 @@ static int __init apbt_clockevent_register(void) adev->num = smp_processor_id(); memcpy(&adev->evt, &apbt_clockevent, sizeof(struct clock_event_device)); - if (disable_apbt_percpu) { + if (mrst_timer_options == MRST_TIMER_LAPIC_APBT) { apbt_clockevent.rating = APBT_CLOCKEVENT_RATING - 100; global_clock_event = &adev->evt; printk(KERN_DEBUG "%s clockevent registered as global\n", @@ -429,7 +405,8 @@ static int apbt_cpuhp_notify(struct notifier_block *n, static __init int apbt_late_init(void) { - if (disable_apbt_percpu || !apb_timer_block_enabled) + if (mrst_timer_options == MRST_TIMER_LAPIC_APBT || + !apb_timer_block_enabled) return 0; /* This notifier should be called after workqueue is ready */ hotcpu_notifier(apbt_cpuhp_notify, -20); @@ -450,6 +427,8 @@ static void apbt_set_mode(enum clock_event_mode mode, int timer_num; struct apbt_dev *adev = EVT_TO_APBT_DEV(evt); + BUG_ON(!apbt_virt_address); + timer_num = adev->num; pr_debug("%s CPU %d timer %d mode=%d\n", __func__, first_cpu(*evt->cpumask), timer_num, mode); @@ -676,7 +655,7 @@ void __init apbt_time_init(void) } #ifdef CONFIG_SMP /* kernel cmdline disable apb timer, so we will use lapic timers */ - if (disable_apbt_percpu) { + if (mrst_timer_options == MRST_TIMER_LAPIC_APBT) { printk(KERN_INFO "apbt: disabled per cpu timer\n"); return; } diff --git a/arch/x86/kernel/aperture_64.c b/arch/x86/kernel/aperture_64.c index b5d8b0bcf235..a2e0caf26e17 100644 --- a/arch/x86/kernel/aperture_64.c +++ b/arch/x86/kernel/aperture_64.c @@ -280,7 +280,7 @@ void __init early_gart_iommu_check(void) * or BIOS forget to put that in reserved. * try to update e820 to make that region as reserved. */ - u32 agp_aper_base = 0, agp_aper_order = 0; + u32 agp_aper_order = 0; int i, fix, slot, valid_agp = 0; u32 ctl; u32 aper_size = 0, aper_order = 0, last_aper_order = 0; @@ -291,7 +291,7 @@ void __init early_gart_iommu_check(void) return; /* This is mostly duplicate of iommu_hole_init */ - agp_aper_base = search_agp_bridge(&agp_aper_order, &valid_agp); + search_agp_bridge(&agp_aper_order, &valid_agp); fix = 0; for (i = 0; i < ARRAY_SIZE(bus_dev_ranges); i++) { diff --git a/arch/x86/kernel/apic/Makefile b/arch/x86/kernel/apic/Makefile index 565c1bfc507d..910f20b457c4 100644 --- a/arch/x86/kernel/apic/Makefile +++ b/arch/x86/kernel/apic/Makefile @@ -2,7 +2,12 @@ # Makefile for local APIC drivers and for the IO-APIC code # -obj-$(CONFIG_X86_LOCAL_APIC) += apic.o apic_noop.o probe_$(BITS).o ipi.o nmi.o +obj-$(CONFIG_X86_LOCAL_APIC) += apic.o apic_noop.o probe_$(BITS).o ipi.o +ifneq ($(CONFIG_HARDLOCKUP_DETECTOR),y) +obj-$(CONFIG_X86_LOCAL_APIC) += nmi.o +endif +obj-$(CONFIG_HARDLOCKUP_DETECTOR) += hw_nmi.o + obj-$(CONFIG_X86_IO_APIC) += io_apic.o obj-$(CONFIG_SMP) += ipi.o diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index e5a4a1e01618..980508c79082 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -51,6 +51,7 @@ #include <asm/smp.h> #include <asm/mce.h> #include <asm/kvm_para.h> +#include <asm/tsc.h> unsigned int num_processors; @@ -459,7 +460,7 @@ static void lapic_timer_broadcast(const struct cpumask *mask) } /* - * Setup the local APIC timer for this CPU. Copy the initilized values + * Setup the local APIC timer for this CPU. Copy the initialized values * of the boot CPU and register the clock event in the framework. */ static void __cpuinit setup_APIC_timer(void) @@ -920,7 +921,7 @@ void disable_local_APIC(void) unsigned int value; /* APIC hasn't been mapped yet */ - if (!apic_phys) + if (!x2apic_mode && !apic_phys) return; clear_local_APIC(); @@ -1151,8 +1152,13 @@ static void __cpuinit lapic_setup_esr(void) */ void __cpuinit setup_local_APIC(void) { - unsigned int value; - int i, j; + unsigned int value, queued; + int i, j, acked = 0; + unsigned long long tsc = 0, ntsc; + long long max_loops = cpu_khz; + + if (cpu_has_tsc) + rdtscll(tsc); if (disable_apic) { arch_disable_smp_support(); @@ -1204,13 +1210,32 @@ void __cpuinit setup_local_APIC(void) * the interrupt. Hence a vector might get locked. It was noticed * for timer irq (vector 0x31). Issue an extra EOI to clear ISR. */ - for (i = APIC_ISR_NR - 1; i >= 0; i--) { - value = apic_read(APIC_ISR + i*0x10); - for (j = 31; j >= 0; j--) { - if (value & (1<<j)) - ack_APIC_irq(); + do { + queued = 0; + for (i = APIC_ISR_NR - 1; i >= 0; i--) + queued |= apic_read(APIC_IRR + i*0x10); + + for (i = APIC_ISR_NR - 1; i >= 0; i--) { + value = apic_read(APIC_ISR + i*0x10); + for (j = 31; j >= 0; j--) { + if (value & (1<<j)) { + ack_APIC_irq(); + acked++; + } + } } - } + if (acked > 256) { + printk(KERN_ERR "LAPIC pending interrupts after %d EOI\n", + acked); + break; + } + if (cpu_has_tsc) { + rdtscll(ntsc); + max_loops = (cpu_khz << 10) - (ntsc - tsc); + } else + max_loops--; + } while (queued && max_loops > 0); + WARN_ON(max_loops <= 0); /* * Now that we are all set up, enable the APIC diff --git a/arch/x86/kernel/apic/es7000_32.c b/arch/x86/kernel/apic/es7000_32.c index 03ba1b895f5e..8593582d8022 100644 --- a/arch/x86/kernel/apic/es7000_32.c +++ b/arch/x86/kernel/apic/es7000_32.c @@ -129,25 +129,6 @@ int es7000_plat; * GSI override for ES7000 platforms. */ -static unsigned int base; - -static int -es7000_rename_gsi(int ioapic, int gsi) -{ - if (es7000_plat == ES7000_ZORRO) - return gsi; - - if (!base) { - int i; - for (i = 0; i < nr_ioapics; i++) - base += nr_ioapic_registers[i]; - } - - if (!ioapic && (gsi < 16)) - gsi += base; - - return gsi; -} static int __cpuinit wakeup_secondary_cpu_via_mip(int cpu, unsigned long eip) { @@ -190,7 +171,6 @@ static void setup_unisys(void) es7000_plat = ES7000_ZORRO; else es7000_plat = ES7000_CLASSIC; - ioapic_renumber_irq = es7000_rename_gsi; } /* diff --git a/arch/x86/kernel/apic/hw_nmi.c b/arch/x86/kernel/apic/hw_nmi.c new file mode 100644 index 000000000000..cefd6942f0e9 --- /dev/null +++ b/arch/x86/kernel/apic/hw_nmi.c @@ -0,0 +1,107 @@ +/* + * HW NMI watchdog support + * + * started by Don Zickus, Copyright (C) 2010 Red Hat, Inc. + * + * Arch specific calls to support NMI watchdog + * + * Bits copied from original nmi.c file + * + */ +#include <asm/apic.h> + +#include <linux/cpumask.h> +#include <linux/kdebug.h> +#include <linux/notifier.h> +#include <linux/kprobes.h> +#include <linux/nmi.h> +#include <linux/module.h> + +/* For reliability, we're prepared to waste bits here. */ +static DECLARE_BITMAP(backtrace_mask, NR_CPUS) __read_mostly; + +u64 hw_nmi_get_sample_period(void) +{ + return (u64)(cpu_khz) * 1000 * 60; +} + +#ifdef ARCH_HAS_NMI_WATCHDOG +void arch_trigger_all_cpu_backtrace(void) +{ + int i; + + cpumask_copy(to_cpumask(backtrace_mask), cpu_online_mask); + + printk(KERN_INFO "sending NMI to all CPUs:\n"); + apic->send_IPI_all(NMI_VECTOR); + + /* Wait for up to 10 seconds for all CPUs to do the backtrace */ + for (i = 0; i < 10 * 1000; i++) { + if (cpumask_empty(to_cpumask(backtrace_mask))) + break; + mdelay(1); + } +} + +static int __kprobes +arch_trigger_all_cpu_backtrace_handler(struct notifier_block *self, + unsigned long cmd, void *__args) +{ + struct die_args *args = __args; + struct pt_regs *regs; + int cpu = smp_processor_id(); + + switch (cmd) { + case DIE_NMI: + case DIE_NMI_IPI: + break; + + default: + return NOTIFY_DONE; + } + + regs = args->regs; + + if (cpumask_test_cpu(cpu, to_cpumask(backtrace_mask))) { + static arch_spinlock_t lock = __ARCH_SPIN_LOCK_UNLOCKED; + + arch_spin_lock(&lock); + printk(KERN_WARNING "NMI backtrace for cpu %d\n", cpu); + show_regs(regs); + dump_stack(); + arch_spin_unlock(&lock); + cpumask_clear_cpu(cpu, to_cpumask(backtrace_mask)); + return NOTIFY_STOP; + } + + return NOTIFY_DONE; +} + +static __read_mostly struct notifier_block backtrace_notifier = { + .notifier_call = arch_trigger_all_cpu_backtrace_handler, + .next = NULL, + .priority = 1 +}; + +static int __init register_trigger_all_cpu_backtrace(void) +{ + register_die_notifier(&backtrace_notifier); + return 0; +} +early_initcall(register_trigger_all_cpu_backtrace); +#endif + +/* STUB calls to mimic old nmi_watchdog behaviour */ +#if defined(CONFIG_X86_LOCAL_APIC) +unsigned int nmi_watchdog = NMI_NONE; +EXPORT_SYMBOL(nmi_watchdog); +void acpi_nmi_enable(void) { return; } +void acpi_nmi_disable(void) { return; } +#endif +atomic_t nmi_active = ATOMIC_INIT(0); /* oprofile uses this */ +EXPORT_SYMBOL(nmi_active); +int unknown_nmi_panic; +void cpu_nmi_set_wd_enabled(void) { return; } +void stop_apic_nmi_watchdog(void *unused) { return; } +void setup_apic_nmi_watchdog(void *unused) { return; } +int __init check_nmi_watchdog(void) { return 0; } diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index eb2789c3f721..4dc0084ec1b1 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -89,6 +89,9 @@ int nr_ioapics; /* IO APIC gsi routing info */ struct mp_ioapic_gsi mp_gsi_routing[MAX_IO_APICS]; +/* The one past the highest gsi number used */ +u32 gsi_top; + /* MP IRQ source entries */ struct mpc_intsrc mp_irqs[MAX_IRQ_SOURCES]; @@ -1013,10 +1016,9 @@ static inline int irq_trigger(int idx) return MPBIOS_trigger(idx); } -int (*ioapic_renumber_irq)(int ioapic, int irq); static int pin_2_irq(int idx, int apic, int pin) { - int irq, i; + int irq; int bus = mp_irqs[idx].srcbus; /* @@ -1028,18 +1030,12 @@ static int pin_2_irq(int idx, int apic, int pin) if (test_bit(bus, mp_bus_not_pci)) { irq = mp_irqs[idx].srcbusirq; } else { - /* - * PCI IRQs are mapped in order - */ - i = irq = 0; - while (i < apic) - irq += nr_ioapic_registers[i++]; - irq += pin; - /* - * For MPS mode, so far only needed by ES7000 platform - */ - if (ioapic_renumber_irq) - irq = ioapic_renumber_irq(apic, irq); + u32 gsi = mp_gsi_routing[apic].gsi_base + pin; + + if (gsi >= NR_IRQS_LEGACY) + irq = gsi; + else + irq = gsi_top + gsi; } #ifdef CONFIG_X86_32 @@ -1950,20 +1946,8 @@ static struct { int pin, apic; } ioapic_i8259 = { -1, -1 }; void __init enable_IO_APIC(void) { - union IO_APIC_reg_01 reg_01; int i8259_apic, i8259_pin; int apic; - unsigned long flags; - - /* - * The number of IO-APIC IRQ registers (== #pins): - */ - for (apic = 0; apic < nr_ioapics; apic++) { - raw_spin_lock_irqsave(&ioapic_lock, flags); - reg_01.raw = io_apic_read(apic, 1); - raw_spin_unlock_irqrestore(&ioapic_lock, flags); - nr_ioapic_registers[apic] = reg_01.bits.entries+1; - } if (!legacy_pic->nr_legacy_irqs) return; @@ -3413,7 +3397,7 @@ static int set_msi_irq_affinity(unsigned int irq, const struct cpumask *mask) cfg = desc->chip_data; - read_msi_msg_desc(desc, &msg); + get_cached_msi_msg_desc(desc, &msg); msg.data &= ~MSI_DATA_VECTOR_MASK; msg.data |= MSI_DATA_VECTOR(cfg->vector); @@ -3858,27 +3842,20 @@ int __init io_apic_get_redir_entries (int ioapic) reg_01.raw = io_apic_read(ioapic, 1); raw_spin_unlock_irqrestore(&ioapic_lock, flags); - return reg_01.bits.entries; + /* The register returns the maximum index redir index + * supported, which is one less than the total number of redir + * entries. + */ + return reg_01.bits.entries + 1; } void __init probe_nr_irqs_gsi(void) { - int nr = 0; + int nr; - nr = acpi_probe_gsi(); - if (nr > nr_irqs_gsi) { + nr = gsi_top + NR_IRQS_LEGACY; + if (nr > nr_irqs_gsi) nr_irqs_gsi = nr; - } else { - /* for acpi=off or acpi is not compiled in */ - int idx; - - nr = 0; - for (idx = 0; idx < nr_ioapics; idx++) - nr += io_apic_get_redir_entries(idx) + 1; - - if (nr > nr_irqs_gsi) - nr_irqs_gsi = nr; - } printk(KERN_DEBUG "nr_irqs_gsi: %d\n", nr_irqs_gsi); } @@ -4085,22 +4062,27 @@ int __init io_apic_get_version(int ioapic) return reg_01.bits.version; } -int acpi_get_override_irq(int bus_irq, int *trigger, int *polarity) +int acpi_get_override_irq(u32 gsi, int *trigger, int *polarity) { - int i; + int ioapic, pin, idx; if (skip_ioapic_setup) return -1; - for (i = 0; i < mp_irq_entries; i++) - if (mp_irqs[i].irqtype == mp_INT && - mp_irqs[i].srcbusirq == bus_irq) - break; - if (i >= mp_irq_entries) + ioapic = mp_find_ioapic(gsi); + if (ioapic < 0) return -1; - *trigger = irq_trigger(i); - *polarity = irq_polarity(i); + pin = mp_find_ioapic_pin(ioapic, gsi); + if (pin < 0) + return -1; + + idx = find_irq_entry(ioapic, pin, mp_INT); + if (idx < 0) + return -1; + + *trigger = irq_trigger(idx); + *polarity = irq_polarity(idx); return 0; } @@ -4241,7 +4223,7 @@ void __init ioapic_insert_resources(void) } } -int mp_find_ioapic(int gsi) +int mp_find_ioapic(u32 gsi) { int i = 0; @@ -4256,7 +4238,7 @@ int mp_find_ioapic(int gsi) return -1; } -int mp_find_ioapic_pin(int ioapic, int gsi) +int mp_find_ioapic_pin(int ioapic, u32 gsi) { if (WARN_ON(ioapic == -1)) return -1; @@ -4284,6 +4266,7 @@ static int bad_ioapic(unsigned long address) void __init mp_register_ioapic(int id, u32 address, u32 gsi_base) { int idx = 0; + int entries; if (bad_ioapic(address)) return; @@ -4302,9 +4285,17 @@ void __init mp_register_ioapic(int id, u32 address, u32 gsi_base) * Build basic GSI lookup table to facilitate gsi->io_apic lookups * and to prevent reprogramming of IOAPIC pins (PCI GSIs). */ + entries = io_apic_get_redir_entries(idx); mp_gsi_routing[idx].gsi_base = gsi_base; - mp_gsi_routing[idx].gsi_end = gsi_base + - io_apic_get_redir_entries(idx); + mp_gsi_routing[idx].gsi_end = gsi_base + entries - 1; + + /* + * The number of IO-APIC IRQ registers (== #pins): + */ + nr_ioapic_registers[idx] = entries; + + if (mp_gsi_routing[idx].gsi_end >= gsi_top) + gsi_top = mp_gsi_routing[idx].gsi_end + 1; printk(KERN_INFO "IOAPIC[%d]: apic_id %d, version %d, address 0x%x, " "GSI %d-%d\n", idx, mp_ioapics[idx].apicid, diff --git a/arch/x86/kernel/apic/nmi.c b/arch/x86/kernel/apic/nmi.c index 1edaf15c0b8e..a43f71cb30f8 100644 --- a/arch/x86/kernel/apic/nmi.c +++ b/arch/x86/kernel/apic/nmi.c @@ -401,13 +401,6 @@ nmi_watchdog_tick(struct pt_regs *regs, unsigned reason) int cpu = smp_processor_id(); int rc = 0; - /* check for other users first */ - if (notify_die(DIE_NMI, "nmi", regs, reason, 2, SIGINT) - == NOTIFY_STOP) { - rc = 1; - touched = 1; - } - sum = get_timer_irqs(cpu); if (__get_cpu_var(nmi_touch)) { diff --git a/arch/x86/kernel/apic/x2apic_uv_x.c b/arch/x86/kernel/apic/x2apic_uv_x.c index c085d52dbaf2..e46f98f36e31 100644 --- a/arch/x86/kernel/apic/x2apic_uv_x.c +++ b/arch/x86/kernel/apic/x2apic_uv_x.c @@ -735,9 +735,6 @@ void __init uv_system_init(void) uv_node_to_blade[nid] = blade; uv_cpu_to_blade[cpu] = blade; max_pnode = max(pnode, max_pnode); - - printk(KERN_DEBUG "UV: cpu %d, apicid 0x%x, pnode %d, nid %d, lcpu %d, blade %d\n", - cpu, apicid, pnode, nid, lcpu, blade); } /* Add blade/pnode info for nodes without cpus */ diff --git a/arch/x86/kernel/apm_32.c b/arch/x86/kernel/apm_32.c index 031aa887b0eb..4c9c67bf09b7 100644 --- a/arch/x86/kernel/apm_32.c +++ b/arch/x86/kernel/apm_32.c @@ -140,7 +140,7 @@ * is now the way life works). * Fix thinko in suspend() (wrong return). * Notify drivers on critical suspend. - * Make kapmd absorb more idle time (Pavel Machek <pavel@suse.cz> + * Make kapmd absorb more idle time (Pavel Machek <pavel@ucw.cz> * modified by sfr). * Disable interrupts while we are suspended (Andy Henroid * <andy_henroid@yahoo.com> fixed by sfr). @@ -1224,7 +1224,7 @@ static void reinit_timer(void) #ifdef INIT_TIMER_AFTER_SUSPEND unsigned long flags; - spin_lock_irqsave(&i8253_lock, flags); + raw_spin_lock_irqsave(&i8253_lock, flags); /* set the clock to HZ */ outb_pit(0x34, PIT_MODE); /* binary, mode 2, LSB/MSB, ch 0 */ udelay(10); @@ -1232,7 +1232,7 @@ static void reinit_timer(void) udelay(10); outb_pit(LATCH >> 8, PIT_CH0); /* MSB */ udelay(10); - spin_unlock_irqrestore(&i8253_lock, flags); + raw_spin_unlock_irqrestore(&i8253_lock, flags); #endif } diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile index c202b62f3671..3f0ebe429a01 100644 --- a/arch/x86/kernel/cpu/Makefile +++ b/arch/x86/kernel/cpu/Makefile @@ -12,11 +12,11 @@ endif nostackp := $(call cc-option, -fno-stack-protector) CFLAGS_common.o := $(nostackp) -obj-y := intel_cacheinfo.o addon_cpuid_features.o +obj-y := intel_cacheinfo.o scattered.o topology.o obj-y += proc.o capflags.o powerflags.o common.o -obj-y += vmware.o hypervisor.o sched.o +obj-y += vmware.o hypervisor.o sched.o mshyperv.o -obj-$(CONFIG_X86_32) += bugs.o cmpxchg.o +obj-$(CONFIG_X86_32) += bugs.o obj-$(CONFIG_X86_64) += bugs_64.o obj-$(CONFIG_CPU_SUP_INTEL) += intel.o diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c index e485825130d2..60a57b13082d 100644 --- a/arch/x86/kernel/cpu/amd.c +++ b/arch/x86/kernel/cpu/amd.c @@ -466,7 +466,7 @@ static void __cpuinit init_amd(struct cpuinfo_x86 *c) } } - if (c->x86 == 0x10 || c->x86 == 0x11) + if (c->x86 >= 0x10) set_cpu_cap(c, X86_FEATURE_REP_GOOD); /* get apicid instead of initial apic id from cpuid */ @@ -529,7 +529,7 @@ static void __cpuinit init_amd(struct cpuinfo_x86 *c) num_cache_leaves = 3; } - if (c->x86 >= 0xf && c->x86 <= 0x11) + if (c->x86 >= 0xf) set_cpu_cap(c, X86_FEATURE_K8); if (cpu_has_xmm2) { @@ -546,7 +546,7 @@ static void __cpuinit init_amd(struct cpuinfo_x86 *c) fam10h_check_enable_mmcfg(); } - if (c == &boot_cpu_data && c->x86 >= 0xf && c->x86 <= 0x11) { + if (c == &boot_cpu_data && c->x86 >= 0xf) { unsigned long long tseg; /* @@ -609,3 +609,74 @@ static const struct cpu_dev __cpuinitconst amd_cpu_dev = { }; cpu_dev_register(amd_cpu_dev); + +/* + * AMD errata checking + * + * Errata are defined as arrays of ints using the AMD_LEGACY_ERRATUM() or + * AMD_OSVW_ERRATUM() macros. The latter is intended for newer errata that + * have an OSVW id assigned, which it takes as first argument. Both take a + * variable number of family-specific model-stepping ranges created by + * AMD_MODEL_RANGE(). Each erratum also has to be declared as extern const + * int[] in arch/x86/include/asm/processor.h. + * + * Example: + * + * const int amd_erratum_319[] = + * AMD_LEGACY_ERRATUM(AMD_MODEL_RANGE(0x10, 0x2, 0x1, 0x4, 0x2), + * AMD_MODEL_RANGE(0x10, 0x8, 0x0, 0x8, 0x0), + * AMD_MODEL_RANGE(0x10, 0x9, 0x0, 0x9, 0x0)); + */ + +const int amd_erratum_400[] = + AMD_OSVW_ERRATUM(1, AMD_MODEL_RANGE(0xf, 0x41, 0x2, 0xff, 0xf), + AMD_MODEL_RANGE(0x10, 0x2, 0x1, 0xff, 0xf)); +EXPORT_SYMBOL_GPL(amd_erratum_400); + +const int amd_erratum_383[] = + AMD_OSVW_ERRATUM(3, AMD_MODEL_RANGE(0x10, 0, 0, 0xff, 0xf)); +EXPORT_SYMBOL_GPL(amd_erratum_383); + +bool cpu_has_amd_erratum(const int *erratum) +{ + struct cpuinfo_x86 *cpu = ¤t_cpu_data; + int osvw_id = *erratum++; + u32 range; + u32 ms; + + /* + * If called early enough that current_cpu_data hasn't been initialized + * yet, fall back to boot_cpu_data. + */ + if (cpu->x86 == 0) + cpu = &boot_cpu_data; + + if (cpu->x86_vendor != X86_VENDOR_AMD) + return false; + + if (osvw_id >= 0 && osvw_id < 65536 && + cpu_has(cpu, X86_FEATURE_OSVW)) { + u64 osvw_len; + + rdmsrl(MSR_AMD64_OSVW_ID_LENGTH, osvw_len); + if (osvw_id < osvw_len) { + u64 osvw_bits; + + rdmsrl(MSR_AMD64_OSVW_STATUS + (osvw_id >> 6), + osvw_bits); + return osvw_bits & (1ULL << (osvw_id & 0x3f)); + } + } + + /* OSVW unavailable or ID unknown, match family-model-stepping range */ + ms = (cpu->x86_model << 8) | cpu->x86_mask; + while ((range = *erratum++)) + if ((cpu->x86 == AMD_MODEL_RANGE_FAMILY(range)) && + (ms >= AMD_MODEL_RANGE_START(range)) && + (ms <= AMD_MODEL_RANGE_END(range))) + return true; + + return false; +} + +EXPORT_SYMBOL_GPL(cpu_has_amd_erratum); diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c index 01a265212395..c39576cb3018 100644 --- a/arch/x86/kernel/cpu/bugs.c +++ b/arch/x86/kernel/cpu/bugs.c @@ -86,7 +86,7 @@ static void __init check_fpu(void) static void __init check_hlt(void) { - if (paravirt_enabled()) + if (boot_cpu_data.x86 >= 5 || paravirt_enabled()) return; printk(KERN_INFO "Checking 'hlt' instruction... "); diff --git a/arch/x86/kernel/cpu/cmpxchg.c b/arch/x86/kernel/cpu/cmpxchg.c deleted file mode 100644 index 2056ccf572cc..000000000000 --- a/arch/x86/kernel/cpu/cmpxchg.c +++ /dev/null @@ -1,72 +0,0 @@ -/* - * cmpxchg*() fallbacks for CPU not supporting these instructions - */ - -#include <linux/kernel.h> -#include <linux/smp.h> -#include <linux/module.h> - -#ifndef CONFIG_X86_CMPXCHG -unsigned long cmpxchg_386_u8(volatile void *ptr, u8 old, u8 new) -{ - u8 prev; - unsigned long flags; - - /* Poor man's cmpxchg for 386. Unsuitable for SMP */ - local_irq_save(flags); - prev = *(u8 *)ptr; - if (prev == old) - *(u8 *)ptr = new; - local_irq_restore(flags); - return prev; -} -EXPORT_SYMBOL(cmpxchg_386_u8); - -unsigned long cmpxchg_386_u16(volatile void *ptr, u16 old, u16 new) -{ - u16 prev; - unsigned long flags; - - /* Poor man's cmpxchg for 386. Unsuitable for SMP */ - local_irq_save(flags); - prev = *(u16 *)ptr; - if (prev == old) - *(u16 *)ptr = new; - local_irq_restore(flags); - return prev; -} -EXPORT_SYMBOL(cmpxchg_386_u16); - -unsigned long cmpxchg_386_u32(volatile void *ptr, u32 old, u32 new) -{ - u32 prev; - unsigned long flags; - - /* Poor man's cmpxchg for 386. Unsuitable for SMP */ - local_irq_save(flags); - prev = *(u32 *)ptr; - if (prev == old) - *(u32 *)ptr = new; - local_irq_restore(flags); - return prev; -} -EXPORT_SYMBOL(cmpxchg_386_u32); -#endif - -#ifndef CONFIG_X86_CMPXCHG64 -unsigned long long cmpxchg_486_u64(volatile void *ptr, u64 old, u64 new) -{ - u64 prev; - unsigned long flags; - - /* Poor man's cmpxchg8b for 386 and 486. Unsuitable for SMP */ - local_irq_save(flags); - prev = *(u64 *)ptr; - if (prev == old) - *(u64 *)ptr = new; - local_irq_restore(flags); - return prev; -} -EXPORT_SYMBOL(cmpxchg_486_u64); -#endif - diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 4868e4a951ee..490dac63c2d2 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -140,10 +140,18 @@ EXPORT_PER_CPU_SYMBOL_GPL(gdt_page); static int __init x86_xsave_setup(char *s) { setup_clear_cpu_cap(X86_FEATURE_XSAVE); + setup_clear_cpu_cap(X86_FEATURE_XSAVEOPT); return 1; } __setup("noxsave", x86_xsave_setup); +static int __init x86_xsaveopt_setup(char *s) +{ + setup_clear_cpu_cap(X86_FEATURE_XSAVEOPT); + return 1; +} +__setup("noxsaveopt", x86_xsaveopt_setup); + #ifdef CONFIG_X86_32 static int cachesize_override __cpuinitdata = -1; static int disable_x86_serial_nr __cpuinitdata = 1; @@ -551,6 +559,16 @@ static void __cpuinit get_cpu_cap(struct cpuinfo_x86 *c) c->x86_capability[4] = excap; } + /* Additional Intel-defined flags: level 0x00000007 */ + if (c->cpuid_level >= 0x00000007) { + u32 eax, ebx, ecx, edx; + + cpuid_count(0x00000007, 0, &eax, &ebx, &ecx, &edx); + + if (eax > 0) + c->x86_capability[9] = ebx; + } + /* AMD-defined flags: level 0x80000001 */ xlvl = cpuid_eax(0x80000000); c->extended_cpuid_level = xlvl; @@ -576,6 +594,7 @@ static void __cpuinit get_cpu_cap(struct cpuinfo_x86 *c) if (c->extended_cpuid_level >= 0x80000007) c->x86_power = cpuid_edx(0x80000007); + init_scattered_cpuid_features(c); } static void __cpuinit identify_cpu_without_cpuid(struct cpuinfo_x86 *c) @@ -731,7 +750,6 @@ static void __cpuinit generic_identify(struct cpuinfo_x86 *c) get_model_name(c); /* Default name */ - init_scattered_cpuid_features(c); detect_nopl(c); } @@ -1084,6 +1102,20 @@ static void clear_all_debug_regs(void) } } +#ifdef CONFIG_KGDB +/* + * Restore debug regs if using kgdbwait and you have a kernel debugger + * connection established. + */ +static void dbg_restore_debug_regs(void) +{ + if (unlikely(kgdb_connected && arch_kgdb_ops.correct_hw_break)) + arch_kgdb_ops.correct_hw_break(); +} +#else /* ! CONFIG_KGDB */ +#define dbg_restore_debug_regs() +#endif /* ! CONFIG_KGDB */ + /* * cpu_init() initializes state that is per-CPU. Some data is already * initialized (naturally) in the bootstrap process, such as the GDT @@ -1107,9 +1139,9 @@ void __cpuinit cpu_init(void) oist = &per_cpu(orig_ist, cpu); #ifdef CONFIG_NUMA - if (cpu != 0 && percpu_read(node_number) == 0 && - cpu_to_node(cpu) != NUMA_NO_NODE) - percpu_write(node_number, cpu_to_node(cpu)); + if (cpu != 0 && percpu_read(numa_node) == 0 && + early_cpu_to_node(cpu) != NUMA_NO_NODE) + set_numa_node(early_cpu_to_node(cpu)); #endif me = current; @@ -1174,20 +1206,11 @@ void __cpuinit cpu_init(void) load_TR_desc(); load_LDT(&init_mm.context); -#ifdef CONFIG_KGDB - /* - * If the kgdb is connected no debug regs should be altered. This - * is only applicable when KGDB and a KGDB I/O module are built - * into the kernel and you are using early debugging with - * kgdbwait. KGDB will control the kernel HW breakpoint registers. - */ - if (kgdb_connected && arch_kgdb_ops.correct_hw_break) - arch_kgdb_ops.correct_hw_break(); - else -#endif - clear_all_debug_regs(); + clear_all_debug_regs(); + dbg_restore_debug_regs(); fpu_init(); + xsave_init(); raw_local_save_flags(kernel_eflags); @@ -1239,23 +1262,16 @@ void __cpuinit cpu_init(void) #endif clear_all_debug_regs(); + dbg_restore_debug_regs(); /* * Force FPU initialization: */ - if (cpu_has_xsave) - current_thread_info()->status = TS_XSAVE; - else - current_thread_info()->status = 0; + current_thread_info()->status = 0; clear_used_math(); mxcsr_feature_mask_init(); - /* - * Boot processor to setup the FP and extended state context info. - */ - if (smp_processor_id() == boot_cpu_id) - init_thread_xstate(); - + fpu_init(); xsave_init(); } #endif diff --git a/arch/x86/kernel/cpu/cpufreq/Makefile b/arch/x86/kernel/cpu/cpufreq/Makefile index 1840c0a5170b..bd54bf67e6fb 100644 --- a/arch/x86/kernel/cpu/cpufreq/Makefile +++ b/arch/x86/kernel/cpu/cpufreq/Makefile @@ -2,8 +2,8 @@ # K8 systems. ACPI is preferred to all other hardware-specific drivers. # speedstep-* is preferred over p4-clockmod. -obj-$(CONFIG_X86_POWERNOW_K8) += powernow-k8.o -obj-$(CONFIG_X86_ACPI_CPUFREQ) += acpi-cpufreq.o +obj-$(CONFIG_X86_POWERNOW_K8) += powernow-k8.o mperf.o +obj-$(CONFIG_X86_ACPI_CPUFREQ) += acpi-cpufreq.o mperf.o obj-$(CONFIG_X86_PCC_CPUFREQ) += pcc-cpufreq.o obj-$(CONFIG_X86_POWERNOW_K6) += powernow-k6.o obj-$(CONFIG_X86_POWERNOW_K7) += powernow-k7.o diff --git a/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c b/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c index 459168083b77..246cd3afbb5f 100644 --- a/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c +++ b/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c @@ -34,7 +34,6 @@ #include <linux/compiler.h> #include <linux/dmi.h> #include <linux/slab.h> -#include <trace/events/power.h> #include <linux/acpi.h> #include <linux/io.h> @@ -46,6 +45,7 @@ #include <asm/msr.h> #include <asm/processor.h> #include <asm/cpufeature.h> +#include "mperf.h" #define dprintk(msg...) cpufreq_debug_printk(CPUFREQ_DEBUG_DRIVER, \ "acpi-cpufreq", msg) @@ -71,8 +71,6 @@ struct acpi_cpufreq_data { static DEFINE_PER_CPU(struct acpi_cpufreq_data *, acfreq_data); -static DEFINE_PER_CPU(struct aperfmperf, acfreq_old_perf); - /* acpi_perf_data is a pointer to percpu data. */ static struct acpi_processor_performance *acpi_perf_data; @@ -240,45 +238,6 @@ static u32 get_cur_val(const struct cpumask *mask) return cmd.val; } -/* Called via smp_call_function_single(), on the target CPU */ -static void read_measured_perf_ctrs(void *_cur) -{ - struct aperfmperf *am = _cur; - - get_aperfmperf(am); -} - -/* - * Return the measured active (C0) frequency on this CPU since last call - * to this function. - * Input: cpu number - * Return: Average CPU frequency in terms of max frequency (zero on error) - * - * We use IA32_MPERF and IA32_APERF MSRs to get the measured performance - * over a period of time, while CPU is in C0 state. - * IA32_MPERF counts at the rate of max advertised frequency - * IA32_APERF counts at the rate of actual CPU frequency - * Only IA32_APERF/IA32_MPERF ratio is architecturally defined and - * no meaning should be associated with absolute values of these MSRs. - */ -static unsigned int get_measured_perf(struct cpufreq_policy *policy, - unsigned int cpu) -{ - struct aperfmperf perf; - unsigned long ratio; - unsigned int retval; - - if (smp_call_function_single(cpu, read_measured_perf_ctrs, &perf, 1)) - return 0; - - ratio = calc_aperfmperf_ratio(&per_cpu(acfreq_old_perf, cpu), &perf); - per_cpu(acfreq_old_perf, cpu) = perf; - - retval = (policy->cpuinfo.max_freq * ratio) >> APERFMPERF_SHIFT; - - return retval; -} - static unsigned int get_cur_freq_on_cpu(unsigned int cpu) { struct acpi_cpufreq_data *data = per_cpu(acfreq_data, cpu); @@ -364,8 +323,6 @@ static int acpi_cpufreq_target(struct cpufreq_policy *policy, } } - trace_power_frequency(POWER_PSTATE, data->freq_table[next_state].frequency); - switch (data->cpu_feature) { case SYSTEM_INTEL_MSR_CAPABLE: cmd.type = SYSTEM_INTEL_MSR_CAPABLE; @@ -391,7 +348,7 @@ static int acpi_cpufreq_target(struct cpufreq_policy *policy, freqs.old = perf->states[perf->state].core_frequency * 1000; freqs.new = data->freq_table[next_state].frequency; - for_each_cpu(i, cmd.mask) { + for_each_cpu(i, policy->cpus) { freqs.cpu = i; cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE); } @@ -407,7 +364,7 @@ static int acpi_cpufreq_target(struct cpufreq_policy *policy, } } - for_each_cpu(i, cmd.mask) { + for_each_cpu(i, policy->cpus) { freqs.cpu = i; cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE); } @@ -702,7 +659,7 @@ static int acpi_cpufreq_cpu_init(struct cpufreq_policy *policy) /* Check for APERF/MPERF support in hardware */ if (cpu_has(c, X86_FEATURE_APERFMPERF)) - acpi_cpufreq_driver.getavg = get_measured_perf; + acpi_cpufreq_driver.getavg = cpufreq_get_measured_perf; dprintk("CPU%u - ACPI performance management activated.\n", cpu); for (i = 0; i < perf->state_count; i++) diff --git a/arch/x86/kernel/cpu/cpufreq/gx-suspmod.c b/arch/x86/kernel/cpu/cpufreq/gx-suspmod.c index 16e3483be9e3..32974cf84232 100644 --- a/arch/x86/kernel/cpu/cpufreq/gx-suspmod.c +++ b/arch/x86/kernel/cpu/cpufreq/gx-suspmod.c @@ -169,12 +169,9 @@ static int gx_freq_mult[16] = { * Low Level chipset interface * ****************************************************************/ static struct pci_device_id gx_chipset_tbl[] __initdata = { - { PCI_VENDOR_ID_CYRIX, PCI_DEVICE_ID_CYRIX_5530_LEGACY, - PCI_ANY_ID, PCI_ANY_ID }, - { PCI_VENDOR_ID_CYRIX, PCI_DEVICE_ID_CYRIX_5520, - PCI_ANY_ID, PCI_ANY_ID }, - { PCI_VENDOR_ID_CYRIX, PCI_DEVICE_ID_CYRIX_5510, - PCI_ANY_ID, PCI_ANY_ID }, + { PCI_VDEVICE(CYRIX, PCI_DEVICE_ID_CYRIX_5530_LEGACY), }, + { PCI_VDEVICE(CYRIX, PCI_DEVICE_ID_CYRIX_5520), }, + { PCI_VDEVICE(CYRIX, PCI_DEVICE_ID_CYRIX_5510), }, { 0, }, }; @@ -199,7 +196,7 @@ static __init struct pci_dev *gx_detect_chipset(void) } /* detect which companion chip is used */ - while ((gx_pci = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, gx_pci)) != NULL) { + for_each_pci_dev(gx_pci) { if ((pci_match_id(gx_chipset_tbl, gx_pci)) != NULL) return gx_pci; } diff --git a/arch/x86/kernel/cpu/cpufreq/longhaul.c b/arch/x86/kernel/cpu/cpufreq/longhaul.c index 7e7eea4f8261..03162dac6271 100644 --- a/arch/x86/kernel/cpu/cpufreq/longhaul.c +++ b/arch/x86/kernel/cpu/cpufreq/longhaul.c @@ -426,7 +426,7 @@ static int guess_fsb(int mult) } -static int __init longhaul_get_ranges(void) +static int __cpuinit longhaul_get_ranges(void) { unsigned int i, j, k = 0; unsigned int ratio; @@ -530,7 +530,7 @@ static int __init longhaul_get_ranges(void) } -static void __init longhaul_setup_voltagescaling(void) +static void __cpuinit longhaul_setup_voltagescaling(void) { union msr_longhaul longhaul; struct mV_pos minvid, maxvid, vid; @@ -784,7 +784,7 @@ static int longhaul_setup_southbridge(void) return 0; } -static int __init longhaul_cpu_init(struct cpufreq_policy *policy) +static int __cpuinit longhaul_cpu_init(struct cpufreq_policy *policy) { struct cpuinfo_x86 *c = &cpu_data(0); char *cpuname = NULL; diff --git a/arch/x86/kernel/cpu/cpufreq/longhaul.h b/arch/x86/kernel/cpu/cpufreq/longhaul.h index e2360a469f79..cbf48fbca881 100644 --- a/arch/x86/kernel/cpu/cpufreq/longhaul.h +++ b/arch/x86/kernel/cpu/cpufreq/longhaul.h @@ -56,7 +56,7 @@ union msr_longhaul { /* * VIA C3 Samuel 1 & Samuel 2 (stepping 0) */ -static const int __initdata samuel1_mults[16] = { +static const int __cpuinitdata samuel1_mults[16] = { -1, /* 0000 -> RESERVED */ 30, /* 0001 -> 3.0x */ 40, /* 0010 -> 4.0x */ @@ -75,7 +75,7 @@ static const int __initdata samuel1_mults[16] = { -1, /* 1111 -> RESERVED */ }; -static const int __initdata samuel1_eblcr[16] = { +static const int __cpuinitdata samuel1_eblcr[16] = { 50, /* 0000 -> RESERVED */ 30, /* 0001 -> 3.0x */ 40, /* 0010 -> 4.0x */ @@ -97,7 +97,7 @@ static const int __initdata samuel1_eblcr[16] = { /* * VIA C3 Samuel2 Stepping 1->15 */ -static const int __initdata samuel2_eblcr[16] = { +static const int __cpuinitdata samuel2_eblcr[16] = { 50, /* 0000 -> 5.0x */ 30, /* 0001 -> 3.0x */ 40, /* 0010 -> 4.0x */ @@ -119,7 +119,7 @@ static const int __initdata samuel2_eblcr[16] = { /* * VIA C3 Ezra */ -static const int __initdata ezra_mults[16] = { +static const int __cpuinitdata ezra_mults[16] = { 100, /* 0000 -> 10.0x */ 30, /* 0001 -> 3.0x */ 40, /* 0010 -> 4.0x */ @@ -138,7 +138,7 @@ static const int __initdata ezra_mults[16] = { 120, /* 1111 -> 12.0x */ }; -static const int __initdata ezra_eblcr[16] = { +static const int __cpuinitdata ezra_eblcr[16] = { 50, /* 0000 -> 5.0x */ 30, /* 0001 -> 3.0x */ 40, /* 0010 -> 4.0x */ @@ -160,7 +160,7 @@ static const int __initdata ezra_eblcr[16] = { /* * VIA C3 (Ezra-T) [C5M]. */ -static const int __initdata ezrat_mults[32] = { +static const int __cpuinitdata ezrat_mults[32] = { 100, /* 0000 -> 10.0x */ 30, /* 0001 -> 3.0x */ 40, /* 0010 -> 4.0x */ @@ -196,7 +196,7 @@ static const int __initdata ezrat_mults[32] = { -1, /* 1111 -> RESERVED (12.0x) */ }; -static const int __initdata ezrat_eblcr[32] = { +static const int __cpuinitdata ezrat_eblcr[32] = { 50, /* 0000 -> 5.0x */ 30, /* 0001 -> 3.0x */ 40, /* 0010 -> 4.0x */ @@ -235,7 +235,7 @@ static const int __initdata ezrat_eblcr[32] = { /* * VIA C3 Nehemiah */ -static const int __initdata nehemiah_mults[32] = { +static const int __cpuinitdata nehemiah_mults[32] = { 100, /* 0000 -> 10.0x */ -1, /* 0001 -> 16.0x */ 40, /* 0010 -> 4.0x */ @@ -270,7 +270,7 @@ static const int __initdata nehemiah_mults[32] = { -1, /* 1111 -> 12.0x */ }; -static const int __initdata nehemiah_eblcr[32] = { +static const int __cpuinitdata nehemiah_eblcr[32] = { 50, /* 0000 -> 5.0x */ 160, /* 0001 -> 16.0x */ 40, /* 0010 -> 4.0x */ @@ -315,7 +315,7 @@ struct mV_pos { unsigned short pos; }; -static const struct mV_pos __initdata vrm85_mV[32] = { +static const struct mV_pos __cpuinitdata vrm85_mV[32] = { {1250, 8}, {1200, 6}, {1150, 4}, {1100, 2}, {1050, 0}, {1800, 30}, {1750, 28}, {1700, 26}, {1650, 24}, {1600, 22}, {1550, 20}, {1500, 18}, @@ -326,14 +326,14 @@ static const struct mV_pos __initdata vrm85_mV[32] = { {1475, 17}, {1425, 15}, {1375, 13}, {1325, 11} }; -static const unsigned char __initdata mV_vrm85[32] = { +static const unsigned char __cpuinitdata mV_vrm85[32] = { 0x04, 0x14, 0x03, 0x13, 0x02, 0x12, 0x01, 0x11, 0x00, 0x10, 0x0f, 0x1f, 0x0e, 0x1e, 0x0d, 0x1d, 0x0c, 0x1c, 0x0b, 0x1b, 0x0a, 0x1a, 0x09, 0x19, 0x08, 0x18, 0x07, 0x17, 0x06, 0x16, 0x05, 0x15 }; -static const struct mV_pos __initdata mobilevrm_mV[32] = { +static const struct mV_pos __cpuinitdata mobilevrm_mV[32] = { {1750, 31}, {1700, 30}, {1650, 29}, {1600, 28}, {1550, 27}, {1500, 26}, {1450, 25}, {1400, 24}, {1350, 23}, {1300, 22}, {1250, 21}, {1200, 20}, @@ -344,7 +344,7 @@ static const struct mV_pos __initdata mobilevrm_mV[32] = { {675, 3}, {650, 2}, {625, 1}, {600, 0} }; -static const unsigned char __initdata mV_mobilevrm[32] = { +static const unsigned char __cpuinitdata mV_mobilevrm[32] = { 0x1f, 0x1e, 0x1d, 0x1c, 0x1b, 0x1a, 0x19, 0x18, 0x17, 0x16, 0x15, 0x14, 0x13, 0x12, 0x11, 0x10, 0x0f, 0x0e, 0x0d, 0x0c, 0x0b, 0x0a, 0x09, 0x08, diff --git a/arch/x86/kernel/cpu/cpufreq/longrun.c b/arch/x86/kernel/cpu/cpufreq/longrun.c index e7b559d74c52..fc09f142d94d 100644 --- a/arch/x86/kernel/cpu/cpufreq/longrun.c +++ b/arch/x86/kernel/cpu/cpufreq/longrun.c @@ -165,8 +165,8 @@ static unsigned int longrun_get(unsigned int cpu) * TMTA rules: * performance_pctg = (target_freq - low_freq)/(high_freq - low_freq) */ -static unsigned int __init longrun_determine_freqs(unsigned int *low_freq, - unsigned int *high_freq) +static unsigned int __cpuinit longrun_determine_freqs(unsigned int *low_freq, + unsigned int *high_freq) { u32 msr_lo, msr_hi; u32 save_lo, save_hi; @@ -258,7 +258,7 @@ static unsigned int __init longrun_determine_freqs(unsigned int *low_freq, } -static int __init longrun_cpu_init(struct cpufreq_policy *policy) +static int __cpuinit longrun_cpu_init(struct cpufreq_policy *policy) { int result = 0; diff --git a/arch/x86/kernel/cpu/cpufreq/mperf.c b/arch/x86/kernel/cpu/cpufreq/mperf.c new file mode 100644 index 000000000000..911e193018ae --- /dev/null +++ b/arch/x86/kernel/cpu/cpufreq/mperf.c @@ -0,0 +1,51 @@ +#include <linux/kernel.h> +#include <linux/smp.h> +#include <linux/module.h> +#include <linux/init.h> +#include <linux/cpufreq.h> +#include <linux/slab.h> + +#include "mperf.h" + +static DEFINE_PER_CPU(struct aperfmperf, acfreq_old_perf); + +/* Called via smp_call_function_single(), on the target CPU */ +static void read_measured_perf_ctrs(void *_cur) +{ + struct aperfmperf *am = _cur; + + get_aperfmperf(am); +} + +/* + * Return the measured active (C0) frequency on this CPU since last call + * to this function. + * Input: cpu number + * Return: Average CPU frequency in terms of max frequency (zero on error) + * + * We use IA32_MPERF and IA32_APERF MSRs to get the measured performance + * over a period of time, while CPU is in C0 state. + * IA32_MPERF counts at the rate of max advertised frequency + * IA32_APERF counts at the rate of actual CPU frequency + * Only IA32_APERF/IA32_MPERF ratio is architecturally defined and + * no meaning should be associated with absolute values of these MSRs. + */ +unsigned int cpufreq_get_measured_perf(struct cpufreq_policy *policy, + unsigned int cpu) +{ + struct aperfmperf perf; + unsigned long ratio; + unsigned int retval; + + if (smp_call_function_single(cpu, read_measured_perf_ctrs, &perf, 1)) + return 0; + + ratio = calc_aperfmperf_ratio(&per_cpu(acfreq_old_perf, cpu), &perf); + per_cpu(acfreq_old_perf, cpu) = perf; + + retval = (policy->cpuinfo.max_freq * ratio) >> APERFMPERF_SHIFT; + + return retval; +} +EXPORT_SYMBOL_GPL(cpufreq_get_measured_perf); +MODULE_LICENSE("GPL"); diff --git a/arch/x86/kernel/cpu/cpufreq/mperf.h b/arch/x86/kernel/cpu/cpufreq/mperf.h new file mode 100644 index 000000000000..5dbf2950dc22 --- /dev/null +++ b/arch/x86/kernel/cpu/cpufreq/mperf.h @@ -0,0 +1,9 @@ +/* + * (c) 2010 Advanced Micro Devices, Inc. + * Your use of this code is subject to the terms and conditions of the + * GNU general public license version 2. See "COPYING" or + * http://www.gnu.org/licenses/gpl.html + */ + +unsigned int cpufreq_get_measured_perf(struct cpufreq_policy *policy, + unsigned int cpu); diff --git a/arch/x86/kernel/cpu/cpufreq/p4-clockmod.c b/arch/x86/kernel/cpu/cpufreq/p4-clockmod.c index 7b8a8ba67b07..bd1cac747f67 100644 --- a/arch/x86/kernel/cpu/cpufreq/p4-clockmod.c +++ b/arch/x86/kernel/cpu/cpufreq/p4-clockmod.c @@ -178,13 +178,8 @@ static unsigned int cpufreq_p4_get_frequency(struct cpuinfo_x86 *c) } } - if (c->x86 != 0xF) { - if (!cpu_has(c, X86_FEATURE_EST)) - printk(KERN_WARNING PFX "Unknown CPU. " - "Please send an e-mail to " - "<cpufreq@vger.kernel.org>\n"); + if (c->x86 != 0xF) return 0; - } /* on P-4s, the TSC runs with constant frequency independent whether * throttling is active or not. */ diff --git a/arch/x86/kernel/cpu/cpufreq/pcc-cpufreq.c b/arch/x86/kernel/cpu/cpufreq/pcc-cpufreq.c index ce7cde713e71..a36de5bbb622 100644 --- a/arch/x86/kernel/cpu/cpufreq/pcc-cpufreq.c +++ b/arch/x86/kernel/cpu/cpufreq/pcc-cpufreq.c @@ -368,22 +368,16 @@ static int __init pcc_cpufreq_do_osc(acpi_handle *handle) return -ENODEV; out_obj = output.pointer; - if (out_obj->type != ACPI_TYPE_BUFFER) { - ret = -ENODEV; - goto out_free; - } + if (out_obj->type != ACPI_TYPE_BUFFER) + return -ENODEV; errors = *((u32 *)out_obj->buffer.pointer) & ~(1 << 0); - if (errors) { - ret = -ENODEV; - goto out_free; - } + if (errors) + return -ENODEV; supported = *((u32 *)(out_obj->buffer.pointer + 4)); - if (!(supported & 0x1)) { - ret = -ENODEV; - goto out_free; - } + if (!(supported & 0x1)) + return -ENODEV; out_free: kfree(output.pointer); @@ -397,13 +391,17 @@ static int __init pcc_cpufreq_probe(void) struct pcc_memory_resource *mem_resource; struct pcc_register_resource *reg_resource; union acpi_object *out_obj, *member; - acpi_handle handle, osc_handle; + acpi_handle handle, osc_handle, pcch_handle; int ret = 0; status = acpi_get_handle(NULL, "\\_SB", &handle); if (ACPI_FAILURE(status)) return -ENODEV; + status = acpi_get_handle(handle, "PCCH", &pcch_handle); + if (ACPI_FAILURE(status)) + return -ENODEV; + status = acpi_get_handle(handle, "_OSC", &osc_handle); if (ACPI_SUCCESS(status)) { ret = pcc_cpufreq_do_osc(&osc_handle); @@ -543,13 +541,13 @@ static int pcc_cpufreq_cpu_init(struct cpufreq_policy *policy) if (!pcch_virt_addr) { result = -1; - goto pcch_null; + goto out; } result = pcc_get_offset(cpu); if (result) { dprintk("init: PCCP evaluation failed\n"); - goto free; + goto out; } policy->max = policy->cpuinfo.max_freq = @@ -558,14 +556,15 @@ static int pcc_cpufreq_cpu_init(struct cpufreq_policy *policy) ioread32(&pcch_hdr->minimum_frequency) * 1000; policy->cur = pcc_get_freq(cpu); + if (!policy->cur) { + dprintk("init: Unable to get current CPU frequency\n"); + result = -EINVAL; + goto out; + } + dprintk("init: policy->max is %d, policy->min is %d\n", policy->max, policy->min); - - return 0; -free: - pcc_clear_mapping(); - free_percpu(pcc_cpu_info); -pcch_null: +out: return result; } diff --git a/arch/x86/kernel/cpu/cpufreq/powernow-k7.c b/arch/x86/kernel/cpu/cpufreq/powernow-k7.c index 9a97116f89e5..4a45fd6e41ba 100644 --- a/arch/x86/kernel/cpu/cpufreq/powernow-k7.c +++ b/arch/x86/kernel/cpu/cpufreq/powernow-k7.c @@ -569,7 +569,7 @@ static int powernow_verify(struct cpufreq_policy *policy) * We will then get the same kind of behaviour already tested under * the "well-known" other OS. */ -static int __init fixup_sgtc(void) +static int __cpuinit fixup_sgtc(void) { unsigned int sgtc; unsigned int m; @@ -603,7 +603,7 @@ static unsigned int powernow_get(unsigned int cpu) } -static int __init acer_cpufreq_pst(const struct dmi_system_id *d) +static int __cpuinit acer_cpufreq_pst(const struct dmi_system_id *d) { printk(KERN_WARNING PFX "%s laptop with broken PST tables in BIOS detected.\n", @@ -621,7 +621,7 @@ static int __init acer_cpufreq_pst(const struct dmi_system_id *d) * A BIOS update is all that can save them. * Mention this, and disable cpufreq. */ -static struct dmi_system_id __initdata powernow_dmi_table[] = { +static struct dmi_system_id __cpuinitdata powernow_dmi_table[] = { { .callback = acer_cpufreq_pst, .ident = "Acer Aspire", @@ -633,7 +633,7 @@ static struct dmi_system_id __initdata powernow_dmi_table[] = { { } }; -static int __init powernow_cpu_init(struct cpufreq_policy *policy) +static int __cpuinit powernow_cpu_init(struct cpufreq_policy *policy) { union msr_fidvidstatus fidvidstatus; int result; diff --git a/arch/x86/kernel/cpu/cpufreq/powernow-k8.c b/arch/x86/kernel/cpu/cpufreq/powernow-k8.c index b6215b9798e2..491977baf6c0 100644 --- a/arch/x86/kernel/cpu/cpufreq/powernow-k8.c +++ b/arch/x86/kernel/cpu/cpufreq/powernow-k8.c @@ -1,6 +1,5 @@ - /* - * (c) 2003-2006 Advanced Micro Devices, Inc. + * (c) 2003-2010 Advanced Micro Devices, Inc. * Your use of this code is subject to the terms and conditions of the * GNU general public license version 2. See "COPYING" or * http://www.gnu.org/licenses/gpl.html @@ -10,7 +9,7 @@ * Based on the powernow-k7.c module written by Dave Jones. * (C) 2003 Dave Jones on behalf of SuSE Labs * (C) 2004 Dominik Brodowski <linux@brodo.de> - * (C) 2004 Pavel Machek <pavel@suse.cz> + * (C) 2004 Pavel Machek <pavel@ucw.cz> * Licensed under the terms of the GNU GPL License version 2. * Based upon datasheets & sample CPUs kindly provided by AMD. * @@ -46,6 +45,7 @@ #define PFX "powernow-k8: " #define VERSION "version 2.20.00" #include "powernow-k8.h" +#include "mperf.h" /* serialize freq changes */ static DEFINE_MUTEX(fidvid_mutex); @@ -54,6 +54,12 @@ static DEFINE_PER_CPU(struct powernow_k8_data *, powernow_data); static int cpu_family = CPU_OPTERON; +/* core performance boost */ +static bool cpb_capable, cpb_enabled; +static struct msr __percpu *msrs; + +static struct cpufreq_driver cpufreq_amd64_driver; + #ifndef CONFIG_SMP static inline const struct cpumask *cpu_core_mask(int cpu) { @@ -800,6 +806,8 @@ static int find_psb_table(struct powernow_k8_data *data) * www.amd.com */ printk(KERN_ERR FW_BUG PFX "No PSB or ACPI _PSS objects\n"); + printk(KERN_ERR PFX "Make sure that your BIOS is up to date" + " and Cool'N'Quiet support is enabled in BIOS setup\n"); return -ENODEV; } @@ -904,8 +912,8 @@ static int fill_powernow_table_pstate(struct powernow_k8_data *data, { int i; u32 hi = 0, lo = 0; - rdmsr(MSR_PSTATE_CUR_LIMIT, hi, lo); - data->max_hw_pstate = (hi & HW_PSTATE_MAX_MASK) >> HW_PSTATE_MAX_SHIFT; + rdmsr(MSR_PSTATE_CUR_LIMIT, lo, hi); + data->max_hw_pstate = (lo & HW_PSTATE_MAX_MASK) >> HW_PSTATE_MAX_SHIFT; for (i = 0; i < data->acpi_data.state_count; i++) { u32 index; @@ -1017,13 +1025,12 @@ static int get_transition_latency(struct powernow_k8_data *data) } if (max_latency == 0) { /* - * Fam 11h always returns 0 as transition latency. - * This is intended and means "very fast". While cpufreq core - * and governors currently can handle that gracefully, better - * set it to 1 to avoid problems in the future. - * For all others it's a BIOS bug. + * Fam 11h and later may return 0 as transition latency. This + * is intended and means "very fast". While cpufreq core and + * governors currently can handle that gracefully, better set it + * to 1 to avoid problems in the future. */ - if (boot_cpu_data.x86 != 0x11) + if (boot_cpu_data.x86 < 0x11) printk(KERN_ERR FW_WARN PFX "Invalid zero transition " "latency\n"); max_latency = 1; @@ -1249,6 +1256,7 @@ static int __cpuinit powernowk8_cpu_init(struct cpufreq_policy *pol) struct powernow_k8_data *data; struct init_on_cpu init_on_cpu; int rc; + struct cpuinfo_x86 *c = &cpu_data(pol->cpu); if (!cpu_online(pol->cpu)) return -ENODEV; @@ -1323,6 +1331,10 @@ static int __cpuinit powernowk8_cpu_init(struct cpufreq_policy *pol) return -EINVAL; } + /* Check for APERF/MPERF support in hardware */ + if (cpu_has(c, X86_FEATURE_APERFMPERF)) + cpufreq_amd64_driver.getavg = cpufreq_get_measured_perf; + cpufreq_frequency_table_get_attr(data->powernow_table, pol->cpu); if (cpu_family == CPU_HW_PSTATE) @@ -1394,8 +1406,77 @@ out: return khz; } +static void _cpb_toggle_msrs(bool t) +{ + int cpu; + + get_online_cpus(); + + rdmsr_on_cpus(cpu_online_mask, MSR_K7_HWCR, msrs); + + for_each_cpu(cpu, cpu_online_mask) { + struct msr *reg = per_cpu_ptr(msrs, cpu); + if (t) + reg->l &= ~BIT(25); + else + reg->l |= BIT(25); + } + wrmsr_on_cpus(cpu_online_mask, MSR_K7_HWCR, msrs); + + put_online_cpus(); +} + +/* + * Switch on/off core performance boosting. + * + * 0=disable + * 1=enable. + */ +static void cpb_toggle(bool t) +{ + if (!cpb_capable) + return; + + if (t && !cpb_enabled) { + cpb_enabled = true; + _cpb_toggle_msrs(t); + printk(KERN_INFO PFX "Core Boosting enabled.\n"); + } else if (!t && cpb_enabled) { + cpb_enabled = false; + _cpb_toggle_msrs(t); + printk(KERN_INFO PFX "Core Boosting disabled.\n"); + } +} + +static ssize_t store_cpb(struct cpufreq_policy *policy, const char *buf, + size_t count) +{ + int ret = -EINVAL; + unsigned long val = 0; + + ret = strict_strtoul(buf, 10, &val); + if (!ret && (val == 0 || val == 1) && cpb_capable) + cpb_toggle(val); + else + return -EINVAL; + + return count; +} + +static ssize_t show_cpb(struct cpufreq_policy *policy, char *buf) +{ + return sprintf(buf, "%u\n", cpb_enabled); +} + +#define define_one_rw(_name) \ +static struct freq_attr _name = \ +__ATTR(_name, 0644, show_##_name, store_##_name) + +define_one_rw(cpb); + static struct freq_attr *powernow_k8_attr[] = { &cpufreq_freq_attr_scaling_available_freqs, + &cpb, NULL, }; @@ -1411,10 +1492,51 @@ static struct cpufreq_driver cpufreq_amd64_driver = { .attr = powernow_k8_attr, }; +/* + * Clear the boost-disable flag on the CPU_DOWN path so that this cpu + * cannot block the remaining ones from boosting. On the CPU_UP path we + * simply keep the boost-disable flag in sync with the current global + * state. + */ +static int cpb_notify(struct notifier_block *nb, unsigned long action, + void *hcpu) +{ + unsigned cpu = (long)hcpu; + u32 lo, hi; + + switch (action) { + case CPU_UP_PREPARE: + case CPU_UP_PREPARE_FROZEN: + + if (!cpb_enabled) { + rdmsr_on_cpu(cpu, MSR_K7_HWCR, &lo, &hi); + lo |= BIT(25); + wrmsr_on_cpu(cpu, MSR_K7_HWCR, lo, hi); + } + break; + + case CPU_DOWN_PREPARE: + case CPU_DOWN_PREPARE_FROZEN: + rdmsr_on_cpu(cpu, MSR_K7_HWCR, &lo, &hi); + lo &= ~BIT(25); + wrmsr_on_cpu(cpu, MSR_K7_HWCR, lo, hi); + break; + + default: + break; + } + + return NOTIFY_OK; +} + +static struct notifier_block cpb_nb = { + .notifier_call = cpb_notify, +}; + /* driver entry point for init */ static int __cpuinit powernowk8_init(void) { - unsigned int i, supported_cpus = 0; + unsigned int i, supported_cpus = 0, cpu; for_each_online_cpu(i) { int rc; @@ -1423,15 +1545,36 @@ static int __cpuinit powernowk8_init(void) supported_cpus++; } - if (supported_cpus == num_online_cpus()) { - printk(KERN_INFO PFX "Found %d %s " - "processors (%d cpu cores) (" VERSION ")\n", - num_online_nodes(), - boot_cpu_data.x86_model_id, supported_cpus); - return cpufreq_register_driver(&cpufreq_amd64_driver); + if (supported_cpus != num_online_cpus()) + return -ENODEV; + + printk(KERN_INFO PFX "Found %d %s (%d cpu cores) (" VERSION ")\n", + num_online_nodes(), boot_cpu_data.x86_model_id, supported_cpus); + + if (boot_cpu_has(X86_FEATURE_CPB)) { + + cpb_capable = true; + + register_cpu_notifier(&cpb_nb); + + msrs = msrs_alloc(); + if (!msrs) { + printk(KERN_ERR "%s: Error allocating msrs!\n", __func__); + return -ENOMEM; + } + + rdmsr_on_cpus(cpu_online_mask, MSR_K7_HWCR, msrs); + + for_each_cpu(cpu, cpu_online_mask) { + struct msr *reg = per_cpu_ptr(msrs, cpu); + cpb_enabled |= !(!!(reg->l & BIT(25))); + } + + printk(KERN_INFO PFX "Core Performance Boosting: %s.\n", + (cpb_enabled ? "on" : "off")); } - return -ENODEV; + return cpufreq_register_driver(&cpufreq_amd64_driver); } /* driver entry point for term */ @@ -1439,6 +1582,13 @@ static void __exit powernowk8_exit(void) { dprintk("exit\n"); + if (boot_cpu_has(X86_FEATURE_CPB)) { + msrs_free(msrs); + msrs = NULL; + + unregister_cpu_notifier(&cpb_nb); + } + cpufreq_unregister_driver(&cpufreq_amd64_driver); } diff --git a/arch/x86/kernel/cpu/cpufreq/powernow-k8.h b/arch/x86/kernel/cpu/cpufreq/powernow-k8.h index 02ce824073cb..df3529b1c02d 100644 --- a/arch/x86/kernel/cpu/cpufreq/powernow-k8.h +++ b/arch/x86/kernel/cpu/cpufreq/powernow-k8.h @@ -5,7 +5,6 @@ * http://www.gnu.org/licenses/gpl.html */ - enum pstate { HW_PSTATE_INVALID = 0xff, HW_PSTATE_0 = 0, @@ -55,7 +54,6 @@ struct powernow_k8_data { struct cpumask *available_cores; }; - /* processor's cpuid instruction support */ #define CPUID_PROCESSOR_SIGNATURE 1 /* function 1 */ #define CPUID_XFAM 0x0ff00000 /* extended family */ diff --git a/arch/x86/kernel/cpu/hypervisor.c b/arch/x86/kernel/cpu/hypervisor.c index 08be922de33a..8095f8611f8a 100644 --- a/arch/x86/kernel/cpu/hypervisor.c +++ b/arch/x86/kernel/cpu/hypervisor.c @@ -21,37 +21,58 @@ * */ +#include <linux/module.h> #include <asm/processor.h> -#include <asm/vmware.h> #include <asm/hypervisor.h> -static inline void __cpuinit -detect_hypervisor_vendor(struct cpuinfo_x86 *c) +/* + * Hypervisor detect order. This is specified explicitly here because + * some hypervisors might implement compatibility modes for other + * hypervisors and therefore need to be detected in specific sequence. + */ +static const __initconst struct hypervisor_x86 * const hypervisors[] = { - if (vmware_platform()) - c->x86_hyper_vendor = X86_HYPER_VENDOR_VMWARE; - else - c->x86_hyper_vendor = X86_HYPER_VENDOR_NONE; -} + &x86_hyper_vmware, + &x86_hyper_ms_hyperv, +#ifdef CONFIG_XEN_PVHVM + &x86_hyper_xen_hvm, +#endif +}; -static inline void __cpuinit -hypervisor_set_feature_bits(struct cpuinfo_x86 *c) +const struct hypervisor_x86 *x86_hyper; +EXPORT_SYMBOL(x86_hyper); + +static inline void __init +detect_hypervisor_vendor(void) { - if (boot_cpu_data.x86_hyper_vendor == X86_HYPER_VENDOR_VMWARE) { - vmware_set_feature_bits(c); - return; + const struct hypervisor_x86 *h, * const *p; + + for (p = hypervisors; p < hypervisors + ARRAY_SIZE(hypervisors); p++) { + h = *p; + if (h->detect()) { + x86_hyper = h; + printk(KERN_INFO "Hypervisor detected: %s\n", h->name); + break; + } } } void __cpuinit init_hypervisor(struct cpuinfo_x86 *c) { - detect_hypervisor_vendor(c); - hypervisor_set_feature_bits(c); + if (x86_hyper && x86_hyper->set_cpu_features) + x86_hyper->set_cpu_features(c); } void __init init_hypervisor_platform(void) { + + detect_hypervisor_vendor(); + + if (!x86_hyper) + return; + init_hypervisor(&boot_cpu_data); - if (boot_cpu_data.x86_hyper_vendor == X86_HYPER_VENDOR_VMWARE) - vmware_platform_setup(); + + if (x86_hyper->init_platform) + x86_hyper->init_platform(); } diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c index 1366c7cfd483..85f69cdeae10 100644 --- a/arch/x86/kernel/cpu/intel.c +++ b/arch/x86/kernel/cpu/intel.c @@ -12,7 +12,6 @@ #include <asm/processor.h> #include <asm/pgtable.h> #include <asm/msr.h> -#include <asm/ds.h> #include <asm/bugs.h> #include <asm/cpu.h> @@ -373,12 +372,6 @@ static void __cpuinit init_intel(struct cpuinfo_x86 *c) set_cpu_cap(c, X86_FEATURE_ARCH_PERFMON); } - if (c->cpuid_level > 6) { - unsigned ecx = cpuid_ecx(6); - if (ecx & 0x01) - set_cpu_cap(c, X86_FEATURE_APERFMPERF); - } - if (cpu_has_xmm2) set_cpu_cap(c, X86_FEATURE_LFENCE_RDTSC); if (cpu_has_ds) { @@ -388,7 +381,6 @@ static void __cpuinit init_intel(struct cpuinfo_x86 *c) set_cpu_cap(c, X86_FEATURE_BTS); if (!(l1 & (1<<12))) set_cpu_cap(c, X86_FEATURE_PEBS); - ds_init_intel(c); } if (c->x86 == 6 && c->x86_model == 29 && cpu_has_clflush) diff --git a/arch/x86/kernel/cpu/intel_cacheinfo.c b/arch/x86/kernel/cpu/intel_cacheinfo.c index 95962a93f99a..898c2f4eab88 100644 --- a/arch/x86/kernel/cpu/intel_cacheinfo.c +++ b/arch/x86/kernel/cpu/intel_cacheinfo.c @@ -148,13 +148,19 @@ union _cpuid4_leaf_ecx { u32 full; }; +struct amd_l3_cache { + struct pci_dev *dev; + bool can_disable; + unsigned indices; + u8 subcaches[4]; +}; + struct _cpuid4_info { union _cpuid4_leaf_eax eax; union _cpuid4_leaf_ebx ebx; union _cpuid4_leaf_ecx ecx; unsigned long size; - bool can_disable; - unsigned int l3_indices; + struct amd_l3_cache *l3; DECLARE_BITMAP(shared_cpu_map, NR_CPUS); }; @@ -164,8 +170,7 @@ struct _cpuid4_info_regs { union _cpuid4_leaf_ebx ebx; union _cpuid4_leaf_ecx ecx; unsigned long size; - bool can_disable; - unsigned int l3_indices; + struct amd_l3_cache *l3; }; unsigned short num_cache_leaves; @@ -302,124 +307,246 @@ struct _cache_attr { }; #ifdef CONFIG_CPU_SUP_AMD -static unsigned int __cpuinit amd_calc_l3_indices(void) + +/* + * L3 cache descriptors + */ +static struct amd_l3_cache **__cpuinitdata l3_caches; + +static void __cpuinit amd_calc_l3_indices(struct amd_l3_cache *l3) { - /* - * We're called over smp_call_function_single() and therefore - * are on the correct cpu. - */ - int cpu = smp_processor_id(); - int node = cpu_to_node(cpu); - struct pci_dev *dev = node_to_k8_nb_misc(node); unsigned int sc0, sc1, sc2, sc3; u32 val = 0; - pci_read_config_dword(dev, 0x1C4, &val); + pci_read_config_dword(l3->dev, 0x1C4, &val); /* calculate subcache sizes */ - sc0 = !(val & BIT(0)); - sc1 = !(val & BIT(4)); - sc2 = !(val & BIT(8)) + !(val & BIT(9)); - sc3 = !(val & BIT(12)) + !(val & BIT(13)); + l3->subcaches[0] = sc0 = !(val & BIT(0)); + l3->subcaches[1] = sc1 = !(val & BIT(4)); + l3->subcaches[2] = sc2 = !(val & BIT(8)) + !(val & BIT(9)); + l3->subcaches[3] = sc3 = !(val & BIT(12)) + !(val & BIT(13)); - return (max(max(max(sc0, sc1), sc2), sc3) << 10) - 1; + l3->indices = (max(max(max(sc0, sc1), sc2), sc3) << 10) - 1; } -static void __cpuinit -amd_check_l3_disable(int index, struct _cpuid4_info_regs *this_leaf) +static struct amd_l3_cache * __cpuinit amd_init_l3_cache(int node) { - if (index < 3) + struct amd_l3_cache *l3; + struct pci_dev *dev = node_to_k8_nb_misc(node); + + l3 = kzalloc(sizeof(struct amd_l3_cache), GFP_ATOMIC); + if (!l3) { + printk(KERN_WARNING "Error allocating L3 struct\n"); + return NULL; + } + + l3->dev = dev; + + amd_calc_l3_indices(l3); + + return l3; +} + +static void __cpuinit amd_check_l3_disable(struct _cpuid4_info_regs *this_leaf, + int index) +{ + int node; + + if (boot_cpu_data.x86 != 0x10) return; - if (boot_cpu_data.x86 == 0x11) + if (index < 3) return; /* see errata #382 and #388 */ - if ((boot_cpu_data.x86 == 0x10) && - ((boot_cpu_data.x86_model < 0x8) || - (boot_cpu_data.x86_mask < 0x1))) + if (boot_cpu_data.x86_model < 0x8) return; + if ((boot_cpu_data.x86_model == 0x8 || + boot_cpu_data.x86_model == 0x9) + && + boot_cpu_data.x86_mask < 0x1) + return; + /* not in virtualized environments */ if (num_k8_northbridges == 0) return; - this_leaf->can_disable = true; - this_leaf->l3_indices = amd_calc_l3_indices(); + /* + * Strictly speaking, the amount in @size below is leaked since it is + * never freed but this is done only on shutdown so it doesn't matter. + */ + if (!l3_caches) { + int size = num_k8_northbridges * sizeof(struct amd_l3_cache *); + + l3_caches = kzalloc(size, GFP_ATOMIC); + if (!l3_caches) + return; + } + + node = amd_get_nb_id(smp_processor_id()); + + if (!l3_caches[node]) { + l3_caches[node] = amd_init_l3_cache(node); + l3_caches[node]->can_disable = true; + } + + WARN_ON(!l3_caches[node]); + + this_leaf->l3 = l3_caches[node]; } -static ssize_t show_cache_disable(struct _cpuid4_info *this_leaf, char *buf, - unsigned int index) +/* + * check whether a slot used for disabling an L3 index is occupied. + * @l3: L3 cache descriptor + * @slot: slot number (0..1) + * + * @returns: the disabled index if used or negative value if slot free. + */ +int amd_get_l3_disable_slot(struct amd_l3_cache *l3, unsigned slot) { - int cpu = cpumask_first(to_cpumask(this_leaf->shared_cpu_map)); - int node = amd_get_nb_id(cpu); - struct pci_dev *dev = node_to_k8_nb_misc(node); unsigned int reg = 0; - if (!this_leaf->can_disable) - return -EINVAL; + pci_read_config_dword(l3->dev, 0x1BC + slot * 4, ®); + + /* check whether this slot is activated already */ + if (reg & (3UL << 30)) + return reg & 0xfff; - if (!dev) + return -1; +} + +static ssize_t show_cache_disable(struct _cpuid4_info *this_leaf, char *buf, + unsigned int slot) +{ + int index; + + if (!this_leaf->l3 || !this_leaf->l3->can_disable) return -EINVAL; - pci_read_config_dword(dev, 0x1BC + index * 4, ®); - return sprintf(buf, "0x%08x\n", reg); + index = amd_get_l3_disable_slot(this_leaf->l3, slot); + if (index >= 0) + return sprintf(buf, "%d\n", index); + + return sprintf(buf, "FREE\n"); } -#define SHOW_CACHE_DISABLE(index) \ +#define SHOW_CACHE_DISABLE(slot) \ static ssize_t \ -show_cache_disable_##index(struct _cpuid4_info *this_leaf, char *buf) \ +show_cache_disable_##slot(struct _cpuid4_info *this_leaf, char *buf) \ { \ - return show_cache_disable(this_leaf, buf, index); \ + return show_cache_disable(this_leaf, buf, slot); \ } SHOW_CACHE_DISABLE(0) SHOW_CACHE_DISABLE(1) -static ssize_t store_cache_disable(struct _cpuid4_info *this_leaf, - const char *buf, size_t count, unsigned int index) +static void amd_l3_disable_index(struct amd_l3_cache *l3, int cpu, + unsigned slot, unsigned long idx) { - int cpu = cpumask_first(to_cpumask(this_leaf->shared_cpu_map)); - int node = amd_get_nb_id(cpu); - struct pci_dev *dev = node_to_k8_nb_misc(node); - unsigned long val = 0; + int i; + + idx |= BIT(30); + + /* + * disable index in all 4 subcaches + */ + for (i = 0; i < 4; i++) { + u32 reg = idx | (i << 20); + + if (!l3->subcaches[i]) + continue; + + pci_write_config_dword(l3->dev, 0x1BC + slot * 4, reg); + + /* + * We need to WBINVD on a core on the node containing the L3 + * cache which indices we disable therefore a simple wbinvd() + * is not sufficient. + */ + wbinvd_on_cpu(cpu); + + reg |= BIT(31); + pci_write_config_dword(l3->dev, 0x1BC + slot * 4, reg); + } +} + +/* + * disable a L3 cache index by using a disable-slot + * + * @l3: L3 cache descriptor + * @cpu: A CPU on the node containing the L3 cache + * @slot: slot number (0..1) + * @index: index to disable + * + * @return: 0 on success, error status on failure + */ +int amd_set_l3_disable_slot(struct amd_l3_cache *l3, int cpu, unsigned slot, + unsigned long index) +{ + int ret = 0; #define SUBCACHE_MASK (3UL << 20) #define SUBCACHE_INDEX 0xfff - if (!this_leaf->can_disable) + /* + * check whether this slot is already used or + * the index is already disabled + */ + ret = amd_get_l3_disable_slot(l3, slot); + if (ret >= 0) return -EINVAL; + /* + * check whether the other slot has disabled the + * same index already + */ + if (index == amd_get_l3_disable_slot(l3, !slot)) + return -EINVAL; + + /* do not allow writes outside of allowed bits */ + if ((index & ~(SUBCACHE_MASK | SUBCACHE_INDEX)) || + ((index & SUBCACHE_INDEX) > l3->indices)) + return -EINVAL; + + amd_l3_disable_index(l3, cpu, slot, index); + + return 0; +} + +static ssize_t store_cache_disable(struct _cpuid4_info *this_leaf, + const char *buf, size_t count, + unsigned int slot) +{ + unsigned long val = 0; + int cpu, err = 0; + if (!capable(CAP_SYS_ADMIN)) return -EPERM; - if (!dev) + if (!this_leaf->l3 || !this_leaf->l3->can_disable) return -EINVAL; - if (strict_strtoul(buf, 10, &val) < 0) - return -EINVAL; + cpu = cpumask_first(to_cpumask(this_leaf->shared_cpu_map)); - /* do not allow writes outside of allowed bits */ - if ((val & ~(SUBCACHE_MASK | SUBCACHE_INDEX)) || - ((val & SUBCACHE_INDEX) > this_leaf->l3_indices)) + if (strict_strtoul(buf, 10, &val) < 0) return -EINVAL; - val |= BIT(30); - pci_write_config_dword(dev, 0x1BC + index * 4, val); - /* - * We need to WBINVD on a core on the node containing the L3 cache which - * indices we disable therefore a simple wbinvd() is not sufficient. - */ - wbinvd_on_cpu(cpu); - pci_write_config_dword(dev, 0x1BC + index * 4, val | BIT(31)); + err = amd_set_l3_disable_slot(this_leaf->l3, cpu, slot, val); + if (err) { + if (err == -EEXIST) + printk(KERN_WARNING "L3 disable slot %d in use!\n", + slot); + return err; + } return count; } -#define STORE_CACHE_DISABLE(index) \ +#define STORE_CACHE_DISABLE(slot) \ static ssize_t \ -store_cache_disable_##index(struct _cpuid4_info *this_leaf, \ +store_cache_disable_##slot(struct _cpuid4_info *this_leaf, \ const char *buf, size_t count) \ { \ - return store_cache_disable(this_leaf, buf, count, index); \ + return store_cache_disable(this_leaf, buf, count, slot); \ } STORE_CACHE_DISABLE(0) STORE_CACHE_DISABLE(1) @@ -431,7 +558,7 @@ static struct _cache_attr cache_disable_1 = __ATTR(cache_disable_1, 0644, #else /* CONFIG_CPU_SUP_AMD */ static void __cpuinit -amd_check_l3_disable(int index, struct _cpuid4_info_regs *this_leaf) +amd_check_l3_disable(struct _cpuid4_info_regs *this_leaf, int index) { }; #endif /* CONFIG_CPU_SUP_AMD */ @@ -447,8 +574,7 @@ __cpuinit cpuid4_cache_lookup_regs(int index, if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD) { amd_cpuid4(index, &eax, &ebx, &ecx); - if (boot_cpu_data.x86 >= 0x10) - amd_check_l3_disable(index, this_leaf); + amd_check_l3_disable(this_leaf, index); } else { cpuid_count(4, index, &eax.full, &ebx.full, &ecx.full, &edx); } @@ -705,6 +831,7 @@ static void __cpuinit free_cache_attributes(unsigned int cpu) for (i = 0; i < num_cache_leaves; i++) cache_remove_shared_cpu_map(cpu, i); + kfree(per_cpu(ici_cpuid4_info, cpu)->l3); kfree(per_cpu(ici_cpuid4_info, cpu)); per_cpu(ici_cpuid4_info, cpu) = NULL; } @@ -989,7 +1116,7 @@ static int __cpuinit cache_add_dev(struct sys_device * sys_dev) this_leaf = CPUID4_INFO_IDX(cpu, i); - if (this_leaf->can_disable) + if (this_leaf->l3 && this_leaf->l3->can_disable) ktype_cache.default_attrs = default_l3_attrs; else ktype_cache.default_attrs = default_attrs; diff --git a/arch/x86/kernel/cpu/mcheck/Makefile b/arch/x86/kernel/cpu/mcheck/Makefile index 4ac6d48fe11b..bb34b03af252 100644 --- a/arch/x86/kernel/cpu/mcheck/Makefile +++ b/arch/x86/kernel/cpu/mcheck/Makefile @@ -7,3 +7,5 @@ obj-$(CONFIG_X86_MCE_THRESHOLD) += threshold.o obj-$(CONFIG_X86_MCE_INJECT) += mce-inject.o obj-$(CONFIG_X86_THERMAL_VECTOR) += therm_throt.o + +obj-$(CONFIG_ACPI_APEI) += mce-apei.o diff --git a/arch/x86/kernel/cpu/mcheck/mce-apei.c b/arch/x86/kernel/cpu/mcheck/mce-apei.c new file mode 100644 index 000000000000..745b54f9be89 --- /dev/null +++ b/arch/x86/kernel/cpu/mcheck/mce-apei.c @@ -0,0 +1,138 @@ +/* + * Bridge between MCE and APEI + * + * On some machine, corrected memory errors are reported via APEI + * generic hardware error source (GHES) instead of corrected Machine + * Check. These corrected memory errors can be reported to user space + * through /dev/mcelog via faking a corrected Machine Check, so that + * the error memory page can be offlined by /sbin/mcelog if the error + * count for one page is beyond the threshold. + * + * For fatal MCE, save MCE record into persistent storage via ERST, so + * that the MCE record can be logged after reboot via ERST. + * + * Copyright 2010 Intel Corp. + * Author: Huang Ying <ying.huang@intel.com> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License version + * 2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#include <linux/kernel.h> +#include <linux/acpi.h> +#include <linux/cper.h> +#include <acpi/apei.h> +#include <asm/mce.h> + +#include "mce-internal.h" + +void apei_mce_report_mem_error(int corrected, struct cper_sec_mem_err *mem_err) +{ + struct mce m; + + /* Only corrected MC is reported */ + if (!corrected) + return; + + mce_setup(&m); + m.bank = 1; + /* Fake a memory read corrected error with unknown channel */ + m.status = MCI_STATUS_VAL | MCI_STATUS_EN | MCI_STATUS_ADDRV | 0x9f; + m.addr = mem_err->physical_addr; + mce_log(&m); + mce_notify_irq(); +} +EXPORT_SYMBOL_GPL(apei_mce_report_mem_error); + +#define CPER_CREATOR_MCE \ + UUID_LE(0x75a574e3, 0x5052, 0x4b29, 0x8a, 0x8e, 0xbe, 0x2c, \ + 0x64, 0x90, 0xb8, 0x9d) +#define CPER_SECTION_TYPE_MCE \ + UUID_LE(0xfe08ffbe, 0x95e4, 0x4be7, 0xbc, 0x73, 0x40, 0x96, \ + 0x04, 0x4a, 0x38, 0xfc) + +/* + * CPER specification (in UEFI specification 2.3 appendix N) requires + * byte-packed. + */ +struct cper_mce_record { + struct cper_record_header hdr; + struct cper_section_descriptor sec_hdr; + struct mce mce; +} __packed; + +int apei_write_mce(struct mce *m) +{ + struct cper_mce_record rcd; + + memset(&rcd, 0, sizeof(rcd)); + memcpy(rcd.hdr.signature, CPER_SIG_RECORD, CPER_SIG_SIZE); + rcd.hdr.revision = CPER_RECORD_REV; + rcd.hdr.signature_end = CPER_SIG_END; + rcd.hdr.section_count = 1; + rcd.hdr.error_severity = CPER_SER_FATAL; + /* timestamp, platform_id, partition_id are all invalid */ + rcd.hdr.validation_bits = 0; + rcd.hdr.record_length = sizeof(rcd); + rcd.hdr.creator_id = CPER_CREATOR_MCE; + rcd.hdr.notification_type = CPER_NOTIFY_MCE; + rcd.hdr.record_id = cper_next_record_id(); + rcd.hdr.flags = CPER_HW_ERROR_FLAGS_PREVERR; + + rcd.sec_hdr.section_offset = (void *)&rcd.mce - (void *)&rcd; + rcd.sec_hdr.section_length = sizeof(rcd.mce); + rcd.sec_hdr.revision = CPER_SEC_REV; + /* fru_id and fru_text is invalid */ + rcd.sec_hdr.validation_bits = 0; + rcd.sec_hdr.flags = CPER_SEC_PRIMARY; + rcd.sec_hdr.section_type = CPER_SECTION_TYPE_MCE; + rcd.sec_hdr.section_severity = CPER_SER_FATAL; + + memcpy(&rcd.mce, m, sizeof(*m)); + + return erst_write(&rcd.hdr); +} + +ssize_t apei_read_mce(struct mce *m, u64 *record_id) +{ + struct cper_mce_record rcd; + ssize_t len; + + len = erst_read_next(&rcd.hdr, sizeof(rcd)); + if (len <= 0) + return len; + /* Can not skip other records in storage via ERST unless clear them */ + else if (len != sizeof(rcd) || + uuid_le_cmp(rcd.hdr.creator_id, CPER_CREATOR_MCE)) { + if (printk_ratelimit()) + pr_warning( + "MCE-APEI: Can not skip the unknown record in ERST"); + return -EIO; + } + + memcpy(m, &rcd.mce, sizeof(*m)); + *record_id = rcd.hdr.record_id; + + return sizeof(*m); +} + +/* Check whether there is record in ERST */ +int apei_check_mce(void) +{ + return erst_get_record_count(); +} + +int apei_clear_mce(u64 record_id) +{ + return erst_clear(record_id); +} diff --git a/arch/x86/kernel/cpu/mcheck/mce-internal.h b/arch/x86/kernel/cpu/mcheck/mce-internal.h index 32996f9fab67..fefcc69ee8b5 100644 --- a/arch/x86/kernel/cpu/mcheck/mce-internal.h +++ b/arch/x86/kernel/cpu/mcheck/mce-internal.h @@ -28,3 +28,26 @@ extern int mce_ser; extern struct mce_bank *mce_banks; +#ifdef CONFIG_ACPI_APEI +int apei_write_mce(struct mce *m); +ssize_t apei_read_mce(struct mce *m, u64 *record_id); +int apei_check_mce(void); +int apei_clear_mce(u64 record_id); +#else +static inline int apei_write_mce(struct mce *m) +{ + return -EINVAL; +} +static inline ssize_t apei_read_mce(struct mce *m, u64 *record_id) +{ + return 0; +} +static inline int apei_check_mce(void) +{ + return 0; +} +static inline int apei_clear_mce(u64 record_id) +{ + return -EINVAL; +} +#endif diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index 8a6f0afa767e..ed41562909fe 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -36,6 +36,7 @@ #include <linux/fs.h> #include <linux/mm.h> #include <linux/debugfs.h> +#include <linux/edac_mce.h> #include <asm/processor.h> #include <asm/hw_irq.h> @@ -50,7 +51,7 @@ static DEFINE_MUTEX(mce_read_mutex); #define rcu_dereference_check_mce(p) \ - rcu_dereference_check((p), \ + rcu_dereference_index_check((p), \ rcu_read_lock_sched_held() || \ lockdep_is_held(&mce_read_mutex)) @@ -106,8 +107,8 @@ EXPORT_SYMBOL_GPL(x86_mce_decoder_chain); static int default_decode_mce(struct notifier_block *nb, unsigned long val, void *data) { - pr_emerg("No human readable MCE decoding support on this CPU type.\n"); - pr_emerg("Run the message through 'mcelog --ascii' to decode.\n"); + pr_emerg(HW_ERR "No human readable MCE decoding support on this CPU type.\n"); + pr_emerg(HW_ERR "Run the message through 'mcelog --ascii' to decode.\n"); return NOTIFY_STOP; } @@ -169,6 +170,15 @@ void mce_log(struct mce *mce) entry = rcu_dereference_check_mce(mcelog.next); for (;;) { /* + * If edac_mce is enabled, it will check the error type + * and will process it, if it is a known error. + * Otherwise, the error will be sent through mcelog + * interface + */ + if (edac_mce_parse(mce)) + return; + + /* * When the buffer fills up discard new entries. * Assume that the earlier errors are the more * interesting ones: @@ -201,11 +211,11 @@ void mce_log(struct mce *mce) static void print_mce(struct mce *m) { - pr_emerg("CPU %d: Machine Check Exception: %16Lx Bank %d: %016Lx\n", + pr_emerg(HW_ERR "CPU %d: Machine Check Exception: %Lx Bank %d: %016Lx\n", m->extcpu, m->mcgstatus, m->bank, m->status); if (m->ip) { - pr_emerg("RIP%s %02x:<%016Lx> ", + pr_emerg(HW_ERR "RIP%s %02x:<%016Lx> ", !(m->mcgstatus & MCG_STATUS_EIPV) ? " !INEXACT!" : "", m->cs, m->ip); @@ -214,14 +224,14 @@ static void print_mce(struct mce *m) pr_cont("\n"); } - pr_emerg("TSC %llx ", m->tsc); + pr_emerg(HW_ERR "TSC %llx ", m->tsc); if (m->addr) pr_cont("ADDR %llx ", m->addr); if (m->misc) pr_cont("MISC %llx ", m->misc); pr_cont("\n"); - pr_emerg("PROCESSOR %u:%x TIME %llu SOCKET %u APIC %x\n", + pr_emerg(HW_ERR "PROCESSOR %u:%x TIME %llu SOCKET %u APIC %x\n", m->cpuvendor, m->cpuid, m->time, m->socketid, m->apicid); /* @@ -231,16 +241,6 @@ static void print_mce(struct mce *m) atomic_notifier_call_chain(&x86_mce_decoder_chain, 0, m); } -static void print_mce_head(void) -{ - pr_emerg("\nHARDWARE ERROR\n"); -} - -static void print_mce_tail(void) -{ - pr_emerg("This is not a software problem!\n"); -} - #define PANIC_TIMEOUT 5 /* 5 seconds */ static atomic_t mce_paniced; @@ -264,7 +264,7 @@ static void wait_for_panic(void) static void mce_panic(char *msg, struct mce *final, char *exp) { - int i; + int i, apei_err = 0; if (!fake_panic) { /* @@ -281,14 +281,16 @@ static void mce_panic(char *msg, struct mce *final, char *exp) if (atomic_inc_return(&mce_fake_paniced) > 1) return; } - print_mce_head(); /* First print corrected ones that are still unlogged */ for (i = 0; i < MCE_LOG_LEN; i++) { struct mce *m = &mcelog.entry[i]; if (!(m->status & MCI_STATUS_VAL)) continue; - if (!(m->status & MCI_STATUS_UC)) + if (!(m->status & MCI_STATUS_UC)) { print_mce(m); + if (!apei_err) + apei_err = apei_write_mce(m); + } } /* Now print uncorrected but with the final one last */ for (i = 0; i < MCE_LOG_LEN; i++) { @@ -297,22 +299,27 @@ static void mce_panic(char *msg, struct mce *final, char *exp) continue; if (!(m->status & MCI_STATUS_UC)) continue; - if (!final || memcmp(m, final, sizeof(struct mce))) + if (!final || memcmp(m, final, sizeof(struct mce))) { print_mce(m); + if (!apei_err) + apei_err = apei_write_mce(m); + } } - if (final) + if (final) { print_mce(final); + if (!apei_err) + apei_err = apei_write_mce(final); + } if (cpu_missing) - printk(KERN_EMERG "Some CPUs didn't answer in synchronization\n"); - print_mce_tail(); + pr_emerg(HW_ERR "Some CPUs didn't answer in synchronization\n"); if (exp) - printk(KERN_EMERG "Machine check: %s\n", exp); + pr_emerg(HW_ERR "Machine check: %s\n", exp); if (!fake_panic) { if (panic_timeout == 0) panic_timeout = mce_panic_timeout; panic(msg); } else - printk(KERN_EMERG "Fake kernel panic: %s\n", msg); + pr_emerg(HW_ERR "Fake kernel panic: %s\n", msg); } /* Support code for software error injection */ @@ -539,7 +546,7 @@ void machine_check_poll(enum mcp_flags flags, mce_banks_t *b) struct mce m; int i; - __get_cpu_var(mce_poll_count)++; + percpu_inc(mce_poll_count); mce_setup(&m); @@ -581,6 +588,7 @@ void machine_check_poll(enum mcp_flags flags, mce_banks_t *b) */ if (!(flags & MCP_DONTLOG) && !mce_dont_log_ce) { mce_log(&m); + atomic_notifier_call_chain(&x86_mce_decoder_chain, 0, &m); add_taint(TAINT_MACHINE_CHECK); } @@ -934,7 +942,7 @@ void do_machine_check(struct pt_regs *regs, long error_code) atomic_inc(&mce_entry); - __get_cpu_var(mce_exception_count)++; + percpu_inc(mce_exception_count); if (notify_die(DIE_NMI, "machine check", regs, error_code, 18, SIGKILL) == NOTIFY_STOP) @@ -1201,7 +1209,7 @@ int mce_notify_irq(void) schedule_work(&mce_trigger_work); if (__ratelimit(&ratelimit)) - printk(KERN_INFO "Machine check events logged\n"); + pr_info(HW_ERR "Machine check events logged\n"); return 1; } @@ -1493,6 +1501,43 @@ static void collect_tscs(void *data) rdtscll(cpu_tsc[smp_processor_id()]); } +static int mce_apei_read_done; + +/* Collect MCE record of previous boot in persistent storage via APEI ERST. */ +static int __mce_read_apei(char __user **ubuf, size_t usize) +{ + int rc; + u64 record_id; + struct mce m; + + if (usize < sizeof(struct mce)) + return -EINVAL; + + rc = apei_read_mce(&m, &record_id); + /* Error or no more MCE record */ + if (rc <= 0) { + mce_apei_read_done = 1; + return rc; + } + rc = -EFAULT; + if (copy_to_user(*ubuf, &m, sizeof(struct mce))) + return rc; + /* + * In fact, we should have cleared the record after that has + * been flushed to the disk or sent to network in + * /sbin/mcelog, but we have no interface to support that now, + * so just clear it to avoid duplication. + */ + rc = apei_clear_mce(record_id); + if (rc) { + mce_apei_read_done = 1; + return rc; + } + *ubuf += sizeof(struct mce); + + return 0; +} + static ssize_t mce_read(struct file *filp, char __user *ubuf, size_t usize, loff_t *off) { @@ -1506,15 +1551,19 @@ static ssize_t mce_read(struct file *filp, char __user *ubuf, size_t usize, return -ENOMEM; mutex_lock(&mce_read_mutex); + + if (!mce_apei_read_done) { + err = __mce_read_apei(&buf, usize); + if (err || buf != ubuf) + goto out; + } + next = rcu_dereference_check_mce(mcelog.next); /* Only supports full reads right now */ - if (*off != 0 || usize < MCE_LOG_LEN*sizeof(struct mce)) { - mutex_unlock(&mce_read_mutex); - kfree(cpu_tsc); - - return -EINVAL; - } + err = -EINVAL; + if (*off != 0 || usize < MCE_LOG_LEN*sizeof(struct mce)) + goto out; err = 0; prev = 0; @@ -1562,10 +1611,15 @@ timeout: memset(&mcelog.entry[i], 0, sizeof(struct mce)); } } + + if (err) + err = -EFAULT; + +out: mutex_unlock(&mce_read_mutex); kfree(cpu_tsc); - return err ? -EFAULT : buf - ubuf; + return err ? err : buf - ubuf; } static unsigned int mce_poll(struct file *file, poll_table *wait) @@ -1573,6 +1627,8 @@ static unsigned int mce_poll(struct file *file, poll_table *wait) poll_wait(file, &mce_wait, wait); if (rcu_dereference_check_mce(mcelog.next)) return POLLIN | POLLRDNORM; + if (!mce_apei_read_done && apei_check_mce()) + return POLLIN | POLLRDNORM; return 0; } diff --git a/arch/x86/kernel/cpu/mcheck/mce_intel.c b/arch/x86/kernel/cpu/mcheck/mce_intel.c index 62b48e40920a..6fcd0936194f 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_intel.c +++ b/arch/x86/kernel/cpu/mcheck/mce_intel.c @@ -95,19 +95,20 @@ static void cmci_discover(int banks, int boot) rdmsrl(MSR_IA32_MCx_CTL2(i), val); /* Already owned by someone else? */ - if (val & CMCI_EN) { + if (val & MCI_CTL2_CMCI_EN) { if (test_and_clear_bit(i, owned) && !boot) print_update("SHD", &hdr, i); __clear_bit(i, __get_cpu_var(mce_poll_banks)); continue; } - val |= CMCI_EN | CMCI_THRESHOLD; + val &= ~MCI_CTL2_CMCI_THRESHOLD_MASK; + val |= MCI_CTL2_CMCI_EN | CMCI_THRESHOLD; wrmsrl(MSR_IA32_MCx_CTL2(i), val); rdmsrl(MSR_IA32_MCx_CTL2(i), val); /* Did the enable bit stick? -- the bank supports CMCI */ - if (val & CMCI_EN) { + if (val & MCI_CTL2_CMCI_EN) { if (!test_and_set_bit(i, owned) && !boot) print_update("CMCI", &hdr, i); __clear_bit(i, __get_cpu_var(mce_poll_banks)); @@ -155,7 +156,7 @@ void cmci_clear(void) continue; /* Disable CMCI */ rdmsrl(MSR_IA32_MCx_CTL2(i), val); - val &= ~(CMCI_EN|CMCI_THRESHOLD_MASK); + val &= ~(MCI_CTL2_CMCI_EN|MCI_CTL2_CMCI_THRESHOLD_MASK); wrmsrl(MSR_IA32_MCx_CTL2(i), val); __clear_bit(i, __get_cpu_var(mce_banks_owned)); } diff --git a/arch/x86/kernel/cpu/mcheck/therm_throt.c b/arch/x86/kernel/cpu/mcheck/therm_throt.c index 81c499eceb21..c2a8b26d4fea 100644 --- a/arch/x86/kernel/cpu/mcheck/therm_throt.c +++ b/arch/x86/kernel/cpu/mcheck/therm_throt.c @@ -34,15 +34,25 @@ /* How long to wait between reporting thermal events */ #define CHECK_INTERVAL (300 * HZ) +#define THERMAL_THROTTLING_EVENT 0 +#define POWER_LIMIT_EVENT 1 + /* - * Current thermal throttling state: + * Current thermal event state: */ -struct thermal_state { - bool is_throttled; - +struct _thermal_state { + bool new_event; + int event; u64 next_check; - unsigned long throttle_count; - unsigned long last_throttle_count; + unsigned long count; + unsigned long last_count; +}; + +struct thermal_state { + struct _thermal_state core_throttle; + struct _thermal_state core_power_limit; + struct _thermal_state package_throttle; + struct _thermal_state package_power_limit; }; static DEFINE_PER_CPU(struct thermal_state, thermal_state); @@ -53,11 +63,13 @@ static u32 lvtthmr_init __read_mostly; #ifdef CONFIG_SYSFS #define define_therm_throt_sysdev_one_ro(_name) \ - static SYSDEV_ATTR(_name, 0444, therm_throt_sysdev_show_##_name, NULL) + static SYSDEV_ATTR(_name, 0444, \ + therm_throt_sysdev_show_##_name, \ + NULL) \ -#define define_therm_throt_sysdev_show_func(name) \ +#define define_therm_throt_sysdev_show_func(event, name) \ \ -static ssize_t therm_throt_sysdev_show_##name( \ +static ssize_t therm_throt_sysdev_show_##event##_##name( \ struct sys_device *dev, \ struct sysdev_attribute *attr, \ char *buf) \ @@ -66,30 +78,42 @@ static ssize_t therm_throt_sysdev_show_##name( \ ssize_t ret; \ \ preempt_disable(); /* CPU hotplug */ \ - if (cpu_online(cpu)) \ + if (cpu_online(cpu)) { \ ret = sprintf(buf, "%lu\n", \ - per_cpu(thermal_state, cpu).name); \ - else \ + per_cpu(thermal_state, cpu).event.name); \ + } else \ ret = 0; \ preempt_enable(); \ \ return ret; \ } -define_therm_throt_sysdev_show_func(throttle_count); -define_therm_throt_sysdev_one_ro(throttle_count); +define_therm_throt_sysdev_show_func(core_throttle, count); +define_therm_throt_sysdev_one_ro(core_throttle_count); + +define_therm_throt_sysdev_show_func(core_power_limit, count); +define_therm_throt_sysdev_one_ro(core_power_limit_count); + +define_therm_throt_sysdev_show_func(package_throttle, count); +define_therm_throt_sysdev_one_ro(package_throttle_count); + +define_therm_throt_sysdev_show_func(package_power_limit, count); +define_therm_throt_sysdev_one_ro(package_power_limit_count); static struct attribute *thermal_throttle_attrs[] = { - &attr_throttle_count.attr, + &attr_core_throttle_count.attr, NULL }; -static struct attribute_group thermal_throttle_attr_group = { +static struct attribute_group thermal_attr_group = { .attrs = thermal_throttle_attrs, .name = "thermal_throttle" }; #endif /* CONFIG_SYSFS */ +#define CORE_LEVEL 0 +#define PACKAGE_LEVEL 1 + /*** * therm_throt_process - Process thermal throttling event from interrupt * @curr: Whether the condition is current or not (boolean), since the @@ -106,39 +130,70 @@ static struct attribute_group thermal_throttle_attr_group = { * 1 : Event should be logged further, and a message has been * printed to the syslog. */ -static int therm_throt_process(bool is_throttled) +static int therm_throt_process(bool new_event, int event, int level) { - struct thermal_state *state; - unsigned int this_cpu; - bool was_throttled; + struct _thermal_state *state; + unsigned int this_cpu = smp_processor_id(); + bool old_event; u64 now; + struct thermal_state *pstate = &per_cpu(thermal_state, this_cpu); - this_cpu = smp_processor_id(); now = get_jiffies_64(); - state = &per_cpu(thermal_state, this_cpu); + if (level == CORE_LEVEL) { + if (event == THERMAL_THROTTLING_EVENT) + state = &pstate->core_throttle; + else if (event == POWER_LIMIT_EVENT) + state = &pstate->core_power_limit; + else + return 0; + } else if (level == PACKAGE_LEVEL) { + if (event == THERMAL_THROTTLING_EVENT) + state = &pstate->package_throttle; + else if (event == POWER_LIMIT_EVENT) + state = &pstate->package_power_limit; + else + return 0; + } else + return 0; - was_throttled = state->is_throttled; - state->is_throttled = is_throttled; + old_event = state->new_event; + state->new_event = new_event; - if (is_throttled) - state->throttle_count++; + if (new_event) + state->count++; if (time_before64(now, state->next_check) && - state->throttle_count != state->last_throttle_count) + state->count != state->last_count) return 0; state->next_check = now + CHECK_INTERVAL; - state->last_throttle_count = state->throttle_count; + state->last_count = state->count; /* if we just entered the thermal event */ - if (is_throttled) { - printk(KERN_CRIT "CPU%d: Temperature above threshold, cpu clock throttled (total events = %lu)\n", this_cpu, state->throttle_count); + if (new_event) { + if (event == THERMAL_THROTTLING_EVENT) + printk(KERN_CRIT "CPU%d: %s temperature above threshold, cpu clock throttled (total events = %lu)\n", + this_cpu, + level == CORE_LEVEL ? "Core" : "Package", + state->count); + else + printk(KERN_CRIT "CPU%d: %s power limit notification (total events = %lu)\n", + this_cpu, + level == CORE_LEVEL ? "Core" : "Package", + state->count); add_taint(TAINT_MACHINE_CHECK); return 1; } - if (was_throttled) { - printk(KERN_INFO "CPU%d: Temperature/speed normal\n", this_cpu); + if (old_event) { + if (event == THERMAL_THROTTLING_EVENT) + printk(KERN_INFO "CPU%d: %s temperature/speed normal\n", + this_cpu, + level == CORE_LEVEL ? "Core" : "Package"); + else + printk(KERN_INFO "CPU%d: %s power limit normal\n", + this_cpu, + level == CORE_LEVEL ? "Core" : "Package"); return 1; } @@ -149,13 +204,32 @@ static int therm_throt_process(bool is_throttled) /* Add/Remove thermal_throttle interface for CPU device: */ static __cpuinit int thermal_throttle_add_dev(struct sys_device *sys_dev) { - return sysfs_create_group(&sys_dev->kobj, - &thermal_throttle_attr_group); + int err; + struct cpuinfo_x86 *c = &cpu_data(smp_processor_id()); + + err = sysfs_create_group(&sys_dev->kobj, &thermal_attr_group); + if (err) + return err; + + if (cpu_has(c, X86_FEATURE_PLN)) + err = sysfs_add_file_to_group(&sys_dev->kobj, + &attr_core_power_limit_count.attr, + thermal_attr_group.name); + if (cpu_has(c, X86_FEATURE_PTS)) + err = sysfs_add_file_to_group(&sys_dev->kobj, + &attr_package_throttle_count.attr, + thermal_attr_group.name); + if (cpu_has(c, X86_FEATURE_PLN)) + err = sysfs_add_file_to_group(&sys_dev->kobj, + &attr_package_power_limit_count.attr, + thermal_attr_group.name); + + return err; } static __cpuinit void thermal_throttle_remove_dev(struct sys_device *sys_dev) { - sysfs_remove_group(&sys_dev->kobj, &thermal_throttle_attr_group); + sysfs_remove_group(&sys_dev->kobj, &thermal_attr_group); } /* Mutex protecting device creation against CPU hotplug: */ @@ -190,7 +264,7 @@ thermal_throttle_cpu_callback(struct notifier_block *nfb, mutex_unlock(&therm_cpu_lock); break; } - return err ? NOTIFY_BAD : NOTIFY_OK; + return notifier_from_errno(err); } static struct notifier_block thermal_throttle_cpu_notifier __cpuinitdata = @@ -226,14 +300,50 @@ device_initcall(thermal_throttle_init_device); #endif /* CONFIG_SYSFS */ +/* + * Set up the most two significant bit to notify mce log that this thermal + * event type. + * This is a temp solution. May be changed in the future with mce log + * infrasture. + */ +#define CORE_THROTTLED (0) +#define CORE_POWER_LIMIT ((__u64)1 << 62) +#define PACKAGE_THROTTLED ((__u64)2 << 62) +#define PACKAGE_POWER_LIMIT ((__u64)3 << 62) + /* Thermal transition interrupt handler */ static void intel_thermal_interrupt(void) { __u64 msr_val; + struct cpuinfo_x86 *c = &cpu_data(smp_processor_id()); rdmsrl(MSR_IA32_THERM_STATUS, msr_val); - if (therm_throt_process((msr_val & THERM_STATUS_PROCHOT) != 0)) - mce_log_therm_throt_event(msr_val); + + if (therm_throt_process(msr_val & THERM_STATUS_PROCHOT, + THERMAL_THROTTLING_EVENT, + CORE_LEVEL) != 0) + mce_log_therm_throt_event(CORE_THROTTLED | msr_val); + + if (cpu_has(c, X86_FEATURE_PLN)) + if (therm_throt_process(msr_val & THERM_STATUS_POWER_LIMIT, + POWER_LIMIT_EVENT, + CORE_LEVEL) != 0) + mce_log_therm_throt_event(CORE_POWER_LIMIT | msr_val); + + if (cpu_has(c, X86_FEATURE_PTS)) { + rdmsrl(MSR_IA32_PACKAGE_THERM_STATUS, msr_val); + if (therm_throt_process(msr_val & PACKAGE_THERM_STATUS_PROCHOT, + THERMAL_THROTTLING_EVENT, + PACKAGE_LEVEL) != 0) + mce_log_therm_throt_event(PACKAGE_THROTTLED | msr_val); + if (cpu_has(c, X86_FEATURE_PLN)) + if (therm_throt_process(msr_val & + PACKAGE_THERM_STATUS_POWER_LIMIT, + POWER_LIMIT_EVENT, + PACKAGE_LEVEL) != 0) + mce_log_therm_throt_event(PACKAGE_POWER_LIMIT + | msr_val); + } } static void unexpected_thermal_interrupt(void) @@ -335,8 +445,26 @@ void intel_init_thermal(struct cpuinfo_x86 *c) apic_write(APIC_LVTTHMR, h); rdmsr(MSR_IA32_THERM_INTERRUPT, l, h); - wrmsr(MSR_IA32_THERM_INTERRUPT, - l | (THERM_INT_LOW_ENABLE | THERM_INT_HIGH_ENABLE), h); + if (cpu_has(c, X86_FEATURE_PLN)) + wrmsr(MSR_IA32_THERM_INTERRUPT, + l | (THERM_INT_LOW_ENABLE + | THERM_INT_HIGH_ENABLE | THERM_INT_PLN_ENABLE), h); + else + wrmsr(MSR_IA32_THERM_INTERRUPT, + l | (THERM_INT_LOW_ENABLE | THERM_INT_HIGH_ENABLE), h); + + if (cpu_has(c, X86_FEATURE_PTS)) { + rdmsr(MSR_IA32_PACKAGE_THERM_INTERRUPT, l, h); + if (cpu_has(c, X86_FEATURE_PLN)) + wrmsr(MSR_IA32_PACKAGE_THERM_INTERRUPT, + l | (PACKAGE_THERM_INT_LOW_ENABLE + | PACKAGE_THERM_INT_HIGH_ENABLE + | PACKAGE_THERM_INT_PLN_ENABLE), h); + else + wrmsr(MSR_IA32_PACKAGE_THERM_INTERRUPT, + l | (PACKAGE_THERM_INT_LOW_ENABLE + | PACKAGE_THERM_INT_HIGH_ENABLE), h); + } smp_thermal_vector = intel_thermal_interrupt; diff --git a/arch/x86/kernel/cpu/mshyperv.c b/arch/x86/kernel/cpu/mshyperv.c new file mode 100644 index 000000000000..d944bf6c50e9 --- /dev/null +++ b/arch/x86/kernel/cpu/mshyperv.c @@ -0,0 +1,56 @@ +/* + * HyperV Detection code. + * + * Copyright (C) 2010, Novell, Inc. + * Author : K. Y. Srinivasan <ksrinivasan@novell.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; version 2 of the License. + * + */ + +#include <linux/types.h> +#include <linux/module.h> +#include <asm/processor.h> +#include <asm/hypervisor.h> +#include <asm/hyperv.h> +#include <asm/mshyperv.h> + +struct ms_hyperv_info ms_hyperv; +EXPORT_SYMBOL_GPL(ms_hyperv); + +static bool __init ms_hyperv_platform(void) +{ + u32 eax; + u32 hyp_signature[3]; + + if (!boot_cpu_has(X86_FEATURE_HYPERVISOR)) + return false; + + cpuid(HYPERV_CPUID_VENDOR_AND_MAX_FUNCTIONS, + &eax, &hyp_signature[0], &hyp_signature[1], &hyp_signature[2]); + + return eax >= HYPERV_CPUID_MIN && + eax <= HYPERV_CPUID_MAX && + !memcmp("Microsoft Hv", hyp_signature, 12); +} + +static void __init ms_hyperv_init_platform(void) +{ + /* + * Extract the features and hints + */ + ms_hyperv.features = cpuid_eax(HYPERV_CPUID_FEATURES); + ms_hyperv.hints = cpuid_eax(HYPERV_CPUID_ENLIGHTMENT_INFO); + + printk(KERN_INFO "HyperV: features 0x%x, hints 0x%x\n", + ms_hyperv.features, ms_hyperv.hints); +} + +const __refconst struct hypervisor_x86 x86_hyper_ms_hyperv = { + .name = "Microsoft HyperV", + .detect = ms_hyperv_platform, + .init_platform = ms_hyperv_init_platform, +}; +EXPORT_SYMBOL(x86_hyper_ms_hyperv); diff --git a/arch/x86/kernel/cpu/mtrr/cleanup.c b/arch/x86/kernel/cpu/mtrr/cleanup.c index 06130b52f012..c5f59d071425 100644 --- a/arch/x86/kernel/cpu/mtrr/cleanup.c +++ b/arch/x86/kernel/cpu/mtrr/cleanup.c @@ -632,9 +632,9 @@ static void __init mtrr_print_out_one_result(int i) unsigned long gran_base, chunk_base, lose_base; char gran_factor, chunk_factor, lose_factor; - gran_base = to_size_factor(result[i].gran_sizek, &gran_factor), - chunk_base = to_size_factor(result[i].chunk_sizek, &chunk_factor), - lose_base = to_size_factor(result[i].lose_cover_sizek, &lose_factor), + gran_base = to_size_factor(result[i].gran_sizek, &gran_factor); + chunk_base = to_size_factor(result[i].chunk_sizek, &chunk_factor); + lose_base = to_size_factor(result[i].lose_cover_sizek, &lose_factor); pr_info("%sgran_size: %ld%c \tchunk_size: %ld%c \t", result[i].bad ? "*BAD*" : " ", diff --git a/arch/x86/kernel/cpu/mtrr/generic.c b/arch/x86/kernel/cpu/mtrr/generic.c index fd31a441c61c..7d28d7d03885 100644 --- a/arch/x86/kernel/cpu/mtrr/generic.c +++ b/arch/x86/kernel/cpu/mtrr/generic.c @@ -433,13 +433,12 @@ static void generic_get_mtrr(unsigned int reg, unsigned long *base, { unsigned int mask_lo, mask_hi, base_lo, base_hi; unsigned int tmp, hi; - int cpu; /* * get_mtrr doesn't need to update mtrr_state, also it could be called * from any cpu, so try to print it out directly. */ - cpu = get_cpu(); + get_cpu(); rdmsr(MTRRphysMask_MSR(reg), mask_lo, mask_hi); diff --git a/arch/x86/kernel/cpu/mtrr/main.c b/arch/x86/kernel/cpu/mtrr/main.c index 79556bd9b602..01c0f3ee6cc3 100644 --- a/arch/x86/kernel/cpu/mtrr/main.c +++ b/arch/x86/kernel/cpu/mtrr/main.c @@ -35,6 +35,7 @@ #include <linux/types.h> /* FIXME: kvm_para.h needs this */ +#include <linux/stop_machine.h> #include <linux/kvm_para.h> #include <linux/uaccess.h> #include <linux/module.h> @@ -143,22 +144,28 @@ struct set_mtrr_data { mtrr_type smp_type; }; +static DEFINE_PER_CPU(struct cpu_stop_work, mtrr_work); + /** - * ipi_handler - Synchronisation handler. Executed by "other" CPUs. + * mtrr_work_handler - Synchronisation handler. Executed by "other" CPUs. * @info: pointer to mtrr configuration data * * Returns nothing. */ -static void ipi_handler(void *info) +static int mtrr_work_handler(void *info) { #ifdef CONFIG_SMP struct set_mtrr_data *data = info; unsigned long flags; + atomic_dec(&data->count); + while (!atomic_read(&data->gate)) + cpu_relax(); + local_irq_save(flags); atomic_dec(&data->count); - while (!atomic_read(&data->gate)) + while (atomic_read(&data->gate)) cpu_relax(); /* The master has cleared me to execute */ @@ -173,12 +180,13 @@ static void ipi_handler(void *info) } atomic_dec(&data->count); - while (atomic_read(&data->gate)) + while (!atomic_read(&data->gate)) cpu_relax(); atomic_dec(&data->count); local_irq_restore(flags); #endif + return 0; } static inline int types_compatible(mtrr_type type1, mtrr_type type2) @@ -198,7 +206,7 @@ static inline int types_compatible(mtrr_type type1, mtrr_type type2) * * This is kinda tricky, but fortunately, Intel spelled it out for us cleanly: * - * 1. Send IPI to do the following: + * 1. Queue work to do the following on all processors: * 2. Disable Interrupts * 3. Wait for all procs to do so * 4. Enter no-fill cache mode @@ -215,14 +223,17 @@ static inline int types_compatible(mtrr_type type1, mtrr_type type2) * 15. Enable interrupts. * * What does that mean for us? Well, first we set data.count to the number - * of CPUs. As each CPU disables interrupts, it'll decrement it once. We wait - * until it hits 0 and proceed. We set the data.gate flag and reset data.count. - * Meanwhile, they are waiting for that flag to be set. Once it's set, each + * of CPUs. As each CPU announces that it started the rendezvous handler by + * decrementing the count, We reset data.count and set the data.gate flag + * allowing all the cpu's to proceed with the work. As each cpu disables + * interrupts, it'll decrement data.count once. We wait until it hits 0 and + * proceed. We clear the data.gate flag and reset data.count. Meanwhile, they + * are waiting for that flag to be cleared. Once it's cleared, each * CPU goes through the transition of updating MTRRs. * The CPU vendors may each do it differently, * so we call mtrr_if->set() callback and let them take care of it. * When they're done, they again decrement data->count and wait for data.gate - * to be reset. + * to be set. * When we finish, we wait for data.count to hit 0 and toggle the data.gate flag * Everyone then enables interrupts and we all continue on. * @@ -234,6 +245,9 @@ set_mtrr(unsigned int reg, unsigned long base, unsigned long size, mtrr_type typ { struct set_mtrr_data data; unsigned long flags; + int cpu; + + preempt_disable(); data.smp_reg = reg; data.smp_base = base; @@ -246,10 +260,15 @@ set_mtrr(unsigned int reg, unsigned long base, unsigned long size, mtrr_type typ atomic_set(&data.gate, 0); /* Start the ball rolling on other CPUs */ - if (smp_call_function(ipi_handler, &data, 0) != 0) - panic("mtrr: timed out waiting for other CPUs\n"); + for_each_online_cpu(cpu) { + struct cpu_stop_work *work = &per_cpu(mtrr_work, cpu); + + if (cpu == smp_processor_id()) + continue; + + stop_one_cpu_nowait(cpu, mtrr_work_handler, &data, work); + } - local_irq_save(flags); while (atomic_read(&data.count)) cpu_relax(); @@ -259,6 +278,16 @@ set_mtrr(unsigned int reg, unsigned long base, unsigned long size, mtrr_type typ smp_wmb(); atomic_set(&data.gate, 1); + local_irq_save(flags); + + while (atomic_read(&data.count)) + cpu_relax(); + + /* Ok, reset count and toggle gate */ + atomic_set(&data.count, num_booting_cpus() - 1); + smp_wmb(); + atomic_set(&data.gate, 0); + /* Do our MTRR business */ /* @@ -279,7 +308,7 @@ set_mtrr(unsigned int reg, unsigned long base, unsigned long size, mtrr_type typ atomic_set(&data.count, num_booting_cpus() - 1); smp_wmb(); - atomic_set(&data.gate, 0); + atomic_set(&data.gate, 1); /* * Wait here for everyone to have seen the gate change @@ -289,6 +318,7 @@ set_mtrr(unsigned int reg, unsigned long base, unsigned long size, mtrr_type typ cpu_relax(); local_irq_restore(flags); + preempt_enable(); } /** diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index db5bdc8addf8..f2da20fda02d 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c @@ -31,46 +31,51 @@ #include <asm/nmi.h> #include <asm/compat.h> -static u64 perf_event_mask __read_mostly; +#if 0 +#undef wrmsrl +#define wrmsrl(msr, val) \ +do { \ + trace_printk("wrmsrl(%lx, %lx)\n", (unsigned long)(msr),\ + (unsigned long)(val)); \ + native_write_msr((msr), (u32)((u64)(val)), \ + (u32)((u64)(val) >> 32)); \ +} while (0) +#endif -/* The maximal number of PEBS events: */ -#define MAX_PEBS_EVENTS 4 +/* + * best effort, GUP based copy_from_user() that assumes IRQ or NMI context + */ +static unsigned long +copy_from_user_nmi(void *to, const void __user *from, unsigned long n) +{ + unsigned long offset, addr = (unsigned long)from; + int type = in_nmi() ? KM_NMI : KM_IRQ0; + unsigned long size, len = 0; + struct page *page; + void *map; + int ret; -/* The size of a BTS record in bytes: */ -#define BTS_RECORD_SIZE 24 + do { + ret = __get_user_pages_fast(addr, 1, 0, &page); + if (!ret) + break; -/* The size of a per-cpu BTS buffer in bytes: */ -#define BTS_BUFFER_SIZE (BTS_RECORD_SIZE * 2048) + offset = addr & (PAGE_SIZE - 1); + size = min(PAGE_SIZE - offset, n - len); -/* The BTS overflow threshold in bytes from the end of the buffer: */ -#define BTS_OVFL_TH (BTS_RECORD_SIZE * 128) + map = kmap_atomic(page, type); + memcpy(to, map+offset, size); + kunmap_atomic(map, type); + put_page(page); + len += size; + to += size; + addr += size; -/* - * Bits in the debugctlmsr controlling branch tracing. - */ -#define X86_DEBUGCTL_TR (1 << 6) -#define X86_DEBUGCTL_BTS (1 << 7) -#define X86_DEBUGCTL_BTINT (1 << 8) -#define X86_DEBUGCTL_BTS_OFF_OS (1 << 9) -#define X86_DEBUGCTL_BTS_OFF_USR (1 << 10) + } while (len < n); -/* - * A debug store configuration. - * - * We only support architectures that use 64bit fields. - */ -struct debug_store { - u64 bts_buffer_base; - u64 bts_index; - u64 bts_absolute_maximum; - u64 bts_interrupt_threshold; - u64 pebs_buffer_base; - u64 pebs_index; - u64 pebs_absolute_maximum; - u64 pebs_interrupt_threshold; - u64 pebs_event_reset[MAX_PEBS_EVENTS]; -}; + return len; +} struct event_constraint { union { @@ -89,18 +94,42 @@ struct amd_nb { struct event_constraint event_constraints[X86_PMC_IDX_MAX]; }; +#define MAX_LBR_ENTRIES 16 + struct cpu_hw_events { + /* + * Generic x86 PMC bits + */ struct perf_event *events[X86_PMC_IDX_MAX]; /* in counter order */ unsigned long active_mask[BITS_TO_LONGS(X86_PMC_IDX_MAX)]; - unsigned long interrupts; int enabled; - struct debug_store *ds; int n_events; int n_added; + int n_txn; int assign[X86_PMC_IDX_MAX]; /* event to counter assignment */ u64 tags[X86_PMC_IDX_MAX]; struct perf_event *event_list[X86_PMC_IDX_MAX]; /* in enabled order */ + + unsigned int group_flag; + + /* + * Intel DebugStore bits + */ + struct debug_store *ds; + u64 pebs_enabled; + + /* + * Intel LBR bits + */ + int lbr_users; + void *lbr_context; + struct perf_branch_stack lbr_stack; + struct perf_branch_entry lbr_entries[MAX_LBR_ENTRIES]; + + /* + * AMD specific bits + */ struct amd_nb *amd_nb; }; @@ -114,44 +143,75 @@ struct cpu_hw_events { #define EVENT_CONSTRAINT(c, n, m) \ __EVENT_CONSTRAINT(c, n, m, HWEIGHT(n)) +/* + * Constraint on the Event code. + */ #define INTEL_EVENT_CONSTRAINT(c, n) \ - EVENT_CONSTRAINT(c, n, INTEL_ARCH_EVTSEL_MASK) + EVENT_CONSTRAINT(c, n, ARCH_PERFMON_EVENTSEL_EVENT) +/* + * Constraint on the Event code + UMask + fixed-mask + * + * filter mask to validate fixed counter events. + * the following filters disqualify for fixed counters: + * - inv + * - edge + * - cnt-mask + * The other filters are supported by fixed counters. + * The any-thread option is supported starting with v3. + */ #define FIXED_EVENT_CONSTRAINT(c, n) \ - EVENT_CONSTRAINT(c, (1ULL << (32+n)), INTEL_ARCH_FIXED_MASK) + EVENT_CONSTRAINT(c, (1ULL << (32+n)), X86_RAW_EVENT_MASK) + +/* + * Constraint on the Event code + UMask + */ +#define PEBS_EVENT_CONSTRAINT(c, n) \ + EVENT_CONSTRAINT(c, n, INTEL_ARCH_EVENT_MASK) #define EVENT_CONSTRAINT_END \ EVENT_CONSTRAINT(0, 0, 0) #define for_each_event_constraint(e, c) \ - for ((e) = (c); (e)->cmask; (e)++) + for ((e) = (c); (e)->weight; (e)++) + +union perf_capabilities { + struct { + u64 lbr_format : 6; + u64 pebs_trap : 1; + u64 pebs_arch_reg : 1; + u64 pebs_format : 4; + u64 smm_freeze : 1; + }; + u64 capabilities; +}; /* * struct x86_pmu - generic x86 pmu */ struct x86_pmu { + /* + * Generic x86 PMC bits + */ const char *name; int version; int (*handle_irq)(struct pt_regs *); void (*disable_all)(void); - void (*enable_all)(void); + void (*enable_all)(int added); void (*enable)(struct perf_event *); void (*disable)(struct perf_event *); + int (*hw_config)(struct perf_event *event); + int (*schedule_events)(struct cpu_hw_events *cpuc, int n, int *assign); unsigned eventsel; unsigned perfctr; u64 (*event_map)(int); - u64 (*raw_event)(u64); int max_events; - int num_events; - int num_events_fixed; - int event_bits; - u64 event_mask; + int num_counters; + int num_counters_fixed; + int cntval_bits; + u64 cntval_mask; int apic; u64 max_period; - u64 intel_ctrl; - void (*enable_bts)(u64 config); - void (*disable_bts)(void); - struct event_constraint * (*get_event_constraints)(struct cpu_hw_events *cpuc, struct perf_event *event); @@ -159,11 +219,33 @@ struct x86_pmu { void (*put_event_constraints)(struct cpu_hw_events *cpuc, struct perf_event *event); struct event_constraint *event_constraints; + void (*quirks)(void); + int perfctr_second_write; int (*cpu_prepare)(int cpu); void (*cpu_starting)(int cpu); void (*cpu_dying)(int cpu); void (*cpu_dead)(int cpu); + + /* + * Intel Arch Perfmon v2+ + */ + u64 intel_ctrl; + union perf_capabilities intel_cap; + + /* + * Intel DebugStore bits + */ + int bts, pebs; + int pebs_record_size; + void (*drain_pebs)(struct pt_regs *regs); + struct event_constraint *pebs_constraints; + + /* + * Intel LBR + */ + unsigned long lbr_tos, lbr_from, lbr_to; /* MSR base regs */ + int lbr_nr; /* hardware stack size */ }; static struct x86_pmu x86_pmu __read_mostly; @@ -198,7 +280,7 @@ static u64 x86_perf_event_update(struct perf_event *event) { struct hw_perf_event *hwc = &event->hw; - int shift = 64 - x86_pmu.event_bits; + int shift = 64 - x86_pmu.cntval_bits; u64 prev_raw_count, new_raw_count; int idx = hwc->idx; s64 delta; @@ -214,10 +296,10 @@ x86_perf_event_update(struct perf_event *event) * count to the generic event atomically: */ again: - prev_raw_count = atomic64_read(&hwc->prev_count); + prev_raw_count = local64_read(&hwc->prev_count); rdmsrl(hwc->event_base + idx, new_raw_count); - if (atomic64_cmpxchg(&hwc->prev_count, prev_raw_count, + if (local64_cmpxchg(&hwc->prev_count, prev_raw_count, new_raw_count) != prev_raw_count) goto again; @@ -232,8 +314,8 @@ again: delta = (new_raw_count << shift) - (prev_raw_count << shift); delta >>= shift; - atomic64_add(delta, &event->count); - atomic64_sub(delta, &hwc->period_left); + local64_add(delta, &event->count); + local64_sub(delta, &hwc->period_left); return new_raw_count; } @@ -241,33 +323,32 @@ again: static atomic_t active_events; static DEFINE_MUTEX(pmc_reserve_mutex); +#ifdef CONFIG_X86_LOCAL_APIC + static bool reserve_pmc_hardware(void) { -#ifdef CONFIG_X86_LOCAL_APIC int i; if (nmi_watchdog == NMI_LOCAL_APIC) disable_lapic_nmi_watchdog(); - for (i = 0; i < x86_pmu.num_events; i++) { + for (i = 0; i < x86_pmu.num_counters; i++) { if (!reserve_perfctr_nmi(x86_pmu.perfctr + i)) goto perfctr_fail; } - for (i = 0; i < x86_pmu.num_events; i++) { + for (i = 0; i < x86_pmu.num_counters; i++) { if (!reserve_evntsel_nmi(x86_pmu.eventsel + i)) goto eventsel_fail; } -#endif return true; -#ifdef CONFIG_X86_LOCAL_APIC eventsel_fail: for (i--; i >= 0; i--) release_evntsel_nmi(x86_pmu.eventsel + i); - i = x86_pmu.num_events; + i = x86_pmu.num_counters; perfctr_fail: for (i--; i >= 0; i--) @@ -277,128 +358,36 @@ perfctr_fail: enable_lapic_nmi_watchdog(); return false; -#endif } static void release_pmc_hardware(void) { -#ifdef CONFIG_X86_LOCAL_APIC int i; - for (i = 0; i < x86_pmu.num_events; i++) { + for (i = 0; i < x86_pmu.num_counters; i++) { release_perfctr_nmi(x86_pmu.perfctr + i); release_evntsel_nmi(x86_pmu.eventsel + i); } if (nmi_watchdog == NMI_LOCAL_APIC) enable_lapic_nmi_watchdog(); -#endif -} - -static inline bool bts_available(void) -{ - return x86_pmu.enable_bts != NULL; -} - -static void init_debug_store_on_cpu(int cpu) -{ - struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds; - - if (!ds) - return; - - wrmsr_on_cpu(cpu, MSR_IA32_DS_AREA, - (u32)((u64)(unsigned long)ds), - (u32)((u64)(unsigned long)ds >> 32)); -} - -static void fini_debug_store_on_cpu(int cpu) -{ - if (!per_cpu(cpu_hw_events, cpu).ds) - return; - - wrmsr_on_cpu(cpu, MSR_IA32_DS_AREA, 0, 0); -} - -static void release_bts_hardware(void) -{ - int cpu; - - if (!bts_available()) - return; - - get_online_cpus(); - - for_each_online_cpu(cpu) - fini_debug_store_on_cpu(cpu); - - for_each_possible_cpu(cpu) { - struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds; - - if (!ds) - continue; - - per_cpu(cpu_hw_events, cpu).ds = NULL; - - kfree((void *)(unsigned long)ds->bts_buffer_base); - kfree(ds); - } - - put_online_cpus(); } -static int reserve_bts_hardware(void) -{ - int cpu, err = 0; - - if (!bts_available()) - return 0; - - get_online_cpus(); - - for_each_possible_cpu(cpu) { - struct debug_store *ds; - void *buffer; - - err = -ENOMEM; - buffer = kzalloc(BTS_BUFFER_SIZE, GFP_KERNEL); - if (unlikely(!buffer)) - break; - - ds = kzalloc(sizeof(*ds), GFP_KERNEL); - if (unlikely(!ds)) { - kfree(buffer); - break; - } - - ds->bts_buffer_base = (u64)(unsigned long)buffer; - ds->bts_index = ds->bts_buffer_base; - ds->bts_absolute_maximum = - ds->bts_buffer_base + BTS_BUFFER_SIZE; - ds->bts_interrupt_threshold = - ds->bts_absolute_maximum - BTS_OVFL_TH; +#else - per_cpu(cpu_hw_events, cpu).ds = ds; - err = 0; - } +static bool reserve_pmc_hardware(void) { return true; } +static void release_pmc_hardware(void) {} - if (err) - release_bts_hardware(); - else { - for_each_online_cpu(cpu) - init_debug_store_on_cpu(cpu); - } - - put_online_cpus(); +#endif - return err; -} +static int reserve_ds_buffers(void); +static void release_ds_buffers(void); static void hw_perf_event_destroy(struct perf_event *event) { if (atomic_dec_and_mutex_lock(&active_events, &pmc_reserve_mutex)) { release_pmc_hardware(); - release_bts_hardware(); + release_ds_buffers(); mutex_unlock(&pmc_reserve_mutex); } } @@ -441,59 +430,16 @@ set_ext_hw_attr(struct hw_perf_event *hwc, struct perf_event_attr *attr) return 0; } -/* - * Setup the hardware configuration for a given attr_type - */ -static int __hw_perf_event_init(struct perf_event *event) +static int x86_setup_perfctr(struct perf_event *event) { struct perf_event_attr *attr = &event->attr; struct hw_perf_event *hwc = &event->hw; u64 config; - int err; - - if (!x86_pmu_initialized()) - return -ENODEV; - - err = 0; - if (!atomic_inc_not_zero(&active_events)) { - mutex_lock(&pmc_reserve_mutex); - if (atomic_read(&active_events) == 0) { - if (!reserve_pmc_hardware()) - err = -EBUSY; - else - err = reserve_bts_hardware(); - } - if (!err) - atomic_inc(&active_events); - mutex_unlock(&pmc_reserve_mutex); - } - if (err) - return err; - - event->destroy = hw_perf_event_destroy; - - /* - * Generate PMC IRQs: - * (keep 'enabled' bit clear for now) - */ - hwc->config = ARCH_PERFMON_EVENTSEL_INT; - - hwc->idx = -1; - hwc->last_cpu = -1; - hwc->last_tag = ~0ULL; - - /* - * Count user and OS events unless requested not to. - */ - if (!attr->exclude_user) - hwc->config |= ARCH_PERFMON_EVENTSEL_USR; - if (!attr->exclude_kernel) - hwc->config |= ARCH_PERFMON_EVENTSEL_OS; if (!hwc->sample_period) { hwc->sample_period = x86_pmu.max_period; hwc->last_period = hwc->sample_period; - atomic64_set(&hwc->period_left, hwc->sample_period); + local64_set(&hwc->period_left, hwc->sample_period); } else { /* * If we have a PMU initialized but no APIC @@ -505,16 +451,8 @@ static int __hw_perf_event_init(struct perf_event *event) return -EOPNOTSUPP; } - /* - * Raw hw_event type provide the config in the hw_event structure - */ - if (attr->type == PERF_TYPE_RAW) { - hwc->config |= x86_pmu.raw_event(attr->config); - if ((hwc->config & ARCH_PERFMON_EVENTSEL_ANY) && - perf_paranoid_cpu() && !capable(CAP_SYS_ADMIN)) - return -EACCES; + if (attr->type == PERF_TYPE_RAW) return 0; - } if (attr->type == PERF_TYPE_HW_CACHE) return set_ext_hw_attr(hwc, attr); @@ -539,11 +477,11 @@ static int __hw_perf_event_init(struct perf_event *event) if ((attr->config == PERF_COUNT_HW_BRANCH_INSTRUCTIONS) && (hwc->sample_period == 1)) { /* BTS is not supported by this architecture. */ - if (!bts_available()) + if (!x86_pmu.bts) return -EOPNOTSUPP; /* BTS is currently only allowed for user-mode. */ - if (hwc->config & ARCH_PERFMON_EVENTSEL_OS) + if (!attr->exclude_kernel) return -EOPNOTSUPP; } @@ -552,12 +490,87 @@ static int __hw_perf_event_init(struct perf_event *event) return 0; } +static int x86_pmu_hw_config(struct perf_event *event) +{ + if (event->attr.precise_ip) { + int precise = 0; + + /* Support for constant skid */ + if (x86_pmu.pebs) + precise++; + + /* Support for IP fixup */ + if (x86_pmu.lbr_nr) + precise++; + + if (event->attr.precise_ip > precise) + return -EOPNOTSUPP; + } + + /* + * Generate PMC IRQs: + * (keep 'enabled' bit clear for now) + */ + event->hw.config = ARCH_PERFMON_EVENTSEL_INT; + + /* + * Count user and OS events unless requested not to + */ + if (!event->attr.exclude_user) + event->hw.config |= ARCH_PERFMON_EVENTSEL_USR; + if (!event->attr.exclude_kernel) + event->hw.config |= ARCH_PERFMON_EVENTSEL_OS; + + if (event->attr.type == PERF_TYPE_RAW) + event->hw.config |= event->attr.config & X86_RAW_EVENT_MASK; + + return x86_setup_perfctr(event); +} + +/* + * Setup the hardware configuration for a given attr_type + */ +static int __hw_perf_event_init(struct perf_event *event) +{ + int err; + + if (!x86_pmu_initialized()) + return -ENODEV; + + err = 0; + if (!atomic_inc_not_zero(&active_events)) { + mutex_lock(&pmc_reserve_mutex); + if (atomic_read(&active_events) == 0) { + if (!reserve_pmc_hardware()) + err = -EBUSY; + else { + err = reserve_ds_buffers(); + if (err) + release_pmc_hardware(); + } + } + if (!err) + atomic_inc(&active_events); + mutex_unlock(&pmc_reserve_mutex); + } + if (err) + return err; + + event->destroy = hw_perf_event_destroy; + + event->hw.idx = -1; + event->hw.last_cpu = -1; + event->hw.last_tag = ~0ULL; + + return x86_pmu.hw_config(event); +} + static void x86_pmu_disable_all(void) { struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); int idx; - for (idx = 0; idx < x86_pmu.num_events; idx++) { + for (idx = 0; idx < x86_pmu.num_counters; idx++) { u64 val; if (!test_bit(idx, cpuc->active_mask)) @@ -587,12 +600,12 @@ void hw_perf_disable(void) x86_pmu.disable_all(); } -static void x86_pmu_enable_all(void) +static void x86_pmu_enable_all(int added) { struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); int idx; - for (idx = 0; idx < x86_pmu.num_events; idx++) { + for (idx = 0; idx < x86_pmu.num_counters; idx++) { struct perf_event *event = cpuc->events[idx]; u64 val; @@ -667,14 +680,14 @@ static int x86_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign) * assign events to counters starting with most * constrained events. */ - wmax = x86_pmu.num_events; + wmax = x86_pmu.num_counters; /* * when fixed event counters are present, * wmax is incremented by 1 to account * for one more choice */ - if (x86_pmu.num_events_fixed) + if (x86_pmu.num_counters_fixed) wmax++; for (w = 1, num = n; num && w <= wmax; w++) { @@ -724,7 +737,7 @@ static int collect_events(struct cpu_hw_events *cpuc, struct perf_event *leader, struct perf_event *event; int n, max_count; - max_count = x86_pmu.num_events + x86_pmu.num_events_fixed; + max_count = x86_pmu.num_counters + x86_pmu.num_counters_fixed; /* current number of events already accepted */ n = cpuc->n_events; @@ -795,7 +808,7 @@ void hw_perf_enable(void) struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); struct perf_event *event; struct hw_perf_event *hwc; - int i; + int i, added = cpuc->n_added; if (!x86_pmu_initialized()) return; @@ -847,19 +860,20 @@ void hw_perf_enable(void) cpuc->enabled = 1; barrier(); - x86_pmu.enable_all(); + x86_pmu.enable_all(added); } -static inline void __x86_pmu_enable_event(struct hw_perf_event *hwc) +static inline void __x86_pmu_enable_event(struct hw_perf_event *hwc, + u64 enable_mask) { - (void)checking_wrmsrl(hwc->config_base + hwc->idx, - hwc->config | ARCH_PERFMON_EVENTSEL_ENABLE); + wrmsrl(hwc->config_base + hwc->idx, hwc->config | enable_mask); } static inline void x86_pmu_disable_event(struct perf_event *event) { struct hw_perf_event *hwc = &event->hw; - (void)checking_wrmsrl(hwc->config_base + hwc->idx, hwc->config); + + wrmsrl(hwc->config_base + hwc->idx, hwc->config); } static DEFINE_PER_CPU(u64 [X86_PMC_IDX_MAX], pmc_prev_left); @@ -872,9 +886,9 @@ static int x86_perf_event_set_period(struct perf_event *event) { struct hw_perf_event *hwc = &event->hw; - s64 left = atomic64_read(&hwc->period_left); + s64 left = local64_read(&hwc->period_left); s64 period = hwc->sample_period; - int err, ret = 0, idx = hwc->idx; + int ret = 0, idx = hwc->idx; if (idx == X86_PMC_IDX_FIXED_BTS) return 0; @@ -884,14 +898,14 @@ x86_perf_event_set_period(struct perf_event *event) */ if (unlikely(left <= -period)) { left = period; - atomic64_set(&hwc->period_left, left); + local64_set(&hwc->period_left, left); hwc->last_period = period; ret = 1; } if (unlikely(left <= 0)) { left += period; - atomic64_set(&hwc->period_left, left); + local64_set(&hwc->period_left, left); hwc->last_period = period; ret = 1; } @@ -910,10 +924,19 @@ x86_perf_event_set_period(struct perf_event *event) * The hw event starts counting from this event offset, * mark it to be able to extra future deltas: */ - atomic64_set(&hwc->prev_count, (u64)-left); + local64_set(&hwc->prev_count, (u64)-left); + + wrmsrl(hwc->event_base + idx, (u64)(-left) & x86_pmu.cntval_mask); - err = checking_wrmsrl(hwc->event_base + idx, - (u64)(-left) & x86_pmu.event_mask); + /* + * Due to erratum on certan cpu we need + * a second write to be sure the register + * is updated properly + */ + if (x86_pmu.perfctr_second_write) { + wrmsrl(hwc->event_base + idx, + (u64)(-left) & x86_pmu.cntval_mask); + } perf_event_update_userpage(event); @@ -924,7 +947,8 @@ static void x86_pmu_enable_event(struct perf_event *event) { struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); if (cpuc->enabled) - __x86_pmu_enable_event(&event->hw); + __x86_pmu_enable_event(&event->hw, + ARCH_PERFMON_EVENTSEL_ENABLE); } /* @@ -950,7 +974,15 @@ static int x86_pmu_enable(struct perf_event *event) if (n < 0) return n; - ret = x86_schedule_events(cpuc, n, assign); + /* + * If group events scheduling transaction was started, + * skip the schedulability test here, it will be peformed + * at commit time(->commit_txn) as a whole + */ + if (cpuc->group_flag & PERF_EVENT_TXN) + goto out; + + ret = x86_pmu.schedule_events(cpuc, n, assign); if (ret) return ret; /* @@ -959,8 +991,10 @@ static int x86_pmu_enable(struct perf_event *event) */ memcpy(cpuc->assign, assign, n*sizeof(int)); +out: cpuc->n_events = n; cpuc->n_added += n - n0; + cpuc->n_txn += n - n0; return 0; } @@ -991,11 +1025,12 @@ static void x86_pmu_unthrottle(struct perf_event *event) void perf_event_print_debug(void) { u64 ctrl, status, overflow, pmc_ctrl, pmc_count, prev_left, fixed; + u64 pebs; struct cpu_hw_events *cpuc; unsigned long flags; int cpu, idx; - if (!x86_pmu.num_events) + if (!x86_pmu.num_counters) return; local_irq_save(flags); @@ -1008,16 +1043,18 @@ void perf_event_print_debug(void) rdmsrl(MSR_CORE_PERF_GLOBAL_STATUS, status); rdmsrl(MSR_CORE_PERF_GLOBAL_OVF_CTRL, overflow); rdmsrl(MSR_ARCH_PERFMON_FIXED_CTR_CTRL, fixed); + rdmsrl(MSR_IA32_PEBS_ENABLE, pebs); pr_info("\n"); pr_info("CPU#%d: ctrl: %016llx\n", cpu, ctrl); pr_info("CPU#%d: status: %016llx\n", cpu, status); pr_info("CPU#%d: overflow: %016llx\n", cpu, overflow); pr_info("CPU#%d: fixed: %016llx\n", cpu, fixed); + pr_info("CPU#%d: pebs: %016llx\n", cpu, pebs); } - pr_info("CPU#%d: active: %016llx\n", cpu, *(u64 *)cpuc->active_mask); + pr_info("CPU#%d: active: %016llx\n", cpu, *(u64 *)cpuc->active_mask); - for (idx = 0; idx < x86_pmu.num_events; idx++) { + for (idx = 0; idx < x86_pmu.num_counters; idx++) { rdmsrl(x86_pmu.eventsel + idx, pmc_ctrl); rdmsrl(x86_pmu.perfctr + idx, pmc_count); @@ -1030,7 +1067,7 @@ void perf_event_print_debug(void) pr_info("CPU#%d: gen-PMC%d left: %016llx\n", cpu, idx, prev_left); } - for (idx = 0; idx < x86_pmu.num_events_fixed; idx++) { + for (idx = 0; idx < x86_pmu.num_counters_fixed; idx++) { rdmsrl(MSR_ARCH_PERFMON_FIXED_CTR0 + idx, pmc_count); pr_info("CPU#%d: fixed-PMC%d count: %016llx\n", @@ -1064,6 +1101,14 @@ static void x86_pmu_disable(struct perf_event *event) struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); int i; + /* + * If we're called during a txn, we don't need to do anything. + * The events never got scheduled and ->cancel_txn will truncate + * the event_list. + */ + if (cpuc->group_flag & PERF_EVENT_TXN) + return; + x86_pmu_stop(event); for (i = 0; i < cpuc->n_events; i++) { @@ -1095,7 +1140,7 @@ static int x86_pmu_handle_irq(struct pt_regs *regs) cpuc = &__get_cpu_var(cpu_hw_events); - for (idx = 0; idx < x86_pmu.num_events; idx++) { + for (idx = 0; idx < x86_pmu.num_counters; idx++) { if (!test_bit(idx, cpuc->active_mask)) continue; @@ -1103,7 +1148,7 @@ static int x86_pmu_handle_irq(struct pt_regs *regs) hwc = &event->hw; val = x86_perf_event_update(event); - if (val & (1ULL << (x86_pmu.event_bits - 1))) + if (val & (1ULL << (x86_pmu.cntval_bits - 1))) continue; /* @@ -1146,7 +1191,6 @@ void set_perf_event_pending(void) void perf_events_lapic_init(void) { -#ifdef CONFIG_X86_LOCAL_APIC if (!x86_pmu.apic || !x86_pmu_initialized()) return; @@ -1154,7 +1198,6 @@ void perf_events_lapic_init(void) * Always use NMI for PMU */ apic_write(APIC_LVTPC, APIC_DM_NMI); -#endif } static int __kprobes @@ -1178,9 +1221,7 @@ perf_event_nmi_handler(struct notifier_block *self, regs = args->regs; -#ifdef CONFIG_X86_LOCAL_APIC apic_write(APIC_LVTPC, APIC_DM_NMI); -#endif /* * Can't rely on the handled return value to say it was our NMI, two * events could trigger 'simultaneously' raising two back-to-back NMIs. @@ -1217,118 +1258,11 @@ x86_get_event_constraints(struct cpu_hw_events *cpuc, struct perf_event *event) return &unconstrained; } -static int x86_event_sched_in(struct perf_event *event, - struct perf_cpu_context *cpuctx) -{ - int ret = 0; - - event->state = PERF_EVENT_STATE_ACTIVE; - event->oncpu = smp_processor_id(); - event->tstamp_running += event->ctx->time - event->tstamp_stopped; - - if (!is_x86_event(event)) - ret = event->pmu->enable(event); - - if (!ret && !is_software_event(event)) - cpuctx->active_oncpu++; - - if (!ret && event->attr.exclusive) - cpuctx->exclusive = 1; - - return ret; -} - -static void x86_event_sched_out(struct perf_event *event, - struct perf_cpu_context *cpuctx) -{ - event->state = PERF_EVENT_STATE_INACTIVE; - event->oncpu = -1; - - if (!is_x86_event(event)) - event->pmu->disable(event); - - event->tstamp_running -= event->ctx->time - event->tstamp_stopped; - - if (!is_software_event(event)) - cpuctx->active_oncpu--; - - if (event->attr.exclusive || !cpuctx->active_oncpu) - cpuctx->exclusive = 0; -} - -/* - * Called to enable a whole group of events. - * Returns 1 if the group was enabled, or -EAGAIN if it could not be. - * Assumes the caller has disabled interrupts and has - * frozen the PMU with hw_perf_save_disable. - * - * called with PMU disabled. If successful and return value 1, - * then guaranteed to call perf_enable() and hw_perf_enable() - */ -int hw_perf_group_sched_in(struct perf_event *leader, - struct perf_cpu_context *cpuctx, - struct perf_event_context *ctx) -{ - struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); - struct perf_event *sub; - int assign[X86_PMC_IDX_MAX]; - int n0, n1, ret; - - /* n0 = total number of events */ - n0 = collect_events(cpuc, leader, true); - if (n0 < 0) - return n0; - - ret = x86_schedule_events(cpuc, n0, assign); - if (ret) - return ret; - - ret = x86_event_sched_in(leader, cpuctx); - if (ret) - return ret; - - n1 = 1; - list_for_each_entry(sub, &leader->sibling_list, group_entry) { - if (sub->state > PERF_EVENT_STATE_OFF) { - ret = x86_event_sched_in(sub, cpuctx); - if (ret) - goto undo; - ++n1; - } - } - /* - * copy new assignment, now we know it is possible - * will be used by hw_perf_enable() - */ - memcpy(cpuc->assign, assign, n0*sizeof(int)); - - cpuc->n_events = n0; - cpuc->n_added += n1; - ctx->nr_active += n1; - - /* - * 1 means successful and events are active - * This is not quite true because we defer - * actual activation until hw_perf_enable() but - * this way we* ensure caller won't try to enable - * individual events - */ - return 1; -undo: - x86_event_sched_out(leader, cpuctx); - n0 = 1; - list_for_each_entry(sub, &leader->sibling_list, group_entry) { - if (sub->state == PERF_EVENT_STATE_ACTIVE) { - x86_event_sched_out(sub, cpuctx); - if (++n0 == n1) - break; - } - } - return ret; -} - #include "perf_event_amd.c" #include "perf_event_p6.c" +#include "perf_event_p4.c" +#include "perf_event_intel_lbr.c" +#include "perf_event_intel_ds.c" #include "perf_event_intel.c" static int __cpuinit @@ -1402,48 +1336,50 @@ void __init init_hw_perf_events(void) pr_cont("%s PMU driver.\n", x86_pmu.name); - if (x86_pmu.num_events > X86_PMC_MAX_GENERIC) { + if (x86_pmu.quirks) + x86_pmu.quirks(); + + if (x86_pmu.num_counters > X86_PMC_MAX_GENERIC) { WARN(1, KERN_ERR "hw perf events %d > max(%d), clipping!", - x86_pmu.num_events, X86_PMC_MAX_GENERIC); - x86_pmu.num_events = X86_PMC_MAX_GENERIC; + x86_pmu.num_counters, X86_PMC_MAX_GENERIC); + x86_pmu.num_counters = X86_PMC_MAX_GENERIC; } - perf_event_mask = (1 << x86_pmu.num_events) - 1; - perf_max_events = x86_pmu.num_events; + x86_pmu.intel_ctrl = (1 << x86_pmu.num_counters) - 1; + perf_max_events = x86_pmu.num_counters; - if (x86_pmu.num_events_fixed > X86_PMC_MAX_FIXED) { + if (x86_pmu.num_counters_fixed > X86_PMC_MAX_FIXED) { WARN(1, KERN_ERR "hw perf events fixed %d > max(%d), clipping!", - x86_pmu.num_events_fixed, X86_PMC_MAX_FIXED); - x86_pmu.num_events_fixed = X86_PMC_MAX_FIXED; + x86_pmu.num_counters_fixed, X86_PMC_MAX_FIXED); + x86_pmu.num_counters_fixed = X86_PMC_MAX_FIXED; } - perf_event_mask |= - ((1LL << x86_pmu.num_events_fixed)-1) << X86_PMC_IDX_FIXED; - x86_pmu.intel_ctrl = perf_event_mask; + x86_pmu.intel_ctrl |= + ((1LL << x86_pmu.num_counters_fixed)-1) << X86_PMC_IDX_FIXED; perf_events_lapic_init(); register_die_notifier(&perf_event_nmi_notifier); unconstrained = (struct event_constraint) - __EVENT_CONSTRAINT(0, (1ULL << x86_pmu.num_events) - 1, - 0, x86_pmu.num_events); + __EVENT_CONSTRAINT(0, (1ULL << x86_pmu.num_counters) - 1, + 0, x86_pmu.num_counters); if (x86_pmu.event_constraints) { for_each_event_constraint(c, x86_pmu.event_constraints) { - if (c->cmask != INTEL_ARCH_FIXED_MASK) + if (c->cmask != X86_RAW_EVENT_MASK) continue; - c->idxmsk64 |= (1ULL << x86_pmu.num_events) - 1; - c->weight += x86_pmu.num_events; + c->idxmsk64 |= (1ULL << x86_pmu.num_counters) - 1; + c->weight += x86_pmu.num_counters; } } pr_info("... version: %d\n", x86_pmu.version); - pr_info("... bit width: %d\n", x86_pmu.event_bits); - pr_info("... generic registers: %d\n", x86_pmu.num_events); - pr_info("... value mask: %016Lx\n", x86_pmu.event_mask); + pr_info("... bit width: %d\n", x86_pmu.cntval_bits); + pr_info("... generic registers: %d\n", x86_pmu.num_counters); + pr_info("... value mask: %016Lx\n", x86_pmu.cntval_mask); pr_info("... max period: %016Lx\n", x86_pmu.max_period); - pr_info("... fixed-purpose events: %d\n", x86_pmu.num_events_fixed); - pr_info("... event mask: %016Lx\n", perf_event_mask); + pr_info("... fixed-purpose events: %d\n", x86_pmu.num_counters_fixed); + pr_info("... event mask: %016Lx\n", x86_pmu.intel_ctrl); perf_cpu_notifier(x86_pmu_notifier); } @@ -1453,6 +1389,67 @@ static inline void x86_pmu_read(struct perf_event *event) x86_perf_event_update(event); } +/* + * Start group events scheduling transaction + * Set the flag to make pmu::enable() not perform the + * schedulability test, it will be performed at commit time + */ +static void x86_pmu_start_txn(const struct pmu *pmu) +{ + struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); + + cpuc->group_flag |= PERF_EVENT_TXN; + cpuc->n_txn = 0; +} + +/* + * Stop group events scheduling transaction + * Clear the flag and pmu::enable() will perform the + * schedulability test. + */ +static void x86_pmu_cancel_txn(const struct pmu *pmu) +{ + struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); + + cpuc->group_flag &= ~PERF_EVENT_TXN; + /* + * Truncate the collected events. + */ + cpuc->n_added -= cpuc->n_txn; + cpuc->n_events -= cpuc->n_txn; +} + +/* + * Commit group events scheduling transaction + * Perform the group schedulability test as a whole + * Return 0 if success + */ +static int x86_pmu_commit_txn(const struct pmu *pmu) +{ + struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); + int assign[X86_PMC_IDX_MAX]; + int n, ret; + + n = cpuc->n_events; + + if (!x86_pmu_initialized()) + return -EAGAIN; + + ret = x86_pmu.schedule_events(cpuc, n, assign); + if (ret) + return ret; + + /* + * copy new assignment, now we know it is possible + * will be used by hw_perf_enable() + */ + memcpy(cpuc->assign, assign, n*sizeof(int)); + + cpuc->group_flag &= ~PERF_EVENT_TXN; + + return 0; +} + static const struct pmu pmu = { .enable = x86_pmu_enable, .disable = x86_pmu_disable, @@ -1460,9 +1457,38 @@ static const struct pmu pmu = { .stop = x86_pmu_stop, .read = x86_pmu_read, .unthrottle = x86_pmu_unthrottle, + .start_txn = x86_pmu_start_txn, + .cancel_txn = x86_pmu_cancel_txn, + .commit_txn = x86_pmu_commit_txn, }; /* + * validate that we can schedule this event + */ +static int validate_event(struct perf_event *event) +{ + struct cpu_hw_events *fake_cpuc; + struct event_constraint *c; + int ret = 0; + + fake_cpuc = kmalloc(sizeof(*fake_cpuc), GFP_KERNEL | __GFP_ZERO); + if (!fake_cpuc) + return -ENOMEM; + + c = x86_pmu.get_event_constraints(fake_cpuc, event); + + if (!c || !c->weight) + ret = -ENOSPC; + + if (x86_pmu.put_event_constraints) + x86_pmu.put_event_constraints(fake_cpuc, event); + + kfree(fake_cpuc); + + return ret; +} + +/* * validate a single event group * * validation include: @@ -1502,7 +1528,7 @@ static int validate_group(struct perf_event *event) fake_cpuc->n_events = n; - ret = x86_schedule_events(fake_cpuc, n, NULL); + ret = x86_pmu.schedule_events(fake_cpuc, n, NULL); out_free: kfree(fake_cpuc); @@ -1527,6 +1553,8 @@ const struct pmu *hw_perf_event_init(struct perf_event *event) if (event->group_leader != event) err = validate_group(event); + else + err = validate_event(event); event->pmu = tmp; } @@ -1574,8 +1602,7 @@ static void backtrace_address(void *data, unsigned long addr, int reliable) { struct perf_callchain_entry *entry = data; - if (reliable) - callchain_store(entry, addr); + callchain_store(entry, addr); } static const struct stacktrace_ops backtrace_ops = { @@ -1586,8 +1613,6 @@ static const struct stacktrace_ops backtrace_ops = { .walk_stack = print_context_stack_bp, }; -#include "../dumpstack.h" - static void perf_callchain_kernel(struct pt_regs *regs, struct perf_callchain_entry *entry) { @@ -1597,41 +1622,6 @@ perf_callchain_kernel(struct pt_regs *regs, struct perf_callchain_entry *entry) dump_trace(NULL, regs, NULL, regs->bp, &backtrace_ops, entry); } -/* - * best effort, GUP based copy_from_user() that assumes IRQ or NMI context - */ -static unsigned long -copy_from_user_nmi(void *to, const void __user *from, unsigned long n) -{ - unsigned long offset, addr = (unsigned long)from; - int type = in_nmi() ? KM_NMI : KM_IRQ0; - unsigned long size, len = 0; - struct page *page; - void *map; - int ret; - - do { - ret = __get_user_pages_fast(addr, 1, 0, &page); - if (!ret) - break; - - offset = addr & (PAGE_SIZE - 1); - size = min(PAGE_SIZE - offset, n - len); - - map = kmap_atomic(page, type); - memcpy(to, map+offset, size); - kunmap_atomic(map, type); - put_page(page); - - len += size; - to += size; - addr += size; - - } while (len < n); - - return len; -} - #ifdef CONFIG_COMPAT static inline int perf_callchain_user32(struct pt_regs *regs, struct perf_callchain_entry *entry) @@ -1727,6 +1717,11 @@ struct perf_callchain_entry *perf_callchain(struct pt_regs *regs) { struct perf_callchain_entry *entry; + if (perf_guest_cbs && perf_guest_cbs->is_in_guest()) { + /* TODO: We don't support guest os callchain now */ + return NULL; + } + if (in_nmi()) entry = &__get_cpu_var(pmc_nmi_entry); else @@ -1739,14 +1734,36 @@ struct perf_callchain_entry *perf_callchain(struct pt_regs *regs) return entry; } -void perf_arch_fetch_caller_regs(struct pt_regs *regs, unsigned long ip, int skip) +unsigned long perf_instruction_pointer(struct pt_regs *regs) { - regs->ip = ip; - /* - * perf_arch_fetch_caller_regs adds another call, we need to increment - * the skip level - */ - regs->bp = rewind_frame_pointer(skip + 1); - regs->cs = __KERNEL_CS; - local_save_flags(regs->flags); + unsigned long ip; + + if (perf_guest_cbs && perf_guest_cbs->is_in_guest()) + ip = perf_guest_cbs->get_guest_ip(); + else + ip = instruction_pointer(regs); + + return ip; +} + +unsigned long perf_misc_flags(struct pt_regs *regs) +{ + int misc = 0; + + if (perf_guest_cbs && perf_guest_cbs->is_in_guest()) { + if (perf_guest_cbs->is_user_mode()) + misc |= PERF_RECORD_MISC_GUEST_USER; + else + misc |= PERF_RECORD_MISC_GUEST_KERNEL; + } else { + if (user_mode(regs)) + misc |= PERF_RECORD_MISC_USER; + else + misc |= PERF_RECORD_MISC_KERNEL; + } + + if (regs->flags & PERF_EFLAGS_EXACT) + misc |= PERF_RECORD_MISC_EXACT_IP; + + return misc; } diff --git a/arch/x86/kernel/cpu/perf_event_amd.c b/arch/x86/kernel/cpu/perf_event_amd.c index db6f7d4056e1..c2897b7b4a3b 100644 --- a/arch/x86/kernel/cpu/perf_event_amd.c +++ b/arch/x86/kernel/cpu/perf_event_amd.c @@ -2,7 +2,7 @@ static DEFINE_RAW_SPINLOCK(amd_nb_lock); -static __initconst u64 amd_hw_cache_event_ids +static __initconst const u64 amd_hw_cache_event_ids [PERF_COUNT_HW_CACHE_MAX] [PERF_COUNT_HW_CACHE_OP_MAX] [PERF_COUNT_HW_CACHE_RESULT_MAX] = @@ -102,8 +102,8 @@ static const u64 amd_perfmon_event_map[] = [PERF_COUNT_HW_INSTRUCTIONS] = 0x00c0, [PERF_COUNT_HW_CACHE_REFERENCES] = 0x0080, [PERF_COUNT_HW_CACHE_MISSES] = 0x0081, - [PERF_COUNT_HW_BRANCH_INSTRUCTIONS] = 0x00c4, - [PERF_COUNT_HW_BRANCH_MISSES] = 0x00c5, + [PERF_COUNT_HW_BRANCH_INSTRUCTIONS] = 0x00c2, + [PERF_COUNT_HW_BRANCH_MISSES] = 0x00c3, }; static u64 amd_pmu_event_map(int hw_event) @@ -111,22 +111,19 @@ static u64 amd_pmu_event_map(int hw_event) return amd_perfmon_event_map[hw_event]; } -static u64 amd_pmu_raw_event(u64 hw_event) +static int amd_pmu_hw_config(struct perf_event *event) { -#define K7_EVNTSEL_EVENT_MASK 0xF000000FFULL -#define K7_EVNTSEL_UNIT_MASK 0x00000FF00ULL -#define K7_EVNTSEL_EDGE_MASK 0x000040000ULL -#define K7_EVNTSEL_INV_MASK 0x000800000ULL -#define K7_EVNTSEL_REG_MASK 0x0FF000000ULL - -#define K7_EVNTSEL_MASK \ - (K7_EVNTSEL_EVENT_MASK | \ - K7_EVNTSEL_UNIT_MASK | \ - K7_EVNTSEL_EDGE_MASK | \ - K7_EVNTSEL_INV_MASK | \ - K7_EVNTSEL_REG_MASK) - - return hw_event & K7_EVNTSEL_MASK; + int ret = x86_pmu_hw_config(event); + + if (ret) + return ret; + + if (event->attr.type != PERF_TYPE_RAW) + return 0; + + event->hw.config |= event->attr.config & AMD64_RAW_EVENT_MASK; + + return 0; } /* @@ -165,7 +162,7 @@ static void amd_put_event_constraints(struct cpu_hw_events *cpuc, * be removed on one CPU at a time AND PMU is disabled * when we come here */ - for (i = 0; i < x86_pmu.num_events; i++) { + for (i = 0; i < x86_pmu.num_counters; i++) { if (nb->owners[i] == event) { cmpxchg(nb->owners+i, event, NULL); break; @@ -215,7 +212,7 @@ amd_get_event_constraints(struct cpu_hw_events *cpuc, struct perf_event *event) struct hw_perf_event *hwc = &event->hw; struct amd_nb *nb = cpuc->amd_nb; struct perf_event *old = NULL; - int max = x86_pmu.num_events; + int max = x86_pmu.num_counters; int i, j, k = -1; /* @@ -293,7 +290,7 @@ static struct amd_nb *amd_alloc_nb(int cpu, int nb_id) /* * initialize all possible NB constraints */ - for (i = 0; i < x86_pmu.num_events; i++) { + for (i = 0; i < x86_pmu.num_counters; i++) { __set_bit(i, nb->event_constraints[i].idxmsk); nb->event_constraints[i].weight = 1; } @@ -371,21 +368,22 @@ static void amd_pmu_cpu_dead(int cpu) raw_spin_unlock(&amd_nb_lock); } -static __initconst struct x86_pmu amd_pmu = { +static __initconst const struct x86_pmu amd_pmu = { .name = "AMD", .handle_irq = x86_pmu_handle_irq, .disable_all = x86_pmu_disable_all, .enable_all = x86_pmu_enable_all, .enable = x86_pmu_enable_event, .disable = x86_pmu_disable_event, + .hw_config = amd_pmu_hw_config, + .schedule_events = x86_schedule_events, .eventsel = MSR_K7_EVNTSEL0, .perfctr = MSR_K7_PERFCTR0, .event_map = amd_pmu_event_map, - .raw_event = amd_pmu_raw_event, .max_events = ARRAY_SIZE(amd_perfmon_event_map), - .num_events = 4, - .event_bits = 48, - .event_mask = (1ULL << 48) - 1, + .num_counters = 4, + .cntval_bits = 48, + .cntval_mask = (1ULL << 48) - 1, .apic = 1, /* use highest bit to detect overflow */ .max_period = (1ULL << 47) - 1, diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c index 9c794ac87837..214ac860ebe0 100644 --- a/arch/x86/kernel/cpu/perf_event_intel.c +++ b/arch/x86/kernel/cpu/perf_event_intel.c @@ -72,6 +72,7 @@ static struct event_constraint intel_westmere_event_constraints[] = INTEL_EVENT_CONSTRAINT(0x51, 0x3), /* L1D */ INTEL_EVENT_CONSTRAINT(0x60, 0x1), /* OFFCORE_REQUESTS_OUTSTANDING */ INTEL_EVENT_CONSTRAINT(0x63, 0x3), /* CACHE_LOCK_CYCLES */ + INTEL_EVENT_CONSTRAINT(0xb3, 0x1), /* SNOOPQ_REQUEST_OUTSTANDING */ EVENT_CONSTRAINT_END }; @@ -88,7 +89,7 @@ static u64 intel_pmu_event_map(int hw_event) return intel_perfmon_event_map[hw_event]; } -static __initconst u64 westmere_hw_cache_event_ids +static __initconst const u64 westmere_hw_cache_event_ids [PERF_COUNT_HW_CACHE_MAX] [PERF_COUNT_HW_CACHE_OP_MAX] [PERF_COUNT_HW_CACHE_RESULT_MAX] = @@ -179,7 +180,7 @@ static __initconst u64 westmere_hw_cache_event_ids }, }; -static __initconst u64 nehalem_hw_cache_event_ids +static __initconst const u64 nehalem_hw_cache_event_ids [PERF_COUNT_HW_CACHE_MAX] [PERF_COUNT_HW_CACHE_OP_MAX] [PERF_COUNT_HW_CACHE_RESULT_MAX] = @@ -270,7 +271,7 @@ static __initconst u64 nehalem_hw_cache_event_ids }, }; -static __initconst u64 core2_hw_cache_event_ids +static __initconst const u64 core2_hw_cache_event_ids [PERF_COUNT_HW_CACHE_MAX] [PERF_COUNT_HW_CACHE_OP_MAX] [PERF_COUNT_HW_CACHE_RESULT_MAX] = @@ -361,7 +362,7 @@ static __initconst u64 core2_hw_cache_event_ids }, }; -static __initconst u64 atom_hw_cache_event_ids +static __initconst const u64 atom_hw_cache_event_ids [PERF_COUNT_HW_CACHE_MAX] [PERF_COUNT_HW_CACHE_OP_MAX] [PERF_COUNT_HW_CACHE_RESULT_MAX] = @@ -452,60 +453,6 @@ static __initconst u64 atom_hw_cache_event_ids }, }; -static u64 intel_pmu_raw_event(u64 hw_event) -{ -#define CORE_EVNTSEL_EVENT_MASK 0x000000FFULL -#define CORE_EVNTSEL_UNIT_MASK 0x0000FF00ULL -#define CORE_EVNTSEL_EDGE_MASK 0x00040000ULL -#define CORE_EVNTSEL_INV_MASK 0x00800000ULL -#define CORE_EVNTSEL_REG_MASK 0xFF000000ULL - -#define CORE_EVNTSEL_MASK \ - (INTEL_ARCH_EVTSEL_MASK | \ - INTEL_ARCH_UNIT_MASK | \ - INTEL_ARCH_EDGE_MASK | \ - INTEL_ARCH_INV_MASK | \ - INTEL_ARCH_CNT_MASK) - - return hw_event & CORE_EVNTSEL_MASK; -} - -static void intel_pmu_enable_bts(u64 config) -{ - unsigned long debugctlmsr; - - debugctlmsr = get_debugctlmsr(); - - debugctlmsr |= X86_DEBUGCTL_TR; - debugctlmsr |= X86_DEBUGCTL_BTS; - debugctlmsr |= X86_DEBUGCTL_BTINT; - - if (!(config & ARCH_PERFMON_EVENTSEL_OS)) - debugctlmsr |= X86_DEBUGCTL_BTS_OFF_OS; - - if (!(config & ARCH_PERFMON_EVENTSEL_USR)) - debugctlmsr |= X86_DEBUGCTL_BTS_OFF_USR; - - update_debugctlmsr(debugctlmsr); -} - -static void intel_pmu_disable_bts(void) -{ - struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); - unsigned long debugctlmsr; - - if (!cpuc->ds) - return; - - debugctlmsr = get_debugctlmsr(); - - debugctlmsr &= - ~(X86_DEBUGCTL_TR | X86_DEBUGCTL_BTS | X86_DEBUGCTL_BTINT | - X86_DEBUGCTL_BTS_OFF_OS | X86_DEBUGCTL_BTS_OFF_USR); - - update_debugctlmsr(debugctlmsr); -} - static void intel_pmu_disable_all(void) { struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); @@ -514,12 +461,17 @@ static void intel_pmu_disable_all(void) if (test_bit(X86_PMC_IDX_FIXED_BTS, cpuc->active_mask)) intel_pmu_disable_bts(); + + intel_pmu_pebs_disable_all(); + intel_pmu_lbr_disable_all(); } -static void intel_pmu_enable_all(void) +static void intel_pmu_enable_all(int added) { struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); + intel_pmu_pebs_enable_all(); + intel_pmu_lbr_enable_all(); wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, x86_pmu.intel_ctrl); if (test_bit(X86_PMC_IDX_FIXED_BTS, cpuc->active_mask)) { @@ -533,6 +485,42 @@ static void intel_pmu_enable_all(void) } } +/* + * Workaround for: + * Intel Errata AAK100 (model 26) + * Intel Errata AAP53 (model 30) + * Intel Errata BD53 (model 44) + * + * These chips need to be 'reset' when adding counters by programming + * the magic three (non counting) events 0x4300D2, 0x4300B1 and 0x4300B5 + * either in sequence on the same PMC or on different PMCs. + */ +static void intel_pmu_nhm_enable_all(int added) +{ + if (added) { + struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); + int i; + + wrmsrl(MSR_ARCH_PERFMON_EVENTSEL0 + 0, 0x4300D2); + wrmsrl(MSR_ARCH_PERFMON_EVENTSEL0 + 1, 0x4300B1); + wrmsrl(MSR_ARCH_PERFMON_EVENTSEL0 + 2, 0x4300B5); + + wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, 0x3); + wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, 0x0); + + for (i = 0; i < 3; i++) { + struct perf_event *event = cpuc->events[i]; + + if (!event) + continue; + + __x86_pmu_enable_event(&event->hw, + ARCH_PERFMON_EVENTSEL_ENABLE); + } + } + intel_pmu_enable_all(added); +} + static inline u64 intel_pmu_get_status(void) { u64 status; @@ -547,8 +535,7 @@ static inline void intel_pmu_ack_status(u64 ack) wrmsrl(MSR_CORE_PERF_GLOBAL_OVF_CTRL, ack); } -static inline void -intel_pmu_disable_fixed(struct hw_perf_event *hwc) +static void intel_pmu_disable_fixed(struct hw_perf_event *hwc) { int idx = hwc->idx - X86_PMC_IDX_FIXED; u64 ctrl_val, mask; @@ -557,71 +544,10 @@ intel_pmu_disable_fixed(struct hw_perf_event *hwc) rdmsrl(hwc->config_base, ctrl_val); ctrl_val &= ~mask; - (void)checking_wrmsrl(hwc->config_base, ctrl_val); -} - -static void intel_pmu_drain_bts_buffer(void) -{ - struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); - struct debug_store *ds = cpuc->ds; - struct bts_record { - u64 from; - u64 to; - u64 flags; - }; - struct perf_event *event = cpuc->events[X86_PMC_IDX_FIXED_BTS]; - struct bts_record *at, *top; - struct perf_output_handle handle; - struct perf_event_header header; - struct perf_sample_data data; - struct pt_regs regs; - - if (!event) - return; - - if (!ds) - return; - - at = (struct bts_record *)(unsigned long)ds->bts_buffer_base; - top = (struct bts_record *)(unsigned long)ds->bts_index; - - if (top <= at) - return; - - ds->bts_index = ds->bts_buffer_base; - - perf_sample_data_init(&data, 0); - - data.period = event->hw.last_period; - regs.ip = 0; - - /* - * Prepare a generic sample, i.e. fill in the invariant fields. - * We will overwrite the from and to address before we output - * the sample. - */ - perf_prepare_sample(&header, &data, event, ®s); - - if (perf_output_begin(&handle, event, - header.size * (top - at), 1, 1)) - return; - - for (; at < top; at++) { - data.ip = at->from; - data.addr = at->to; - - perf_output_sample(&handle, &header, &data, event); - } - - perf_output_end(&handle); - - /* There's new data available. */ - event->hw.interrupts++; - event->pending_kill = POLL_IN; + wrmsrl(hwc->config_base, ctrl_val); } -static inline void -intel_pmu_disable_event(struct perf_event *event) +static void intel_pmu_disable_event(struct perf_event *event) { struct hw_perf_event *hwc = &event->hw; @@ -637,14 +563,15 @@ intel_pmu_disable_event(struct perf_event *event) } x86_pmu_disable_event(event); + + if (unlikely(event->attr.precise_ip)) + intel_pmu_pebs_disable(event); } -static inline void -intel_pmu_enable_fixed(struct hw_perf_event *hwc) +static void intel_pmu_enable_fixed(struct hw_perf_event *hwc) { int idx = hwc->idx - X86_PMC_IDX_FIXED; u64 ctrl_val, bits, mask; - int err; /* * Enable IRQ generation (0x8), @@ -669,7 +596,7 @@ intel_pmu_enable_fixed(struct hw_perf_event *hwc) rdmsrl(hwc->config_base, ctrl_val); ctrl_val &= ~mask; ctrl_val |= bits; - err = checking_wrmsrl(hwc->config_base, ctrl_val); + wrmsrl(hwc->config_base, ctrl_val); } static void intel_pmu_enable_event(struct perf_event *event) @@ -689,7 +616,10 @@ static void intel_pmu_enable_event(struct perf_event *event) return; } - __x86_pmu_enable_event(hwc); + if (unlikely(event->attr.precise_ip)) + intel_pmu_pebs_enable(event); + + __x86_pmu_enable_event(hwc, ARCH_PERFMON_EVENTSEL_ENABLE); } /* @@ -708,20 +638,20 @@ static void intel_pmu_reset(void) unsigned long flags; int idx; - if (!x86_pmu.num_events) + if (!x86_pmu.num_counters) return; local_irq_save(flags); printk("clearing PMU state on CPU#%d\n", smp_processor_id()); - for (idx = 0; idx < x86_pmu.num_events; idx++) { + for (idx = 0; idx < x86_pmu.num_counters; idx++) { checking_wrmsrl(x86_pmu.eventsel + idx, 0ull); checking_wrmsrl(x86_pmu.perfctr + idx, 0ull); } - for (idx = 0; idx < x86_pmu.num_events_fixed; idx++) { + for (idx = 0; idx < x86_pmu.num_counters_fixed; idx++) checking_wrmsrl(MSR_ARCH_PERFMON_FIXED_CTR0 + idx, 0ull); - } + if (ds) ds->bts_index = ds->bts_buffer_base; @@ -747,7 +677,7 @@ static int intel_pmu_handle_irq(struct pt_regs *regs) intel_pmu_drain_bts_buffer(); status = intel_pmu_get_status(); if (!status) { - intel_pmu_enable_all(); + intel_pmu_enable_all(0); return 0; } @@ -762,6 +692,15 @@ again: inc_irq_stat(apic_perf_irqs); ack = status; + + intel_pmu_lbr_read(); + + /* + * PEBS overflow sets bit 62 in the global status register + */ + if (__test_and_clear_bit(62, (unsigned long *)&status)) + x86_pmu.drain_pebs(regs); + for_each_set_bit(bit, (unsigned long *)&status, X86_PMC_IDX_MAX) { struct perf_event *event = cpuc->events[bit]; @@ -787,26 +726,22 @@ again: goto again; done: - intel_pmu_enable_all(); + intel_pmu_enable_all(0); return 1; } -static struct event_constraint bts_constraint = - EVENT_CONSTRAINT(0, 1ULL << X86_PMC_IDX_FIXED_BTS, 0); - static struct event_constraint * -intel_special_constraints(struct perf_event *event) +intel_bts_constraints(struct perf_event *event) { - unsigned int hw_event; - - hw_event = event->hw.config & INTEL_ARCH_EVENT_MASK; + struct hw_perf_event *hwc = &event->hw; + unsigned int hw_event, bts_event; - if (unlikely((hw_event == - x86_pmu.event_map(PERF_COUNT_HW_BRANCH_INSTRUCTIONS)) && - (event->hw.sample_period == 1))) { + hw_event = hwc->config & INTEL_ARCH_EVENT_MASK; + bts_event = x86_pmu.event_map(PERF_COUNT_HW_BRANCH_INSTRUCTIONS); + if (unlikely(hw_event == bts_event && hwc->sample_period == 1)) return &bts_constraint; - } + return NULL; } @@ -815,24 +750,53 @@ intel_get_event_constraints(struct cpu_hw_events *cpuc, struct perf_event *event { struct event_constraint *c; - c = intel_special_constraints(event); + c = intel_bts_constraints(event); + if (c) + return c; + + c = intel_pebs_constraints(event); if (c) return c; return x86_get_event_constraints(cpuc, event); } -static __initconst struct x86_pmu core_pmu = { +static int intel_pmu_hw_config(struct perf_event *event) +{ + int ret = x86_pmu_hw_config(event); + + if (ret) + return ret; + + if (event->attr.type != PERF_TYPE_RAW) + return 0; + + if (!(event->attr.config & ARCH_PERFMON_EVENTSEL_ANY)) + return 0; + + if (x86_pmu.version < 3) + return -EINVAL; + + if (perf_paranoid_cpu() && !capable(CAP_SYS_ADMIN)) + return -EACCES; + + event->hw.config |= ARCH_PERFMON_EVENTSEL_ANY; + + return 0; +} + +static __initconst const struct x86_pmu core_pmu = { .name = "core", .handle_irq = x86_pmu_handle_irq, .disable_all = x86_pmu_disable_all, .enable_all = x86_pmu_enable_all, .enable = x86_pmu_enable_event, .disable = x86_pmu_disable_event, + .hw_config = x86_pmu_hw_config, + .schedule_events = x86_schedule_events, .eventsel = MSR_ARCH_PERFMON_EVENTSEL0, .perfctr = MSR_ARCH_PERFMON_PERFCTR0, .event_map = intel_pmu_event_map, - .raw_event = intel_pmu_raw_event, .max_events = ARRAY_SIZE(intel_perfmon_event_map), .apic = 1, /* @@ -845,17 +809,32 @@ static __initconst struct x86_pmu core_pmu = { .event_constraints = intel_core_event_constraints, }; -static __initconst struct x86_pmu intel_pmu = { +static void intel_pmu_cpu_starting(int cpu) +{ + init_debug_store_on_cpu(cpu); + /* + * Deal with CPUs that don't clear their LBRs on power-up. + */ + intel_pmu_lbr_reset(); +} + +static void intel_pmu_cpu_dying(int cpu) +{ + fini_debug_store_on_cpu(cpu); +} + +static __initconst const struct x86_pmu intel_pmu = { .name = "Intel", .handle_irq = intel_pmu_handle_irq, .disable_all = intel_pmu_disable_all, .enable_all = intel_pmu_enable_all, .enable = intel_pmu_enable_event, .disable = intel_pmu_disable_event, + .hw_config = intel_pmu_hw_config, + .schedule_events = x86_schedule_events, .eventsel = MSR_ARCH_PERFMON_EVENTSEL0, .perfctr = MSR_ARCH_PERFMON_PERFCTR0, .event_map = intel_pmu_event_map, - .raw_event = intel_pmu_raw_event, .max_events = ARRAY_SIZE(intel_perfmon_event_map), .apic = 1, /* @@ -864,14 +843,38 @@ static __initconst struct x86_pmu intel_pmu = { * the generic event period: */ .max_period = (1ULL << 31) - 1, - .enable_bts = intel_pmu_enable_bts, - .disable_bts = intel_pmu_disable_bts, .get_event_constraints = intel_get_event_constraints, - .cpu_starting = init_debug_store_on_cpu, - .cpu_dying = fini_debug_store_on_cpu, + .cpu_starting = intel_pmu_cpu_starting, + .cpu_dying = intel_pmu_cpu_dying, }; +static void intel_clovertown_quirks(void) +{ + /* + * PEBS is unreliable due to: + * + * AJ67 - PEBS may experience CPL leaks + * AJ68 - PEBS PMI may be delayed by one event + * AJ69 - GLOBAL_STATUS[62] will only be set when DEBUGCTL[12] + * AJ106 - FREEZE_LBRS_ON_PMI doesn't work in combination with PEBS + * + * AJ67 could be worked around by restricting the OS/USR flags. + * AJ69 could be worked around by setting PMU_FREEZE_ON_PMI. + * + * AJ106 could possibly be worked around by not allowing LBR + * usage from PEBS, including the fixup. + * AJ68 could possibly be worked around by always programming + * a pebs_event_reset[0] value and coping with the lost events. + * + * But taken together it might just make sense to not enable PEBS on + * these chips. + */ + printk(KERN_WARNING "PEBS disabled due to CPU errata.\n"); + x86_pmu.pebs = 0; + x86_pmu.pebs_constraints = NULL; +} + static __init int intel_pmu_init(void) { union cpuid10_edx edx; @@ -881,12 +884,13 @@ static __init int intel_pmu_init(void) int version; if (!cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON)) { - /* check for P6 processor family */ - if (boot_cpu_data.x86 == 6) { - return p6_pmu_init(); - } else { + switch (boot_cpu_data.x86) { + case 0x6: + return p6_pmu_init(); + case 0xf: + return p4_pmu_init(); + } return -ENODEV; - } } /* @@ -904,16 +908,28 @@ static __init int intel_pmu_init(void) x86_pmu = intel_pmu; x86_pmu.version = version; - x86_pmu.num_events = eax.split.num_events; - x86_pmu.event_bits = eax.split.bit_width; - x86_pmu.event_mask = (1ULL << eax.split.bit_width) - 1; + x86_pmu.num_counters = eax.split.num_counters; + x86_pmu.cntval_bits = eax.split.bit_width; + x86_pmu.cntval_mask = (1ULL << eax.split.bit_width) - 1; /* * Quirk: v2 perfmon does not report fixed-purpose events, so * assume at least 3 events: */ if (version > 1) - x86_pmu.num_events_fixed = max((int)edx.split.num_events_fixed, 3); + x86_pmu.num_counters_fixed = max((int)edx.split.num_counters_fixed, 3); + + /* + * v2 and above have a perf capabilities MSR + */ + if (version > 1) { + u64 capabilities; + + rdmsrl(MSR_IA32_PERF_CAPABILITIES, capabilities); + x86_pmu.intel_cap.capabilities = capabilities; + } + + intel_ds_init(); /* * Install the hw-cache-events table: @@ -924,12 +940,15 @@ static __init int intel_pmu_init(void) break; case 15: /* original 65 nm celeron/pentium/core2/xeon, "Merom"/"Conroe" */ + x86_pmu.quirks = intel_clovertown_quirks; case 22: /* single-core 65 nm celeron/core2solo "Merom-L"/"Conroe-L" */ case 23: /* current 45 nm celeron/core2/xeon "Penryn"/"Wolfdale" */ case 29: /* six-core 45 nm xeon "Dunnington" */ memcpy(hw_cache_event_ids, core2_hw_cache_event_ids, sizeof(hw_cache_event_ids)); + intel_pmu_lbr_init_core(); + x86_pmu.event_constraints = intel_core2_event_constraints; pr_cont("Core2 events, "); break; @@ -940,13 +959,19 @@ static __init int intel_pmu_init(void) memcpy(hw_cache_event_ids, nehalem_hw_cache_event_ids, sizeof(hw_cache_event_ids)); + intel_pmu_lbr_init_nhm(); + x86_pmu.event_constraints = intel_nehalem_event_constraints; - pr_cont("Nehalem/Corei7 events, "); + x86_pmu.enable_all = intel_pmu_nhm_enable_all; + pr_cont("Nehalem events, "); break; + case 28: /* Atom */ memcpy(hw_cache_event_ids, atom_hw_cache_event_ids, sizeof(hw_cache_event_ids)); + intel_pmu_lbr_init_atom(); + x86_pmu.event_constraints = intel_gen_event_constraints; pr_cont("Atom events, "); break; @@ -956,7 +981,10 @@ static __init int intel_pmu_init(void) memcpy(hw_cache_event_ids, westmere_hw_cache_event_ids, sizeof(hw_cache_event_ids)); + intel_pmu_lbr_init_nhm(); + x86_pmu.event_constraints = intel_westmere_event_constraints; + x86_pmu.enable_all = intel_pmu_nhm_enable_all; pr_cont("Westmere events, "); break; diff --git a/arch/x86/kernel/cpu/perf_event_intel_ds.c b/arch/x86/kernel/cpu/perf_event_intel_ds.c new file mode 100644 index 000000000000..18018d1311cd --- /dev/null +++ b/arch/x86/kernel/cpu/perf_event_intel_ds.c @@ -0,0 +1,641 @@ +#ifdef CONFIG_CPU_SUP_INTEL + +/* The maximal number of PEBS events: */ +#define MAX_PEBS_EVENTS 4 + +/* The size of a BTS record in bytes: */ +#define BTS_RECORD_SIZE 24 + +#define BTS_BUFFER_SIZE (PAGE_SIZE << 4) +#define PEBS_BUFFER_SIZE PAGE_SIZE + +/* + * pebs_record_32 for p4 and core not supported + +struct pebs_record_32 { + u32 flags, ip; + u32 ax, bc, cx, dx; + u32 si, di, bp, sp; +}; + + */ + +struct pebs_record_core { + u64 flags, ip; + u64 ax, bx, cx, dx; + u64 si, di, bp, sp; + u64 r8, r9, r10, r11; + u64 r12, r13, r14, r15; +}; + +struct pebs_record_nhm { + u64 flags, ip; + u64 ax, bx, cx, dx; + u64 si, di, bp, sp; + u64 r8, r9, r10, r11; + u64 r12, r13, r14, r15; + u64 status, dla, dse, lat; +}; + +/* + * A debug store configuration. + * + * We only support architectures that use 64bit fields. + */ +struct debug_store { + u64 bts_buffer_base; + u64 bts_index; + u64 bts_absolute_maximum; + u64 bts_interrupt_threshold; + u64 pebs_buffer_base; + u64 pebs_index; + u64 pebs_absolute_maximum; + u64 pebs_interrupt_threshold; + u64 pebs_event_reset[MAX_PEBS_EVENTS]; +}; + +static void init_debug_store_on_cpu(int cpu) +{ + struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds; + + if (!ds) + return; + + wrmsr_on_cpu(cpu, MSR_IA32_DS_AREA, + (u32)((u64)(unsigned long)ds), + (u32)((u64)(unsigned long)ds >> 32)); +} + +static void fini_debug_store_on_cpu(int cpu) +{ + if (!per_cpu(cpu_hw_events, cpu).ds) + return; + + wrmsr_on_cpu(cpu, MSR_IA32_DS_AREA, 0, 0); +} + +static void release_ds_buffers(void) +{ + int cpu; + + if (!x86_pmu.bts && !x86_pmu.pebs) + return; + + get_online_cpus(); + + for_each_online_cpu(cpu) + fini_debug_store_on_cpu(cpu); + + for_each_possible_cpu(cpu) { + struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds; + + if (!ds) + continue; + + per_cpu(cpu_hw_events, cpu).ds = NULL; + + kfree((void *)(unsigned long)ds->pebs_buffer_base); + kfree((void *)(unsigned long)ds->bts_buffer_base); + kfree(ds); + } + + put_online_cpus(); +} + +static int reserve_ds_buffers(void) +{ + int cpu, err = 0; + + if (!x86_pmu.bts && !x86_pmu.pebs) + return 0; + + get_online_cpus(); + + for_each_possible_cpu(cpu) { + struct debug_store *ds; + void *buffer; + int max, thresh; + + err = -ENOMEM; + ds = kzalloc(sizeof(*ds), GFP_KERNEL); + if (unlikely(!ds)) + break; + per_cpu(cpu_hw_events, cpu).ds = ds; + + if (x86_pmu.bts) { + buffer = kzalloc(BTS_BUFFER_SIZE, GFP_KERNEL); + if (unlikely(!buffer)) + break; + + max = BTS_BUFFER_SIZE / BTS_RECORD_SIZE; + thresh = max / 16; + + ds->bts_buffer_base = (u64)(unsigned long)buffer; + ds->bts_index = ds->bts_buffer_base; + ds->bts_absolute_maximum = ds->bts_buffer_base + + max * BTS_RECORD_SIZE; + ds->bts_interrupt_threshold = ds->bts_absolute_maximum - + thresh * BTS_RECORD_SIZE; + } + + if (x86_pmu.pebs) { + buffer = kzalloc(PEBS_BUFFER_SIZE, GFP_KERNEL); + if (unlikely(!buffer)) + break; + + max = PEBS_BUFFER_SIZE / x86_pmu.pebs_record_size; + + ds->pebs_buffer_base = (u64)(unsigned long)buffer; + ds->pebs_index = ds->pebs_buffer_base; + ds->pebs_absolute_maximum = ds->pebs_buffer_base + + max * x86_pmu.pebs_record_size; + /* + * Always use single record PEBS + */ + ds->pebs_interrupt_threshold = ds->pebs_buffer_base + + x86_pmu.pebs_record_size; + } + + err = 0; + } + + if (err) + release_ds_buffers(); + else { + for_each_online_cpu(cpu) + init_debug_store_on_cpu(cpu); + } + + put_online_cpus(); + + return err; +} + +/* + * BTS + */ + +static struct event_constraint bts_constraint = + EVENT_CONSTRAINT(0, 1ULL << X86_PMC_IDX_FIXED_BTS, 0); + +static void intel_pmu_enable_bts(u64 config) +{ + unsigned long debugctlmsr; + + debugctlmsr = get_debugctlmsr(); + + debugctlmsr |= DEBUGCTLMSR_TR; + debugctlmsr |= DEBUGCTLMSR_BTS; + debugctlmsr |= DEBUGCTLMSR_BTINT; + + if (!(config & ARCH_PERFMON_EVENTSEL_OS)) + debugctlmsr |= DEBUGCTLMSR_BTS_OFF_OS; + + if (!(config & ARCH_PERFMON_EVENTSEL_USR)) + debugctlmsr |= DEBUGCTLMSR_BTS_OFF_USR; + + update_debugctlmsr(debugctlmsr); +} + +static void intel_pmu_disable_bts(void) +{ + struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); + unsigned long debugctlmsr; + + if (!cpuc->ds) + return; + + debugctlmsr = get_debugctlmsr(); + + debugctlmsr &= + ~(DEBUGCTLMSR_TR | DEBUGCTLMSR_BTS | DEBUGCTLMSR_BTINT | + DEBUGCTLMSR_BTS_OFF_OS | DEBUGCTLMSR_BTS_OFF_USR); + + update_debugctlmsr(debugctlmsr); +} + +static void intel_pmu_drain_bts_buffer(void) +{ + struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); + struct debug_store *ds = cpuc->ds; + struct bts_record { + u64 from; + u64 to; + u64 flags; + }; + struct perf_event *event = cpuc->events[X86_PMC_IDX_FIXED_BTS]; + struct bts_record *at, *top; + struct perf_output_handle handle; + struct perf_event_header header; + struct perf_sample_data data; + struct pt_regs regs; + + if (!event) + return; + + if (!ds) + return; + + at = (struct bts_record *)(unsigned long)ds->bts_buffer_base; + top = (struct bts_record *)(unsigned long)ds->bts_index; + + if (top <= at) + return; + + ds->bts_index = ds->bts_buffer_base; + + perf_sample_data_init(&data, 0); + data.period = event->hw.last_period; + regs.ip = 0; + + /* + * Prepare a generic sample, i.e. fill in the invariant fields. + * We will overwrite the from and to address before we output + * the sample. + */ + perf_prepare_sample(&header, &data, event, ®s); + + if (perf_output_begin(&handle, event, header.size * (top - at), 1, 1)) + return; + + for (; at < top; at++) { + data.ip = at->from; + data.addr = at->to; + + perf_output_sample(&handle, &header, &data, event); + } + + perf_output_end(&handle); + + /* There's new data available. */ + event->hw.interrupts++; + event->pending_kill = POLL_IN; +} + +/* + * PEBS + */ + +static struct event_constraint intel_core_pebs_events[] = { + PEBS_EVENT_CONSTRAINT(0x00c0, 0x1), /* INSTR_RETIRED.ANY */ + PEBS_EVENT_CONSTRAINT(0xfec1, 0x1), /* X87_OPS_RETIRED.ANY */ + PEBS_EVENT_CONSTRAINT(0x00c5, 0x1), /* BR_INST_RETIRED.MISPRED */ + PEBS_EVENT_CONSTRAINT(0x1fc7, 0x1), /* SIMD_INST_RETURED.ANY */ + PEBS_EVENT_CONSTRAINT(0x01cb, 0x1), /* MEM_LOAD_RETIRED.L1D_MISS */ + PEBS_EVENT_CONSTRAINT(0x02cb, 0x1), /* MEM_LOAD_RETIRED.L1D_LINE_MISS */ + PEBS_EVENT_CONSTRAINT(0x04cb, 0x1), /* MEM_LOAD_RETIRED.L2_MISS */ + PEBS_EVENT_CONSTRAINT(0x08cb, 0x1), /* MEM_LOAD_RETIRED.L2_LINE_MISS */ + PEBS_EVENT_CONSTRAINT(0x10cb, 0x1), /* MEM_LOAD_RETIRED.DTLB_MISS */ + EVENT_CONSTRAINT_END +}; + +static struct event_constraint intel_nehalem_pebs_events[] = { + PEBS_EVENT_CONSTRAINT(0x00c0, 0xf), /* INSTR_RETIRED.ANY */ + PEBS_EVENT_CONSTRAINT(0xfec1, 0xf), /* X87_OPS_RETIRED.ANY */ + PEBS_EVENT_CONSTRAINT(0x00c5, 0xf), /* BR_INST_RETIRED.MISPRED */ + PEBS_EVENT_CONSTRAINT(0x1fc7, 0xf), /* SIMD_INST_RETURED.ANY */ + PEBS_EVENT_CONSTRAINT(0x01cb, 0xf), /* MEM_LOAD_RETIRED.L1D_MISS */ + PEBS_EVENT_CONSTRAINT(0x02cb, 0xf), /* MEM_LOAD_RETIRED.L1D_LINE_MISS */ + PEBS_EVENT_CONSTRAINT(0x04cb, 0xf), /* MEM_LOAD_RETIRED.L2_MISS */ + PEBS_EVENT_CONSTRAINT(0x08cb, 0xf), /* MEM_LOAD_RETIRED.L2_LINE_MISS */ + PEBS_EVENT_CONSTRAINT(0x10cb, 0xf), /* MEM_LOAD_RETIRED.DTLB_MISS */ + EVENT_CONSTRAINT_END +}; + +static struct event_constraint * +intel_pebs_constraints(struct perf_event *event) +{ + struct event_constraint *c; + + if (!event->attr.precise_ip) + return NULL; + + if (x86_pmu.pebs_constraints) { + for_each_event_constraint(c, x86_pmu.pebs_constraints) { + if ((event->hw.config & c->cmask) == c->code) + return c; + } + } + + return &emptyconstraint; +} + +static void intel_pmu_pebs_enable(struct perf_event *event) +{ + struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); + struct hw_perf_event *hwc = &event->hw; + + hwc->config &= ~ARCH_PERFMON_EVENTSEL_INT; + + cpuc->pebs_enabled |= 1ULL << hwc->idx; + WARN_ON_ONCE(cpuc->enabled); + + if (x86_pmu.intel_cap.pebs_trap && event->attr.precise_ip > 1) + intel_pmu_lbr_enable(event); +} + +static void intel_pmu_pebs_disable(struct perf_event *event) +{ + struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); + struct hw_perf_event *hwc = &event->hw; + + cpuc->pebs_enabled &= ~(1ULL << hwc->idx); + if (cpuc->enabled) + wrmsrl(MSR_IA32_PEBS_ENABLE, cpuc->pebs_enabled); + + hwc->config |= ARCH_PERFMON_EVENTSEL_INT; + + if (x86_pmu.intel_cap.pebs_trap && event->attr.precise_ip > 1) + intel_pmu_lbr_disable(event); +} + +static void intel_pmu_pebs_enable_all(void) +{ + struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); + + if (cpuc->pebs_enabled) + wrmsrl(MSR_IA32_PEBS_ENABLE, cpuc->pebs_enabled); +} + +static void intel_pmu_pebs_disable_all(void) +{ + struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); + + if (cpuc->pebs_enabled) + wrmsrl(MSR_IA32_PEBS_ENABLE, 0); +} + +#include <asm/insn.h> + +static inline bool kernel_ip(unsigned long ip) +{ +#ifdef CONFIG_X86_32 + return ip > PAGE_OFFSET; +#else + return (long)ip < 0; +#endif +} + +static int intel_pmu_pebs_fixup_ip(struct pt_regs *regs) +{ + struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); + unsigned long from = cpuc->lbr_entries[0].from; + unsigned long old_to, to = cpuc->lbr_entries[0].to; + unsigned long ip = regs->ip; + + /* + * We don't need to fixup if the PEBS assist is fault like + */ + if (!x86_pmu.intel_cap.pebs_trap) + return 1; + + /* + * No LBR entry, no basic block, no rewinding + */ + if (!cpuc->lbr_stack.nr || !from || !to) + return 0; + + /* + * Basic blocks should never cross user/kernel boundaries + */ + if (kernel_ip(ip) != kernel_ip(to)) + return 0; + + /* + * unsigned math, either ip is before the start (impossible) or + * the basic block is larger than 1 page (sanity) + */ + if ((ip - to) > PAGE_SIZE) + return 0; + + /* + * We sampled a branch insn, rewind using the LBR stack + */ + if (ip == to) { + regs->ip = from; + return 1; + } + + do { + struct insn insn; + u8 buf[MAX_INSN_SIZE]; + void *kaddr; + + old_to = to; + if (!kernel_ip(ip)) { + int bytes, size = MAX_INSN_SIZE; + + bytes = copy_from_user_nmi(buf, (void __user *)to, size); + if (bytes != size) + return 0; + + kaddr = buf; + } else + kaddr = (void *)to; + + kernel_insn_init(&insn, kaddr); + insn_get_length(&insn); + to += insn.length; + } while (to < ip); + + if (to == ip) { + regs->ip = old_to; + return 1; + } + + /* + * Even though we decoded the basic block, the instruction stream + * never matched the given IP, either the TO or the IP got corrupted. + */ + return 0; +} + +static int intel_pmu_save_and_restart(struct perf_event *event); + +static void __intel_pmu_pebs_event(struct perf_event *event, + struct pt_regs *iregs, void *__pebs) +{ + /* + * We cast to pebs_record_core since that is a subset of + * both formats and we don't use the other fields in this + * routine. + */ + struct pebs_record_core *pebs = __pebs; + struct perf_sample_data data; + struct pt_regs regs; + + if (!intel_pmu_save_and_restart(event)) + return; + + perf_sample_data_init(&data, 0); + data.period = event->hw.last_period; + + /* + * We use the interrupt regs as a base because the PEBS record + * does not contain a full regs set, specifically it seems to + * lack segment descriptors, which get used by things like + * user_mode(). + * + * In the simple case fix up only the IP and BP,SP regs, for + * PERF_SAMPLE_IP and PERF_SAMPLE_CALLCHAIN to function properly. + * A possible PERF_SAMPLE_REGS will have to transfer all regs. + */ + regs = *iregs; + regs.ip = pebs->ip; + regs.bp = pebs->bp; + regs.sp = pebs->sp; + + if (event->attr.precise_ip > 1 && intel_pmu_pebs_fixup_ip(®s)) + regs.flags |= PERF_EFLAGS_EXACT; + else + regs.flags &= ~PERF_EFLAGS_EXACT; + + if (perf_event_overflow(event, 1, &data, ®s)) + x86_pmu_stop(event); +} + +static void intel_pmu_drain_pebs_core(struct pt_regs *iregs) +{ + struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); + struct debug_store *ds = cpuc->ds; + struct perf_event *event = cpuc->events[0]; /* PMC0 only */ + struct pebs_record_core *at, *top; + int n; + + if (!ds || !x86_pmu.pebs) + return; + + at = (struct pebs_record_core *)(unsigned long)ds->pebs_buffer_base; + top = (struct pebs_record_core *)(unsigned long)ds->pebs_index; + + /* + * Whatever else happens, drain the thing + */ + ds->pebs_index = ds->pebs_buffer_base; + + if (!test_bit(0, cpuc->active_mask)) + return; + + WARN_ON_ONCE(!event); + + if (!event->attr.precise_ip) + return; + + n = top - at; + if (n <= 0) + return; + + /* + * Should not happen, we program the threshold at 1 and do not + * set a reset value. + */ + WARN_ON_ONCE(n > 1); + at += n - 1; + + __intel_pmu_pebs_event(event, iregs, at); +} + +static void intel_pmu_drain_pebs_nhm(struct pt_regs *iregs) +{ + struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); + struct debug_store *ds = cpuc->ds; + struct pebs_record_nhm *at, *top; + struct perf_event *event = NULL; + u64 status = 0; + int bit, n; + + if (!ds || !x86_pmu.pebs) + return; + + at = (struct pebs_record_nhm *)(unsigned long)ds->pebs_buffer_base; + top = (struct pebs_record_nhm *)(unsigned long)ds->pebs_index; + + ds->pebs_index = ds->pebs_buffer_base; + + n = top - at; + if (n <= 0) + return; + + /* + * Should not happen, we program the threshold at 1 and do not + * set a reset value. + */ + WARN_ON_ONCE(n > MAX_PEBS_EVENTS); + + for ( ; at < top; at++) { + for_each_set_bit(bit, (unsigned long *)&at->status, MAX_PEBS_EVENTS) { + event = cpuc->events[bit]; + if (!test_bit(bit, cpuc->active_mask)) + continue; + + WARN_ON_ONCE(!event); + + if (!event->attr.precise_ip) + continue; + + if (__test_and_set_bit(bit, (unsigned long *)&status)) + continue; + + break; + } + + if (!event || bit >= MAX_PEBS_EVENTS) + continue; + + __intel_pmu_pebs_event(event, iregs, at); + } +} + +/* + * BTS, PEBS probe and setup + */ + +static void intel_ds_init(void) +{ + /* + * No support for 32bit formats + */ + if (!boot_cpu_has(X86_FEATURE_DTES64)) + return; + + x86_pmu.bts = boot_cpu_has(X86_FEATURE_BTS); + x86_pmu.pebs = boot_cpu_has(X86_FEATURE_PEBS); + if (x86_pmu.pebs) { + char pebs_type = x86_pmu.intel_cap.pebs_trap ? '+' : '-'; + int format = x86_pmu.intel_cap.pebs_format; + + switch (format) { + case 0: + printk(KERN_CONT "PEBS fmt0%c, ", pebs_type); + x86_pmu.pebs_record_size = sizeof(struct pebs_record_core); + x86_pmu.drain_pebs = intel_pmu_drain_pebs_core; + x86_pmu.pebs_constraints = intel_core_pebs_events; + break; + + case 1: + printk(KERN_CONT "PEBS fmt1%c, ", pebs_type); + x86_pmu.pebs_record_size = sizeof(struct pebs_record_nhm); + x86_pmu.drain_pebs = intel_pmu_drain_pebs_nhm; + x86_pmu.pebs_constraints = intel_nehalem_pebs_events; + break; + + default: + printk(KERN_CONT "no PEBS fmt%d%c, ", format, pebs_type); + x86_pmu.pebs = 0; + break; + } + } +} + +#else /* CONFIG_CPU_SUP_INTEL */ + +static int reserve_ds_buffers(void) +{ + return 0; +} + +static void release_ds_buffers(void) +{ +} + +#endif /* CONFIG_CPU_SUP_INTEL */ diff --git a/arch/x86/kernel/cpu/perf_event_intel_lbr.c b/arch/x86/kernel/cpu/perf_event_intel_lbr.c new file mode 100644 index 000000000000..d202c1bece1a --- /dev/null +++ b/arch/x86/kernel/cpu/perf_event_intel_lbr.c @@ -0,0 +1,218 @@ +#ifdef CONFIG_CPU_SUP_INTEL + +enum { + LBR_FORMAT_32 = 0x00, + LBR_FORMAT_LIP = 0x01, + LBR_FORMAT_EIP = 0x02, + LBR_FORMAT_EIP_FLAGS = 0x03, +}; + +/* + * We only support LBR implementations that have FREEZE_LBRS_ON_PMI + * otherwise it becomes near impossible to get a reliable stack. + */ + +static void __intel_pmu_lbr_enable(void) +{ + u64 debugctl; + + rdmsrl(MSR_IA32_DEBUGCTLMSR, debugctl); + debugctl |= (DEBUGCTLMSR_LBR | DEBUGCTLMSR_FREEZE_LBRS_ON_PMI); + wrmsrl(MSR_IA32_DEBUGCTLMSR, debugctl); +} + +static void __intel_pmu_lbr_disable(void) +{ + u64 debugctl; + + rdmsrl(MSR_IA32_DEBUGCTLMSR, debugctl); + debugctl &= ~(DEBUGCTLMSR_LBR | DEBUGCTLMSR_FREEZE_LBRS_ON_PMI); + wrmsrl(MSR_IA32_DEBUGCTLMSR, debugctl); +} + +static void intel_pmu_lbr_reset_32(void) +{ + int i; + + for (i = 0; i < x86_pmu.lbr_nr; i++) + wrmsrl(x86_pmu.lbr_from + i, 0); +} + +static void intel_pmu_lbr_reset_64(void) +{ + int i; + + for (i = 0; i < x86_pmu.lbr_nr; i++) { + wrmsrl(x86_pmu.lbr_from + i, 0); + wrmsrl(x86_pmu.lbr_to + i, 0); + } +} + +static void intel_pmu_lbr_reset(void) +{ + if (!x86_pmu.lbr_nr) + return; + + if (x86_pmu.intel_cap.lbr_format == LBR_FORMAT_32) + intel_pmu_lbr_reset_32(); + else + intel_pmu_lbr_reset_64(); +} + +static void intel_pmu_lbr_enable(struct perf_event *event) +{ + struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); + + if (!x86_pmu.lbr_nr) + return; + + WARN_ON_ONCE(cpuc->enabled); + + /* + * Reset the LBR stack if we changed task context to + * avoid data leaks. + */ + + if (event->ctx->task && cpuc->lbr_context != event->ctx) { + intel_pmu_lbr_reset(); + cpuc->lbr_context = event->ctx; + } + + cpuc->lbr_users++; +} + +static void intel_pmu_lbr_disable(struct perf_event *event) +{ + struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); + + if (!x86_pmu.lbr_nr) + return; + + cpuc->lbr_users--; + WARN_ON_ONCE(cpuc->lbr_users < 0); + + if (cpuc->enabled && !cpuc->lbr_users) + __intel_pmu_lbr_disable(); +} + +static void intel_pmu_lbr_enable_all(void) +{ + struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); + + if (cpuc->lbr_users) + __intel_pmu_lbr_enable(); +} + +static void intel_pmu_lbr_disable_all(void) +{ + struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); + + if (cpuc->lbr_users) + __intel_pmu_lbr_disable(); +} + +static inline u64 intel_pmu_lbr_tos(void) +{ + u64 tos; + + rdmsrl(x86_pmu.lbr_tos, tos); + + return tos; +} + +static void intel_pmu_lbr_read_32(struct cpu_hw_events *cpuc) +{ + unsigned long mask = x86_pmu.lbr_nr - 1; + u64 tos = intel_pmu_lbr_tos(); + int i; + + for (i = 0; i < x86_pmu.lbr_nr; i++) { + unsigned long lbr_idx = (tos - i) & mask; + union { + struct { + u32 from; + u32 to; + }; + u64 lbr; + } msr_lastbranch; + + rdmsrl(x86_pmu.lbr_from + lbr_idx, msr_lastbranch.lbr); + + cpuc->lbr_entries[i].from = msr_lastbranch.from; + cpuc->lbr_entries[i].to = msr_lastbranch.to; + cpuc->lbr_entries[i].flags = 0; + } + cpuc->lbr_stack.nr = i; +} + +#define LBR_FROM_FLAG_MISPRED (1ULL << 63) + +/* + * Due to lack of segmentation in Linux the effective address (offset) + * is the same as the linear address, allowing us to merge the LIP and EIP + * LBR formats. + */ +static void intel_pmu_lbr_read_64(struct cpu_hw_events *cpuc) +{ + unsigned long mask = x86_pmu.lbr_nr - 1; + int lbr_format = x86_pmu.intel_cap.lbr_format; + u64 tos = intel_pmu_lbr_tos(); + int i; + + for (i = 0; i < x86_pmu.lbr_nr; i++) { + unsigned long lbr_idx = (tos - i) & mask; + u64 from, to, flags = 0; + + rdmsrl(x86_pmu.lbr_from + lbr_idx, from); + rdmsrl(x86_pmu.lbr_to + lbr_idx, to); + + if (lbr_format == LBR_FORMAT_EIP_FLAGS) { + flags = !!(from & LBR_FROM_FLAG_MISPRED); + from = (u64)((((s64)from) << 1) >> 1); + } + + cpuc->lbr_entries[i].from = from; + cpuc->lbr_entries[i].to = to; + cpuc->lbr_entries[i].flags = flags; + } + cpuc->lbr_stack.nr = i; +} + +static void intel_pmu_lbr_read(void) +{ + struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); + + if (!cpuc->lbr_users) + return; + + if (x86_pmu.intel_cap.lbr_format == LBR_FORMAT_32) + intel_pmu_lbr_read_32(cpuc); + else + intel_pmu_lbr_read_64(cpuc); +} + +static void intel_pmu_lbr_init_core(void) +{ + x86_pmu.lbr_nr = 4; + x86_pmu.lbr_tos = 0x01c9; + x86_pmu.lbr_from = 0x40; + x86_pmu.lbr_to = 0x60; +} + +static void intel_pmu_lbr_init_nhm(void) +{ + x86_pmu.lbr_nr = 16; + x86_pmu.lbr_tos = 0x01c9; + x86_pmu.lbr_from = 0x680; + x86_pmu.lbr_to = 0x6c0; +} + +static void intel_pmu_lbr_init_atom(void) +{ + x86_pmu.lbr_nr = 8; + x86_pmu.lbr_tos = 0x01c9; + x86_pmu.lbr_from = 0x40; + x86_pmu.lbr_to = 0x60; +} + +#endif /* CONFIG_CPU_SUP_INTEL */ diff --git a/arch/x86/kernel/cpu/perf_event_p4.c b/arch/x86/kernel/cpu/perf_event_p4.c new file mode 100644 index 000000000000..107711bf0ee8 --- /dev/null +++ b/arch/x86/kernel/cpu/perf_event_p4.c @@ -0,0 +1,942 @@ +/* + * Netburst Perfomance Events (P4, old Xeon) + * + * Copyright (C) 2010 Parallels, Inc., Cyrill Gorcunov <gorcunov@openvz.org> + * Copyright (C) 2010 Intel Corporation, Lin Ming <ming.m.lin@intel.com> + * + * For licencing details see kernel-base/COPYING + */ + +#ifdef CONFIG_CPU_SUP_INTEL + +#include <asm/perf_event_p4.h> + +#define P4_CNTR_LIMIT 3 +/* + * array indices: 0,1 - HT threads, used with HT enabled cpu + */ +struct p4_event_bind { + unsigned int opcode; /* Event code and ESCR selector */ + unsigned int escr_msr[2]; /* ESCR MSR for this event */ + char cntr[2][P4_CNTR_LIMIT]; /* counter index (offset), -1 on abscence */ +}; + +struct p4_pebs_bind { + unsigned int metric_pebs; + unsigned int metric_vert; +}; + +/* it sets P4_PEBS_ENABLE_UOP_TAG as well */ +#define P4_GEN_PEBS_BIND(name, pebs, vert) \ + [P4_PEBS_METRIC__##name] = { \ + .metric_pebs = pebs | P4_PEBS_ENABLE_UOP_TAG, \ + .metric_vert = vert, \ + } + +/* + * note we have P4_PEBS_ENABLE_UOP_TAG always set here + * + * it's needed for mapping P4_PEBS_CONFIG_METRIC_MASK bits of + * event configuration to find out which values are to be + * written into MSR_IA32_PEBS_ENABLE and MSR_P4_PEBS_MATRIX_VERT + * resgisters + */ +static struct p4_pebs_bind p4_pebs_bind_map[] = { + P4_GEN_PEBS_BIND(1stl_cache_load_miss_retired, 0x0000001, 0x0000001), + P4_GEN_PEBS_BIND(2ndl_cache_load_miss_retired, 0x0000002, 0x0000001), + P4_GEN_PEBS_BIND(dtlb_load_miss_retired, 0x0000004, 0x0000001), + P4_GEN_PEBS_BIND(dtlb_store_miss_retired, 0x0000004, 0x0000002), + P4_GEN_PEBS_BIND(dtlb_all_miss_retired, 0x0000004, 0x0000003), + P4_GEN_PEBS_BIND(tagged_mispred_branch, 0x0018000, 0x0000010), + P4_GEN_PEBS_BIND(mob_load_replay_retired, 0x0000200, 0x0000001), + P4_GEN_PEBS_BIND(split_load_retired, 0x0000400, 0x0000001), + P4_GEN_PEBS_BIND(split_store_retired, 0x0000400, 0x0000002), +}; + +/* + * Note that we don't use CCCR1 here, there is an + * exception for P4_BSQ_ALLOCATION but we just have + * no workaround + * + * consider this binding as resources which particular + * event may borrow, it doesn't contain EventMask, + * Tags and friends -- they are left to a caller + */ +static struct p4_event_bind p4_event_bind_map[] = { + [P4_EVENT_TC_DELIVER_MODE] = { + .opcode = P4_OPCODE(P4_EVENT_TC_DELIVER_MODE), + .escr_msr = { MSR_P4_TC_ESCR0, MSR_P4_TC_ESCR1 }, + .cntr = { {4, 5, -1}, {6, 7, -1} }, + }, + [P4_EVENT_BPU_FETCH_REQUEST] = { + .opcode = P4_OPCODE(P4_EVENT_BPU_FETCH_REQUEST), + .escr_msr = { MSR_P4_BPU_ESCR0, MSR_P4_BPU_ESCR1 }, + .cntr = { {0, -1, -1}, {2, -1, -1} }, + }, + [P4_EVENT_ITLB_REFERENCE] = { + .opcode = P4_OPCODE(P4_EVENT_ITLB_REFERENCE), + .escr_msr = { MSR_P4_ITLB_ESCR0, MSR_P4_ITLB_ESCR1 }, + .cntr = { {0, -1, -1}, {2, -1, -1} }, + }, + [P4_EVENT_MEMORY_CANCEL] = { + .opcode = P4_OPCODE(P4_EVENT_MEMORY_CANCEL), + .escr_msr = { MSR_P4_DAC_ESCR0, MSR_P4_DAC_ESCR1 }, + .cntr = { {8, 9, -1}, {10, 11, -1} }, + }, + [P4_EVENT_MEMORY_COMPLETE] = { + .opcode = P4_OPCODE(P4_EVENT_MEMORY_COMPLETE), + .escr_msr = { MSR_P4_SAAT_ESCR0 , MSR_P4_SAAT_ESCR1 }, + .cntr = { {8, 9, -1}, {10, 11, -1} }, + }, + [P4_EVENT_LOAD_PORT_REPLAY] = { + .opcode = P4_OPCODE(P4_EVENT_LOAD_PORT_REPLAY), + .escr_msr = { MSR_P4_SAAT_ESCR0, MSR_P4_SAAT_ESCR1 }, + .cntr = { {8, 9, -1}, {10, 11, -1} }, + }, + [P4_EVENT_STORE_PORT_REPLAY] = { + .opcode = P4_OPCODE(P4_EVENT_STORE_PORT_REPLAY), + .escr_msr = { MSR_P4_SAAT_ESCR0 , MSR_P4_SAAT_ESCR1 }, + .cntr = { {8, 9, -1}, {10, 11, -1} }, + }, + [P4_EVENT_MOB_LOAD_REPLAY] = { + .opcode = P4_OPCODE(P4_EVENT_MOB_LOAD_REPLAY), + .escr_msr = { MSR_P4_MOB_ESCR0, MSR_P4_MOB_ESCR1 }, + .cntr = { {0, -1, -1}, {2, -1, -1} }, + }, + [P4_EVENT_PAGE_WALK_TYPE] = { + .opcode = P4_OPCODE(P4_EVENT_PAGE_WALK_TYPE), + .escr_msr = { MSR_P4_PMH_ESCR0, MSR_P4_PMH_ESCR1 }, + .cntr = { {0, -1, -1}, {2, -1, -1} }, + }, + [P4_EVENT_BSQ_CACHE_REFERENCE] = { + .opcode = P4_OPCODE(P4_EVENT_BSQ_CACHE_REFERENCE), + .escr_msr = { MSR_P4_BSU_ESCR0, MSR_P4_BSU_ESCR1 }, + .cntr = { {0, -1, -1}, {2, -1, -1} }, + }, + [P4_EVENT_IOQ_ALLOCATION] = { + .opcode = P4_OPCODE(P4_EVENT_IOQ_ALLOCATION), + .escr_msr = { MSR_P4_FSB_ESCR0, MSR_P4_FSB_ESCR1 }, + .cntr = { {0, -1, -1}, {2, -1, -1} }, + }, + [P4_EVENT_IOQ_ACTIVE_ENTRIES] = { /* shared ESCR */ + .opcode = P4_OPCODE(P4_EVENT_IOQ_ACTIVE_ENTRIES), + .escr_msr = { MSR_P4_FSB_ESCR1, MSR_P4_FSB_ESCR1 }, + .cntr = { {2, -1, -1}, {3, -1, -1} }, + }, + [P4_EVENT_FSB_DATA_ACTIVITY] = { + .opcode = P4_OPCODE(P4_EVENT_FSB_DATA_ACTIVITY), + .escr_msr = { MSR_P4_FSB_ESCR0, MSR_P4_FSB_ESCR1 }, + .cntr = { {0, -1, -1}, {2, -1, -1} }, + }, + [P4_EVENT_BSQ_ALLOCATION] = { /* shared ESCR, broken CCCR1 */ + .opcode = P4_OPCODE(P4_EVENT_BSQ_ALLOCATION), + .escr_msr = { MSR_P4_BSU_ESCR0, MSR_P4_BSU_ESCR0 }, + .cntr = { {0, -1, -1}, {1, -1, -1} }, + }, + [P4_EVENT_BSQ_ACTIVE_ENTRIES] = { /* shared ESCR */ + .opcode = P4_OPCODE(P4_EVENT_BSQ_ACTIVE_ENTRIES), + .escr_msr = { MSR_P4_BSU_ESCR1 , MSR_P4_BSU_ESCR1 }, + .cntr = { {2, -1, -1}, {3, -1, -1} }, + }, + [P4_EVENT_SSE_INPUT_ASSIST] = { + .opcode = P4_OPCODE(P4_EVENT_SSE_INPUT_ASSIST), + .escr_msr = { MSR_P4_FIRM_ESCR0, MSR_P4_FIRM_ESCR1 }, + .cntr = { {8, 9, -1}, {10, 11, -1} }, + }, + [P4_EVENT_PACKED_SP_UOP] = { + .opcode = P4_OPCODE(P4_EVENT_PACKED_SP_UOP), + .escr_msr = { MSR_P4_FIRM_ESCR0, MSR_P4_FIRM_ESCR1 }, + .cntr = { {8, 9, -1}, {10, 11, -1} }, + }, + [P4_EVENT_PACKED_DP_UOP] = { + .opcode = P4_OPCODE(P4_EVENT_PACKED_DP_UOP), + .escr_msr = { MSR_P4_FIRM_ESCR0, MSR_P4_FIRM_ESCR1 }, + .cntr = { {8, 9, -1}, {10, 11, -1} }, + }, + [P4_EVENT_SCALAR_SP_UOP] = { + .opcode = P4_OPCODE(P4_EVENT_SCALAR_SP_UOP), + .escr_msr = { MSR_P4_FIRM_ESCR0, MSR_P4_FIRM_ESCR1 }, + .cntr = { {8, 9, -1}, {10, 11, -1} }, + }, + [P4_EVENT_SCALAR_DP_UOP] = { + .opcode = P4_OPCODE(P4_EVENT_SCALAR_DP_UOP), + .escr_msr = { MSR_P4_FIRM_ESCR0, MSR_P4_FIRM_ESCR1 }, + .cntr = { {8, 9, -1}, {10, 11, -1} }, + }, + [P4_EVENT_64BIT_MMX_UOP] = { + .opcode = P4_OPCODE(P4_EVENT_64BIT_MMX_UOP), + .escr_msr = { MSR_P4_FIRM_ESCR0, MSR_P4_FIRM_ESCR1 }, + .cntr = { {8, 9, -1}, {10, 11, -1} }, + }, + [P4_EVENT_128BIT_MMX_UOP] = { + .opcode = P4_OPCODE(P4_EVENT_128BIT_MMX_UOP), + .escr_msr = { MSR_P4_FIRM_ESCR0, MSR_P4_FIRM_ESCR1 }, + .cntr = { {8, 9, -1}, {10, 11, -1} }, + }, + [P4_EVENT_X87_FP_UOP] = { + .opcode = P4_OPCODE(P4_EVENT_X87_FP_UOP), + .escr_msr = { MSR_P4_FIRM_ESCR0, MSR_P4_FIRM_ESCR1 }, + .cntr = { {8, 9, -1}, {10, 11, -1} }, + }, + [P4_EVENT_TC_MISC] = { + .opcode = P4_OPCODE(P4_EVENT_TC_MISC), + .escr_msr = { MSR_P4_TC_ESCR0, MSR_P4_TC_ESCR1 }, + .cntr = { {4, 5, -1}, {6, 7, -1} }, + }, + [P4_EVENT_GLOBAL_POWER_EVENTS] = { + .opcode = P4_OPCODE(P4_EVENT_GLOBAL_POWER_EVENTS), + .escr_msr = { MSR_P4_FSB_ESCR0, MSR_P4_FSB_ESCR1 }, + .cntr = { {0, -1, -1}, {2, -1, -1} }, + }, + [P4_EVENT_TC_MS_XFER] = { + .opcode = P4_OPCODE(P4_EVENT_TC_MS_XFER), + .escr_msr = { MSR_P4_MS_ESCR0, MSR_P4_MS_ESCR1 }, + .cntr = { {4, 5, -1}, {6, 7, -1} }, + }, + [P4_EVENT_UOP_QUEUE_WRITES] = { + .opcode = P4_OPCODE(P4_EVENT_UOP_QUEUE_WRITES), + .escr_msr = { MSR_P4_MS_ESCR0, MSR_P4_MS_ESCR1 }, + .cntr = { {4, 5, -1}, {6, 7, -1} }, + }, + [P4_EVENT_RETIRED_MISPRED_BRANCH_TYPE] = { + .opcode = P4_OPCODE(P4_EVENT_RETIRED_MISPRED_BRANCH_TYPE), + .escr_msr = { MSR_P4_TBPU_ESCR0 , MSR_P4_TBPU_ESCR0 }, + .cntr = { {4, 5, -1}, {6, 7, -1} }, + }, + [P4_EVENT_RETIRED_BRANCH_TYPE] = { + .opcode = P4_OPCODE(P4_EVENT_RETIRED_BRANCH_TYPE), + .escr_msr = { MSR_P4_TBPU_ESCR0 , MSR_P4_TBPU_ESCR1 }, + .cntr = { {4, 5, -1}, {6, 7, -1} }, + }, + [P4_EVENT_RESOURCE_STALL] = { + .opcode = P4_OPCODE(P4_EVENT_RESOURCE_STALL), + .escr_msr = { MSR_P4_ALF_ESCR0, MSR_P4_ALF_ESCR1 }, + .cntr = { {12, 13, 16}, {14, 15, 17} }, + }, + [P4_EVENT_WC_BUFFER] = { + .opcode = P4_OPCODE(P4_EVENT_WC_BUFFER), + .escr_msr = { MSR_P4_DAC_ESCR0, MSR_P4_DAC_ESCR1 }, + .cntr = { {8, 9, -1}, {10, 11, -1} }, + }, + [P4_EVENT_B2B_CYCLES] = { + .opcode = P4_OPCODE(P4_EVENT_B2B_CYCLES), + .escr_msr = { MSR_P4_FSB_ESCR0, MSR_P4_FSB_ESCR1 }, + .cntr = { {0, -1, -1}, {2, -1, -1} }, + }, + [P4_EVENT_BNR] = { + .opcode = P4_OPCODE(P4_EVENT_BNR), + .escr_msr = { MSR_P4_FSB_ESCR0, MSR_P4_FSB_ESCR1 }, + .cntr = { {0, -1, -1}, {2, -1, -1} }, + }, + [P4_EVENT_SNOOP] = { + .opcode = P4_OPCODE(P4_EVENT_SNOOP), + .escr_msr = { MSR_P4_FSB_ESCR0, MSR_P4_FSB_ESCR1 }, + .cntr = { {0, -1, -1}, {2, -1, -1} }, + }, + [P4_EVENT_RESPONSE] = { + .opcode = P4_OPCODE(P4_EVENT_RESPONSE), + .escr_msr = { MSR_P4_FSB_ESCR0, MSR_P4_FSB_ESCR1 }, + .cntr = { {0, -1, -1}, {2, -1, -1} }, + }, + [P4_EVENT_FRONT_END_EVENT] = { + .opcode = P4_OPCODE(P4_EVENT_FRONT_END_EVENT), + .escr_msr = { MSR_P4_CRU_ESCR2, MSR_P4_CRU_ESCR3 }, + .cntr = { {12, 13, 16}, {14, 15, 17} }, + }, + [P4_EVENT_EXECUTION_EVENT] = { + .opcode = P4_OPCODE(P4_EVENT_EXECUTION_EVENT), + .escr_msr = { MSR_P4_CRU_ESCR2, MSR_P4_CRU_ESCR3 }, + .cntr = { {12, 13, 16}, {14, 15, 17} }, + }, + [P4_EVENT_REPLAY_EVENT] = { + .opcode = P4_OPCODE(P4_EVENT_REPLAY_EVENT), + .escr_msr = { MSR_P4_CRU_ESCR2, MSR_P4_CRU_ESCR3 }, + .cntr = { {12, 13, 16}, {14, 15, 17} }, + }, + [P4_EVENT_INSTR_RETIRED] = { + .opcode = P4_OPCODE(P4_EVENT_INSTR_RETIRED), + .escr_msr = { MSR_P4_CRU_ESCR0, MSR_P4_CRU_ESCR1 }, + .cntr = { {12, 13, 16}, {14, 15, 17} }, + }, + [P4_EVENT_UOPS_RETIRED] = { + .opcode = P4_OPCODE(P4_EVENT_UOPS_RETIRED), + .escr_msr = { MSR_P4_CRU_ESCR0, MSR_P4_CRU_ESCR1 }, + .cntr = { {12, 13, 16}, {14, 15, 17} }, + }, + [P4_EVENT_UOP_TYPE] = { + .opcode = P4_OPCODE(P4_EVENT_UOP_TYPE), + .escr_msr = { MSR_P4_RAT_ESCR0, MSR_P4_RAT_ESCR1 }, + .cntr = { {12, 13, 16}, {14, 15, 17} }, + }, + [P4_EVENT_BRANCH_RETIRED] = { + .opcode = P4_OPCODE(P4_EVENT_BRANCH_RETIRED), + .escr_msr = { MSR_P4_CRU_ESCR2, MSR_P4_CRU_ESCR3 }, + .cntr = { {12, 13, 16}, {14, 15, 17} }, + }, + [P4_EVENT_MISPRED_BRANCH_RETIRED] = { + .opcode = P4_OPCODE(P4_EVENT_MISPRED_BRANCH_RETIRED), + .escr_msr = { MSR_P4_CRU_ESCR0, MSR_P4_CRU_ESCR1 }, + .cntr = { {12, 13, 16}, {14, 15, 17} }, + }, + [P4_EVENT_X87_ASSIST] = { + .opcode = P4_OPCODE(P4_EVENT_X87_ASSIST), + .escr_msr = { MSR_P4_CRU_ESCR2, MSR_P4_CRU_ESCR3 }, + .cntr = { {12, 13, 16}, {14, 15, 17} }, + }, + [P4_EVENT_MACHINE_CLEAR] = { + .opcode = P4_OPCODE(P4_EVENT_MACHINE_CLEAR), + .escr_msr = { MSR_P4_CRU_ESCR2, MSR_P4_CRU_ESCR3 }, + .cntr = { {12, 13, 16}, {14, 15, 17} }, + }, + [P4_EVENT_INSTR_COMPLETED] = { + .opcode = P4_OPCODE(P4_EVENT_INSTR_COMPLETED), + .escr_msr = { MSR_P4_CRU_ESCR0, MSR_P4_CRU_ESCR1 }, + .cntr = { {12, 13, 16}, {14, 15, 17} }, + }, +}; + +#define P4_GEN_CACHE_EVENT(event, bit, metric) \ + p4_config_pack_escr(P4_ESCR_EVENT(event) | \ + P4_ESCR_EMASK_BIT(event, bit)) | \ + p4_config_pack_cccr(metric | \ + P4_CCCR_ESEL(P4_OPCODE_ESEL(P4_OPCODE(event)))) + +static __initconst const u64 p4_hw_cache_event_ids + [PERF_COUNT_HW_CACHE_MAX] + [PERF_COUNT_HW_CACHE_OP_MAX] + [PERF_COUNT_HW_CACHE_RESULT_MAX] = +{ + [ C(L1D ) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x0, + [ C(RESULT_MISS) ] = P4_GEN_CACHE_EVENT(P4_EVENT_REPLAY_EVENT, NBOGUS, + P4_PEBS_METRIC__1stl_cache_load_miss_retired), + }, + }, + [ C(LL ) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x0, + [ C(RESULT_MISS) ] = P4_GEN_CACHE_EVENT(P4_EVENT_REPLAY_EVENT, NBOGUS, + P4_PEBS_METRIC__2ndl_cache_load_miss_retired), + }, +}, + [ C(DTLB) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = 0x0, + [ C(RESULT_MISS) ] = P4_GEN_CACHE_EVENT(P4_EVENT_REPLAY_EVENT, NBOGUS, + P4_PEBS_METRIC__dtlb_load_miss_retired), + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = 0x0, + [ C(RESULT_MISS) ] = P4_GEN_CACHE_EVENT(P4_EVENT_REPLAY_EVENT, NBOGUS, + P4_PEBS_METRIC__dtlb_store_miss_retired), + }, + }, + [ C(ITLB) ] = { + [ C(OP_READ) ] = { + [ C(RESULT_ACCESS) ] = P4_GEN_CACHE_EVENT(P4_EVENT_ITLB_REFERENCE, HIT, + P4_PEBS_METRIC__none), + [ C(RESULT_MISS) ] = P4_GEN_CACHE_EVENT(P4_EVENT_ITLB_REFERENCE, MISS, + P4_PEBS_METRIC__none), + }, + [ C(OP_WRITE) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + [ C(OP_PREFETCH) ] = { + [ C(RESULT_ACCESS) ] = -1, + [ C(RESULT_MISS) ] = -1, + }, + }, +}; + +static u64 p4_general_events[PERF_COUNT_HW_MAX] = { + /* non-halted CPU clocks */ + [PERF_COUNT_HW_CPU_CYCLES] = + p4_config_pack_escr(P4_ESCR_EVENT(P4_EVENT_GLOBAL_POWER_EVENTS) | + P4_ESCR_EMASK_BIT(P4_EVENT_GLOBAL_POWER_EVENTS, RUNNING)), + + /* + * retired instructions + * in a sake of simplicity we don't use the FSB tagging + */ + [PERF_COUNT_HW_INSTRUCTIONS] = + p4_config_pack_escr(P4_ESCR_EVENT(P4_EVENT_INSTR_RETIRED) | + P4_ESCR_EMASK_BIT(P4_EVENT_INSTR_RETIRED, NBOGUSNTAG) | + P4_ESCR_EMASK_BIT(P4_EVENT_INSTR_RETIRED, BOGUSNTAG)), + + /* cache hits */ + [PERF_COUNT_HW_CACHE_REFERENCES] = + p4_config_pack_escr(P4_ESCR_EVENT(P4_EVENT_BSQ_CACHE_REFERENCE) | + P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_CACHE_REFERENCE, RD_2ndL_HITS) | + P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_CACHE_REFERENCE, RD_2ndL_HITE) | + P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_CACHE_REFERENCE, RD_2ndL_HITM) | + P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_CACHE_REFERENCE, RD_3rdL_HITS) | + P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_CACHE_REFERENCE, RD_3rdL_HITE) | + P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_CACHE_REFERENCE, RD_3rdL_HITM)), + + /* cache misses */ + [PERF_COUNT_HW_CACHE_MISSES] = + p4_config_pack_escr(P4_ESCR_EVENT(P4_EVENT_BSQ_CACHE_REFERENCE) | + P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_CACHE_REFERENCE, RD_2ndL_MISS) | + P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_CACHE_REFERENCE, RD_3rdL_MISS) | + P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_CACHE_REFERENCE, WR_2ndL_MISS)), + + /* branch instructions retired */ + [PERF_COUNT_HW_BRANCH_INSTRUCTIONS] = + p4_config_pack_escr(P4_ESCR_EVENT(P4_EVENT_RETIRED_BRANCH_TYPE) | + P4_ESCR_EMASK_BIT(P4_EVENT_RETIRED_BRANCH_TYPE, CONDITIONAL) | + P4_ESCR_EMASK_BIT(P4_EVENT_RETIRED_BRANCH_TYPE, CALL) | + P4_ESCR_EMASK_BIT(P4_EVENT_RETIRED_BRANCH_TYPE, RETURN) | + P4_ESCR_EMASK_BIT(P4_EVENT_RETIRED_BRANCH_TYPE, INDIRECT)), + + /* mispredicted branches retired */ + [PERF_COUNT_HW_BRANCH_MISSES] = + p4_config_pack_escr(P4_ESCR_EVENT(P4_EVENT_MISPRED_BRANCH_RETIRED) | + P4_ESCR_EMASK_BIT(P4_EVENT_MISPRED_BRANCH_RETIRED, NBOGUS)), + + /* bus ready clocks (cpu is driving #DRDY_DRV\#DRDY_OWN): */ + [PERF_COUNT_HW_BUS_CYCLES] = + p4_config_pack_escr(P4_ESCR_EVENT(P4_EVENT_FSB_DATA_ACTIVITY) | + P4_ESCR_EMASK_BIT(P4_EVENT_FSB_DATA_ACTIVITY, DRDY_DRV) | + P4_ESCR_EMASK_BIT(P4_EVENT_FSB_DATA_ACTIVITY, DRDY_OWN)) | + p4_config_pack_cccr(P4_CCCR_EDGE | P4_CCCR_COMPARE), +}; + +static struct p4_event_bind *p4_config_get_bind(u64 config) +{ + unsigned int evnt = p4_config_unpack_event(config); + struct p4_event_bind *bind = NULL; + + if (evnt < ARRAY_SIZE(p4_event_bind_map)) + bind = &p4_event_bind_map[evnt]; + + return bind; +} + +static u64 p4_pmu_event_map(int hw_event) +{ + struct p4_event_bind *bind; + unsigned int esel; + u64 config; + + config = p4_general_events[hw_event]; + bind = p4_config_get_bind(config); + esel = P4_OPCODE_ESEL(bind->opcode); + config |= p4_config_pack_cccr(P4_CCCR_ESEL(esel)); + + return config; +} + +static int p4_validate_raw_event(struct perf_event *event) +{ + unsigned int v; + + /* user data may have out-of-bound event index */ + v = p4_config_unpack_event(event->attr.config); + if (v >= ARRAY_SIZE(p4_event_bind_map)) { + pr_warning("P4 PMU: Unknown event code: %d\n", v); + return -EINVAL; + } + + /* + * it may have some screwed PEBS bits + */ + if (p4_config_pebs_has(event->attr.config, P4_PEBS_CONFIG_ENABLE)) { + pr_warning("P4 PMU: PEBS are not supported yet\n"); + return -EINVAL; + } + v = p4_config_unpack_metric(event->attr.config); + if (v >= ARRAY_SIZE(p4_pebs_bind_map)) { + pr_warning("P4 PMU: Unknown metric code: %d\n", v); + return -EINVAL; + } + + return 0; +} + +static int p4_hw_config(struct perf_event *event) +{ + int cpu = get_cpu(); + int rc = 0; + u32 escr, cccr; + + /* + * the reason we use cpu that early is that: if we get scheduled + * first time on the same cpu -- we will not need swap thread + * specific flags in config (and will save some cpu cycles) + */ + + cccr = p4_default_cccr_conf(cpu); + escr = p4_default_escr_conf(cpu, event->attr.exclude_kernel, + event->attr.exclude_user); + event->hw.config = p4_config_pack_escr(escr) | + p4_config_pack_cccr(cccr); + + if (p4_ht_active() && p4_ht_thread(cpu)) + event->hw.config = p4_set_ht_bit(event->hw.config); + + if (event->attr.type == PERF_TYPE_RAW) { + + rc = p4_validate_raw_event(event); + if (rc) + goto out; + + /* + * We don't control raw events so it's up to the caller + * to pass sane values (and we don't count the thread number + * on HT machine but allow HT-compatible specifics to be + * passed on) + * + * Note that for RAW events we allow user to use P4_CCCR_RESERVED + * bits since we keep additional info here (for cache events and etc) + * + * XXX: HT wide things should check perf_paranoid_cpu() && + * CAP_SYS_ADMIN + */ + event->hw.config |= event->attr.config & + (p4_config_pack_escr(P4_ESCR_MASK_HT) | + p4_config_pack_cccr(P4_CCCR_MASK_HT | P4_CCCR_RESERVED)); + } + + rc = x86_setup_perfctr(event); +out: + put_cpu(); + return rc; +} + +static inline int p4_pmu_clear_cccr_ovf(struct hw_perf_event *hwc) +{ + int overflow = 0; + u32 low, high; + + rdmsr(hwc->config_base + hwc->idx, low, high); + + /* we need to check high bit for unflagged overflows */ + if ((low & P4_CCCR_OVF) || !(high & (1 << 31))) { + overflow = 1; + (void)checking_wrmsrl(hwc->config_base + hwc->idx, + ((u64)low) & ~P4_CCCR_OVF); + } + + return overflow; +} + +static void p4_pmu_disable_pebs(void) +{ + /* + * FIXME + * + * It's still allowed that two threads setup same cache + * events so we can't simply clear metrics until we knew + * noone is depending on us, so we need kind of counter + * for "ReplayEvent" users. + * + * What is more complex -- RAW events, if user (for some + * reason) will pass some cache event metric with improper + * event opcode -- it's fine from hardware point of view + * but completely nonsence from "meaning" of such action. + * + * So at moment let leave metrics turned on forever -- it's + * ok for now but need to be revisited! + * + * (void)checking_wrmsrl(MSR_IA32_PEBS_ENABLE, (u64)0); + * (void)checking_wrmsrl(MSR_P4_PEBS_MATRIX_VERT, (u64)0); + */ +} + +static inline void p4_pmu_disable_event(struct perf_event *event) +{ + struct hw_perf_event *hwc = &event->hw; + + /* + * If event gets disabled while counter is in overflowed + * state we need to clear P4_CCCR_OVF, otherwise interrupt get + * asserted again and again + */ + (void)checking_wrmsrl(hwc->config_base + hwc->idx, + (u64)(p4_config_unpack_cccr(hwc->config)) & + ~P4_CCCR_ENABLE & ~P4_CCCR_OVF & ~P4_CCCR_RESERVED); +} + +static void p4_pmu_disable_all(void) +{ + struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); + int idx; + + for (idx = 0; idx < x86_pmu.num_counters; idx++) { + struct perf_event *event = cpuc->events[idx]; + if (!test_bit(idx, cpuc->active_mask)) + continue; + p4_pmu_disable_event(event); + } + + p4_pmu_disable_pebs(); +} + +/* configuration must be valid */ +static void p4_pmu_enable_pebs(u64 config) +{ + struct p4_pebs_bind *bind; + unsigned int idx; + + BUILD_BUG_ON(P4_PEBS_METRIC__max > P4_PEBS_CONFIG_METRIC_MASK); + + idx = p4_config_unpack_metric(config); + if (idx == P4_PEBS_METRIC__none) + return; + + bind = &p4_pebs_bind_map[idx]; + + (void)checking_wrmsrl(MSR_IA32_PEBS_ENABLE, (u64)bind->metric_pebs); + (void)checking_wrmsrl(MSR_P4_PEBS_MATRIX_VERT, (u64)bind->metric_vert); +} + +static void p4_pmu_enable_event(struct perf_event *event) +{ + struct hw_perf_event *hwc = &event->hw; + int thread = p4_ht_config_thread(hwc->config); + u64 escr_conf = p4_config_unpack_escr(p4_clear_ht_bit(hwc->config)); + unsigned int idx = p4_config_unpack_event(hwc->config); + struct p4_event_bind *bind; + u64 escr_addr, cccr; + + bind = &p4_event_bind_map[idx]; + escr_addr = (u64)bind->escr_msr[thread]; + + /* + * - we dont support cascaded counters yet + * - and counter 1 is broken (erratum) + */ + WARN_ON_ONCE(p4_is_event_cascaded(hwc->config)); + WARN_ON_ONCE(hwc->idx == 1); + + /* we need a real Event value */ + escr_conf &= ~P4_ESCR_EVENT_MASK; + escr_conf |= P4_ESCR_EVENT(P4_OPCODE_EVNT(bind->opcode)); + + cccr = p4_config_unpack_cccr(hwc->config); + + /* + * it could be Cache event so we need to write metrics + * into additional MSRs + */ + p4_pmu_enable_pebs(hwc->config); + + (void)checking_wrmsrl(escr_addr, escr_conf); + (void)checking_wrmsrl(hwc->config_base + hwc->idx, + (cccr & ~P4_CCCR_RESERVED) | P4_CCCR_ENABLE); +} + +static void p4_pmu_enable_all(int added) +{ + struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); + int idx; + + for (idx = 0; idx < x86_pmu.num_counters; idx++) { + struct perf_event *event = cpuc->events[idx]; + if (!test_bit(idx, cpuc->active_mask)) + continue; + p4_pmu_enable_event(event); + } +} + +static int p4_pmu_handle_irq(struct pt_regs *regs) +{ + struct perf_sample_data data; + struct cpu_hw_events *cpuc; + struct perf_event *event; + struct hw_perf_event *hwc; + int idx, handled = 0; + u64 val; + + data.addr = 0; + data.raw = NULL; + + cpuc = &__get_cpu_var(cpu_hw_events); + + for (idx = 0; idx < x86_pmu.num_counters; idx++) { + + if (!test_bit(idx, cpuc->active_mask)) + continue; + + event = cpuc->events[idx]; + hwc = &event->hw; + + WARN_ON_ONCE(hwc->idx != idx); + + /* it might be unflagged overflow */ + handled = p4_pmu_clear_cccr_ovf(hwc); + + val = x86_perf_event_update(event); + if (!handled && (val & (1ULL << (x86_pmu.cntval_bits - 1)))) + continue; + + /* event overflow for sure */ + data.period = event->hw.last_period; + + if (!x86_perf_event_set_period(event)) + continue; + if (perf_event_overflow(event, 1, &data, regs)) + p4_pmu_disable_event(event); + } + + if (handled) { + /* p4 quirk: unmask it again */ + apic_write(APIC_LVTPC, apic_read(APIC_LVTPC) & ~APIC_LVT_MASKED); + inc_irq_stat(apic_perf_irqs); + } + + return handled; +} + +/* + * swap thread specific fields according to a thread + * we are going to run on + */ +static void p4_pmu_swap_config_ts(struct hw_perf_event *hwc, int cpu) +{ + u32 escr, cccr; + + /* + * we either lucky and continue on same cpu or no HT support + */ + if (!p4_should_swap_ts(hwc->config, cpu)) + return; + + /* + * the event is migrated from an another logical + * cpu, so we need to swap thread specific flags + */ + + escr = p4_config_unpack_escr(hwc->config); + cccr = p4_config_unpack_cccr(hwc->config); + + if (p4_ht_thread(cpu)) { + cccr &= ~P4_CCCR_OVF_PMI_T0; + cccr |= P4_CCCR_OVF_PMI_T1; + if (escr & P4_ESCR_T0_OS) { + escr &= ~P4_ESCR_T0_OS; + escr |= P4_ESCR_T1_OS; + } + if (escr & P4_ESCR_T0_USR) { + escr &= ~P4_ESCR_T0_USR; + escr |= P4_ESCR_T1_USR; + } + hwc->config = p4_config_pack_escr(escr); + hwc->config |= p4_config_pack_cccr(cccr); + hwc->config |= P4_CONFIG_HT; + } else { + cccr &= ~P4_CCCR_OVF_PMI_T1; + cccr |= P4_CCCR_OVF_PMI_T0; + if (escr & P4_ESCR_T1_OS) { + escr &= ~P4_ESCR_T1_OS; + escr |= P4_ESCR_T0_OS; + } + if (escr & P4_ESCR_T1_USR) { + escr &= ~P4_ESCR_T1_USR; + escr |= P4_ESCR_T0_USR; + } + hwc->config = p4_config_pack_escr(escr); + hwc->config |= p4_config_pack_cccr(cccr); + hwc->config &= ~P4_CONFIG_HT; + } +} + +/* + * ESCR address hashing is tricky, ESCRs are not sequential + * in memory but all starts from MSR_P4_BSU_ESCR0 (0x03a0) and + * the metric between any ESCRs is laid in range [0xa0,0xe1] + * + * so we make ~70% filled hashtable + */ + +#define P4_ESCR_MSR_BASE 0x000003a0 +#define P4_ESCR_MSR_MAX 0x000003e1 +#define P4_ESCR_MSR_TABLE_SIZE (P4_ESCR_MSR_MAX - P4_ESCR_MSR_BASE + 1) +#define P4_ESCR_MSR_IDX(msr) (msr - P4_ESCR_MSR_BASE) +#define P4_ESCR_MSR_TABLE_ENTRY(msr) [P4_ESCR_MSR_IDX(msr)] = msr + +static const unsigned int p4_escr_table[P4_ESCR_MSR_TABLE_SIZE] = { + P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_ALF_ESCR0), + P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_ALF_ESCR1), + P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_BPU_ESCR0), + P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_BPU_ESCR1), + P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_BSU_ESCR0), + P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_BSU_ESCR1), + P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_CRU_ESCR0), + P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_CRU_ESCR1), + P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_CRU_ESCR2), + P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_CRU_ESCR3), + P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_CRU_ESCR4), + P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_CRU_ESCR5), + P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_DAC_ESCR0), + P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_DAC_ESCR1), + P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_FIRM_ESCR0), + P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_FIRM_ESCR1), + P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_FLAME_ESCR0), + P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_FLAME_ESCR1), + P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_FSB_ESCR0), + P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_FSB_ESCR1), + P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_IQ_ESCR0), + P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_IQ_ESCR1), + P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_IS_ESCR0), + P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_IS_ESCR1), + P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_ITLB_ESCR0), + P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_ITLB_ESCR1), + P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_IX_ESCR0), + P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_IX_ESCR1), + P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_MOB_ESCR0), + P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_MOB_ESCR1), + P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_MS_ESCR0), + P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_MS_ESCR1), + P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_PMH_ESCR0), + P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_PMH_ESCR1), + P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_RAT_ESCR0), + P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_RAT_ESCR1), + P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_SAAT_ESCR0), + P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_SAAT_ESCR1), + P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_SSU_ESCR0), + P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_SSU_ESCR1), + P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_TBPU_ESCR0), + P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_TBPU_ESCR1), + P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_TC_ESCR0), + P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_TC_ESCR1), + P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_U2L_ESCR0), + P4_ESCR_MSR_TABLE_ENTRY(MSR_P4_U2L_ESCR1), +}; + +static int p4_get_escr_idx(unsigned int addr) +{ + unsigned int idx = P4_ESCR_MSR_IDX(addr); + + if (unlikely(idx >= P4_ESCR_MSR_TABLE_SIZE || + !p4_escr_table[idx] || + p4_escr_table[idx] != addr)) { + WARN_ONCE(1, "P4 PMU: Wrong address passed: %x\n", addr); + return -1; + } + + return idx; +} + +static int p4_next_cntr(int thread, unsigned long *used_mask, + struct p4_event_bind *bind) +{ + int i, j; + + for (i = 0; i < P4_CNTR_LIMIT; i++) { + j = bind->cntr[thread][i]; + if (j != -1 && !test_bit(j, used_mask)) + return j; + } + + return -1; +} + +static int p4_pmu_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign) +{ + unsigned long used_mask[BITS_TO_LONGS(X86_PMC_IDX_MAX)]; + unsigned long escr_mask[BITS_TO_LONGS(P4_ESCR_MSR_TABLE_SIZE)]; + int cpu = smp_processor_id(); + struct hw_perf_event *hwc; + struct p4_event_bind *bind; + unsigned int i, thread, num; + int cntr_idx, escr_idx; + + bitmap_zero(used_mask, X86_PMC_IDX_MAX); + bitmap_zero(escr_mask, P4_ESCR_MSR_TABLE_SIZE); + + for (i = 0, num = n; i < n; i++, num--) { + + hwc = &cpuc->event_list[i]->hw; + thread = p4_ht_thread(cpu); + bind = p4_config_get_bind(hwc->config); + escr_idx = p4_get_escr_idx(bind->escr_msr[thread]); + if (unlikely(escr_idx == -1)) + goto done; + + if (hwc->idx != -1 && !p4_should_swap_ts(hwc->config, cpu)) { + cntr_idx = hwc->idx; + if (assign) + assign[i] = hwc->idx; + goto reserve; + } + + cntr_idx = p4_next_cntr(thread, used_mask, bind); + if (cntr_idx == -1 || test_bit(escr_idx, escr_mask)) + goto done; + + p4_pmu_swap_config_ts(hwc, cpu); + if (assign) + assign[i] = cntr_idx; +reserve: + set_bit(cntr_idx, used_mask); + set_bit(escr_idx, escr_mask); + } + +done: + return num ? -ENOSPC : 0; +} + +static __initconst const struct x86_pmu p4_pmu = { + .name = "Netburst P4/Xeon", + .handle_irq = p4_pmu_handle_irq, + .disable_all = p4_pmu_disable_all, + .enable_all = p4_pmu_enable_all, + .enable = p4_pmu_enable_event, + .disable = p4_pmu_disable_event, + .eventsel = MSR_P4_BPU_CCCR0, + .perfctr = MSR_P4_BPU_PERFCTR0, + .event_map = p4_pmu_event_map, + .max_events = ARRAY_SIZE(p4_general_events), + .get_event_constraints = x86_get_event_constraints, + /* + * IF HT disabled we may need to use all + * ARCH_P4_MAX_CCCR counters simulaneously + * though leave it restricted at moment assuming + * HT is on + */ + .num_counters = ARCH_P4_MAX_CCCR, + .apic = 1, + .cntval_bits = 40, + .cntval_mask = (1ULL << 40) - 1, + .max_period = (1ULL << 39) - 1, + .hw_config = p4_hw_config, + .schedule_events = p4_pmu_schedule_events, + /* + * This handles erratum N15 in intel doc 249199-029, + * the counter may not be updated correctly on write + * so we need a second write operation to do the trick + * (the official workaround didn't work) + * + * the former idea is taken from OProfile code + */ + .perfctr_second_write = 1, +}; + +static __init int p4_pmu_init(void) +{ + unsigned int low, high; + + /* If we get stripped -- indexig fails */ + BUILD_BUG_ON(ARCH_P4_MAX_CCCR > X86_PMC_MAX_GENERIC); + + rdmsr(MSR_IA32_MISC_ENABLE, low, high); + if (!(low & (1 << 7))) { + pr_cont("unsupported Netburst CPU model %d ", + boot_cpu_data.x86_model); + return -ENODEV; + } + + memcpy(hw_cache_event_ids, p4_hw_cache_event_ids, + sizeof(hw_cache_event_ids)); + + pr_cont("Netburst events, "); + + x86_pmu = p4_pmu; + + return 0; +} + +#endif /* CONFIG_CPU_SUP_INTEL */ diff --git a/arch/x86/kernel/cpu/perf_event_p6.c b/arch/x86/kernel/cpu/perf_event_p6.c index a330485d14da..34ba07be2cda 100644 --- a/arch/x86/kernel/cpu/perf_event_p6.c +++ b/arch/x86/kernel/cpu/perf_event_p6.c @@ -27,24 +27,6 @@ static u64 p6_pmu_event_map(int hw_event) */ #define P6_NOP_EVENT 0x0000002EULL -static u64 p6_pmu_raw_event(u64 hw_event) -{ -#define P6_EVNTSEL_EVENT_MASK 0x000000FFULL -#define P6_EVNTSEL_UNIT_MASK 0x0000FF00ULL -#define P6_EVNTSEL_EDGE_MASK 0x00040000ULL -#define P6_EVNTSEL_INV_MASK 0x00800000ULL -#define P6_EVNTSEL_REG_MASK 0xFF000000ULL - -#define P6_EVNTSEL_MASK \ - (P6_EVNTSEL_EVENT_MASK | \ - P6_EVNTSEL_UNIT_MASK | \ - P6_EVNTSEL_EDGE_MASK | \ - P6_EVNTSEL_INV_MASK | \ - P6_EVNTSEL_REG_MASK) - - return hw_event & P6_EVNTSEL_MASK; -} - static struct event_constraint p6_event_constraints[] = { INTEL_EVENT_CONSTRAINT(0xc1, 0x1), /* FLOPS */ @@ -66,7 +48,7 @@ static void p6_pmu_disable_all(void) wrmsrl(MSR_P6_EVNTSEL0, val); } -static void p6_pmu_enable_all(void) +static void p6_pmu_enable_all(int added) { unsigned long val; @@ -102,22 +84,23 @@ static void p6_pmu_enable_event(struct perf_event *event) (void)checking_wrmsrl(hwc->config_base + hwc->idx, val); } -static __initconst struct x86_pmu p6_pmu = { +static __initconst const struct x86_pmu p6_pmu = { .name = "p6", .handle_irq = x86_pmu_handle_irq, .disable_all = p6_pmu_disable_all, .enable_all = p6_pmu_enable_all, .enable = p6_pmu_enable_event, .disable = p6_pmu_disable_event, + .hw_config = x86_pmu_hw_config, + .schedule_events = x86_schedule_events, .eventsel = MSR_P6_EVNTSEL0, .perfctr = MSR_P6_PERFCTR0, .event_map = p6_pmu_event_map, - .raw_event = p6_pmu_raw_event, .max_events = ARRAY_SIZE(p6_perfmon_event_map), .apic = 1, .max_period = (1ULL << 31) - 1, .version = 0, - .num_events = 2, + .num_counters = 2, /* * Events have 40 bits implemented. However they are designed such * that bits [32-39] are sign extensions of bit 31. As such the @@ -125,8 +108,8 @@ static __initconst struct x86_pmu p6_pmu = { * * See IA-32 Intel Architecture Software developer manual Vol 3B */ - .event_bits = 32, - .event_mask = (1ULL << 32) - 1, + .cntval_bits = 32, + .cntval_mask = (1ULL << 32) - 1, .get_event_constraints = x86_get_event_constraints, .event_constraints = p6_event_constraints, }; diff --git a/arch/x86/kernel/cpu/scattered.c b/arch/x86/kernel/cpu/scattered.c new file mode 100644 index 000000000000..34b4dad6f0b8 --- /dev/null +++ b/arch/x86/kernel/cpu/scattered.c @@ -0,0 +1,63 @@ +/* + * Routines to indentify additional cpu features that are scattered in + * cpuid space. + */ +#include <linux/cpu.h> + +#include <asm/pat.h> +#include <asm/processor.h> + +#include <asm/apic.h> + +struct cpuid_bit { + u16 feature; + u8 reg; + u8 bit; + u32 level; + u32 sub_leaf; +}; + +enum cpuid_regs { + CR_EAX = 0, + CR_ECX, + CR_EDX, + CR_EBX +}; + +void __cpuinit init_scattered_cpuid_features(struct cpuinfo_x86 *c) +{ + u32 max_level; + u32 regs[4]; + const struct cpuid_bit *cb; + + static const struct cpuid_bit __cpuinitconst cpuid_bits[] = { + { X86_FEATURE_IDA, CR_EAX, 1, 0x00000006, 0 }, + { X86_FEATURE_ARAT, CR_EAX, 2, 0x00000006, 0 }, + { X86_FEATURE_PLN, CR_EAX, 4, 0x00000006, 0 }, + { X86_FEATURE_PTS, CR_EAX, 6, 0x00000006, 0 }, + { X86_FEATURE_APERFMPERF, CR_ECX, 0, 0x00000006, 0 }, + { X86_FEATURE_EPB, CR_ECX, 3, 0x00000006, 0 }, + { X86_FEATURE_XSAVEOPT, CR_EAX, 0, 0x0000000d, 1 }, + { X86_FEATURE_CPB, CR_EDX, 9, 0x80000007, 0 }, + { X86_FEATURE_NPT, CR_EDX, 0, 0x8000000a, 0 }, + { X86_FEATURE_LBRV, CR_EDX, 1, 0x8000000a, 0 }, + { X86_FEATURE_SVML, CR_EDX, 2, 0x8000000a, 0 }, + { X86_FEATURE_NRIPS, CR_EDX, 3, 0x8000000a, 0 }, + { 0, 0, 0, 0, 0 } + }; + + for (cb = cpuid_bits; cb->feature; cb++) { + + /* Verify that the level is valid */ + max_level = cpuid_eax(cb->level & 0xffff0000); + if (max_level < cb->level || + max_level > (cb->level | 0xffff)) + continue; + + cpuid_count(cb->level, cb->sub_leaf, ®s[CR_EAX], + ®s[CR_EBX], ®s[CR_ECX], ®s[CR_EDX]); + + if (regs[cb->reg] & (1 << cb->bit)) + set_cpu_cap(c, cb->feature); + } +} diff --git a/arch/x86/kernel/cpu/addon_cpuid_features.c b/arch/x86/kernel/cpu/topology.c index 97ad79cdf688..4397e987a1cf 100644 --- a/arch/x86/kernel/cpu/addon_cpuid_features.c +++ b/arch/x86/kernel/cpu/topology.c @@ -1,60 +1,14 @@ /* - * Routines to indentify additional cpu features that are scattered in - * cpuid space. + * Check for extended topology enumeration cpuid leaf 0xb and if it + * exists, use it for populating initial_apicid and cpu topology + * detection. */ -#include <linux/cpu.h> +#include <linux/cpu.h> +#include <asm/apic.h> #include <asm/pat.h> #include <asm/processor.h> -#include <asm/apic.h> - -struct cpuid_bit { - u16 feature; - u8 reg; - u8 bit; - u32 level; -}; - -enum cpuid_regs { - CR_EAX = 0, - CR_ECX, - CR_EDX, - CR_EBX -}; - -void __cpuinit init_scattered_cpuid_features(struct cpuinfo_x86 *c) -{ - u32 max_level; - u32 regs[4]; - const struct cpuid_bit *cb; - - static const struct cpuid_bit __cpuinitconst cpuid_bits[] = { - { X86_FEATURE_IDA, CR_EAX, 1, 0x00000006 }, - { X86_FEATURE_ARAT, CR_EAX, 2, 0x00000006 }, - { X86_FEATURE_NPT, CR_EDX, 0, 0x8000000a }, - { X86_FEATURE_LBRV, CR_EDX, 1, 0x8000000a }, - { X86_FEATURE_SVML, CR_EDX, 2, 0x8000000a }, - { X86_FEATURE_NRIPS, CR_EDX, 3, 0x8000000a }, - { 0, 0, 0, 0 } - }; - - for (cb = cpuid_bits; cb->feature; cb++) { - - /* Verify that the level is valid */ - max_level = cpuid_eax(cb->level & 0xffff0000); - if (max_level < cb->level || - max_level > (cb->level | 0xffff)) - continue; - - cpuid(cb->level, ®s[CR_EAX], ®s[CR_EBX], - ®s[CR_ECX], ®s[CR_EDX]); - - if (regs[cb->reg] & (1 << cb->bit)) - set_cpu_cap(c, cb->feature); - } -} - /* leaf 0xb SMT level */ #define SMT_LEVEL 0 diff --git a/arch/x86/kernel/cpu/vmware.c b/arch/x86/kernel/cpu/vmware.c index dfdb4dba2320..227b0448960d 100644 --- a/arch/x86/kernel/cpu/vmware.c +++ b/arch/x86/kernel/cpu/vmware.c @@ -24,8 +24,8 @@ #include <linux/dmi.h> #include <linux/module.h> #include <asm/div64.h> -#include <asm/vmware.h> #include <asm/x86_init.h> +#include <asm/hypervisor.h> #define CPUID_VMWARE_INFO_LEAF 0x40000000 #define VMWARE_HYPERVISOR_MAGIC 0x564D5868 @@ -51,7 +51,7 @@ static inline int __vmware_platform(void) static unsigned long vmware_get_tsc_khz(void) { - uint64_t tsc_hz; + uint64_t tsc_hz, lpj; uint32_t eax, ebx, ecx, edx; VMWARE_PORT(GETHZ, eax, ebx, ecx, edx); @@ -62,10 +62,17 @@ static unsigned long vmware_get_tsc_khz(void) printk(KERN_INFO "TSC freq read from hypervisor : %lu.%03lu MHz\n", (unsigned long) tsc_hz / 1000, (unsigned long) tsc_hz % 1000); + + if (!preset_lpj) { + lpj = ((u64)tsc_hz * 1000); + do_div(lpj, HZ); + preset_lpj = lpj; + } + return tsc_hz; } -void __init vmware_platform_setup(void) +static void __init vmware_platform_setup(void) { uint32_t eax, ebx, ecx, edx; @@ -83,26 +90,22 @@ void __init vmware_platform_setup(void) * serial key should be enough, as this will always have a VMware * specific string when running under VMware hypervisor. */ -int vmware_platform(void) +static bool __init vmware_platform(void) { if (cpu_has_hypervisor) { - unsigned int eax, ebx, ecx, edx; - char hyper_vendor_id[13]; - - cpuid(CPUID_VMWARE_INFO_LEAF, &eax, &ebx, &ecx, &edx); - memcpy(hyper_vendor_id + 0, &ebx, 4); - memcpy(hyper_vendor_id + 4, &ecx, 4); - memcpy(hyper_vendor_id + 8, &edx, 4); - hyper_vendor_id[12] = '\0'; - if (!strcmp(hyper_vendor_id, "VMwareVMware")) - return 1; + unsigned int eax; + unsigned int hyper_vendor_id[3]; + + cpuid(CPUID_VMWARE_INFO_LEAF, &eax, &hyper_vendor_id[0], + &hyper_vendor_id[1], &hyper_vendor_id[2]); + if (!memcmp(hyper_vendor_id, "VMwareVMware", 12)) + return true; } else if (dmi_available && dmi_name_in_serial("VMware") && __vmware_platform()) - return 1; + return true; - return 0; + return false; } -EXPORT_SYMBOL(vmware_platform); /* * VMware hypervisor takes care of exporting a reliable TSC to the guest. @@ -116,8 +119,16 @@ EXPORT_SYMBOL(vmware_platform); * so that the kernel could just trust the hypervisor with providing a * reliable virtual TSC that is suitable for timekeeping. */ -void __cpuinit vmware_set_feature_bits(struct cpuinfo_x86 *c) +static void __cpuinit vmware_set_cpu_features(struct cpuinfo_x86 *c) { set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC); set_cpu_cap(c, X86_FEATURE_TSC_RELIABLE); } + +const __refconst struct hypervisor_x86 x86_hyper_vmware = { + .name = "VMware", + .detect = vmware_platform, + .set_cpu_features = vmware_set_cpu_features, + .init_platform = vmware_platform_setup, +}; +EXPORT_SYMBOL(x86_hyper_vmware); diff --git a/arch/x86/kernel/cpuid.c b/arch/x86/kernel/cpuid.c index 8b862d5900fe..1b7b31ab7d86 100644 --- a/arch/x86/kernel/cpuid.c +++ b/arch/x86/kernel/cpuid.c @@ -170,7 +170,7 @@ static int __cpuinit cpuid_class_cpu_callback(struct notifier_block *nfb, cpuid_device_destroy(cpu); break; } - return err ? NOTIFY_BAD : NOTIFY_OK; + return notifier_from_errno(err); } static struct notifier_block __refdata cpuid_class_cpu_notifier = diff --git a/arch/x86/kernel/ds.c b/arch/x86/kernel/ds.c deleted file mode 100644 index 1c47390dd0e5..000000000000 --- a/arch/x86/kernel/ds.c +++ /dev/null @@ -1,1437 +0,0 @@ -/* - * Debug Store support - * - * This provides a low-level interface to the hardware's Debug Store - * feature that is used for branch trace store (BTS) and - * precise-event based sampling (PEBS). - * - * It manages: - * - DS and BTS hardware configuration - * - buffer overflow handling (to be done) - * - buffer access - * - * It does not do: - * - security checking (is the caller allowed to trace the task) - * - buffer allocation (memory accounting) - * - * - * Copyright (C) 2007-2009 Intel Corporation. - * Markus Metzger <markus.t.metzger@intel.com>, 2007-2009 - */ - -#include <linux/kernel.h> -#include <linux/string.h> -#include <linux/errno.h> -#include <linux/sched.h> -#include <linux/slab.h> -#include <linux/mm.h> -#include <linux/trace_clock.h> - -#include <asm/ds.h> - -#include "ds_selftest.h" - -/* - * The configuration for a particular DS hardware implementation: - */ -struct ds_configuration { - /* The name of the configuration: */ - const char *name; - - /* The size of pointer-typed fields in DS, BTS, and PEBS: */ - unsigned char sizeof_ptr_field; - - /* The size of a BTS/PEBS record in bytes: */ - unsigned char sizeof_rec[2]; - - /* The number of pebs counter reset values in the DS structure. */ - unsigned char nr_counter_reset; - - /* Control bit-masks indexed by enum ds_feature: */ - unsigned long ctl[dsf_ctl_max]; -}; -static struct ds_configuration ds_cfg __read_mostly; - - -/* Maximal size of a DS configuration: */ -#define MAX_SIZEOF_DS 0x80 - -/* Maximal size of a BTS record: */ -#define MAX_SIZEOF_BTS (3 * 8) - -/* BTS and PEBS buffer alignment: */ -#define DS_ALIGNMENT (1 << 3) - -/* Number of buffer pointers in DS: */ -#define NUM_DS_PTR_FIELDS 8 - -/* Size of a pebs reset value in DS: */ -#define PEBS_RESET_FIELD_SIZE 8 - -/* Mask of control bits in the DS MSR register: */ -#define BTS_CONTROL \ - ( ds_cfg.ctl[dsf_bts] | \ - ds_cfg.ctl[dsf_bts_kernel] | \ - ds_cfg.ctl[dsf_bts_user] | \ - ds_cfg.ctl[dsf_bts_overflow] ) - -/* - * A BTS or PEBS tracer. - * - * This holds the configuration of the tracer and serves as a handle - * to identify tracers. - */ -struct ds_tracer { - /* The DS context (partially) owned by this tracer. */ - struct ds_context *context; - /* The buffer provided on ds_request() and its size in bytes. */ - void *buffer; - size_t size; -}; - -struct bts_tracer { - /* The common DS part: */ - struct ds_tracer ds; - - /* The trace including the DS configuration: */ - struct bts_trace trace; - - /* Buffer overflow notification function: */ - bts_ovfl_callback_t ovfl; - - /* Active flags affecting trace collection. */ - unsigned int flags; -}; - -struct pebs_tracer { - /* The common DS part: */ - struct ds_tracer ds; - - /* The trace including the DS configuration: */ - struct pebs_trace trace; - - /* Buffer overflow notification function: */ - pebs_ovfl_callback_t ovfl; -}; - -/* - * Debug Store (DS) save area configuration (see Intel64 and IA32 - * Architectures Software Developer's Manual, section 18.5) - * - * The DS configuration consists of the following fields; different - * architetures vary in the size of those fields. - * - * - double-word aligned base linear address of the BTS buffer - * - write pointer into the BTS buffer - * - end linear address of the BTS buffer (one byte beyond the end of - * the buffer) - * - interrupt pointer into BTS buffer - * (interrupt occurs when write pointer passes interrupt pointer) - * - double-word aligned base linear address of the PEBS buffer - * - write pointer into the PEBS buffer - * - end linear address of the PEBS buffer (one byte beyond the end of - * the buffer) - * - interrupt pointer into PEBS buffer - * (interrupt occurs when write pointer passes interrupt pointer) - * - value to which counter is reset following counter overflow - * - * Later architectures use 64bit pointers throughout, whereas earlier - * architectures use 32bit pointers in 32bit mode. - * - * - * We compute the base address for the first 8 fields based on: - * - the field size stored in the DS configuration - * - the relative field position - * - an offset giving the start of the respective region - * - * This offset is further used to index various arrays holding - * information for BTS and PEBS at the respective index. - * - * On later 32bit processors, we only access the lower 32bit of the - * 64bit pointer fields. The upper halves will be zeroed out. - */ - -enum ds_field { - ds_buffer_base = 0, - ds_index, - ds_absolute_maximum, - ds_interrupt_threshold, -}; - -enum ds_qualifier { - ds_bts = 0, - ds_pebs -}; - -static inline unsigned long -ds_get(const unsigned char *base, enum ds_qualifier qual, enum ds_field field) -{ - base += (ds_cfg.sizeof_ptr_field * (field + (4 * qual))); - return *(unsigned long *)base; -} - -static inline void -ds_set(unsigned char *base, enum ds_qualifier qual, enum ds_field field, - unsigned long value) -{ - base += (ds_cfg.sizeof_ptr_field * (field + (4 * qual))); - (*(unsigned long *)base) = value; -} - - -/* - * Locking is done only for allocating BTS or PEBS resources. - */ -static DEFINE_SPINLOCK(ds_lock); - -/* - * We either support (system-wide) per-cpu or per-thread allocation. - * We distinguish the two based on the task_struct pointer, where a - * NULL pointer indicates per-cpu allocation for the current cpu. - * - * Allocations are use-counted. As soon as resources are allocated, - * further allocations must be of the same type (per-cpu or - * per-thread). We model this by counting allocations (i.e. the number - * of tracers of a certain type) for one type negatively: - * =0 no tracers - * >0 number of per-thread tracers - * <0 number of per-cpu tracers - * - * Tracers essentially gives the number of ds contexts for a certain - * type of allocation. - */ -static atomic_t tracers = ATOMIC_INIT(0); - -static inline int get_tracer(struct task_struct *task) -{ - int error; - - spin_lock_irq(&ds_lock); - - if (task) { - error = -EPERM; - if (atomic_read(&tracers) < 0) - goto out; - atomic_inc(&tracers); - } else { - error = -EPERM; - if (atomic_read(&tracers) > 0) - goto out; - atomic_dec(&tracers); - } - - error = 0; -out: - spin_unlock_irq(&ds_lock); - return error; -} - -static inline void put_tracer(struct task_struct *task) -{ - if (task) - atomic_dec(&tracers); - else - atomic_inc(&tracers); -} - -/* - * The DS context is either attached to a thread or to a cpu: - * - in the former case, the thread_struct contains a pointer to the - * attached context. - * - in the latter case, we use a static array of per-cpu context - * pointers. - * - * Contexts are use-counted. They are allocated on first access and - * deallocated when the last user puts the context. - */ -struct ds_context { - /* The DS configuration; goes into MSR_IA32_DS_AREA: */ - unsigned char ds[MAX_SIZEOF_DS]; - - /* The owner of the BTS and PEBS configuration, respectively: */ - struct bts_tracer *bts_master; - struct pebs_tracer *pebs_master; - - /* Use count: */ - unsigned long count; - - /* Pointer to the context pointer field: */ - struct ds_context **this; - - /* The traced task; NULL for cpu tracing: */ - struct task_struct *task; - - /* The traced cpu; only valid if task is NULL: */ - int cpu; -}; - -static DEFINE_PER_CPU(struct ds_context *, cpu_ds_context); - - -static struct ds_context *ds_get_context(struct task_struct *task, int cpu) -{ - struct ds_context **p_context = - (task ? &task->thread.ds_ctx : &per_cpu(cpu_ds_context, cpu)); - struct ds_context *context = NULL; - struct ds_context *new_context = NULL; - - /* Chances are small that we already have a context. */ - new_context = kzalloc(sizeof(*new_context), GFP_KERNEL); - if (!new_context) - return NULL; - - spin_lock_irq(&ds_lock); - - context = *p_context; - if (likely(!context)) { - context = new_context; - - context->this = p_context; - context->task = task; - context->cpu = cpu; - context->count = 0; - - *p_context = context; - } - - context->count++; - - spin_unlock_irq(&ds_lock); - - if (context != new_context) - kfree(new_context); - - return context; -} - -static void ds_put_context(struct ds_context *context) -{ - struct task_struct *task; - unsigned long irq; - - if (!context) - return; - - spin_lock_irqsave(&ds_lock, irq); - - if (--context->count) { - spin_unlock_irqrestore(&ds_lock, irq); - return; - } - - *(context->this) = NULL; - - task = context->task; - - if (task) - clear_tsk_thread_flag(task, TIF_DS_AREA_MSR); - - /* - * We leave the (now dangling) pointer to the DS configuration in - * the DS_AREA msr. This is as good or as bad as replacing it with - * NULL - the hardware would crash if we enabled tracing. - * - * This saves us some problems with having to write an msr on a - * different cpu while preventing others from doing the same for the - * next context for that same cpu. - */ - - spin_unlock_irqrestore(&ds_lock, irq); - - /* The context might still be in use for context switching. */ - if (task && (task != current)) - wait_task_context_switch(task); - - kfree(context); -} - -static void ds_install_ds_area(struct ds_context *context) -{ - unsigned long ds; - - ds = (unsigned long)context->ds; - - /* - * There is a race between the bts master and the pebs master. - * - * The thread/cpu access is synchronized via get/put_cpu() for - * task tracing and via wrmsr_on_cpu for cpu tracing. - * - * If bts and pebs are collected for the same task or same cpu, - * the same confiuration is written twice. - */ - if (context->task) { - get_cpu(); - if (context->task == current) - wrmsrl(MSR_IA32_DS_AREA, ds); - set_tsk_thread_flag(context->task, TIF_DS_AREA_MSR); - put_cpu(); - } else - wrmsr_on_cpu(context->cpu, MSR_IA32_DS_AREA, - (u32)((u64)ds), (u32)((u64)ds >> 32)); -} - -/* - * Call the tracer's callback on a buffer overflow. - * - * context: the ds context - * qual: the buffer type - */ -static void ds_overflow(struct ds_context *context, enum ds_qualifier qual) -{ - switch (qual) { - case ds_bts: - if (context->bts_master && - context->bts_master->ovfl) - context->bts_master->ovfl(context->bts_master); - break; - case ds_pebs: - if (context->pebs_master && - context->pebs_master->ovfl) - context->pebs_master->ovfl(context->pebs_master); - break; - } -} - - -/* - * Write raw data into the BTS or PEBS buffer. - * - * The remainder of any partially written record is zeroed out. - * - * context: the DS context - * qual: the buffer type - * record: the data to write - * size: the size of the data - */ -static int ds_write(struct ds_context *context, enum ds_qualifier qual, - const void *record, size_t size) -{ - int bytes_written = 0; - - if (!record) - return -EINVAL; - - while (size) { - unsigned long base, index, end, write_end, int_th; - unsigned long write_size, adj_write_size; - - /* - * Write as much as possible without producing an - * overflow interrupt. - * - * Interrupt_threshold must either be - * - bigger than absolute_maximum or - * - point to a record between buffer_base and absolute_maximum - * - * Index points to a valid record. - */ - base = ds_get(context->ds, qual, ds_buffer_base); - index = ds_get(context->ds, qual, ds_index); - end = ds_get(context->ds, qual, ds_absolute_maximum); - int_th = ds_get(context->ds, qual, ds_interrupt_threshold); - - write_end = min(end, int_th); - - /* - * If we are already beyond the interrupt threshold, - * we fill the entire buffer. - */ - if (write_end <= index) - write_end = end; - - if (write_end <= index) - break; - - write_size = min((unsigned long) size, write_end - index); - memcpy((void *)index, record, write_size); - - record = (const char *)record + write_size; - size -= write_size; - bytes_written += write_size; - - adj_write_size = write_size / ds_cfg.sizeof_rec[qual]; - adj_write_size *= ds_cfg.sizeof_rec[qual]; - - /* Zero out trailing bytes. */ - memset((char *)index + write_size, 0, - adj_write_size - write_size); - index += adj_write_size; - - if (index >= end) - index = base; - ds_set(context->ds, qual, ds_index, index); - - if (index >= int_th) - ds_overflow(context, qual); - } - - return bytes_written; -} - - -/* - * Branch Trace Store (BTS) uses the following format. Different - * architectures vary in the size of those fields. - * - source linear address - * - destination linear address - * - flags - * - * Later architectures use 64bit pointers throughout, whereas earlier - * architectures use 32bit pointers in 32bit mode. - * - * We compute the base address for the fields based on: - * - the field size stored in the DS configuration - * - the relative field position - * - * In order to store additional information in the BTS buffer, we use - * a special source address to indicate that the record requires - * special interpretation. - * - * Netburst indicated via a bit in the flags field whether the branch - * was predicted; this is ignored. - * - * We use two levels of abstraction: - * - the raw data level defined here - * - an arch-independent level defined in ds.h - */ - -enum bts_field { - bts_from, - bts_to, - bts_flags, - - bts_qual = bts_from, - bts_clock = bts_to, - bts_pid = bts_flags, - - bts_qual_mask = (bts_qual_max - 1), - bts_escape = ((unsigned long)-1 & ~bts_qual_mask) -}; - -static inline unsigned long bts_get(const char *base, unsigned long field) -{ - base += (ds_cfg.sizeof_ptr_field * field); - return *(unsigned long *)base; -} - -static inline void bts_set(char *base, unsigned long field, unsigned long val) -{ - base += (ds_cfg.sizeof_ptr_field * field); - (*(unsigned long *)base) = val; -} - - -/* - * The raw BTS data is architecture dependent. - * - * For higher-level users, we give an arch-independent view. - * - ds.h defines struct bts_struct - * - bts_read translates one raw bts record into a bts_struct - * - bts_write translates one bts_struct into the raw format and - * writes it into the top of the parameter tracer's buffer. - * - * return: bytes read/written on success; -Eerrno, otherwise - */ -static int -bts_read(struct bts_tracer *tracer, const void *at, struct bts_struct *out) -{ - if (!tracer) - return -EINVAL; - - if (at < tracer->trace.ds.begin) - return -EINVAL; - - if (tracer->trace.ds.end < (at + tracer->trace.ds.size)) - return -EINVAL; - - memset(out, 0, sizeof(*out)); - if ((bts_get(at, bts_qual) & ~bts_qual_mask) == bts_escape) { - out->qualifier = (bts_get(at, bts_qual) & bts_qual_mask); - out->variant.event.clock = bts_get(at, bts_clock); - out->variant.event.pid = bts_get(at, bts_pid); - } else { - out->qualifier = bts_branch; - out->variant.lbr.from = bts_get(at, bts_from); - out->variant.lbr.to = bts_get(at, bts_to); - - if (!out->variant.lbr.from && !out->variant.lbr.to) - out->qualifier = bts_invalid; - } - - return ds_cfg.sizeof_rec[ds_bts]; -} - -static int bts_write(struct bts_tracer *tracer, const struct bts_struct *in) -{ - unsigned char raw[MAX_SIZEOF_BTS]; - - if (!tracer) - return -EINVAL; - - if (MAX_SIZEOF_BTS < ds_cfg.sizeof_rec[ds_bts]) - return -EOVERFLOW; - - switch (in->qualifier) { - case bts_invalid: - bts_set(raw, bts_from, 0); - bts_set(raw, bts_to, 0); - bts_set(raw, bts_flags, 0); - break; - case bts_branch: - bts_set(raw, bts_from, in->variant.lbr.from); - bts_set(raw, bts_to, in->variant.lbr.to); - bts_set(raw, bts_flags, 0); - break; - case bts_task_arrives: - case bts_task_departs: - bts_set(raw, bts_qual, (bts_escape | in->qualifier)); - bts_set(raw, bts_clock, in->variant.event.clock); - bts_set(raw, bts_pid, in->variant.event.pid); - break; - default: - return -EINVAL; - } - - return ds_write(tracer->ds.context, ds_bts, raw, - ds_cfg.sizeof_rec[ds_bts]); -} - - -static void ds_write_config(struct ds_context *context, - struct ds_trace *cfg, enum ds_qualifier qual) -{ - unsigned char *ds = context->ds; - - ds_set(ds, qual, ds_buffer_base, (unsigned long)cfg->begin); - ds_set(ds, qual, ds_index, (unsigned long)cfg->top); - ds_set(ds, qual, ds_absolute_maximum, (unsigned long)cfg->end); - ds_set(ds, qual, ds_interrupt_threshold, (unsigned long)cfg->ith); -} - -static void ds_read_config(struct ds_context *context, - struct ds_trace *cfg, enum ds_qualifier qual) -{ - unsigned char *ds = context->ds; - - cfg->begin = (void *)ds_get(ds, qual, ds_buffer_base); - cfg->top = (void *)ds_get(ds, qual, ds_index); - cfg->end = (void *)ds_get(ds, qual, ds_absolute_maximum); - cfg->ith = (void *)ds_get(ds, qual, ds_interrupt_threshold); -} - -static void ds_init_ds_trace(struct ds_trace *trace, enum ds_qualifier qual, - void *base, size_t size, size_t ith, - unsigned int flags) { - unsigned long buffer, adj; - - /* - * Adjust the buffer address and size to meet alignment - * constraints: - * - buffer is double-word aligned - * - size is multiple of record size - * - * We checked the size at the very beginning; we have enough - * space to do the adjustment. - */ - buffer = (unsigned long)base; - - adj = ALIGN(buffer, DS_ALIGNMENT) - buffer; - buffer += adj; - size -= adj; - - trace->n = size / ds_cfg.sizeof_rec[qual]; - trace->size = ds_cfg.sizeof_rec[qual]; - - size = (trace->n * trace->size); - - trace->begin = (void *)buffer; - trace->top = trace->begin; - trace->end = (void *)(buffer + size); - /* - * The value for 'no threshold' is -1, which will set the - * threshold outside of the buffer, just like we want it. - */ - ith *= ds_cfg.sizeof_rec[qual]; - trace->ith = (void *)(buffer + size - ith); - - trace->flags = flags; -} - - -static int ds_request(struct ds_tracer *tracer, struct ds_trace *trace, - enum ds_qualifier qual, struct task_struct *task, - int cpu, void *base, size_t size, size_t th) -{ - struct ds_context *context; - int error; - size_t req_size; - - error = -EOPNOTSUPP; - if (!ds_cfg.sizeof_rec[qual]) - goto out; - - error = -EINVAL; - if (!base) - goto out; - - req_size = ds_cfg.sizeof_rec[qual]; - /* We might need space for alignment adjustments. */ - if (!IS_ALIGNED((unsigned long)base, DS_ALIGNMENT)) - req_size += DS_ALIGNMENT; - - error = -EINVAL; - if (size < req_size) - goto out; - - if (th != (size_t)-1) { - th *= ds_cfg.sizeof_rec[qual]; - - error = -EINVAL; - if (size <= th) - goto out; - } - - tracer->buffer = base; - tracer->size = size; - - error = -ENOMEM; - context = ds_get_context(task, cpu); - if (!context) - goto out; - tracer->context = context; - - /* - * Defer any tracer-specific initialization work for the context until - * context ownership has been clarified. - */ - - error = 0; - out: - return error; -} - -static struct bts_tracer *ds_request_bts(struct task_struct *task, int cpu, - void *base, size_t size, - bts_ovfl_callback_t ovfl, size_t th, - unsigned int flags) -{ - struct bts_tracer *tracer; - int error; - - /* Buffer overflow notification is not yet implemented. */ - error = -EOPNOTSUPP; - if (ovfl) - goto out; - - error = get_tracer(task); - if (error < 0) - goto out; - - error = -ENOMEM; - tracer = kzalloc(sizeof(*tracer), GFP_KERNEL); - if (!tracer) - goto out_put_tracer; - tracer->ovfl = ovfl; - - /* Do some more error checking and acquire a tracing context. */ - error = ds_request(&tracer->ds, &tracer->trace.ds, - ds_bts, task, cpu, base, size, th); - if (error < 0) - goto out_tracer; - - /* Claim the bts part of the tracing context we acquired above. */ - spin_lock_irq(&ds_lock); - - error = -EPERM; - if (tracer->ds.context->bts_master) - goto out_unlock; - tracer->ds.context->bts_master = tracer; - - spin_unlock_irq(&ds_lock); - - /* - * Now that we own the bts part of the context, let's complete the - * initialization for that part. - */ - ds_init_ds_trace(&tracer->trace.ds, ds_bts, base, size, th, flags); - ds_write_config(tracer->ds.context, &tracer->trace.ds, ds_bts); - ds_install_ds_area(tracer->ds.context); - - tracer->trace.read = bts_read; - tracer->trace.write = bts_write; - - /* Start tracing. */ - ds_resume_bts(tracer); - - return tracer; - - out_unlock: - spin_unlock_irq(&ds_lock); - ds_put_context(tracer->ds.context); - out_tracer: - kfree(tracer); - out_put_tracer: - put_tracer(task); - out: - return ERR_PTR(error); -} - -struct bts_tracer *ds_request_bts_task(struct task_struct *task, - void *base, size_t size, - bts_ovfl_callback_t ovfl, - size_t th, unsigned int flags) -{ - return ds_request_bts(task, 0, base, size, ovfl, th, flags); -} - -struct bts_tracer *ds_request_bts_cpu(int cpu, void *base, size_t size, - bts_ovfl_callback_t ovfl, - size_t th, unsigned int flags) -{ - return ds_request_bts(NULL, cpu, base, size, ovfl, th, flags); -} - -static struct pebs_tracer *ds_request_pebs(struct task_struct *task, int cpu, - void *base, size_t size, - pebs_ovfl_callback_t ovfl, size_t th, - unsigned int flags) -{ - struct pebs_tracer *tracer; - int error; - - /* Buffer overflow notification is not yet implemented. */ - error = -EOPNOTSUPP; - if (ovfl) - goto out; - - error = get_tracer(task); - if (error < 0) - goto out; - - error = -ENOMEM; - tracer = kzalloc(sizeof(*tracer), GFP_KERNEL); - if (!tracer) - goto out_put_tracer; - tracer->ovfl = ovfl; - - /* Do some more error checking and acquire a tracing context. */ - error = ds_request(&tracer->ds, &tracer->trace.ds, - ds_pebs, task, cpu, base, size, th); - if (error < 0) - goto out_tracer; - - /* Claim the pebs part of the tracing context we acquired above. */ - spin_lock_irq(&ds_lock); - - error = -EPERM; - if (tracer->ds.context->pebs_master) - goto out_unlock; - tracer->ds.context->pebs_master = tracer; - - spin_unlock_irq(&ds_lock); - - /* - * Now that we own the pebs part of the context, let's complete the - * initialization for that part. - */ - ds_init_ds_trace(&tracer->trace.ds, ds_pebs, base, size, th, flags); - ds_write_config(tracer->ds.context, &tracer->trace.ds, ds_pebs); - ds_install_ds_area(tracer->ds.context); - - /* Start tracing. */ - ds_resume_pebs(tracer); - - return tracer; - - out_unlock: - spin_unlock_irq(&ds_lock); - ds_put_context(tracer->ds.context); - out_tracer: - kfree(tracer); - out_put_tracer: - put_tracer(task); - out: - return ERR_PTR(error); -} - -struct pebs_tracer *ds_request_pebs_task(struct task_struct *task, - void *base, size_t size, - pebs_ovfl_callback_t ovfl, - size_t th, unsigned int flags) -{ - return ds_request_pebs(task, 0, base, size, ovfl, th, flags); -} - -struct pebs_tracer *ds_request_pebs_cpu(int cpu, void *base, size_t size, - pebs_ovfl_callback_t ovfl, - size_t th, unsigned int flags) -{ - return ds_request_pebs(NULL, cpu, base, size, ovfl, th, flags); -} - -static void ds_free_bts(struct bts_tracer *tracer) -{ - struct task_struct *task; - - task = tracer->ds.context->task; - - WARN_ON_ONCE(tracer->ds.context->bts_master != tracer); - tracer->ds.context->bts_master = NULL; - - /* Make sure tracing stopped and the tracer is not in use. */ - if (task && (task != current)) - wait_task_context_switch(task); - - ds_put_context(tracer->ds.context); - put_tracer(task); - - kfree(tracer); -} - -void ds_release_bts(struct bts_tracer *tracer) -{ - might_sleep(); - - if (!tracer) - return; - - ds_suspend_bts(tracer); - ds_free_bts(tracer); -} - -int ds_release_bts_noirq(struct bts_tracer *tracer) -{ - struct task_struct *task; - unsigned long irq; - int error; - - if (!tracer) - return 0; - - task = tracer->ds.context->task; - - local_irq_save(irq); - - error = -EPERM; - if (!task && - (tracer->ds.context->cpu != smp_processor_id())) - goto out; - - error = -EPERM; - if (task && (task != current)) - goto out; - - ds_suspend_bts_noirq(tracer); - ds_free_bts(tracer); - - error = 0; - out: - local_irq_restore(irq); - return error; -} - -static void update_task_debugctlmsr(struct task_struct *task, - unsigned long debugctlmsr) -{ - task->thread.debugctlmsr = debugctlmsr; - - get_cpu(); - if (task == current) - update_debugctlmsr(debugctlmsr); - put_cpu(); -} - -void ds_suspend_bts(struct bts_tracer *tracer) -{ - struct task_struct *task; - unsigned long debugctlmsr; - int cpu; - - if (!tracer) - return; - - tracer->flags = 0; - - task = tracer->ds.context->task; - cpu = tracer->ds.context->cpu; - - WARN_ON(!task && irqs_disabled()); - - debugctlmsr = (task ? - task->thread.debugctlmsr : - get_debugctlmsr_on_cpu(cpu)); - debugctlmsr &= ~BTS_CONTROL; - - if (task) - update_task_debugctlmsr(task, debugctlmsr); - else - update_debugctlmsr_on_cpu(cpu, debugctlmsr); -} - -int ds_suspend_bts_noirq(struct bts_tracer *tracer) -{ - struct task_struct *task; - unsigned long debugctlmsr, irq; - int cpu, error = 0; - - if (!tracer) - return 0; - - tracer->flags = 0; - - task = tracer->ds.context->task; - cpu = tracer->ds.context->cpu; - - local_irq_save(irq); - - error = -EPERM; - if (!task && (cpu != smp_processor_id())) - goto out; - - debugctlmsr = (task ? - task->thread.debugctlmsr : - get_debugctlmsr()); - debugctlmsr &= ~BTS_CONTROL; - - if (task) - update_task_debugctlmsr(task, debugctlmsr); - else - update_debugctlmsr(debugctlmsr); - - error = 0; - out: - local_irq_restore(irq); - return error; -} - -static unsigned long ds_bts_control(struct bts_tracer *tracer) -{ - unsigned long control; - - control = ds_cfg.ctl[dsf_bts]; - if (!(tracer->trace.ds.flags & BTS_KERNEL)) - control |= ds_cfg.ctl[dsf_bts_kernel]; - if (!(tracer->trace.ds.flags & BTS_USER)) - control |= ds_cfg.ctl[dsf_bts_user]; - - return control; -} - -void ds_resume_bts(struct bts_tracer *tracer) -{ - struct task_struct *task; - unsigned long debugctlmsr; - int cpu; - - if (!tracer) - return; - - tracer->flags = tracer->trace.ds.flags; - - task = tracer->ds.context->task; - cpu = tracer->ds.context->cpu; - - WARN_ON(!task && irqs_disabled()); - - debugctlmsr = (task ? - task->thread.debugctlmsr : - get_debugctlmsr_on_cpu(cpu)); - debugctlmsr |= ds_bts_control(tracer); - - if (task) - update_task_debugctlmsr(task, debugctlmsr); - else - update_debugctlmsr_on_cpu(cpu, debugctlmsr); -} - -int ds_resume_bts_noirq(struct bts_tracer *tracer) -{ - struct task_struct *task; - unsigned long debugctlmsr, irq; - int cpu, error = 0; - - if (!tracer) - return 0; - - tracer->flags = tracer->trace.ds.flags; - - task = tracer->ds.context->task; - cpu = tracer->ds.context->cpu; - - local_irq_save(irq); - - error = -EPERM; - if (!task && (cpu != smp_processor_id())) - goto out; - - debugctlmsr = (task ? - task->thread.debugctlmsr : - get_debugctlmsr()); - debugctlmsr |= ds_bts_control(tracer); - - if (task) - update_task_debugctlmsr(task, debugctlmsr); - else - update_debugctlmsr(debugctlmsr); - - error = 0; - out: - local_irq_restore(irq); - return error; -} - -static void ds_free_pebs(struct pebs_tracer *tracer) -{ - struct task_struct *task; - - task = tracer->ds.context->task; - - WARN_ON_ONCE(tracer->ds.context->pebs_master != tracer); - tracer->ds.context->pebs_master = NULL; - - ds_put_context(tracer->ds.context); - put_tracer(task); - - kfree(tracer); -} - -void ds_release_pebs(struct pebs_tracer *tracer) -{ - might_sleep(); - - if (!tracer) - return; - - ds_suspend_pebs(tracer); - ds_free_pebs(tracer); -} - -int ds_release_pebs_noirq(struct pebs_tracer *tracer) -{ - struct task_struct *task; - unsigned long irq; - int error; - - if (!tracer) - return 0; - - task = tracer->ds.context->task; - - local_irq_save(irq); - - error = -EPERM; - if (!task && - (tracer->ds.context->cpu != smp_processor_id())) - goto out; - - error = -EPERM; - if (task && (task != current)) - goto out; - - ds_suspend_pebs_noirq(tracer); - ds_free_pebs(tracer); - - error = 0; - out: - local_irq_restore(irq); - return error; -} - -void ds_suspend_pebs(struct pebs_tracer *tracer) -{ - -} - -int ds_suspend_pebs_noirq(struct pebs_tracer *tracer) -{ - return 0; -} - -void ds_resume_pebs(struct pebs_tracer *tracer) -{ - -} - -int ds_resume_pebs_noirq(struct pebs_tracer *tracer) -{ - return 0; -} - -const struct bts_trace *ds_read_bts(struct bts_tracer *tracer) -{ - if (!tracer) - return NULL; - - ds_read_config(tracer->ds.context, &tracer->trace.ds, ds_bts); - return &tracer->trace; -} - -const struct pebs_trace *ds_read_pebs(struct pebs_tracer *tracer) -{ - if (!tracer) - return NULL; - - ds_read_config(tracer->ds.context, &tracer->trace.ds, ds_pebs); - - tracer->trace.counters = ds_cfg.nr_counter_reset; - memcpy(tracer->trace.counter_reset, - tracer->ds.context->ds + - (NUM_DS_PTR_FIELDS * ds_cfg.sizeof_ptr_field), - ds_cfg.nr_counter_reset * PEBS_RESET_FIELD_SIZE); - - return &tracer->trace; -} - -int ds_reset_bts(struct bts_tracer *tracer) -{ - if (!tracer) - return -EINVAL; - - tracer->trace.ds.top = tracer->trace.ds.begin; - - ds_set(tracer->ds.context->ds, ds_bts, ds_index, - (unsigned long)tracer->trace.ds.top); - - return 0; -} - -int ds_reset_pebs(struct pebs_tracer *tracer) -{ - if (!tracer) - return -EINVAL; - - tracer->trace.ds.top = tracer->trace.ds.begin; - - ds_set(tracer->ds.context->ds, ds_pebs, ds_index, - (unsigned long)tracer->trace.ds.top); - - return 0; -} - -int ds_set_pebs_reset(struct pebs_tracer *tracer, - unsigned int counter, u64 value) -{ - if (!tracer) - return -EINVAL; - - if (ds_cfg.nr_counter_reset < counter) - return -EINVAL; - - *(u64 *)(tracer->ds.context->ds + - (NUM_DS_PTR_FIELDS * ds_cfg.sizeof_ptr_field) + - (counter * PEBS_RESET_FIELD_SIZE)) = value; - - return 0; -} - -static const struct ds_configuration ds_cfg_netburst = { - .name = "Netburst", - .ctl[dsf_bts] = (1 << 2) | (1 << 3), - .ctl[dsf_bts_kernel] = (1 << 5), - .ctl[dsf_bts_user] = (1 << 6), - .nr_counter_reset = 1, -}; -static const struct ds_configuration ds_cfg_pentium_m = { - .name = "Pentium M", - .ctl[dsf_bts] = (1 << 6) | (1 << 7), - .nr_counter_reset = 1, -}; -static const struct ds_configuration ds_cfg_core2_atom = { - .name = "Core 2/Atom", - .ctl[dsf_bts] = (1 << 6) | (1 << 7), - .ctl[dsf_bts_kernel] = (1 << 9), - .ctl[dsf_bts_user] = (1 << 10), - .nr_counter_reset = 1, -}; -static const struct ds_configuration ds_cfg_core_i7 = { - .name = "Core i7", - .ctl[dsf_bts] = (1 << 6) | (1 << 7), - .ctl[dsf_bts_kernel] = (1 << 9), - .ctl[dsf_bts_user] = (1 << 10), - .nr_counter_reset = 4, -}; - -static void -ds_configure(const struct ds_configuration *cfg, - struct cpuinfo_x86 *cpu) -{ - unsigned long nr_pebs_fields = 0; - - printk(KERN_INFO "[ds] using %s configuration\n", cfg->name); - -#ifdef __i386__ - nr_pebs_fields = 10; -#else - nr_pebs_fields = 18; -#endif - - /* - * Starting with version 2, architectural performance - * monitoring supports a format specifier. - */ - if ((cpuid_eax(0xa) & 0xff) > 1) { - unsigned long perf_capabilities, format; - - rdmsrl(MSR_IA32_PERF_CAPABILITIES, perf_capabilities); - - format = (perf_capabilities >> 8) & 0xf; - - switch (format) { - case 0: - nr_pebs_fields = 18; - break; - case 1: - nr_pebs_fields = 22; - break; - default: - printk(KERN_INFO - "[ds] unknown PEBS format: %lu\n", format); - nr_pebs_fields = 0; - break; - } - } - - memset(&ds_cfg, 0, sizeof(ds_cfg)); - ds_cfg = *cfg; - - ds_cfg.sizeof_ptr_field = - (cpu_has(cpu, X86_FEATURE_DTES64) ? 8 : 4); - - ds_cfg.sizeof_rec[ds_bts] = ds_cfg.sizeof_ptr_field * 3; - ds_cfg.sizeof_rec[ds_pebs] = ds_cfg.sizeof_ptr_field * nr_pebs_fields; - - if (!cpu_has(cpu, X86_FEATURE_BTS)) { - ds_cfg.sizeof_rec[ds_bts] = 0; - printk(KERN_INFO "[ds] bts not available\n"); - } - if (!cpu_has(cpu, X86_FEATURE_PEBS)) { - ds_cfg.sizeof_rec[ds_pebs] = 0; - printk(KERN_INFO "[ds] pebs not available\n"); - } - - printk(KERN_INFO "[ds] sizes: address: %u bit, ", - 8 * ds_cfg.sizeof_ptr_field); - printk("bts/pebs record: %u/%u bytes\n", - ds_cfg.sizeof_rec[ds_bts], ds_cfg.sizeof_rec[ds_pebs]); - - WARN_ON_ONCE(MAX_PEBS_COUNTERS < ds_cfg.nr_counter_reset); -} - -void __cpuinit ds_init_intel(struct cpuinfo_x86 *c) -{ - /* Only configure the first cpu. Others are identical. */ - if (ds_cfg.name) - return; - - switch (c->x86) { - case 0x6: - switch (c->x86_model) { - case 0x9: - case 0xd: /* Pentium M */ - ds_configure(&ds_cfg_pentium_m, c); - break; - case 0xf: - case 0x17: /* Core2 */ - case 0x1c: /* Atom */ - ds_configure(&ds_cfg_core2_atom, c); - break; - case 0x1a: /* Core i7 */ - ds_configure(&ds_cfg_core_i7, c); - break; - default: - /* Sorry, don't know about them. */ - break; - } - break; - case 0xf: - switch (c->x86_model) { - case 0x0: - case 0x1: - case 0x2: /* Netburst */ - ds_configure(&ds_cfg_netburst, c); - break; - default: - /* Sorry, don't know about them. */ - break; - } - break; - default: - /* Sorry, don't know about them. */ - break; - } -} - -static inline void ds_take_timestamp(struct ds_context *context, - enum bts_qualifier qualifier, - struct task_struct *task) -{ - struct bts_tracer *tracer = context->bts_master; - struct bts_struct ts; - - /* Prevent compilers from reading the tracer pointer twice. */ - barrier(); - - if (!tracer || !(tracer->flags & BTS_TIMESTAMPS)) - return; - - memset(&ts, 0, sizeof(ts)); - ts.qualifier = qualifier; - ts.variant.event.clock = trace_clock_global(); - ts.variant.event.pid = task->pid; - - bts_write(tracer, &ts); -} - -/* - * Change the DS configuration from tracing prev to tracing next. - */ -void ds_switch_to(struct task_struct *prev, struct task_struct *next) -{ - struct ds_context *prev_ctx = prev->thread.ds_ctx; - struct ds_context *next_ctx = next->thread.ds_ctx; - unsigned long debugctlmsr = next->thread.debugctlmsr; - - /* Make sure all data is read before we start. */ - barrier(); - - if (prev_ctx) { - update_debugctlmsr(0); - - ds_take_timestamp(prev_ctx, bts_task_departs, prev); - } - - if (next_ctx) { - ds_take_timestamp(next_ctx, bts_task_arrives, next); - - wrmsrl(MSR_IA32_DS_AREA, (unsigned long)next_ctx->ds); - } - - update_debugctlmsr(debugctlmsr); -} - -static __init int ds_selftest(void) -{ - if (ds_cfg.sizeof_rec[ds_bts]) { - int error; - - error = ds_selftest_bts(); - if (error) { - WARN(1, "[ds] selftest failed. disabling bts.\n"); - ds_cfg.sizeof_rec[ds_bts] = 0; - } - } - - if (ds_cfg.sizeof_rec[ds_pebs]) { - int error; - - error = ds_selftest_pebs(); - if (error) { - WARN(1, "[ds] selftest failed. disabling pebs.\n"); - ds_cfg.sizeof_rec[ds_pebs] = 0; - } - } - - return 0; -} -device_initcall(ds_selftest); diff --git a/arch/x86/kernel/ds_selftest.c b/arch/x86/kernel/ds_selftest.c deleted file mode 100644 index 6bc7c199ab99..000000000000 --- a/arch/x86/kernel/ds_selftest.c +++ /dev/null @@ -1,408 +0,0 @@ -/* - * Debug Store support - selftest - * - * - * Copyright (C) 2009 Intel Corporation. - * Markus Metzger <markus.t.metzger@intel.com>, 2009 - */ - -#include "ds_selftest.h" - -#include <linux/kernel.h> -#include <linux/string.h> -#include <linux/smp.h> -#include <linux/cpu.h> - -#include <asm/ds.h> - - -#define BUFFER_SIZE 521 /* Intentionally chose an odd size. */ -#define SMALL_BUFFER_SIZE 24 /* A single bts entry. */ - -struct ds_selftest_bts_conf { - struct bts_tracer *tracer; - int error; - int (*suspend)(struct bts_tracer *); - int (*resume)(struct bts_tracer *); -}; - -static int ds_selftest_bts_consistency(const struct bts_trace *trace) -{ - int error = 0; - - if (!trace) { - printk(KERN_CONT "failed to access trace..."); - /* Bail out. Other tests are pointless. */ - return -1; - } - - if (!trace->read) { - printk(KERN_CONT "bts read not available..."); - error = -1; - } - - /* Do some sanity checks on the trace configuration. */ - if (!trace->ds.n) { - printk(KERN_CONT "empty bts buffer..."); - error = -1; - } - if (!trace->ds.size) { - printk(KERN_CONT "bad bts trace setup..."); - error = -1; - } - if (trace->ds.end != - (char *)trace->ds.begin + (trace->ds.n * trace->ds.size)) { - printk(KERN_CONT "bad bts buffer setup..."); - error = -1; - } - /* - * We allow top in [begin; end], since its not clear when the - * overflow adjustment happens: after the increment or before the - * write. - */ - if ((trace->ds.top < trace->ds.begin) || - (trace->ds.end < trace->ds.top)) { - printk(KERN_CONT "bts top out of bounds..."); - error = -1; - } - - return error; -} - -static int ds_selftest_bts_read(struct bts_tracer *tracer, - const struct bts_trace *trace, - const void *from, const void *to) -{ - const unsigned char *at; - - /* - * Check a few things which do not belong to this test. - * They should be covered by other tests. - */ - if (!trace) - return -1; - - if (!trace->read) - return -1; - - if (to < from) - return -1; - - if (from < trace->ds.begin) - return -1; - - if (trace->ds.end < to) - return -1; - - if (!trace->ds.size) - return -1; - - /* Now to the test itself. */ - for (at = from; (void *)at < to; at += trace->ds.size) { - struct bts_struct bts; - unsigned long index; - int error; - - if (((void *)at - trace->ds.begin) % trace->ds.size) { - printk(KERN_CONT - "read from non-integer index..."); - return -1; - } - index = ((void *)at - trace->ds.begin) / trace->ds.size; - - memset(&bts, 0, sizeof(bts)); - error = trace->read(tracer, at, &bts); - if (error < 0) { - printk(KERN_CONT - "error reading bts trace at [%lu] (0x%p)...", - index, at); - return error; - } - - switch (bts.qualifier) { - case BTS_BRANCH: - break; - default: - printk(KERN_CONT - "unexpected bts entry %llu at [%lu] (0x%p)...", - bts.qualifier, index, at); - return -1; - } - } - - return 0; -} - -static void ds_selftest_bts_cpu(void *arg) -{ - struct ds_selftest_bts_conf *conf = arg; - const struct bts_trace *trace; - void *top; - - if (IS_ERR(conf->tracer)) { - conf->error = PTR_ERR(conf->tracer); - conf->tracer = NULL; - - printk(KERN_CONT - "initialization failed (err: %d)...", conf->error); - return; - } - - /* We should meanwhile have enough trace. */ - conf->error = conf->suspend(conf->tracer); - if (conf->error < 0) - return; - - /* Let's see if we can access the trace. */ - trace = ds_read_bts(conf->tracer); - - conf->error = ds_selftest_bts_consistency(trace); - if (conf->error < 0) - return; - - /* If everything went well, we should have a few trace entries. */ - if (trace->ds.top == trace->ds.begin) { - /* - * It is possible but highly unlikely that we got a - * buffer overflow and end up at exactly the same - * position we started from. - * Let's issue a warning, but continue. - */ - printk(KERN_CONT "no trace/overflow..."); - } - - /* Let's try to read the trace we collected. */ - conf->error = - ds_selftest_bts_read(conf->tracer, trace, - trace->ds.begin, trace->ds.top); - if (conf->error < 0) - return; - - /* - * Let's read the trace again. - * Since we suspended tracing, we should get the same result. - */ - top = trace->ds.top; - - trace = ds_read_bts(conf->tracer); - conf->error = ds_selftest_bts_consistency(trace); - if (conf->error < 0) - return; - - if (top != trace->ds.top) { - printk(KERN_CONT "suspend not working..."); - conf->error = -1; - return; - } - - /* Let's collect some more trace - see if resume is working. */ - conf->error = conf->resume(conf->tracer); - if (conf->error < 0) - return; - - conf->error = conf->suspend(conf->tracer); - if (conf->error < 0) - return; - - trace = ds_read_bts(conf->tracer); - - conf->error = ds_selftest_bts_consistency(trace); - if (conf->error < 0) - return; - - if (trace->ds.top == top) { - /* - * It is possible but highly unlikely that we got a - * buffer overflow and end up at exactly the same - * position we started from. - * Let's issue a warning and check the full trace. - */ - printk(KERN_CONT - "no resume progress/overflow..."); - - conf->error = - ds_selftest_bts_read(conf->tracer, trace, - trace->ds.begin, trace->ds.end); - } else if (trace->ds.top < top) { - /* - * We had a buffer overflow - the entire buffer should - * contain trace records. - */ - conf->error = - ds_selftest_bts_read(conf->tracer, trace, - trace->ds.begin, trace->ds.end); - } else { - /* - * It is quite likely that the buffer did not overflow. - * Let's just check the delta trace. - */ - conf->error = - ds_selftest_bts_read(conf->tracer, trace, top, - trace->ds.top); - } - if (conf->error < 0) - return; - - conf->error = 0; -} - -static int ds_suspend_bts_wrap(struct bts_tracer *tracer) -{ - ds_suspend_bts(tracer); - return 0; -} - -static int ds_resume_bts_wrap(struct bts_tracer *tracer) -{ - ds_resume_bts(tracer); - return 0; -} - -static void ds_release_bts_noirq_wrap(void *tracer) -{ - (void)ds_release_bts_noirq(tracer); -} - -static int ds_selftest_bts_bad_release_noirq(int cpu, - struct bts_tracer *tracer) -{ - int error = -EPERM; - - /* Try to release the tracer on the wrong cpu. */ - get_cpu(); - if (cpu != smp_processor_id()) { - error = ds_release_bts_noirq(tracer); - if (error != -EPERM) - printk(KERN_CONT "release on wrong cpu..."); - } - put_cpu(); - - return error ? 0 : -1; -} - -static int ds_selftest_bts_bad_request_cpu(int cpu, void *buffer) -{ - struct bts_tracer *tracer; - int error; - - /* Try to request cpu tracing while task tracing is active. */ - tracer = ds_request_bts_cpu(cpu, buffer, BUFFER_SIZE, NULL, - (size_t)-1, BTS_KERNEL); - error = PTR_ERR(tracer); - if (!IS_ERR(tracer)) { - ds_release_bts(tracer); - error = 0; - } - - if (error != -EPERM) - printk(KERN_CONT "cpu/task tracing overlap..."); - - return error ? 0 : -1; -} - -static int ds_selftest_bts_bad_request_task(void *buffer) -{ - struct bts_tracer *tracer; - int error; - - /* Try to request cpu tracing while task tracing is active. */ - tracer = ds_request_bts_task(current, buffer, BUFFER_SIZE, NULL, - (size_t)-1, BTS_KERNEL); - error = PTR_ERR(tracer); - if (!IS_ERR(tracer)) { - error = 0; - ds_release_bts(tracer); - } - - if (error != -EPERM) - printk(KERN_CONT "task/cpu tracing overlap..."); - - return error ? 0 : -1; -} - -int ds_selftest_bts(void) -{ - struct ds_selftest_bts_conf conf; - unsigned char buffer[BUFFER_SIZE], *small_buffer; - unsigned long irq; - int cpu; - - printk(KERN_INFO "[ds] bts selftest..."); - conf.error = 0; - - small_buffer = (unsigned char *)ALIGN((unsigned long)buffer, 8) + 8; - - get_online_cpus(); - for_each_online_cpu(cpu) { - conf.suspend = ds_suspend_bts_wrap; - conf.resume = ds_resume_bts_wrap; - conf.tracer = - ds_request_bts_cpu(cpu, buffer, BUFFER_SIZE, - NULL, (size_t)-1, BTS_KERNEL); - ds_selftest_bts_cpu(&conf); - if (conf.error >= 0) - conf.error = ds_selftest_bts_bad_request_task(buffer); - ds_release_bts(conf.tracer); - if (conf.error < 0) - goto out; - - conf.suspend = ds_suspend_bts_noirq; - conf.resume = ds_resume_bts_noirq; - conf.tracer = - ds_request_bts_cpu(cpu, buffer, BUFFER_SIZE, - NULL, (size_t)-1, BTS_KERNEL); - smp_call_function_single(cpu, ds_selftest_bts_cpu, &conf, 1); - if (conf.error >= 0) { - conf.error = - ds_selftest_bts_bad_release_noirq(cpu, - conf.tracer); - /* We must not release the tracer twice. */ - if (conf.error < 0) - conf.tracer = NULL; - } - if (conf.error >= 0) - conf.error = ds_selftest_bts_bad_request_task(buffer); - smp_call_function_single(cpu, ds_release_bts_noirq_wrap, - conf.tracer, 1); - if (conf.error < 0) - goto out; - } - - conf.suspend = ds_suspend_bts_wrap; - conf.resume = ds_resume_bts_wrap; - conf.tracer = - ds_request_bts_task(current, buffer, BUFFER_SIZE, - NULL, (size_t)-1, BTS_KERNEL); - ds_selftest_bts_cpu(&conf); - if (conf.error >= 0) - conf.error = ds_selftest_bts_bad_request_cpu(0, buffer); - ds_release_bts(conf.tracer); - if (conf.error < 0) - goto out; - - conf.suspend = ds_suspend_bts_noirq; - conf.resume = ds_resume_bts_noirq; - conf.tracer = - ds_request_bts_task(current, small_buffer, SMALL_BUFFER_SIZE, - NULL, (size_t)-1, BTS_KERNEL); - local_irq_save(irq); - ds_selftest_bts_cpu(&conf); - if (conf.error >= 0) - conf.error = ds_selftest_bts_bad_request_cpu(0, buffer); - ds_release_bts_noirq(conf.tracer); - local_irq_restore(irq); - if (conf.error < 0) - goto out; - - conf.error = 0; - out: - put_online_cpus(); - printk(KERN_CONT "%s.\n", (conf.error ? "failed" : "passed")); - - return conf.error; -} - -int ds_selftest_pebs(void) -{ - return 0; -} diff --git a/arch/x86/kernel/ds_selftest.h b/arch/x86/kernel/ds_selftest.h deleted file mode 100644 index 2ba8745c6663..000000000000 --- a/arch/x86/kernel/ds_selftest.h +++ /dev/null @@ -1,15 +0,0 @@ -/* - * Debug Store support - selftest - * - * - * Copyright (C) 2009 Intel Corporation. - * Markus Metzger <markus.t.metzger@intel.com>, 2009 - */ - -#ifdef CONFIG_X86_DS_SELFTEST -extern int ds_selftest_bts(void); -extern int ds_selftest_pebs(void); -#else -static inline int ds_selftest_bts(void) { return 0; } -static inline int ds_selftest_pebs(void) { return 0; } -#endif diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c index 6d817554780a..6e8752c1bd52 100644 --- a/arch/x86/kernel/dumpstack.c +++ b/arch/x86/kernel/dumpstack.c @@ -18,7 +18,6 @@ #include <asm/stacktrace.h> -#include "dumpstack.h" int panic_on_unrecovered_nmi; int panic_on_io_nmi; @@ -224,11 +223,6 @@ unsigned __kprobes long oops_begin(void) int cpu; unsigned long flags; - /* notify the hw-branch tracer so it may disable tracing and - add the last trace to the trace buffer - - the earlier this happens, the more useful the trace. */ - trace_hw_branch_oops(); - oops_enter(); /* racy, but better than risking deadlock. */ diff --git a/arch/x86/kernel/dumpstack.h b/arch/x86/kernel/dumpstack.h deleted file mode 100644 index e1a93be4fd44..000000000000 --- a/arch/x86/kernel/dumpstack.h +++ /dev/null @@ -1,56 +0,0 @@ -/* - * Copyright (C) 1991, 1992 Linus Torvalds - * Copyright (C) 2000, 2001, 2002 Andi Kleen, SuSE Labs - */ - -#ifndef DUMPSTACK_H -#define DUMPSTACK_H - -#ifdef CONFIG_X86_32 -#define STACKSLOTS_PER_LINE 8 -#define get_bp(bp) asm("movl %%ebp, %0" : "=r" (bp) :) -#else -#define STACKSLOTS_PER_LINE 4 -#define get_bp(bp) asm("movq %%rbp, %0" : "=r" (bp) :) -#endif - -#include <linux/uaccess.h> - -extern void -show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs, - unsigned long *stack, unsigned long bp, char *log_lvl); - -extern void -show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs, - unsigned long *sp, unsigned long bp, char *log_lvl); - -extern unsigned int code_bytes; - -/* The form of the top of the frame on the stack */ -struct stack_frame { - struct stack_frame *next_frame; - unsigned long return_address; -}; - -struct stack_frame_ia32 { - u32 next_frame; - u32 return_address; -}; - -static inline unsigned long rewind_frame_pointer(int n) -{ - struct stack_frame *frame; - - get_bp(frame); - -#ifdef CONFIG_FRAME_POINTER - while (n--) { - if (probe_kernel_address(&frame->next_frame, frame)) - break; - } -#endif - - return (unsigned long)frame; -} - -#endif /* DUMPSTACK_H */ diff --git a/arch/x86/kernel/dumpstack_32.c b/arch/x86/kernel/dumpstack_32.c index 11540a189d93..0f6376ffa2d9 100644 --- a/arch/x86/kernel/dumpstack_32.c +++ b/arch/x86/kernel/dumpstack_32.c @@ -16,8 +16,6 @@ #include <asm/stacktrace.h> -#include "dumpstack.h" - void dump_trace(struct task_struct *task, struct pt_regs *regs, unsigned long *stack, unsigned long bp, diff --git a/arch/x86/kernel/dumpstack_64.c b/arch/x86/kernel/dumpstack_64.c index 272c9f1f05f3..57a21f11c791 100644 --- a/arch/x86/kernel/dumpstack_64.c +++ b/arch/x86/kernel/dumpstack_64.c @@ -16,7 +16,6 @@ #include <asm/stacktrace.h> -#include "dumpstack.h" #define N_EXCEPTION_STACKS_END \ (N_EXCEPTION_STACKS + DEBUG_STKSZ/EXCEPTION_STKSZ - 2) diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c index 7bca3c6a02fb..0d6fc71bedb1 100644 --- a/arch/x86/kernel/e820.c +++ b/arch/x86/kernel/e820.c @@ -729,7 +729,7 @@ static int __init e820_mark_nvs_memory(void) struct e820entry *ei = &e820.map[i]; if (ei->type == E820_NVS) - hibernate_nvs_register(ei->addr, ei->size); + suspend_nvs_register(ei->addr, ei->size); } return 0; diff --git a/arch/x86/kernel/early-quirks.c b/arch/x86/kernel/early-quirks.c index ebdb85cf2686..e5cc7e82e60d 100644 --- a/arch/x86/kernel/early-quirks.c +++ b/arch/x86/kernel/early-quirks.c @@ -18,6 +18,7 @@ #include <asm/apic.h> #include <asm/iommu.h> #include <asm/gart.h> +#include <asm/hpet.h> static void __init fix_hypertransport_config(int num, int slot, int func) { @@ -191,6 +192,21 @@ static void __init ati_bugs_contd(int num, int slot, int func) } #endif +/* + * Force the read back of the CMP register in hpet_next_event() + * to work around the problem that the CMP register write seems to be + * delayed. See hpet_next_event() for details. + * + * We do this on all SMBUS incarnations for now until we have more + * information about the affected chipsets. + */ +static void __init ati_hpet_bugs(int num, int slot, int func) +{ +#ifdef CONFIG_HPET_TIMER + hpet_readback_cmp = 1; +#endif +} + #define QFLAG_APPLY_ONCE 0x1 #define QFLAG_APPLIED 0x2 #define QFLAG_DONE (QFLAG_APPLY_ONCE|QFLAG_APPLIED) @@ -220,6 +236,8 @@ static struct chipset early_qrk[] __initdata = { PCI_CLASS_SERIAL_SMBUS, PCI_ANY_ID, 0, ati_bugs }, { PCI_VENDOR_ID_ATI, PCI_DEVICE_ID_ATI_SBX00_SMBUS, PCI_CLASS_SERIAL_SMBUS, PCI_ANY_ID, 0, ati_bugs_contd }, + { PCI_VENDOR_ID_ATI, PCI_ANY_ID, + PCI_CLASS_SERIAL_SMBUS, PCI_ANY_ID, 0, ati_hpet_bugs }, {} }; diff --git a/arch/x86/kernel/early_printk.c b/arch/x86/kernel/early_printk.c index b9c830c12b4a..fa99bae75ace 100644 --- a/arch/x86/kernel/early_printk.c +++ b/arch/x86/kernel/early_printk.c @@ -41,6 +41,14 @@ static void early_vga_write(struct console *con, const char *str, unsigned n) writew(0x720, VGABASE + 2*(max_xpos*j + i)); current_ypos = max_ypos-1; } +#ifdef CONFIG_KGDB_KDB + if (c == '\b') { + if (current_xpos > 0) + current_xpos--; + } else if (c == '\r') { + current_xpos = 0; + } else +#endif if (c == '\n') { current_xpos = 0; current_ypos++; diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S index 44a8e0dc6737..227d00920d2f 100644 --- a/arch/x86/kernel/entry_32.S +++ b/arch/x86/kernel/entry_32.S @@ -53,6 +53,7 @@ #include <asm/processor-flags.h> #include <asm/ftrace.h> #include <asm/irq_vectors.h> +#include <asm/cpufeature.h> /* Avoid __ASSEMBLER__'ifying <linux/audit.h> just for this. */ #include <linux/elf-em.h> @@ -610,14 +611,14 @@ ldt_ss: * compensating for the offset by changing to the ESPFIX segment with * a base address that matches for the difference. */ +#define GDT_ESPFIX_SS PER_CPU_VAR(gdt_page) + (GDT_ENTRY_ESPFIX_SS * 8) mov %esp, %edx /* load kernel esp */ mov PT_OLDESP(%esp), %eax /* load userspace esp */ mov %dx, %ax /* eax: new kernel esp */ sub %eax, %edx /* offset (low word is 0) */ - PER_CPU(gdt_page, %ebx) shr $16, %edx - mov %dl, GDT_ENTRY_ESPFIX_SS * 8 + 4(%ebx) /* bits 16..23 */ - mov %dh, GDT_ENTRY_ESPFIX_SS * 8 + 7(%ebx) /* bits 24..31 */ + mov %dl, GDT_ESPFIX_SS + 4 /* bits 16..23 */ + mov %dh, GDT_ESPFIX_SS + 7 /* bits 24..31 */ pushl $__ESPFIX_SS CFI_ADJUST_CFA_OFFSET 4 push %eax /* new kernel esp */ @@ -790,9 +791,8 @@ ptregs_clone: * normal stack and adjusts ESP with the matching offset. */ /* fixup the stack */ - PER_CPU(gdt_page, %ebx) - mov GDT_ENTRY_ESPFIX_SS * 8 + 4(%ebx), %al /* bits 16..23 */ - mov GDT_ENTRY_ESPFIX_SS * 8 + 7(%ebx), %ah /* bits 24..31 */ + mov GDT_ESPFIX_SS + 4, %al /* bits 16..23 */ + mov GDT_ESPFIX_SS + 7, %ah /* bits 24..31 */ shl $16, %eax addl %esp, %eax /* the adjusted stack pointer */ pushl $__KERNEL_DS @@ -905,7 +905,25 @@ ENTRY(simd_coprocessor_error) RING0_INT_FRAME pushl $0 CFI_ADJUST_CFA_OFFSET 4 +#ifdef CONFIG_X86_INVD_BUG + /* AMD 486 bug: invd from userspace calls exception 19 instead of #GP */ +661: pushl $do_general_protection +662: +.section .altinstructions,"a" + .balign 4 + .long 661b + .long 663f + .word X86_FEATURE_XMM + .byte 662b-661b + .byte 664f-663f +.previous +.section .altinstr_replacement,"ax" +663: pushl $do_simd_coprocessor_error +664: +.previous +#else pushl $do_simd_coprocessor_error +#endif CFI_ADJUST_CFA_OFFSET 4 jmp error_code CFI_ENDPROC @@ -1147,6 +1165,9 @@ ENTRY(xen_failsafe_callback) .previous ENDPROC(xen_failsafe_callback) +BUILD_INTERRUPT3(xen_hvm_callback_vector, XEN_HVM_EVTCHN_CALLBACK, + xen_evtchn_do_upcall) + #endif /* CONFIG_XEN */ #ifdef CONFIG_FUNCTION_TRACER diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index 0697ff139837..c5ea5cdbe7b3 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S @@ -571,8 +571,8 @@ auditsys: * masked off. */ sysret_audit: - movq %rax,%rsi /* second arg, syscall return value */ - cmpq $0,%rax /* is it < 0? */ + movq RAX-ARGOFFSET(%rsp),%rsi /* second arg, syscall return value */ + cmpq $0,%rsi /* is it < 0? */ setl %al /* 1 if so, 0 if not */ movzbl %al,%edi /* zero-extend that into %edi */ inc %edi /* first arg, 0->1(AUDITSC_SUCCESS), 1->2(AUDITSC_FAILURE) */ @@ -1065,6 +1065,7 @@ ENTRY(\sym) END(\sym) .endm +#define INIT_TSS_IST(x) PER_CPU_VAR(init_tss) + (TSS_ist + ((x) - 1) * 8) .macro paranoidzeroentry_ist sym do_sym ist ENTRY(\sym) INTR_FRAME @@ -1076,10 +1077,9 @@ ENTRY(\sym) TRACE_IRQS_OFF movq %rsp,%rdi /* pt_regs pointer */ xorl %esi,%esi /* no error code */ - PER_CPU(init_tss, %r12) - subq $EXCEPTION_STKSZ, TSS_ist + (\ist - 1) * 8(%r12) + subq $EXCEPTION_STKSZ, INIT_TSS_IST(\ist) call \do_sym - addq $EXCEPTION_STKSZ, TSS_ist + (\ist - 1) * 8(%r12) + addq $EXCEPTION_STKSZ, INIT_TSS_IST(\ist) jmp paranoid_exit /* %ebx: no swapgs flag */ CFI_ENDPROC END(\sym) @@ -1329,6 +1329,9 @@ ENTRY(xen_failsafe_callback) CFI_ENDPROC END(xen_failsafe_callback) +apicinterrupt XEN_HVM_EVTCHN_CALLBACK \ + xen_hvm_callback_vector xen_evtchn_do_upcall + #endif /* CONFIG_XEN */ /* diff --git a/arch/x86/kernel/head32.c b/arch/x86/kernel/head32.c index b2e246037392..784360c0625c 100644 --- a/arch/x86/kernel/head32.c +++ b/arch/x86/kernel/head32.c @@ -20,7 +20,7 @@ static void __init i386_default_early_setup(void) { - /* Initilize 32bit specific setup functions */ + /* Initialize 32bit specific setup functions */ x86_init.resources.probe_roms = probe_roms; x86_init.resources.reserve_resources = i386_reserve_resources; x86_init.mpparse.setup_ioapic_ids = setup_ioapic_ids_from_mpc; diff --git a/arch/x86/kernel/head_32.S b/arch/x86/kernel/head_32.S index 37c3d4b17d85..ff4c453e13f3 100644 --- a/arch/x86/kernel/head_32.S +++ b/arch/x86/kernel/head_32.S @@ -131,6 +131,12 @@ ENTRY(startup_32) movsl 1: +#ifdef CONFIG_OLPC_OPENFIRMWARE + /* save OFW's pgdir table for later use when calling into OFW */ + movl %cr3, %eax + movl %eax, pa(olpc_ofw_pgd) +#endif + #ifdef CONFIG_PARAVIRT /* This is can only trip for a broken bootloader... */ cmpw $0x207, pa(boot_params + BP_version) diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S index 3d1e6f16b7a6..239046bd447f 100644 --- a/arch/x86/kernel/head_64.S +++ b/arch/x86/kernel/head_64.S @@ -234,9 +234,8 @@ ENTRY(secondary_startup_64) * init data section till per cpu areas are set up. */ movl $MSR_GS_BASE,%ecx - movq initial_gs(%rip),%rax - movq %rax,%rdx - shrq $32,%rdx + movl initial_gs(%rip),%eax + movl initial_gs+4(%rip),%edx wrmsr /* esi is pointer to real mode structure with interesting info. diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c index 23b4ecdffa9b..33dbcc4ec5ff 100644 --- a/arch/x86/kernel/hpet.c +++ b/arch/x86/kernel/hpet.c @@ -16,7 +16,6 @@ #include <asm/hpet.h> #define HPET_MASK CLOCKSOURCE_MASK(32) -#define HPET_SHIFT 22 /* FSEC = 10^-15 NSEC = 10^-9 */ @@ -36,6 +35,7 @@ unsigned long hpet_address; u8 hpet_blockid; /* OS timer block num */ u8 hpet_msi_disable; +u8 hpet_readback_cmp; #ifdef CONFIG_PCI_MSI static unsigned long hpet_num_timers; @@ -395,19 +395,23 @@ static int hpet_next_event(unsigned long delta, * at that point and we would wait for the next hpet interrupt * forever. We found out that reading the CMP register back * forces the transfer so we can rely on the comparison with - * the counter register below. If the read back from the - * compare register does not match the value we programmed - * then we might have a real hardware problem. We can not do - * much about it here, but at least alert the user/admin with - * a prominent warning. - * An erratum on some chipsets (ICH9,..), results in comparator read - * immediately following a write returning old value. Workaround - * for this is to read this value second time, when first - * read returns old value. + * the counter register below. + * + * That works fine on those ATI chipsets, but on newer Intel + * chipsets (ICH9...) this triggers due to an erratum: Reading + * the comparator immediately following a write is returning + * the old value. + * + * We restrict the read back to the affected ATI chipsets (set + * by quirks) and also run it with hpet=verbose for debugging + * purposes. */ - if (unlikely((u32)hpet_readl(HPET_Tn_CMP(timer)) != cnt)) { - WARN_ONCE(hpet_readl(HPET_Tn_CMP(timer)) != cnt, - KERN_WARNING "hpet: compare register read back failed.\n"); + if (hpet_readback_cmp || hpet_verbose) { + u32 cmp = hpet_readl(HPET_Tn_CMP(timer)); + + if (cmp != cnt) + printk_once(KERN_WARNING + "hpet: compare register read back failed.\n"); } return (s32)(hpet_readl(HPET_COUNTER) - cnt) >= 0 ? -ETIME : 0; @@ -782,7 +786,6 @@ static struct clocksource clocksource_hpet = { .rating = 250, .read = read_hpet, .mask = HPET_MASK, - .shift = HPET_SHIFT, .flags = CLOCK_SOURCE_IS_CONTINUOUS, .resume = hpet_resume_counter, #ifdef CONFIG_X86_64 @@ -793,6 +796,7 @@ static struct clocksource clocksource_hpet = { static int hpet_clocksource_register(void) { u64 start, now; + u64 hpet_freq; cycle_t t1; /* Start the counter */ @@ -827,9 +831,15 @@ static int hpet_clocksource_register(void) * mult = (hpet_period * 2^shift)/10^6 * mult = (hpet_period << shift)/FSEC_PER_NSEC */ - clocksource_hpet.mult = div_sc(hpet_period, FSEC_PER_NSEC, HPET_SHIFT); - clocksource_register(&clocksource_hpet); + /* Need to convert hpet_period (fsec/cyc) to cyc/sec: + * + * cyc/sec = FSEC_PER_SEC/hpet_period(fsec/cyc) + * cyc/sec = (FSEC_PER_NSEC * NSEC_PER_SEC)/hpet_period + */ + hpet_freq = FSEC_PER_NSEC * NSEC_PER_SEC; + do_div(hpet_freq, hpet_period); + clocksource_register_hz(&clocksource_hpet, (u32)hpet_freq); return 0; } @@ -959,7 +969,7 @@ fs_initcall(hpet_late_init); void hpet_disable(void) { - if (is_hpet_capable()) { + if (is_hpet_capable() && hpet_virt_address) { unsigned int cfg = hpet_readl(HPET_CFG); if (hpet_legacy_int_enabled) { diff --git a/arch/x86/kernel/hw_breakpoint.c b/arch/x86/kernel/hw_breakpoint.c index d6cc065f519f..a474ec37c32f 100644 --- a/arch/x86/kernel/hw_breakpoint.c +++ b/arch/x86/kernel/hw_breakpoint.c @@ -189,25 +189,16 @@ static int get_hbp_len(u8 hbp_len) } /* - * Check for virtual address in user space. - */ -int arch_check_va_in_userspace(unsigned long va, u8 hbp_len) -{ - unsigned int len; - - len = get_hbp_len(hbp_len); - - return (va <= TASK_SIZE - len); -} - -/* * Check for virtual address in kernel space. */ -static int arch_check_va_in_kernelspace(unsigned long va, u8 hbp_len) +int arch_check_bp_in_kernelspace(struct perf_event *bp) { unsigned int len; + unsigned long va; + struct arch_hw_breakpoint *info = counter_arch_bp(bp); - len = get_hbp_len(hbp_len); + va = info->address; + len = get_hbp_len(info->len); return (va >= TASK_SIZE) && ((va + len - 1) >= TASK_SIZE); } @@ -217,6 +208,9 @@ int arch_bp_generic_fields(int x86_len, int x86_type, { /* Len */ switch (x86_len) { + case X86_BREAKPOINT_LEN_X: + *gen_len = sizeof(long); + break; case X86_BREAKPOINT_LEN_1: *gen_len = HW_BREAKPOINT_LEN_1; break; @@ -260,6 +254,29 @@ static int arch_build_bp_info(struct perf_event *bp) info->address = bp->attr.bp_addr; + /* Type */ + switch (bp->attr.bp_type) { + case HW_BREAKPOINT_W: + info->type = X86_BREAKPOINT_WRITE; + break; + case HW_BREAKPOINT_W | HW_BREAKPOINT_R: + info->type = X86_BREAKPOINT_RW; + break; + case HW_BREAKPOINT_X: + info->type = X86_BREAKPOINT_EXECUTE; + /* + * x86 inst breakpoints need to have a specific undefined len. + * But we still need to check userspace is not trying to setup + * an unsupported length, to get a range breakpoint for example. + */ + if (bp->attr.bp_len == sizeof(long)) { + info->len = X86_BREAKPOINT_LEN_X; + return 0; + } + default: + return -EINVAL; + } + /* Len */ switch (bp->attr.bp_len) { case HW_BREAKPOINT_LEN_1: @@ -280,28 +297,12 @@ static int arch_build_bp_info(struct perf_event *bp) return -EINVAL; } - /* Type */ - switch (bp->attr.bp_type) { - case HW_BREAKPOINT_W: - info->type = X86_BREAKPOINT_WRITE; - break; - case HW_BREAKPOINT_W | HW_BREAKPOINT_R: - info->type = X86_BREAKPOINT_RW; - break; - case HW_BREAKPOINT_X: - info->type = X86_BREAKPOINT_EXECUTE; - break; - default: - return -EINVAL; - } - return 0; } /* * Validate the arch-specific HW Breakpoint register settings */ -int arch_validate_hwbkpt_settings(struct perf_event *bp, - struct task_struct *tsk) +int arch_validate_hwbkpt_settings(struct perf_event *bp) { struct arch_hw_breakpoint *info = counter_arch_bp(bp); unsigned int align; @@ -314,17 +315,10 @@ int arch_validate_hwbkpt_settings(struct perf_event *bp, ret = -EINVAL; - if (info->type == X86_BREAKPOINT_EXECUTE) - /* - * Ptrace-refactoring code - * For now, we'll allow instruction breakpoint only for user-space - * addresses - */ - if ((!arch_check_va_in_userspace(info->address, info->len)) && - info->len != X86_BREAKPOINT_EXECUTE) - return ret; - switch (info->len) { + case X86_BREAKPOINT_LEN_X: + align = sizeof(long) -1; + break; case X86_BREAKPOINT_LEN_1: align = 0; break; @@ -350,15 +344,6 @@ int arch_validate_hwbkpt_settings(struct perf_event *bp, if (info->address & align) return -EINVAL; - /* Check that the virtual address is in the proper range */ - if (tsk) { - if (!arch_check_va_in_userspace(info->address, info->len)) - return -EFAULT; - } else { - if (!arch_check_va_in_kernelspace(info->address, info->len)) - return -EFAULT; - } - return 0; } @@ -495,6 +480,13 @@ static int __kprobes hw_breakpoint_handler(struct die_args *args) perf_bp_event(bp, args->regs); + /* + * Set up resume flag to avoid breakpoint recursion when + * returning back to origin. + */ + if (bp->hw.info.type == X86_BREAKPOINT_EXECUTE) + args->regs->flags |= X86_EFLAGS_RF; + rcu_read_unlock(); } /* diff --git a/arch/x86/kernel/i387.c b/arch/x86/kernel/i387.c index 54c31c285488..1f11f5ce668f 100644 --- a/arch/x86/kernel/i387.c +++ b/arch/x86/kernel/i387.c @@ -59,18 +59,18 @@ void __cpuinit mxcsr_feature_mask_init(void) stts(); } -void __cpuinit init_thread_xstate(void) +static void __cpuinit init_thread_xstate(void) { + /* + * Note that xstate_size might be overwriten later during + * xsave_init(). + */ + if (!HAVE_HWFP) { xstate_size = sizeof(struct i387_soft_struct); return; } - if (cpu_has_xsave) { - xsave_cntxt_init(); - return; - } - if (cpu_has_fxsr) xstate_size = sizeof(struct i387_fxsave_struct); #ifdef CONFIG_X86_32 @@ -84,6 +84,7 @@ void __cpuinit init_thread_xstate(void) * Called at bootup to set up the initial FPU state that is later cloned * into all processes. */ + void __cpuinit fpu_init(void) { unsigned long oldcr0 = read_cr0(); @@ -93,74 +94,77 @@ void __cpuinit fpu_init(void) write_cr0(oldcr0 & ~(X86_CR0_TS|X86_CR0_EM)); /* clear TS and EM */ - /* - * Boot processor to setup the FP and extended state context info. - */ if (!smp_processor_id()) init_thread_xstate(); - xsave_init(); mxcsr_feature_mask_init(); /* clean state in init */ - if (cpu_has_xsave) - current_thread_info()->status = TS_XSAVE; - else - current_thread_info()->status = 0; + current_thread_info()->status = 0; clear_used_math(); } -#endif /* CONFIG_X86_64 */ -/* - * The _current_ task is using the FPU for the first time - * so initialize it and set the mxcsr to its default - * value at reset if we support XMM instructions and then - * remeber the current task has used the FPU. - */ -int init_fpu(struct task_struct *tsk) +#else /* CONFIG_X86_64 */ + +void __cpuinit fpu_init(void) { - if (tsk_used_math(tsk)) { - if (HAVE_HWFP && tsk == current) - unlazy_fpu(tsk); - return 0; - } + if (!smp_processor_id()) + init_thread_xstate(); +} - /* - * Memory allocation at the first usage of the FPU and other state. - */ - if (!tsk->thread.xstate) { - tsk->thread.xstate = kmem_cache_alloc(task_xstate_cachep, - GFP_KERNEL); - if (!tsk->thread.xstate) - return -ENOMEM; - } +#endif /* CONFIG_X86_32 */ +void fpu_finit(struct fpu *fpu) +{ #ifdef CONFIG_X86_32 if (!HAVE_HWFP) { - memset(tsk->thread.xstate, 0, xstate_size); - finit_task(tsk); - set_stopped_child_used_math(tsk); - return 0; + finit_soft_fpu(&fpu->state->soft); + return; } #endif if (cpu_has_fxsr) { - struct i387_fxsave_struct *fx = &tsk->thread.xstate->fxsave; + struct i387_fxsave_struct *fx = &fpu->state->fxsave; memset(fx, 0, xstate_size); fx->cwd = 0x37f; if (cpu_has_xmm) fx->mxcsr = MXCSR_DEFAULT; } else { - struct i387_fsave_struct *fp = &tsk->thread.xstate->fsave; + struct i387_fsave_struct *fp = &fpu->state->fsave; memset(fp, 0, xstate_size); fp->cwd = 0xffff037fu; fp->swd = 0xffff0000u; fp->twd = 0xffffffffu; fp->fos = 0xffff0000u; } +} +EXPORT_SYMBOL_GPL(fpu_finit); + +/* + * The _current_ task is using the FPU for the first time + * so initialize it and set the mxcsr to its default + * value at reset if we support XMM instructions and then + * remeber the current task has used the FPU. + */ +int init_fpu(struct task_struct *tsk) +{ + int ret; + + if (tsk_used_math(tsk)) { + if (HAVE_HWFP && tsk == current) + unlazy_fpu(tsk); + return 0; + } + /* - * Only the device not available exception or ptrace can call init_fpu. + * Memory allocation at the first usage of the FPU and other state. */ + ret = fpu_alloc(&tsk->thread.fpu); + if (ret) + return ret; + + fpu_finit(&tsk->thread.fpu); + set_stopped_child_used_math(tsk); return 0; } @@ -193,8 +197,10 @@ int xfpregs_get(struct task_struct *target, const struct user_regset *regset, if (ret) return ret; + sanitize_i387_state(target); + return user_regset_copyout(&pos, &count, &kbuf, &ubuf, - &target->thread.xstate->fxsave, 0, -1); + &target->thread.fpu.state->fxsave, 0, -1); } int xfpregs_set(struct task_struct *target, const struct user_regset *regset, @@ -210,20 +216,22 @@ int xfpregs_set(struct task_struct *target, const struct user_regset *regset, if (ret) return ret; + sanitize_i387_state(target); + ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, - &target->thread.xstate->fxsave, 0, -1); + &target->thread.fpu.state->fxsave, 0, -1); /* * mxcsr reserved bits must be masked to zero for security reasons. */ - target->thread.xstate->fxsave.mxcsr &= mxcsr_feature_mask; + target->thread.fpu.state->fxsave.mxcsr &= mxcsr_feature_mask; /* * update the header bits in the xsave header, indicating the * presence of FP and SSE state. */ if (cpu_has_xsave) - target->thread.xstate->xsave.xsave_hdr.xstate_bv |= XSTATE_FPSSE; + target->thread.fpu.state->xsave.xsave_hdr.xstate_bv |= XSTATE_FPSSE; return ret; } @@ -246,14 +254,14 @@ int xstateregs_get(struct task_struct *target, const struct user_regset *regset, * memory layout in the thread struct, so that we can copy the entire * xstateregs to the user using one user_regset_copyout(). */ - memcpy(&target->thread.xstate->fxsave.sw_reserved, + memcpy(&target->thread.fpu.state->fxsave.sw_reserved, xstate_fx_sw_bytes, sizeof(xstate_fx_sw_bytes)); /* * Copy the xstate memory layout. */ ret = user_regset_copyout(&pos, &count, &kbuf, &ubuf, - &target->thread.xstate->xsave, 0, -1); + &target->thread.fpu.state->xsave, 0, -1); return ret; } @@ -272,14 +280,14 @@ int xstateregs_set(struct task_struct *target, const struct user_regset *regset, return ret; ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, - &target->thread.xstate->xsave, 0, -1); + &target->thread.fpu.state->xsave, 0, -1); /* * mxcsr reserved bits must be masked to zero for security reasons. */ - target->thread.xstate->fxsave.mxcsr &= mxcsr_feature_mask; + target->thread.fpu.state->fxsave.mxcsr &= mxcsr_feature_mask; - xsave_hdr = &target->thread.xstate->xsave.xsave_hdr; + xsave_hdr = &target->thread.fpu.state->xsave.xsave_hdr; xsave_hdr->xstate_bv &= pcntxt_mask; /* @@ -365,7 +373,7 @@ static inline u32 twd_fxsr_to_i387(struct i387_fxsave_struct *fxsave) static void convert_from_fxsr(struct user_i387_ia32_struct *env, struct task_struct *tsk) { - struct i387_fxsave_struct *fxsave = &tsk->thread.xstate->fxsave; + struct i387_fxsave_struct *fxsave = &tsk->thread.fpu.state->fxsave; struct _fpreg *to = (struct _fpreg *) &env->st_space[0]; struct _fpxreg *from = (struct _fpxreg *) &fxsave->st_space[0]; int i; @@ -405,7 +413,7 @@ static void convert_to_fxsr(struct task_struct *tsk, const struct user_i387_ia32_struct *env) { - struct i387_fxsave_struct *fxsave = &tsk->thread.xstate->fxsave; + struct i387_fxsave_struct *fxsave = &tsk->thread.fpu.state->fxsave; struct _fpreg *from = (struct _fpreg *) &env->st_space[0]; struct _fpxreg *to = (struct _fpxreg *) &fxsave->st_space[0]; int i; @@ -445,10 +453,12 @@ int fpregs_get(struct task_struct *target, const struct user_regset *regset, if (!cpu_has_fxsr) { return user_regset_copyout(&pos, &count, &kbuf, &ubuf, - &target->thread.xstate->fsave, 0, + &target->thread.fpu.state->fsave, 0, -1); } + sanitize_i387_state(target); + if (kbuf && pos == 0 && count == sizeof(env)) { convert_from_fxsr(kbuf, target); return 0; @@ -470,12 +480,14 @@ int fpregs_set(struct task_struct *target, const struct user_regset *regset, if (ret) return ret; + sanitize_i387_state(target); + if (!HAVE_HWFP) return fpregs_soft_set(target, regset, pos, count, kbuf, ubuf); if (!cpu_has_fxsr) { return user_regset_copyin(&pos, &count, &kbuf, &ubuf, - &target->thread.xstate->fsave, 0, -1); + &target->thread.fpu.state->fsave, 0, -1); } if (pos > 0 || count < sizeof(env)) @@ -490,7 +502,7 @@ int fpregs_set(struct task_struct *target, const struct user_regset *regset, * presence of FP. */ if (cpu_has_xsave) - target->thread.xstate->xsave.xsave_hdr.xstate_bv |= XSTATE_FP; + target->thread.fpu.state->xsave.xsave_hdr.xstate_bv |= XSTATE_FP; return ret; } @@ -501,7 +513,7 @@ int fpregs_set(struct task_struct *target, const struct user_regset *regset, static inline int save_i387_fsave(struct _fpstate_ia32 __user *buf) { struct task_struct *tsk = current; - struct i387_fsave_struct *fp = &tsk->thread.xstate->fsave; + struct i387_fsave_struct *fp = &tsk->thread.fpu.state->fsave; fp->status = fp->swd; if (__copy_to_user(buf, fp, sizeof(struct i387_fsave_struct))) @@ -512,7 +524,7 @@ static inline int save_i387_fsave(struct _fpstate_ia32 __user *buf) static int save_i387_fxsave(struct _fpstate_ia32 __user *buf) { struct task_struct *tsk = current; - struct i387_fxsave_struct *fx = &tsk->thread.xstate->fxsave; + struct i387_fxsave_struct *fx = &tsk->thread.fpu.state->fxsave; struct user_i387_ia32_struct env; int err = 0; @@ -536,6 +548,9 @@ static int save_i387_xsave(void __user *buf) struct _fpstate_ia32 __user *fx = buf; int err = 0; + + sanitize_i387_state(tsk); + /* * For legacy compatible, we always set FP/SSE bits in the bit * vector while saving the state to the user context. @@ -547,7 +562,7 @@ static int save_i387_xsave(void __user *buf) * header as well as change any contents in the memory layout. * xrestore as part of sigreturn will capture all the changes. */ - tsk->thread.xstate->xsave.xsave_hdr.xstate_bv |= XSTATE_FPSSE; + tsk->thread.fpu.state->xsave.xsave_hdr.xstate_bv |= XSTATE_FPSSE; if (save_i387_fxsave(fx) < 0) return -1; @@ -599,7 +614,7 @@ static inline int restore_i387_fsave(struct _fpstate_ia32 __user *buf) { struct task_struct *tsk = current; - return __copy_from_user(&tsk->thread.xstate->fsave, buf, + return __copy_from_user(&tsk->thread.fpu.state->fsave, buf, sizeof(struct i387_fsave_struct)); } @@ -610,10 +625,10 @@ static int restore_i387_fxsave(struct _fpstate_ia32 __user *buf, struct user_i387_ia32_struct env; int err; - err = __copy_from_user(&tsk->thread.xstate->fxsave, &buf->_fxsr_env[0], + err = __copy_from_user(&tsk->thread.fpu.state->fxsave, &buf->_fxsr_env[0], size); /* mxcsr reserved bits must be masked to zero for security reasons */ - tsk->thread.xstate->fxsave.mxcsr &= mxcsr_feature_mask; + tsk->thread.fpu.state->fxsave.mxcsr &= mxcsr_feature_mask; if (err || __copy_from_user(&env, buf, sizeof(env))) return 1; convert_to_fxsr(tsk, &env); @@ -629,7 +644,7 @@ static int restore_i387_xsave(void __user *buf) struct i387_fxsave_struct __user *fx = (struct i387_fxsave_struct __user *) &fx_user->_fxsr_env[0]; struct xsave_hdr_struct *xsave_hdr = - ¤t->thread.xstate->xsave.xsave_hdr; + ¤t->thread.fpu.state->xsave.xsave_hdr; u64 mask; int err; diff --git a/arch/x86/kernel/i8253.c b/arch/x86/kernel/i8253.c index 23c167925a5c..2dfd31597443 100644 --- a/arch/x86/kernel/i8253.c +++ b/arch/x86/kernel/i8253.c @@ -16,7 +16,7 @@ #include <asm/hpet.h> #include <asm/smp.h> -DEFINE_SPINLOCK(i8253_lock); +DEFINE_RAW_SPINLOCK(i8253_lock); EXPORT_SYMBOL(i8253_lock); /* @@ -33,7 +33,7 @@ struct clock_event_device *global_clock_event; static void init_pit_timer(enum clock_event_mode mode, struct clock_event_device *evt) { - spin_lock(&i8253_lock); + raw_spin_lock(&i8253_lock); switch (mode) { case CLOCK_EVT_MODE_PERIODIC: @@ -62,7 +62,7 @@ static void init_pit_timer(enum clock_event_mode mode, /* Nothing to do here */ break; } - spin_unlock(&i8253_lock); + raw_spin_unlock(&i8253_lock); } /* @@ -72,10 +72,10 @@ static void init_pit_timer(enum clock_event_mode mode, */ static int pit_next_event(unsigned long delta, struct clock_event_device *evt) { - spin_lock(&i8253_lock); + raw_spin_lock(&i8253_lock); outb_pit(delta & 0xff , PIT_CH0); /* LSB */ outb_pit(delta >> 8 , PIT_CH0); /* MSB */ - spin_unlock(&i8253_lock); + raw_spin_unlock(&i8253_lock); return 0; } @@ -130,7 +130,7 @@ static cycle_t pit_read(struct clocksource *cs) int count; u32 jifs; - spin_lock_irqsave(&i8253_lock, flags); + raw_spin_lock_irqsave(&i8253_lock, flags); /* * Although our caller may have the read side of xtime_lock, * this is now a seqlock, and we are cheating in this routine @@ -176,7 +176,7 @@ static cycle_t pit_read(struct clocksource *cs) old_count = count; old_jifs = jifs; - spin_unlock_irqrestore(&i8253_lock, flags); + raw_spin_unlock_irqrestore(&i8253_lock, flags); count = (LATCH - 1) - count; diff --git a/arch/x86/kernel/i8259.c b/arch/x86/kernel/i8259.c index 7c9f02c130f3..cafa7c80ac95 100644 --- a/arch/x86/kernel/i8259.c +++ b/arch/x86/kernel/i8259.c @@ -276,16 +276,6 @@ static struct sys_device device_i8259A = { .cls = &i8259_sysdev_class, }; -static int __init i8259A_init_sysfs(void) -{ - int error = sysdev_class_register(&i8259_sysdev_class); - if (!error) - error = sysdev_register(&device_i8259A); - return error; -} - -device_initcall(i8259A_init_sysfs); - static void mask_8259A(void) { unsigned long flags; @@ -407,3 +397,18 @@ struct legacy_pic default_legacy_pic = { }; struct legacy_pic *legacy_pic = &default_legacy_pic; + +static int __init i8259A_init_sysfs(void) +{ + int error; + + if (legacy_pic != &default_legacy_pic) + return 0; + + error = sysdev_class_register(&i8259_sysdev_class); + if (!error) + error = sysdev_register(&device_i8259A); + return error; +} + +device_initcall(i8259A_init_sysfs); diff --git a/arch/x86/kernel/init_task.c b/arch/x86/kernel/init_task.c index 3a54dcb9cd0e..43e9ccf44947 100644 --- a/arch/x86/kernel/init_task.c +++ b/arch/x86/kernel/init_task.c @@ -34,7 +34,7 @@ EXPORT_SYMBOL(init_task); /* * per-CPU TSS segments. Threads are completely 'soft' on Linux, * no more per-task TSS's. The TSS size is kept cacheline-aligned - * so they are allowed to end up in the .data.cacheline_aligned + * so they are allowed to end up in the .data..cacheline_aligned * section. Since TSS's are completely CPU-local, we want them * on exact cacheline boundaries, to eliminate cacheline ping-pong. */ diff --git a/arch/x86/kernel/irqinit.c b/arch/x86/kernel/irqinit.c index 0ed2d300cd46..990ae7cfc578 100644 --- a/arch/x86/kernel/irqinit.c +++ b/arch/x86/kernel/irqinit.c @@ -60,7 +60,7 @@ static irqreturn_t math_error_irq(int cpl, void *dev_id) outb(0, 0xF0); if (ignore_fpu_irq || !boot_cpu_data.hard_math) return IRQ_NONE; - math_error((void __user *)get_irq_regs()->ip); + math_error(get_irq_regs(), 0, 16); return IRQ_HANDLED; } diff --git a/arch/x86/kernel/kgdb.c b/arch/x86/kernel/kgdb.c index b2258ca91003..ef10940e1af0 100644 --- a/arch/x86/kernel/kgdb.c +++ b/arch/x86/kernel/kgdb.c @@ -47,69 +47,96 @@ #include <asm/debugreg.h> #include <asm/apicdef.h> #include <asm/system.h> - #include <asm/apic.h> -/* - * Put the error code here just in case the user cares: - */ -static int gdb_x86errcode; - -/* - * Likewise, the vector number here (since GDB only gets the signal - * number through the usual means, and that's not very specific): - */ -static int gdb_x86vector = -1; - -/** - * pt_regs_to_gdb_regs - Convert ptrace regs to GDB regs - * @gdb_regs: A pointer to hold the registers in the order GDB wants. - * @regs: The &struct pt_regs of the current process. - * - * Convert the pt_regs in @regs into the format for registers that - * GDB expects, stored in @gdb_regs. - */ -void pt_regs_to_gdb_regs(unsigned long *gdb_regs, struct pt_regs *regs) +struct dbg_reg_def_t dbg_reg_def[DBG_MAX_REG_NUM] = { -#ifndef CONFIG_X86_32 - u32 *gdb_regs32 = (u32 *)gdb_regs; +#ifdef CONFIG_X86_32 + { "ax", 4, offsetof(struct pt_regs, ax) }, + { "cx", 4, offsetof(struct pt_regs, cx) }, + { "dx", 4, offsetof(struct pt_regs, dx) }, + { "bx", 4, offsetof(struct pt_regs, bx) }, + { "sp", 4, offsetof(struct pt_regs, sp) }, + { "bp", 4, offsetof(struct pt_regs, bp) }, + { "si", 4, offsetof(struct pt_regs, si) }, + { "di", 4, offsetof(struct pt_regs, di) }, + { "ip", 4, offsetof(struct pt_regs, ip) }, + { "flags", 4, offsetof(struct pt_regs, flags) }, + { "cs", 4, offsetof(struct pt_regs, cs) }, + { "ss", 4, offsetof(struct pt_regs, ss) }, + { "ds", 4, offsetof(struct pt_regs, ds) }, + { "es", 4, offsetof(struct pt_regs, es) }, + { "fs", 4, -1 }, + { "gs", 4, -1 }, +#else + { "ax", 8, offsetof(struct pt_regs, ax) }, + { "bx", 8, offsetof(struct pt_regs, bx) }, + { "cx", 8, offsetof(struct pt_regs, cx) }, + { "dx", 8, offsetof(struct pt_regs, dx) }, + { "si", 8, offsetof(struct pt_regs, dx) }, + { "di", 8, offsetof(struct pt_regs, di) }, + { "bp", 8, offsetof(struct pt_regs, bp) }, + { "sp", 8, offsetof(struct pt_regs, sp) }, + { "r8", 8, offsetof(struct pt_regs, r8) }, + { "r9", 8, offsetof(struct pt_regs, r9) }, + { "r10", 8, offsetof(struct pt_regs, r10) }, + { "r11", 8, offsetof(struct pt_regs, r11) }, + { "r12", 8, offsetof(struct pt_regs, r12) }, + { "r13", 8, offsetof(struct pt_regs, r13) }, + { "r14", 8, offsetof(struct pt_regs, r14) }, + { "r15", 8, offsetof(struct pt_regs, r15) }, + { "ip", 8, offsetof(struct pt_regs, ip) }, + { "flags", 4, offsetof(struct pt_regs, flags) }, + { "cs", 4, offsetof(struct pt_regs, cs) }, + { "ss", 4, offsetof(struct pt_regs, ss) }, #endif - gdb_regs[GDB_AX] = regs->ax; - gdb_regs[GDB_BX] = regs->bx; - gdb_regs[GDB_CX] = regs->cx; - gdb_regs[GDB_DX] = regs->dx; - gdb_regs[GDB_SI] = regs->si; - gdb_regs[GDB_DI] = regs->di; - gdb_regs[GDB_BP] = regs->bp; - gdb_regs[GDB_PC] = regs->ip; +}; + +int dbg_set_reg(int regno, void *mem, struct pt_regs *regs) +{ + if ( #ifdef CONFIG_X86_32 - gdb_regs[GDB_PS] = regs->flags; - gdb_regs[GDB_DS] = regs->ds; - gdb_regs[GDB_ES] = regs->es; - gdb_regs[GDB_CS] = regs->cs; - gdb_regs[GDB_FS] = 0xFFFF; - gdb_regs[GDB_GS] = 0xFFFF; - if (user_mode_vm(regs)) { - gdb_regs[GDB_SS] = regs->ss; - gdb_regs[GDB_SP] = regs->sp; - } else { - gdb_regs[GDB_SS] = __KERNEL_DS; - gdb_regs[GDB_SP] = kernel_stack_pointer(regs); + regno == GDB_SS || regno == GDB_FS || regno == GDB_GS || +#endif + regno == GDB_SP || regno == GDB_ORIG_AX) + return 0; + + if (dbg_reg_def[regno].offset != -1) + memcpy((void *)regs + dbg_reg_def[regno].offset, mem, + dbg_reg_def[regno].size); + return 0; +} + +char *dbg_get_reg(int regno, void *mem, struct pt_regs *regs) +{ + if (regno == GDB_ORIG_AX) { + memcpy(mem, ®s->orig_ax, sizeof(regs->orig_ax)); + return "orig_ax"; } -#else - gdb_regs[GDB_R8] = regs->r8; - gdb_regs[GDB_R9] = regs->r9; - gdb_regs[GDB_R10] = regs->r10; - gdb_regs[GDB_R11] = regs->r11; - gdb_regs[GDB_R12] = regs->r12; - gdb_regs[GDB_R13] = regs->r13; - gdb_regs[GDB_R14] = regs->r14; - gdb_regs[GDB_R15] = regs->r15; - gdb_regs32[GDB_PS] = regs->flags; - gdb_regs32[GDB_CS] = regs->cs; - gdb_regs32[GDB_SS] = regs->ss; - gdb_regs[GDB_SP] = kernel_stack_pointer(regs); + if (regno >= DBG_MAX_REG_NUM || regno < 0) + return NULL; + + if (dbg_reg_def[regno].offset != -1) + memcpy(mem, (void *)regs + dbg_reg_def[regno].offset, + dbg_reg_def[regno].size); + + switch (regno) { +#ifdef CONFIG_X86_32 + case GDB_SS: + if (!user_mode_vm(regs)) + *(unsigned long *)mem = __KERNEL_DS; + break; + case GDB_SP: + if (!user_mode_vm(regs)) + *(unsigned long *)mem = kernel_stack_pointer(regs); + break; + case GDB_GS: + case GDB_FS: + *(unsigned long *)mem = 0xFFFF; + break; #endif + } + return dbg_reg_def[regno].name; } /** @@ -162,66 +189,35 @@ void sleeping_thread_to_gdb_regs(unsigned long *gdb_regs, struct task_struct *p) gdb_regs[GDB_SP] = p->thread.sp; } -/** - * gdb_regs_to_pt_regs - Convert GDB regs to ptrace regs. - * @gdb_regs: A pointer to hold the registers we've received from GDB. - * @regs: A pointer to a &struct pt_regs to hold these values in. - * - * Convert the GDB regs in @gdb_regs into the pt_regs, and store them - * in @regs. - */ -void gdb_regs_to_pt_regs(unsigned long *gdb_regs, struct pt_regs *regs) -{ -#ifndef CONFIG_X86_32 - u32 *gdb_regs32 = (u32 *)gdb_regs; -#endif - regs->ax = gdb_regs[GDB_AX]; - regs->bx = gdb_regs[GDB_BX]; - regs->cx = gdb_regs[GDB_CX]; - regs->dx = gdb_regs[GDB_DX]; - regs->si = gdb_regs[GDB_SI]; - regs->di = gdb_regs[GDB_DI]; - regs->bp = gdb_regs[GDB_BP]; - regs->ip = gdb_regs[GDB_PC]; -#ifdef CONFIG_X86_32 - regs->flags = gdb_regs[GDB_PS]; - regs->ds = gdb_regs[GDB_DS]; - regs->es = gdb_regs[GDB_ES]; - regs->cs = gdb_regs[GDB_CS]; -#else - regs->r8 = gdb_regs[GDB_R8]; - regs->r9 = gdb_regs[GDB_R9]; - regs->r10 = gdb_regs[GDB_R10]; - regs->r11 = gdb_regs[GDB_R11]; - regs->r12 = gdb_regs[GDB_R12]; - regs->r13 = gdb_regs[GDB_R13]; - regs->r14 = gdb_regs[GDB_R14]; - regs->r15 = gdb_regs[GDB_R15]; - regs->flags = gdb_regs32[GDB_PS]; - regs->cs = gdb_regs32[GDB_CS]; - regs->ss = gdb_regs32[GDB_SS]; -#endif -} - static struct hw_breakpoint { unsigned enabled; unsigned long addr; int len; int type; struct perf_event **pev; -} breakinfo[4]; +} breakinfo[HBP_NUM]; + +static unsigned long early_dr7; static void kgdb_correct_hw_break(void) { int breakno; - for (breakno = 0; breakno < 4; breakno++) { + for (breakno = 0; breakno < HBP_NUM; breakno++) { struct perf_event *bp; struct arch_hw_breakpoint *info; int val; int cpu = raw_smp_processor_id(); if (!breakinfo[breakno].enabled) continue; + if (dbg_is_early) { + set_debugreg(breakinfo[breakno].addr, breakno); + early_dr7 |= encode_dr7(breakno, + breakinfo[breakno].len, + breakinfo[breakno].type); + set_debugreg(early_dr7, 7); + continue; + } bp = *per_cpu_ptr(breakinfo[breakno].pev, cpu); info = counter_arch_bp(bp); if (bp->attr.disabled != 1) @@ -236,7 +232,8 @@ static void kgdb_correct_hw_break(void) if (!val) bp->attr.disabled = 0; } - hw_breakpoint_restore(); + if (!dbg_is_early) + hw_breakpoint_restore(); } static int hw_break_reserve_slot(int breakno) @@ -245,6 +242,9 @@ static int hw_break_reserve_slot(int breakno) int cnt = 0; struct perf_event **pevent; + if (dbg_is_early) + return 0; + for_each_online_cpu(cpu) { cnt++; pevent = per_cpu_ptr(breakinfo[breakno].pev, cpu); @@ -270,6 +270,9 @@ static int hw_break_release_slot(int breakno) struct perf_event **pevent; int cpu; + if (dbg_is_early) + return 0; + for_each_online_cpu(cpu) { pevent = per_cpu_ptr(breakinfo[breakno].pev, cpu); if (dbg_release_bp_slot(*pevent)) @@ -287,10 +290,10 @@ kgdb_remove_hw_break(unsigned long addr, int len, enum kgdb_bptype bptype) { int i; - for (i = 0; i < 4; i++) + for (i = 0; i < HBP_NUM; i++) if (breakinfo[i].addr == addr && breakinfo[i].enabled) break; - if (i == 4) + if (i == HBP_NUM) return -1; if (hw_break_release_slot(i)) { @@ -308,13 +311,17 @@ static void kgdb_remove_all_hw_break(void) int cpu = raw_smp_processor_id(); struct perf_event *bp; - for (i = 0; i < 4; i++) { + for (i = 0; i < HBP_NUM; i++) { if (!breakinfo[i].enabled) continue; bp = *per_cpu_ptr(breakinfo[i].pev, cpu); if (bp->attr.disabled == 1) continue; - arch_uninstall_hw_breakpoint(bp); + if (dbg_is_early) + early_dr7 &= ~encode_dr7(i, breakinfo[i].len, + breakinfo[i].type); + else + arch_uninstall_hw_breakpoint(bp); bp->attr.disabled = 1; } } @@ -324,10 +331,10 @@ kgdb_set_hw_break(unsigned long addr, int len, enum kgdb_bptype bptype) { int i; - for (i = 0; i < 4; i++) + for (i = 0; i < HBP_NUM; i++) if (!breakinfo[i].enabled) break; - if (i == 4) + if (i == HBP_NUM) return -1; switch (bptype) { @@ -388,9 +395,14 @@ void kgdb_disable_hw_debug(struct pt_regs *regs) /* Disable hardware debugging while we are in kgdb: */ set_debugreg(0UL, 7); - for (i = 0; i < 4; i++) { + for (i = 0; i < HBP_NUM; i++) { if (!breakinfo[i].enabled) continue; + if (dbg_is_early) { + early_dr7 &= ~encode_dr7(i, breakinfo[i].len, + breakinfo[i].type); + continue; + } bp = *per_cpu_ptr(breakinfo[i].pev, cpu); if (bp->attr.disabled == 1) continue; @@ -399,23 +411,6 @@ void kgdb_disable_hw_debug(struct pt_regs *regs) } } -/** - * kgdb_post_primary_code - Save error vector/code numbers. - * @regs: Original pt_regs. - * @e_vector: Original error vector. - * @err_code: Original error code. - * - * This is needed on architectures which support SMP and KGDB. - * This function is called after all the slave cpus have been put - * to a know spin state and the primary CPU has control over KGDB. - */ -void kgdb_post_primary_code(struct pt_regs *regs, int e_vector, int err_code) -{ - /* primary processor is completely in the debugger */ - gdb_x86vector = e_vector; - gdb_x86errcode = err_code; -} - #ifdef CONFIG_SMP /** * kgdb_roundup_cpus - Get other CPUs into a holding pattern @@ -461,7 +456,6 @@ int kgdb_arch_handle_exception(int e_vector, int signo, int err_code, { unsigned long addr; char *ptr; - int newPC; switch (remcomInBuffer[0]) { case 'c': @@ -472,8 +466,6 @@ int kgdb_arch_handle_exception(int e_vector, int signo, int err_code, linux_regs->ip = addr; case 'D': case 'k': - newPC = linux_regs->ip; - /* clear the trace bit */ linux_regs->flags &= ~X86_EFLAGS_TF; atomic_set(&kgdb_cpu_doing_single_step, -1); @@ -567,7 +559,7 @@ static int __kgdb_notify(struct die_args *args, unsigned long cmd) return NOTIFY_DONE; } - if (kgdb_handle_exception(args->trapnr, args->signr, args->err, regs)) + if (kgdb_handle_exception(args->trapnr, args->signr, cmd, regs)) return NOTIFY_DONE; /* Must touch watchdog before return to normal operation */ @@ -575,6 +567,24 @@ static int __kgdb_notify(struct die_args *args, unsigned long cmd) return NOTIFY_STOP; } +int kgdb_ll_trap(int cmd, const char *str, + struct pt_regs *regs, long err, int trap, int sig) +{ + struct die_args args = { + .regs = regs, + .str = str, + .err = err, + .trapnr = trap, + .signr = sig, + + }; + + if (!kgdb_io_module_registered) + return NOTIFY_DONE; + + return __kgdb_notify(&args, cmd); +} + static int kgdb_notify(struct notifier_block *self, unsigned long cmd, void *ptr) { @@ -605,14 +615,21 @@ static struct notifier_block kgdb_notifier = { */ int kgdb_arch_init(void) { + return register_die_notifier(&kgdb_notifier); +} + +static void kgdb_hw_overflow_handler(struct perf_event *event, int nmi, + struct perf_sample_data *data, struct pt_regs *regs) +{ + kgdb_ll_trap(DIE_DEBUG, "debug", regs, 0, 0, SIGTRAP); +} + +void kgdb_arch_late(void) +{ int i, cpu; - int ret; struct perf_event_attr attr; struct perf_event **pevent; - ret = register_die_notifier(&kgdb_notifier); - if (ret != 0) - return ret; /* * Pre-allocate the hw breakpoint structions in the non-atomic * portion of kgdb because this operation requires mutexs to @@ -623,24 +640,27 @@ int kgdb_arch_init(void) attr.bp_len = HW_BREAKPOINT_LEN_1; attr.bp_type = HW_BREAKPOINT_W; attr.disabled = 1; - for (i = 0; i < 4; i++) { + for (i = 0; i < HBP_NUM; i++) { + if (breakinfo[i].pev) + continue; breakinfo[i].pev = register_wide_hw_breakpoint(&attr, NULL); if (IS_ERR(breakinfo[i].pev)) { - printk(KERN_ERR "kgdb: Could not allocate hw breakpoints\n"); + printk(KERN_ERR "kgdb: Could not allocate hw" + "breakpoints\nDisabling the kernel debugger\n"); breakinfo[i].pev = NULL; kgdb_arch_exit(); - return -1; + return; } for_each_online_cpu(cpu) { pevent = per_cpu_ptr(breakinfo[i].pev, cpu); pevent[0]->hw.sample_period = 1; + pevent[0]->overflow_handler = kgdb_hw_overflow_handler; if (pevent[0]->destroy != NULL) { pevent[0]->destroy = NULL; release_bp_slot(*pevent); } } } - return ret; } /** @@ -690,6 +710,11 @@ unsigned long kgdb_arch_pc(int exception, struct pt_regs *regs) return instruction_pointer(regs); } +void kgdb_arch_set_pc(struct pt_regs *regs, unsigned long ip) +{ + regs->ip = ip; +} + struct kgdb_arch arch_kgdb_ops = { /* Breakpoint instruction: */ .gdb_bpt_instr = { 0xcc }, diff --git a/arch/x86/kernel/kprobes.c b/arch/x86/kernel/kprobes.c index 1658efdfb4e5..1bfb6cf4dd55 100644 --- a/arch/x86/kernel/kprobes.c +++ b/arch/x86/kernel/kprobes.c @@ -126,16 +126,22 @@ static void __kprobes synthesize_reljump(void *from, void *to) } /* - * Check for the REX prefix which can only exist on X86_64 - * X86_32 always returns 0 + * Skip the prefixes of the instruction. */ -static int __kprobes is_REX_prefix(kprobe_opcode_t *insn) +static kprobe_opcode_t *__kprobes skip_prefixes(kprobe_opcode_t *insn) { + insn_attr_t attr; + + attr = inat_get_opcode_attribute((insn_byte_t)*insn); + while (inat_is_legacy_prefix(attr)) { + insn++; + attr = inat_get_opcode_attribute((insn_byte_t)*insn); + } #ifdef CONFIG_X86_64 - if ((*insn & 0xf0) == 0x40) - return 1; + if (inat_is_rex_prefix(attr)) + insn++; #endif - return 0; + return insn; } /* @@ -272,6 +278,9 @@ static int __kprobes can_probe(unsigned long paddr) */ static int __kprobes is_IF_modifier(kprobe_opcode_t *insn) { + /* Skip prefixes */ + insn = skip_prefixes(insn); + switch (*insn) { case 0xfa: /* cli */ case 0xfb: /* sti */ @@ -280,13 +289,6 @@ static int __kprobes is_IF_modifier(kprobe_opcode_t *insn) return 1; } - /* - * on X86_64, 0x40-0x4f are REX prefixes so we need to look - * at the next byte instead.. but of course not recurse infinitely - */ - if (is_REX_prefix(insn)) - return is_IF_modifier(++insn); - return 0; } @@ -422,14 +424,22 @@ static void __kprobes set_current_kprobe(struct kprobe *p, struct pt_regs *regs, static void __kprobes clear_btf(void) { - if (test_thread_flag(TIF_DEBUGCTLMSR)) - update_debugctlmsr(0); + if (test_thread_flag(TIF_BLOCKSTEP)) { + unsigned long debugctl = get_debugctlmsr(); + + debugctl &= ~DEBUGCTLMSR_BTF; + update_debugctlmsr(debugctl); + } } static void __kprobes restore_btf(void) { - if (test_thread_flag(TIF_DEBUGCTLMSR)) - update_debugctlmsr(current->thread.debugctlmsr); + if (test_thread_flag(TIF_BLOCKSTEP)) { + unsigned long debugctl = get_debugctlmsr(); + + debugctl |= DEBUGCTLMSR_BTF; + update_debugctlmsr(debugctl); + } } void __kprobes arch_prepare_kretprobe(struct kretprobe_instance *ri, @@ -632,8 +642,8 @@ static int __kprobes kprobe_handler(struct pt_regs *regs) /* Skip cs, ip, orig_ax and gs. */ \ " subl $16, %esp\n" \ " pushl %fs\n" \ - " pushl %ds\n" \ " pushl %es\n" \ + " pushl %ds\n" \ " pushl %eax\n" \ " pushl %ebp\n" \ " pushl %edi\n" \ @@ -795,9 +805,8 @@ static void __kprobes resume_execution(struct kprobe *p, unsigned long orig_ip = (unsigned long)p->addr; kprobe_opcode_t *insn = p->ainsn.insn; - /*skip the REX prefix*/ - if (is_REX_prefix(insn)) - insn++; + /* Skip prefixes */ + insn = skip_prefixes(insn); regs->flags &= ~X86_EFLAGS_TF; switch (*insn) { diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c index feaeb0d3aa4f..eb9b76c716c2 100644 --- a/arch/x86/kernel/kvmclock.c +++ b/arch/x86/kernel/kvmclock.c @@ -29,6 +29,8 @@ #define KVM_SCALE 22 static int kvmclock = 1; +static int msr_kvm_system_time = MSR_KVM_SYSTEM_TIME; +static int msr_kvm_wall_clock = MSR_KVM_WALL_CLOCK; static int parse_no_kvmclock(char *arg) { @@ -54,7 +56,8 @@ static unsigned long kvm_get_wallclock(void) low = (int)__pa_symbol(&wall_clock); high = ((u64)__pa_symbol(&wall_clock) >> 32); - native_write_msr(MSR_KVM_WALL_CLOCK, low, high); + + native_write_msr(msr_kvm_wall_clock, low, high); vcpu_time = &get_cpu_var(hv_clock); pvclock_read_wallclock(&wall_clock, vcpu_time, &ts); @@ -130,7 +133,8 @@ static int kvm_register_clock(char *txt) high = ((u64)__pa(&per_cpu(hv_clock, cpu)) >> 32); printk(KERN_INFO "kvm-clock: cpu %d, msr %x:%x, %s\n", cpu, high, low, txt); - return native_write_msr_safe(MSR_KVM_SYSTEM_TIME, low, high); + + return native_write_msr_safe(msr_kvm_system_time, low, high); } #ifdef CONFIG_X86_LOCAL_APIC @@ -165,14 +169,14 @@ static void __init kvm_smp_prepare_boot_cpu(void) #ifdef CONFIG_KEXEC static void kvm_crash_shutdown(struct pt_regs *regs) { - native_write_msr_safe(MSR_KVM_SYSTEM_TIME, 0, 0); + native_write_msr(msr_kvm_system_time, 0, 0); native_machine_crash_shutdown(regs); } #endif static void kvm_shutdown(void) { - native_write_msr_safe(MSR_KVM_SYSTEM_TIME, 0, 0); + native_write_msr(msr_kvm_system_time, 0, 0); native_machine_shutdown(); } @@ -181,27 +185,37 @@ void __init kvmclock_init(void) if (!kvm_para_available()) return; - if (kvmclock && kvm_para_has_feature(KVM_FEATURE_CLOCKSOURCE)) { - if (kvm_register_clock("boot clock")) - return; - pv_time_ops.sched_clock = kvm_clock_read; - x86_platform.calibrate_tsc = kvm_get_tsc_khz; - x86_platform.get_wallclock = kvm_get_wallclock; - x86_platform.set_wallclock = kvm_set_wallclock; + if (kvmclock && kvm_para_has_feature(KVM_FEATURE_CLOCKSOURCE2)) { + msr_kvm_system_time = MSR_KVM_SYSTEM_TIME_NEW; + msr_kvm_wall_clock = MSR_KVM_WALL_CLOCK_NEW; + } else if (!(kvmclock && kvm_para_has_feature(KVM_FEATURE_CLOCKSOURCE))) + return; + + printk(KERN_INFO "kvm-clock: Using msrs %x and %x", + msr_kvm_system_time, msr_kvm_wall_clock); + + if (kvm_register_clock("boot clock")) + return; + pv_time_ops.sched_clock = kvm_clock_read; + x86_platform.calibrate_tsc = kvm_get_tsc_khz; + x86_platform.get_wallclock = kvm_get_wallclock; + x86_platform.set_wallclock = kvm_set_wallclock; #ifdef CONFIG_X86_LOCAL_APIC - x86_cpuinit.setup_percpu_clockev = - kvm_setup_secondary_clock; + x86_cpuinit.setup_percpu_clockev = + kvm_setup_secondary_clock; #endif #ifdef CONFIG_SMP - smp_ops.smp_prepare_boot_cpu = kvm_smp_prepare_boot_cpu; + smp_ops.smp_prepare_boot_cpu = kvm_smp_prepare_boot_cpu; #endif - machine_ops.shutdown = kvm_shutdown; + machine_ops.shutdown = kvm_shutdown; #ifdef CONFIG_KEXEC - machine_ops.crash_shutdown = kvm_crash_shutdown; + machine_ops.crash_shutdown = kvm_crash_shutdown; #endif - kvm_get_preset_lpj(); - clocksource_register(&kvm_clock); - pv_info.paravirt_enabled = 1; - pv_info.name = "KVM"; - } + kvm_get_preset_lpj(); + clocksource_register(&kvm_clock); + pv_info.paravirt_enabled = 1; + pv_info.name = "KVM"; + + if (kvm_para_has_feature(KVM_FEATURE_CLOCKSOURCE_STABLE_BIT)) + pvclock_set_flags(PVCLOCK_TSC_STABLE_BIT); } diff --git a/arch/x86/kernel/microcode_core.c b/arch/x86/kernel/microcode_core.c index cceb5bc3c3c2..fa6551d36c10 100644 --- a/arch/x86/kernel/microcode_core.c +++ b/arch/x86/kernel/microcode_core.c @@ -201,9 +201,9 @@ static int do_microcode_update(const void __user *buf, size_t size) return error; } -static int microcode_open(struct inode *unused1, struct file *unused2) +static int microcode_open(struct inode *inode, struct file *file) { - return capable(CAP_SYS_RAWIO) ? 0 : -EPERM; + return capable(CAP_SYS_RAWIO) ? nonseekable_open(inode, file) : -EPERM; } static ssize_t microcode_write(struct file *file, const char __user *buf, @@ -260,6 +260,7 @@ static void microcode_dev_exit(void) } MODULE_ALIAS_MISCDEV(MICROCODE_MINOR); +MODULE_ALIAS("devname:cpu/microcode"); #else #define microcode_dev_init() 0 #define microcode_dev_exit() do { } while (0) diff --git a/arch/x86/kernel/microcode_intel.c b/arch/x86/kernel/microcode_intel.c index 85a343e28937..356170262a93 100644 --- a/arch/x86/kernel/microcode_intel.c +++ b/arch/x86/kernel/microcode_intel.c @@ -343,10 +343,11 @@ static enum ucode_state generic_load_microcode(int cpu, void *data, size_t size, int (*get_ucode_data)(void *, const void *, size_t)) { struct ucode_cpu_info *uci = ucode_cpu_info + cpu; - u8 *ucode_ptr = data, *new_mc = NULL, *mc; + u8 *ucode_ptr = data, *new_mc = NULL, *mc = NULL; int new_rev = uci->cpu_sig.rev; unsigned int leftover = size; enum ucode_state state = UCODE_OK; + unsigned int curr_mc_size = 0; while (leftover) { struct microcode_header_intel mc_header; @@ -361,9 +362,15 @@ static enum ucode_state generic_load_microcode(int cpu, void *data, size_t size, break; } - mc = vmalloc(mc_size); - if (!mc) - break; + /* For performance reasons, reuse mc area when possible */ + if (!mc || mc_size > curr_mc_size) { + if (mc) + vfree(mc); + mc = vmalloc(mc_size); + if (!mc) + break; + curr_mc_size = mc_size; + } if (get_ucode_data(mc, ucode_ptr, mc_size) || microcode_sanity_check(mc) < 0) { @@ -376,13 +383,16 @@ static enum ucode_state generic_load_microcode(int cpu, void *data, size_t size, vfree(new_mc); new_rev = mc_header.rev; new_mc = mc; - } else - vfree(mc); + mc = NULL; /* trigger new vmalloc */ + } ucode_ptr += mc_size; leftover -= mc_size; } + if (mc) + vfree(mc); + if (leftover) { if (new_mc) vfree(new_mc); diff --git a/arch/x86/kernel/mpparse.c b/arch/x86/kernel/mpparse.c index e81030f71a8f..d86dbf7e54be 100644 --- a/arch/x86/kernel/mpparse.c +++ b/arch/x86/kernel/mpparse.c @@ -115,21 +115,6 @@ static void __init MP_bus_info(struct mpc_bus *m) printk(KERN_WARNING "Unknown bustype %s - ignoring\n", str); } -static int bad_ioapic(unsigned long address) -{ - if (nr_ioapics >= MAX_IO_APICS) { - printk(KERN_ERR "ERROR: Max # of I/O APICs (%d) exceeded " - "(found %d)\n", MAX_IO_APICS, nr_ioapics); - panic("Recompile kernel with bigger MAX_IO_APICS!\n"); - } - if (!address) { - printk(KERN_ERR "WARNING: Bogus (zero) I/O APIC address" - " found in table, skipping!\n"); - return 1; - } - return 0; -} - static void __init MP_ioapic_info(struct mpc_ioapic *m) { if (!(m->flags & MPC_APIC_USABLE)) @@ -138,15 +123,7 @@ static void __init MP_ioapic_info(struct mpc_ioapic *m) printk(KERN_INFO "I/O APIC #%d Version %d at 0x%X.\n", m->apicid, m->apicver, m->apicaddr); - if (bad_ioapic(m->apicaddr)) - return; - - mp_ioapics[nr_ioapics].apicaddr = m->apicaddr; - mp_ioapics[nr_ioapics].apicid = m->apicid; - mp_ioapics[nr_ioapics].type = m->type; - mp_ioapics[nr_ioapics].apicver = m->apicver; - mp_ioapics[nr_ioapics].flags = m->flags; - nr_ioapics++; + mp_register_ioapic(m->apicid, m->apicaddr, gsi_top); } static void print_MP_intsrc_info(struct mpc_intsrc *m) diff --git a/arch/x86/kernel/mrst.c b/arch/x86/kernel/mrst.c index 0aad8670858e..79ae68154e87 100644 --- a/arch/x86/kernel/mrst.c +++ b/arch/x86/kernel/mrst.c @@ -25,8 +25,34 @@ #include <asm/i8259.h> #include <asm/apb_timer.h> +/* + * the clockevent devices on Moorestown/Medfield can be APBT or LAPIC clock, + * cmdline option x86_mrst_timer can be used to override the configuration + * to prefer one or the other. + * at runtime, there are basically three timer configurations: + * 1. per cpu apbt clock only + * 2. per cpu always-on lapic clocks only, this is Penwell/Medfield only + * 3. per cpu lapic clock (C3STOP) and one apbt clock, with broadcast. + * + * by default (without cmdline option), platform code first detects cpu type + * to see if we are on lincroft or penwell, then set up both lapic or apbt + * clocks accordingly. + * i.e. by default, medfield uses configuration #2, moorestown uses #1. + * config #3 is supported but not recommended on medfield. + * + * rating and feature summary: + * lapic (with C3STOP) --------- 100 + * apbt (always-on) ------------ 110 + * lapic (always-on,ARAT) ------ 150 + */ + +__cpuinitdata enum mrst_timer_options mrst_timer_options; + static u32 sfi_mtimer_usage[SFI_MTMR_MAX_NUM]; static struct sfi_timer_table_entry sfi_mtimer_array[SFI_MTMR_MAX_NUM]; +enum mrst_cpu_type __mrst_cpu_chip; +EXPORT_SYMBOL_GPL(__mrst_cpu_chip); + int sfi_mtimer_num; struct sfi_rtc_table_entry sfi_mrtc_array[SFI_MRTC_MAX]; @@ -167,18 +193,6 @@ int __init sfi_parse_mrtc(struct sfi_table_header *table) return 0; } -/* - * the secondary clock in Moorestown can be APBT or LAPIC clock, default to - * APBT but cmdline option can also override it. - */ -static void __cpuinit mrst_setup_secondary_clock(void) -{ - /* restore default lapic clock if disabled by cmdline */ - if (disable_apbt_percpu) - return setup_secondary_APIC_clock(); - apbt_setup_secondary_clock(); -} - static unsigned long __init mrst_calibrate_tsc(void) { unsigned long flags, fast_calibrate; @@ -195,6 +209,21 @@ static unsigned long __init mrst_calibrate_tsc(void) void __init mrst_time_init(void) { + switch (mrst_timer_options) { + case MRST_TIMER_APBT_ONLY: + break; + case MRST_TIMER_LAPIC_APBT: + x86_init.timers.setup_percpu_clockev = setup_boot_APIC_clock; + x86_cpuinit.setup_percpu_clockev = setup_secondary_APIC_clock; + break; + default: + if (!boot_cpu_has(X86_FEATURE_ARAT)) + break; + x86_init.timers.setup_percpu_clockev = setup_boot_APIC_clock; + x86_cpuinit.setup_percpu_clockev = setup_secondary_APIC_clock; + return; + } + /* we need at least one APB timer */ sfi_table_parse(SFI_SIG_MTMR, NULL, NULL, sfi_parse_mtmr); pre_init_apic_IRQ0(); apbt_time_init(); @@ -205,16 +234,27 @@ void __init mrst_rtc_init(void) sfi_table_parse(SFI_SIG_MRTC, NULL, NULL, sfi_parse_mrtc); } -/* - * if we use per cpu apb timer, the bootclock already setup. if we use lapic - * timer and one apbt timer for broadcast, we need to set up lapic boot clock. - */ -static void __init mrst_setup_boot_clock(void) +void __cpuinit mrst_arch_setup(void) { - pr_info("%s: per cpu apbt flag %d \n", __func__, disable_apbt_percpu); - if (disable_apbt_percpu) - setup_boot_APIC_clock(); -}; + if (boot_cpu_data.x86 == 6 && boot_cpu_data.x86_model == 0x27) + __mrst_cpu_chip = MRST_CPU_CHIP_PENWELL; + else if (boot_cpu_data.x86 == 6 && boot_cpu_data.x86_model == 0x26) + __mrst_cpu_chip = MRST_CPU_CHIP_LINCROFT; + else { + pr_err("Unknown Moorestown CPU (%d:%d), default to Lincroft\n", + boot_cpu_data.x86, boot_cpu_data.x86_model); + __mrst_cpu_chip = MRST_CPU_CHIP_LINCROFT; + } + pr_debug("Moorestown CPU %s identified\n", + (__mrst_cpu_chip == MRST_CPU_CHIP_LINCROFT) ? + "Lincroft" : "Penwell"); +} + +/* MID systems don't have i8042 controller */ +static int mrst_i8042_detect(void) +{ + return 0; +} /* * Moorestown specific x86_init function overrides and early setup @@ -226,15 +266,46 @@ void __init x86_mrst_early_setup(void) x86_init.resources.reserve_resources = x86_init_noop; x86_init.timers.timer_init = mrst_time_init; - x86_init.timers.setup_percpu_clockev = mrst_setup_boot_clock; + x86_init.timers.setup_percpu_clockev = x86_init_noop; x86_init.irqs.pre_vector_init = x86_init_noop; - x86_cpuinit.setup_percpu_clockev = mrst_setup_secondary_clock; + x86_init.oem.arch_setup = mrst_arch_setup; + + x86_cpuinit.setup_percpu_clockev = apbt_setup_secondary_clock; x86_platform.calibrate_tsc = mrst_calibrate_tsc; + x86_platform.i8042_detect = mrst_i8042_detect; x86_init.pci.init = pci_mrst_init; x86_init.pci.fixup_irqs = x86_init_noop; legacy_pic = &null_legacy_pic; + + /* Avoid searching for BIOS MP tables */ + x86_init.mpparse.find_smp_config = x86_init_noop; + x86_init.mpparse.get_smp_config = x86_init_uint_noop; + +} + +/* + * if user does not want to use per CPU apb timer, just give it a lower rating + * than local apic timer and skip the late per cpu timer init. + */ +static inline int __init setup_x86_mrst_timer(char *arg) +{ + if (!arg) + return -EINVAL; + + if (strcmp("apbt_only", arg) == 0) + mrst_timer_options = MRST_TIMER_APBT_ONLY; + else if (strcmp("lapic_and_apbt", arg) == 0) + mrst_timer_options = MRST_TIMER_LAPIC_APBT; + else { + pr_warning("X86 MRST timer option %s not recognised" + " use x86_mrst_timer=apbt_only or lapic_and_apbt\n", + arg); + return -EINVAL; + } + return 0; } +__setup("x86_mrst_timer=", setup_x86_mrst_timer); diff --git a/arch/x86/kernel/msr.c b/arch/x86/kernel/msr.c index 4d4468e9f47c..7bf2dc4c8f70 100644 --- a/arch/x86/kernel/msr.c +++ b/arch/x86/kernel/msr.c @@ -230,7 +230,7 @@ static int __cpuinit msr_class_cpu_callback(struct notifier_block *nfb, msr_device_destroy(cpu); break; } - return err ? NOTIFY_BAD : NOTIFY_OK; + return notifier_from_errno(err); } static struct notifier_block __refdata msr_class_cpu_notifier = { diff --git a/arch/x86/kernel/olpc.c b/arch/x86/kernel/olpc.c index 8297160c41b3..0e0cdde519be 100644 --- a/arch/x86/kernel/olpc.c +++ b/arch/x86/kernel/olpc.c @@ -21,10 +21,7 @@ #include <asm/geode.h> #include <asm/setup.h> #include <asm/olpc.h> - -#ifdef CONFIG_OPEN_FIRMWARE -#include <asm/ofw.h> -#endif +#include <asm/olpc_ofw.h> struct olpc_platform_t olpc_platform_info; EXPORT_SYMBOL_GPL(olpc_platform_info); @@ -145,7 +142,7 @@ restart: * The OBF flag will sometimes misbehave due to what we believe * is a hardware quirk.. */ - printk(KERN_DEBUG "olpc-ec: running cmd 0x%x\n", cmd); + pr_devel("olpc-ec: running cmd 0x%x\n", cmd); outb(cmd, 0x6c); if (wait_on_ibf(0x6c, 0)) { @@ -162,8 +159,7 @@ restart: " EC accept data!\n"); goto err; } - printk(KERN_DEBUG "olpc-ec: sending cmd arg 0x%x\n", - inbuf[i]); + pr_devel("olpc-ec: sending cmd arg 0x%x\n", inbuf[i]); outb(inbuf[i], 0x68); } } @@ -176,8 +172,7 @@ restart: goto restart; } outbuf[i] = inb(0x68); - printk(KERN_DEBUG "olpc-ec: received 0x%x\n", - outbuf[i]); + pr_devel("olpc-ec: received 0x%x\n", outbuf[i]); } } @@ -188,14 +183,15 @@ err: } EXPORT_SYMBOL_GPL(olpc_ec_cmd); -#ifdef CONFIG_OPEN_FIRMWARE +#ifdef CONFIG_OLPC_OPENFIRMWARE static void __init platform_detect(void) { size_t propsize; __be32 rev; + const void *args[] = { NULL, "board-revision-int", &rev, (void *)4 }; + void *res[] = { &propsize }; - if (ofw("getprop", 4, 1, NULL, "board-revision-int", &rev, 4, - &propsize) || propsize != 4) { + if (olpc_ofw("getprop", args, res) || propsize != 4) { printk(KERN_ERR "ofw: getprop call failed!\n"); rev = cpu_to_be32(0); } diff --git a/arch/x86/kernel/olpc_ofw.c b/arch/x86/kernel/olpc_ofw.c new file mode 100644 index 000000000000..3218aa71ab5e --- /dev/null +++ b/arch/x86/kernel/olpc_ofw.c @@ -0,0 +1,106 @@ +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/init.h> +#include <asm/page.h> +#include <asm/setup.h> +#include <asm/io.h> +#include <asm/pgtable.h> +#include <asm/olpc_ofw.h> + +/* address of OFW callback interface; will be NULL if OFW isn't found */ +static int (*olpc_ofw_cif)(int *); + +/* page dir entry containing OFW's pgdir table; filled in by head_32.S */ +u32 olpc_ofw_pgd __initdata; + +static DEFINE_SPINLOCK(ofw_lock); + +#define MAXARGS 10 + +void __init setup_olpc_ofw_pgd(void) +{ + pgd_t *base, *ofw_pde; + + if (!olpc_ofw_cif) + return; + + /* fetch OFW's PDE */ + base = early_ioremap(olpc_ofw_pgd, sizeof(olpc_ofw_pgd) * PTRS_PER_PGD); + if (!base) { + printk(KERN_ERR "failed to remap OFW's pgd - disabling OFW!\n"); + olpc_ofw_cif = NULL; + return; + } + ofw_pde = &base[OLPC_OFW_PDE_NR]; + + /* install OFW's PDE permanently into the kernel's pgtable */ + set_pgd(&swapper_pg_dir[OLPC_OFW_PDE_NR], *ofw_pde); + /* implicit optimization barrier here due to uninline function return */ + + early_iounmap(base, sizeof(olpc_ofw_pgd) * PTRS_PER_PGD); +} + +int __olpc_ofw(const char *name, int nr_args, const void **args, int nr_res, + void **res) +{ + int ofw_args[MAXARGS + 3]; + unsigned long flags; + int ret, i, *p; + + BUG_ON(nr_args + nr_res > MAXARGS); + + if (!olpc_ofw_cif) + return -EIO; + + ofw_args[0] = (int)name; + ofw_args[1] = nr_args; + ofw_args[2] = nr_res; + + p = &ofw_args[3]; + for (i = 0; i < nr_args; i++, p++) + *p = (int)args[i]; + + /* call into ofw */ + spin_lock_irqsave(&ofw_lock, flags); + ret = olpc_ofw_cif(ofw_args); + spin_unlock_irqrestore(&ofw_lock, flags); + + if (!ret) { + for (i = 0; i < nr_res; i++, p++) + *((int *)res[i]) = *p; + } + + return ret; +} +EXPORT_SYMBOL_GPL(__olpc_ofw); + +/* OFW cif _should_ be above this address */ +#define OFW_MIN 0xff000000 + +/* OFW starts on a 1MB boundary */ +#define OFW_BOUND (1<<20) + +void __init olpc_ofw_detect(void) +{ + struct olpc_ofw_header *hdr = &boot_params.olpc_ofw_header; + unsigned long start; + + /* ensure OFW booted us by checking for "OFW " string */ + if (hdr->ofw_magic != OLPC_OFW_SIG) + return; + + olpc_ofw_cif = (int (*)(int *))hdr->cif_handler; + + if ((unsigned long)olpc_ofw_cif < OFW_MIN) { + printk(KERN_ERR "OFW detected, but cif has invalid address 0x%lx - disabling.\n", + (unsigned long)olpc_ofw_cif); + olpc_ofw_cif = NULL; + return; + } + + /* determine where OFW starts in memory */ + start = round_down((unsigned long)olpc_ofw_cif, OFW_BOUND); + printk(KERN_INFO "OFW detected in memory, cif @ 0x%lx (reserving top %ldMB)\n", + (unsigned long)olpc_ofw_cif, (-start) >> 20); + reserve_top_address(-start); +} diff --git a/arch/x86/kernel/pci-calgary_64.c b/arch/x86/kernel/pci-calgary_64.c index fb99f7edb341..078d4ec1a9d9 100644 --- a/arch/x86/kernel/pci-calgary_64.c +++ b/arch/x86/kernel/pci-calgary_64.c @@ -103,11 +103,16 @@ int use_calgary __read_mostly = 0; #define PMR_SOFTSTOPFAULT 0x40000000 #define PMR_HARDSTOP 0x20000000 -#define MAX_NUM_OF_PHBS 8 /* how many PHBs in total? */ -#define MAX_NUM_CHASSIS 8 /* max number of chassis */ -/* MAX_PHB_BUS_NUM is the maximal possible dev->bus->number */ -#define MAX_PHB_BUS_NUM (MAX_NUM_OF_PHBS * MAX_NUM_CHASSIS * 2) -#define PHBS_PER_CALGARY 4 +/* + * The maximum PHB bus number. + * x3950M2 (rare): 8 chassis, 48 PHBs per chassis = 384 + * x3950M2: 4 chassis, 48 PHBs per chassis = 192 + * x3950 (PCIE): 8 chassis, 32 PHBs per chassis = 256 + * x3950 (PCIX): 8 chassis, 16 PHBs per chassis = 128 + */ +#define MAX_PHB_BUS_NUM 256 + +#define PHBS_PER_CALGARY 4 /* register offsets in Calgary's internal register space */ static const unsigned long tar_offsets[] = { @@ -1051,8 +1056,6 @@ static int __init calgary_init_one(struct pci_dev *dev) struct iommu_table *tbl; int ret; - BUG_ON(dev->bus->number >= MAX_PHB_BUS_NUM); - bbar = busno_to_bbar(dev->bus->number); ret = calgary_setup_tar(dev, bbar); if (ret) diff --git a/arch/x86/kernel/pci-swiotlb.c b/arch/x86/kernel/pci-swiotlb.c index 7d2829dde20e..a5bc528d4328 100644 --- a/arch/x86/kernel/pci-swiotlb.c +++ b/arch/x86/kernel/pci-swiotlb.c @@ -31,8 +31,6 @@ static struct dma_map_ops swiotlb_dma_ops = { .free_coherent = swiotlb_free_coherent, .sync_single_for_cpu = swiotlb_sync_single_for_cpu, .sync_single_for_device = swiotlb_sync_single_for_device, - .sync_single_range_for_cpu = swiotlb_sync_single_range_for_cpu, - .sync_single_range_for_device = swiotlb_sync_single_range_for_device, .sync_sg_for_cpu = swiotlb_sync_sg_for_cpu, .sync_sg_for_device = swiotlb_sync_sg_for_device, .map_sg = swiotlb_map_sg_attrs, diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index 0415c3ef91b5..d401f1d2d06e 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -20,7 +20,6 @@ #include <asm/idle.h> #include <asm/uaccess.h> #include <asm/i387.h> -#include <asm/ds.h> #include <asm/debugreg.h> unsigned long idle_halt; @@ -29,29 +28,26 @@ unsigned long idle_nomwait; EXPORT_SYMBOL(idle_nomwait); struct kmem_cache *task_xstate_cachep; +EXPORT_SYMBOL_GPL(task_xstate_cachep); int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src) { + int ret; + *dst = *src; - if (src->thread.xstate) { - dst->thread.xstate = kmem_cache_alloc(task_xstate_cachep, - GFP_KERNEL); - if (!dst->thread.xstate) - return -ENOMEM; - WARN_ON((unsigned long)dst->thread.xstate & 15); - memcpy(dst->thread.xstate, src->thread.xstate, xstate_size); + if (fpu_allocated(&src->thread.fpu)) { + memset(&dst->thread.fpu, 0, sizeof(dst->thread.fpu)); + ret = fpu_alloc(&dst->thread.fpu); + if (ret) + return ret; + fpu_copy(&dst->thread.fpu, &src->thread.fpu); } return 0; } void free_thread_xstate(struct task_struct *tsk) { - if (tsk->thread.xstate) { - kmem_cache_free(task_xstate_cachep, tsk->thread.xstate); - tsk->thread.xstate = NULL; - } - - WARN(tsk->thread.ds_ctx, "leaking DS context\n"); + fpu_free(&tsk->thread.fpu); } void free_thread_info(struct thread_info *ti) @@ -198,11 +194,16 @@ void __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p, prev = &prev_p->thread; next = &next_p->thread; - if (test_tsk_thread_flag(next_p, TIF_DS_AREA_MSR) || - test_tsk_thread_flag(prev_p, TIF_DS_AREA_MSR)) - ds_switch_to(prev_p, next_p); - else if (next->debugctlmsr != prev->debugctlmsr) - update_debugctlmsr(next->debugctlmsr); + if (test_tsk_thread_flag(prev_p, TIF_BLOCKSTEP) ^ + test_tsk_thread_flag(next_p, TIF_BLOCKSTEP)) { + unsigned long debugctl = get_debugctlmsr(); + + debugctl &= ~DEBUGCTLMSR_BTF; + if (test_tsk_thread_flag(next_p, TIF_BLOCKSTEP)) + debugctl |= DEBUGCTLMSR_BTF; + + update_debugctlmsr(debugctl); + } if (test_tsk_thread_flag(prev_p, TIF_NOTSC) ^ test_tsk_thread_flag(next_p, TIF_NOTSC)) { @@ -371,7 +372,7 @@ static inline int hlt_use_halt(void) void default_idle(void) { if (hlt_use_halt()) { - trace_power_start(POWER_CSTATE, 1); + trace_power_start(POWER_CSTATE, 1, smp_processor_id()); current_thread_info()->status &= ~TS_POLLING; /* * TS_POLLING-cleared state must be visible before we @@ -441,7 +442,7 @@ EXPORT_SYMBOL_GPL(cpu_idle_wait); */ void mwait_idle_with_hints(unsigned long ax, unsigned long cx) { - trace_power_start(POWER_CSTATE, (ax>>4)+1); + trace_power_start(POWER_CSTATE, (ax>>4)+1, smp_processor_id()); if (!need_resched()) { if (cpu_has(¤t_cpu_data, X86_FEATURE_CLFLUSH_MONITOR)) clflush((void *)¤t_thread_info()->flags); @@ -457,7 +458,7 @@ void mwait_idle_with_hints(unsigned long ax, unsigned long cx) static void mwait_idle(void) { if (!need_resched()) { - trace_power_start(POWER_CSTATE, 1); + trace_power_start(POWER_CSTATE, 1, smp_processor_id()); if (cpu_has(¤t_cpu_data, X86_FEATURE_CLFLUSH_MONITOR)) clflush((void *)¤t_thread_info()->flags); @@ -478,7 +479,7 @@ static void mwait_idle(void) */ static void poll_idle(void) { - trace_power_start(POWER_CSTATE, 0); + trace_power_start(POWER_CSTATE, 0, smp_processor_id()); local_irq_enable(); while (!need_resched()) cpu_relax(); @@ -525,44 +526,10 @@ static int __cpuinit mwait_usable(const struct cpuinfo_x86 *c) return (edx & MWAIT_EDX_C1); } -/* - * Check for AMD CPUs, where APIC timer interrupt does not wake up CPU from C1e. - * For more information see - * - Erratum #400 for NPT family 0xf and family 0x10 CPUs - * - Erratum #365 for family 0x11 (not affected because C1e not in use) - */ -static int __cpuinit check_c1e_idle(const struct cpuinfo_x86 *c) -{ - u64 val; - if (c->x86_vendor != X86_VENDOR_AMD) - goto no_c1e_idle; - - /* Family 0x0f models < rev F do not have C1E */ - if (c->x86 == 0x0F && c->x86_model >= 0x40) - return 1; - - if (c->x86 == 0x10) { - /* - * check OSVW bit for CPUs that are not affected - * by erratum #400 - */ - if (cpu_has(c, X86_FEATURE_OSVW)) { - rdmsrl(MSR_AMD64_OSVW_ID_LENGTH, val); - if (val >= 2) { - rdmsrl(MSR_AMD64_OSVW_STATUS, val); - if (!(val & BIT(1))) - goto no_c1e_idle; - } - } - return 1; - } - -no_c1e_idle: - return 0; -} +bool c1e_detected; +EXPORT_SYMBOL(c1e_detected); static cpumask_var_t c1e_mask; -static int c1e_detected; void c1e_remove_cpu(int cpu) { @@ -584,12 +551,12 @@ static void c1e_idle(void) u32 lo, hi; rdmsr(MSR_K8_INT_PENDING_MSG, lo, hi); + if (lo & K8_INTP_C1E_ACTIVE_MASK) { - c1e_detected = 1; + c1e_detected = true; if (!boot_cpu_has(X86_FEATURE_NONSTOP_TSC)) mark_tsc_unstable("TSC halt in AMD C1E"); printk(KERN_INFO "System has AMD C1E enabled\n"); - set_cpu_cap(&boot_cpu_data, X86_FEATURE_AMDC1E); } } @@ -638,7 +605,8 @@ void __cpuinit select_idle_routine(const struct cpuinfo_x86 *c) */ printk(KERN_INFO "using mwait in idle threads.\n"); pm_idle = mwait_idle; - } else if (check_c1e_idle(c)) { + } else if (cpu_has_amd_erratum(amd_erratum_400)) { + /* E400: APIC timer interrupt does not wake up CPU from C1e */ printk(KERN_INFO "using C1E aware idle routine\n"); pm_idle = c1e_idle; } else diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c index f6c62667e30c..96586c3cbbbf 100644 --- a/arch/x86/kernel/process_32.c +++ b/arch/x86/kernel/process_32.c @@ -55,9 +55,10 @@ #include <asm/cpu.h> #include <asm/idle.h> #include <asm/syscalls.h> -#include <asm/ds.h> #include <asm/debugreg.h> +#include <trace/events/power.h> + asmlinkage void ret_from_fork(void) __asm__("ret_from_fork"); /* @@ -112,6 +113,8 @@ void cpu_idle(void) stop_critical_timings(); pm_idle(); start_critical_timings(); + + trace_power_end(smp_processor_id()); } tick_nohz_restart_sched_tick(); preempt_enable_no_resched(); @@ -238,13 +241,6 @@ int copy_thread(unsigned long clone_flags, unsigned long sp, kfree(p->thread.io_bitmap_ptr); p->thread.io_bitmap_max = 0; } - - clear_tsk_thread_flag(p, TIF_DS_AREA_MSR); - p->thread.ds_ctx = NULL; - - clear_tsk_thread_flag(p, TIF_DEBUGCTLMSR); - p->thread.debugctlmsr = 0; - return err; } @@ -317,7 +313,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) /* we're going to use this soon, after a few expensive things */ if (preload_fpu) - prefetch(next->xstate); + prefetch(next->fpu.state); /* * Reload esp0. diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index 17cb3295cbf7..3d9ea531ddd1 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -49,9 +49,10 @@ #include <asm/ia32.h> #include <asm/idle.h> #include <asm/syscalls.h> -#include <asm/ds.h> #include <asm/debugreg.h> +#include <trace/events/power.h> + asmlinkage extern void ret_from_fork(void); DEFINE_PER_CPU(unsigned long, old_rsp); @@ -139,6 +140,9 @@ void cpu_idle(void) stop_critical_timings(); pm_idle(); start_critical_timings(); + + trace_power_end(smp_processor_id()); + /* In many cases the interrupt that ended idle has already called exit_idle. But some idle loops can be woken up without interrupt. */ @@ -313,13 +317,6 @@ int copy_thread(unsigned long clone_flags, unsigned long sp, if (err) goto out; } - - clear_tsk_thread_flag(p, TIF_DS_AREA_MSR); - p->thread.ds_ctx = NULL; - - clear_tsk_thread_flag(p, TIF_DEBUGCTLMSR); - p->thread.debugctlmsr = 0; - err = 0; out: if (err && p->thread.io_bitmap_ptr) { @@ -396,7 +393,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) /* we're going to use this soon, after a few expensive things */ if (preload_fpu) - prefetch(next->xstate); + prefetch(next->fpu.state); /* * Reload esp0, LDT and the page table pointer: diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c index 2e9b55027b7e..70c4872cd8aa 100644 --- a/arch/x86/kernel/ptrace.c +++ b/arch/x86/kernel/ptrace.c @@ -2,9 +2,6 @@ /* * Pentium III FXSR, SSE support * Gareth Hughes <gareth@valinux.com>, May 2000 - * - * BTS tracing - * Markus Metzger <markus.t.metzger@intel.com>, Dec 2007 */ #include <linux/kernel.h> @@ -22,7 +19,6 @@ #include <linux/audit.h> #include <linux/seccomp.h> #include <linux/signal.h> -#include <linux/workqueue.h> #include <linux/perf_event.h> #include <linux/hw_breakpoint.h> @@ -36,7 +32,6 @@ #include <asm/desc.h> #include <asm/prctl.h> #include <asm/proto.h> -#include <asm/ds.h> #include <asm/hw_breakpoint.h> #include "tls.h" @@ -693,7 +688,7 @@ static int ptrace_set_breakpoint_addr(struct task_struct *tsk, int nr, struct perf_event_attr attr; if (!t->ptrace_bps[nr]) { - hw_breakpoint_init(&attr); + ptrace_breakpoint_init(&attr); /* * Put stub len and type to register (reserve) an inactive but * correct bp @@ -789,342 +784,6 @@ static int ioperm_get(struct task_struct *target, 0, IO_BITMAP_BYTES); } -#ifdef CONFIG_X86_PTRACE_BTS -/* - * A branch trace store context. - * - * Contexts may only be installed by ptrace_bts_config() and only for - * ptraced tasks. - * - * Contexts are destroyed when the tracee is detached from the tracer. - * The actual destruction work requires interrupts enabled, so the - * work is deferred and will be scheduled during __ptrace_unlink(). - * - * Contexts hold an additional task_struct reference on the traced - * task, as well as a reference on the tracer's mm. - * - * Ptrace already holds a task_struct for the duration of ptrace operations, - * but since destruction is deferred, it may be executed after both - * tracer and tracee exited. - */ -struct bts_context { - /* The branch trace handle. */ - struct bts_tracer *tracer; - - /* The buffer used to store the branch trace and its size. */ - void *buffer; - unsigned int size; - - /* The mm that paid for the above buffer. */ - struct mm_struct *mm; - - /* The task this context belongs to. */ - struct task_struct *task; - - /* The signal to send on a bts buffer overflow. */ - unsigned int bts_ovfl_signal; - - /* The work struct to destroy a context. */ - struct work_struct work; -}; - -static int alloc_bts_buffer(struct bts_context *context, unsigned int size) -{ - void *buffer = NULL; - int err = -ENOMEM; - - err = account_locked_memory(current->mm, current->signal->rlim, size); - if (err < 0) - return err; - - buffer = kzalloc(size, GFP_KERNEL); - if (!buffer) - goto out_refund; - - context->buffer = buffer; - context->size = size; - context->mm = get_task_mm(current); - - return 0; - - out_refund: - refund_locked_memory(current->mm, size); - return err; -} - -static inline void free_bts_buffer(struct bts_context *context) -{ - if (!context->buffer) - return; - - kfree(context->buffer); - context->buffer = NULL; - - refund_locked_memory(context->mm, context->size); - context->size = 0; - - mmput(context->mm); - context->mm = NULL; -} - -static void free_bts_context_work(struct work_struct *w) -{ - struct bts_context *context; - - context = container_of(w, struct bts_context, work); - - ds_release_bts(context->tracer); - put_task_struct(context->task); - free_bts_buffer(context); - kfree(context); -} - -static inline void free_bts_context(struct bts_context *context) -{ - INIT_WORK(&context->work, free_bts_context_work); - schedule_work(&context->work); -} - -static inline struct bts_context *alloc_bts_context(struct task_struct *task) -{ - struct bts_context *context = kzalloc(sizeof(*context), GFP_KERNEL); - if (context) { - context->task = task; - task->bts = context; - - get_task_struct(task); - } - - return context; -} - -static int ptrace_bts_read_record(struct task_struct *child, size_t index, - struct bts_struct __user *out) -{ - struct bts_context *context; - const struct bts_trace *trace; - struct bts_struct bts; - const unsigned char *at; - int error; - - context = child->bts; - if (!context) - return -ESRCH; - - trace = ds_read_bts(context->tracer); - if (!trace) - return -ESRCH; - - at = trace->ds.top - ((index + 1) * trace->ds.size); - if ((void *)at < trace->ds.begin) - at += (trace->ds.n * trace->ds.size); - - if (!trace->read) - return -EOPNOTSUPP; - - error = trace->read(context->tracer, at, &bts); - if (error < 0) - return error; - - if (copy_to_user(out, &bts, sizeof(bts))) - return -EFAULT; - - return sizeof(bts); -} - -static int ptrace_bts_drain(struct task_struct *child, - long size, - struct bts_struct __user *out) -{ - struct bts_context *context; - const struct bts_trace *trace; - const unsigned char *at; - int error, drained = 0; - - context = child->bts; - if (!context) - return -ESRCH; - - trace = ds_read_bts(context->tracer); - if (!trace) - return -ESRCH; - - if (!trace->read) - return -EOPNOTSUPP; - - if (size < (trace->ds.top - trace->ds.begin)) - return -EIO; - - for (at = trace->ds.begin; (void *)at < trace->ds.top; - out++, drained++, at += trace->ds.size) { - struct bts_struct bts; - - error = trace->read(context->tracer, at, &bts); - if (error < 0) - return error; - - if (copy_to_user(out, &bts, sizeof(bts))) - return -EFAULT; - } - - memset(trace->ds.begin, 0, trace->ds.n * trace->ds.size); - - error = ds_reset_bts(context->tracer); - if (error < 0) - return error; - - return drained; -} - -static int ptrace_bts_config(struct task_struct *child, - long cfg_size, - const struct ptrace_bts_config __user *ucfg) -{ - struct bts_context *context; - struct ptrace_bts_config cfg; - unsigned int flags = 0; - - if (cfg_size < sizeof(cfg)) - return -EIO; - - if (copy_from_user(&cfg, ucfg, sizeof(cfg))) - return -EFAULT; - - context = child->bts; - if (!context) - context = alloc_bts_context(child); - if (!context) - return -ENOMEM; - - if (cfg.flags & PTRACE_BTS_O_SIGNAL) { - if (!cfg.signal) - return -EINVAL; - - return -EOPNOTSUPP; - context->bts_ovfl_signal = cfg.signal; - } - - ds_release_bts(context->tracer); - context->tracer = NULL; - - if ((cfg.flags & PTRACE_BTS_O_ALLOC) && (cfg.size != context->size)) { - int err; - - free_bts_buffer(context); - if (!cfg.size) - return 0; - - err = alloc_bts_buffer(context, cfg.size); - if (err < 0) - return err; - } - - if (cfg.flags & PTRACE_BTS_O_TRACE) - flags |= BTS_USER; - - if (cfg.flags & PTRACE_BTS_O_SCHED) - flags |= BTS_TIMESTAMPS; - - context->tracer = - ds_request_bts_task(child, context->buffer, context->size, - NULL, (size_t)-1, flags); - if (unlikely(IS_ERR(context->tracer))) { - int error = PTR_ERR(context->tracer); - - free_bts_buffer(context); - context->tracer = NULL; - return error; - } - - return sizeof(cfg); -} - -static int ptrace_bts_status(struct task_struct *child, - long cfg_size, - struct ptrace_bts_config __user *ucfg) -{ - struct bts_context *context; - const struct bts_trace *trace; - struct ptrace_bts_config cfg; - - context = child->bts; - if (!context) - return -ESRCH; - - if (cfg_size < sizeof(cfg)) - return -EIO; - - trace = ds_read_bts(context->tracer); - if (!trace) - return -ESRCH; - - memset(&cfg, 0, sizeof(cfg)); - cfg.size = trace->ds.end - trace->ds.begin; - cfg.signal = context->bts_ovfl_signal; - cfg.bts_size = sizeof(struct bts_struct); - - if (cfg.signal) - cfg.flags |= PTRACE_BTS_O_SIGNAL; - - if (trace->ds.flags & BTS_USER) - cfg.flags |= PTRACE_BTS_O_TRACE; - - if (trace->ds.flags & BTS_TIMESTAMPS) - cfg.flags |= PTRACE_BTS_O_SCHED; - - if (copy_to_user(ucfg, &cfg, sizeof(cfg))) - return -EFAULT; - - return sizeof(cfg); -} - -static int ptrace_bts_clear(struct task_struct *child) -{ - struct bts_context *context; - const struct bts_trace *trace; - - context = child->bts; - if (!context) - return -ESRCH; - - trace = ds_read_bts(context->tracer); - if (!trace) - return -ESRCH; - - memset(trace->ds.begin, 0, trace->ds.n * trace->ds.size); - - return ds_reset_bts(context->tracer); -} - -static int ptrace_bts_size(struct task_struct *child) -{ - struct bts_context *context; - const struct bts_trace *trace; - - context = child->bts; - if (!context) - return -ESRCH; - - trace = ds_read_bts(context->tracer); - if (!trace) - return -ESRCH; - - return (trace->ds.top - trace->ds.begin) / trace->ds.size; -} - -/* - * Called from __ptrace_unlink() after the child has been moved back - * to its original parent. - */ -void ptrace_bts_untrace(struct task_struct *child) -{ - if (unlikely(child->bts)) { - free_bts_context(child->bts); - child->bts = NULL; - } -} -#endif /* CONFIG_X86_PTRACE_BTS */ - /* * Called by kernel/ptrace.c when detaching.. * @@ -1252,39 +911,6 @@ long arch_ptrace(struct task_struct *child, long request, long addr, long data) break; #endif - /* - * These bits need more cooking - not enabled yet: - */ -#ifdef CONFIG_X86_PTRACE_BTS - case PTRACE_BTS_CONFIG: - ret = ptrace_bts_config - (child, data, (struct ptrace_bts_config __user *)addr); - break; - - case PTRACE_BTS_STATUS: - ret = ptrace_bts_status - (child, data, (struct ptrace_bts_config __user *)addr); - break; - - case PTRACE_BTS_SIZE: - ret = ptrace_bts_size(child); - break; - - case PTRACE_BTS_GET: - ret = ptrace_bts_read_record - (child, data, (struct bts_struct __user *) addr); - break; - - case PTRACE_BTS_CLEAR: - ret = ptrace_bts_clear(child); - break; - - case PTRACE_BTS_DRAIN: - ret = ptrace_bts_drain - (child, data, (struct bts_struct __user *) addr); - break; -#endif /* CONFIG_X86_PTRACE_BTS */ - default: ret = ptrace_request(child, request, addr, data); break; @@ -1544,14 +1170,6 @@ long compat_arch_ptrace(struct task_struct *child, compat_long_t request, case PTRACE_GET_THREAD_AREA: case PTRACE_SET_THREAD_AREA: -#ifdef CONFIG_X86_PTRACE_BTS - case PTRACE_BTS_CONFIG: - case PTRACE_BTS_STATUS: - case PTRACE_BTS_SIZE: - case PTRACE_BTS_GET: - case PTRACE_BTS_CLEAR: - case PTRACE_BTS_DRAIN: -#endif /* CONFIG_X86_PTRACE_BTS */ return arch_ptrace(child, request, addr, data); default: diff --git a/arch/x86/kernel/pvclock.c b/arch/x86/kernel/pvclock.c index 03801f2f761f..239427ca02af 100644 --- a/arch/x86/kernel/pvclock.c +++ b/arch/x86/kernel/pvclock.c @@ -31,8 +31,16 @@ struct pvclock_shadow_time { u32 tsc_to_nsec_mul; int tsc_shift; u32 version; + u8 flags; }; +static u8 valid_flags __read_mostly = 0; + +void pvclock_set_flags(u8 flags) +{ + valid_flags = flags; +} + /* * Scale a 64-bit delta by scaling and multiplying by a 32-bit fraction, * yielding a 64-bit result. @@ -91,6 +99,7 @@ static unsigned pvclock_get_time_values(struct pvclock_shadow_time *dst, dst->system_timestamp = src->system_time; dst->tsc_to_nsec_mul = src->tsc_to_system_mul; dst->tsc_shift = src->tsc_shift; + dst->flags = src->flags; rmb(); /* test version after fetching data */ } while ((src->version & 1) || (dst->version != src->version)); @@ -109,11 +118,14 @@ unsigned long pvclock_tsc_khz(struct pvclock_vcpu_time_info *src) return pv_tsc_khz; } +static atomic64_t last_value = ATOMIC64_INIT(0); + cycle_t pvclock_clocksource_read(struct pvclock_vcpu_time_info *src) { struct pvclock_shadow_time shadow; unsigned version; cycle_t ret, offset; + u64 last; do { version = pvclock_get_time_values(&shadow, src); @@ -123,6 +135,31 @@ cycle_t pvclock_clocksource_read(struct pvclock_vcpu_time_info *src) barrier(); } while (version != src->version); + if ((valid_flags & PVCLOCK_TSC_STABLE_BIT) && + (shadow.flags & PVCLOCK_TSC_STABLE_BIT)) + return ret; + + /* + * Assumption here is that last_value, a global accumulator, always goes + * forward. If we are less than that, we should not be much smaller. + * We assume there is an error marging we're inside, and then the correction + * does not sacrifice accuracy. + * + * For reads: global may have changed between test and return, + * but this means someone else updated poked the clock at a later time. + * We just need to make sure we are not seeing a backwards event. + * + * For updates: last_value = ret is not enough, since two vcpus could be + * updating at the same time, and one of them could be slightly behind, + * making the assumption that last_value always go forward fail to hold. + */ + last = atomic64_read(&last_value); + do { + if (ret < last) + return last; + last = atomic64_cmpxchg(&last_value, last, ret); + } while (unlikely(last != ret)); + return ret; } diff --git a/arch/x86/kernel/quirks.c b/arch/x86/kernel/quirks.c index 12e9feaa2f7a..939b9e98245f 100644 --- a/arch/x86/kernel/quirks.c +++ b/arch/x86/kernel/quirks.c @@ -495,6 +495,9 @@ void force_hpet_resume(void) /* * HPET MSI on some boards (ATI SB700/SB800) has side effect on * floppy DMA. Disable HPET MSI on such platforms. + * See erratum #27 (Misinterpreted MSI Requests May Result in + * Corrupted LPC DMA Data) in AMD Publication #46837, + * "SB700 Family Product Errata", Rev. 1.0, March 2010. */ static void force_disable_hpet_msi(struct pci_dev *unused) { diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c index 8e1aac86b50c..e3af342fe83a 100644 --- a/arch/x86/kernel/reboot.c +++ b/arch/x86/kernel/reboot.c @@ -228,6 +228,14 @@ static struct dmi_system_id __initdata reboot_dmi_table[] = { DMI_MATCH(DMI_PRODUCT_NAME, "Precision WorkStation T5400"), }, }, + { /* Handle problems with rebooting on Dell T7400's */ + .callback = set_bios_reboot, + .ident = "Dell Precision T7400", + .matches = { + DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."), + DMI_MATCH(DMI_PRODUCT_NAME, "Precision WorkStation T7400"), + }, + }, { /* Handle problems with rebooting on HP laptops */ .callback = set_bios_reboot, .ident = "HP Compaq Laptop", diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index c4851eff57b3..b008e7883207 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -102,6 +102,7 @@ #include <asm/paravirt.h> #include <asm/hypervisor.h> +#include <asm/olpc_ofw.h> #include <asm/percpu.h> #include <asm/topology.h> @@ -676,6 +677,17 @@ static struct dmi_system_id __initdata bad_bios_dmi_table[] = { DMI_MATCH(DMI_BOARD_NAME, "DG45FC"), }, }, + /* + * The Dell Inspiron Mini 1012 has DMI_BIOS_VENDOR = "Dell Inc.", so + * match on the product name. + */ + { + .callback = dmi_low_memory_corruption, + .ident = "Phoenix BIOS", + .matches = { + DMI_MATCH(DMI_PRODUCT_NAME, "Inspiron 1012"), + }, + }, #endif {} }; @@ -725,9 +737,15 @@ void __init setup_arch(char **cmdline_p) /* VMI may relocate the fixmap; do this before touching ioremap area */ vmi_init(); + /* OFW also may relocate the fixmap */ + olpc_ofw_detect(); + + early_trap_init(); early_cpu_init(); early_ioremap_init(); + setup_olpc_ofw_pgd(); + ROOT_DEV = old_decode_dev(boot_params.hdr.root_dev); screen_info = boot_params.screen_info; edid_info = boot_params.edid_info; diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c index ef6370b00e70..a60df9ae6454 100644 --- a/arch/x86/kernel/setup_percpu.c +++ b/arch/x86/kernel/setup_percpu.c @@ -21,12 +21,6 @@ #include <asm/cpu.h> #include <asm/stackprotector.h> -#ifdef CONFIG_DEBUG_PER_CPU_MAPS -# define DBG(fmt, ...) pr_dbg(fmt, ##__VA_ARGS__) -#else -# define DBG(fmt, ...) do { if (0) pr_dbg(fmt, ##__VA_ARGS__); } while (0) -#endif - DEFINE_PER_CPU(int, cpu_number); EXPORT_PER_CPU_SYMBOL(cpu_number); @@ -244,10 +238,19 @@ void __init setup_per_cpu_areas(void) #ifdef CONFIG_NUMA per_cpu(x86_cpu_to_node_map, cpu) = early_per_cpu_map(x86_cpu_to_node_map, cpu); + /* + * Ensure that the boot cpu numa_node is correct when the boot + * cpu is on a node that doesn't have memory installed. + * Also cpu_up() will call cpu_to_node() for APs when + * MEMORY_HOTPLUG is defined, before per_cpu(numa_node) is set + * up later with c_init aka intel_init/amd_init. + * So set them all (boot cpu and all APs). + */ + set_cpu_numa_node(cpu, early_cpu_to_node(cpu)); #endif #endif /* - * Up to this point, the boot CPU has been using .data.init + * Up to this point, the boot CPU has been using .init.data * area. Reload any changed state for the boot CPU. */ if (cpu == boot_cpu_id) @@ -263,14 +266,6 @@ void __init setup_per_cpu_areas(void) early_per_cpu_ptr(x86_cpu_to_node_map) = NULL; #endif -#if defined(CONFIG_X86_64) && defined(CONFIG_NUMA) - /* - * make sure boot cpu node_number is right, when boot cpu is on the - * node that doesn't have mem installed - */ - per_cpu(node_number, boot_cpu_id) = cpu_to_node(boot_cpu_id); -#endif - /* Setup node to cpumask map */ setup_node_to_cpumask_map(); diff --git a/arch/x86/kernel/sfi.c b/arch/x86/kernel/sfi.c index 34e099382651..cb22acf3ed09 100644 --- a/arch/x86/kernel/sfi.c +++ b/arch/x86/kernel/sfi.c @@ -81,7 +81,6 @@ static int __init sfi_parse_cpus(struct sfi_table_header *table) #endif /* CONFIG_X86_LOCAL_APIC */ #ifdef CONFIG_X86_IO_APIC -static u32 gsi_base; static int __init sfi_parse_ioapic(struct sfi_table_header *table) { @@ -94,8 +93,7 @@ static int __init sfi_parse_ioapic(struct sfi_table_header *table) pentry = (struct sfi_apic_table_entry *)sb->pentry; for (i = 0; i < num; i++) { - mp_register_ioapic(i, pentry->phys_addr, gsi_base); - gsi_base += io_apic_get_redir_entries(i); + mp_register_ioapic(i, pentry->phys_addr, gsi_top); pentry++; } diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index 763d815e27a0..a5e928b0cb5f 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -686,7 +686,7 @@ static void __cpuinit do_fork_idle(struct work_struct *work) static void __cpuinit announce_cpu(int cpu, int apicid) { static int current_node = -1; - int node = cpu_to_node(cpu); + int node = early_cpu_to_node(cpu); if (system_state == SYSTEM_BOOTING) { if (node != current_node) { @@ -735,12 +735,8 @@ static int __cpuinit do_boot_cpu(int apicid, int cpu) goto do_rest; } - if (!keventd_up() || current_is_keventd()) - c_idle.work.func(&c_idle.work); - else { - schedule_work(&c_idle.work); - wait_for_completion(&c_idle.done); - } + schedule_work(&c_idle.work); + wait_for_completion(&c_idle.done); if (IS_ERR(c_idle.idle)) { printk("failed fork for CPU %d\n", cpu); @@ -816,6 +812,13 @@ do_rest: if (cpumask_test_cpu(cpu, cpu_callin_mask)) break; /* It has booted */ udelay(100); + /* + * Allow other tasks to run while we wait for the + * AP to come online. This also gives a chance + * for the MTRR work(triggered by the AP coming online) + * to be completed in the stop machine context. + */ + schedule(); } if (cpumask_test_cpu(cpu, cpu_callin_mask)) @@ -1215,9 +1218,17 @@ __init void prefill_possible_map(void) if (!num_processors) num_processors = 1; - if (setup_possible_cpus == -1) - possible = num_processors + disabled_cpus; - else + i = setup_max_cpus ?: 1; + if (setup_possible_cpus == -1) { + possible = num_processors; +#ifdef CONFIG_HOTPLUG_CPU + if (setup_max_cpus) + possible += disabled_cpus; +#else + if (possible > i) + possible = i; +#endif + } else possible = setup_possible_cpus; total_cpus = max_t(int, possible, num_processors + disabled_cpus); @@ -1230,11 +1241,23 @@ __init void prefill_possible_map(void) possible = nr_cpu_ids; } +#ifdef CONFIG_HOTPLUG_CPU + if (!setup_max_cpus) +#endif + if (possible > i) { + printk(KERN_WARNING + "%d Processors exceeds max_cpus limit of %u\n", + possible, setup_max_cpus); + possible = i; + } + printk(KERN_INFO "SMP: Allowing %d CPUs, %d hotplug CPUs\n", possible, max_t(int, possible - num_processors, 0)); for (i = 0; i < possible; i++) set_cpu_possible(i, true); + for (; i < NR_CPUS; i++) + set_cpu_possible(i, false); nr_cpu_ids = possible; } diff --git a/arch/x86/kernel/stacktrace.c b/arch/x86/kernel/stacktrace.c index 922eefbb3f6c..b53c525368a7 100644 --- a/arch/x86/kernel/stacktrace.c +++ b/arch/x86/kernel/stacktrace.c @@ -23,11 +23,16 @@ static int save_stack_stack(void *data, char *name) return 0; } -static void save_stack_address(void *data, unsigned long addr, int reliable) +static void +__save_stack_address(void *data, unsigned long addr, bool reliable, bool nosched) { struct stack_trace *trace = data; +#ifdef CONFIG_FRAME_POINTER if (!reliable) return; +#endif + if (nosched && in_sched_functions(addr)) + return; if (trace->skip > 0) { trace->skip--; return; @@ -36,20 +41,15 @@ static void save_stack_address(void *data, unsigned long addr, int reliable) trace->entries[trace->nr_entries++] = addr; } +static void save_stack_address(void *data, unsigned long addr, int reliable) +{ + return __save_stack_address(data, addr, reliable, false); +} + static void save_stack_address_nosched(void *data, unsigned long addr, int reliable) { - struct stack_trace *trace = (struct stack_trace *)data; - if (!reliable) - return; - if (in_sched_functions(addr)) - return; - if (trace->skip > 0) { - trace->skip--; - return; - } - if (trace->nr_entries < trace->max_entries) - trace->entries[trace->nr_entries++] = addr; + return __save_stack_address(data, addr, reliable, true); } static const struct stacktrace_ops save_stack_ops = { @@ -96,12 +96,13 @@ EXPORT_SYMBOL_GPL(save_stack_trace_tsk); /* Userspace stacktrace - based on kernel/trace/trace_sysprof.c */ -struct stack_frame { +struct stack_frame_user { const void __user *next_fp; unsigned long ret_addr; }; -static int copy_stack_frame(const void __user *fp, struct stack_frame *frame) +static int +copy_stack_frame(const void __user *fp, struct stack_frame_user *frame) { int ret; @@ -126,7 +127,7 @@ static inline void __save_stack_trace_user(struct stack_trace *trace) trace->entries[trace->nr_entries++] = regs->ip; while (trace->nr_entries < trace->max_entries) { - struct stack_frame frame; + struct stack_frame_user frame; frame.next_fp = NULL; frame.ret_addr = 0; diff --git a/arch/x86/kernel/step.c b/arch/x86/kernel/step.c index 3149032ff107..58de45ee08b6 100644 --- a/arch/x86/kernel/step.c +++ b/arch/x86/kernel/step.c @@ -158,22 +158,6 @@ static int enable_single_step(struct task_struct *child) } /* - * Install this value in MSR_IA32_DEBUGCTLMSR whenever child is running. - */ -static void write_debugctlmsr(struct task_struct *child, unsigned long val) -{ - if (child->thread.debugctlmsr == val) - return; - - child->thread.debugctlmsr = val; - - if (child != current) - return; - - update_debugctlmsr(val); -} - -/* * Enable single or block step. */ static void enable_step(struct task_struct *child, bool block) @@ -186,15 +170,17 @@ static void enable_step(struct task_struct *child, bool block) * that uses user-mode single stepping itself. */ if (enable_single_step(child) && block) { - set_tsk_thread_flag(child, TIF_DEBUGCTLMSR); - write_debugctlmsr(child, - child->thread.debugctlmsr | DEBUGCTLMSR_BTF); - } else { - write_debugctlmsr(child, - child->thread.debugctlmsr & ~DEBUGCTLMSR_BTF); - - if (!child->thread.debugctlmsr) - clear_tsk_thread_flag(child, TIF_DEBUGCTLMSR); + unsigned long debugctl = get_debugctlmsr(); + + debugctl |= DEBUGCTLMSR_BTF; + update_debugctlmsr(debugctl); + set_tsk_thread_flag(child, TIF_BLOCKSTEP); + } else if (test_tsk_thread_flag(child, TIF_BLOCKSTEP)) { + unsigned long debugctl = get_debugctlmsr(); + + debugctl &= ~DEBUGCTLMSR_BTF; + update_debugctlmsr(debugctl); + clear_tsk_thread_flag(child, TIF_BLOCKSTEP); } } @@ -213,11 +199,13 @@ void user_disable_single_step(struct task_struct *child) /* * Make sure block stepping (BTF) is disabled. */ - write_debugctlmsr(child, - child->thread.debugctlmsr & ~DEBUGCTLMSR_BTF); + if (test_tsk_thread_flag(child, TIF_BLOCKSTEP)) { + unsigned long debugctl = get_debugctlmsr(); - if (!child->thread.debugctlmsr) - clear_tsk_thread_flag(child, TIF_DEBUGCTLMSR); + debugctl &= ~DEBUGCTLMSR_BTF; + update_debugctlmsr(debugctl); + clear_tsk_thread_flag(child, TIF_BLOCKSTEP); + } /* Always clear TIF_SINGLESTEP... */ clear_tsk_thread_flag(child, TIF_SINGLESTEP); diff --git a/arch/x86/kernel/syscall_table_32.S b/arch/x86/kernel/syscall_table_32.S index 8b3729341216..b35786dc9b8f 100644 --- a/arch/x86/kernel/syscall_table_32.S +++ b/arch/x86/kernel/syscall_table_32.S @@ -337,3 +337,6 @@ ENTRY(sys_call_table) .long sys_rt_tgsigqueueinfo /* 335 */ .long sys_perf_event_open .long sys_recvmmsg + .long sys_fanotify_init + .long sys_fanotify_mark + .long sys_prlimit64 /* 340 */ diff --git a/arch/x86/kernel/tboot.c b/arch/x86/kernel/tboot.c index 86c9f91b48ae..c2f1b26141e2 100644 --- a/arch/x86/kernel/tboot.c +++ b/arch/x86/kernel/tboot.c @@ -46,6 +46,7 @@ /* Global pointer to shared data; NULL means no measured launch. */ struct tboot *tboot __read_mostly; +EXPORT_SYMBOL(tboot); /* timeout for APs (in secs) to enter wait-for-SIPI state during shutdown */ #define AP_WAIT_TIMEOUT 1 @@ -175,6 +176,9 @@ static void add_mac_region(phys_addr_t start, unsigned long size) struct tboot_mac_region *mr; phys_addr_t end = start + size; + if (tboot->num_mac_regions >= MAX_TB_MAC_REGIONS) + panic("tboot: Too many MAC regions\n"); + if (start && size) { mr = &tboot->mac_regions[tboot->num_mac_regions++]; mr->start = round_down(start, PAGE_SIZE); @@ -184,18 +188,17 @@ static void add_mac_region(phys_addr_t start, unsigned long size) static int tboot_setup_sleep(void) { + int i; + tboot->num_mac_regions = 0; - /* S3 resume code */ - add_mac_region(acpi_wakeup_address, WAKEUP_SIZE); + for (i = 0; i < e820.nr_map; i++) { + if ((e820.map[i].type != E820_RAM) + && (e820.map[i].type != E820_RESERVED_KERN)) + continue; -#ifdef CONFIG_X86_TRAMPOLINE - /* AP trampoline code */ - add_mac_region(virt_to_phys(trampoline_base), TRAMPOLINE_SIZE); -#endif - - /* kernel code + data + bss */ - add_mac_region(virt_to_phys(_text), _end - _text); + add_mac_region(e820.map[i].addr, e820.map[i].size); + } tboot->acpi_sinfo.kernel_s3_resume_vector = acpi_wakeup_address; diff --git a/arch/x86/kernel/tlb_uv.c b/arch/x86/kernel/tlb_uv.c index 17b03dd3a6b5..7fea555929e2 100644 --- a/arch/x86/kernel/tlb_uv.c +++ b/arch/x86/kernel/tlb_uv.c @@ -1,7 +1,7 @@ /* * SGI UltraViolet TLB flush routines. * - * (c) 2008 Cliff Wickman <cpw@sgi.com>, SGI. + * (c) 2008-2010 Cliff Wickman <cpw@sgi.com>, SGI. * * This code is released under the GNU General Public License version 2 or * later. @@ -20,42 +20,67 @@ #include <asm/idle.h> #include <asm/tsc.h> #include <asm/irq_vectors.h> +#include <asm/timer.h> -static struct bau_control **uv_bau_table_bases __read_mostly; -static int uv_bau_retry_limit __read_mostly; +struct msg_desc { + struct bau_payload_queue_entry *msg; + int msg_slot; + int sw_ack_slot; + struct bau_payload_queue_entry *va_queue_first; + struct bau_payload_queue_entry *va_queue_last; +}; -/* base pnode in this partition */ -static int uv_partition_base_pnode __read_mostly; +#define UV_INTD_SOFT_ACK_TIMEOUT_PERIOD 0x000000000bUL + +static int uv_bau_max_concurrent __read_mostly; + +static int nobau; +static int __init setup_nobau(char *arg) +{ + nobau = 1; + return 0; +} +early_param("nobau", setup_nobau); -static unsigned long uv_mmask __read_mostly; +/* base pnode in this partition */ +static int uv_partition_base_pnode __read_mostly; +/* position of pnode (which is nasid>>1): */ +static int uv_nshift __read_mostly; +static unsigned long uv_mmask __read_mostly; static DEFINE_PER_CPU(struct ptc_stats, ptcstats); static DEFINE_PER_CPU(struct bau_control, bau_control); +static DEFINE_PER_CPU(cpumask_var_t, uv_flush_tlb_mask); + +struct reset_args { + int sender; +}; /* - * Determine the first node on a blade. + * Determine the first node on a uvhub. 'Nodes' are used for kernel + * memory allocation. */ -static int __init blade_to_first_node(int blade) +static int __init uvhub_to_first_node(int uvhub) { int node, b; for_each_online_node(node) { b = uv_node_to_blade_id(node); - if (blade == b) + if (uvhub == b) return node; } - return -1; /* shouldn't happen */ + return -1; } /* - * Determine the apicid of the first cpu on a blade. + * Determine the apicid of the first cpu on a uvhub. */ -static int __init blade_to_first_apicid(int blade) +static int __init uvhub_to_first_apicid(int uvhub) { int cpu; for_each_present_cpu(cpu) - if (blade == uv_cpu_to_blade_id(cpu)) + if (uvhub == uv_cpu_to_blade_id(cpu)) return per_cpu(x86_cpu_to_apicid, cpu); return -1; } @@ -68,195 +93,459 @@ static int __init blade_to_first_apicid(int blade) * clear of the Timeout bit (as well) will free the resource. No reply will * be sent (the hardware will only do one reply per message). */ -static void uv_reply_to_message(int resource, - struct bau_payload_queue_entry *msg, - struct bau_msg_status *msp) +static inline void uv_reply_to_message(struct msg_desc *mdp, + struct bau_control *bcp) { unsigned long dw; + struct bau_payload_queue_entry *msg; - dw = (1 << (resource + UV_SW_ACK_NPENDING)) | (1 << resource); + msg = mdp->msg; + if (!msg->canceled) { + dw = (msg->sw_ack_vector << UV_SW_ACK_NPENDING) | + msg->sw_ack_vector; + uv_write_local_mmr( + UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_ALIAS, dw); + } msg->replied_to = 1; msg->sw_ack_vector = 0; - if (msp) - msp->seen_by.bits = 0; - uv_write_local_mmr(UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_ALIAS, dw); } /* - * Do all the things a cpu should do for a TLB shootdown message. - * Other cpu's may come here at the same time for this message. + * Process the receipt of a RETRY message */ -static void uv_bau_process_message(struct bau_payload_queue_entry *msg, - int msg_slot, int sw_ack_slot) +static inline void uv_bau_process_retry_msg(struct msg_desc *mdp, + struct bau_control *bcp) { - unsigned long this_cpu_mask; - struct bau_msg_status *msp; - int cpu; + int i; + int cancel_count = 0; + int slot2; + unsigned long msg_res; + unsigned long mmr = 0; + struct bau_payload_queue_entry *msg; + struct bau_payload_queue_entry *msg2; + struct ptc_stats *stat; - msp = __get_cpu_var(bau_control).msg_statuses + msg_slot; - cpu = uv_blade_processor_id(); - msg->number_of_cpus = - uv_blade_nr_online_cpus(uv_node_to_blade_id(numa_node_id())); - this_cpu_mask = 1UL << cpu; - if (msp->seen_by.bits & this_cpu_mask) - return; - atomic_or_long(&msp->seen_by.bits, this_cpu_mask); + msg = mdp->msg; + stat = &per_cpu(ptcstats, bcp->cpu); + stat->d_retries++; + /* + * cancel any message from msg+1 to the retry itself + */ + for (msg2 = msg+1, i = 0; i < DEST_Q_SIZE; msg2++, i++) { + if (msg2 > mdp->va_queue_last) + msg2 = mdp->va_queue_first; + if (msg2 == msg) + break; + + /* same conditions for cancellation as uv_do_reset */ + if ((msg2->replied_to == 0) && (msg2->canceled == 0) && + (msg2->sw_ack_vector) && ((msg2->sw_ack_vector & + msg->sw_ack_vector) == 0) && + (msg2->sending_cpu == msg->sending_cpu) && + (msg2->msg_type != MSG_NOOP)) { + slot2 = msg2 - mdp->va_queue_first; + mmr = uv_read_local_mmr + (UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE); + msg_res = ((msg2->sw_ack_vector << 8) | + msg2->sw_ack_vector); + /* + * This is a message retry; clear the resources held + * by the previous message only if they timed out. + * If it has not timed out we have an unexpected + * situation to report. + */ + if (mmr & (msg_res << 8)) { + /* + * is the resource timed out? + * make everyone ignore the cancelled message. + */ + msg2->canceled = 1; + stat->d_canceled++; + cancel_count++; + uv_write_local_mmr( + UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_ALIAS, + (msg_res << 8) | msg_res); + } else + printk(KERN_INFO "note bau retry: no effect\n"); + } + } + if (!cancel_count) + stat->d_nocanceled++; +} - if (msg->replied_to == 1) - return; +/* + * Do all the things a cpu should do for a TLB shootdown message. + * Other cpu's may come here at the same time for this message. + */ +static void uv_bau_process_message(struct msg_desc *mdp, + struct bau_control *bcp) +{ + int msg_ack_count; + short socket_ack_count = 0; + struct ptc_stats *stat; + struct bau_payload_queue_entry *msg; + struct bau_control *smaster = bcp->socket_master; + /* + * This must be a normal message, or retry of a normal message + */ + msg = mdp->msg; + stat = &per_cpu(ptcstats, bcp->cpu); if (msg->address == TLB_FLUSH_ALL) { local_flush_tlb(); - __get_cpu_var(ptcstats).alltlb++; + stat->d_alltlb++; } else { __flush_tlb_one(msg->address); - __get_cpu_var(ptcstats).onetlb++; + stat->d_onetlb++; } + stat->d_requestee++; + + /* + * One cpu on each uvhub has the additional job on a RETRY + * of releasing the resource held by the message that is + * being retried. That message is identified by sending + * cpu number. + */ + if (msg->msg_type == MSG_RETRY && bcp == bcp->uvhub_master) + uv_bau_process_retry_msg(mdp, bcp); - __get_cpu_var(ptcstats).requestee++; + /* + * This is a sw_ack message, so we have to reply to it. + * Count each responding cpu on the socket. This avoids + * pinging the count's cache line back and forth between + * the sockets. + */ + socket_ack_count = atomic_add_short_return(1, (struct atomic_short *) + &smaster->socket_acknowledge_count[mdp->msg_slot]); + if (socket_ack_count == bcp->cpus_in_socket) { + /* + * Both sockets dump their completed count total into + * the message's count. + */ + smaster->socket_acknowledge_count[mdp->msg_slot] = 0; + msg_ack_count = atomic_add_short_return(socket_ack_count, + (struct atomic_short *)&msg->acknowledge_count); + + if (msg_ack_count == bcp->cpus_in_uvhub) { + /* + * All cpus in uvhub saw it; reply + */ + uv_reply_to_message(mdp, bcp); + } + } - atomic_inc_short(&msg->acknowledge_count); - if (msg->number_of_cpus == msg->acknowledge_count) - uv_reply_to_message(sw_ack_slot, msg, msp); + return; } /* - * Examine the payload queue on one distribution node to see - * which messages have not been seen, and which cpu(s) have not seen them. + * Determine the first cpu on a uvhub. + */ +static int uvhub_to_first_cpu(int uvhub) +{ + int cpu; + for_each_present_cpu(cpu) + if (uvhub == uv_cpu_to_blade_id(cpu)) + return cpu; + return -1; +} + +/* + * Last resort when we get a large number of destination timeouts is + * to clear resources held by a given cpu. + * Do this with IPI so that all messages in the BAU message queue + * can be identified by their nonzero sw_ack_vector field. * - * Returns the number of cpu's that have not responded. + * This is entered for a single cpu on the uvhub. + * The sender want's this uvhub to free a specific message's + * sw_ack resources. */ -static int uv_examine_destination(struct bau_control *bau_tablesp, int sender) +static void +uv_do_reset(void *ptr) { - struct bau_payload_queue_entry *msg; - struct bau_msg_status *msp; - int count = 0; int i; - int j; + int slot; + int count = 0; + unsigned long mmr; + unsigned long msg_res; + struct bau_control *bcp; + struct reset_args *rap; + struct bau_payload_queue_entry *msg; + struct ptc_stats *stat; - for (msg = bau_tablesp->va_queue_first, i = 0; i < DEST_Q_SIZE; - msg++, i++) { - if ((msg->sending_cpu == sender) && (!msg->replied_to)) { - msp = bau_tablesp->msg_statuses + i; - printk(KERN_DEBUG - "blade %d: address:%#lx %d of %d, not cpu(s): ", - i, msg->address, msg->acknowledge_count, - msg->number_of_cpus); - for (j = 0; j < msg->number_of_cpus; j++) { - if (!((1L << j) & msp->seen_by.bits)) { - count++; - printk("%d ", j); - } + bcp = &per_cpu(bau_control, smp_processor_id()); + rap = (struct reset_args *)ptr; + stat = &per_cpu(ptcstats, bcp->cpu); + stat->d_resets++; + + /* + * We're looking for the given sender, and + * will free its sw_ack resource. + * If all cpu's finally responded after the timeout, its + * message 'replied_to' was set. + */ + for (msg = bcp->va_queue_first, i = 0; i < DEST_Q_SIZE; msg++, i++) { + /* uv_do_reset: same conditions for cancellation as + uv_bau_process_retry_msg() */ + if ((msg->replied_to == 0) && + (msg->canceled == 0) && + (msg->sending_cpu == rap->sender) && + (msg->sw_ack_vector) && + (msg->msg_type != MSG_NOOP)) { + /* + * make everyone else ignore this message + */ + msg->canceled = 1; + slot = msg - bcp->va_queue_first; + count++; + /* + * only reset the resource if it is still pending + */ + mmr = uv_read_local_mmr + (UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE); + msg_res = ((msg->sw_ack_vector << 8) | + msg->sw_ack_vector); + if (mmr & msg_res) { + stat->d_rcanceled++; + uv_write_local_mmr( + UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_ALIAS, + msg_res); } - printk("\n"); } } - return count; + return; } /* - * Examine the payload queue on all the distribution nodes to see - * which messages have not been seen, and which cpu(s) have not seen them. - * - * Returns the number of cpu's that have not responded. + * Use IPI to get all target uvhubs to release resources held by + * a given sending cpu number. */ -static int uv_examine_destinations(struct bau_target_nodemask *distribution) +static void uv_reset_with_ipi(struct bau_target_uvhubmask *distribution, + int sender) { - int sender; - int i; - int count = 0; + int uvhub; + int cpu; + cpumask_t mask; + struct reset_args reset_args; + + reset_args.sender = sender; - sender = smp_processor_id(); - for (i = 0; i < sizeof(struct bau_target_nodemask) * BITSPERBYTE; i++) { - if (!bau_node_isset(i, distribution)) + cpus_clear(mask); + /* find a single cpu for each uvhub in this distribution mask */ + for (uvhub = 0; + uvhub < sizeof(struct bau_target_uvhubmask) * BITSPERBYTE; + uvhub++) { + if (!bau_uvhub_isset(uvhub, distribution)) continue; - count += uv_examine_destination(uv_bau_table_bases[i], sender); + /* find a cpu for this uvhub */ + cpu = uvhub_to_first_cpu(uvhub); + cpu_set(cpu, mask); } - return count; + /* IPI all cpus; Preemption is already disabled */ + smp_call_function_many(&mask, uv_do_reset, (void *)&reset_args, 1); + return; +} + +static inline unsigned long +cycles_2_us(unsigned long long cyc) +{ + unsigned long long ns; + unsigned long us; + ns = (cyc * per_cpu(cyc2ns, smp_processor_id())) + >> CYC2NS_SCALE_FACTOR; + us = ns / 1000; + return us; } /* - * wait for completion of a broadcast message - * - * return COMPLETE, RETRY or GIVEUP + * wait for all cpus on this hub to finish their sends and go quiet + * leaves uvhub_quiesce set so that no new broadcasts are started by + * bau_flush_send_and_wait() + */ +static inline void +quiesce_local_uvhub(struct bau_control *hmaster) +{ + atomic_add_short_return(1, (struct atomic_short *) + &hmaster->uvhub_quiesce); +} + +/* + * mark this quiet-requestor as done + */ +static inline void +end_uvhub_quiesce(struct bau_control *hmaster) +{ + atomic_add_short_return(-1, (struct atomic_short *) + &hmaster->uvhub_quiesce); +} + +/* + * Wait for completion of a broadcast software ack message + * return COMPLETE, RETRY(PLUGGED or TIMEOUT) or GIVEUP */ static int uv_wait_completion(struct bau_desc *bau_desc, - unsigned long mmr_offset, int right_shift) + unsigned long mmr_offset, int right_shift, int this_cpu, + struct bau_control *bcp, struct bau_control *smaster, long try) { - int exams = 0; - long destination_timeouts = 0; - long source_timeouts = 0; + int relaxes = 0; unsigned long descriptor_status; + unsigned long mmr; + unsigned long mask; + cycles_t ttime; + cycles_t timeout_time; + struct ptc_stats *stat = &per_cpu(ptcstats, this_cpu); + struct bau_control *hmaster; + + hmaster = bcp->uvhub_master; + timeout_time = get_cycles() + bcp->timeout_interval; + /* spin on the status MMR, waiting for it to go idle */ while ((descriptor_status = (((unsigned long) uv_read_local_mmr(mmr_offset) >> right_shift) & UV_ACT_STATUS_MASK)) != DESC_STATUS_IDLE) { - if (descriptor_status == DESC_STATUS_SOURCE_TIMEOUT) { - source_timeouts++; - if (source_timeouts > SOURCE_TIMEOUT_LIMIT) - source_timeouts = 0; - __get_cpu_var(ptcstats).s_retry++; - return FLUSH_RETRY; - } /* - * spin here looking for progress at the destinations + * Our software ack messages may be blocked because there are + * no swack resources available. As long as none of them + * has timed out hardware will NACK our message and its + * state will stay IDLE. */ - if (descriptor_status == DESC_STATUS_DESTINATION_TIMEOUT) { - destination_timeouts++; - if (destination_timeouts > DESTINATION_TIMEOUT_LIMIT) { - /* - * returns number of cpus not responding - */ - if (uv_examine_destinations - (&bau_desc->distribution) == 0) { - __get_cpu_var(ptcstats).d_retry++; - return FLUSH_RETRY; - } - exams++; - if (exams >= uv_bau_retry_limit) { - printk(KERN_DEBUG - "uv_flush_tlb_others"); - printk("giving up on cpu %d\n", - smp_processor_id()); + if (descriptor_status == DESC_STATUS_SOURCE_TIMEOUT) { + stat->s_stimeout++; + return FLUSH_GIVEUP; + } else if (descriptor_status == + DESC_STATUS_DESTINATION_TIMEOUT) { + stat->s_dtimeout++; + ttime = get_cycles(); + + /* + * Our retries may be blocked by all destination + * swack resources being consumed, and a timeout + * pending. In that case hardware returns the + * ERROR that looks like a destination timeout. + */ + if (cycles_2_us(ttime - bcp->send_message) < BIOS_TO) { + bcp->conseccompletes = 0; + return FLUSH_RETRY_PLUGGED; + } + + bcp->conseccompletes = 0; + return FLUSH_RETRY_TIMEOUT; + } else { + /* + * descriptor_status is still BUSY + */ + cpu_relax(); + relaxes++; + if (relaxes >= 10000) { + relaxes = 0; + if (get_cycles() > timeout_time) { + quiesce_local_uvhub(hmaster); + + /* single-thread the register change */ + spin_lock(&hmaster->masks_lock); + mmr = uv_read_local_mmr(mmr_offset); + mask = 0UL; + mask |= (3UL < right_shift); + mask = ~mask; + mmr &= mask; + uv_write_local_mmr(mmr_offset, mmr); + spin_unlock(&hmaster->masks_lock); + end_uvhub_quiesce(hmaster); + stat->s_busy++; return FLUSH_GIVEUP; } - /* - * delays can hang the simulator - udelay(1000); - */ - destination_timeouts = 0; } } - cpu_relax(); } + bcp->conseccompletes++; return FLUSH_COMPLETE; } +static inline cycles_t +sec_2_cycles(unsigned long sec) +{ + unsigned long ns; + cycles_t cyc; + + ns = sec * 1000000000; + cyc = (ns << CYC2NS_SCALE_FACTOR)/(per_cpu(cyc2ns, smp_processor_id())); + return cyc; +} + +/* + * conditionally add 1 to *v, unless *v is >= u + * return 0 if we cannot add 1 to *v because it is >= u + * return 1 if we can add 1 to *v because it is < u + * the add is atomic + * + * This is close to atomic_add_unless(), but this allows the 'u' value + * to be lowered below the current 'v'. atomic_add_unless can only stop + * on equal. + */ +static inline int atomic_inc_unless_ge(spinlock_t *lock, atomic_t *v, int u) +{ + spin_lock(lock); + if (atomic_read(v) >= u) { + spin_unlock(lock); + return 0; + } + atomic_inc(v); + spin_unlock(lock); + return 1; +} + /** * uv_flush_send_and_wait * - * Send a broadcast and wait for a broadcast message to complete. + * Send a broadcast and wait for it to complete. * - * The flush_mask contains the cpus the broadcast was sent to. + * The flush_mask contains the cpus the broadcast is to be sent to, plus + * cpus that are on the local uvhub. * - * Returns NULL if all remote flushing was done. The mask is zeroed. + * Returns NULL if all flushing represented in the mask was done. The mask + * is zeroed. * Returns @flush_mask if some remote flushing remains to be done. The - * mask will have some bits still set. + * mask will have some bits still set, representing any cpus on the local + * uvhub (not current cpu) and any on remote uvhubs if the broadcast failed. */ -const struct cpumask *uv_flush_send_and_wait(int cpu, int this_pnode, - struct bau_desc *bau_desc, - struct cpumask *flush_mask) +const struct cpumask *uv_flush_send_and_wait(struct bau_desc *bau_desc, + struct cpumask *flush_mask, + struct bau_control *bcp) { - int completion_status = 0; int right_shift; - int tries = 0; - int pnode; + int uvhub; int bit; + int completion_status = 0; + int seq_number = 0; + long try = 0; + int cpu = bcp->uvhub_cpu; + int this_cpu = bcp->cpu; + int this_uvhub = bcp->uvhub; unsigned long mmr_offset; unsigned long index; cycles_t time1; cycles_t time2; + struct ptc_stats *stat = &per_cpu(ptcstats, bcp->cpu); + struct bau_control *smaster = bcp->socket_master; + struct bau_control *hmaster = bcp->uvhub_master; + + /* + * Spin here while there are hmaster->max_concurrent or more active + * descriptors. This is the per-uvhub 'throttle'. + */ + if (!atomic_inc_unless_ge(&hmaster->uvhub_lock, + &hmaster->active_descriptor_count, + hmaster->max_concurrent)) { + stat->s_throttles++; + do { + cpu_relax(); + } while (!atomic_inc_unless_ge(&hmaster->uvhub_lock, + &hmaster->active_descriptor_count, + hmaster->max_concurrent)); + } + + while (hmaster->uvhub_quiesce) + cpu_relax(); if (cpu < UV_CPUS_PER_ACT_STATUS) { mmr_offset = UVH_LB_BAU_SB_ACTIVATION_STATUS_0; @@ -268,24 +557,108 @@ const struct cpumask *uv_flush_send_and_wait(int cpu, int this_pnode, } time1 = get_cycles(); do { - tries++; + /* + * Every message from any given cpu gets a unique message + * sequence number. But retries use that same number. + * Our message may have timed out at the destination because + * all sw-ack resources are in use and there is a timeout + * pending there. In that case, our last send never got + * placed into the queue and we need to persist until it + * does. + * + * Make any retry a type MSG_RETRY so that the destination will + * free any resource held by a previous message from this cpu. + */ + if (try == 0) { + /* use message type set by the caller the first time */ + seq_number = bcp->message_number++; + } else { + /* use RETRY type on all the rest; same sequence */ + bau_desc->header.msg_type = MSG_RETRY; + stat->s_retry_messages++; + } + bau_desc->header.sequence = seq_number; index = (1UL << UVH_LB_BAU_SB_ACTIVATION_CONTROL_PUSH_SHFT) | - cpu; + bcp->uvhub_cpu; + bcp->send_message = get_cycles(); + uv_write_local_mmr(UVH_LB_BAU_SB_ACTIVATION_CONTROL, index); + + try++; completion_status = uv_wait_completion(bau_desc, mmr_offset, - right_shift); - } while (completion_status == FLUSH_RETRY); + right_shift, this_cpu, bcp, smaster, try); + + if (completion_status == FLUSH_RETRY_PLUGGED) { + /* + * Our retries may be blocked by all destination swack + * resources being consumed, and a timeout pending. In + * that case hardware immediately returns the ERROR + * that looks like a destination timeout. + */ + udelay(TIMEOUT_DELAY); + bcp->plugged_tries++; + if (bcp->plugged_tries >= PLUGSB4RESET) { + bcp->plugged_tries = 0; + quiesce_local_uvhub(hmaster); + spin_lock(&hmaster->queue_lock); + uv_reset_with_ipi(&bau_desc->distribution, + this_cpu); + spin_unlock(&hmaster->queue_lock); + end_uvhub_quiesce(hmaster); + bcp->ipi_attempts++; + stat->s_resets_plug++; + } + } else if (completion_status == FLUSH_RETRY_TIMEOUT) { + hmaster->max_concurrent = 1; + bcp->timeout_tries++; + udelay(TIMEOUT_DELAY); + if (bcp->timeout_tries >= TIMEOUTSB4RESET) { + bcp->timeout_tries = 0; + quiesce_local_uvhub(hmaster); + spin_lock(&hmaster->queue_lock); + uv_reset_with_ipi(&bau_desc->distribution, + this_cpu); + spin_unlock(&hmaster->queue_lock); + end_uvhub_quiesce(hmaster); + bcp->ipi_attempts++; + stat->s_resets_timeout++; + } + } + if (bcp->ipi_attempts >= 3) { + bcp->ipi_attempts = 0; + completion_status = FLUSH_GIVEUP; + break; + } + cpu_relax(); + } while ((completion_status == FLUSH_RETRY_PLUGGED) || + (completion_status == FLUSH_RETRY_TIMEOUT)); time2 = get_cycles(); - __get_cpu_var(ptcstats).sflush += (time2 - time1); - if (tries > 1) - __get_cpu_var(ptcstats).retriesok++; - if (completion_status == FLUSH_GIVEUP) { + if ((completion_status == FLUSH_COMPLETE) && (bcp->conseccompletes > 5) + && (hmaster->max_concurrent < hmaster->max_concurrent_constant)) + hmaster->max_concurrent++; + + /* + * hold any cpu not timing out here; no other cpu currently held by + * the 'throttle' should enter the activation code + */ + while (hmaster->uvhub_quiesce) + cpu_relax(); + atomic_dec(&hmaster->active_descriptor_count); + + /* guard against cycles wrap */ + if (time2 > time1) + stat->s_time += (time2 - time1); + else + stat->s_requestor--; /* don't count this one */ + if (completion_status == FLUSH_COMPLETE && try > 1) + stat->s_retriesok++; + else if (completion_status == FLUSH_GIVEUP) { /* * Cause the caller to do an IPI-style TLB shootdown on - * the cpu's, all of which are still in the mask. + * the target cpu's, all of which are still in the mask. */ - __get_cpu_var(ptcstats).ptc_i++; + stat->s_giveup++; return flush_mask; } @@ -294,18 +667,17 @@ const struct cpumask *uv_flush_send_and_wait(int cpu, int this_pnode, * use the IPI method of shootdown on them. */ for_each_cpu(bit, flush_mask) { - pnode = uv_cpu_to_pnode(bit); - if (pnode == this_pnode) + uvhub = uv_cpu_to_blade_id(bit); + if (uvhub == this_uvhub) continue; cpumask_clear_cpu(bit, flush_mask); } if (!cpumask_empty(flush_mask)) return flush_mask; + return NULL; } -static DEFINE_PER_CPU(cpumask_var_t, uv_flush_tlb_mask); - /** * uv_flush_tlb_others - globally purge translation cache of a virtual * address or all TLB's @@ -322,8 +694,8 @@ static DEFINE_PER_CPU(cpumask_var_t, uv_flush_tlb_mask); * The caller has derived the cpumask from the mm_struct. This function * is called only if there are bits set in the mask. (e.g. flush_tlb_page()) * - * The cpumask is converted into a nodemask of the nodes containing - * the cpus. + * The cpumask is converted into a uvhubmask of the uvhubs containing + * those cpus. * * Note that this function should be called with preemption disabled. * @@ -335,52 +707,82 @@ const struct cpumask *uv_flush_tlb_others(const struct cpumask *cpumask, struct mm_struct *mm, unsigned long va, unsigned int cpu) { - struct cpumask *flush_mask = __get_cpu_var(uv_flush_tlb_mask); - int i; - int bit; - int pnode; - int uv_cpu; - int this_pnode; + int remotes; + int tcpu; + int uvhub; int locals = 0; struct bau_desc *bau_desc; + struct cpumask *flush_mask; + struct ptc_stats *stat; + struct bau_control *bcp; - cpumask_andnot(flush_mask, cpumask, cpumask_of(cpu)); + if (nobau) + return cpumask; - uv_cpu = uv_blade_processor_id(); - this_pnode = uv_hub_info->pnode; - bau_desc = __get_cpu_var(bau_control).descriptor_base; - bau_desc += UV_ITEMS_PER_DESCRIPTOR * uv_cpu; + bcp = &per_cpu(bau_control, cpu); + /* + * Each sending cpu has a per-cpu mask which it fills from the caller's + * cpu mask. Only remote cpus are converted to uvhubs and copied. + */ + flush_mask = (struct cpumask *)per_cpu(uv_flush_tlb_mask, cpu); + /* + * copy cpumask to flush_mask, removing current cpu + * (current cpu should already have been flushed by the caller and + * should never be returned if we return flush_mask) + */ + cpumask_andnot(flush_mask, cpumask, cpumask_of(cpu)); + if (cpu_isset(cpu, *cpumask)) + locals++; /* current cpu was targeted */ - bau_nodes_clear(&bau_desc->distribution, UV_DISTRIBUTION_SIZE); + bau_desc = bcp->descriptor_base; + bau_desc += UV_ITEMS_PER_DESCRIPTOR * bcp->uvhub_cpu; - i = 0; - for_each_cpu(bit, flush_mask) { - pnode = uv_cpu_to_pnode(bit); - BUG_ON(pnode > (UV_DISTRIBUTION_SIZE - 1)); - if (pnode == this_pnode) { + bau_uvhubs_clear(&bau_desc->distribution, UV_DISTRIBUTION_SIZE); + remotes = 0; + for_each_cpu(tcpu, flush_mask) { + uvhub = uv_cpu_to_blade_id(tcpu); + if (uvhub == bcp->uvhub) { locals++; continue; } - bau_node_set(pnode - uv_partition_base_pnode, - &bau_desc->distribution); - i++; + bau_uvhub_set(uvhub, &bau_desc->distribution); + remotes++; } - if (i == 0) { + if (remotes == 0) { /* - * no off_node flushing; return status for local node + * No off_hub flushing; return status for local hub. + * Return the caller's mask if all were local (the current + * cpu may be in that mask). */ if (locals) - return flush_mask; + return cpumask; else return NULL; } - __get_cpu_var(ptcstats).requestor++; - __get_cpu_var(ptcstats).ntargeted += i; + stat = &per_cpu(ptcstats, cpu); + stat->s_requestor++; + stat->s_ntargcpu += remotes; + remotes = bau_uvhub_weight(&bau_desc->distribution); + stat->s_ntarguvhub += remotes; + if (remotes >= 16) + stat->s_ntarguvhub16++; + else if (remotes >= 8) + stat->s_ntarguvhub8++; + else if (remotes >= 4) + stat->s_ntarguvhub4++; + else if (remotes >= 2) + stat->s_ntarguvhub2++; + else + stat->s_ntarguvhub1++; bau_desc->payload.address = va; bau_desc->payload.sending_cpu = cpu; - return uv_flush_send_and_wait(uv_cpu, this_pnode, bau_desc, flush_mask); + /* + * uv_flush_send_and_wait returns null if all cpu's were messaged, or + * the adjusted flush_mask if any cpu's were not messaged. + */ + return uv_flush_send_and_wait(bau_desc, flush_mask, bcp); } /* @@ -389,87 +791,70 @@ const struct cpumask *uv_flush_tlb_others(const struct cpumask *cpumask, * * We received a broadcast assist message. * - * Interrupts may have been disabled; this interrupt could represent + * Interrupts are disabled; this interrupt could represent * the receipt of several messages. * - * All cores/threads on this node get this interrupt. - * The last one to see it does the s/w ack. + * All cores/threads on this hub get this interrupt. + * The last one to see it does the software ack. * (the resource will not be freed until noninterruptable cpus see this - * interrupt; hardware will timeout the s/w ack and reply ERROR) + * interrupt; hardware may timeout the s/w ack and reply ERROR) */ void uv_bau_message_interrupt(struct pt_regs *regs) { - struct bau_payload_queue_entry *va_queue_first; - struct bau_payload_queue_entry *va_queue_last; - struct bau_payload_queue_entry *msg; - struct pt_regs *old_regs = set_irq_regs(regs); - cycles_t time1; - cycles_t time2; - int msg_slot; - int sw_ack_slot; - int fw; int count = 0; - unsigned long local_pnode; - - ack_APIC_irq(); - exit_idle(); - irq_enter(); - - time1 = get_cycles(); - - local_pnode = uv_blade_to_pnode(uv_numa_blade_id()); - - va_queue_first = __get_cpu_var(bau_control).va_queue_first; - va_queue_last = __get_cpu_var(bau_control).va_queue_last; - - msg = __get_cpu_var(bau_control).bau_msg_head; + cycles_t time_start; + struct bau_payload_queue_entry *msg; + struct bau_control *bcp; + struct ptc_stats *stat; + struct msg_desc msgdesc; + + time_start = get_cycles(); + bcp = &per_cpu(bau_control, smp_processor_id()); + stat = &per_cpu(ptcstats, smp_processor_id()); + msgdesc.va_queue_first = bcp->va_queue_first; + msgdesc.va_queue_last = bcp->va_queue_last; + msg = bcp->bau_msg_head; while (msg->sw_ack_vector) { count++; - fw = msg->sw_ack_vector; - msg_slot = msg - va_queue_first; - sw_ack_slot = ffs(fw) - 1; - - uv_bau_process_message(msg, msg_slot, sw_ack_slot); - + msgdesc.msg_slot = msg - msgdesc.va_queue_first; + msgdesc.sw_ack_slot = ffs(msg->sw_ack_vector) - 1; + msgdesc.msg = msg; + uv_bau_process_message(&msgdesc, bcp); msg++; - if (msg > va_queue_last) - msg = va_queue_first; - __get_cpu_var(bau_control).bau_msg_head = msg; + if (msg > msgdesc.va_queue_last) + msg = msgdesc.va_queue_first; + bcp->bau_msg_head = msg; } + stat->d_time += (get_cycles() - time_start); if (!count) - __get_cpu_var(ptcstats).nomsg++; + stat->d_nomsg++; else if (count > 1) - __get_cpu_var(ptcstats).multmsg++; - - time2 = get_cycles(); - __get_cpu_var(ptcstats).dflush += (time2 - time1); - - irq_exit(); - set_irq_regs(old_regs); + stat->d_multmsg++; + ack_APIC_irq(); } /* * uv_enable_timeouts * - * Each target blade (i.e. blades that have cpu's) needs to have + * Each target uvhub (i.e. a uvhub that has no cpu's) needs to have * shootdown message timeouts enabled. The timeout does not cause * an interrupt, but causes an error message to be returned to * the sender. */ static void uv_enable_timeouts(void) { - int blade; - int nblades; + int uvhub; + int nuvhubs; int pnode; unsigned long mmr_image; - nblades = uv_num_possible_blades(); + nuvhubs = uv_num_possible_blades(); - for (blade = 0; blade < nblades; blade++) { - if (!uv_blade_nr_possible_cpus(blade)) + for (uvhub = 0; uvhub < nuvhubs; uvhub++) { + if (!uv_blade_nr_possible_cpus(uvhub)) continue; - pnode = uv_blade_to_pnode(blade); + pnode = uv_blade_to_pnode(uvhub); mmr_image = uv_read_global_mmr64(pnode, UVH_LB_BAU_MISC_CONTROL); /* @@ -479,16 +864,16 @@ static void uv_enable_timeouts(void) * To program the period, the SOFT_ACK_MODE must be off. */ mmr_image &= ~((unsigned long)1 << - UV_ENABLE_INTD_SOFT_ACK_MODE_SHIFT); + UVH_LB_BAU_MISC_CONTROL_ENABLE_INTD_SOFT_ACK_MODE_SHFT); uv_write_global_mmr64 (pnode, UVH_LB_BAU_MISC_CONTROL, mmr_image); /* * Set the 4-bit period. */ mmr_image &= ~((unsigned long)0xf << - UV_INTD_SOFT_ACK_TIMEOUT_PERIOD_SHIFT); + UVH_LB_BAU_MISC_CONTROL_INTD_SOFT_ACK_TIMEOUT_PERIOD_SHFT); mmr_image |= (UV_INTD_SOFT_ACK_TIMEOUT_PERIOD << - UV_INTD_SOFT_ACK_TIMEOUT_PERIOD_SHIFT); + UVH_LB_BAU_MISC_CONTROL_INTD_SOFT_ACK_TIMEOUT_PERIOD_SHFT); uv_write_global_mmr64 (pnode, UVH_LB_BAU_MISC_CONTROL, mmr_image); /* @@ -497,7 +882,7 @@ static void uv_enable_timeouts(void) * indicated in bits 2:0 (7 causes all of them to timeout). */ mmr_image |= ((unsigned long)1 << - UV_ENABLE_INTD_SOFT_ACK_MODE_SHIFT); + UVH_LB_BAU_MISC_CONTROL_ENABLE_INTD_SOFT_ACK_MODE_SHFT); uv_write_global_mmr64 (pnode, UVH_LB_BAU_MISC_CONTROL, mmr_image); } @@ -522,9 +907,20 @@ static void uv_ptc_seq_stop(struct seq_file *file, void *data) { } +static inline unsigned long long +millisec_2_cycles(unsigned long millisec) +{ + unsigned long ns; + unsigned long long cyc; + + ns = millisec * 1000; + cyc = (ns << CYC2NS_SCALE_FACTOR)/(per_cpu(cyc2ns, smp_processor_id())); + return cyc; +} + /* - * Display the statistics thru /proc - * data points to the cpu number + * Display the statistics thru /proc. + * 'data' points to the cpu number */ static int uv_ptc_seq_show(struct seq_file *file, void *data) { @@ -535,78 +931,155 @@ static int uv_ptc_seq_show(struct seq_file *file, void *data) if (!cpu) { seq_printf(file, - "# cpu requestor requestee one all sretry dretry ptc_i "); + "# cpu sent stime numuvhubs numuvhubs16 numuvhubs8 "); seq_printf(file, - "sw_ack sflush dflush sok dnomsg dmult starget\n"); + "numuvhubs4 numuvhubs2 numuvhubs1 numcpus dto "); + seq_printf(file, + "retries rok resetp resett giveup sto bz throt "); + seq_printf(file, + "sw_ack recv rtime all "); + seq_printf(file, + "one mult none retry canc nocan reset rcan\n"); } if (cpu < num_possible_cpus() && cpu_online(cpu)) { stat = &per_cpu(ptcstats, cpu); - seq_printf(file, "cpu %d %ld %ld %ld %ld %ld %ld %ld ", - cpu, stat->requestor, - stat->requestee, stat->onetlb, stat->alltlb, - stat->s_retry, stat->d_retry, stat->ptc_i); - seq_printf(file, "%lx %ld %ld %ld %ld %ld %ld\n", + /* source side statistics */ + seq_printf(file, + "cpu %d %ld %ld %ld %ld %ld %ld %ld %ld %ld %ld ", + cpu, stat->s_requestor, cycles_2_us(stat->s_time), + stat->s_ntarguvhub, stat->s_ntarguvhub16, + stat->s_ntarguvhub8, stat->s_ntarguvhub4, + stat->s_ntarguvhub2, stat->s_ntarguvhub1, + stat->s_ntargcpu, stat->s_dtimeout); + seq_printf(file, "%ld %ld %ld %ld %ld %ld %ld %ld ", + stat->s_retry_messages, stat->s_retriesok, + stat->s_resets_plug, stat->s_resets_timeout, + stat->s_giveup, stat->s_stimeout, + stat->s_busy, stat->s_throttles); + /* destination side statistics */ + seq_printf(file, + "%lx %ld %ld %ld %ld %ld %ld %ld %ld %ld %ld %ld\n", uv_read_global_mmr64(uv_cpu_to_pnode(cpu), UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE), - stat->sflush, stat->dflush, - stat->retriesok, stat->nomsg, - stat->multmsg, stat->ntargeted); + stat->d_requestee, cycles_2_us(stat->d_time), + stat->d_alltlb, stat->d_onetlb, stat->d_multmsg, + stat->d_nomsg, stat->d_retries, stat->d_canceled, + stat->d_nocanceled, stat->d_resets, + stat->d_rcanceled); } return 0; } /* + * -1: resetf the statistics * 0: display meaning of the statistics - * >0: retry limit + * >0: maximum concurrent active descriptors per uvhub (throttle) */ static ssize_t uv_ptc_proc_write(struct file *file, const char __user *user, size_t count, loff_t *data) { - long newmode; + int cpu; + long input_arg; char optstr[64]; + struct ptc_stats *stat; + struct bau_control *bcp; if (count == 0 || count > sizeof(optstr)) return -EINVAL; if (copy_from_user(optstr, user, count)) return -EFAULT; optstr[count - 1] = '\0'; - if (strict_strtoul(optstr, 10, &newmode) < 0) { + if (strict_strtol(optstr, 10, &input_arg) < 0) { printk(KERN_DEBUG "%s is invalid\n", optstr); return -EINVAL; } - if (newmode == 0) { + if (input_arg == 0) { printk(KERN_DEBUG "# cpu: cpu number\n"); + printk(KERN_DEBUG "Sender statistics:\n"); + printk(KERN_DEBUG + "sent: number of shootdown messages sent\n"); + printk(KERN_DEBUG + "stime: time spent sending messages\n"); + printk(KERN_DEBUG + "numuvhubs: number of hubs targeted with shootdown\n"); + printk(KERN_DEBUG + "numuvhubs16: number times 16 or more hubs targeted\n"); + printk(KERN_DEBUG + "numuvhubs8: number times 8 or more hubs targeted\n"); + printk(KERN_DEBUG + "numuvhubs4: number times 4 or more hubs targeted\n"); + printk(KERN_DEBUG + "numuvhubs2: number times 2 or more hubs targeted\n"); + printk(KERN_DEBUG + "numuvhubs1: number times 1 hub targeted\n"); + printk(KERN_DEBUG + "numcpus: number of cpus targeted with shootdown\n"); + printk(KERN_DEBUG + "dto: number of destination timeouts\n"); + printk(KERN_DEBUG + "retries: destination timeout retries sent\n"); + printk(KERN_DEBUG + "rok: : destination timeouts successfully retried\n"); + printk(KERN_DEBUG + "resetp: ipi-style resource resets for plugs\n"); + printk(KERN_DEBUG + "resett: ipi-style resource resets for timeouts\n"); + printk(KERN_DEBUG + "giveup: fall-backs to ipi-style shootdowns\n"); + printk(KERN_DEBUG + "sto: number of source timeouts\n"); + printk(KERN_DEBUG + "bz: number of stay-busy's\n"); + printk(KERN_DEBUG + "throt: number times spun in throttle\n"); + printk(KERN_DEBUG "Destination side statistics:\n"); printk(KERN_DEBUG - "requestor: times this cpu was the flush requestor\n"); + "sw_ack: image of UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE\n"); printk(KERN_DEBUG - "requestee: times this cpu was requested to flush its TLBs\n"); + "recv: shootdown messages received\n"); printk(KERN_DEBUG - "one: times requested to flush a single address\n"); + "rtime: time spent processing messages\n"); printk(KERN_DEBUG - "all: times requested to flush all TLB's\n"); + "all: shootdown all-tlb messages\n"); printk(KERN_DEBUG - "sretry: number of retries of source-side timeouts\n"); + "one: shootdown one-tlb messages\n"); printk(KERN_DEBUG - "dretry: number of retries of destination-side timeouts\n"); + "mult: interrupts that found multiple messages\n"); printk(KERN_DEBUG - "ptc_i: times UV fell through to IPI-style flushes\n"); + "none: interrupts that found no messages\n"); printk(KERN_DEBUG - "sw_ack: image of UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE\n"); + "retry: number of retry messages processed\n"); printk(KERN_DEBUG - "sflush_us: cycles spent in uv_flush_tlb_others()\n"); + "canc: number messages canceled by retries\n"); printk(KERN_DEBUG - "dflush_us: cycles spent in handling flush requests\n"); - printk(KERN_DEBUG "sok: successes on retry\n"); - printk(KERN_DEBUG "dnomsg: interrupts with no message\n"); + "nocan: number retries that found nothing to cancel\n"); printk(KERN_DEBUG - "dmult: interrupts with multiple messages\n"); - printk(KERN_DEBUG "starget: nodes targeted\n"); + "reset: number of ipi-style reset requests processed\n"); + printk(KERN_DEBUG + "rcan: number messages canceled by reset requests\n"); + } else if (input_arg == -1) { + for_each_present_cpu(cpu) { + stat = &per_cpu(ptcstats, cpu); + memset(stat, 0, sizeof(struct ptc_stats)); + } } else { - uv_bau_retry_limit = newmode; - printk(KERN_DEBUG "timeout retry limit:%d\n", - uv_bau_retry_limit); + uv_bau_max_concurrent = input_arg; + bcp = &per_cpu(bau_control, smp_processor_id()); + if (uv_bau_max_concurrent < 1 || + uv_bau_max_concurrent > bcp->cpus_in_uvhub) { + printk(KERN_DEBUG + "Error: BAU max concurrent %d; %d is invalid\n", + bcp->max_concurrent, uv_bau_max_concurrent); + return -EINVAL; + } + printk(KERN_DEBUG "Set BAU max concurrent:%d\n", + uv_bau_max_concurrent); + for_each_present_cpu(cpu) { + bcp = &per_cpu(bau_control, cpu); + bcp->max_concurrent = uv_bau_max_concurrent; + } } return count; @@ -650,79 +1123,30 @@ static int __init uv_ptc_init(void) } /* - * begin the initialization of the per-blade control structures - */ -static struct bau_control * __init uv_table_bases_init(int blade, int node) -{ - int i; - struct bau_msg_status *msp; - struct bau_control *bau_tabp; - - bau_tabp = - kmalloc_node(sizeof(struct bau_control), GFP_KERNEL, node); - BUG_ON(!bau_tabp); - - bau_tabp->msg_statuses = - kmalloc_node(sizeof(struct bau_msg_status) * - DEST_Q_SIZE, GFP_KERNEL, node); - BUG_ON(!bau_tabp->msg_statuses); - - for (i = 0, msp = bau_tabp->msg_statuses; i < DEST_Q_SIZE; i++, msp++) - bau_cpubits_clear(&msp->seen_by, (int) - uv_blade_nr_possible_cpus(blade)); - - uv_bau_table_bases[blade] = bau_tabp; - - return bau_tabp; -} - -/* - * finish the initialization of the per-blade control structures - */ -static void __init -uv_table_bases_finish(int blade, - struct bau_control *bau_tablesp, - struct bau_desc *adp) -{ - struct bau_control *bcp; - int cpu; - - for_each_present_cpu(cpu) { - if (blade != uv_cpu_to_blade_id(cpu)) - continue; - - bcp = (struct bau_control *)&per_cpu(bau_control, cpu); - bcp->bau_msg_head = bau_tablesp->va_queue_first; - bcp->va_queue_first = bau_tablesp->va_queue_first; - bcp->va_queue_last = bau_tablesp->va_queue_last; - bcp->msg_statuses = bau_tablesp->msg_statuses; - bcp->descriptor_base = adp; - } -} - -/* * initialize the sending side's sending buffers */ -static struct bau_desc * __init +static void uv_activation_descriptor_init(int node, int pnode) { int i; + int cpu; unsigned long pa; unsigned long m; unsigned long n; - struct bau_desc *adp; - struct bau_desc *ad2; + struct bau_desc *bau_desc; + struct bau_desc *bd2; + struct bau_control *bcp; /* * each bau_desc is 64 bytes; there are 8 (UV_ITEMS_PER_DESCRIPTOR) - * per cpu; and up to 32 (UV_ADP_SIZE) cpu's per blade + * per cpu; and up to 32 (UV_ADP_SIZE) cpu's per uvhub */ - adp = (struct bau_desc *)kmalloc_node(sizeof(struct bau_desc)* + bau_desc = (struct bau_desc *)kmalloc_node(sizeof(struct bau_desc)* UV_ADP_SIZE*UV_ITEMS_PER_DESCRIPTOR, GFP_KERNEL, node); - BUG_ON(!adp); + BUG_ON(!bau_desc); - pa = uv_gpa(adp); /* need the real nasid*/ - n = uv_gpa_to_pnode(pa); + pa = uv_gpa(bau_desc); /* need the real nasid*/ + n = pa >> uv_nshift; m = pa & uv_mmask; uv_write_global_mmr64(pnode, UVH_LB_BAU_SB_DESCRIPTOR_BASE, @@ -731,96 +1155,188 @@ uv_activation_descriptor_init(int node, int pnode) /* * initializing all 8 (UV_ITEMS_PER_DESCRIPTOR) descriptors for each * cpu even though we only use the first one; one descriptor can - * describe a broadcast to 256 nodes. + * describe a broadcast to 256 uv hubs. */ - for (i = 0, ad2 = adp; i < (UV_ADP_SIZE*UV_ITEMS_PER_DESCRIPTOR); - i++, ad2++) { - memset(ad2, 0, sizeof(struct bau_desc)); - ad2->header.sw_ack_flag = 1; + for (i = 0, bd2 = bau_desc; i < (UV_ADP_SIZE*UV_ITEMS_PER_DESCRIPTOR); + i++, bd2++) { + memset(bd2, 0, sizeof(struct bau_desc)); + bd2->header.sw_ack_flag = 1; /* - * base_dest_nodeid is the first node in the partition, so - * the bit map will indicate partition-relative node numbers. - * note that base_dest_nodeid is actually a nasid. + * base_dest_nodeid is the nasid (pnode<<1) of the first uvhub + * in the partition. The bit map will indicate uvhub numbers, + * which are 0-N in a partition. Pnodes are unique system-wide. */ - ad2->header.base_dest_nodeid = uv_partition_base_pnode << 1; - ad2->header.dest_subnodeid = 0x10; /* the LB */ - ad2->header.command = UV_NET_ENDPOINT_INTD; - ad2->header.int_both = 1; + bd2->header.base_dest_nodeid = uv_partition_base_pnode << 1; + bd2->header.dest_subnodeid = 0x10; /* the LB */ + bd2->header.command = UV_NET_ENDPOINT_INTD; + bd2->header.int_both = 1; /* * all others need to be set to zero: * fairness chaining multilevel count replied_to */ } - return adp; + for_each_present_cpu(cpu) { + if (pnode != uv_blade_to_pnode(uv_cpu_to_blade_id(cpu))) + continue; + bcp = &per_cpu(bau_control, cpu); + bcp->descriptor_base = bau_desc; + } } /* * initialize the destination side's receiving buffers + * entered for each uvhub in the partition + * - node is first node (kernel memory notion) on the uvhub + * - pnode is the uvhub's physical identifier */ -static struct bau_payload_queue_entry * __init -uv_payload_queue_init(int node, int pnode, struct bau_control *bau_tablesp) +static void +uv_payload_queue_init(int node, int pnode) { - struct bau_payload_queue_entry *pqp; - unsigned long pa; int pn; + int cpu; char *cp; + unsigned long pa; + struct bau_payload_queue_entry *pqp; + struct bau_payload_queue_entry *pqp_malloc; + struct bau_control *bcp; pqp = (struct bau_payload_queue_entry *) kmalloc_node( (DEST_Q_SIZE + 1) * sizeof(struct bau_payload_queue_entry), GFP_KERNEL, node); BUG_ON(!pqp); + pqp_malloc = pqp; cp = (char *)pqp + 31; pqp = (struct bau_payload_queue_entry *)(((unsigned long)cp >> 5) << 5); - bau_tablesp->va_queue_first = pqp; + + for_each_present_cpu(cpu) { + if (pnode != uv_cpu_to_pnode(cpu)) + continue; + /* for every cpu on this pnode: */ + bcp = &per_cpu(bau_control, cpu); + bcp->va_queue_first = pqp; + bcp->bau_msg_head = pqp; + bcp->va_queue_last = pqp + (DEST_Q_SIZE - 1); + } /* * need the pnode of where the memory was really allocated */ pa = uv_gpa(pqp); - pn = uv_gpa_to_pnode(pa); + pn = pa >> uv_nshift; uv_write_global_mmr64(pnode, UVH_LB_BAU_INTD_PAYLOAD_QUEUE_FIRST, ((unsigned long)pn << UV_PAYLOADQ_PNODE_SHIFT) | uv_physnodeaddr(pqp)); uv_write_global_mmr64(pnode, UVH_LB_BAU_INTD_PAYLOAD_QUEUE_TAIL, uv_physnodeaddr(pqp)); - bau_tablesp->va_queue_last = pqp + (DEST_Q_SIZE - 1); uv_write_global_mmr64(pnode, UVH_LB_BAU_INTD_PAYLOAD_QUEUE_LAST, (unsigned long) - uv_physnodeaddr(bau_tablesp->va_queue_last)); + uv_physnodeaddr(pqp + (DEST_Q_SIZE - 1))); + /* in effect, all msg_type's are set to MSG_NOOP */ memset(pqp, 0, sizeof(struct bau_payload_queue_entry) * DEST_Q_SIZE); - - return pqp; } /* - * Initialization of each UV blade's structures + * Initialization of each UV hub's structures */ -static int __init uv_init_blade(int blade) +static void __init uv_init_uvhub(int uvhub, int vector) { int node; int pnode; - unsigned long pa; unsigned long apicid; - struct bau_desc *adp; - struct bau_payload_queue_entry *pqp; - struct bau_control *bau_tablesp; - - node = blade_to_first_node(blade); - bau_tablesp = uv_table_bases_init(blade, node); - pnode = uv_blade_to_pnode(blade); - adp = uv_activation_descriptor_init(node, pnode); - pqp = uv_payload_queue_init(node, pnode, bau_tablesp); - uv_table_bases_finish(blade, bau_tablesp, adp); + + node = uvhub_to_first_node(uvhub); + pnode = uv_blade_to_pnode(uvhub); + uv_activation_descriptor_init(node, pnode); + uv_payload_queue_init(node, pnode); /* * the below initialization can't be in firmware because the * messaging IRQ will be determined by the OS */ - apicid = blade_to_first_apicid(blade); - pa = uv_read_global_mmr64(pnode, UVH_BAU_DATA_CONFIG); + apicid = uvhub_to_first_apicid(uvhub); uv_write_global_mmr64(pnode, UVH_BAU_DATA_CONFIG, - ((apicid << 32) | UV_BAU_MESSAGE)); - return 0; + ((apicid << 32) | vector)); +} + +/* + * initialize the bau_control structure for each cpu + */ +static void uv_init_per_cpu(int nuvhubs) +{ + int i, j, k; + int cpu; + int pnode; + int uvhub; + short socket = 0; + struct bau_control *bcp; + struct uvhub_desc *bdp; + struct socket_desc *sdp; + struct bau_control *hmaster = NULL; + struct bau_control *smaster = NULL; + struct socket_desc { + short num_cpus; + short cpu_number[16]; + }; + struct uvhub_desc { + short num_sockets; + short num_cpus; + short uvhub; + short pnode; + struct socket_desc socket[2]; + }; + struct uvhub_desc *uvhub_descs; + + uvhub_descs = (struct uvhub_desc *) + kmalloc(nuvhubs * sizeof(struct uvhub_desc), GFP_KERNEL); + memset(uvhub_descs, 0, nuvhubs * sizeof(struct uvhub_desc)); + for_each_present_cpu(cpu) { + bcp = &per_cpu(bau_control, cpu); + memset(bcp, 0, sizeof(struct bau_control)); + spin_lock_init(&bcp->masks_lock); + bcp->max_concurrent = uv_bau_max_concurrent; + pnode = uv_cpu_hub_info(cpu)->pnode; + uvhub = uv_cpu_hub_info(cpu)->numa_blade_id; + bdp = &uvhub_descs[uvhub]; + bdp->num_cpus++; + bdp->uvhub = uvhub; + bdp->pnode = pnode; + /* time interval to catch a hardware stay-busy bug */ + bcp->timeout_interval = millisec_2_cycles(3); + /* kludge: assume uv_hub.h is constant */ + socket = (cpu_physical_id(cpu)>>5)&1; + if (socket >= bdp->num_sockets) + bdp->num_sockets = socket+1; + sdp = &bdp->socket[socket]; + sdp->cpu_number[sdp->num_cpus] = cpu; + sdp->num_cpus++; + } + socket = 0; + for_each_possible_blade(uvhub) { + bdp = &uvhub_descs[uvhub]; + for (i = 0; i < bdp->num_sockets; i++) { + sdp = &bdp->socket[i]; + for (j = 0; j < sdp->num_cpus; j++) { + cpu = sdp->cpu_number[j]; + bcp = &per_cpu(bau_control, cpu); + bcp->cpu = cpu; + if (j == 0) { + smaster = bcp; + if (i == 0) + hmaster = bcp; + } + bcp->cpus_in_uvhub = bdp->num_cpus; + bcp->cpus_in_socket = sdp->num_cpus; + bcp->socket_master = smaster; + bcp->uvhub_master = hmaster; + for (k = 0; k < DEST_Q_SIZE; k++) + bcp->socket_acknowledge_count[k] = 0; + bcp->uvhub_cpu = + uv_cpu_hub_info(cpu)->blade_processor_id; + } + socket++; + } + } + kfree(uvhub_descs); } /* @@ -828,38 +1344,54 @@ static int __init uv_init_blade(int blade) */ static int __init uv_bau_init(void) { - int blade; - int nblades; + int uvhub; + int pnode; + int nuvhubs; int cur_cpu; + int vector; + unsigned long mmr; if (!is_uv_system()) return 0; + if (nobau) + return 0; + for_each_possible_cpu(cur_cpu) zalloc_cpumask_var_node(&per_cpu(uv_flush_tlb_mask, cur_cpu), GFP_KERNEL, cpu_to_node(cur_cpu)); - uv_bau_retry_limit = 1; + uv_bau_max_concurrent = MAX_BAU_CONCURRENT; + uv_nshift = uv_hub_info->m_val; uv_mmask = (1UL << uv_hub_info->m_val) - 1; - nblades = uv_num_possible_blades(); + nuvhubs = uv_num_possible_blades(); - uv_bau_table_bases = (struct bau_control **) - kmalloc(nblades * sizeof(struct bau_control *), GFP_KERNEL); - BUG_ON(!uv_bau_table_bases); + uv_init_per_cpu(nuvhubs); uv_partition_base_pnode = 0x7fffffff; - for (blade = 0; blade < nblades; blade++) - if (uv_blade_nr_possible_cpus(blade) && - (uv_blade_to_pnode(blade) < uv_partition_base_pnode)) - uv_partition_base_pnode = uv_blade_to_pnode(blade); - for (blade = 0; blade < nblades; blade++) - if (uv_blade_nr_possible_cpus(blade)) - uv_init_blade(blade); - - alloc_intr_gate(UV_BAU_MESSAGE, uv_bau_message_intr1); + for (uvhub = 0; uvhub < nuvhubs; uvhub++) + if (uv_blade_nr_possible_cpus(uvhub) && + (uv_blade_to_pnode(uvhub) < uv_partition_base_pnode)) + uv_partition_base_pnode = uv_blade_to_pnode(uvhub); + + vector = UV_BAU_MESSAGE; + for_each_possible_blade(uvhub) + if (uv_blade_nr_possible_cpus(uvhub)) + uv_init_uvhub(uvhub, vector); + uv_enable_timeouts(); + alloc_intr_gate(vector, uv_bau_message_intr1); + + for_each_possible_blade(uvhub) { + pnode = uv_blade_to_pnode(uvhub); + /* INIT the bau */ + uv_write_global_mmr64(pnode, UVH_LB_BAU_SB_ACTIVATION_CONTROL, + ((unsigned long)1 << 63)); + mmr = 1; /* should be 1 to broadcast to both sockets */ + uv_write_global_mmr64(pnode, UVH_BAU_DATA_BROADCAST, mmr); + } return 0; } -__initcall(uv_bau_init); -__initcall(uv_ptc_init); +core_initcall(uv_bau_init); +core_initcall(uv_ptc_init); diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index 1168e4454188..60788dee0f8a 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -15,6 +15,7 @@ #include <linux/kprobes.h> #include <linux/uaccess.h> #include <linux/kdebug.h> +#include <linux/kgdb.h> #include <linux/kernel.h> #include <linux/module.h> #include <linux/ptrace.h> @@ -108,15 +109,6 @@ static inline void preempt_conditional_cli(struct pt_regs *regs) dec_preempt_count(); } -#ifdef CONFIG_X86_32 -static inline void -die_if_kernel(const char *str, struct pt_regs *regs, long err) -{ - if (!user_mode_vm(regs)) - die(str, regs, err); -} -#endif - static void __kprobes do_trap(int trapnr, int signr, char *str, struct pt_regs *regs, long error_code, siginfo_t *info) @@ -400,7 +392,13 @@ static notrace __kprobes void default_do_nmi(struct pt_regs *regs) if (notify_die(DIE_NMI_IPI, "nmi_ipi", regs, reason, 2, SIGINT) == NOTIFY_STOP) return; + #ifdef CONFIG_X86_LOCAL_APIC + if (notify_die(DIE_NMI, "nmi", regs, reason, 2, SIGINT) + == NOTIFY_STOP) + return; + +#ifndef CONFIG_LOCKUP_DETECTOR /* * Ok, so this is none of the documented NMI sources, * so it must be the NMI watchdog. @@ -408,6 +406,7 @@ static notrace __kprobes void default_do_nmi(struct pt_regs *regs) if (nmi_watchdog_tick(regs, reason)) return; if (!do_nmi_callback(regs, cpu)) +#endif /* !CONFIG_LOCKUP_DETECTOR */ unknown_nmi_error(reason, regs); #else unknown_nmi_error(reason, regs); @@ -460,6 +459,11 @@ void restart_nmi(void) /* May run on IST stack. */ dotraplinkage void __kprobes do_int3(struct pt_regs *regs, long error_code) { +#ifdef CONFIG_KGDB_LOW_LEVEL_TRAP + if (kgdb_ll_trap(DIE_INT3, "int3", regs, error_code, 3, SIGTRAP) + == NOTIFY_STOP) + return; +#endif /* CONFIG_KGDB_LOW_LEVEL_TRAP */ #ifdef CONFIG_KPROBES if (notify_die(DIE_INT3, "int3", regs, error_code, 3, SIGTRAP) == NOTIFY_STOP) @@ -529,6 +533,7 @@ asmlinkage __kprobes struct pt_regs *sync_regs(struct pt_regs *eregs) dotraplinkage void __kprobes do_debug(struct pt_regs *regs, long error_code) { struct task_struct *tsk = current; + int user_icebp = 0; unsigned long dr6; int si_code; @@ -537,17 +542,25 @@ dotraplinkage void __kprobes do_debug(struct pt_regs *regs, long error_code) /* Filter out all the reserved bits which are preset to 1 */ dr6 &= ~DR6_RESERVED; + /* + * If dr6 has no reason to give us about the origin of this trap, + * then it's very likely the result of an icebp/int01 trap. + * User wants a sigtrap for that. + */ + if (!dr6 && user_mode(regs)) + user_icebp = 1; + /* Catch kmemcheck conditions first of all! */ if ((dr6 & DR_STEP) && kmemcheck_trap(regs)) return; /* DR6 may or may not be cleared by the CPU */ set_debugreg(0, 6); + /* * The processor cleared BTF, so don't mark that we need it set. */ - clear_tsk_thread_flag(tsk, TIF_DEBUGCTLMSR); - tsk->thread.debugctlmsr = 0; + clear_tsk_thread_flag(tsk, TIF_BLOCKSTEP); /* Store the virtualized DR6 value */ tsk->thread.debugreg6 = dr6; @@ -578,62 +591,74 @@ dotraplinkage void __kprobes do_debug(struct pt_regs *regs, long error_code) regs->flags &= ~X86_EFLAGS_TF; } si_code = get_si_code(tsk->thread.debugreg6); - if (tsk->thread.debugreg6 & (DR_STEP | DR_TRAP_BITS)) + if (tsk->thread.debugreg6 & (DR_STEP | DR_TRAP_BITS) || user_icebp) send_sigtrap(tsk, regs, error_code, si_code); preempt_conditional_cli(regs); return; } -#ifdef CONFIG_X86_64 -static int kernel_math_error(struct pt_regs *regs, const char *str, int trapnr) -{ - if (fixup_exception(regs)) - return 1; - - notify_die(DIE_GPF, str, regs, 0, trapnr, SIGFPE); - /* Illegal floating point operation in the kernel */ - current->thread.trap_no = trapnr; - die(str, regs, 0); - return 0; -} -#endif - /* * Note that we play around with the 'TS' bit in an attempt to get * the correct behaviour even in the presence of the asynchronous * IRQ13 behaviour */ -void math_error(void __user *ip) +void math_error(struct pt_regs *regs, int error_code, int trapnr) { - struct task_struct *task; + struct task_struct *task = current; siginfo_t info; - unsigned short cwd, swd, err; + unsigned short err; + char *str = (trapnr == 16) ? "fpu exception" : "simd exception"; + + if (notify_die(DIE_TRAP, str, regs, error_code, trapnr, SIGFPE) == NOTIFY_STOP) + return; + conditional_sti(regs); + + if (!user_mode_vm(regs)) + { + if (!fixup_exception(regs)) { + task->thread.error_code = error_code; + task->thread.trap_no = trapnr; + die(str, regs, error_code); + } + return; + } /* * Save the info for the exception handler and clear the error. */ - task = current; save_init_fpu(task); - task->thread.trap_no = 16; - task->thread.error_code = 0; + task->thread.trap_no = trapnr; + task->thread.error_code = error_code; info.si_signo = SIGFPE; info.si_errno = 0; - info.si_addr = ip; - /* - * (~cwd & swd) will mask out exceptions that are not set to unmasked - * status. 0x3f is the exception bits in these regs, 0x200 is the - * C1 reg you need in case of a stack fault, 0x040 is the stack - * fault bit. We should only be taking one exception at a time, - * so if this combination doesn't produce any single exception, - * then we have a bad program that isn't synchronizing its FPU usage - * and it will suffer the consequences since we won't be able to - * fully reproduce the context of the exception - */ - cwd = get_fpu_cwd(task); - swd = get_fpu_swd(task); + info.si_addr = (void __user *)regs->ip; + if (trapnr == 16) { + unsigned short cwd, swd; + /* + * (~cwd & swd) will mask out exceptions that are not set to unmasked + * status. 0x3f is the exception bits in these regs, 0x200 is the + * C1 reg you need in case of a stack fault, 0x040 is the stack + * fault bit. We should only be taking one exception at a time, + * so if this combination doesn't produce any single exception, + * then we have a bad program that isn't synchronizing its FPU usage + * and it will suffer the consequences since we won't be able to + * fully reproduce the context of the exception + */ + cwd = get_fpu_cwd(task); + swd = get_fpu_swd(task); - err = swd & ~cwd; + err = swd & ~cwd; + } else { + /* + * The SIMD FPU exceptions are handled a little differently, as there + * is only a single status/control register. Thus, to determine which + * unmasked exception was caught we must mask the exception mask bits + * at 0x1f80, and then use these to mask the exception bits at 0x3f. + */ + unsigned short mxcsr = get_fpu_mxcsr(task); + err = ~(mxcsr >> 7) & mxcsr; + } if (err & 0x001) { /* Invalid op */ /* @@ -662,97 +687,17 @@ void math_error(void __user *ip) dotraplinkage void do_coprocessor_error(struct pt_regs *regs, long error_code) { - conditional_sti(regs); - #ifdef CONFIG_X86_32 ignore_fpu_irq = 1; -#else - if (!user_mode(regs) && - kernel_math_error(regs, "kernel x87 math error", 16)) - return; #endif - math_error((void __user *)regs->ip); -} - -static void simd_math_error(void __user *ip) -{ - struct task_struct *task; - siginfo_t info; - unsigned short mxcsr; - - /* - * Save the info for the exception handler and clear the error. - */ - task = current; - save_init_fpu(task); - task->thread.trap_no = 19; - task->thread.error_code = 0; - info.si_signo = SIGFPE; - info.si_errno = 0; - info.si_code = __SI_FAULT; - info.si_addr = ip; - /* - * The SIMD FPU exceptions are handled a little differently, as there - * is only a single status/control register. Thus, to determine which - * unmasked exception was caught we must mask the exception mask bits - * at 0x1f80, and then use these to mask the exception bits at 0x3f. - */ - mxcsr = get_fpu_mxcsr(task); - switch (~((mxcsr & 0x1f80) >> 7) & (mxcsr & 0x3f)) { - case 0x000: - default: - break; - case 0x001: /* Invalid Op */ - info.si_code = FPE_FLTINV; - break; - case 0x002: /* Denormalize */ - case 0x010: /* Underflow */ - info.si_code = FPE_FLTUND; - break; - case 0x004: /* Zero Divide */ - info.si_code = FPE_FLTDIV; - break; - case 0x008: /* Overflow */ - info.si_code = FPE_FLTOVF; - break; - case 0x020: /* Precision */ - info.si_code = FPE_FLTRES; - break; - } - force_sig_info(SIGFPE, &info, task); + math_error(regs, error_code, 16); } dotraplinkage void do_simd_coprocessor_error(struct pt_regs *regs, long error_code) { - conditional_sti(regs); - -#ifdef CONFIG_X86_32 - if (cpu_has_xmm) { - /* Handle SIMD FPU exceptions on PIII+ processors. */ - ignore_fpu_irq = 1; - simd_math_error((void __user *)regs->ip); - return; - } - /* - * Handle strange cache flush from user space exception - * in all other cases. This is undocumented behaviour. - */ - if (regs->flags & X86_VM_MASK) { - handle_vm86_fault((struct kernel_vm86_regs *)regs, error_code); - return; - } - current->thread.trap_no = 19; - current->thread.error_code = error_code; - die_if_kernel("cache flush denied", regs, error_code); - force_sig(SIGSEGV, current); -#else - if (!user_mode(regs) && - kernel_math_error(regs, "kernel simd math error", 19)) - return; - simd_math_error((void __user *)regs->ip); -#endif + math_error(regs, error_code, 19); } dotraplinkage void @@ -879,6 +824,16 @@ dotraplinkage void do_iret_error(struct pt_regs *regs, long error_code) } #endif +/* Set of traps needed for early debugging. */ +void __init early_trap_init(void) +{ + set_intr_gate_ist(1, &debug, DEBUG_STACK); + /* int3 can be called from all */ + set_system_intr_gate_ist(3, &int3, DEBUG_STACK); + set_intr_gate(14, &page_fault); + load_idt(&idt_descr); +} + void __init trap_init(void) { int i; @@ -892,10 +847,7 @@ void __init trap_init(void) #endif set_intr_gate(0, ÷_error); - set_intr_gate_ist(1, &debug, DEBUG_STACK); set_intr_gate_ist(2, &nmi, NMI_STACK); - /* int3 can be called from all */ - set_system_intr_gate_ist(3, &int3, DEBUG_STACK); /* int4 can be called from all */ set_system_intr_gate(4, &overflow); set_intr_gate(5, &bounds); @@ -911,7 +863,6 @@ void __init trap_init(void) set_intr_gate(11, &segment_not_present); set_intr_gate_ist(12, &stack_segment, STACKFAULT_STACK); set_intr_gate(13, &general_protection); - set_intr_gate(14, &page_fault); set_intr_gate(15, &spurious_interrupt_bug); set_intr_gate(16, &coprocessor_error); set_intr_gate(17, &alignment_check); diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c index 9faf91ae1841..ce8e50239332 100644 --- a/arch/x86/kernel/tsc.c +++ b/arch/x86/kernel/tsc.c @@ -751,7 +751,6 @@ static struct clocksource clocksource_tsc = { .read = read_tsc, .resume = resume_tsc, .mask = CLOCKSOURCE_MASK(64), - .shift = 22, .flags = CLOCK_SOURCE_IS_CONTINUOUS | CLOCK_SOURCE_MUST_VERIFY, #ifdef CONFIG_X86_64 @@ -845,8 +844,6 @@ __cpuinit int unsynchronized_tsc(void) static void __init init_tsc_clocksource(void) { - clocksource_tsc.mult = clocksource_khz2mult(tsc_khz, - clocksource_tsc.shift); if (tsc_clocksource_reliable) clocksource_tsc.flags &= ~CLOCK_SOURCE_MUST_VERIFY; /* lower the rating if we already know its unstable: */ @@ -854,7 +851,7 @@ static void __init init_tsc_clocksource(void) clocksource_tsc.rating = 0; clocksource_tsc.flags &= ~CLOCK_SOURCE_IS_CONTINUOUS; } - clocksource_register(&clocksource_tsc); + clocksource_register_khz(&clocksource_tsc, tsc_khz); } #ifdef CONFIG_X86_64 diff --git a/arch/x86/kernel/uv_irq.c b/arch/x86/kernel/uv_irq.c index 1d40336b030a..1132129db792 100644 --- a/arch/x86/kernel/uv_irq.c +++ b/arch/x86/kernel/uv_irq.c @@ -44,7 +44,7 @@ static void uv_ack_apic(unsigned int irq) ack_APIC_irq(); } -struct irq_chip uv_irq_chip = { +static struct irq_chip uv_irq_chip = { .name = "UV-CORE", .startup = uv_noop_ret, .shutdown = uv_noop, @@ -141,7 +141,7 @@ int uv_irq_2_mmr_info(int irq, unsigned long *offset, int *pnode) */ static int arch_enable_uv_irq(char *irq_name, unsigned int irq, int cpu, int mmr_blade, - unsigned long mmr_offset, int restrict) + unsigned long mmr_offset, int limit) { const struct cpumask *eligible_cpu = cpumask_of(cpu); struct irq_desc *desc = irq_to_desc(irq); @@ -160,7 +160,7 @@ arch_enable_uv_irq(char *irq_name, unsigned int irq, int cpu, int mmr_blade, if (err != 0) return err; - if (restrict == UV_AFFINITY_CPU) + if (limit == UV_AFFINITY_CPU) desc->status |= IRQ_NO_BALANCING; else desc->status |= IRQ_MOVE_PCNTXT; @@ -214,7 +214,7 @@ static int uv_set_irq_affinity(unsigned int irq, const struct cpumask *mask) unsigned long mmr_value; struct uv_IO_APIC_route_entry *entry; unsigned long mmr_offset; - unsigned mmr_pnode; + int mmr_pnode; if (set_desc_affinity(desc, mask, &dest)) return -1; @@ -248,7 +248,7 @@ static int uv_set_irq_affinity(unsigned int irq, const struct cpumask *mask) * interrupt is raised. */ int uv_setup_irq(char *irq_name, int cpu, int mmr_blade, - unsigned long mmr_offset, int restrict) + unsigned long mmr_offset, int limit) { int irq, ret; @@ -258,7 +258,7 @@ int uv_setup_irq(char *irq_name, int cpu, int mmr_blade, return -EBUSY; ret = arch_enable_uv_irq(irq_name, irq, cpu, mmr_blade, mmr_offset, - restrict); + limit); if (ret == irq) uv_set_irq_2_mmr_info(irq, mmr_offset, mmr_blade); else diff --git a/arch/x86/kernel/verify_cpu_64.S b/arch/x86/kernel/verify_cpu_64.S index 45b6f8a975a1..56a8c2a867d9 100644 --- a/arch/x86/kernel/verify_cpu_64.S +++ b/arch/x86/kernel/verify_cpu_64.S @@ -31,6 +31,7 @@ */ #include <asm/cpufeature.h> +#include <asm/msr-index.h> verify_cpu: pushfl # Save caller passed flags @@ -88,7 +89,7 @@ verify_cpu_sse_test: je verify_cpu_sse_ok test %di,%di jz verify_cpu_no_longmode # only try to force SSE on AMD - movl $0xc0010015,%ecx # HWCR + movl $MSR_K7_HWCR,%ecx rdmsr btr $15,%eax # enable SSE wrmsr diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S index 2cc249718c46..d0bb52296fa3 100644 --- a/arch/x86/kernel/vmlinux.lds.S +++ b/arch/x86/kernel/vmlinux.lds.S @@ -97,7 +97,7 @@ SECTIONS HEAD_TEXT #ifdef CONFIG_X86_32 . = ALIGN(PAGE_SIZE); - *(.text.page_aligned) + *(.text..page_aligned) #endif . = ALIGN(8); _stext = .; @@ -305,7 +305,7 @@ SECTIONS . = ALIGN(PAGE_SIZE); .bss : AT(ADDR(.bss) - LOAD_OFFSET) { __bss_start = .; - *(.bss.page_aligned) + *(.bss..page_aligned) *(.bss) . = ALIGN(4); __bss_stop = .; diff --git a/arch/x86/kernel/vsyscall_64.c b/arch/x86/kernel/vsyscall_64.c index 1c0c6ab9c60f..dcbb28c4b694 100644 --- a/arch/x86/kernel/vsyscall_64.c +++ b/arch/x86/kernel/vsyscall_64.c @@ -73,8 +73,8 @@ void update_vsyscall_tz(void) write_sequnlock_irqrestore(&vsyscall_gtod_data.lock, flags); } -void update_vsyscall(struct timespec *wall_time, struct clocksource *clock, - u32 mult) +void update_vsyscall(struct timespec *wall_time, struct timespec *wtm, + struct clocksource *clock, u32 mult) { unsigned long flags; @@ -87,7 +87,7 @@ void update_vsyscall(struct timespec *wall_time, struct clocksource *clock, vsyscall_gtod_data.clock.shift = clock->shift; vsyscall_gtod_data.wall_time_sec = wall_time->tv_sec; vsyscall_gtod_data.wall_time_nsec = wall_time->tv_nsec; - vsyscall_gtod_data.wall_to_monotonic = wall_to_monotonic; + vsyscall_gtod_data.wall_to_monotonic = *wtm; vsyscall_gtod_data.wall_time_coarse = __current_kernel_time(); write_sequnlock_irqrestore(&vsyscall_gtod_data.lock, flags); } @@ -169,13 +169,18 @@ int __vsyscall(0) vgettimeofday(struct timeval * tv, struct timezone * tz) * unlikely */ time_t __vsyscall(1) vtime(time_t *t) { - struct timeval tv; + unsigned seq; time_t result; if (unlikely(!__vsyscall_gtod_data.sysctl_enabled)) return time_syscall(t); - vgettimeofday(&tv, NULL); - result = tv.tv_sec; + do { + seq = read_seqbegin(&__vsyscall_gtod_data.lock); + + result = __vsyscall_gtod_data.wall_time_sec; + + } while (read_seqretry(&__vsyscall_gtod_data.lock, seq)); + if (t) *t = result; return result; diff --git a/arch/x86/kernel/x8664_ksyms_64.c b/arch/x86/kernel/x8664_ksyms_64.c index 693920b22496..1b950d151e58 100644 --- a/arch/x86/kernel/x8664_ksyms_64.c +++ b/arch/x86/kernel/x8664_ksyms_64.c @@ -54,7 +54,6 @@ EXPORT_SYMBOL(memcpy); EXPORT_SYMBOL(__memcpy); EXPORT_SYMBOL(empty_zero_page); -EXPORT_SYMBOL(init_level4_pgt); #ifndef CONFIG_PARAVIRT EXPORT_SYMBOL(native_load_gs_index); #endif diff --git a/arch/x86/kernel/x86_init.c b/arch/x86/kernel/x86_init.c index 61a1e8c7e19f..cd6da6bf3eca 100644 --- a/arch/x86/kernel/x86_init.c +++ b/arch/x86/kernel/x86_init.c @@ -5,6 +5,7 @@ */ #include <linux/init.h> #include <linux/ioport.h> +#include <linux/module.h> #include <asm/bios_ebda.h> #include <asm/paravirt.h> @@ -85,6 +86,7 @@ struct x86_cpuinit_ops x86_cpuinit __cpuinitdata = { }; static void default_nmi_init(void) { }; +static int default_i8042_detect(void) { return 1; }; struct x86_platform_ops x86_platform = { .calibrate_tsc = native_calibrate_tsc, @@ -92,5 +94,8 @@ struct x86_platform_ops x86_platform = { .set_wallclock = mach_set_rtc_mmss, .iommu_shutdown = iommu_shutdown_noop, .is_untracked_pat_range = is_ISA_range, - .nmi_init = default_nmi_init + .nmi_init = default_nmi_init, + .i8042_detect = default_i8042_detect }; + +EXPORT_SYMBOL_GPL(x86_platform); diff --git a/arch/x86/kernel/xsave.c b/arch/x86/kernel/xsave.c index 782c3a362ec6..9c253bd65e24 100644 --- a/arch/x86/kernel/xsave.c +++ b/arch/x86/kernel/xsave.c @@ -16,11 +16,88 @@ */ u64 pcntxt_mask; +/* + * Represents init state for the supported extended state. + */ +static struct xsave_struct *init_xstate_buf; + struct _fpx_sw_bytes fx_sw_reserved; #ifdef CONFIG_IA32_EMULATION struct _fpx_sw_bytes fx_sw_reserved_ia32; #endif +static unsigned int *xstate_offsets, *xstate_sizes, xstate_features; + +/* + * If a processor implementation discern that a processor state component is + * in its initialized state it may modify the corresponding bit in the + * xsave_hdr.xstate_bv as '0', with out modifying the corresponding memory + * layout in the case of xsaveopt. While presenting the xstate information to + * the user, we always ensure that the memory layout of a feature will be in + * the init state if the corresponding header bit is zero. This is to ensure + * that the user doesn't see some stale state in the memory layout during + * signal handling, debugging etc. + */ +void __sanitize_i387_state(struct task_struct *tsk) +{ + u64 xstate_bv; + int feature_bit = 0x2; + struct i387_fxsave_struct *fx = &tsk->thread.fpu.state->fxsave; + + if (!fx) + return; + + BUG_ON(task_thread_info(tsk)->status & TS_USEDFPU); + + xstate_bv = tsk->thread.fpu.state->xsave.xsave_hdr.xstate_bv; + + /* + * None of the feature bits are in init state. So nothing else + * to do for us, as the memory layout is upto date. + */ + if ((xstate_bv & pcntxt_mask) == pcntxt_mask) + return; + + /* + * FP is in init state + */ + if (!(xstate_bv & XSTATE_FP)) { + fx->cwd = 0x37f; + fx->swd = 0; + fx->twd = 0; + fx->fop = 0; + fx->rip = 0; + fx->rdp = 0; + memset(&fx->st_space[0], 0, 128); + } + + /* + * SSE is in init state + */ + if (!(xstate_bv & XSTATE_SSE)) + memset(&fx->xmm_space[0], 0, 256); + + xstate_bv = (pcntxt_mask & ~xstate_bv) >> 2; + + /* + * Update all the other memory layouts for which the corresponding + * header bit is in the init state. + */ + while (xstate_bv) { + if (xstate_bv & 0x1) { + int offset = xstate_offsets[feature_bit]; + int size = xstate_sizes[feature_bit]; + + memcpy(((void *) fx) + offset, + ((void *) init_xstate_buf) + offset, + size); + } + + xstate_bv >>= 1; + feature_bit++; + } +} + /* * Check for the presence of extended state information in the * user fpstate pointer in the sigcontext. @@ -36,15 +113,14 @@ int check_for_xstate(struct i387_fxsave_struct __user *buf, err = __copy_from_user(fx_sw_user, &buf->sw_reserved[0], sizeof(struct _fpx_sw_bytes)); - if (err) - return err; + return -EFAULT; /* * First Magic check failed. */ if (fx_sw_user->magic1 != FP_XSTATE_MAGIC1) - return -1; + return -EINVAL; /* * Check for error scenarios. @@ -52,19 +128,21 @@ int check_for_xstate(struct i387_fxsave_struct __user *buf, if (fx_sw_user->xstate_size < min_xstate_size || fx_sw_user->xstate_size > xstate_size || fx_sw_user->xstate_size > fx_sw_user->extended_size) - return -1; + return -EINVAL; err = __get_user(magic2, (__u32 *) (((void *)fpstate) + fx_sw_user->extended_size - FP_XSTATE_MAGIC2_SIZE)); + if (err) + return err; /* * Check for the presence of second magic word at the end of memory * layout. This detects the case where the user just copied the legacy * fpstate layout with out copying the extended state information * in the memory layout. */ - if (err || magic2 != FP_XSTATE_MAGIC2) - return -1; + if (magic2 != FP_XSTATE_MAGIC2) + return -EFAULT; return 0; } @@ -91,15 +169,7 @@ int save_i387_xstate(void __user *buf) return 0; if (task_thread_info(tsk)->status & TS_USEDFPU) { - /* - * Start with clearing the user buffer. This will present a - * clean context for the bytes not touched by the fxsave/xsave. - */ - err = __clear_user(buf, sig_xstate_size); - if (err) - return err; - - if (task_thread_info(tsk)->status & TS_XSAVE) + if (use_xsave()) err = xsave_user(buf); else err = fxsave_user(buf); @@ -109,14 +179,15 @@ int save_i387_xstate(void __user *buf) task_thread_info(tsk)->status &= ~TS_USEDFPU; stts(); } else { - if (__copy_to_user(buf, &tsk->thread.xstate->fxsave, + sanitize_i387_state(tsk); + if (__copy_to_user(buf, &tsk->thread.fpu.state->fxsave, xstate_size)) return -1; } clear_used_math(); /* trigger finit */ - if (task_thread_info(tsk)->status & TS_XSAVE) { + if (use_xsave()) { struct _fpstate __user *fx = buf; struct _xstate __user *x = buf; u64 xstate_bv; @@ -184,8 +255,8 @@ static int restore_user_xstate(void __user *buf) * init the state skipped by the user. */ mask = pcntxt_mask & ~mask; - - xrstor_state(init_xstate_buf, mask); + if (unlikely(mask)) + xrstor_state(init_xstate_buf, mask); return 0; @@ -225,7 +296,7 @@ int restore_i387_xstate(void __user *buf) clts(); task_thread_info(current)->status |= TS_USEDFPU; } - if (task_thread_info(tsk)->status & TS_XSAVE) + if (use_xsave()) err = restore_user_xstate(buf); else err = fxrstor_checking((__force struct i387_fxsave_struct *) @@ -274,11 +345,6 @@ static void prepare_fx_sw_frame(void) #endif } -/* - * Represents init state for the supported extended state. - */ -struct xsave_struct *init_xstate_buf; - #ifdef CONFIG_X86_64 unsigned int sig_xstate_size = sizeof(struct _fpstate); #endif @@ -286,37 +352,77 @@ unsigned int sig_xstate_size = sizeof(struct _fpstate); /* * Enable the extended processor state save/restore feature */ -void __cpuinit xsave_init(void) +static inline void xstate_enable(void) { - if (!cpu_has_xsave) - return; - set_in_cr4(X86_CR4_OSXSAVE); - - /* - * Enable all the features that the HW is capable of - * and the Linux kernel is aware of. - */ xsetbv(XCR_XFEATURE_ENABLED_MASK, pcntxt_mask); } /* + * Record the offsets and sizes of different state managed by the xsave + * memory layout. + */ +static void __init setup_xstate_features(void) +{ + int eax, ebx, ecx, edx, leaf = 0x2; + + xstate_features = fls64(pcntxt_mask); + xstate_offsets = alloc_bootmem(xstate_features * sizeof(int)); + xstate_sizes = alloc_bootmem(xstate_features * sizeof(int)); + + do { + cpuid_count(XSTATE_CPUID, leaf, &eax, &ebx, &ecx, &edx); + + if (eax == 0) + break; + + xstate_offsets[leaf] = ebx; + xstate_sizes[leaf] = eax; + + leaf++; + } while (1); +} + +/* * setup the xstate image representing the init state */ static void __init setup_xstate_init(void) { + setup_xstate_features(); + + /* + * Setup init_xstate_buf to represent the init state of + * all the features managed by the xsave + */ init_xstate_buf = alloc_bootmem(xstate_size); init_xstate_buf->i387.mxcsr = MXCSR_DEFAULT; + + clts(); + /* + * Init all the features state with header_bv being 0x0 + */ + xrstor_state(init_xstate_buf, -1); + /* + * Dump the init state again. This is to identify the init state + * of any feature which is not represented by all zero's. + */ + xsave_state(init_xstate_buf, -1); + stts(); } /* * Enable and initialize the xsave feature. */ -void __ref xsave_cntxt_init(void) +static void __init xstate_enable_boot_cpu(void) { unsigned int eax, ebx, ecx, edx; - cpuid_count(0xd, 0, &eax, &ebx, &ecx, &edx); + if (boot_cpu_data.cpuid_level < XSTATE_CPUID) { + WARN(1, KERN_ERR "XSTATE_CPUID missing\n"); + return; + } + + cpuid_count(XSTATE_CPUID, 0, &eax, &ebx, &ecx, &edx); pcntxt_mask = eax + ((u64)edx << 32); if ((pcntxt_mask & XSTATE_FPSSE) != XSTATE_FPSSE) { @@ -329,12 +435,13 @@ void __ref xsave_cntxt_init(void) * Support only the state known to OS. */ pcntxt_mask = pcntxt_mask & XCNTXT_MASK; - xsave_init(); + + xstate_enable(); /* * Recompute the context size for enabled features */ - cpuid_count(0xd, 0, &eax, &ebx, &ecx, &edx); + cpuid_count(XSTATE_CPUID, 0, &eax, &ebx, &ecx, &edx); xstate_size = ebx; update_regset_xstate_info(xstate_size, pcntxt_mask); @@ -346,3 +453,23 @@ void __ref xsave_cntxt_init(void) "cntxt size 0x%x\n", pcntxt_mask, xstate_size); } + +/* + * For the very first instance, this calls xstate_enable_boot_cpu(); + * for all subsequent instances, this calls xstate_enable(). + * + * This is somewhat obfuscated due to the lack of powerful enough + * overrides for the section checks. + */ +void __cpuinit xsave_init(void) +{ + static __refdata void (*next_func)(void) = xstate_enable_boot_cpu; + void (*this_func)(void); + + if (!cpu_has_xsave) + return; + + this_func = next_func; + next_func = xstate_enable; + this_func(); +} |