diff options
Diffstat (limited to 'arch/x86_64')
43 files changed, 856 insertions, 563 deletions
diff --git a/arch/x86_64/Kconfig b/arch/x86_64/Kconfig index 010d2265f1cf..bfbb9bcae123 100644 --- a/arch/x86_64/Kconfig +++ b/arch/x86_64/Kconfig @@ -122,7 +122,7 @@ endchoice choice prompt "Processor family" - default MK8 + default GENERIC_CPU config MK8 bool "AMD-Opteron/Athlon64" @@ -130,16 +130,31 @@ config MK8 Optimize for AMD Opteron/Athlon64/Hammer/K8 CPUs. config MPSC - bool "Intel EM64T" + bool "Intel P4 / older Netburst based Xeon" help - Optimize for Intel Pentium 4 and Xeon CPUs with Intel - Extended Memory 64 Technology(EM64T). For details see + Optimize for Intel Pentium 4 and older Nocona/Dempsey Xeon CPUs + with Intel Extended Memory 64 Technology(EM64T). For details see <http://www.intel.com/technology/64bitextensions/>. + Note the the latest Xeons (Xeon 51xx and 53xx) are not based on the + Netburst core and shouldn't use this option. You can distingush them + using the cpu family field + in /proc/cpuinfo. Family 15 is a older Xeon, Family 6 a newer one + (this rule only applies to system that support EM64T) + +config MCORE2 + bool "Intel Core2 / newer Xeon" + help + Optimize for Intel Core2 and newer Xeons (51xx) + You can distingush the newer Xeons from the older ones using + the cpu family field in /proc/cpuinfo. 15 is a older Xeon + (use CONFIG_MPSC then), 6 is a newer one. This rule only + applies to CPUs that support EM64T. config GENERIC_CPU bool "Generic-x86-64" help Generic x86-64 CPU. + Run equally well on all x86-64 CPUs. endchoice @@ -149,12 +164,12 @@ endchoice config X86_L1_CACHE_BYTES int default "128" if GENERIC_CPU || MPSC - default "64" if MK8 + default "64" if MK8 || MCORE2 config X86_L1_CACHE_SHIFT int default "7" if GENERIC_CPU || MPSC - default "6" if MK8 + default "6" if MK8 || MCORE2 config X86_INTERNODE_CACHE_BYTES int @@ -344,11 +359,6 @@ config ARCH_DISCONTIGMEM_ENABLE depends on NUMA default y - -config ARCH_DISCONTIGMEM_ENABLE - def_bool y - depends on NUMA - config ARCH_DISCONTIGMEM_DEFAULT def_bool y depends on NUMA @@ -455,6 +465,17 @@ config CALGARY_IOMMU Normally the kernel will make the right choice by itself. If unsure, say Y. +config CALGARY_IOMMU_ENABLED_BY_DEFAULT + bool "Should Calgary be enabled by default?" + default y + depends on CALGARY_IOMMU + help + Should Calgary be enabled by default? if you choose 'y', Calgary + will be used (if it exists). If you choose 'n', Calgary will not be + used even if it exists. If you choose 'n' and would like to use + Calgary anyway, pass 'iommu=calgary' on the kernel command line. + If unsure, say Y. + # need this always selected by IOMMU for the VIA workaround config SWIOTLB bool diff --git a/arch/x86_64/Makefile b/arch/x86_64/Makefile index 13972148058d..b471b8550d03 100644 --- a/arch/x86_64/Makefile +++ b/arch/x86_64/Makefile @@ -30,6 +30,10 @@ cflags-y := cflags-kernel-y := cflags-$(CONFIG_MK8) += $(call cc-option,-march=k8) cflags-$(CONFIG_MPSC) += $(call cc-option,-march=nocona) +# gcc doesn't support -march=core2 yet as of gcc 4.3, but I hope it +# will eventually. Use -mtune=generic as fallback +cflags-$(CONFIG_MCORE2) += \ + $(call cc-option,-march=core2,$(call cc-option,-mtune=generic)) cflags-$(CONFIG_GENERIC_CPU) += $(call cc-option,-mtune=generic) cflags-y += -m64 @@ -66,8 +70,8 @@ AFLAGS += $(call as-instr,.cfi_startproc\n.cfi_endproc,-DCONFIG_AS_CFI=1,) cflags-y += $(call as-instr,.cfi_startproc\n.cfi_signal_frame\n.cfi_endproc,-DCONFIG_AS_CFI_SIGNAL_FRAME=1,) AFLAGS += $(call as-instr,.cfi_startproc\n.cfi_signal_frame\n.cfi_endproc,-DCONFIG_AS_CFI_SIGNAL_FRAME=1,) -cflags-$(CONFIG_CC_STACKPROTECTOR) += $(shell $(CONFIG_SHELL) $(srctree)/scripts/gcc-x86_64-has-stack-protector.sh $(CC) -fstack-protector ) -cflags-$(CONFIG_CC_STACKPROTECTOR_ALL) += $(shell $(CONFIG_SHELL) $(srctree)/scripts/gcc-x86_64-has-stack-protector.sh $(CC) -fstack-protector-all ) +cflags-$(CONFIG_CC_STACKPROTECTOR) += $(shell $(CONFIG_SHELL) $(srctree)/scripts/gcc-x86_64-has-stack-protector.sh "$(CC)" -fstack-protector ) +cflags-$(CONFIG_CC_STACKPROTECTOR_ALL) += $(shell $(CONFIG_SHELL) $(srctree)/scripts/gcc-x86_64-has-stack-protector.sh "$(CC)" -fstack-protector-all ) CFLAGS += $(cflags-y) CFLAGS_KERNEL += $(cflags-kernel-y) diff --git a/arch/x86_64/boot/setup.S b/arch/x86_64/boot/setup.S index c3bfd223ab49..770940cc0108 100644 --- a/arch/x86_64/boot/setup.S +++ b/arch/x86_64/boot/setup.S @@ -836,13 +836,12 @@ gdt: .word 0x9200 # data read/write .word 0x00CF # granularity = 4096, 386 # (+5th nibble of limit) +gdt_end: idt_48: .word 0 # idt limit = 0 .word 0, 0 # idt base = 0L gdt_48: - .word 0x8000 # gdt limit=2048, - # 256 GDT entries - + .word gdt_end-gdt-1 # gdt limit .word 0, 0 # gdt base (filled in later) # Include video setup & detection code diff --git a/arch/x86_64/defconfig b/arch/x86_64/defconfig index 0f5d44e86be5..96f226cfb339 100644 --- a/arch/x86_64/defconfig +++ b/arch/x86_64/defconfig @@ -1,7 +1,7 @@ # # Automatically generated make config: don't edit -# Linux kernel version: 2.6.19-rc2-git4 -# Sat Oct 21 03:38:52 2006 +# Linux kernel version: 2.6.19-git7 +# Wed Dec 6 23:50:47 2006 # CONFIG_X86_64=y CONFIG_64BIT=y @@ -47,13 +47,14 @@ CONFIG_POSIX_MQUEUE=y CONFIG_IKCONFIG=y CONFIG_IKCONFIG_PROC=y # CONFIG_CPUSETS is not set +CONFIG_SYSFS_DEPRECATED=y # CONFIG_RELAY is not set CONFIG_INITRAMFS_SOURCE="" CONFIG_CC_OPTIMIZE_FOR_SIZE=y CONFIG_SYSCTL=y # CONFIG_EMBEDDED is not set CONFIG_UID16=y -# CONFIG_SYSCTL_SYSCALL is not set +CONFIG_SYSCTL_SYSCALL=y CONFIG_KALLSYMS=y CONFIG_KALLSYMS_ALL=y # CONFIG_KALLSYMS_EXTRA_PASS is not set @@ -87,9 +88,7 @@ CONFIG_STOP_MACHINE=y # Block layer # CONFIG_BLOCK=y -CONFIG_LBD=y # CONFIG_BLK_DEV_IO_TRACE is not set -# CONFIG_LSF is not set # # IO Schedulers @@ -111,10 +110,11 @@ CONFIG_X86_PC=y # CONFIG_X86_VSMP is not set # CONFIG_MK8 is not set # CONFIG_MPSC is not set -CONFIG_GENERIC_CPU=y -CONFIG_X86_L1_CACHE_BYTES=128 -CONFIG_X86_L1_CACHE_SHIFT=7 -CONFIG_X86_INTERNODE_CACHE_BYTES=128 +CONFIG_MCORE2=y +# CONFIG_GENERIC_CPU is not set +CONFIG_X86_L1_CACHE_BYTES=64 +CONFIG_X86_L1_CACHE_SHIFT=6 +CONFIG_X86_INTERNODE_CACHE_BYTES=64 CONFIG_X86_TSC=y CONFIG_X86_GOOD_APIC=y # CONFIG_MICROCODE is not set @@ -322,6 +322,7 @@ CONFIG_INET_TCP_DIAG=y # CONFIG_TCP_CONG_ADVANCED is not set CONFIG_TCP_CONG_CUBIC=y CONFIG_DEFAULT_TCP_CONG="cubic" +# CONFIG_TCP_MD5SIG is not set CONFIG_IPV6=y # CONFIG_IPV6_PRIVACY is not set # CONFIG_IPV6_ROUTER_PREF is not set @@ -624,6 +625,7 @@ CONFIG_SATA_INTEL_COMBINED=y # CONFIG_PATA_IT821X is not set # CONFIG_PATA_JMICRON is not set # CONFIG_PATA_TRIFLEX is not set +# CONFIG_PATA_MARVELL is not set # CONFIG_PATA_MPIIX is not set # CONFIG_PATA_OLDPIIX is not set # CONFIG_PATA_NETCELL is not set @@ -795,6 +797,7 @@ CONFIG_BNX2=y CONFIG_S2IO=m # CONFIG_S2IO_NAPI is not set # CONFIG_MYRI10GE is not set +# CONFIG_NETXEN_NIC is not set # # Token Ring devices @@ -927,10 +930,6 @@ CONFIG_RTC=y # CONFIG_DTLK is not set # CONFIG_R3964 is not set # CONFIG_APPLICOM is not set - -# -# Ftape, the floppy tape device driver -# CONFIG_AGP=y CONFIG_AGP_AMD64=y CONFIG_AGP_INTEL=y @@ -1135,6 +1134,7 @@ CONFIG_USB_DEVICEFS=y # CONFIG_USB_BANDWIDTH is not set # CONFIG_USB_DYNAMIC_MINORS is not set # CONFIG_USB_SUSPEND is not set +# CONFIG_USB_MULTITHREAD_PROBE is not set # CONFIG_USB_OTG is not set # @@ -1212,6 +1212,7 @@ CONFIG_USB_HIDINPUT=y # CONFIG_USB_KAWETH is not set # CONFIG_USB_PEGASUS is not set # CONFIG_USB_RTL8150 is not set +# CONFIG_USB_USBNET_MII is not set # CONFIG_USB_USBNET is not set CONFIG_USB_MON=y diff --git a/arch/x86_64/ia32/ia32_binfmt.c b/arch/x86_64/ia32/ia32_binfmt.c index 82ef182de6ae..543ef4f405e9 100644 --- a/arch/x86_64/ia32/ia32_binfmt.c +++ b/arch/x86_64/ia32/ia32_binfmt.c @@ -305,8 +305,6 @@ MODULE_AUTHOR("Eric Youngdale, Andi Kleen"); #undef MODULE_DESCRIPTION #undef MODULE_AUTHOR -#define elf_addr_t __u32 - static void elf32_init(struct pt_regs *); #define ARCH_HAS_SETUP_ADDITIONAL_PAGES 1 @@ -351,7 +349,7 @@ int ia32_setup_arg_pages(struct linux_binprm *bprm, unsigned long stack_top, bprm->loader += stack_base; bprm->exec += stack_base; - mpnt = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL); + mpnt = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL); if (!mpnt) return -ENOMEM; diff --git a/arch/x86_64/ia32/ia32_signal.c b/arch/x86_64/ia32/ia32_signal.c index 0e0a266d976f..ff499ef2a1ba 100644 --- a/arch/x86_64/ia32/ia32_signal.c +++ b/arch/x86_64/ia32/ia32_signal.c @@ -584,6 +584,11 @@ int ia32_setup_rt_frame(int sig, struct k_sigaction *ka, siginfo_t *info, regs->rdx = (unsigned long) &frame->info; regs->rcx = (unsigned long) &frame->uc; + /* Make -mregparm=3 work */ + regs->rax = sig; + regs->rdx = (unsigned long) &frame->info; + regs->rcx = (unsigned long) &frame->uc; + asm volatile("movl %0,%%ds" :: "r" (__USER32_DS)); asm volatile("movl %0,%%es" :: "r" (__USER32_DS)); diff --git a/arch/x86_64/ia32/ptrace32.c b/arch/x86_64/ia32/ptrace32.c index 3a7561d4703e..04566fe5de49 100644 --- a/arch/x86_64/ia32/ptrace32.c +++ b/arch/x86_64/ia32/ptrace32.c @@ -244,6 +244,8 @@ asmlinkage long sys32_ptrace(long request, u32 pid, u32 addr, u32 data) case PTRACE_DETACH: case PTRACE_SYSCALL: case PTRACE_SETOPTIONS: + case PTRACE_SET_THREAD_AREA: + case PTRACE_GET_THREAD_AREA: return sys_ptrace(request, pid, addr, data); default: diff --git a/arch/x86_64/ia32/syscall32.c b/arch/x86_64/ia32/syscall32.c index 3a01329473ab..3e5ed20cba45 100644 --- a/arch/x86_64/ia32/syscall32.c +++ b/arch/x86_64/ia32/syscall32.c @@ -49,7 +49,7 @@ int syscall32_setup_pages(struct linux_binprm *bprm, int exstack) struct mm_struct *mm = current->mm; int ret; - vma = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL); + vma = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL); if (!vma) return -ENOMEM; diff --git a/arch/x86_64/kernel/apic.c b/arch/x86_64/kernel/apic.c index 4d9d5ed942b2..124b2d27b4ac 100644 --- a/arch/x86_64/kernel/apic.c +++ b/arch/x86_64/kernel/apic.c @@ -25,6 +25,7 @@ #include <linux/kernel_stat.h> #include <linux/sysdev.h> #include <linux/module.h> +#include <linux/ioport.h> #include <asm/atomic.h> #include <asm/smp.h> @@ -45,6 +46,12 @@ int apic_calibrate_pmtmr __initdata; int disable_apic_timer __initdata; +static struct resource *ioapic_resources; +static struct resource lapic_resource = { + .name = "Local APIC", + .flags = IORESOURCE_MEM | IORESOURCE_BUSY, +}; + /* * cpu_mask that denotes the CPUs that needs timer interrupt coming in as * IPIs in place of local APIC timers @@ -133,7 +140,6 @@ void clear_local_APIC(void) apic_write(APIC_LVTERR, APIC_LVT_MASKED); if (maxlvt >= 4) apic_write(APIC_LVTPC, APIC_LVT_MASKED); - v = GET_APIC_VERSION(apic_read(APIC_LVR)); apic_write(APIC_ESR, 0); apic_read(APIC_ESR); } @@ -452,23 +458,30 @@ static struct { static int lapic_suspend(struct sys_device *dev, pm_message_t state) { unsigned long flags; + int maxlvt; if (!apic_pm_state.active) return 0; + maxlvt = get_maxlvt(); + apic_pm_state.apic_id = apic_read(APIC_ID); apic_pm_state.apic_taskpri = apic_read(APIC_TASKPRI); apic_pm_state.apic_ldr = apic_read(APIC_LDR); apic_pm_state.apic_dfr = apic_read(APIC_DFR); apic_pm_state.apic_spiv = apic_read(APIC_SPIV); apic_pm_state.apic_lvtt = apic_read(APIC_LVTT); - apic_pm_state.apic_lvtpc = apic_read(APIC_LVTPC); + if (maxlvt >= 4) + apic_pm_state.apic_lvtpc = apic_read(APIC_LVTPC); apic_pm_state.apic_lvt0 = apic_read(APIC_LVT0); apic_pm_state.apic_lvt1 = apic_read(APIC_LVT1); apic_pm_state.apic_lvterr = apic_read(APIC_LVTERR); apic_pm_state.apic_tmict = apic_read(APIC_TMICT); apic_pm_state.apic_tdcr = apic_read(APIC_TDCR); - apic_pm_state.apic_thmr = apic_read(APIC_LVTTHMR); +#ifdef CONFIG_X86_MCE_INTEL + if (maxlvt >= 5) + apic_pm_state.apic_thmr = apic_read(APIC_LVTTHMR); +#endif local_irq_save(flags); disable_local_APIC(); local_irq_restore(flags); @@ -479,10 +492,13 @@ static int lapic_resume(struct sys_device *dev) { unsigned int l, h; unsigned long flags; + int maxlvt; if (!apic_pm_state.active) return 0; + maxlvt = get_maxlvt(); + local_irq_save(flags); rdmsr(MSR_IA32_APICBASE, l, h); l &= ~MSR_IA32_APICBASE_BASE; @@ -496,8 +512,12 @@ static int lapic_resume(struct sys_device *dev) apic_write(APIC_SPIV, apic_pm_state.apic_spiv); apic_write(APIC_LVT0, apic_pm_state.apic_lvt0); apic_write(APIC_LVT1, apic_pm_state.apic_lvt1); - apic_write(APIC_LVTTHMR, apic_pm_state.apic_thmr); - apic_write(APIC_LVTPC, apic_pm_state.apic_lvtpc); +#ifdef CONFIG_X86_MCE_INTEL + if (maxlvt >= 5) + apic_write(APIC_LVTTHMR, apic_pm_state.apic_thmr); +#endif + if (maxlvt >= 4) + apic_write(APIC_LVTPC, apic_pm_state.apic_lvtpc); apic_write(APIC_LVTT, apic_pm_state.apic_lvtt); apic_write(APIC_TDCR, apic_pm_state.apic_tdcr); apic_write(APIC_TMICT, apic_pm_state.apic_tmict); @@ -585,6 +605,64 @@ static int __init detect_init_APIC (void) return 0; } +#ifdef CONFIG_X86_IO_APIC +static struct resource * __init ioapic_setup_resources(void) +{ +#define IOAPIC_RESOURCE_NAME_SIZE 11 + unsigned long n; + struct resource *res; + char *mem; + int i; + + if (nr_ioapics <= 0) + return NULL; + + n = IOAPIC_RESOURCE_NAME_SIZE + sizeof(struct resource); + n *= nr_ioapics; + + mem = alloc_bootmem(n); + res = (void *)mem; + + if (mem != NULL) { + memset(mem, 0, n); + mem += sizeof(struct resource) * nr_ioapics; + + for (i = 0; i < nr_ioapics; i++) { + res[i].name = mem; + res[i].flags = IORESOURCE_MEM | IORESOURCE_BUSY; + sprintf(mem, "IOAPIC %u", i); + mem += IOAPIC_RESOURCE_NAME_SIZE; + } + } + + ioapic_resources = res; + + return res; +} + +static int __init ioapic_insert_resources(void) +{ + int i; + struct resource *r = ioapic_resources; + + if (!r) { + printk("IO APIC resources could be not be allocated.\n"); + return -1; + } + + for (i = 0; i < nr_ioapics; i++) { + insert_resource(&iomem_resource, r); + r++; + } + + return 0; +} + +/* Insert the IO APIC resources after PCI initialization has occured to handle + * IO APICS that are mapped in on a BAR in PCI space. */ +late_initcall(ioapic_insert_resources); +#endif + void __init init_apic_mappings(void) { unsigned long apic_phys; @@ -604,6 +682,11 @@ void __init init_apic_mappings(void) apic_mapped = 1; apic_printk(APIC_VERBOSE,"mapped APIC to %16lx (%16lx)\n", APIC_BASE, apic_phys); + /* Put local APIC into the resource map. */ + lapic_resource.start = apic_phys; + lapic_resource.end = lapic_resource.start + PAGE_SIZE - 1; + insert_resource(&iomem_resource, &lapic_resource); + /* * Fetch the APIC ID of the BSP in case we have a * default configuration (or the MP table is broken). @@ -613,7 +696,9 @@ void __init init_apic_mappings(void) { unsigned long ioapic_phys, idx = FIX_IO_APIC_BASE_0; int i; + struct resource *ioapic_res; + ioapic_res = ioapic_setup_resources(); for (i = 0; i < nr_ioapics; i++) { if (smp_found_config) { ioapic_phys = mp_ioapics[i].mpc_apicaddr; @@ -625,6 +710,12 @@ void __init init_apic_mappings(void) apic_printk(APIC_VERBOSE,"mapped IOAPIC to %016lx (%016lx)\n", __fix_to_virt(idx), ioapic_phys); idx++; + + if (ioapic_res != NULL) { + ioapic_res->start = ioapic_phys; + ioapic_res->end = ioapic_phys + (4 * 1024) - 1; + ioapic_res++; + } } } } @@ -644,10 +735,9 @@ void __init init_apic_mappings(void) static void __setup_APIC_LVTT(unsigned int clocks) { - unsigned int lvtt_value, tmp_value, ver; + unsigned int lvtt_value, tmp_value; int cpu = smp_processor_id(); - ver = GET_APIC_VERSION(apic_read(APIC_LVR)); lvtt_value = APIC_LVT_TIMER_PERIODIC | LOCAL_TIMER_VECTOR; if (cpu_isset(cpu, timer_interrupt_broadcast_ipi_mask)) diff --git a/arch/x86_64/kernel/crash.c b/arch/x86_64/kernel/crash.c index 3525f884af82..95a7a2c13131 100644 --- a/arch/x86_64/kernel/crash.c +++ b/arch/x86_64/kernel/crash.c @@ -28,71 +28,6 @@ /* This keeps a track of which one is crashing cpu. */ static int crashing_cpu; -static u32 *append_elf_note(u32 *buf, char *name, unsigned type, - void *data, size_t data_len) -{ - struct elf_note note; - - note.n_namesz = strlen(name) + 1; - note.n_descsz = data_len; - note.n_type = type; - memcpy(buf, ¬e, sizeof(note)); - buf += (sizeof(note) +3)/4; - memcpy(buf, name, note.n_namesz); - buf += (note.n_namesz + 3)/4; - memcpy(buf, data, note.n_descsz); - buf += (note.n_descsz + 3)/4; - - return buf; -} - -static void final_note(u32 *buf) -{ - struct elf_note note; - - note.n_namesz = 0; - note.n_descsz = 0; - note.n_type = 0; - memcpy(buf, ¬e, sizeof(note)); -} - -static void crash_save_this_cpu(struct pt_regs *regs, int cpu) -{ - struct elf_prstatus prstatus; - u32 *buf; - - if ((cpu < 0) || (cpu >= NR_CPUS)) - return; - - /* Using ELF notes here is opportunistic. - * I need a well defined structure format - * for the data I pass, and I need tags - * on the data to indicate what information I have - * squirrelled away. ELF notes happen to provide - * all of that, no need to invent something new. - */ - - buf = (u32*)per_cpu_ptr(crash_notes, cpu); - - if (!buf) - return; - - memset(&prstatus, 0, sizeof(prstatus)); - prstatus.pr_pid = current->pid; - elf_core_copy_regs(&prstatus.pr_reg, regs); - buf = append_elf_note(buf, "CORE", NT_PRSTATUS, &prstatus, - sizeof(prstatus)); - final_note(buf); -} - -static void crash_save_self(struct pt_regs *regs) -{ - int cpu; - - cpu = smp_processor_id(); - crash_save_this_cpu(regs, cpu); -} - #ifdef CONFIG_SMP static atomic_t waiting_for_crash_ipi; @@ -117,7 +52,7 @@ static int crash_nmi_callback(struct notifier_block *self, return NOTIFY_STOP; local_irq_disable(); - crash_save_this_cpu(regs, cpu); + crash_save_cpu(regs, cpu); disable_local_APIC(); atomic_dec(&waiting_for_crash_ipi); /* Assume hlt works */ @@ -196,5 +131,5 @@ void machine_crash_shutdown(struct pt_regs *regs) disable_IO_APIC(); - crash_save_self(regs); + crash_save_cpu(regs, smp_processor_id()); } diff --git a/arch/x86_64/kernel/e820.c b/arch/x86_64/kernel/e820.c index a75c829c2b02..6fe191c58084 100644 --- a/arch/x86_64/kernel/e820.c +++ b/arch/x86_64/kernel/e820.c @@ -278,7 +278,7 @@ e820_register_active_regions(int nid, unsigned long start_pfn, >> PAGE_SHIFT; /* Skip map entries smaller than a page */ - if (ei_startpfn > ei_endpfn) + if (ei_startpfn >= ei_endpfn) continue; /* Check if end_pfn_map should be updated */ @@ -594,7 +594,9 @@ static int __init parse_memmap_opt(char *p) * size before original memory map is * reset. */ + e820_register_active_regions(0, 0, -1UL); saved_max_pfn = e820_end_of_ram(); + remove_all_active_ranges(); #endif end_pfn_map = 0; e820.nr_map = 0; diff --git a/arch/x86_64/kernel/early-quirks.c b/arch/x86_64/kernel/early-quirks.c index 2b1245d86258..829698f6d049 100644 --- a/arch/x86_64/kernel/early-quirks.c +++ b/arch/x86_64/kernel/early-quirks.c @@ -45,7 +45,13 @@ static void nvidia_bugs(void) /* * All timer overrides on Nvidia are * wrong unless HPET is enabled. + * Unfortunately that's not true on many Asus boards. + * We don't know yet how to detect this automatically, but + * at least allow a command line override. */ + if (acpi_use_timer_override) + return; + nvidia_hpet_detected = 0; acpi_table_parse(ACPI_HPET, nvidia_hpet_check); if (nvidia_hpet_detected == 0) { @@ -53,6 +59,8 @@ static void nvidia_bugs(void) printk(KERN_INFO "Nvidia board " "detected. Ignoring ACPI " "timer override.\n"); + printk(KERN_INFO "If you got timer trouble " + "try acpi_use_timer_override\n"); } #endif /* RED-PEN skip them on mptables too? */ @@ -61,11 +69,18 @@ static void nvidia_bugs(void) static void ati_bugs(void) { - if (timer_over_8254 == 1) { - timer_over_8254 = 0; - printk(KERN_INFO - "ATI board detected. Disabling timer routing over 8254.\n"); - } +} + +static void intel_bugs(void) +{ + u16 device = read_pci_config_16(0, 0, 0, PCI_DEVICE_ID); + +#ifdef CONFIG_SMP + if (device == PCI_DEVICE_ID_INTEL_E7320_MCH || + device == PCI_DEVICE_ID_INTEL_E7520_MCH || + device == PCI_DEVICE_ID_INTEL_E7525_MCH) + quirk_intel_irqbalance(); +#endif } struct chipset { @@ -77,6 +92,7 @@ static struct chipset early_qrk[] = { { PCI_VENDOR_ID_NVIDIA, nvidia_bugs }, { PCI_VENDOR_ID_VIA, via_bugs }, { PCI_VENDOR_ID_ATI, ati_bugs }, + { PCI_VENDOR_ID_INTEL, intel_bugs}, {} }; diff --git a/arch/x86_64/kernel/early_printk.c b/arch/x86_64/kernel/early_printk.c index e22ecd54870d..47b6d90349da 100644 --- a/arch/x86_64/kernel/early_printk.c +++ b/arch/x86_64/kernel/early_printk.c @@ -224,7 +224,7 @@ static int __init setup_early_printk(char *buf) return 0; early_console_initialized = 1; - if (!strcmp(buf,"keep")) + if (strstr(buf, "keep")) keep_early = 1; if (!strncmp(buf, "serial", 6)) { diff --git a/arch/x86_64/kernel/entry.S b/arch/x86_64/kernel/entry.S index 7d401b00d822..601d332c4b79 100644 --- a/arch/x86_64/kernel/entry.S +++ b/arch/x86_64/kernel/entry.S @@ -230,7 +230,6 @@ ENTRY(system_call) CFI_REL_OFFSET rip,RIP-ARGOFFSET GET_THREAD_INFO(%rcx) testl $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SECCOMP),threadinfo_flags(%rcx) - CFI_REMEMBER_STATE jnz tracesys cmpq $__NR_syscall_max,%rax ja badsys @@ -241,7 +240,6 @@ ENTRY(system_call) * Syscall return path ending with SYSRET (fast path) * Has incomplete stack frame and undefined top of stack. */ - .globl ret_from_sys_call ret_from_sys_call: movl $_TIF_ALLWORK_MASK,%edi /* edi: flagmask */ @@ -251,8 +249,8 @@ sysret_check: TRACE_IRQS_OFF movl threadinfo_flags(%rcx),%edx andl %edi,%edx - CFI_REMEMBER_STATE jnz sysret_careful + CFI_REMEMBER_STATE /* * sysretq will re-enable interrupts: */ @@ -265,10 +263,10 @@ sysret_check: swapgs sysretq + CFI_RESTORE_STATE /* Handle reschedules */ /* edx: work, edi: workmask */ sysret_careful: - CFI_RESTORE_STATE bt $TIF_NEED_RESCHED,%edx jnc sysret_signal TRACE_IRQS_ON @@ -306,7 +304,6 @@ badsys: /* Do syscall tracing */ tracesys: - CFI_RESTORE_STATE SAVE_REST movq $-ENOSYS,RAX(%rsp) FIXUP_TOP_OF_STACK %rdi @@ -322,32 +319,13 @@ tracesys: call *sys_call_table(,%rax,8) 1: movq %rax,RAX-ARGOFFSET(%rsp) /* Use IRET because user could have changed frame */ - jmp int_ret_from_sys_call - CFI_ENDPROC -END(system_call) /* * Syscall return path ending with IRET. * Has correct top of stack, but partial stack frame. - */ -ENTRY(int_ret_from_sys_call) - CFI_STARTPROC simple - CFI_SIGNAL_FRAME - CFI_DEF_CFA rsp,SS+8-ARGOFFSET - /*CFI_REL_OFFSET ss,SS-ARGOFFSET*/ - CFI_REL_OFFSET rsp,RSP-ARGOFFSET - /*CFI_REL_OFFSET rflags,EFLAGS-ARGOFFSET*/ - /*CFI_REL_OFFSET cs,CS-ARGOFFSET*/ - CFI_REL_OFFSET rip,RIP-ARGOFFSET - CFI_REL_OFFSET rdx,RDX-ARGOFFSET - CFI_REL_OFFSET rcx,RCX-ARGOFFSET - CFI_REL_OFFSET rax,RAX-ARGOFFSET - CFI_REL_OFFSET rdi,RDI-ARGOFFSET - CFI_REL_OFFSET rsi,RSI-ARGOFFSET - CFI_REL_OFFSET r8,R8-ARGOFFSET - CFI_REL_OFFSET r9,R9-ARGOFFSET - CFI_REL_OFFSET r10,R10-ARGOFFSET - CFI_REL_OFFSET r11,R11-ARGOFFSET + */ + .globl int_ret_from_sys_call +int_ret_from_sys_call: cli TRACE_IRQS_OFF testl $3,CS-ARGOFFSET(%rsp) @@ -394,8 +372,6 @@ int_very_careful: popq %rdi CFI_ADJUST_CFA_OFFSET -8 andl $~(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SINGLESTEP),%edi - cli - TRACE_IRQS_OFF jmp int_restore_rest int_signal: @@ -411,7 +387,7 @@ int_restore_rest: TRACE_IRQS_OFF jmp int_with_check CFI_ENDPROC -END(int_ret_from_sys_call) +END(system_call) /* * Certain special system calls that need to save a complete full stack frame. diff --git a/arch/x86_64/kernel/genapic.c b/arch/x86_64/kernel/genapic.c index 8e78a75d1866..b007433f96bb 100644 --- a/arch/x86_64/kernel/genapic.c +++ b/arch/x86_64/kernel/genapic.c @@ -33,7 +33,7 @@ extern struct genapic apic_flat; extern struct genapic apic_physflat; struct genapic *genapic = &apic_flat; - +struct genapic *genapic_force; /* * Check the APIC IDs in bios_cpu_apicid and choose the APIC mode. @@ -46,6 +46,13 @@ void __init clustered_apic_check(void) u8 cluster_cnt[NUM_APIC_CLUSTERS]; int max_apic = 0; + /* genapic selection can be forced because of certain quirks. + */ + if (genapic_force) { + genapic = genapic_force; + goto print; + } + #if defined(CONFIG_ACPI) /* * Some x86_64 machines use physical APIC mode regardless of how many diff --git a/arch/x86_64/kernel/head64.c b/arch/x86_64/kernel/head64.c index 9561eb3c5b5c..cc230b93cd1c 100644 --- a/arch/x86_64/kernel/head64.c +++ b/arch/x86_64/kernel/head64.c @@ -57,10 +57,12 @@ void __init x86_64_start_kernel(char * real_mode_data) { int i; - for (i = 0; i < 256; i++) + /* clear bss before set_intr_gate with early_idt_handler */ + clear_bss(); + + for (i = 0; i < IDT_ENTRIES; i++) set_intr_gate(i, early_idt_handler); asm volatile("lidt %0" :: "m" (idt_descr)); - clear_bss(); early_printk("Kernel alive\n"); diff --git a/arch/x86_64/kernel/i387.c b/arch/x86_64/kernel/i387.c index 3aa1e9bb781d..1d58c13bc6bc 100644 --- a/arch/x86_64/kernel/i387.c +++ b/arch/x86_64/kernel/i387.c @@ -82,11 +82,8 @@ int save_i387(struct _fpstate __user *buf) struct task_struct *tsk = current; int err = 0; - { - extern void bad_user_i387_struct(void); - if (sizeof(struct user_i387_struct) != sizeof(tsk->thread.i387.fxsave)) - bad_user_i387_struct(); - } + BUILD_BUG_ON(sizeof(struct user_i387_struct) != + sizeof(tsk->thread.i387.fxsave)); if ((unsigned long)buf % 16) printk("save_i387: bad fpstate %p\n",buf); diff --git a/arch/x86_64/kernel/i8259.c b/arch/x86_64/kernel/i8259.c index c4ef801b765b..d73c79e821f1 100644 --- a/arch/x86_64/kernel/i8259.c +++ b/arch/x86_64/kernel/i8259.c @@ -76,7 +76,8 @@ BUILD_16_IRQS(0xc) BUILD_16_IRQS(0xd) BUILD_16_IRQS(0xe) BUILD_16_IRQS(0xf) IRQ(x,8), IRQ(x,9), IRQ(x,a), IRQ(x,b), \ IRQ(x,c), IRQ(x,d), IRQ(x,e), IRQ(x,f) -void (*interrupt[NR_IRQS])(void) = { +/* for the irq vectors */ +static void (*interrupt[NR_VECTORS - FIRST_EXTERNAL_VECTOR])(void) = { IRQLIST_16(0x2), IRQLIST_16(0x3), IRQLIST_16(0x4), IRQLIST_16(0x5), IRQLIST_16(0x6), IRQLIST_16(0x7), IRQLIST_16(0x8), IRQLIST_16(0x9), IRQLIST_16(0xa), IRQLIST_16(0xb), diff --git a/arch/x86_64/kernel/io_apic.c b/arch/x86_64/kernel/io_apic.c index fe429e5d6b29..2a1dcd5f69c2 100644 --- a/arch/x86_64/kernel/io_apic.c +++ b/arch/x86_64/kernel/io_apic.c @@ -55,10 +55,6 @@ int sis_apic_bug; /* not actually supported, dummy for compile */ static int no_timer_check; -static int disable_timer_pin_1 __initdata; - -int timer_over_8254 __initdata = 1; - /* Where if anywhere is the i8259 connect in external int mode */ static struct { int pin, apic; } ioapic_i8259 = { -1, -1 }; @@ -88,6 +84,52 @@ static struct irq_pin_list { short apic, pin, next; } irq_2_pin[PIN_MAP_SIZE]; +struct io_apic { + unsigned int index; + unsigned int unused[3]; + unsigned int data; +}; + +static __attribute_const__ struct io_apic __iomem *io_apic_base(int idx) +{ + return (void __iomem *) __fix_to_virt(FIX_IO_APIC_BASE_0 + idx) + + (mp_ioapics[idx].mpc_apicaddr & ~PAGE_MASK); +} + +static inline unsigned int io_apic_read(unsigned int apic, unsigned int reg) +{ + struct io_apic __iomem *io_apic = io_apic_base(apic); + writel(reg, &io_apic->index); + return readl(&io_apic->data); +} + +static inline void io_apic_write(unsigned int apic, unsigned int reg, unsigned int value) +{ + struct io_apic __iomem *io_apic = io_apic_base(apic); + writel(reg, &io_apic->index); + writel(value, &io_apic->data); +} + +/* + * Re-write a value: to be used for read-modify-write + * cycles where the read already set up the index register. + */ +static inline void io_apic_modify(unsigned int apic, unsigned int value) +{ + struct io_apic __iomem *io_apic = io_apic_base(apic); + writel(value, &io_apic->data); +} + +/* + * Synchronize the IO-APIC and the CPU by doing + * a dummy read from the IO-APIC + */ +static inline void io_apic_sync(unsigned int apic) +{ + struct io_apic __iomem *io_apic = io_apic_base(apic); + readl(&io_apic->data); +} + #define __DO_ACTION(R, ACTION, FINAL) \ \ { \ @@ -126,11 +168,39 @@ static struct IO_APIC_route_entry ioapic_read_entry(int apic, int pin) return eu.entry; } -static void ioapic_write_entry(int apic, int pin, struct IO_APIC_route_entry e) +/* + * When we write a new IO APIC routing entry, we need to write the high + * word first! If the mask bit in the low word is clear, we will enable + * the interrupt, and we need to make sure the entry is fully populated + * before that happens. + */ +static void +__ioapic_write_entry(int apic, int pin, struct IO_APIC_route_entry e) { - unsigned long flags; union entry_union eu; eu.entry = e; + io_apic_write(apic, 0x11 + 2*pin, eu.w2); + io_apic_write(apic, 0x10 + 2*pin, eu.w1); +} + +static void ioapic_write_entry(int apic, int pin, struct IO_APIC_route_entry e) +{ + unsigned long flags; + spin_lock_irqsave(&ioapic_lock, flags); + __ioapic_write_entry(apic, pin, e); + spin_unlock_irqrestore(&ioapic_lock, flags); +} + +/* + * When we mask an IO APIC routing entry, we need to write the low + * word first, in order to set the mask bit before we change the + * high bits! + */ +static void ioapic_mask_entry(int apic, int pin) +{ + unsigned long flags; + union entry_union eu = { .entry.mask = 1 }; + spin_lock_irqsave(&ioapic_lock, flags); io_apic_write(apic, 0x10 + 2*pin, eu.w1); io_apic_write(apic, 0x11 + 2*pin, eu.w2); @@ -256,9 +326,7 @@ static void clear_IO_APIC_pin(unsigned int apic, unsigned int pin) /* * Disable it in the IO-APIC irq-routing table: */ - memset(&entry, 0, sizeof(entry)); - entry.mask = 1; - ioapic_write_entry(apic, pin, entry); + ioapic_mask_entry(apic, pin); } static void clear_IO_APIC (void) @@ -282,29 +350,6 @@ static int __init disable_ioapic_setup(char *str) } early_param("noapic", disable_ioapic_setup); -/* Actually the next is obsolete, but keep it for paranoid reasons -AK */ -static int __init disable_timer_pin_setup(char *arg) -{ - disable_timer_pin_1 = 1; - return 1; -} -__setup("disable_timer_pin_1", disable_timer_pin_setup); - -static int __init setup_disable_8254_timer(char *s) -{ - timer_over_8254 = -1; - return 1; -} -static int __init setup_enable_8254_timer(char *s) -{ - timer_over_8254 = 2; - return 1; -} - -__setup("disable_8254_timer", setup_disable_8254_timer); -__setup("enable_8254_timer", setup_enable_8254_timer); - - /* * Find the IRQ entry number of a certain pin. */ @@ -684,14 +729,28 @@ static int assign_irq_vector(int irq, cpumask_t mask, cpumask_t *result) return vector; } +static void __clear_irq_vector(int irq) +{ + cpumask_t mask; + int cpu, vector; + + BUG_ON(!irq_vector[irq]); + + vector = irq_vector[irq]; + cpus_and(mask, irq_domain[irq], cpu_online_map); + for_each_cpu_mask(cpu, mask) + per_cpu(vector_irq, cpu)[vector] = -1; + + irq_vector[irq] = 0; + irq_domain[irq] = CPU_MASK_NONE; +} + void __setup_vector_irq(int cpu) { /* Initialize vector_irq on a new cpu */ /* This function must be called with vector_lock held */ - unsigned long flags; int irq, vector; - /* Mark the inuse vectors */ for (irq = 0; irq < NR_IRQ_VECTORS; ++irq) { if (!cpu_isset(cpu, irq_domain[irq])) @@ -724,31 +783,71 @@ static void ioapic_register_intr(int irq, int vector, unsigned long trigger) trigger == IOAPIC_LEVEL) set_irq_chip_and_handler_name(irq, &ioapic_chip, handle_fasteoi_irq, "fasteoi"); - else + else { + irq_desc[irq].status |= IRQ_DELAYED_DISABLE; set_irq_chip_and_handler_name(irq, &ioapic_chip, handle_edge_irq, "edge"); + } } - -static void __init setup_IO_APIC_irqs(void) +static void __init setup_IO_APIC_irq(int apic, int pin, int idx, int irq) { struct IO_APIC_route_entry entry; - int apic, pin, idx, irq, first_notcon = 1, vector; + int vector; unsigned long flags; - apic_printk(APIC_VERBOSE, KERN_DEBUG "init IO_APIC IRQs\n"); - for (apic = 0; apic < nr_ioapics; apic++) { - for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) { + /* + * add it to the IO-APIC irq-routing table: + */ + memset(&entry,0,sizeof(entry)); - /* - * add it to the IO-APIC irq-routing table: - */ - memset(&entry,0,sizeof(entry)); + entry.delivery_mode = INT_DELIVERY_MODE; + entry.dest_mode = INT_DEST_MODE; + entry.mask = 0; /* enable IRQ */ + entry.dest.logical.logical_dest = cpu_mask_to_apicid(TARGET_CPUS); + + entry.trigger = irq_trigger(idx); + entry.polarity = irq_polarity(idx); - entry.delivery_mode = INT_DELIVERY_MODE; - entry.dest_mode = INT_DEST_MODE; - entry.mask = 0; /* enable IRQ */ + if (irq_trigger(idx)) { + entry.trigger = 1; + entry.mask = 1; entry.dest.logical.logical_dest = cpu_mask_to_apicid(TARGET_CPUS); + } + + if (!apic && !IO_APIC_IRQ(irq)) + return; + + if (IO_APIC_IRQ(irq)) { + cpumask_t mask; + vector = assign_irq_vector(irq, TARGET_CPUS, &mask); + if (vector < 0) + return; + + entry.dest.logical.logical_dest = cpu_mask_to_apicid(mask); + entry.vector = vector; + + ioapic_register_intr(irq, vector, IOAPIC_AUTO); + if (!apic && (irq < 16)) + disable_8259A_irq(irq); + } + + ioapic_write_entry(apic, pin, entry); + + spin_lock_irqsave(&ioapic_lock, flags); + set_native_irq_info(irq, TARGET_CPUS); + spin_unlock_irqrestore(&ioapic_lock, flags); + +} + +static void __init setup_IO_APIC_irqs(void) +{ + int apic, pin, idx, irq, first_notcon = 1; + + apic_printk(APIC_VERBOSE, KERN_DEBUG "init IO_APIC IRQs\n"); + + for (apic = 0; apic < nr_ioapics; apic++) { + for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) { idx = find_irq_entry(apic,pin,mp_INT); if (idx == -1) { @@ -760,39 +859,11 @@ static void __init setup_IO_APIC_irqs(void) continue; } - entry.trigger = irq_trigger(idx); - entry.polarity = irq_polarity(idx); - - if (irq_trigger(idx)) { - entry.trigger = 1; - entry.mask = 1; - entry.dest.logical.logical_dest = cpu_mask_to_apicid(TARGET_CPUS); - } - irq = pin_2_irq(idx, apic, pin); add_pin_to_irq(irq, apic, pin); - if (!apic && !IO_APIC_IRQ(irq)) - continue; - - if (IO_APIC_IRQ(irq)) { - cpumask_t mask; - vector = assign_irq_vector(irq, TARGET_CPUS, &mask); - if (vector < 0) - continue; - - entry.dest.logical.logical_dest = cpu_mask_to_apicid(mask); - entry.vector = vector; - - ioapic_register_intr(irq, vector, IOAPIC_AUTO); - if (!apic && (irq < 16)) - disable_8259A_irq(irq); - } - ioapic_write_entry(apic, pin, entry); + setup_IO_APIC_irq(apic, pin, idx, irq); - spin_lock_irqsave(&ioapic_lock, flags); - set_native_irq_info(irq, TARGET_CPUS); - spin_unlock_irqrestore(&ioapic_lock, flags); } } @@ -1497,10 +1568,33 @@ static inline void unlock_ExtINT_logic(void) * a wide range of boards and BIOS bugs. Fortunately only the timer IRQ * is so screwy. Thanks to Brian Perkins for testing/hacking this beast * fanatically on his truly buggy board. - * - * FIXME: really need to revamp this for modern platforms only. */ -static inline void check_timer(void) + +static int try_apic_pin(int apic, int pin, char *msg) +{ + apic_printk(APIC_VERBOSE, KERN_INFO + "..TIMER: trying IO-APIC=%d PIN=%d %s", + apic, pin, msg); + + /* + * Ok, does IRQ0 through the IOAPIC work? + */ + if (!no_timer_check && timer_irq_works()) { + nmi_watchdog_default(); + if (nmi_watchdog == NMI_IO_APIC) { + disable_8259A_irq(0); + setup_nmi(); + enable_8259A_irq(0); + } + return 1; + } + clear_IO_APIC_pin(apic, pin); + apic_printk(APIC_QUIET, KERN_ERR " .. failed\n"); + return 0; +} + +/* The function from hell */ +static void check_timer(void) { int apic1, pin1, apic2, pin2; int vector; @@ -1521,61 +1615,43 @@ static inline void check_timer(void) */ apic_write(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_EXTINT); init_8259A(1); - if (timer_over_8254 > 0) - enable_8259A_irq(0); pin1 = find_isa_irq_pin(0, mp_INT); apic1 = find_isa_irq_apic(0, mp_INT); pin2 = ioapic_i8259.pin; apic2 = ioapic_i8259.apic; - apic_printk(APIC_VERBOSE,KERN_INFO "..TIMER: vector=0x%02X apic1=%d pin1=%d apic2=%d pin2=%d\n", - vector, apic1, pin1, apic2, pin2); + /* Do this first, otherwise we get double interrupts on ATI boards */ + if ((pin1 != -1) && try_apic_pin(apic1, pin1,"with 8259 IRQ0 disabled")) + return; - if (pin1 != -1) { - /* - * Ok, does IRQ0 through the IOAPIC work? - */ - unmask_IO_APIC_irq(0); - if (!no_timer_check && timer_irq_works()) { - nmi_watchdog_default(); - if (nmi_watchdog == NMI_IO_APIC) { - disable_8259A_irq(0); - setup_nmi(); - enable_8259A_irq(0); - } - if (disable_timer_pin_1 > 0) - clear_IO_APIC_pin(0, pin1); - return; - } - clear_IO_APIC_pin(apic1, pin1); - apic_printk(APIC_QUIET,KERN_ERR "..MP-BIOS bug: 8254 timer not " - "connected to IO-APIC\n"); - } + /* Now try again with IRQ0 8259A enabled. + Assumes timer is on IO-APIC 0 ?!? */ + enable_8259A_irq(0); + unmask_IO_APIC_irq(0); + if (try_apic_pin(apic1, pin1, "with 8259 IRQ0 enabled")) + return; + disable_8259A_irq(0); + + /* Always try pin0 and pin2 on APIC 0 to handle buggy timer overrides + on Nvidia boards */ + if (!(apic1 == 0 && pin1 == 0) && + try_apic_pin(0, 0, "fallback with 8259 IRQ0 disabled")) + return; + if (!(apic1 == 0 && pin1 == 2) && + try_apic_pin(0, 2, "fallback with 8259 IRQ0 disabled")) + return; - apic_printk(APIC_VERBOSE,KERN_INFO "...trying to set up timer (IRQ0) " - "through the 8259A ... "); + /* Then try pure 8259A routing on the 8259 as reported by BIOS*/ + enable_8259A_irq(0); if (pin2 != -1) { - apic_printk(APIC_VERBOSE,"\n..... (found apic %d pin %d) ...", - apic2, pin2); - /* - * legacy devices should be connected to IO APIC #0 - */ setup_ExtINT_IRQ0_pin(apic2, pin2, vector); - if (timer_irq_works()) { - apic_printk(APIC_VERBOSE," works.\n"); - nmi_watchdog_default(); - if (nmi_watchdog == NMI_IO_APIC) { - setup_nmi(); - } + if (try_apic_pin(apic2,pin2,"8259A broadcast ExtINT from BIOS")) return; - } - /* - * Cleanup, just in case ... - */ - clear_IO_APIC_pin(apic2, pin2); } - apic_printk(APIC_VERBOSE," failed.\n"); + + /* Tried all possibilities to go through the IO-APIC. Now come the + really cheesy fallbacks. */ if (nmi_watchdog == NMI_IO_APIC) { printk(KERN_WARNING "timer doesn't work through the IO-APIC - disabling NMI Watchdog!\n"); @@ -1771,7 +1847,7 @@ void destroy_irq(unsigned int irq) dynamic_irq_cleanup(irq); spin_lock_irqsave(&vector_lock, flags); - irq_vector[irq] = 0; + __clear_irq_vector(irq); spin_unlock_irqrestore(&vector_lock, flags); } @@ -1889,18 +1965,16 @@ void arch_teardown_msi_irq(unsigned int irq) static void target_ht_irq(unsigned int irq, unsigned int dest, u8 vector) { - u32 low, high; - low = read_ht_irq_low(irq); - high = read_ht_irq_high(irq); + struct ht_irq_msg msg; + fetch_ht_irq_msg(irq, &msg); - low &= ~(HT_IRQ_LOW_VECTOR_MASK | HT_IRQ_LOW_DEST_ID_MASK); - high &= ~(HT_IRQ_HIGH_DEST_ID_MASK); + msg.address_lo &= ~(HT_IRQ_LOW_VECTOR_MASK | HT_IRQ_LOW_DEST_ID_MASK); + msg.address_hi &= ~(HT_IRQ_HIGH_DEST_ID_MASK); - low |= HT_IRQ_LOW_VECTOR(vector) | HT_IRQ_LOW_DEST_ID(dest); - high |= HT_IRQ_HIGH_DEST_ID(dest); + msg.address_lo |= HT_IRQ_LOW_VECTOR(vector) | HT_IRQ_LOW_DEST_ID(dest); + msg.address_hi |= HT_IRQ_HIGH_DEST_ID(dest); - write_ht_irq_low(irq, low); - write_ht_irq_high(irq, high); + write_ht_irq_msg(irq, &msg); } static void set_ht_irq_affinity(unsigned int irq, cpumask_t mask) @@ -1921,7 +1995,7 @@ static void set_ht_irq_affinity(unsigned int irq, cpumask_t mask) dest = cpu_mask_to_apicid(tmp); - target_ht_irq(irq, dest, vector & 0xff); + target_ht_irq(irq, dest, vector); set_native_irq_info(irq, mask); } #endif @@ -1944,14 +2018,15 @@ int arch_setup_ht_irq(unsigned int irq, struct pci_dev *dev) vector = assign_irq_vector(irq, TARGET_CPUS, &tmp); if (vector >= 0) { - u32 low, high; + struct ht_irq_msg msg; unsigned dest; dest = cpu_mask_to_apicid(tmp); - high = HT_IRQ_HIGH_DEST_ID(dest); + msg.address_hi = HT_IRQ_HIGH_DEST_ID(dest); - low = HT_IRQ_LOW_BASE | + msg.address_lo = + HT_IRQ_LOW_BASE | HT_IRQ_LOW_DEST_ID(dest) | HT_IRQ_LOW_VECTOR(vector) | ((INT_DEST_MODE == 0) ? @@ -1960,10 +2035,10 @@ int arch_setup_ht_irq(unsigned int irq, struct pci_dev *dev) HT_IRQ_LOW_RQEOI_EDGE | ((INT_DELIVERY_MODE != dest_LowestPrio) ? HT_IRQ_LOW_MT_FIXED : - HT_IRQ_LOW_MT_ARBITRATED); + HT_IRQ_LOW_MT_ARBITRATED) | + HT_IRQ_LOW_IRQ_MASKED; - write_ht_irq_low(irq, low); - write_ht_irq_high(irq, high); + write_ht_irq_msg(irq, &msg); set_irq_chip_and_handler_name(irq, &ht_irq_chip, handle_edge_irq, "edge"); @@ -2074,7 +2149,15 @@ void __init setup_ioapic_dest(void) if (irq_entry == -1) continue; irq = pin_2_irq(irq_entry, ioapic, pin); - set_ioapic_affinity_irq(irq, TARGET_CPUS); + + /* setup_IO_APIC_irqs could fail to get vector for some device + * when you have too many devices, because at that time only boot + * cpu is online. + */ + if(!irq_vector[irq]) + setup_IO_APIC_irq(ioapic, pin, irq_entry, irq); + else + set_ioapic_affinity_irq(irq, TARGET_CPUS); } } diff --git a/arch/x86_64/kernel/irq.c b/arch/x86_64/kernel/irq.c index e46c55856d40..0c06af6c13bc 100644 --- a/arch/x86_64/kernel/irq.c +++ b/arch/x86_64/kernel/irq.c @@ -120,7 +120,7 @@ asmlinkage unsigned int do_IRQ(struct pt_regs *regs) if (likely(irq < NR_IRQS)) generic_handle_irq(irq); - else + else if (printk_ratelimit()) printk(KERN_EMERG "%s: %d.%d No irq handler for vector\n", __func__, smp_processor_id(), vector); diff --git a/arch/x86_64/kernel/kprobes.c b/arch/x86_64/kernel/kprobes.c index ac241567e682..209c8c0bec71 100644 --- a/arch/x86_64/kernel/kprobes.c +++ b/arch/x86_64/kernel/kprobes.c @@ -224,7 +224,7 @@ void __kprobes arch_disarm_kprobe(struct kprobe *p) void __kprobes arch_remove_kprobe(struct kprobe *p) { mutex_lock(&kprobe_mutex); - free_insn_slot(p->ainsn.insn); + free_insn_slot(p->ainsn.insn, 0); mutex_unlock(&kprobe_mutex); } diff --git a/arch/x86_64/kernel/mce.c b/arch/x86_64/kernel/mce.c index bbea88801d88..ac085038af29 100644 --- a/arch/x86_64/kernel/mce.c +++ b/arch/x86_64/kernel/mce.c @@ -306,8 +306,8 @@ void mce_log_therm_throt_event(unsigned int cpu, __u64 status) */ static int check_interval = 5 * 60; /* 5 minutes */ -static void mcheck_timer(void *data); -static DECLARE_WORK(mcheck_work, mcheck_timer, NULL); +static void mcheck_timer(struct work_struct *work); +static DECLARE_DELAYED_WORK(mcheck_work, mcheck_timer); static void mcheck_check_cpu(void *info) { @@ -315,7 +315,7 @@ static void mcheck_check_cpu(void *info) do_machine_check(NULL, 0); } -static void mcheck_timer(void *data) +static void mcheck_timer(struct work_struct *work) { on_each_cpu(mcheck_check_cpu, NULL, 1, 1); schedule_delayed_work(&mcheck_work, check_interval * HZ); @@ -641,7 +641,6 @@ static __cpuinit int mce_create_device(unsigned int cpu) return err; } -#ifdef CONFIG_HOTPLUG_CPU static void mce_remove_device(unsigned int cpu) { int i; @@ -652,6 +651,7 @@ static void mce_remove_device(unsigned int cpu) sysdev_remove_file(&per_cpu(device_mce,cpu), &attr_tolerant); sysdev_remove_file(&per_cpu(device_mce,cpu), &attr_check_interval); sysdev_unregister(&per_cpu(device_mce,cpu)); + memset(&per_cpu(device_mce, cpu).kobj, 0, sizeof(struct kobject)); } /* Get notified when a cpu comes on/off. Be hotplug friendly. */ @@ -674,7 +674,6 @@ mce_cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu) static struct notifier_block mce_cpu_notifier = { .notifier_call = mce_cpu_callback, }; -#endif static __init int mce_init_device(void) { diff --git a/arch/x86_64/kernel/mce_amd.c b/arch/x86_64/kernel/mce_amd.c index 883fe747f64c..fa09debad4b7 100644 --- a/arch/x86_64/kernel/mce_amd.c +++ b/arch/x86_64/kernel/mce_amd.c @@ -551,7 +551,6 @@ out: return err; } -#ifdef CONFIG_HOTPLUG_CPU /* * let's be hotplug friendly. * in case of multiple core processors, the first core always takes ownership @@ -594,12 +593,14 @@ static void threshold_remove_bank(unsigned int cpu, int bank) sprintf(name, "threshold_bank%i", bank); +#ifdef CONFIG_SMP /* sibling symlink */ if (shared_bank[bank] && b->blocks->cpu != cpu) { sysfs_remove_link(&per_cpu(device_mce, cpu).kobj, name); per_cpu(threshold_banks, cpu)[bank] = NULL; return; } +#endif /* remove all sibling symlinks before unregistering */ for_each_cpu_mask(i, b->cpus) { @@ -656,7 +657,6 @@ static int threshold_cpu_callback(struct notifier_block *nfb, static struct notifier_block threshold_cpu_notifier = { .notifier_call = threshold_cpu_callback, }; -#endif /* CONFIG_HOTPLUG_CPU */ static __init int threshold_init_device(void) { diff --git a/arch/x86_64/kernel/mpparse.c b/arch/x86_64/kernel/mpparse.c index b147ab19fbd4..08072568847d 100644 --- a/arch/x86_64/kernel/mpparse.c +++ b/arch/x86_64/kernel/mpparse.c @@ -35,8 +35,6 @@ int smp_found_config; unsigned int __initdata maxcpus = NR_CPUS; -int acpi_found_madt; - /* * Various Linux-internal data structures created from the * MP-table. diff --git a/arch/x86_64/kernel/nmi.c b/arch/x86_64/kernel/nmi.c index 7af9cb3e2d99..27e95e7922c1 100644 --- a/arch/x86_64/kernel/nmi.c +++ b/arch/x86_64/kernel/nmi.c @@ -12,14 +12,15 @@ * Mikael Pettersson : PM converted to driver model. Disable/enable API. */ +#include <linux/nmi.h> #include <linux/mm.h> #include <linux/delay.h> #include <linux/interrupt.h> #include <linux/module.h> #include <linux/sysdev.h> -#include <linux/nmi.h> #include <linux/sysctl.h> #include <linux/kprobes.h> +#include <linux/cpumask.h> #include <asm/smp.h> #include <asm/nmi.h> @@ -41,6 +42,8 @@ int panic_on_unrecovered_nmi; static DEFINE_PER_CPU(unsigned, perfctr_nmi_owner); static DEFINE_PER_CPU(unsigned, evntsel_nmi_owner[2]); +static cpumask_t backtrace_mask = CPU_MASK_NONE; + /* this number is calculated from Intel's MSR_P4_CRU_ESCR5 register and it's * offset from MSR_P4_BSU_ESCR0. It will be the max for all platforms (for now) */ @@ -782,6 +785,7 @@ int __kprobes nmi_watchdog_tick(struct pt_regs * regs, unsigned reason) { int sum; int touched = 0; + int cpu = smp_processor_id(); struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk); u64 dummy; int rc=0; @@ -799,6 +803,16 @@ int __kprobes nmi_watchdog_tick(struct pt_regs * regs, unsigned reason) touched = 1; } + if (cpu_isset(cpu, backtrace_mask)) { + static DEFINE_SPINLOCK(lock); /* Serialise the printks */ + + spin_lock(&lock); + printk("NMI backtrace for cpu %d\n", cpu); + dump_stack(); + spin_unlock(&lock); + cpu_clear(cpu, backtrace_mask); + } + #ifdef CONFIG_X86_MCE /* Could check oops_in_progress here too, but it's safer not too */ @@ -931,6 +945,19 @@ int proc_nmi_enabled(struct ctl_table *table, int write, struct file *file, #endif +void __trigger_all_cpu_backtrace(void) +{ + int i; + + backtrace_mask = cpu_online_map; + /* Wait for up to 10 seconds for all CPUs to do the backtrace */ + for (i = 0; i < 10 * 1000; i++) { + if (cpus_empty(backtrace_mask)) + break; + mdelay(1); + } +} + EXPORT_SYMBOL(nmi_active); EXPORT_SYMBOL(nmi_watchdog); EXPORT_SYMBOL(avail_to_resrv_perfctr_nmi); diff --git a/arch/x86_64/kernel/pci-calgary.c b/arch/x86_64/kernel/pci-calgary.c index 37a770859e71..3215675ab128 100644 --- a/arch/x86_64/kernel/pci-calgary.c +++ b/arch/x86_64/kernel/pci-calgary.c @@ -41,6 +41,13 @@ #include <asm/pci-direct.h> #include <asm/system.h> #include <asm/dma.h> +#include <asm/rio.h> + +#ifdef CONFIG_CALGARY_IOMMU_ENABLED_BY_DEFAULT +int use_calgary __read_mostly = 1; +#else +int use_calgary __read_mostly = 0; +#endif /* CONFIG_CALGARY_DEFAULT_ENABLED */ #define PCI_DEVICE_ID_IBM_CALGARY 0x02a1 #define PCI_VENDOR_DEVICE_ID_CALGARY \ @@ -115,14 +122,35 @@ static const unsigned long phb_offsets[] = { 0xB000 /* PHB3 */ }; +/* PHB debug registers */ + +static const unsigned long phb_debug_offsets[] = { + 0x4000 /* PHB 0 DEBUG */, + 0x5000 /* PHB 1 DEBUG */, + 0x6000 /* PHB 2 DEBUG */, + 0x7000 /* PHB 3 DEBUG */ +}; + +/* + * STUFF register for each debug PHB, + * byte 1 = start bus number, byte 2 = end bus number + */ + +#define PHB_DEBUG_STUFF_OFFSET 0x0020 + unsigned int specified_table_size = TCE_TABLE_SIZE_UNSPECIFIED; static int translate_empty_slots __read_mostly = 0; static int calgary_detected __read_mostly = 0; +static struct rio_table_hdr *rio_table_hdr __initdata; +static struct scal_detail *scal_devs[MAX_NUMNODES] __initdata; +static struct rio_detail *rio_devs[MAX_NUMNODES * 4] __initdata; + struct calgary_bus_info { void *tce_space; unsigned char translation_disabled; signed char phbid; + void __iomem *bbar; }; static struct calgary_bus_info bus_info[MAX_PHB_BUS_NUM] = { { NULL, 0, 0 }, }; @@ -475,6 +503,11 @@ static struct dma_mapping_ops calgary_dma_ops = { .unmap_sg = calgary_unmap_sg, }; +static inline void __iomem * busno_to_bbar(unsigned char num) +{ + return bus_info[num].bbar; +} + static inline int busno_to_phbid(unsigned char num) { return bus_info[num].phbid; @@ -620,14 +653,9 @@ static void __init calgary_reserve_peripheral_mem_2(struct pci_dev *dev) static void __init calgary_reserve_regions(struct pci_dev *dev) { unsigned int npages; - void __iomem *bbar; - unsigned char busnum; u64 start; struct iommu_table *tbl = dev->sysdata; - bbar = tbl->bbar; - busnum = dev->bus->number; - /* reserve bad_dma_address in case it's a legal address */ iommu_range_reserve(tbl, bad_dma_address, 1); @@ -740,7 +768,7 @@ static void __init calgary_increase_split_completion_timeout(void __iomem *bbar, { u64 val64; void __iomem *target; - unsigned long phb_shift = -1; + unsigned int phb_shift = ~0; /* silence gcc */ u64 mask; switch (busno_to_phbid(busnum)) { @@ -828,33 +856,6 @@ static void __init calgary_disable_translation(struct pci_dev *dev) del_timer_sync(&tbl->watchdog_timer); } -static inline unsigned int __init locate_register_space(struct pci_dev *dev) -{ - int rionodeid; - u32 address; - - /* - * Each Calgary has four busses. The first four busses (first Calgary) - * have RIO node ID 2, then the next four (second Calgary) have RIO - * node ID 3, the next four (third Calgary) have node ID 2 again, etc. - * We use a gross hack - relying on the dev->bus->number ordering, - * modulo 14 - to decide which Calgary a given bus is on. Busses 0, 1, - * 2 and 4 are on the first Calgary (id 2), 6, 8, a and c are on the - * second (id 3), and then it repeats modulo 14. - */ - rionodeid = (dev->bus->number % 14 > 4) ? 3 : 2; - /* - * register space address calculation as follows: - * FE0MB-8MB*OneBasedChassisNumber+1MB*(RioNodeId-ChassisBase) - * ChassisBase is always zero for x366/x260/x460 - * RioNodeId is 2 for first Calgary, 3 for second Calgary - */ - address = START_ADDRESS - - (0x800000 * (ONE_BASED_CHASSIS_NUM + dev->bus->number / 14)) + - (0x100000) * (rionodeid - CHASSIS_BASE); - return address; -} - static void __init calgary_init_one_nontraslated(struct pci_dev *dev) { pci_dev_get(dev); @@ -864,23 +865,15 @@ static void __init calgary_init_one_nontraslated(struct pci_dev *dev) static int __init calgary_init_one(struct pci_dev *dev) { - u32 address; void __iomem *bbar; int ret; BUG_ON(dev->bus->number >= MAX_PHB_BUS_NUM); - address = locate_register_space(dev); - /* map entire 1MB of Calgary config space */ - bbar = ioremap_nocache(address, 1024 * 1024); - if (!bbar) { - ret = -ENODATA; - goto done; - } - + bbar = busno_to_bbar(dev->bus->number); ret = calgary_setup_tar(dev, bbar); if (ret) - goto iounmap; + goto done; pci_dev_get(dev); dev->bus->self = dev; @@ -888,17 +881,66 @@ static int __init calgary_init_one(struct pci_dev *dev) return 0; -iounmap: - iounmap(bbar); done: return ret; } +static int __init calgary_locate_bbars(void) +{ + int ret; + int rioidx, phb, bus; + void __iomem *bbar; + void __iomem *target; + unsigned long offset; + u8 start_bus, end_bus; + u32 val; + + ret = -ENODATA; + for (rioidx = 0; rioidx < rio_table_hdr->num_rio_dev; rioidx++) { + struct rio_detail *rio = rio_devs[rioidx]; + + if ((rio->type != COMPAT_CALGARY) && (rio->type != ALT_CALGARY)) + continue; + + /* map entire 1MB of Calgary config space */ + bbar = ioremap_nocache(rio->BBAR, 1024 * 1024); + if (!bbar) + goto error; + + for (phb = 0; phb < PHBS_PER_CALGARY; phb++) { + offset = phb_debug_offsets[phb] | PHB_DEBUG_STUFF_OFFSET; + target = calgary_reg(bbar, offset); + + val = be32_to_cpu(readl(target)); + start_bus = (u8)((val & 0x00FF0000) >> 16); + end_bus = (u8)((val & 0x0000FF00) >> 8); + for (bus = start_bus; bus <= end_bus; bus++) { + bus_info[bus].bbar = bbar; + bus_info[bus].phbid = phb; + } + } + } + + return 0; + +error: + /* scan bus_info and iounmap any bbars we previously ioremap'd */ + for (bus = 0; bus < ARRAY_SIZE(bus_info); bus++) + if (bus_info[bus].bbar) + iounmap(bus_info[bus].bbar); + + return ret; +} + static int __init calgary_init(void) { - int ret = -ENODEV; + int ret; struct pci_dev *dev = NULL; + ret = calgary_locate_bbars(); + if (ret) + return ret; + do { dev = pci_get_device(PCI_VENDOR_ID_IBM, PCI_DEVICE_ID_IBM_CALGARY, @@ -921,7 +963,7 @@ static int __init calgary_init(void) error: do { - dev = pci_find_device_reverse(PCI_VENDOR_ID_IBM, + dev = pci_get_device_reverse(PCI_VENDOR_ID_IBM, PCI_DEVICE_ID_IBM_CALGARY, dev); if (!dev) @@ -962,13 +1004,56 @@ static inline int __init determine_tce_table_size(u64 ram) return ret; } +static int __init build_detail_arrays(void) +{ + unsigned long ptr; + int i, scal_detail_size, rio_detail_size; + + if (rio_table_hdr->num_scal_dev > MAX_NUMNODES){ + printk(KERN_WARNING + "Calgary: MAX_NUMNODES too low! Defined as %d, " + "but system has %d nodes.\n", + MAX_NUMNODES, rio_table_hdr->num_scal_dev); + return -ENODEV; + } + + switch (rio_table_hdr->version){ + case 2: + scal_detail_size = 11; + rio_detail_size = 13; + break; + case 3: + scal_detail_size = 12; + rio_detail_size = 15; + break; + default: + printk(KERN_WARNING + "Calgary: Invalid Rio Grande Table Version: %d\n", + rio_table_hdr->version); + return -EPROTO; + } + + ptr = ((unsigned long)rio_table_hdr) + 3; + for (i = 0; i < rio_table_hdr->num_scal_dev; + i++, ptr += scal_detail_size) + scal_devs[i] = (struct scal_detail *)ptr; + + for (i = 0; i < rio_table_hdr->num_rio_dev; + i++, ptr += rio_detail_size) + rio_devs[i] = (struct rio_detail *)ptr; + + return 0; +} + void __init detect_calgary(void) { u32 val; int bus; void *tbl; int calgary_found = 0; - int phb = -1; + unsigned long ptr; + int offset; + int ret; /* * if the user specified iommu=off or iommu=soft or we found @@ -977,25 +1062,47 @@ void __init detect_calgary(void) if (swiotlb || no_iommu || iommu_detected) return; + if (!use_calgary) + return; + if (!early_pci_allowed()) return; + ptr = (unsigned long)phys_to_virt(get_bios_ebda()); + + rio_table_hdr = NULL; + offset = 0x180; + while (offset) { + /* The block id is stored in the 2nd word */ + if (*((unsigned short *)(ptr + offset + 2)) == 0x4752){ + /* set the pointer past the offset & block id */ + rio_table_hdr = (struct rio_table_hdr *)(ptr + offset + 4); + break; + } + /* The next offset is stored in the 1st word. 0 means no more */ + offset = *((unsigned short *)(ptr + offset)); + } + if (!rio_table_hdr) { + printk(KERN_ERR "Calgary: Unable to locate " + "Rio Grande Table in EBDA - bailing!\n"); + return; + } + + ret = build_detail_arrays(); + if (ret) { + printk(KERN_ERR "Calgary: build_detail_arrays ret %d\n", ret); + return; + } + specified_table_size = determine_tce_table_size(end_pfn * PAGE_SIZE); for (bus = 0; bus < MAX_PHB_BUS_NUM; bus++) { int dev; struct calgary_bus_info *info = &bus_info[bus]; - info->phbid = -1; if (read_pci_config(bus, 0, 0, 0) != PCI_VENDOR_DEVICE_ID_CALGARY) continue; - /* - * There are 4 PHBs per Calgary chip. Set phb to which phb (0-3) - * it is connected to releative to the clagary chip. - */ - phb = (phb + 1) % PHBS_PER_CALGARY; - if (info->translation_disabled) continue; @@ -1010,7 +1117,6 @@ void __init detect_calgary(void) if (!tbl) goto cleanup; info->tce_space = tbl; - info->phbid = phb; calgary_found = 1; break; } diff --git a/arch/x86_64/kernel/pci-dma.c b/arch/x86_64/kernel/pci-dma.c index f8d857453f8a..683b7a5c1ab3 100644 --- a/arch/x86_64/kernel/pci-dma.c +++ b/arch/x86_64/kernel/pci-dma.c @@ -296,6 +296,11 @@ __init int iommu_setup(char *p) gart_parse_options(p); #endif +#ifdef CONFIG_CALGARY_IOMMU + if (!strncmp(p, "calgary", 7)) + use_calgary = 1; +#endif /* CONFIG_CALGARY_IOMMU */ + p += strcspn(p, ","); if (*p == ',') ++p; diff --git a/arch/x86_64/kernel/pci-gart.c b/arch/x86_64/kernel/pci-gart.c index 16261a8a3303..fc1960f1f243 100644 --- a/arch/x86_64/kernel/pci-gart.c +++ b/arch/x86_64/kernel/pci-gart.c @@ -601,10 +601,9 @@ void __init gart_iommu_init(void) (!force_iommu && end_pfn <= MAX_DMA32_PFN) || !iommu_aperture || (no_agp && init_k8_gatt(&info) < 0)) { - printk(KERN_INFO "PCI-DMA: Disabling IOMMU.\n"); if (end_pfn > MAX_DMA32_PFN) { printk(KERN_ERR "WARNING more than 4GB of memory " - "but IOMMU not available.\n" + "but GART IOMMU not available.\n" KERN_ERR "WARNING 32bit PCI may malfunction.\n"); } return; diff --git a/arch/x86_64/kernel/process.c b/arch/x86_64/kernel/process.c index 49f7fac6229e..a418ee4c8c62 100644 --- a/arch/x86_64/kernel/process.c +++ b/arch/x86_64/kernel/process.c @@ -88,9 +88,8 @@ void enter_idle(void) static void __exit_idle(void) { - if (read_pda(isidle) == 0) + if (test_and_clear_bit_pda(0, isidle) == 0) return; - write_pda(isidle, 0); atomic_notifier_call_chain(&idle_notifier, IDLE_END, NULL); } @@ -109,17 +108,15 @@ void exit_idle(void) */ static void default_idle(void) { - local_irq_enable(); - current_thread_info()->status &= ~TS_POLLING; smp_mb__after_clear_bit(); - while (!need_resched()) { - local_irq_disable(); - if (!need_resched()) - safe_halt(); - else - local_irq_enable(); - } + local_irq_disable(); + if (!need_resched()) { + /* Enables interrupts one instruction before HLT. + x86 special cases this so there is no race. */ + safe_halt(); + } else + local_irq_enable(); current_thread_info()->status |= TS_POLLING; } @@ -131,21 +128,13 @@ static void default_idle(void) static void poll_idle (void) { local_irq_enable(); - - asm volatile( - "2:" - "testl %0,%1;" - "rep; nop;" - "je 2b;" - : : - "i" (_TIF_NEED_RESCHED), - "m" (current_thread_info()->flags)); + cpu_relax(); } void cpu_idle_wait(void) { unsigned int cpu, this_cpu = get_cpu(); - cpumask_t map; + cpumask_t map, tmp = current->cpus_allowed; set_cpus_allowed(current, cpumask_of_cpu(this_cpu)); put_cpu(); @@ -168,6 +157,8 @@ void cpu_idle_wait(void) } cpus_and(map, map, cpu_online_map); } while (!cpus_empty(map)); + + set_cpus_allowed(current, tmp); } EXPORT_SYMBOL_GPL(cpu_idle_wait); @@ -218,6 +209,12 @@ void cpu_idle (void) idle = default_idle; if (cpu_is_offline(smp_processor_id())) play_dead(); + /* + * Idle routines should keep interrupts disabled + * from here on, until they go to idle. + * Otherwise, idle callbacks can misfire. + */ + local_irq_disable(); enter_idle(); idle(); /* In many cases the interrupt that ended idle @@ -255,9 +252,16 @@ void mwait_idle_with_hints(unsigned long eax, unsigned long ecx) /* Default MONITOR/MWAIT with no hints, used for default C1 state */ static void mwait_idle(void) { - local_irq_enable(); - while (!need_resched()) - mwait_idle_with_hints(0,0); + if (!need_resched()) { + __monitor((void *)¤t_thread_info()->flags, 0, 0); + smp_mb(); + if (!need_resched()) + __sti_mwait(0, 0); + else + local_irq_enable(); + } else { + local_irq_enable(); + } } void __cpuinit select_idle_routine(const struct cpuinfo_x86 *c) diff --git a/arch/x86_64/kernel/setup.c b/arch/x86_64/kernel/setup.c index fc944b5e8f4a..af425a8049fb 100644 --- a/arch/x86_64/kernel/setup.c +++ b/arch/x86_64/kernel/setup.c @@ -471,8 +471,7 @@ void __init setup_arch(char **cmdline_p) if (LOADER_TYPE && INITRD_START) { if (INITRD_START + INITRD_SIZE <= (end_pfn << PAGE_SHIFT)) { reserve_bootmem_generic(INITRD_START, INITRD_SIZE); - initrd_start = - INITRD_START ? INITRD_START + PAGE_OFFSET : 0; + initrd_start = INITRD_START + PAGE_OFFSET; initrd_end = initrd_start+INITRD_SIZE; } else { @@ -732,11 +731,8 @@ static void __cpuinit init_amd(struct cpuinfo_x86 *c) /* Fix cpuid4 emulation for more */ num_cache_leaves = 3; - /* When there is only one core no need to synchronize RDTSC */ - if (num_possible_cpus() == 1) - set_bit(X86_FEATURE_SYNC_RDTSC, &c->x86_capability); - else - clear_bit(X86_FEATURE_SYNC_RDTSC, &c->x86_capability); + /* RDTSC can be speculated around */ + clear_bit(X86_FEATURE_SYNC_RDTSC, &c->x86_capability); } static void __cpuinit detect_ht(struct cpuinfo_x86 *c) @@ -835,6 +831,15 @@ static void __cpuinit init_intel(struct cpuinfo_x86 *c) set_bit(X86_FEATURE_ARCH_PERFMON, &c->x86_capability); } + if (cpu_has_ds) { + unsigned int l1, l2; + rdmsr(MSR_IA32_MISC_ENABLE, l1, l2); + if (!(l1 & (1<<11))) + set_bit(X86_FEATURE_BTS, c->x86_capability); + if (!(l1 & (1<<12))) + set_bit(X86_FEATURE_PEBS, c->x86_capability); + } + n = c->extended_cpuid_level; if (n >= 0x80000008) { unsigned eax = cpuid_eax(0x80000008); @@ -854,7 +859,10 @@ static void __cpuinit init_intel(struct cpuinfo_x86 *c) set_bit(X86_FEATURE_CONSTANT_TSC, &c->x86_capability); if (c->x86 == 6) set_bit(X86_FEATURE_REP_GOOD, &c->x86_capability); - set_bit(X86_FEATURE_SYNC_RDTSC, &c->x86_capability); + if (c->x86 == 15) + set_bit(X86_FEATURE_SYNC_RDTSC, &c->x86_capability); + else + clear_bit(X86_FEATURE_SYNC_RDTSC, &c->x86_capability); c->x86_max_cores = intel_num_cpu_cores(c); srat_detect_node(); diff --git a/arch/x86_64/kernel/smp.c b/arch/x86_64/kernel/smp.c index 4f67697f5036..af1ec4d23cf8 100644 --- a/arch/x86_64/kernel/smp.c +++ b/arch/x86_64/kernel/smp.c @@ -376,16 +376,20 @@ int smp_call_function_single (int cpu, void (*func) (void *info), void *info, /* prevent preemption and reschedule on another processor */ int me = get_cpu(); if (cpu == me) { - WARN_ON(1); put_cpu(); - return -EBUSY; + return 0; } + + /* Can deadlock when called with interrupts disabled */ + WARN_ON(irqs_disabled()); + spin_lock_bh(&call_lock); __smp_call_function_single(cpu, func, info, nonatomic, wait); spin_unlock_bh(&call_lock); put_cpu(); return 0; } +EXPORT_SYMBOL(smp_call_function_single); /* * this function sends a 'generic call function' IPI to all other CPUs diff --git a/arch/x86_64/kernel/smpboot.c b/arch/x86_64/kernel/smpboot.c index 62c2e747af58..daf19332f0dd 100644 --- a/arch/x86_64/kernel/smpboot.c +++ b/arch/x86_64/kernel/smpboot.c @@ -60,6 +60,7 @@ #include <asm/irq.h> #include <asm/hw_irq.h> #include <asm/numa.h> +#include <asm/genapic.h> /* Number of siblings per CPU package */ int smp_num_siblings = 1; @@ -753,14 +754,16 @@ static int __cpuinit wakeup_secondary_via_INIT(int phys_apicid, unsigned int sta } struct create_idle { + struct work_struct work; struct task_struct *idle; struct completion done; int cpu; }; -void do_fork_idle(void *_c_idle) +void do_fork_idle(struct work_struct *work) { - struct create_idle *c_idle = _c_idle; + struct create_idle *c_idle = + container_of(work, struct create_idle, work); c_idle->idle = fork_idle(c_idle->cpu); complete(&c_idle->done); @@ -775,10 +778,10 @@ static int __cpuinit do_boot_cpu(int cpu, int apicid) int timeout; unsigned long start_rip; struct create_idle c_idle = { + .work = __WORK_INITIALIZER(c_idle.work, do_fork_idle), .cpu = cpu, .done = COMPLETION_INITIALIZER_ONSTACK(c_idle.done), }; - DECLARE_WORK(work, do_fork_idle, &c_idle); /* allocate memory for gdts of secondary cpus. Hotplug is considered */ if (!cpu_gdt_descr[cpu].address && @@ -825,9 +828,9 @@ static int __cpuinit do_boot_cpu(int cpu, int apicid) * thread. */ if (!keventd_up() || current_is_keventd()) - work.func(work.data); + c_idle.work.func(&c_idle.work); else { - schedule_work(&work); + schedule_work(&c_idle.work); wait_for_completion(&c_idle.done); } @@ -1167,6 +1170,13 @@ int __cpuinit __cpu_up(unsigned int cpu) while (!cpu_isset(cpu, cpu_online_map)) cpu_relax(); + + if (num_online_cpus() > 8 && genapic == &apic_flat) { + printk(KERN_WARNING + "flat APIC routing can't be used with > 8 cpus\n"); + BUG(); + } + err = 0; return err; diff --git a/arch/x86_64/kernel/time.c b/arch/x86_64/kernel/time.c index 88722f11ca13..9f05bc9b2dad 100644 --- a/arch/x86_64/kernel/time.c +++ b/arch/x86_64/kernel/time.c @@ -563,7 +563,7 @@ static unsigned int cpufreq_delayed_issched = 0; static unsigned int cpufreq_init = 0; static struct work_struct cpufreq_delayed_get_work; -static void handle_cpufreq_delayed_get(void *v) +static void handle_cpufreq_delayed_get(struct work_struct *v) { unsigned int cpu; for_each_online_cpu(cpu) { @@ -639,7 +639,7 @@ static struct notifier_block time_cpufreq_notifier_block = { static int __init cpufreq_tsc(void) { - INIT_WORK(&cpufreq_delayed_get_work, handle_cpufreq_delayed_get, NULL); + INIT_WORK(&cpufreq_delayed_get_work, handle_cpufreq_delayed_get); if (!cpufreq_register_notifier(&time_cpufreq_notifier_block, CPUFREQ_TRANSITION_NOTIFIER)) cpufreq_init = 1; @@ -876,15 +876,6 @@ static struct irqaction irq0 = { timer_interrupt, IRQF_DISABLED, CPU_MASK_NONE, "timer", NULL, NULL }; -static int __cpuinit -time_cpu_notifier(struct notifier_block *nb, unsigned long action, void *hcpu) -{ - unsigned cpu = (unsigned long) hcpu; - if (action == CPU_ONLINE) - vsyscall_set_cpu(cpu); - return NOTIFY_DONE; -} - void __init time_init(void) { if (nohpet) @@ -925,8 +916,6 @@ void __init time_init(void) vxtime.last_tsc = get_cycles_sync(); set_cyc2ns_scale(cpu_khz); setup_irq(0, &irq0); - hotcpu_notifier(time_cpu_notifier, 0); - time_cpu_notifier(NULL, CPU_ONLINE, (void *)(long)smp_processor_id()); #ifndef CONFIG_SMP time_init_gtod(); diff --git a/arch/x86_64/kernel/traps.c b/arch/x86_64/kernel/traps.c index 7819022a8db5..a1641ffdffcf 100644 --- a/arch/x86_64/kernel/traps.c +++ b/arch/x86_64/kernel/traps.c @@ -30,9 +30,9 @@ #include <linux/kprobes.h> #include <linux/kexec.h> #include <linux/unwind.h> +#include <linux/uaccess.h> #include <asm/system.h> -#include <asm/uaccess.h> #include <asm/io.h> #include <asm/atomic.h> #include <asm/debugreg.h> @@ -108,7 +108,7 @@ static inline void preempt_conditional_cli(struct pt_regs *regs) preempt_enable_no_resched(); } -static int kstack_depth_to_print = 12; +int kstack_depth_to_print = 12; #ifdef CONFIG_STACK_UNWIND static int call_trace = 1; #else @@ -225,16 +225,25 @@ static int dump_trace_unwind(struct unwind_frame_info *info, void *context) { struct ops_and_data *oad = (struct ops_and_data *)context; int n = 0; + unsigned long sp = UNW_SP(info); + if (arch_unw_user_mode(info)) + return -1; while (unwind(info) == 0 && UNW_PC(info)) { n++; oad->ops->address(oad->data, UNW_PC(info)); if (arch_unw_user_mode(info)) break; + if ((sp & ~(PAGE_SIZE - 1)) == (UNW_SP(info) & ~(PAGE_SIZE - 1)) + && sp > UNW_SP(info)) + break; + sp = UNW_SP(info); } return n; } +#define MSG(txt) ops->warning(data, txt) + /* * x86-64 can have upto three kernel stacks: * process stack @@ -242,12 +251,20 @@ static int dump_trace_unwind(struct unwind_frame_info *info, void *context) * severe exception (double fault, nmi, stack fault, debug, mce) hardware stack */ -void dump_trace(struct task_struct *tsk, struct pt_regs *regs, unsigned long * stack, +static inline int valid_stack_ptr(struct thread_info *tinfo, void *p) +{ + void *t = (void *)tinfo; + return p > t && p < t + THREAD_SIZE - 3; +} + +void dump_trace(struct task_struct *tsk, struct pt_regs *regs, + unsigned long *stack, struct stacktrace_ops *ops, void *data) { - const unsigned cpu = smp_processor_id(); - unsigned long *irqstack_end = (unsigned long *)cpu_pda(cpu)->irqstackptr; + const unsigned cpu = get_cpu(); + unsigned long *irqstack_end = (unsigned long*)cpu_pda(cpu)->irqstackptr; unsigned used = 0; + struct thread_info *tinfo; if (!tsk) tsk = current; @@ -261,28 +278,30 @@ void dump_trace(struct task_struct *tsk, struct pt_regs *regs, unsigned long * s if (unwind_init_frame_info(&info, tsk, regs) == 0) unw_ret = dump_trace_unwind(&info, &oad); } else if (tsk == current) - unw_ret = unwind_init_running(&info, dump_trace_unwind, &oad); + unw_ret = unwind_init_running(&info, dump_trace_unwind, + &oad); else { if (unwind_init_blocked(&info, tsk) == 0) unw_ret = dump_trace_unwind(&info, &oad); } if (unw_ret > 0) { if (call_trace == 1 && !arch_unw_user_mode(&info)) { - ops->warning_symbol(data, "DWARF2 unwinder stuck at %s\n", + ops->warning_symbol(data, + "DWARF2 unwinder stuck at %s", UNW_PC(&info)); if ((long)UNW_SP(&info) < 0) { - ops->warning(data, "Leftover inexact backtrace:\n"); + MSG("Leftover inexact backtrace:"); stack = (unsigned long *)UNW_SP(&info); if (!stack) - return; + goto out; } else - ops->warning(data, "Full inexact backtrace again:\n"); + MSG("Full inexact backtrace again:"); } else if (call_trace >= 1) - return; + goto out; else - ops->warning(data, "Full inexact backtrace again:\n"); + MSG("Full inexact backtrace again:"); } else - ops->warning(data, "Inexact backtrace:\n"); + MSG("Inexact backtrace:"); } if (!stack) { unsigned long dummy; @@ -299,9 +318,9 @@ void dump_trace(struct task_struct *tsk, struct pt_regs *regs, unsigned long * s #define HANDLE_STACK(cond) \ do while (cond) { \ unsigned long addr = *stack++; \ - if (oops_in_progress ? \ - __kernel_text_address(addr) : \ - kernel_text_address(addr)) { \ + /* Use unlocked access here because except for NMIs \ + we should be already protected against module unloads */ \ + if (__kernel_text_address(addr)) { \ /* \ * If the address is either in the text segment of the \ * kernel, or in the region which contains vmalloc'ed \ @@ -364,8 +383,11 @@ void dump_trace(struct task_struct *tsk, struct pt_regs *regs, unsigned long * s /* * This handles the process stack: */ - HANDLE_STACK (((long) stack & (THREAD_SIZE-1)) != 0); + tinfo = current_thread_info(); + HANDLE_STACK (valid_stack_ptr(tinfo, stack)); #undef HANDLE_STACK +out: + put_cpu(); } EXPORT_SYMBOL(dump_trace); @@ -772,8 +794,7 @@ mem_parity_error(unsigned char reason, struct pt_regs * regs) { printk(KERN_EMERG "Uhhuh. NMI received for unknown reason %02x.\n", reason); - printk(KERN_EMERG "You probably have a hardware problem with your " - "RAM chips\n"); + printk(KERN_EMERG "You have some hardware problem, likely on the PCI bus.\n"); if (panic_on_unrecovered_nmi) panic("NMI: Not continuing"); diff --git a/arch/x86_64/kernel/vmlinux.lds.S b/arch/x86_64/kernel/vmlinux.lds.S index edb24aa714b4..6a1f8f491e5d 100644 --- a/arch/x86_64/kernel/vmlinux.lds.S +++ b/arch/x86_64/kernel/vmlinux.lds.S @@ -51,15 +51,7 @@ SECTIONS RODATA -#ifdef CONFIG_STACK_UNWIND - . = ALIGN(8); - .eh_frame : AT(ADDR(.eh_frame) - LOAD_OFFSET) { - __start_unwind = .; - *(.eh_frame) - __end_unwind = .; - } -#endif - + . = ALIGN(PAGE_SIZE); /* Align data segment to page size boundary */ /* Data */ .data : AT(ADDR(.data) - LOAD_OFFSET) { *(.data) diff --git a/arch/x86_64/kernel/vsyscall.c b/arch/x86_64/kernel/vsyscall.c index a98b460af6a1..4a673f5397a0 100644 --- a/arch/x86_64/kernel/vsyscall.c +++ b/arch/x86_64/kernel/vsyscall.c @@ -27,6 +27,9 @@ #include <linux/jiffies.h> #include <linux/sysctl.h> #include <linux/getcpu.h> +#include <linux/cpu.h> +#include <linux/smp.h> +#include <linux/notifier.h> #include <asm/vsyscall.h> #include <asm/pgtable.h> @@ -39,6 +42,7 @@ #include <asm/topology.h> #define __vsyscall(nr) __attribute__ ((unused,__section__(".vsyscall_" #nr))) +#define __syscall_clobber "r11","rcx","memory" int __sysctl_vsyscall __section_sysctl_vsyscall = 1; seqlock_t __xtime_lock __section_xtime_lock = SEQLOCK_UNLOCKED; @@ -243,32 +247,17 @@ static ctl_table kernel_root_table2[] = { #endif -static void __cpuinit write_rdtscp_cb(void *info) -{ - write_rdtscp_aux((unsigned long)info); -} - -void __cpuinit vsyscall_set_cpu(int cpu) +/* Assume __initcall executes before all user space. Hopefully kmod + doesn't violate that. We'll find out if it does. */ +static void __cpuinit vsyscall_set_cpu(int cpu) { unsigned long *d; unsigned long node = 0; #ifdef CONFIG_NUMA node = cpu_to_node[cpu]; #endif - if (cpu_has(&cpu_data[cpu], X86_FEATURE_RDTSCP)) { - void *info = (void *)((node << 12) | cpu); - /* Can happen on preemptive kernel */ - if (get_cpu() == cpu) - write_rdtscp_cb(info); -#ifdef CONFIG_SMP - else { - /* the notifier is unfortunately not executed on the - target CPU */ - smp_call_function_single(cpu,write_rdtscp_cb,info,0,1); - } -#endif - put_cpu(); - } + if (cpu_has(&cpu_data[cpu], X86_FEATURE_RDTSCP)) + write_rdtscp_aux((node << 12) | cpu); /* Store cpu number in limit so that it can be loaded quickly in user space in vgetcpu. @@ -280,11 +269,27 @@ void __cpuinit vsyscall_set_cpu(int cpu) *d |= (node >> 4) << 48; } +static void __cpuinit cpu_vsyscall_init(void *arg) +{ + /* preemption should be already off */ + vsyscall_set_cpu(raw_smp_processor_id()); +} + +static int __cpuinit +cpu_vsyscall_notifier(struct notifier_block *n, unsigned long action, void *arg) +{ + long cpu = (long)arg; + if (action == CPU_ONLINE) + smp_call_function_single(cpu, cpu_vsyscall_init, NULL, 0, 1); + return NOTIFY_DONE; +} + static void __init map_vsyscall(void) { extern char __vsyscall_0; unsigned long physaddr_page0 = __pa_symbol(&__vsyscall_0); + /* Note that VSYSCALL_MAPPED_PAGES must agree with the code below. */ __set_fixmap(VSYSCALL_FIRST_PAGE, physaddr_page0, PAGE_KERNEL_VSYSCALL); } @@ -299,6 +304,8 @@ static int __init vsyscall_init(void) #ifdef CONFIG_SYSCTL register_sysctl_table(kernel_root_table2, 0); #endif + on_each_cpu(cpu_vsyscall_init, NULL, 0, 1); + hotcpu_notifier(cpu_vsyscall_notifier, 0); return 0; } diff --git a/arch/x86_64/lib/csum-partial.c b/arch/x86_64/lib/csum-partial.c index c493735218da..bc503f506903 100644 --- a/arch/x86_64/lib/csum-partial.c +++ b/arch/x86_64/lib/csum-partial.c @@ -9,8 +9,6 @@ #include <linux/module.h> #include <asm/checksum.h> -#define __force_inline inline __attribute__((always_inline)) - static inline unsigned short from32to16(unsigned a) { unsigned short b = a >> 16; @@ -33,7 +31,7 @@ static inline unsigned short from32to16(unsigned a) * Unrolling to an 128 bytes inner loop. * Using interleaving with more registers to break the carry chains. */ -static __force_inline unsigned do_csum(const unsigned char *buff, unsigned len) +static unsigned do_csum(const unsigned char *buff, unsigned len) { unsigned odd, count; unsigned long result = 0; @@ -132,9 +130,10 @@ static __force_inline unsigned do_csum(const unsigned char *buff, unsigned len) * * it's best to have buff aligned on a 64-bit boundary */ -unsigned csum_partial(const unsigned char *buff, unsigned len, unsigned sum) +__wsum csum_partial(const void *buff, int len, __wsum sum) { - return add32_with_carry(do_csum(buff, len), sum); + return (__force __wsum)add32_with_carry(do_csum(buff, len), + (__force u32)sum); } EXPORT_SYMBOL(csum_partial); @@ -143,7 +142,7 @@ EXPORT_SYMBOL(csum_partial); * this routine is used for miscellaneous IP-like checksums, mainly * in icmp.c */ -unsigned short ip_compute_csum(unsigned char * buff, int len) +__sum16 ip_compute_csum(const void *buff, int len) { return csum_fold(csum_partial(buff,len,0)); } diff --git a/arch/x86_64/lib/csum-wrappers.c b/arch/x86_64/lib/csum-wrappers.c index b1320ec58428..fd42a4a095fc 100644 --- a/arch/x86_64/lib/csum-wrappers.c +++ b/arch/x86_64/lib/csum-wrappers.c @@ -18,9 +18,9 @@ * Returns an 32bit unfolded checksum of the buffer. * src and dst are best aligned to 64bits. */ -unsigned int -csum_partial_copy_from_user(const unsigned char __user *src, unsigned char *dst, - int len, unsigned int isum, int *errp) +__wsum +csum_partial_copy_from_user(const void __user *src, void *dst, + int len, __wsum isum, int *errp) { might_sleep(); *errp = 0; @@ -34,17 +34,19 @@ csum_partial_copy_from_user(const unsigned char __user *src, unsigned char *dst, if (unlikely((unsigned long)src & 6)) { while (((unsigned long)src & 6) && len >= 2) { __u16 val16; - *errp = __get_user(val16, (__u16 __user *)src); + *errp = __get_user(val16, (const __u16 __user *)src); if (*errp) return isum; *(__u16 *)dst = val16; - isum = add32_with_carry(isum, val16); + isum = (__force __wsum)add32_with_carry( + (__force unsigned)isum, val16); src += 2; dst += 2; len -= 2; } } - isum = csum_partial_copy_generic((__force void *)src,dst,len,isum,errp,NULL); + isum = csum_partial_copy_generic((__force const void *)src, + dst, len, isum, errp, NULL); if (likely(*errp == 0)) return isum; } @@ -66,9 +68,9 @@ EXPORT_SYMBOL(csum_partial_copy_from_user); * Returns an 32bit unfolded checksum of the buffer. * src and dst are best aligned to 64bits. */ -unsigned int -csum_partial_copy_to_user(unsigned const char *src, unsigned char __user *dst, - int len, unsigned int isum, int *errp) +__wsum +csum_partial_copy_to_user(const void *src, void __user *dst, + int len, __wsum isum, int *errp) { might_sleep(); if (unlikely(!access_ok(VERIFY_WRITE, dst, len))) { @@ -79,7 +81,8 @@ csum_partial_copy_to_user(unsigned const char *src, unsigned char __user *dst, if (unlikely((unsigned long)dst & 6)) { while (((unsigned long)dst & 6) && len >= 2) { __u16 val16 = *(__u16 *)src; - isum = add32_with_carry(isum, val16); + isum = (__force __wsum)add32_with_carry( + (__force unsigned)isum, val16); *errp = __put_user(val16, (__u16 __user *)dst); if (*errp) return isum; @@ -104,19 +107,21 @@ EXPORT_SYMBOL(csum_partial_copy_to_user); * * Returns an 32bit unfolded checksum of the buffer. */ -unsigned int -csum_partial_copy_nocheck(const unsigned char *src, unsigned char *dst, int len, unsigned int sum) +__wsum +csum_partial_copy_nocheck(const void *src, void *dst, int len, __wsum sum) { return csum_partial_copy_generic(src,dst,len,sum,NULL,NULL); } EXPORT_SYMBOL(csum_partial_copy_nocheck); -unsigned short csum_ipv6_magic(struct in6_addr *saddr, struct in6_addr *daddr, - __u32 len, unsigned short proto, unsigned int sum) +__sum16 csum_ipv6_magic(const struct in6_addr *saddr, + const struct in6_addr *daddr, + __u32 len, unsigned short proto, __wsum sum) { __u64 rest, sum64; - rest = (__u64)htonl(len) + (__u64)htons(proto) + (__u64)sum; + rest = (__force __u64)htonl(len) + (__force __u64)htons(proto) + + (__force __u64)sum; asm(" addq (%[saddr]),%[sum]\n" " adcq 8(%[saddr]),%[sum]\n" " adcq (%[daddr]),%[sum]\n" @@ -124,7 +129,7 @@ unsigned short csum_ipv6_magic(struct in6_addr *saddr, struct in6_addr *daddr, " adcq $0,%[sum]\n" : [sum] "=r" (sum64) : "[sum]" (rest),[saddr] "r" (saddr), [daddr] "r" (daddr)); - return csum_fold(add32_with_carry(sum64 & 0xffffffff, sum64>>32)); + return csum_fold((__force __wsum)add32_with_carry(sum64 & 0xffffffff, sum64>>32)); } EXPORT_SYMBOL(csum_ipv6_magic); diff --git a/arch/x86_64/lib/delay.c b/arch/x86_64/lib/delay.c index 50be90975d04..2dbebd308347 100644 --- a/arch/x86_64/lib/delay.c +++ b/arch/x86_64/lib/delay.c @@ -40,13 +40,13 @@ EXPORT_SYMBOL(__delay); inline void __const_udelay(unsigned long xloops) { - __delay((xloops * HZ * cpu_data[raw_smp_processor_id()].loops_per_jiffy) >> 32); + __delay(((xloops * HZ * cpu_data[raw_smp_processor_id()].loops_per_jiffy) >> 32) + 1); } EXPORT_SYMBOL(__const_udelay); void __udelay(unsigned long usecs) { - __const_udelay(usecs * 0x000010c6); /* 2**32 / 1000000 */ + __const_udelay(usecs * 0x000010c7); /* 2**32 / 1000000 (rounded up) */ } EXPORT_SYMBOL(__udelay); diff --git a/arch/x86_64/mm/fault.c b/arch/x86_64/mm/fault.c index 3751b4788e28..a65fc6f1dcaf 100644 --- a/arch/x86_64/mm/fault.c +++ b/arch/x86_64/mm/fault.c @@ -23,9 +23,9 @@ #include <linux/compiler.h> #include <linux/module.h> #include <linux/kprobes.h> +#include <linux/uaccess.h> #include <asm/system.h> -#include <asm/uaccess.h> #include <asm/pgalloc.h> #include <asm/smp.h> #include <asm/tlbflush.h> @@ -96,7 +96,7 @@ void bust_spinlocks(int yes) static noinline int is_prefetch(struct pt_regs *regs, unsigned long addr, unsigned long error_code) { - unsigned char __user *instr; + unsigned char *instr; int scan_more = 1; int prefetch = 0; unsigned char *max_instr; @@ -116,7 +116,7 @@ static noinline int is_prefetch(struct pt_regs *regs, unsigned long addr, unsigned char instr_hi; unsigned char instr_lo; - if (__get_user(opcode, (char __user *)instr)) + if (probe_kernel_address(instr, opcode)) break; instr_hi = opcode & 0xf0; @@ -154,7 +154,7 @@ static noinline int is_prefetch(struct pt_regs *regs, unsigned long addr, case 0x00: /* Prefetch instruction is 0x0F0D or 0x0F18 */ scan_more = 0; - if (__get_user(opcode, (char __user *)instr)) + if (probe_kernel_address(instr, opcode)) break; prefetch = (instr_lo == 0xF) && (opcode == 0x0D || opcode == 0x18); @@ -170,7 +170,7 @@ static noinline int is_prefetch(struct pt_regs *regs, unsigned long addr, static int bad_address(void *p) { unsigned long dummy; - return __get_user(dummy, (unsigned long __user *)p); + return probe_kernel_address((unsigned long *)p, dummy); } void dump_pagetable(unsigned long address) diff --git a/arch/x86_64/mm/init.c b/arch/x86_64/mm/init.c index 971dc1181e69..2968b90ef8ad 100644 --- a/arch/x86_64/mm/init.c +++ b/arch/x86_64/mm/init.c @@ -496,7 +496,7 @@ int remove_memory(u64 start, u64 size) } EXPORT_SYMBOL_GPL(remove_memory); -#ifndef CONFIG_ACPI_NUMA +#if !defined(CONFIG_ACPI_NUMA) && defined(CONFIG_NUMA) int memory_add_physaddr_to_nid(u64 start) { return 0; @@ -504,13 +504,6 @@ int memory_add_physaddr_to_nid(u64 start) EXPORT_SYMBOL_GPL(memory_add_physaddr_to_nid); #endif -#ifndef CONFIG_ACPI_NUMA -int memory_add_physaddr_to_nid(u64 start) -{ - return 0; -} -#endif - #endif /* CONFIG_MEMORY_HOTPLUG */ #ifdef CONFIG_MEMORY_HOTPLUG_RESERVE @@ -655,9 +648,22 @@ void free_initrd_mem(unsigned long start, unsigned long end) void __init reserve_bootmem_generic(unsigned long phys, unsigned len) { - /* Should check here against the e820 map to avoid double free */ #ifdef CONFIG_NUMA int nid = phys_to_nid(phys); +#endif + unsigned long pfn = phys >> PAGE_SHIFT; + if (pfn >= end_pfn) { + /* This can happen with kdump kernels when accessing firmware + tables. */ + if (pfn < end_pfn_map) + return; + printk(KERN_ERR "reserve_bootmem: illegal reserve %lx %u\n", + phys, len); + return; + } + + /* Should check here against the e820 map to avoid double free */ +#ifdef CONFIG_NUMA reserve_bootmem_node(NODE_DATA(nid), phys, len); #else reserve_bootmem(phys, len); @@ -724,14 +730,15 @@ static __init int x8664_sysctl_init(void) __initcall(x8664_sysctl_init); #endif -/* A pseudo VMAs to allow ptrace access for the vsyscall page. This only +/* A pseudo VMA to allow ptrace access for the vsyscall page. This only covers the 64bit vsyscall page now. 32bit has a real VMA now and does not need special handling anymore. */ static struct vm_area_struct gate_vma = { .vm_start = VSYSCALL_START, - .vm_end = VSYSCALL_END, - .vm_page_prot = PAGE_READONLY + .vm_end = VSYSCALL_START + (VSYSCALL_MAPPED_PAGES << PAGE_SHIFT), + .vm_page_prot = PAGE_READONLY_EXEC, + .vm_flags = VM_READ | VM_EXEC }; struct vm_area_struct *get_gate_vma(struct task_struct *tsk) diff --git a/arch/x86_64/mm/pageattr.c b/arch/x86_64/mm/pageattr.c index 3e231d762aaa..ccb91dd996a9 100644 --- a/arch/x86_64/mm/pageattr.c +++ b/arch/x86_64/mm/pageattr.c @@ -61,34 +61,40 @@ static struct page *split_large_page(unsigned long address, pgprot_t prot, return base; } - -static void flush_kernel_map(void *address) +static void cache_flush_page(void *adr) { - if (0 && address && cpu_has_clflush) { - /* is this worth it? */ - int i; - for (i = 0; i < PAGE_SIZE; i += boot_cpu_data.x86_clflush_size) - asm volatile("clflush (%0)" :: "r" (address + i)); - } else - asm volatile("wbinvd":::"memory"); - if (address) - __flush_tlb_one(address); - else - __flush_tlb_all(); + int i; + for (i = 0; i < PAGE_SIZE; i += boot_cpu_data.x86_clflush_size) + asm volatile("clflush (%0)" :: "r" (adr + i)); } +static void flush_kernel_map(void *arg) +{ + struct list_head *l = (struct list_head *)arg; + struct page *pg; + + /* When clflush is available always use it because it is + much cheaper than WBINVD */ + if (!cpu_has_clflush) + asm volatile("wbinvd" ::: "memory"); + list_for_each_entry(pg, l, lru) { + void *adr = page_address(pg); + if (cpu_has_clflush) + cache_flush_page(adr); + __flush_tlb_one(adr); + } +} -static inline void flush_map(unsigned long address) +static inline void flush_map(struct list_head *l) { - on_each_cpu(flush_kernel_map, (void *)address, 1, 1); + on_each_cpu(flush_kernel_map, l, 1, 1); } -static struct page *deferred_pages; /* protected by init_mm.mmap_sem */ +static LIST_HEAD(deferred_pages); /* protected by init_mm.mmap_sem */ static inline void save_page(struct page *fpage) { - fpage->lru.next = (struct list_head *)deferred_pages; - deferred_pages = fpage; + list_add(&fpage->lru, &deferred_pages); } /* @@ -207,18 +213,18 @@ int change_page_attr(struct page *page, int numpages, pgprot_t prot) void global_flush_tlb(void) { - struct page *dpage; + struct page *pg, *next; + struct list_head l; down_read(&init_mm.mmap_sem); - dpage = xchg(&deferred_pages, NULL); + list_replace_init(&deferred_pages, &l); up_read(&init_mm.mmap_sem); - flush_map((dpage && !dpage->lru.next) ? (unsigned long)page_address(dpage) : 0); - while (dpage) { - struct page *tmp = dpage; - dpage = (struct page *)dpage->lru.next; - ClearPagePrivate(tmp); - __free_page(tmp); + flush_map(&l); + + list_for_each_entry_safe(pg, next, &l, lru) { + ClearPagePrivate(pg); + __free_page(pg); } } diff --git a/arch/x86_64/pci/mmconfig.c b/arch/x86_64/pci/mmconfig.c index e61093b34c26..f8b6b2800a62 100644 --- a/arch/x86_64/pci/mmconfig.c +++ b/arch/x86_64/pci/mmconfig.c @@ -163,37 +163,6 @@ static __init void unreachable_devices(void) } } -static __init void pci_mmcfg_insert_resources(void) -{ -#define PCI_MMCFG_RESOURCE_NAME_LEN 19 - int i; - struct resource *res; - char *names; - unsigned num_buses; - - res = kcalloc(PCI_MMCFG_RESOURCE_NAME_LEN + sizeof(*res), - pci_mmcfg_config_num, GFP_KERNEL); - - if (!res) { - printk(KERN_ERR "PCI: Unable to allocate MMCONFIG resources\n"); - return; - } - - names = (void *)&res[pci_mmcfg_config_num]; - for (i = 0; i < pci_mmcfg_config_num; i++, res++) { - num_buses = pci_mmcfg_config[i].end_bus_number - - pci_mmcfg_config[i].start_bus_number + 1; - res->name = names; - snprintf(names, PCI_MMCFG_RESOURCE_NAME_LEN, "PCI MMCONFIG %u", - pci_mmcfg_config[i].pci_segment_group_number); - res->start = pci_mmcfg_config[i].base_address; - res->end = res->start + (num_buses << 20) - 1; - res->flags = IORESOURCE_MEM | IORESOURCE_BUSY; - insert_resource(&iomem_resource, res); - names += PCI_MMCFG_RESOURCE_NAME_LEN; - } -} - void __init pci_mmcfg_init(int type) { int i; @@ -237,7 +206,6 @@ void __init pci_mmcfg_init(int type) } unreachable_devices(); - pci_mmcfg_insert_resources(); raw_pci_ops = &pci_mmcfg; pci_probe = (pci_probe & ~PCI_PROBE_MASK) | PCI_PROBE_MMCONF; |