diff options
Diffstat (limited to 'arch')
94 files changed, 1307 insertions, 653 deletions
diff --git a/arch/alpha/mm/init.c b/arch/alpha/mm/init.c index 486d7945583d..544ac5dc09eb 100644 --- a/arch/alpha/mm/init.c +++ b/arch/alpha/mm/init.c @@ -357,7 +357,7 @@ free_reserved_mem(void *start, void *end) void *__start = start; for (; __start < end; __start += PAGE_SIZE) { ClearPageReserved(virt_to_page(__start)); - set_page_count(virt_to_page(__start), 1); + init_page_count(virt_to_page(__start)); free_page((long)__start); totalram_pages++; } diff --git a/arch/arm/mach-pxa/mainstone.c b/arch/arm/mach-pxa/mainstone.c index d5bda60209ec..98356f810007 100644 --- a/arch/arm/mach-pxa/mainstone.c +++ b/arch/arm/mach-pxa/mainstone.c @@ -157,14 +157,14 @@ static struct platform_device smc91x_device = { .resource = smc91x_resources, }; -static int mst_audio_startup(snd_pcm_substream_t *substream, void *priv) +static int mst_audio_startup(struct snd_pcm_substream *substream, void *priv) { if (substream->stream == SNDRV_PCM_STREAM_PLAYBACK) MST_MSCWR2 &= ~MST_MSCWR2_AC97_SPKROFF; return 0; } -static void mst_audio_shutdown(snd_pcm_substream_t *substream, void *priv) +static void mst_audio_shutdown(struct snd_pcm_substream *substream, void *priv) { if (substream->stream == SNDRV_PCM_STREAM_PLAYBACK) MST_MSCWR2 |= MST_MSCWR2_AC97_SPKROFF; diff --git a/arch/arm/mm/consistent.c b/arch/arm/mm/consistent.c index c2ee18d2075e..8a1bfcd50087 100644 --- a/arch/arm/mm/consistent.c +++ b/arch/arm/mm/consistent.c @@ -223,6 +223,8 @@ __dma_alloc(struct device *dev, size_t size, dma_addr_t *handle, gfp_t gfp, pte = consistent_pte[idx] + off; c->vm_pages = page; + split_page(page, order); + /* * Set the "dma handle" */ @@ -231,7 +233,6 @@ __dma_alloc(struct device *dev, size_t size, dma_addr_t *handle, gfp_t gfp, do { BUG_ON(!pte_none(*pte)); - set_page_count(page, 1); /* * x86 does not mark the pages reserved... */ @@ -250,7 +251,6 @@ __dma_alloc(struct device *dev, size_t size, dma_addr_t *handle, gfp_t gfp, * Free the otherwise unused pages. */ while (page < end) { - set_page_count(page, 1); __free_page(page); page++; } diff --git a/arch/arm/mm/init.c b/arch/arm/mm/init.c index efda9710ee68..88279124317a 100644 --- a/arch/arm/mm/init.c +++ b/arch/arm/mm/init.c @@ -530,7 +530,7 @@ static inline void free_area(unsigned long addr, unsigned long end, char *s) for (; addr < end; addr += PAGE_SIZE) { struct page *page = virt_to_page(addr); ClearPageReserved(page); - set_page_count(page, 1); + init_page_count(page); free_page(addr); totalram_pages++; } diff --git a/arch/arm26/mm/init.c b/arch/arm26/mm/init.c index 1f09a9d0fb83..e3ecaa453747 100644 --- a/arch/arm26/mm/init.c +++ b/arch/arm26/mm/init.c @@ -324,7 +324,7 @@ static inline void free_area(unsigned long addr, unsigned long end, char *s) for (; addr < end; addr += PAGE_SIZE) { struct page *page = virt_to_page(addr); ClearPageReserved(page); - set_page_count(page, 1); + init_page_count(page); free_page(addr); totalram_pages++; } diff --git a/arch/cris/mm/init.c b/arch/cris/mm/init.c index 31a0018b525a..b7842ff213a6 100644 --- a/arch/cris/mm/init.c +++ b/arch/cris/mm/init.c @@ -216,7 +216,7 @@ free_initmem(void) addr = (unsigned long)(&__init_begin); for (; addr < (unsigned long)(&__init_end); addr += PAGE_SIZE) { ClearPageReserved(virt_to_page(addr)); - set_page_count(virt_to_page(addr), 1); + init_page_count(virt_to_page(addr)); free_page(addr); totalram_pages++; } diff --git a/arch/frv/kernel/frv_ksyms.c b/arch/frv/kernel/frv_ksyms.c index 0f1c6cbc4f50..aa6b7d0a2109 100644 --- a/arch/frv/kernel/frv_ksyms.c +++ b/arch/frv/kernel/frv_ksyms.c @@ -27,6 +27,7 @@ EXPORT_SYMBOL(__ioremap); EXPORT_SYMBOL(iounmap); EXPORT_SYMBOL(strnlen); +EXPORT_SYMBOL(strpbrk); EXPORT_SYMBOL(strrchr); EXPORT_SYMBOL(strstr); EXPORT_SYMBOL(strchr); diff --git a/arch/frv/mm/dma-alloc.c b/arch/frv/mm/dma-alloc.c index 342823aad758..636b2f8b5d98 100644 --- a/arch/frv/mm/dma-alloc.c +++ b/arch/frv/mm/dma-alloc.c @@ -115,9 +115,7 @@ void *consistent_alloc(gfp_t gfp, size_t size, dma_addr_t *dma_handle) */ if (order > 0) { struct page *rpage = virt_to_page(page); - - for (i = 1; i < (1 << order); i++) - set_page_count(rpage + i, 1); + split_page(rpage, order); } err = 0; diff --git a/arch/frv/mm/init.c b/arch/frv/mm/init.c index 765088ea8a50..8899aa1a4f06 100644 --- a/arch/frv/mm/init.c +++ b/arch/frv/mm/init.c @@ -169,7 +169,7 @@ void __init mem_init(void) struct page *page = &mem_map[pfn]; ClearPageReserved(page); - set_page_count(page, 1); + init_page_count(page); __free_page(page); totalram_pages++; } @@ -210,7 +210,7 @@ void __init free_initmem(void) /* next to check that the page we free is not a partial page */ for (addr = start; addr < end; addr += PAGE_SIZE) { ClearPageReserved(virt_to_page(addr)); - set_page_count(virt_to_page(addr), 1); + init_page_count(virt_to_page(addr)); free_page(addr); totalram_pages++; } @@ -230,7 +230,7 @@ void __init free_initrd_mem(unsigned long start, unsigned long end) int pages = 0; for (; start < end; start += PAGE_SIZE) { ClearPageReserved(virt_to_page(start)); - set_page_count(virt_to_page(start), 1); + init_page_count(virt_to_page(start)); free_page(start); totalram_pages++; pages++; diff --git a/arch/h8300/kernel/h8300_ksyms.c b/arch/h8300/kernel/h8300_ksyms.c index 5cc76efaf7aa..69d6ad32d56c 100644 --- a/arch/h8300/kernel/h8300_ksyms.c +++ b/arch/h8300/kernel/h8300_ksyms.c @@ -25,6 +25,7 @@ extern char h8300_debug_device[]; /* platform dependent support */ EXPORT_SYMBOL(strnlen); +EXPORT_SYMBOL(strpbrk); EXPORT_SYMBOL(strrchr); EXPORT_SYMBOL(strstr); EXPORT_SYMBOL(strchr); diff --git a/arch/h8300/mm/init.c b/arch/h8300/mm/init.c index 1e0929ddc8c4..09efc4b1f038 100644 --- a/arch/h8300/mm/init.c +++ b/arch/h8300/mm/init.c @@ -196,7 +196,7 @@ void free_initrd_mem(unsigned long start, unsigned long end) int pages = 0; for (; start < end; start += PAGE_SIZE) { ClearPageReserved(virt_to_page(start)); - set_page_count(virt_to_page(start), 1); + init_page_count(virt_to_page(start)); free_page(start); totalram_pages++; pages++; @@ -219,7 +219,7 @@ free_initmem() /* next to check that the page we free is not a partial page */ for (; addr + PAGE_SIZE < (unsigned long)(&__init_end); addr +=PAGE_SIZE) { ClearPageReserved(virt_to_page(addr)); - set_page_count(virt_to_page(addr), 1); + init_page_count(virt_to_page(addr)); free_page(addr); totalram_pages++; } diff --git a/arch/i386/kernel/efi.c b/arch/i386/kernel/efi.c index c9cad7ba0d2d..aeabb4196861 100644 --- a/arch/i386/kernel/efi.c +++ b/arch/i386/kernel/efi.c @@ -115,7 +115,7 @@ static void efi_call_phys_epilog(void) unsigned long cr4; struct Xgt_desc_struct *cpu_gdt_descr = &per_cpu(cpu_gdt_descr, 0); - cpu_gdt_descr->address = __va(cpu_gdt_descr->address); + cpu_gdt_descr->address = (unsigned long)__va(cpu_gdt_descr->address); load_gdt(cpu_gdt_descr); cr4 = read_cr4(); diff --git a/arch/i386/kernel/smp.c b/arch/i386/kernel/smp.c index 218d725a5a1e..d134e9643a58 100644 --- a/arch/i386/kernel/smp.c +++ b/arch/i386/kernel/smp.c @@ -504,27 +504,23 @@ void unlock_ipi_call_lock(void) spin_unlock_irq(&call_lock); } -static struct call_data_struct * call_data; - -/* - * this function sends a 'generic call function' IPI to all other CPUs - * in the system. - */ - -int smp_call_function (void (*func) (void *info), void *info, int nonatomic, - int wait) -/* - * [SUMMARY] Run a function on all other CPUs. - * <func> The function to run. This must be fast and non-blocking. - * <info> An arbitrary pointer to pass to the function. - * <nonatomic> currently unused. - * <wait> If true, wait (atomically) until function has completed on other CPUs. - * [RETURNS] 0 on success, else a negative status code. Does not return until +static struct call_data_struct *call_data; + +/** + * smp_call_function(): Run a function on all other CPUs. + * @func: The function to run. This must be fast and non-blocking. + * @info: An arbitrary pointer to pass to the function. + * @nonatomic: currently unused. + * @wait: If true, wait (atomically) until function has completed on other CPUs. + * + * Returns 0 on success, else a negative status code. Does not return until * remote CPUs are nearly ready to execute <<func>> or are or have executed. * * You must not call this function with disabled interrupts or from a * hardware interrupt handler or from a bottom half handler. */ +int smp_call_function (void (*func) (void *info), void *info, int nonatomic, + int wait) { struct call_data_struct data; int cpus; diff --git a/arch/i386/kernel/sys_i386.c b/arch/i386/kernel/sys_i386.c index a4a61976ecb9..8fdb1fb17a5f 100644 --- a/arch/i386/kernel/sys_i386.c +++ b/arch/i386/kernel/sys_i386.c @@ -40,14 +40,13 @@ asmlinkage int sys_pipe(unsigned long __user * fildes) return error; } -/* common code for old and new mmaps */ -static inline long do_mmap2( - unsigned long addr, unsigned long len, - unsigned long prot, unsigned long flags, - unsigned long fd, unsigned long pgoff) +asmlinkage long sys_mmap2(unsigned long addr, unsigned long len, + unsigned long prot, unsigned long flags, + unsigned long fd, unsigned long pgoff) { int error = -EBADF; - struct file * file = NULL; + struct file *file = NULL; + struct mm_struct *mm = current->mm; flags &= ~(MAP_EXECUTABLE | MAP_DENYWRITE); if (!(flags & MAP_ANONYMOUS)) { @@ -56,9 +55,9 @@ static inline long do_mmap2( goto out; } - down_write(¤t->mm->mmap_sem); + down_write(&mm->mmap_sem); error = do_mmap_pgoff(file, addr, len, prot, flags, pgoff); - up_write(¤t->mm->mmap_sem); + up_write(&mm->mmap_sem); if (file) fput(file); @@ -66,13 +65,6 @@ out: return error; } -asmlinkage long sys_mmap2(unsigned long addr, unsigned long len, - unsigned long prot, unsigned long flags, - unsigned long fd, unsigned long pgoff) -{ - return do_mmap2(addr, len, prot, flags, fd, pgoff); -} - /* * Perform the select(nd, in, out, ex, tv) and mmap() system * calls. Linux/i386 didn't use to be able to handle more than @@ -101,7 +93,8 @@ asmlinkage int old_mmap(struct mmap_arg_struct __user *arg) if (a.offset & ~PAGE_MASK) goto out; - err = do_mmap2(a.addr, a.len, a.prot, a.flags, a.fd, a.offset >> PAGE_SHIFT); + err = sys_mmap2(a.addr, a.len, a.prot, a.flags, + a.fd, a.offset >> PAGE_SHIFT); out: return err; } diff --git a/arch/i386/kernel/timers/timer_hpet.c b/arch/i386/kernel/timers/timer_hpet.c index be242723c339..17a6fe7166e7 100644 --- a/arch/i386/kernel/timers/timer_hpet.c +++ b/arch/i386/kernel/timers/timer_hpet.c @@ -46,7 +46,7 @@ static seqlock_t monotonic_lock = SEQLOCK_UNLOCKED; * * -johnstul@us.ibm.com "math is hard, lets go shopping!" */ -static unsigned long cyc2ns_scale; +static unsigned long cyc2ns_scale __read_mostly; #define CYC2NS_SCALE_FACTOR 10 /* 2^10, carefully chosen */ static inline void set_cyc2ns_scale(unsigned long cpu_khz) diff --git a/arch/i386/kernel/timers/timer_tsc.c b/arch/i386/kernel/timers/timer_tsc.c index a7f5a2aceba2..5e41ee29c8cf 100644 --- a/arch/i386/kernel/timers/timer_tsc.c +++ b/arch/i386/kernel/timers/timer_tsc.c @@ -74,7 +74,7 @@ late_initcall(start_lost_tick_compensation); * * -johnstul@us.ibm.com "math is hard, lets go shopping!" */ -static unsigned long cyc2ns_scale; +static unsigned long cyc2ns_scale __read_mostly; #define CYC2NS_SCALE_FACTOR 10 /* 2^10, carefully chosen */ static inline void set_cyc2ns_scale(unsigned long cpu_khz) diff --git a/arch/i386/mm/hugetlbpage.c b/arch/i386/mm/hugetlbpage.c index d524127c9afc..a7d891585411 100644 --- a/arch/i386/mm/hugetlbpage.c +++ b/arch/i386/mm/hugetlbpage.c @@ -48,18 +48,6 @@ pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr) return (pte_t *) pmd; } -/* - * This function checks for proper alignment of input addr and len parameters. - */ -int is_aligned_hugepage_range(unsigned long addr, unsigned long len) -{ - if (len & ~HPAGE_MASK) - return -EINVAL; - if (addr & ~HPAGE_MASK) - return -EINVAL; - return 0; -} - #if 0 /* This is just for testing */ struct page * follow_huge_addr(struct mm_struct *mm, unsigned long address, int write) diff --git a/arch/i386/mm/init.c b/arch/i386/mm/init.c index 2700f01994ba..7ba55a6e2dbc 100644 --- a/arch/i386/mm/init.c +++ b/arch/i386/mm/init.c @@ -270,7 +270,7 @@ static void __init permanent_kmaps_init(pgd_t *pgd_base) static void __meminit free_new_highpage(struct page *page) { - set_page_count(page, 1); + init_page_count(page); __free_page(page); totalhigh_pages++; } @@ -727,7 +727,7 @@ void free_initmem(void) addr = (unsigned long)(&__init_begin); for (; addr < (unsigned long)(&__init_end); addr += PAGE_SIZE) { ClearPageReserved(virt_to_page(addr)); - set_page_count(virt_to_page(addr), 1); + init_page_count(virt_to_page(addr)); memset((void *)addr, 0xcc, PAGE_SIZE); free_page(addr); totalram_pages++; @@ -766,7 +766,7 @@ void free_initrd_mem(unsigned long start, unsigned long end) printk (KERN_INFO "Freeing initrd memory: %ldk freed\n", (end - start) >> 10); for (; start < end; start += PAGE_SIZE) { ClearPageReserved(virt_to_page(start)); - set_page_count(virt_to_page(start), 1); + init_page_count(virt_to_page(start)); free_page(start); totalram_pages++; } diff --git a/arch/i386/mm/pageattr.c b/arch/i386/mm/pageattr.c index d0cadb33b54c..92c3d9f0e731 100644 --- a/arch/i386/mm/pageattr.c +++ b/arch/i386/mm/pageattr.c @@ -51,6 +51,13 @@ static struct page *split_large_page(unsigned long address, pgprot_t prot, if (!base) return NULL; + /* + * page_private is used to track the number of entries in + * the page table page that have non standard attributes. + */ + SetPagePrivate(base); + page_private(base) = 0; + address = __pa(address); addr = address & LARGE_PAGE_MASK; pbase = (pte_t *)page_address(base); @@ -143,11 +150,12 @@ __change_page_attr(struct page *page, pgprot_t prot) return -ENOMEM; set_pmd_pte(kpte,address,mk_pte(split, ref_prot)); kpte_page = split; - } - get_page(kpte_page); + } + page_private(kpte_page)++; } else if ((pte_val(*kpte) & _PAGE_PSE) == 0) { set_pte_atomic(kpte, mk_pte(page, PAGE_KERNEL)); - __put_page(kpte_page); + BUG_ON(page_private(kpte_page) == 0); + page_private(kpte_page)--; } else BUG(); @@ -157,10 +165,8 @@ __change_page_attr(struct page *page, pgprot_t prot) * replace it with a largepage. */ if (!PageReserved(kpte_page)) { - /* memleak and potential failed 2M page regeneration */ - BUG_ON(!page_count(kpte_page)); - - if (cpu_has_pse && (page_count(kpte_page) == 1)) { + if (cpu_has_pse && (page_private(kpte_page) == 0)) { + ClearPagePrivate(kpte_page); list_add(&kpte_page->lru, &df_list); revert_page(kpte_page, address); } diff --git a/arch/ia64/Kconfig b/arch/ia64/Kconfig index a85ea9d37f05..ff7ae6b664e8 100644 --- a/arch/ia64/Kconfig +++ b/arch/ia64/Kconfig @@ -271,6 +271,25 @@ config SCHED_SMT Intel IA64 chips with MultiThreading at a cost of slightly increased overhead in some places. If unsure say N here. +config PERMIT_BSP_REMOVE + bool "Support removal of Bootstrap Processor" + depends on HOTPLUG_CPU + default n + ---help--- + Say Y here if your platform SAL will support removal of BSP with HOTPLUG_CPU + support. + +config FORCE_CPEI_RETARGET + bool "Force assumption that CPEI can be re-targetted" + depends on PERMIT_BSP_REMOVE + default n + ---help--- + Say Y if you need to force the assumption that CPEI can be re-targetted to + any cpu in the system. This hint is available via ACPI 3.0 specifications. + Tiger4 systems are capable of re-directing CPEI to any CPU other than BSP. + This option it useful to enable this feature on older BIOS's as well. + You can also enable this by using boot command line option force_cpei=1. + config PREEMPT bool "Preemptible Kernel" help diff --git a/arch/ia64/configs/tiger_defconfig b/arch/ia64/configs/tiger_defconfig index 125568118b84..766bf4955432 100644 --- a/arch/ia64/configs/tiger_defconfig +++ b/arch/ia64/configs/tiger_defconfig @@ -116,6 +116,8 @@ CONFIG_FORCE_MAX_ZONEORDER=17 CONFIG_SMP=y CONFIG_NR_CPUS=4 CONFIG_HOTPLUG_CPU=y +CONFIG_PERMIT_BSP_REMOVE=y +CONFIG_FORCE_CPEI_RETARGET=y # CONFIG_SCHED_SMT is not set # CONFIG_PREEMPT is not set CONFIG_SELECT_MEMORY_MODEL=y diff --git a/arch/ia64/kernel/acpi.c b/arch/ia64/kernel/acpi.c index ecd44bdc8394..4722ec51c70c 100644 --- a/arch/ia64/kernel/acpi.c +++ b/arch/ia64/kernel/acpi.c @@ -284,19 +284,24 @@ acpi_parse_plat_int_src(acpi_table_entry_header * header, return 0; } +#ifdef CONFIG_HOTPLUG_CPU unsigned int can_cpei_retarget(void) { extern int cpe_vector; + extern unsigned int force_cpei_retarget; /* * Only if CPEI is supported and the override flag * is present, otherwise return that its re-targettable * if we are in polling mode. */ - if (cpe_vector > 0 && !acpi_cpei_override) - return 0; - else - return 1; + if (cpe_vector > 0) { + if (acpi_cpei_override || force_cpei_retarget) + return 1; + else + return 0; + } + return 1; } unsigned int is_cpu_cpei_target(unsigned int cpu) @@ -315,6 +320,7 @@ void set_cpei_target_cpu(unsigned int cpu) { acpi_cpei_phys_cpuid = cpu_physical_id(cpu); } +#endif unsigned int get_cpei_target_cpu(void) { diff --git a/arch/ia64/kernel/entry.S b/arch/ia64/kernel/entry.S index 930fdfca6ddb..0e3eda99e549 100644 --- a/arch/ia64/kernel/entry.S +++ b/arch/ia64/kernel/entry.S @@ -1102,9 +1102,6 @@ skip_rbs_switch: st8 [r2]=r8 st8 [r3]=r10 .work_pending: - tbit.nz p6,p0=r31,TIF_SIGDELAYED // signal delayed from MCA/INIT/NMI/PMI context? -(p6) br.cond.sptk.few .sigdelayed - ;; tbit.z p6,p0=r31,TIF_NEED_RESCHED // current_thread_info()->need_resched==0? (p6) br.cond.sptk.few .notify #ifdef CONFIG_PREEMPT @@ -1131,17 +1128,6 @@ skip_rbs_switch: (pLvSys)br.cond.sptk.few .work_pending_syscall_end br.cond.sptk.many .work_processed_kernel // don't re-check -// There is a delayed signal that was detected in MCA/INIT/NMI/PMI context where -// it could not be delivered. Deliver it now. The signal might be for us and -// may set TIF_SIGPENDING, so redrive ia64_leave_* after processing the delayed -// signal. - -.sigdelayed: - br.call.sptk.many rp=do_sigdelayed - cmp.eq p6,p0=r0,r0 // p6 <- 1, always re-check -(pLvSys)br.cond.sptk.few .work_pending_syscall_end - br.cond.sptk.many .work_processed_kernel // re-check - .work_pending_syscall_end: adds r2=PT(R8)+16,r12 adds r3=PT(R10)+16,r12 diff --git a/arch/ia64/kernel/iosapic.c b/arch/ia64/kernel/iosapic.c index 574084f343fa..8832c553230a 100644 --- a/arch/ia64/kernel/iosapic.c +++ b/arch/ia64/kernel/iosapic.c @@ -631,6 +631,7 @@ get_target_cpu (unsigned int gsi, int vector) { #ifdef CONFIG_SMP static int cpu = -1; + extern int cpe_vector; /* * In case of vector shared by multiple RTEs, all RTEs that @@ -653,6 +654,11 @@ get_target_cpu (unsigned int gsi, int vector) if (!cpu_online(smp_processor_id())) return cpu_physical_id(smp_processor_id()); +#ifdef CONFIG_ACPI + if (cpe_vector > 0 && vector == IA64_CPEP_VECTOR) + return get_cpei_target_cpu(); +#endif + #ifdef CONFIG_NUMA { int num_cpus, cpu_index, iosapic_index, numa_cpu, i = 0; diff --git a/arch/ia64/kernel/irq.c b/arch/ia64/kernel/irq.c index d33244c32759..5ce908ef9c95 100644 --- a/arch/ia64/kernel/irq.c +++ b/arch/ia64/kernel/irq.c @@ -163,8 +163,19 @@ void fixup_irqs(void) { unsigned int irq; extern void ia64_process_pending_intr(void); + extern void ia64_disable_timer(void); + extern volatile int time_keeper_id; + + ia64_disable_timer(); + + /* + * Find a new timesync master + */ + if (smp_processor_id() == time_keeper_id) { + time_keeper_id = first_cpu(cpu_online_map); + printk ("CPU %d is now promoted to time-keeper master\n", time_keeper_id); + } - ia64_set_itv(1<<16); /* * Phase 1: Locate irq's bound to this cpu and * relocate them for cpu removal. diff --git a/arch/ia64/kernel/mca.c b/arch/ia64/kernel/mca.c index ee7eec9ee576..b57e723f194c 100644 --- a/arch/ia64/kernel/mca.c +++ b/arch/ia64/kernel/mca.c @@ -281,14 +281,10 @@ ia64_mca_log_sal_error_record(int sal_info_type) ia64_sal_clear_state_info(sal_info_type); } -/* - * platform dependent error handling - */ -#ifndef PLATFORM_MCA_HANDLERS - #ifdef CONFIG_ACPI int cpe_vector = -1; +int ia64_cpe_irq = -1; static irqreturn_t ia64_mca_cpe_int_handler (int cpe_irq, void *arg, struct pt_regs *ptregs) @@ -377,8 +373,6 @@ ia64_mca_register_cpev (int cpev) } #endif /* CONFIG_ACPI */ -#endif /* PLATFORM_MCA_HANDLERS */ - /* * ia64_mca_cmc_vector_setup * @@ -630,6 +624,32 @@ copy_reg(const u64 *fr, u64 fnat, u64 *tr, u64 *tnat) *tnat |= (nat << tslot); } +/* Change the comm field on the MCA/INT task to include the pid that + * was interrupted, it makes for easier debugging. If that pid was 0 + * (swapper or nested MCA/INIT) then use the start of the previous comm + * field suffixed with its cpu. + */ + +static void +ia64_mca_modify_comm(const task_t *previous_current) +{ + char *p, comm[sizeof(current->comm)]; + if (previous_current->pid) + snprintf(comm, sizeof(comm), "%s %d", + current->comm, previous_current->pid); + else { + int l; + if ((p = strchr(previous_current->comm, ' '))) + l = p - previous_current->comm; + else + l = strlen(previous_current->comm); + snprintf(comm, sizeof(comm), "%s %*s %d", + current->comm, l, previous_current->comm, + task_thread_info(previous_current)->cpu); + } + memcpy(current->comm, comm, sizeof(current->comm)); +} + /* On entry to this routine, we are running on the per cpu stack, see * mca_asm.h. The original stack has not been touched by this event. Some of * the original stack's registers will be in the RBS on this stack. This stack @@ -648,7 +668,7 @@ ia64_mca_modify_original_stack(struct pt_regs *regs, struct ia64_sal_os_state *sos, const char *type) { - char *p, comm[sizeof(current->comm)]; + char *p; ia64_va va; extern char ia64_leave_kernel[]; /* Need asm address, not function descriptor */ const pal_min_state_area_t *ms = sos->pal_min_state; @@ -721,6 +741,10 @@ ia64_mca_modify_original_stack(struct pt_regs *regs, /* Verify the previous stack state before we change it */ if (user_mode(regs)) { msg = "occurred in user space"; + /* previous_current is guaranteed to be valid when the task was + * in user space, so ... + */ + ia64_mca_modify_comm(previous_current); goto no_mod; } if (r13 != sos->prev_IA64_KR_CURRENT) { @@ -750,25 +774,7 @@ ia64_mca_modify_original_stack(struct pt_regs *regs, goto no_mod; } - /* Change the comm field on the MCA/INT task to include the pid that - * was interrupted, it makes for easier debugging. If that pid was 0 - * (swapper or nested MCA/INIT) then use the start of the previous comm - * field suffixed with its cpu. - */ - if (previous_current->pid) - snprintf(comm, sizeof(comm), "%s %d", - current->comm, previous_current->pid); - else { - int l; - if ((p = strchr(previous_current->comm, ' '))) - l = p - previous_current->comm; - else - l = strlen(previous_current->comm); - snprintf(comm, sizeof(comm), "%s %*s %d", - current->comm, l, previous_current->comm, - task_thread_info(previous_current)->cpu); - } - memcpy(current->comm, comm, sizeof(current->comm)); + ia64_mca_modify_comm(previous_current); /* Make the original task look blocked. First stack a struct pt_regs, * describing the state at the time of interrupt. mca_asm.S built a @@ -908,7 +914,7 @@ no_mod: static void ia64_wait_for_slaves(int monarch) { - int c, wait = 0; + int c, wait = 0, missing = 0; for_each_online_cpu(c) { if (c == monarch) continue; @@ -919,15 +925,32 @@ ia64_wait_for_slaves(int monarch) } } if (!wait) - return; + goto all_in; for_each_online_cpu(c) { if (c == monarch) continue; if (ia64_mc_info.imi_rendez_checkin[c] == IA64_MCA_RENDEZ_CHECKIN_NOTDONE) { udelay(5*1000000); /* wait 5 seconds for slaves (arbitrary) */ + if (ia64_mc_info.imi_rendez_checkin[c] == IA64_MCA_RENDEZ_CHECKIN_NOTDONE) + missing = 1; break; } } + if (!missing) + goto all_in; + printk(KERN_INFO "OS MCA slave did not rendezvous on cpu"); + for_each_online_cpu(c) { + if (c == monarch) + continue; + if (ia64_mc_info.imi_rendez_checkin[c] == IA64_MCA_RENDEZ_CHECKIN_NOTDONE) + printk(" %d", c); + } + printk("\n"); + return; + +all_in: + printk(KERN_INFO "All OS MCA slaves have reached rendezvous\n"); + return; } /* @@ -953,6 +976,10 @@ ia64_mca_handler(struct pt_regs *regs, struct switch_stack *sw, task_t *previous_current; oops_in_progress = 1; /* FIXME: make printk NMI/MCA/INIT safe */ + console_loglevel = 15; /* make sure printks make it to console */ + printk(KERN_INFO "Entered OS MCA handler. PSP=%lx cpu=%d monarch=%ld\n", + sos->proc_state_param, cpu, sos->monarch); + previous_current = ia64_mca_modify_original_stack(regs, sw, sos, "MCA"); monarch_cpu = cpu; if (notify_die(DIE_MCA_MONARCH_ENTER, "MCA", regs, 0, 0, 0) @@ -1444,11 +1471,13 @@ void __devinit ia64_mca_cpu_init(void *cpu_data) { void *pal_vaddr; + static int first_time = 1; - if (smp_processor_id() == 0) { + if (first_time) { void *mca_data; int cpu; + first_time = 0; mca_data = alloc_bootmem(sizeof(struct ia64_mca_cpu) * NR_CPUS + KERNEL_STACK_SIZE); mca_data = (void *)(((unsigned long)mca_data + @@ -1704,6 +1733,7 @@ ia64_mca_late_init(void) desc = irq_descp(irq); desc->status |= IRQ_PER_CPU; setup_irq(irq, &mca_cpe_irqaction); + ia64_cpe_irq = irq; } ia64_mca_register_cpev(cpe_vector); IA64_MCA_DEBUG("%s: CPEI/P setup and enabled.\n", __FUNCTION__); diff --git a/arch/ia64/kernel/perfmon.c b/arch/ia64/kernel/perfmon.c index 9c5194b385da..077f21216b65 100644 --- a/arch/ia64/kernel/perfmon.c +++ b/arch/ia64/kernel/perfmon.c @@ -6722,6 +6722,7 @@ __initcall(pfm_init); void pfm_init_percpu (void) { + static int first_time=1; /* * make sure no measurement is active * (may inherit programmed PMCs from EFI). @@ -6734,8 +6735,10 @@ pfm_init_percpu (void) */ pfm_unfreeze_pmu(); - if (smp_processor_id() == 0) + if (first_time) { register_percpu_irq(IA64_PERFMON_VECTOR, &perfmon_irqaction); + first_time=0; + } ia64_setreg(_IA64_REG_CR_PMV, IA64_PERFMON_VECTOR); ia64_srlz_d(); diff --git a/arch/ia64/kernel/signal.c b/arch/ia64/kernel/signal.c index 463f6bb44d07..1d7903ee2126 100644 --- a/arch/ia64/kernel/signal.c +++ b/arch/ia64/kernel/signal.c @@ -588,104 +588,3 @@ ia64_do_signal (sigset_t *oldset, struct sigscratch *scr, long in_syscall) } return 0; } - -/* Set a delayed signal that was detected in MCA/INIT/NMI/PMI context where it - * could not be delivered. It is important that the target process is not - * allowed to do any more work in user space. Possible cases for the target - * process: - * - * - It is sleeping and will wake up soon. Store the data in the current task, - * the signal will be sent when the current task returns from the next - * interrupt. - * - * - It is running in user context. Store the data in the current task, the - * signal will be sent when the current task returns from the next interrupt. - * - * - It is running in kernel context on this or another cpu and will return to - * user context. Store the data in the target task, the signal will be sent - * to itself when the target task returns to user space. - * - * - It is running in kernel context on this cpu and will sleep before - * returning to user context. Because this is also the current task, the - * signal will not get delivered and the task could sleep indefinitely. - * Store the data in the idle task for this cpu, the signal will be sent - * after the idle task processes its next interrupt. - * - * To cover all cases, store the data in the target task, the current task and - * the idle task on this cpu. Whatever happens, the signal will be delivered - * to the target task before it can do any useful user space work. Multiple - * deliveries have no unwanted side effects. - * - * Note: This code is executed in MCA/INIT/NMI/PMI context, with interrupts - * disabled. It must not take any locks nor use kernel structures or services - * that require locks. - */ - -/* To ensure that we get the right pid, check its start time. To avoid extra - * include files in thread_info.h, convert the task start_time to unsigned long, - * giving us a cycle time of > 580 years. - */ -static inline unsigned long -start_time_ul(const struct task_struct *t) -{ - return t->start_time.tv_sec * NSEC_PER_SEC + t->start_time.tv_nsec; -} - -void -set_sigdelayed(pid_t pid, int signo, int code, void __user *addr) -{ - struct task_struct *t; - unsigned long start_time = 0; - int i; - - for (i = 1; i <= 3; ++i) { - switch (i) { - case 1: - t = find_task_by_pid(pid); - if (t) - start_time = start_time_ul(t); - break; - case 2: - t = current; - break; - default: - t = idle_task(smp_processor_id()); - break; - } - - if (!t) - return; - task_thread_info(t)->sigdelayed.signo = signo; - task_thread_info(t)->sigdelayed.code = code; - task_thread_info(t)->sigdelayed.addr = addr; - task_thread_info(t)->sigdelayed.start_time = start_time; - task_thread_info(t)->sigdelayed.pid = pid; - wmb(); - set_tsk_thread_flag(t, TIF_SIGDELAYED); - } -} - -/* Called from entry.S when it detects TIF_SIGDELAYED, a delayed signal that - * was detected in MCA/INIT/NMI/PMI context where it could not be delivered. - */ - -void -do_sigdelayed(void) -{ - struct siginfo siginfo; - pid_t pid; - struct task_struct *t; - - clear_thread_flag(TIF_SIGDELAYED); - memset(&siginfo, 0, sizeof(siginfo)); - siginfo.si_signo = current_thread_info()->sigdelayed.signo; - siginfo.si_code = current_thread_info()->sigdelayed.code; - siginfo.si_addr = current_thread_info()->sigdelayed.addr; - pid = current_thread_info()->sigdelayed.pid; - t = find_task_by_pid(pid); - if (!t) - return; - if (current_thread_info()->sigdelayed.start_time != start_time_ul(t)) - return; - force_sig_info(siginfo.si_signo, &siginfo, t); -} diff --git a/arch/ia64/kernel/smpboot.c b/arch/ia64/kernel/smpboot.c index b681ef34a86e..c4b633b36dab 100644 --- a/arch/ia64/kernel/smpboot.c +++ b/arch/ia64/kernel/smpboot.c @@ -70,6 +70,12 @@ #endif #ifdef CONFIG_HOTPLUG_CPU +#ifdef CONFIG_PERMIT_BSP_REMOVE +#define bsp_remove_ok 1 +#else +#define bsp_remove_ok 0 +#endif + /* * Store all idle threads, this can be reused instead of creating * a new thread. Also avoids complicated thread destroy functionality @@ -104,7 +110,7 @@ struct sal_to_os_boot *sal_state_for_booting_cpu = &sal_boot_rendez_state[0]; /* * ITC synchronization related stuff: */ -#define MASTER 0 +#define MASTER (0) #define SLAVE (SMP_CACHE_BYTES/8) #define NUM_ROUNDS 64 /* magic value */ @@ -151,6 +157,27 @@ char __initdata no_int_routing; unsigned char smp_int_redirect; /* are INT and IPI redirectable by the chipset? */ +#ifdef CONFIG_FORCE_CPEI_RETARGET +#define CPEI_OVERRIDE_DEFAULT (1) +#else +#define CPEI_OVERRIDE_DEFAULT (0) +#endif + +unsigned int force_cpei_retarget = CPEI_OVERRIDE_DEFAULT; + +static int __init +cmdl_force_cpei(char *str) +{ + int value=0; + + get_option (&str, &value); + force_cpei_retarget = value; + + return 1; +} + +__setup("force_cpei=", cmdl_force_cpei); + static int __init nointroute (char *str) { @@ -161,6 +188,27 @@ nointroute (char *str) __setup("nointroute", nointroute); +static void fix_b0_for_bsp(void) +{ +#ifdef CONFIG_HOTPLUG_CPU + int cpuid; + static int fix_bsp_b0 = 1; + + cpuid = smp_processor_id(); + + /* + * Cache the b0 value on the first AP that comes up + */ + if (!(fix_bsp_b0 && cpuid)) + return; + + sal_boot_rendez_state[0].br[0] = sal_boot_rendez_state[cpuid].br[0]; + printk ("Fixed BSP b0 value from CPU %d\n", cpuid); + + fix_bsp_b0 = 0; +#endif +} + void sync_master (void *arg) { @@ -327,8 +375,9 @@ smp_setup_percpu_timer (void) static void __devinit smp_callin (void) { - int cpuid, phys_id; + int cpuid, phys_id, itc_master; extern void ia64_init_itm(void); + extern volatile int time_keeper_id; #ifdef CONFIG_PERFMON extern void pfm_init_percpu(void); @@ -336,6 +385,7 @@ smp_callin (void) cpuid = smp_processor_id(); phys_id = hard_smp_processor_id(); + itc_master = time_keeper_id; if (cpu_online(cpuid)) { printk(KERN_ERR "huh, phys CPU#0x%x, CPU#0x%x already present??\n", @@ -343,6 +393,8 @@ smp_callin (void) BUG(); } + fix_b0_for_bsp(); + lock_ipi_calllock(); cpu_set(cpuid, cpu_online_map); unlock_ipi_calllock(); @@ -365,8 +417,8 @@ smp_callin (void) * calls spin_unlock_bh(), which calls spin_unlock_bh(), which calls * local_bh_enable(), which bugs out if irqs are not enabled... */ - Dprintk("Going to syncup ITC with BP.\n"); - ia64_sync_itc(0); + Dprintk("Going to syncup ITC with ITC Master.\n"); + ia64_sync_itc(itc_master); } /* @@ -635,6 +687,47 @@ remove_siblinginfo(int cpu) } extern void fixup_irqs(void); + +int migrate_platform_irqs(unsigned int cpu) +{ + int new_cpei_cpu; + irq_desc_t *desc = NULL; + cpumask_t mask; + int retval = 0; + + /* + * dont permit CPEI target to removed. + */ + if (cpe_vector > 0 && is_cpu_cpei_target(cpu)) { + printk ("CPU (%d) is CPEI Target\n", cpu); + if (can_cpei_retarget()) { + /* + * Now re-target the CPEI to a different processor + */ + new_cpei_cpu = any_online_cpu(cpu_online_map); + mask = cpumask_of_cpu(new_cpei_cpu); + set_cpei_target_cpu(new_cpei_cpu); + desc = irq_descp(ia64_cpe_irq); + /* + * Switch for now, immediatly, we need to do fake intr + * as other interrupts, but need to study CPEI behaviour with + * polling before making changes. + */ + if (desc) { + desc->handler->disable(ia64_cpe_irq); + desc->handler->set_affinity(ia64_cpe_irq, mask); + desc->handler->enable(ia64_cpe_irq); + printk ("Re-targetting CPEI to cpu %d\n", new_cpei_cpu); + } + } + if (!desc) { + printk ("Unable to retarget CPEI, offline cpu [%d] failed\n", cpu); + retval = -EBUSY; + } + } + return retval; +} + /* must be called with cpucontrol mutex held */ int __cpu_disable(void) { @@ -643,8 +736,17 @@ int __cpu_disable(void) /* * dont permit boot processor for now */ - if (cpu == 0) - return -EBUSY; + if (cpu == 0 && !bsp_remove_ok) { + printk ("Your platform does not support removal of BSP\n"); + return (-EBUSY); + } + + cpu_clear(cpu, cpu_online_map); + + if (migrate_platform_irqs(cpu)) { + cpu_set(cpu, cpu_online_map); + return (-EBUSY); + } remove_siblinginfo(cpu); cpu_clear(cpu, cpu_online_map); diff --git a/arch/ia64/kernel/time.c b/arch/ia64/kernel/time.c index 307d01e15b2e..ac167436e936 100644 --- a/arch/ia64/kernel/time.c +++ b/arch/ia64/kernel/time.c @@ -32,7 +32,7 @@ extern unsigned long wall_jiffies; -#define TIME_KEEPER_ID 0 /* smp_processor_id() of time-keeper */ +volatile int time_keeper_id = 0; /* smp_processor_id() of time-keeper */ #ifdef CONFIG_IA64_DEBUG_IRQ @@ -71,7 +71,7 @@ timer_interrupt (int irq, void *dev_id, struct pt_regs *regs) new_itm += local_cpu_data->itm_delta; - if (smp_processor_id() == TIME_KEEPER_ID) { + if (smp_processor_id() == time_keeper_id) { /* * Here we are in the timer irq handler. We have irqs locally * disabled, but we don't know if the timer_bh is running on @@ -236,6 +236,11 @@ static struct irqaction timer_irqaction = { .name = "timer" }; +void __devinit ia64_disable_timer(void) +{ + ia64_set_itv(1 << 16); +} + void __init time_init (void) { diff --git a/arch/ia64/kernel/topology.c b/arch/ia64/kernel/topology.c index 6e5eea19fa67..3b6fd798c4d6 100644 --- a/arch/ia64/kernel/topology.c +++ b/arch/ia64/kernel/topology.c @@ -36,7 +36,7 @@ int arch_register_cpu(int num) parent = &sysfs_nodes[cpu_to_node(num)]; #endif /* CONFIG_NUMA */ -#ifdef CONFIG_ACPI +#if defined (CONFIG_ACPI) && defined (CONFIG_HOTPLUG_CPU) /* * If CPEI cannot be re-targetted, and this is * CPEI target, then dont create the control file diff --git a/arch/ia64/mm/contig.c b/arch/ia64/mm/contig.c index acaaec4e4681..9855ba318094 100644 --- a/arch/ia64/mm/contig.c +++ b/arch/ia64/mm/contig.c @@ -181,13 +181,15 @@ per_cpu_init (void) { void *cpu_data; int cpu; + static int first_time=1; /* * get_free_pages() cannot be used before cpu_init() done. BSP * allocates "NR_CPUS" pages for all CPUs to avoid that AP calls * get_zeroed_page(). */ - if (smp_processor_id() == 0) { + if (first_time) { + first_time=0; cpu_data = __alloc_bootmem(PERCPU_PAGE_SIZE * NR_CPUS, PERCPU_PAGE_SIZE, __pa(MAX_DMA_ADDRESS)); for (cpu = 0; cpu < NR_CPUS; cpu++) { diff --git a/arch/ia64/mm/discontig.c b/arch/ia64/mm/discontig.c index c87d6d1d5813..573d5cc63e2b 100644 --- a/arch/ia64/mm/discontig.c +++ b/arch/ia64/mm/discontig.c @@ -528,12 +528,17 @@ void __init find_memory(void) void *per_cpu_init(void) { int cpu; + static int first_time = 1; + if (smp_processor_id() != 0) return __per_cpu_start + __per_cpu_offset[smp_processor_id()]; - for (cpu = 0; cpu < NR_CPUS; cpu++) - per_cpu(local_per_cpu_offset, cpu) = __per_cpu_offset[cpu]; + if (first_time) { + first_time = 0; + for (cpu = 0; cpu < NR_CPUS; cpu++) + per_cpu(local_per_cpu_offset, cpu) = __per_cpu_offset[cpu]; + } return __per_cpu_start + __per_cpu_offset[smp_processor_id()]; } diff --git a/arch/ia64/mm/hugetlbpage.c b/arch/ia64/mm/hugetlbpage.c index 2d13889d0a99..9dbc7dadd165 100644 --- a/arch/ia64/mm/hugetlbpage.c +++ b/arch/ia64/mm/hugetlbpage.c @@ -68,9 +68,10 @@ huge_pte_offset (struct mm_struct *mm, unsigned long addr) #define mk_pte_huge(entry) { pte_val(entry) |= _PAGE_P; } /* - * This function checks for proper alignment of input addr and len parameters. + * Don't actually need to do any preparation, but need to make sure + * the address is in the right region. */ -int is_aligned_hugepage_range(unsigned long addr, unsigned long len) +int prepare_hugepage_range(unsigned long addr, unsigned long len) { if (len & ~HPAGE_MASK) return -EINVAL; diff --git a/arch/ia64/mm/init.c b/arch/ia64/mm/init.c index b38b6d213c15..08d94e6bfa18 100644 --- a/arch/ia64/mm/init.c +++ b/arch/ia64/mm/init.c @@ -197,7 +197,7 @@ free_initmem (void) eaddr = (unsigned long) ia64_imva(__init_end); while (addr < eaddr) { ClearPageReserved(virt_to_page(addr)); - set_page_count(virt_to_page(addr), 1); + init_page_count(virt_to_page(addr)); free_page(addr); ++totalram_pages; addr += PAGE_SIZE; @@ -252,7 +252,7 @@ free_initrd_mem (unsigned long start, unsigned long end) continue; page = virt_to_page(start); ClearPageReserved(page); - set_page_count(page, 1); + init_page_count(page); free_page(start); ++totalram_pages; } @@ -640,7 +640,7 @@ mem_init (void) void online_page(struct page *page) { ClearPageReserved(page); - set_page_count(page, 1); + init_page_count(page); __free_page(page); totalram_pages++; num_physpages++; diff --git a/arch/ia64/sn/kernel/Makefile b/arch/ia64/sn/kernel/Makefile index 3e9b4eea7418..ab9c48c88012 100644 --- a/arch/ia64/sn/kernel/Makefile +++ b/arch/ia64/sn/kernel/Makefile @@ -10,7 +10,8 @@ CPPFLAGS += -I$(srctree)/arch/ia64/sn/include obj-y += setup.o bte.o bte_error.o irq.o mca.o idle.o \ - huberror.o io_init.o iomv.o klconflib.o sn2/ + huberror.o io_init.o iomv.o klconflib.o pio_phys.o \ + sn2/ obj-$(CONFIG_IA64_GENERIC) += machvec.o obj-$(CONFIG_SGI_TIOCX) += tiocx.o obj-$(CONFIG_IA64_SGI_SN_XP) += xp.o diff --git a/arch/ia64/sn/kernel/pio_phys.S b/arch/ia64/sn/kernel/pio_phys.S new file mode 100644 index 000000000000..3c7d48d6ecb8 --- /dev/null +++ b/arch/ia64/sn/kernel/pio_phys.S @@ -0,0 +1,71 @@ +/* + * This file is subject to the terms and conditions of the GNU General Public + * License. See the file "COPYING" in the main directory of this archive + * for more details. + * + * Copyright (C) 2000-2005 Silicon Graphics, Inc. All rights reserved. + * + * This file contains macros used to access MMR registers via + * uncached physical addresses. + * pio_phys_read_mmr - read an MMR + * pio_phys_write_mmr - write an MMR + * pio_atomic_phys_write_mmrs - atomically write 1 or 2 MMRs with psr.ic=0 + * Second MMR will be skipped if address is NULL + * + * Addresses passed to these routines should be uncached physical addresses + * ie., 0x80000.... + */ + + + +#include <asm/asmmacro.h> +#include <asm/page.h> + +GLOBAL_ENTRY(pio_phys_read_mmr) + .prologue + .regstk 1,0,0,0 + .body + mov r2=psr + rsm psr.i | psr.dt + ;; + srlz.d + ld8.acq r8=[r32] + ;; + mov psr.l=r2;; + srlz.d + br.ret.sptk.many rp +END(pio_phys_read_mmr) + +GLOBAL_ENTRY(pio_phys_write_mmr) + .prologue + .regstk 2,0,0,0 + .body + mov r2=psr + rsm psr.i | psr.dt + ;; + srlz.d + st8.rel [r32]=r33 + ;; + mov psr.l=r2;; + srlz.d + br.ret.sptk.many rp +END(pio_phys_write_mmr) + +GLOBAL_ENTRY(pio_atomic_phys_write_mmrs) + .prologue + .regstk 4,0,0,0 + .body + mov r2=psr + cmp.ne p9,p0=r34,r0; + rsm psr.i | psr.dt | psr.ic + ;; + srlz.d + st8.rel [r32]=r33 +(p9) st8.rel [r34]=r35 + ;; + mov psr.l=r2;; + srlz.d + br.ret.sptk.many rp +END(pio_atomic_phys_write_mmrs) + + diff --git a/arch/ia64/sn/kernel/setup.c b/arch/ia64/sn/kernel/setup.c index 5b84836c2171..8b6d5c844708 100644 --- a/arch/ia64/sn/kernel/setup.c +++ b/arch/ia64/sn/kernel/setup.c @@ -3,7 +3,7 @@ * License. See the file "COPYING" in the main directory of this archive * for more details. * - * Copyright (C) 1999,2001-2005 Silicon Graphics, Inc. All rights reserved. + * Copyright (C) 1999,2001-2006 Silicon Graphics, Inc. All rights reserved. */ #include <linux/config.h> @@ -498,6 +498,7 @@ void __init sn_setup(char **cmdline_p) * for sn. */ pm_power_off = ia64_sn_power_down; + current->thread.flags |= IA64_THREAD_MIGRATION; } /** @@ -660,7 +661,8 @@ void __init sn_cpu_init(void) SH2_PIO_WRITE_STATUS_1, SH2_PIO_WRITE_STATUS_3}; u64 *pio; pio = is_shub1() ? pio1 : pio2; - pda->pio_write_status_addr = (volatile unsigned long *) LOCAL_MMR_ADDR(pio[slice]); + pda->pio_write_status_addr = + (volatile unsigned long *)GLOBAL_MMR_ADDR(nasid, pio[slice]); pda->pio_write_status_val = is_shub1() ? SH_PIO_WRITE_STATUS_PENDING_WRITE_COUNT_MASK : 0; } diff --git a/arch/ia64/sn/kernel/sn2/sn2_smp.c b/arch/ia64/sn/kernel/sn2/sn2_smp.c index b2e1e746b47f..d9d306c79f2d 100644 --- a/arch/ia64/sn/kernel/sn2/sn2_smp.c +++ b/arch/ia64/sn/kernel/sn2/sn2_smp.c @@ -93,6 +93,27 @@ static inline unsigned long wait_piowc(void) return (ws & SH_PIO_WRITE_STATUS_WRITE_DEADLOCK_MASK) != 0; } +/** + * sn_migrate - SN-specific task migration actions + * @task: Task being migrated to new CPU + * + * SN2 PIO writes from separate CPUs are not guaranteed to arrive in order. + * Context switching user threads which have memory-mapped MMIO may cause + * PIOs to issue from seperate CPUs, thus the PIO writes must be drained + * from the previous CPU's Shub before execution resumes on the new CPU. + */ +void sn_migrate(struct task_struct *task) +{ + pda_t *last_pda = pdacpu(task_thread_info(task)->last_cpu); + volatile unsigned long *adr = last_pda->pio_write_status_addr; + unsigned long val = last_pda->pio_write_status_val; + + /* Drain PIO writes from old CPU's Shub */ + while (unlikely((*adr & SH_PIO_WRITE_STATUS_PENDING_WRITE_COUNT_MASK) + != val)) + cpu_relax(); +} + void sn_tlb_migrate_finish(struct mm_struct *mm) { /* flush_tlb_mm is inefficient if more than 1 users of mm */ diff --git a/arch/ia64/sn/kernel/xpc_channel.c b/arch/ia64/sn/kernel/xpc_channel.c index cdf6856ce089..d0abddd9ffe6 100644 --- a/arch/ia64/sn/kernel/xpc_channel.c +++ b/arch/ia64/sn/kernel/xpc_channel.c @@ -21,7 +21,6 @@ #include <linux/sched.h> #include <linux/cache.h> #include <linux/interrupt.h> -#include <linux/slab.h> #include <linux/mutex.h> #include <linux/completion.h> #include <asm/sn/bte.h> @@ -30,6 +29,31 @@ /* + * Guarantee that the kzalloc'd memory is cacheline aligned. + */ +static void * +xpc_kzalloc_cacheline_aligned(size_t size, gfp_t flags, void **base) +{ + /* see if kzalloc will give us cachline aligned memory by default */ + *base = kzalloc(size, flags); + if (*base == NULL) { + return NULL; + } + if ((u64) *base == L1_CACHE_ALIGN((u64) *base)) { + return *base; + } + kfree(*base); + + /* nope, we'll have to do it ourselves */ + *base = kzalloc(size + L1_CACHE_BYTES, flags); + if (*base == NULL) { + return NULL; + } + return (void *) L1_CACHE_ALIGN((u64) *base); +} + + +/* * Set up the initial values for the XPartition Communication channels. */ static void @@ -93,20 +117,19 @@ xpc_setup_infrastructure(struct xpc_partition *part) * Allocate all of the channel structures as a contiguous chunk of * memory. */ - part->channels = kmalloc(sizeof(struct xpc_channel) * XPC_NCHANNELS, + part->channels = kzalloc(sizeof(struct xpc_channel) * XPC_NCHANNELS, GFP_KERNEL); if (part->channels == NULL) { dev_err(xpc_chan, "can't get memory for channels\n"); return xpcNoMemory; } - memset(part->channels, 0, sizeof(struct xpc_channel) * XPC_NCHANNELS); part->nchannels = XPC_NCHANNELS; /* allocate all the required GET/PUT values */ - part->local_GPs = xpc_kmalloc_cacheline_aligned(XPC_GP_SIZE, + part->local_GPs = xpc_kzalloc_cacheline_aligned(XPC_GP_SIZE, GFP_KERNEL, &part->local_GPs_base); if (part->local_GPs == NULL) { kfree(part->channels); @@ -115,55 +138,51 @@ xpc_setup_infrastructure(struct xpc_partition *part) "values\n"); return xpcNoMemory; } - memset(part->local_GPs, 0, XPC_GP_SIZE); - part->remote_GPs = xpc_kmalloc_cacheline_aligned(XPC_GP_SIZE, + part->remote_GPs = xpc_kzalloc_cacheline_aligned(XPC_GP_SIZE, GFP_KERNEL, &part->remote_GPs_base); if (part->remote_GPs == NULL) { - kfree(part->channels); - part->channels = NULL; - kfree(part->local_GPs_base); - part->local_GPs = NULL; dev_err(xpc_chan, "can't get memory for remote get/put " "values\n"); + kfree(part->local_GPs_base); + part->local_GPs = NULL; + kfree(part->channels); + part->channels = NULL; return xpcNoMemory; } - memset(part->remote_GPs, 0, XPC_GP_SIZE); /* allocate all the required open and close args */ - part->local_openclose_args = xpc_kmalloc_cacheline_aligned( + part->local_openclose_args = xpc_kzalloc_cacheline_aligned( XPC_OPENCLOSE_ARGS_SIZE, GFP_KERNEL, &part->local_openclose_args_base); if (part->local_openclose_args == NULL) { - kfree(part->channels); - part->channels = NULL; - kfree(part->local_GPs_base); - part->local_GPs = NULL; + dev_err(xpc_chan, "can't get memory for local connect args\n"); kfree(part->remote_GPs_base); part->remote_GPs = NULL; - dev_err(xpc_chan, "can't get memory for local connect args\n"); + kfree(part->local_GPs_base); + part->local_GPs = NULL; + kfree(part->channels); + part->channels = NULL; return xpcNoMemory; } - memset(part->local_openclose_args, 0, XPC_OPENCLOSE_ARGS_SIZE); - part->remote_openclose_args = xpc_kmalloc_cacheline_aligned( + part->remote_openclose_args = xpc_kzalloc_cacheline_aligned( XPC_OPENCLOSE_ARGS_SIZE, GFP_KERNEL, &part->remote_openclose_args_base); if (part->remote_openclose_args == NULL) { - kfree(part->channels); - part->channels = NULL; - kfree(part->local_GPs_base); - part->local_GPs = NULL; - kfree(part->remote_GPs_base); - part->remote_GPs = NULL; + dev_err(xpc_chan, "can't get memory for remote connect args\n"); kfree(part->local_openclose_args_base); part->local_openclose_args = NULL; - dev_err(xpc_chan, "can't get memory for remote connect args\n"); + kfree(part->remote_GPs_base); + part->remote_GPs = NULL; + kfree(part->local_GPs_base); + part->local_GPs = NULL; + kfree(part->channels); + part->channels = NULL; return xpcNoMemory; } - memset(part->remote_openclose_args, 0, XPC_OPENCLOSE_ARGS_SIZE); xpc_initialize_channels(part, partid); @@ -186,18 +205,18 @@ xpc_setup_infrastructure(struct xpc_partition *part) ret = request_irq(SGI_XPC_NOTIFY, xpc_notify_IRQ_handler, SA_SHIRQ, part->IPI_owner, (void *) (u64) partid); if (ret != 0) { - kfree(part->channels); - part->channels = NULL; - kfree(part->local_GPs_base); - part->local_GPs = NULL; - kfree(part->remote_GPs_base); - part->remote_GPs = NULL; - kfree(part->local_openclose_args_base); - part->local_openclose_args = NULL; - kfree(part->remote_openclose_args_base); - part->remote_openclose_args = NULL; dev_err(xpc_chan, "can't register NOTIFY IRQ handler, " "errno=%d\n", -ret); + kfree(part->remote_openclose_args_base); + part->remote_openclose_args = NULL; + kfree(part->local_openclose_args_base); + part->local_openclose_args = NULL; + kfree(part->remote_GPs_base); + part->remote_GPs = NULL; + kfree(part->local_GPs_base); + part->local_GPs = NULL; + kfree(part->channels); + part->channels = NULL; return xpcLackOfResources; } @@ -446,22 +465,20 @@ xpc_allocate_local_msgqueue(struct xpc_channel *ch) for (nentries = ch->local_nentries; nentries > 0; nentries--) { nbytes = nentries * ch->msg_size; - ch->local_msgqueue = xpc_kmalloc_cacheline_aligned(nbytes, + ch->local_msgqueue = xpc_kzalloc_cacheline_aligned(nbytes, GFP_KERNEL, &ch->local_msgqueue_base); if (ch->local_msgqueue == NULL) { continue; } - memset(ch->local_msgqueue, 0, nbytes); nbytes = nentries * sizeof(struct xpc_notify); - ch->notify_queue = kmalloc(nbytes, GFP_KERNEL); + ch->notify_queue = kzalloc(nbytes, GFP_KERNEL); if (ch->notify_queue == NULL) { kfree(ch->local_msgqueue_base); ch->local_msgqueue = NULL; continue; } - memset(ch->notify_queue, 0, nbytes); spin_lock_irqsave(&ch->lock, irq_flags); if (nentries < ch->local_nentries) { @@ -501,13 +518,12 @@ xpc_allocate_remote_msgqueue(struct xpc_channel *ch) for (nentries = ch->remote_nentries; nentries > 0; nentries--) { nbytes = nentries * ch->msg_size; - ch->remote_msgqueue = xpc_kmalloc_cacheline_aligned(nbytes, + ch->remote_msgqueue = xpc_kzalloc_cacheline_aligned(nbytes, GFP_KERNEL, &ch->remote_msgqueue_base); if (ch->remote_msgqueue == NULL) { continue; } - memset(ch->remote_msgqueue, 0, nbytes); spin_lock_irqsave(&ch->lock, irq_flags); if (nentries < ch->remote_nentries) { diff --git a/arch/ia64/sn/kernel/xpc_main.c b/arch/ia64/sn/kernel/xpc_main.c index 8cbf16432570..99b123a6421a 100644 --- a/arch/ia64/sn/kernel/xpc_main.c +++ b/arch/ia64/sn/kernel/xpc_main.c @@ -52,7 +52,6 @@ #include <linux/syscalls.h> #include <linux/cache.h> #include <linux/interrupt.h> -#include <linux/slab.h> #include <linux/delay.h> #include <linux/reboot.h> #include <linux/completion.h> diff --git a/arch/ia64/sn/kernel/xpc_partition.c b/arch/ia64/sn/kernel/xpc_partition.c index 88a730e6cfdb..94211429fd0c 100644 --- a/arch/ia64/sn/kernel/xpc_partition.c +++ b/arch/ia64/sn/kernel/xpc_partition.c @@ -81,6 +81,31 @@ char ____cacheline_aligned xpc_remote_copy_buffer[XPC_RP_HEADER_SIZE + /* + * Guarantee that the kmalloc'd memory is cacheline aligned. + */ +static void * +xpc_kmalloc_cacheline_aligned(size_t size, gfp_t flags, void **base) +{ + /* see if kmalloc will give us cachline aligned memory by default */ + *base = kmalloc(size, flags); + if (*base == NULL) { + return NULL; + } + if ((u64) *base == L1_CACHE_ALIGN((u64) *base)) { + return *base; + } + kfree(*base); + + /* nope, we'll have to do it ourselves */ + *base = kmalloc(size + L1_CACHE_BYTES, flags); + if (*base == NULL) { + return NULL; + } + return (void *) L1_CACHE_ALIGN((u64) *base); +} + + +/* * Given a nasid, get the physical address of the partition's reserved page * for that nasid. This function returns 0 on any error. */ @@ -1038,13 +1063,12 @@ xpc_discovery(void) remote_vars = (struct xpc_vars *) remote_rp; - discovered_nasids = kmalloc(sizeof(u64) * xp_nasid_mask_words, + discovered_nasids = kzalloc(sizeof(u64) * xp_nasid_mask_words, GFP_KERNEL); if (discovered_nasids == NULL) { kfree(remote_rp_base); return; } - memset(discovered_nasids, 0, sizeof(u64) * xp_nasid_mask_words); rp = (struct xpc_rsvd_page *) xpc_rsvd_page; diff --git a/arch/ia64/sn/pci/tioce_provider.c b/arch/ia64/sn/pci/tioce_provider.c index e52831ed93eb..fa073cc4b565 100644 --- a/arch/ia64/sn/pci/tioce_provider.c +++ b/arch/ia64/sn/pci/tioce_provider.c @@ -15,6 +15,124 @@ #include <asm/sn/pcidev.h> #include <asm/sn/pcibus_provider_defs.h> #include <asm/sn/tioce_provider.h> +#include <asm/sn/sn2/sn_hwperf.h> + +/* + * 1/26/2006 + * + * WAR for SGI PV 944642. For revA TIOCE, need to use the following recipe + * (taken from the above PV) before and after accessing tioce internal MMR's + * to avoid tioce lockups. + * + * The recipe as taken from the PV: + * + * if(mmr address < 0x45000) { + * if(mmr address == 0 or 0x80) + * mmr wrt or read address 0xc0 + * else if(mmr address == 0x148 or 0x200) + * mmr wrt or read address 0x28 + * else + * mmr wrt or read address 0x158 + * + * do desired mmr access (rd or wrt) + * + * if(mmr address == 0x100) + * mmr wrt or read address 0x38 + * mmr wrt or read address 0xb050 + * } else + * do desired mmr access + * + * According to hw, we can use reads instead of writes to the above addres + * + * Note this WAR can only to be used for accessing internal MMR's in the + * TIOCE Coretalk Address Range 0x0 - 0x07ff_ffff. This includes the + * "Local CE Registers and Memories" and "PCI Compatible Config Space" address + * spaces from table 2-1 of the "CE Programmer's Reference Overview" document. + * + * All registers defined in struct tioce will meet that criteria. + */ + +static void inline +tioce_mmr_war_pre(struct tioce_kernel *kern, void *mmr_addr) +{ + u64 mmr_base; + u64 mmr_offset; + + if (kern->ce_common->ce_rev != TIOCE_REV_A) + return; + + mmr_base = kern->ce_common->ce_pcibus.bs_base; + mmr_offset = (u64)mmr_addr - mmr_base; + + if (mmr_offset < 0x45000) { + u64 mmr_war_offset; + + if (mmr_offset == 0 || mmr_offset == 0x80) + mmr_war_offset = 0xc0; + else if (mmr_offset == 0x148 || mmr_offset == 0x200) + mmr_war_offset = 0x28; + else + mmr_war_offset = 0x158; + + readq_relaxed((void *)(mmr_base + mmr_war_offset)); + } +} + +static void inline +tioce_mmr_war_post(struct tioce_kernel *kern, void *mmr_addr) +{ + u64 mmr_base; + u64 mmr_offset; + + if (kern->ce_common->ce_rev != TIOCE_REV_A) + return; + + mmr_base = kern->ce_common->ce_pcibus.bs_base; + mmr_offset = (u64)mmr_addr - mmr_base; + + if (mmr_offset < 0x45000) { + if (mmr_offset == 0x100) + readq_relaxed((void *)(mmr_base + 0x38)); + readq_relaxed((void *)(mmr_base + 0xb050)); + } +} + +/* load mmr contents into a variable */ +#define tioce_mmr_load(kern, mmrp, varp) do {\ + tioce_mmr_war_pre(kern, mmrp); \ + *(varp) = readq_relaxed(mmrp); \ + tioce_mmr_war_post(kern, mmrp); \ +} while (0) + +/* store variable contents into mmr */ +#define tioce_mmr_store(kern, mmrp, varp) do {\ + tioce_mmr_war_pre(kern, mmrp); \ + writeq(*varp, mmrp); \ + tioce_mmr_war_post(kern, mmrp); \ +} while (0) + +/* store immediate value into mmr */ +#define tioce_mmr_storei(kern, mmrp, val) do {\ + tioce_mmr_war_pre(kern, mmrp); \ + writeq(val, mmrp); \ + tioce_mmr_war_post(kern, mmrp); \ +} while (0) + +/* set bits (immediate value) into mmr */ +#define tioce_mmr_seti(kern, mmrp, bits) do {\ + u64 tmp; \ + tioce_mmr_load(kern, mmrp, &tmp); \ + tmp |= (bits); \ + tioce_mmr_store(kern, mmrp, &tmp); \ +} while (0) + +/* clear bits (immediate value) into mmr */ +#define tioce_mmr_clri(kern, mmrp, bits) do { \ + u64 tmp; \ + tioce_mmr_load(kern, mmrp, &tmp); \ + tmp &= ~(bits); \ + tioce_mmr_store(kern, mmrp, &tmp); \ +} while (0) /** * Bus address ranges for the 5 flavors of TIOCE DMA @@ -62,9 +180,9 @@ #define TIOCE_ATE_M40 2 #define TIOCE_ATE_M40S 3 -#define KB(x) ((x) << 10) -#define MB(x) ((x) << 20) -#define GB(x) ((x) << 30) +#define KB(x) ((u64)(x) << 10) +#define MB(x) ((u64)(x) << 20) +#define GB(x) ((u64)(x) << 30) /** * tioce_dma_d64 - create a DMA mapping using 64-bit direct mode @@ -151,7 +269,7 @@ tioce_alloc_map(struct tioce_kernel *ce_kern, int type, int port, int last; int entries; int nates; - int pagesize; + u64 pagesize; u64 *ate_shadow; u64 *ate_reg; u64 addr; @@ -228,7 +346,7 @@ tioce_alloc_map(struct tioce_kernel *ce_kern, int type, int port, ate = ATE_MAKE(addr, pagesize); ate_shadow[i + j] = ate; - writeq(ate, &ate_reg[i + j]); + tioce_mmr_storei(ce_kern, &ate_reg[i + j], ate); addr += pagesize; } @@ -272,7 +390,8 @@ tioce_dma_d32(struct pci_dev *pdev, u64 ct_addr) u64 tmp; ce_kern->ce_port[port].dirmap_shadow = ct_upper; - writeq(ct_upper, &ce_mmr->ce_ure_dir_map[port]); + tioce_mmr_storei(ce_kern, &ce_mmr->ce_ure_dir_map[port], + ct_upper); tmp = ce_mmr->ce_ure_dir_map[port]; dma_ok = 1; } else @@ -344,7 +463,8 @@ tioce_dma_unmap(struct pci_dev *pdev, dma_addr_t bus_addr, int dir) if (TIOCE_D32_ADDR(bus_addr)) { if (--ce_kern->ce_port[port].dirmap_refcnt == 0) { ce_kern->ce_port[port].dirmap_shadow = 0; - writeq(0, &ce_mmr->ce_ure_dir_map[port]); + tioce_mmr_storei(ce_kern, &ce_mmr->ce_ure_dir_map[port], + 0); } } else { struct tioce_dmamap *map; @@ -365,7 +485,7 @@ tioce_dma_unmap(struct pci_dev *pdev, dma_addr_t bus_addr, int dir) } else if (--map->refcnt == 0) { for (i = 0; i < map->ate_count; i++) { map->ate_shadow[i] = 0; - map->ate_hw[i] = 0; + tioce_mmr_storei(ce_kern, &map->ate_hw[i], 0); } list_del(&map->ce_dmamap_list); @@ -486,7 +606,7 @@ tioce_do_dma_map(struct pci_dev *pdev, u64 paddr, size_t byte_count, spin_unlock_irqrestore(&ce_kern->ce_lock, flags); dma_map_done: - if (mapaddr & barrier) + if (mapaddr && barrier) mapaddr = tioce_dma_barrier(mapaddr, 1); return mapaddr; @@ -541,17 +661,61 @@ tioce_error_intr_handler(int irq, void *arg, struct pt_regs *pt) soft->ce_pcibus.bs_persist_segment, soft->ce_pcibus.bs_persist_busnum, 0, 0, 0, 0, 0); + if (ret_stuff.v0) + panic("tioce_error_intr_handler: Fatal TIOCE error"); + return IRQ_HANDLED; } /** + * tioce_reserve_m32 - reserve M32 ate's for the indicated address range + * @tioce_kernel: TIOCE context to reserve ate's for + * @base: starting bus address to reserve + * @limit: last bus address to reserve + * + * If base/limit falls within the range of bus space mapped through the + * M32 space, reserve the resources corresponding to the range. + */ +static void +tioce_reserve_m32(struct tioce_kernel *ce_kern, u64 base, u64 limit) +{ + int ate_index, last_ate, ps; + struct tioce *ce_mmr; + + if (!TIOCE_M32_ADDR(base)) + return; + + ce_mmr = (struct tioce *)ce_kern->ce_common->ce_pcibus.bs_base; + ps = ce_kern->ce_ate3240_pagesize; + ate_index = ATE_PAGE(base, ps); + last_ate = ate_index + ATE_NPAGES(base, limit-base+1, ps) - 1; + + if (ate_index < 64) + ate_index = 64; + + while (ate_index <= last_ate) { + u64 ate; + + ate = ATE_MAKE(0xdeadbeef, ps); + ce_kern->ce_ate3240_shadow[ate_index] = ate; + tioce_mmr_storei(ce_kern, &ce_mmr->ce_ure_ate3240[ate_index], + ate); + ate_index++; + } +} + +/** * tioce_kern_init - init kernel structures related to a given TIOCE * @tioce_common: ptr to a cached tioce_common struct that originated in prom - */ static struct tioce_kernel * + */ +static struct tioce_kernel * tioce_kern_init(struct tioce_common *tioce_common) { int i; + int ps; + int dev; u32 tmp; + unsigned int seg, bus; struct tioce *tioce_mmr; struct tioce_kernel *tioce_kern; @@ -572,9 +736,10 @@ tioce_kern_init(struct tioce_common *tioce_common) * here to use pci_read_config_xxx() so use the raw_pci_ops vector. */ - raw_pci_ops->read(tioce_common->ce_pcibus.bs_persist_segment, - tioce_common->ce_pcibus.bs_persist_busnum, - PCI_DEVFN(2, 0), PCI_SECONDARY_BUS, 1, &tmp); + seg = tioce_common->ce_pcibus.bs_persist_segment; + bus = tioce_common->ce_pcibus.bs_persist_busnum; + + raw_pci_ops->read(seg, bus, PCI_DEVFN(2, 0), PCI_SECONDARY_BUS, 1,&tmp); tioce_kern->ce_port1_secondary = (u8) tmp; /* @@ -583,18 +748,76 @@ tioce_kern_init(struct tioce_common *tioce_common) */ tioce_mmr = (struct tioce *)tioce_common->ce_pcibus.bs_base; - __sn_clrq_relaxed(&tioce_mmr->ce_ure_page_map, CE_URE_PAGESIZE_MASK); - __sn_setq_relaxed(&tioce_mmr->ce_ure_page_map, CE_URE_256K_PAGESIZE); - tioce_kern->ce_ate3240_pagesize = KB(256); + tioce_mmr_clri(tioce_kern, &tioce_mmr->ce_ure_page_map, + CE_URE_PAGESIZE_MASK); + tioce_mmr_seti(tioce_kern, &tioce_mmr->ce_ure_page_map, + CE_URE_256K_PAGESIZE); + ps = tioce_kern->ce_ate3240_pagesize = KB(256); for (i = 0; i < TIOCE_NUM_M40_ATES; i++) { tioce_kern->ce_ate40_shadow[i] = 0; - writeq(0, &tioce_mmr->ce_ure_ate40[i]); + tioce_mmr_storei(tioce_kern, &tioce_mmr->ce_ure_ate40[i], 0); } for (i = 0; i < TIOCE_NUM_M3240_ATES; i++) { tioce_kern->ce_ate3240_shadow[i] = 0; - writeq(0, &tioce_mmr->ce_ure_ate3240[i]); + tioce_mmr_storei(tioce_kern, &tioce_mmr->ce_ure_ate3240[i], 0); + } + + /* + * Reserve ATE's corresponding to reserved address ranges. These + * include: + * + * Memory space covered by each PPB mem base/limit register + * Memory space covered by each PPB prefetch base/limit register + * + * These bus ranges are for pio (downstream) traffic only, and so + * cannot be used for DMA. + */ + + for (dev = 1; dev <= 2; dev++) { + u64 base, limit; + + /* mem base/limit */ + + raw_pci_ops->read(seg, bus, PCI_DEVFN(dev, 0), + PCI_MEMORY_BASE, 2, &tmp); + base = (u64)tmp << 16; + + raw_pci_ops->read(seg, bus, PCI_DEVFN(dev, 0), + PCI_MEMORY_LIMIT, 2, &tmp); + limit = (u64)tmp << 16; + limit |= 0xfffffUL; + + if (base < limit) + tioce_reserve_m32(tioce_kern, base, limit); + + /* + * prefetch mem base/limit. The tioce ppb's have 64-bit + * decoders, so read the upper portions w/o checking the + * attributes. + */ + + raw_pci_ops->read(seg, bus, PCI_DEVFN(dev, 0), + PCI_PREF_MEMORY_BASE, 2, &tmp); + base = ((u64)tmp & PCI_PREF_RANGE_MASK) << 16; + + raw_pci_ops->read(seg, bus, PCI_DEVFN(dev, 0), + PCI_PREF_BASE_UPPER32, 4, &tmp); + base |= (u64)tmp << 32; + + raw_pci_ops->read(seg, bus, PCI_DEVFN(dev, 0), + PCI_PREF_MEMORY_LIMIT, 2, &tmp); + + limit = ((u64)tmp & PCI_PREF_RANGE_MASK) << 16; + limit |= 0xfffffUL; + + raw_pci_ops->read(seg, bus, PCI_DEVFN(dev, 0), + PCI_PREF_LIMIT_UPPER32, 4, &tmp); + limit |= (u64)tmp << 32; + + if ((base < limit) && TIOCE_M32_ADDR(base)) + tioce_reserve_m32(tioce_kern, base, limit); } return tioce_kern; @@ -614,6 +837,7 @@ tioce_force_interrupt(struct sn_irq_info *sn_irq_info) { struct pcidev_info *pcidev_info; struct tioce_common *ce_common; + struct tioce_kernel *ce_kern; struct tioce *ce_mmr; u64 force_int_val; @@ -629,6 +853,29 @@ tioce_force_interrupt(struct sn_irq_info *sn_irq_info) ce_common = (struct tioce_common *)pcidev_info->pdi_pcibus_info; ce_mmr = (struct tioce *)ce_common->ce_pcibus.bs_base; + ce_kern = (struct tioce_kernel *)ce_common->ce_kernel_private; + + /* + * TIOCE Rev A workaround (PV 945826), force an interrupt by writing + * the TIO_INTx register directly (1/26/2006) + */ + if (ce_common->ce_rev == TIOCE_REV_A) { + u64 int_bit_mask = (1ULL << sn_irq_info->irq_int_bit); + u64 status; + + tioce_mmr_load(ce_kern, &ce_mmr->ce_adm_int_status, &status); + if (status & int_bit_mask) { + u64 force_irq = (1 << 8) | sn_irq_info->irq_irq; + u64 ctalk = sn_irq_info->irq_xtalkaddr; + u64 nasid, offset; + + nasid = (ctalk & CTALK_NASID_MASK) >> CTALK_NASID_SHFT; + offset = (ctalk & CTALK_NODE_OFFSET); + HUB_S(TIO_IOSPACE_ADDR(nasid, offset), force_irq); + } + + return; + } /* * irq_int_bit is originally set up by prom, and holds the interrupt @@ -666,7 +913,7 @@ tioce_force_interrupt(struct sn_irq_info *sn_irq_info) default: return; } - writeq(force_int_val, &ce_mmr->ce_adm_force_int); + tioce_mmr_storei(ce_kern, &ce_mmr->ce_adm_force_int, force_int_val); } /** @@ -685,6 +932,7 @@ tioce_target_interrupt(struct sn_irq_info *sn_irq_info) { struct pcidev_info *pcidev_info; struct tioce_common *ce_common; + struct tioce_kernel *ce_kern; struct tioce *ce_mmr; int bit; u64 vector; @@ -695,14 +943,15 @@ tioce_target_interrupt(struct sn_irq_info *sn_irq_info) ce_common = (struct tioce_common *)pcidev_info->pdi_pcibus_info; ce_mmr = (struct tioce *)ce_common->ce_pcibus.bs_base; + ce_kern = (struct tioce_kernel *)ce_common->ce_kernel_private; bit = sn_irq_info->irq_int_bit; - __sn_setq_relaxed(&ce_mmr->ce_adm_int_mask, (1UL << bit)); + tioce_mmr_seti(ce_kern, &ce_mmr->ce_adm_int_mask, (1UL << bit)); vector = (u64)sn_irq_info->irq_irq << INTR_VECTOR_SHFT; vector |= sn_irq_info->irq_xtalkaddr; - writeq(vector, &ce_mmr->ce_adm_int_dest[bit]); - __sn_clrq_relaxed(&ce_mmr->ce_adm_int_mask, (1UL << bit)); + tioce_mmr_storei(ce_kern, &ce_mmr->ce_adm_int_dest[bit], vector); + tioce_mmr_clri(ce_kern, &ce_mmr->ce_adm_int_mask, (1UL << bit)); tioce_force_interrupt(sn_irq_info); } @@ -721,7 +970,11 @@ tioce_target_interrupt(struct sn_irq_info *sn_irq_info) static void * tioce_bus_fixup(struct pcibus_bussoft *prom_bussoft, struct pci_controller *controller) { + int my_nasid; + cnodeid_t my_cnode, mem_cnode; struct tioce_common *tioce_common; + struct tioce_kernel *tioce_kern; + struct tioce *tioce_mmr; /* * Allocate kernel bus soft and copy from prom. @@ -734,11 +987,23 @@ tioce_bus_fixup(struct pcibus_bussoft *prom_bussoft, struct pci_controller *cont memcpy(tioce_common, prom_bussoft, sizeof(struct tioce_common)); tioce_common->ce_pcibus.bs_base |= __IA64_UNCACHED_OFFSET; - if (tioce_kern_init(tioce_common) == NULL) { + tioce_kern = tioce_kern_init(tioce_common); + if (tioce_kern == NULL) { kfree(tioce_common); return NULL; } + /* + * Clear out any transient errors before registering the error + * interrupt handler. + */ + + tioce_mmr = (struct tioce *)tioce_common->ce_pcibus.bs_base; + tioce_mmr_seti(tioce_kern, &tioce_mmr->ce_adm_int_status_alias, ~0ULL); + tioce_mmr_seti(tioce_kern, &tioce_mmr->ce_adm_error_summary_alias, + ~0ULL); + tioce_mmr_seti(tioce_kern, &tioce_mmr->ce_dre_comp_err_addr, ~0ULL); + if (request_irq(SGI_PCIASIC_ERROR, tioce_error_intr_handler, SA_SHIRQ, "TIOCE error", (void *)tioce_common)) @@ -750,6 +1015,21 @@ tioce_bus_fixup(struct pcibus_bussoft *prom_bussoft, struct pci_controller *cont tioce_common->ce_pcibus.bs_persist_segment, tioce_common->ce_pcibus.bs_persist_busnum); + /* + * identify closest nasid for memory allocations + */ + + my_nasid = NASID_GET(tioce_common->ce_pcibus.bs_base); + my_cnode = nasid_to_cnodeid(my_nasid); + + if (sn_hwperf_get_nearest_node(my_cnode, &mem_cnode, NULL) < 0) { + printk(KERN_WARNING "tioce_bus_fixup: failed to find " + "closest node with MEM to TIO node %d\n", my_cnode); + mem_cnode = (cnodeid_t)-1; /* use any node */ + } + + controller->node = mem_cnode; + return tioce_common; } diff --git a/arch/m32r/mm/init.c b/arch/m32r/mm/init.c index 6facf15b04f3..c9e7dad860b7 100644 --- a/arch/m32r/mm/init.c +++ b/arch/m32r/mm/init.c @@ -226,7 +226,7 @@ void free_initmem(void) addr = (unsigned long)(&__init_begin); for (; addr < (unsigned long)(&__init_end); addr += PAGE_SIZE) { ClearPageReserved(virt_to_page(addr)); - set_page_count(virt_to_page(addr), 1); + init_page_count(virt_to_page(addr)); free_page(addr); totalram_pages++; } @@ -244,7 +244,7 @@ void free_initrd_mem(unsigned long start, unsigned long end) unsigned long p; for (p = start; p < end; p += PAGE_SIZE) { ClearPageReserved(virt_to_page(p)); - set_page_count(virt_to_page(p), 1); + init_page_count(virt_to_page(p)); free_page(p); totalram_pages++; } diff --git a/arch/m68k/mm/init.c b/arch/m68k/mm/init.c index c45beb955943..a190e39c907a 100644 --- a/arch/m68k/mm/init.c +++ b/arch/m68k/mm/init.c @@ -137,7 +137,7 @@ void free_initrd_mem(unsigned long start, unsigned long end) int pages = 0; for (; start < end; start += PAGE_SIZE) { ClearPageReserved(virt_to_page(start)); - set_page_count(virt_to_page(start), 1); + init_page_count(virt_to_page(start)); free_page(start); totalram_pages++; pages++; diff --git a/arch/m68k/mm/memory.c b/arch/m68k/mm/memory.c index 559942ce0e1e..d6d582a5abb0 100644 --- a/arch/m68k/mm/memory.c +++ b/arch/m68k/mm/memory.c @@ -54,7 +54,7 @@ void __init init_pointer_table(unsigned long ptable) /* unreserve the page so it's possible to free that page */ PD_PAGE(dp)->flags &= ~(1 << PG_reserved); - set_page_count(PD_PAGE(dp), 1); + init_page_count(PD_PAGE(dp)); return; } diff --git a/arch/m68k/mm/motorola.c b/arch/m68k/mm/motorola.c index d855fec26317..afb57eeafdcb 100644 --- a/arch/m68k/mm/motorola.c +++ b/arch/m68k/mm/motorola.c @@ -276,7 +276,7 @@ void free_initmem(void) addr = (unsigned long)&__init_begin; for (; addr < (unsigned long)&__init_end; addr += PAGE_SIZE) { virt_to_page(addr)->flags &= ~(1 << PG_reserved); - set_page_count(virt_to_page(addr), 1); + init_page_count(virt_to_page(addr)); free_page(addr); totalram_pages++; } diff --git a/arch/m68knommu/kernel/m68k_ksyms.c b/arch/m68knommu/kernel/m68k_ksyms.c index eddb8d3e130a..d844c755945a 100644 --- a/arch/m68knommu/kernel/m68k_ksyms.c +++ b/arch/m68knommu/kernel/m68k_ksyms.c @@ -26,6 +26,7 @@ EXPORT_SYMBOL(__ioremap); EXPORT_SYMBOL(iounmap); EXPORT_SYMBOL(dump_fpu); EXPORT_SYMBOL(strnlen); +EXPORT_SYMBOL(strpbrk); EXPORT_SYMBOL(strrchr); EXPORT_SYMBOL(strstr); EXPORT_SYMBOL(strchr); diff --git a/arch/m68knommu/mm/init.c b/arch/m68knommu/mm/init.c index 89f0b554ffb7..d79503fe6e42 100644 --- a/arch/m68knommu/mm/init.c +++ b/arch/m68knommu/mm/init.c @@ -195,7 +195,7 @@ void free_initrd_mem(unsigned long start, unsigned long end) int pages = 0; for (; start < end; start += PAGE_SIZE) { ClearPageReserved(virt_to_page(start)); - set_page_count(virt_to_page(start), 1); + init_page_count(virt_to_page(start)); free_page(start); totalram_pages++; pages++; @@ -218,7 +218,7 @@ free_initmem() /* next to check that the page we free is not a partial page */ for (; addr + PAGE_SIZE < (unsigned long)(&__init_end); addr +=PAGE_SIZE) { ClearPageReserved(virt_to_page(addr)); - set_page_count(virt_to_page(addr), 1); + init_page_count(virt_to_page(addr)); free_page(addr); totalram_pages++; } diff --git a/arch/mips/arc/memory.c b/arch/mips/arc/memory.c index 958d2eb78862..8a9ef58cc399 100644 --- a/arch/mips/arc/memory.c +++ b/arch/mips/arc/memory.c @@ -158,7 +158,7 @@ unsigned long __init prom_free_prom_memory(void) while (addr < boot_mem_map.map[i].addr + boot_mem_map.map[i].size) { ClearPageReserved(virt_to_page(__va(addr))); - set_page_count(virt_to_page(__va(addr)), 1); + init_page_count(virt_to_page(__va(addr))); free_page((unsigned long)__va(addr)); addr += PAGE_SIZE; freed += PAGE_SIZE; diff --git a/arch/mips/dec/prom/memory.c b/arch/mips/dec/prom/memory.c index 81cb5a76cfb7..1edaf3074ee9 100644 --- a/arch/mips/dec/prom/memory.c +++ b/arch/mips/dec/prom/memory.c @@ -118,7 +118,7 @@ unsigned long __init prom_free_prom_memory(void) addr = PAGE_SIZE; while (addr < end) { ClearPageReserved(virt_to_page(__va(addr))); - set_page_count(virt_to_page(__va(addr)), 1); + init_page_count(virt_to_page(__va(addr))); free_page((unsigned long)__va(addr)); addr += PAGE_SIZE; } diff --git a/arch/mips/mips-boards/generic/memory.c b/arch/mips/mips-boards/generic/memory.c index 2c8afd77a20b..ee5e70c95cf3 100644 --- a/arch/mips/mips-boards/generic/memory.c +++ b/arch/mips/mips-boards/generic/memory.c @@ -174,7 +174,7 @@ unsigned long __init prom_free_prom_memory(void) while (addr < boot_mem_map.map[i].addr + boot_mem_map.map[i].size) { ClearPageReserved(virt_to_page(__va(addr))); - set_page_count(virt_to_page(__va(addr)), 1); + init_page_count(virt_to_page(__va(addr))); free_page((unsigned long)__va(addr)); addr += PAGE_SIZE; freed += PAGE_SIZE; diff --git a/arch/mips/mips-boards/sim/sim_mem.c b/arch/mips/mips-boards/sim/sim_mem.c index 0dbd7435bb2a..1ec4e75656bd 100644 --- a/arch/mips/mips-boards/sim/sim_mem.c +++ b/arch/mips/mips-boards/sim/sim_mem.c @@ -117,7 +117,7 @@ unsigned long __init prom_free_prom_memory(void) while (addr < boot_mem_map.map[i].addr + boot_mem_map.map[i].size) { ClearPageReserved(virt_to_page(__va(addr))); - set_page_count(virt_to_page(__va(addr)), 1); + init_page_count(virt_to_page(__va(addr))); free_page((unsigned long)__va(addr)); addr += PAGE_SIZE; freed += PAGE_SIZE; diff --git a/arch/mips/mm/init.c b/arch/mips/mm/init.c index 0ff9a348b843..52f7d59fe612 100644 --- a/arch/mips/mm/init.c +++ b/arch/mips/mm/init.c @@ -54,7 +54,8 @@ unsigned long empty_zero_page, zero_page_mask; */ unsigned long setup_zero_pages(void) { - unsigned long order, size; + unsigned int order; + unsigned long size; struct page *page; if (cpu_has_vce) @@ -67,9 +68,9 @@ unsigned long setup_zero_pages(void) panic("Oh boy, that early out of memory?"); page = virt_to_page(empty_zero_page); + split_page(page, order); while (page < virt_to_page(empty_zero_page + (PAGE_SIZE << order))) { SetPageReserved(page); - set_page_count(page, 1); page++; } @@ -244,7 +245,7 @@ void __init mem_init(void) #ifdef CONFIG_LIMITED_DMA set_page_address(page, lowmem_page_address(page)); #endif - set_page_count(page, 1); + init_page_count(page); __free_page(page); totalhigh_pages++; } @@ -291,7 +292,7 @@ void free_initrd_mem(unsigned long start, unsigned long end) for (; start < end; start += PAGE_SIZE) { ClearPageReserved(virt_to_page(start)); - set_page_count(virt_to_page(start), 1); + init_page_count(virt_to_page(start)); free_page(start); totalram_pages++; } @@ -314,7 +315,7 @@ void free_initmem(void) page = addr; #endif ClearPageReserved(virt_to_page(page)); - set_page_count(virt_to_page(page), 1); + init_page_count(virt_to_page(page)); free_page(page); totalram_pages++; freed += PAGE_SIZE; diff --git a/arch/mips/sgi-ip27/ip27-memory.c b/arch/mips/sgi-ip27/ip27-memory.c index ed93a9792959..e0d095daa5ed 100644 --- a/arch/mips/sgi-ip27/ip27-memory.c +++ b/arch/mips/sgi-ip27/ip27-memory.c @@ -559,7 +559,7 @@ void __init mem_init(void) /* if (!page_is_ram(pgnr)) continue; */ /* commented out until page_is_ram works */ ClearPageReserved(p); - set_page_count(p, 1); + init_page_count(p); __free_page(p); totalram_pages++; } diff --git a/arch/parisc/mm/init.c b/arch/parisc/mm/init.c index 7847ca13d6c2..852eda3953dc 100644 --- a/arch/parisc/mm/init.c +++ b/arch/parisc/mm/init.c @@ -398,7 +398,7 @@ void free_initmem(void) addr = (unsigned long)(&__init_begin); for (; addr < (unsigned long)(&__init_end); addr += PAGE_SIZE) { ClearPageReserved(virt_to_page(addr)); - set_page_count(virt_to_page(addr), 1); + init_page_count(virt_to_page(addr)); free_page(addr); num_physpages++; totalram_pages++; @@ -1018,7 +1018,7 @@ void free_initrd_mem(unsigned long start, unsigned long end) printk(KERN_INFO "Freeing initrd memory: %ldk freed\n", (end - start) >> 10); for (; start < end; start += PAGE_SIZE) { ClearPageReserved(virt_to_page(start)); - set_page_count(virt_to_page(start), 1); + init_page_count(virt_to_page(start)); free_page(start); num_physpages++; totalram_pages++; diff --git a/arch/powerpc/mm/hugetlbpage.c b/arch/powerpc/mm/hugetlbpage.c index b51bb28c054b..7370f9f33e29 100644 --- a/arch/powerpc/mm/hugetlbpage.c +++ b/arch/powerpc/mm/hugetlbpage.c @@ -133,21 +133,6 @@ pte_t huge_ptep_get_and_clear(struct mm_struct *mm, unsigned long addr, return __pte(old); } -/* - * This function checks for proper alignment of input addr and len parameters. - */ -int is_aligned_hugepage_range(unsigned long addr, unsigned long len) -{ - if (len & ~HPAGE_MASK) - return -EINVAL; - if (addr & ~HPAGE_MASK) - return -EINVAL; - if (! (within_hugepage_low_range(addr, len) - || within_hugepage_high_range(addr, len)) ) - return -EINVAL; - return 0; -} - struct slb_flush_info { struct mm_struct *mm; u16 newareas; diff --git a/arch/powerpc/mm/init_32.c b/arch/powerpc/mm/init_32.c index 7d0d75c11848..b57fb3a2b7bb 100644 --- a/arch/powerpc/mm/init_32.c +++ b/arch/powerpc/mm/init_32.c @@ -216,7 +216,7 @@ static void free_sec(unsigned long start, unsigned long end, const char *name) while (start < end) { ClearPageReserved(virt_to_page(start)); - set_page_count(virt_to_page(start), 1); + init_page_count(virt_to_page(start)); free_page(start); cnt++; start += PAGE_SIZE; @@ -248,7 +248,7 @@ void free_initrd_mem(unsigned long start, unsigned long end) printk ("Freeing initrd memory: %ldk freed\n", (end - start) >> 10); for (; start < end; start += PAGE_SIZE) { ClearPageReserved(virt_to_page(start)); - set_page_count(virt_to_page(start), 1); + init_page_count(virt_to_page(start)); free_page(start); totalram_pages++; } diff --git a/arch/powerpc/mm/init_64.c b/arch/powerpc/mm/init_64.c index 81cfb0c2ec58..bacb71c89811 100644 --- a/arch/powerpc/mm/init_64.c +++ b/arch/powerpc/mm/init_64.c @@ -140,7 +140,7 @@ void free_initmem(void) for (; addr < (unsigned long)__init_end; addr += PAGE_SIZE) { memset((void *)addr, 0xcc, PAGE_SIZE); ClearPageReserved(virt_to_page(addr)); - set_page_count(virt_to_page(addr), 1); + init_page_count(virt_to_page(addr)); free_page(addr); totalram_pages++; } @@ -155,7 +155,7 @@ void free_initrd_mem(unsigned long start, unsigned long end) printk ("Freeing initrd memory: %ldk freed\n", (end - start) >> 10); for (; start < end; start += PAGE_SIZE) { ClearPageReserved(virt_to_page(start)); - set_page_count(virt_to_page(start), 1); + init_page_count(virt_to_page(start)); free_page(start); totalram_pages++; } diff --git a/arch/powerpc/mm/mem.c b/arch/powerpc/mm/mem.c index 550517c2dd42..454cac01d8cc 100644 --- a/arch/powerpc/mm/mem.c +++ b/arch/powerpc/mm/mem.c @@ -108,8 +108,8 @@ EXPORT_SYMBOL(phys_mem_access_prot); void online_page(struct page *page) { ClearPageReserved(page); - set_page_count(page, 0); - free_cold_page(page); + init_page_count(page); + __free_page(page); totalram_pages++; num_physpages++; } @@ -376,7 +376,7 @@ void __init mem_init(void) struct page *page = pfn_to_page(pfn); ClearPageReserved(page); - set_page_count(page, 1); + init_page_count(page); __free_page(page); totalhigh_pages++; } diff --git a/arch/powerpc/platforms/cell/setup.c b/arch/powerpc/platforms/cell/setup.c index b33a4443f5a9..fec8e65b36ea 100644 --- a/arch/powerpc/platforms/cell/setup.c +++ b/arch/powerpc/platforms/cell/setup.c @@ -115,7 +115,7 @@ static void __init cell_spuprop_present(struct device_node *spe, for (pfn = start_pfn; pfn < end_pfn; pfn++) { struct page *page = pfn_to_page(pfn); set_page_links(page, ZONE_DMA, node_id, pfn); - set_page_count(page, 1); + init_page_count(page); reset_page_mapcount(page); SetPageReserved(page); INIT_LIST_HEAD(&page->lru); diff --git a/arch/ppc/kernel/dma-mapping.c b/arch/ppc/kernel/dma-mapping.c index 685fd0defe23..61465ec88bc7 100644 --- a/arch/ppc/kernel/dma-mapping.c +++ b/arch/ppc/kernel/dma-mapping.c @@ -223,6 +223,8 @@ __dma_alloc_coherent(size_t size, dma_addr_t *handle, gfp_t gfp) pte_t *pte = consistent_pte + CONSISTENT_OFFSET(vaddr); struct page *end = page + (1 << order); + split_page(page, order); + /* * Set the "dma handle" */ @@ -231,7 +233,6 @@ __dma_alloc_coherent(size_t size, dma_addr_t *handle, gfp_t gfp) do { BUG_ON(!pte_none(*pte)); - set_page_count(page, 1); SetPageReserved(page); set_pte_at(&init_mm, vaddr, pte, mk_pte(page, pgprot_noncached(PAGE_KERNEL))); @@ -244,7 +245,6 @@ __dma_alloc_coherent(size_t size, dma_addr_t *handle, gfp_t gfp) * Free the otherwise unused pages. */ while (page < end) { - set_page_count(page, 1); __free_page(page); page++; } diff --git a/arch/ppc/mm/init.c b/arch/ppc/mm/init.c index 134db5c04203..cb1c294fb932 100644 --- a/arch/ppc/mm/init.c +++ b/arch/ppc/mm/init.c @@ -140,7 +140,7 @@ static void free_sec(unsigned long start, unsigned long end, const char *name) while (start < end) { ClearPageReserved(virt_to_page(start)); - set_page_count(virt_to_page(start), 1); + init_page_count(virt_to_page(start)); free_page(start); cnt++; start += PAGE_SIZE; @@ -172,7 +172,7 @@ void free_initrd_mem(unsigned long start, unsigned long end) for (; start < end; start += PAGE_SIZE) { ClearPageReserved(virt_to_page(start)); - set_page_count(virt_to_page(start), 1); + init_page_count(virt_to_page(start)); free_page(start); totalram_pages++; } @@ -441,7 +441,7 @@ void __init mem_init(void) struct page *page = mem_map + pfn; ClearPageReserved(page); - set_page_count(page, 1); + init_page_count(page); __free_page(page); totalhigh_pages++; } diff --git a/arch/s390/mm/init.c b/arch/s390/mm/init.c index df953383724d..a055894f3bd8 100644 --- a/arch/s390/mm/init.c +++ b/arch/s390/mm/init.c @@ -292,7 +292,7 @@ void free_initmem(void) addr = (unsigned long)(&__init_begin); for (; addr < (unsigned long)(&__init_end); addr += PAGE_SIZE) { ClearPageReserved(virt_to_page(addr)); - set_page_count(virt_to_page(addr), 1); + init_page_count(virt_to_page(addr)); free_page(addr); totalram_pages++; } @@ -307,7 +307,7 @@ void free_initrd_mem(unsigned long start, unsigned long end) printk ("Freeing initrd memory: %ldk freed\n", (end - start) >> 10); for (; start < end; start += PAGE_SIZE) { ClearPageReserved(virt_to_page(start)); - set_page_count(virt_to_page(start), 1); + init_page_count(virt_to_page(start)); free_page(start); totalram_pages++; } diff --git a/arch/sh/mm/consistent.c b/arch/sh/mm/consistent.c index df3a9e452cc5..ee73e30263af 100644 --- a/arch/sh/mm/consistent.c +++ b/arch/sh/mm/consistent.c @@ -23,6 +23,7 @@ void *consistent_alloc(gfp_t gfp, size_t size, dma_addr_t *handle) page = alloc_pages(gfp, order); if (!page) return NULL; + split_page(page, order); ret = page_address(page); *handle = virt_to_phys(ret); @@ -37,8 +38,6 @@ void *consistent_alloc(gfp_t gfp, size_t size, dma_addr_t *handle) end = page + (1 << order); while (++page < end) { - set_page_count(page, 1); - /* Free any unused pages */ if (page >= free) { __free_page(page); diff --git a/arch/sh/mm/hugetlbpage.c b/arch/sh/mm/hugetlbpage.c index 6b7a7688c98e..a3568fd51508 100644 --- a/arch/sh/mm/hugetlbpage.c +++ b/arch/sh/mm/hugetlbpage.c @@ -84,18 +84,6 @@ pte_t huge_ptep_get_and_clear(struct mm_struct *mm, unsigned long addr, return entry; } -/* - * This function checks for proper alignment of input addr and len parameters. - */ -int is_aligned_hugepage_range(unsigned long addr, unsigned long len) -{ - if (len & ~HPAGE_MASK) - return -EINVAL; - if (addr & ~HPAGE_MASK) - return -EINVAL; - return 0; -} - struct page *follow_huge_addr(struct mm_struct *mm, unsigned long address, int write) { diff --git a/arch/sh/mm/init.c b/arch/sh/mm/init.c index e342565f75fb..77b4a838fe10 100644 --- a/arch/sh/mm/init.c +++ b/arch/sh/mm/init.c @@ -273,7 +273,7 @@ void free_initmem(void) addr = (unsigned long)(&__init_begin); for (; addr < (unsigned long)(&__init_end); addr += PAGE_SIZE) { ClearPageReserved(virt_to_page(addr)); - set_page_count(virt_to_page(addr), 1); + init_page_count(virt_to_page(addr)); free_page(addr); totalram_pages++; } @@ -286,7 +286,7 @@ void free_initrd_mem(unsigned long start, unsigned long end) unsigned long p; for (p = start; p < end; p += PAGE_SIZE) { ClearPageReserved(virt_to_page(p)); - set_page_count(virt_to_page(p), 1); + init_page_count(virt_to_page(p)); free_page(p); totalram_pages++; } diff --git a/arch/sh64/mm/hugetlbpage.c b/arch/sh64/mm/hugetlbpage.c index ed6a505b3ee2..3d89f2a6c785 100644 --- a/arch/sh64/mm/hugetlbpage.c +++ b/arch/sh64/mm/hugetlbpage.c @@ -84,18 +84,6 @@ pte_t huge_ptep_get_and_clear(struct mm_struct *mm, unsigned long addr, return entry; } -/* - * This function checks for proper alignment of input addr and len parameters. - */ -int is_aligned_hugepage_range(unsigned long addr, unsigned long len) -{ - if (len & ~HPAGE_MASK) - return -EINVAL; - if (addr & ~HPAGE_MASK) - return -EINVAL; - return 0; -} - struct page *follow_huge_addr(struct mm_struct *mm, unsigned long address, int write) { diff --git a/arch/sh64/mm/init.c b/arch/sh64/mm/init.c index a65e8bb2c3cc..1169757fb38b 100644 --- a/arch/sh64/mm/init.c +++ b/arch/sh64/mm/init.c @@ -173,7 +173,7 @@ void free_initmem(void) addr = (unsigned long)(&__init_begin); for (; addr < (unsigned long)(&__init_end); addr += PAGE_SIZE) { ClearPageReserved(virt_to_page(addr)); - set_page_count(virt_to_page(addr), 1); + init_page_count(virt_to_page(addr)); free_page(addr); totalram_pages++; } @@ -186,7 +186,7 @@ void free_initrd_mem(unsigned long start, unsigned long end) unsigned long p; for (p = start; p < end; p += PAGE_SIZE) { ClearPageReserved(virt_to_page(p)); - set_page_count(virt_to_page(p), 1); + init_page_count(virt_to_page(p)); free_page(p); totalram_pages++; } diff --git a/arch/sparc/kernel/sun4d_smp.c b/arch/sparc/kernel/sun4d_smp.c index 40d426cce824..4219dd2ce3a2 100644 --- a/arch/sparc/kernel/sun4d_smp.c +++ b/arch/sparc/kernel/sun4d_smp.c @@ -266,19 +266,19 @@ void __init smp4d_boot_cpus(void) /* Free unneeded trap tables */ ClearPageReserved(virt_to_page(trapbase_cpu1)); - set_page_count(virt_to_page(trapbase_cpu1), 1); + init_page_count(virt_to_page(trapbase_cpu1)); free_page((unsigned long)trapbase_cpu1); totalram_pages++; num_physpages++; ClearPageReserved(virt_to_page(trapbase_cpu2)); - set_page_count(virt_to_page(trapbase_cpu2), 1); + init_page_count(virt_to_page(trapbase_cpu2)); free_page((unsigned long)trapbase_cpu2); totalram_pages++; num_physpages++; ClearPageReserved(virt_to_page(trapbase_cpu3)); - set_page_count(virt_to_page(trapbase_cpu3), 1); + init_page_count(virt_to_page(trapbase_cpu3)); free_page((unsigned long)trapbase_cpu3); totalram_pages++; num_physpages++; diff --git a/arch/sparc/kernel/sun4m_smp.c b/arch/sparc/kernel/sun4m_smp.c index a21f27d10e55..fbbd8a474c4c 100644 --- a/arch/sparc/kernel/sun4m_smp.c +++ b/arch/sparc/kernel/sun4m_smp.c @@ -233,21 +233,21 @@ void __init smp4m_boot_cpus(void) /* Free unneeded trap tables */ if (!cpu_isset(i, cpu_present_map)) { ClearPageReserved(virt_to_page(trapbase_cpu1)); - set_page_count(virt_to_page(trapbase_cpu1), 1); + init_page_count(virt_to_page(trapbase_cpu1)); free_page((unsigned long)trapbase_cpu1); totalram_pages++; num_physpages++; } if (!cpu_isset(2, cpu_present_map)) { ClearPageReserved(virt_to_page(trapbase_cpu2)); - set_page_count(virt_to_page(trapbase_cpu2), 1); + init_page_count(virt_to_page(trapbase_cpu2)); free_page((unsigned long)trapbase_cpu2); totalram_pages++; num_physpages++; } if (!cpu_isset(3, cpu_present_map)) { ClearPageReserved(virt_to_page(trapbase_cpu3)); - set_page_count(virt_to_page(trapbase_cpu3), 1); + init_page_count(virt_to_page(trapbase_cpu3)); free_page((unsigned long)trapbase_cpu3); totalram_pages++; num_physpages++; diff --git a/arch/sparc/mm/generic.c b/arch/sparc/mm/generic.c index 2cb0728cee05..1ef7fa03fefe 100644 --- a/arch/sparc/mm/generic.c +++ b/arch/sparc/mm/generic.c @@ -76,7 +76,6 @@ int io_remap_pfn_range(struct vm_area_struct *vma, unsigned long from, vma->vm_pgoff = (offset >> PAGE_SHIFT) | ((unsigned long)space << 28UL); - prot = __pgprot(pg_iobits); offset -= from; dir = pgd_offset(mm, from); flush_cache_range(vma, beg, end); diff --git a/arch/sparc/mm/init.c b/arch/sparc/mm/init.c index c03babaa0498..898669732466 100644 --- a/arch/sparc/mm/init.c +++ b/arch/sparc/mm/init.c @@ -383,7 +383,7 @@ void map_high_region(unsigned long start_pfn, unsigned long end_pfn) struct page *page = pfn_to_page(tmp); ClearPageReserved(page); - set_page_count(page, 1); + init_page_count(page); __free_page(page); totalhigh_pages++; } @@ -480,7 +480,7 @@ void free_initmem (void) p = virt_to_page(addr); ClearPageReserved(p); - set_page_count(p, 1); + init_page_count(p); __free_page(p); totalram_pages++; num_physpages++; @@ -497,7 +497,7 @@ void free_initrd_mem(unsigned long start, unsigned long end) struct page *p = virt_to_page(start); ClearPageReserved(p); - set_page_count(p, 1); + init_page_count(p); __free_page(p); num_physpages++; } diff --git a/arch/sparc/mm/loadmmu.c b/arch/sparc/mm/loadmmu.c index e9f9571601ba..36b4d24988f8 100644 --- a/arch/sparc/mm/loadmmu.c +++ b/arch/sparc/mm/loadmmu.c @@ -22,8 +22,6 @@ struct ctx_list *ctx_list_pool; struct ctx_list ctx_free; struct ctx_list ctx_used; -unsigned int pg_iobits; - extern void ld_mmu_sun4c(void); extern void ld_mmu_srmmu(void); diff --git a/arch/sparc/mm/srmmu.c b/arch/sparc/mm/srmmu.c index c664b962987c..27b0e0ba8581 100644 --- a/arch/sparc/mm/srmmu.c +++ b/arch/sparc/mm/srmmu.c @@ -2130,6 +2130,13 @@ static unsigned long srmmu_pte_to_pgoff(pte_t pte) return pte_val(pte) >> SRMMU_PTE_FILE_SHIFT; } +static pgprot_t srmmu_pgprot_noncached(pgprot_t prot) +{ + prot &= ~__pgprot(SRMMU_CACHE); + + return prot; +} + /* Load up routines and constants for sun4m and sun4d mmu */ void __init ld_mmu_srmmu(void) { @@ -2150,9 +2157,9 @@ void __init ld_mmu_srmmu(void) BTFIXUPSET_INT(page_readonly, pgprot_val(SRMMU_PAGE_RDONLY)); BTFIXUPSET_INT(page_kernel, pgprot_val(SRMMU_PAGE_KERNEL)); page_kernel = pgprot_val(SRMMU_PAGE_KERNEL); - pg_iobits = SRMMU_VALID | SRMMU_WRITE | SRMMU_REF; /* Functions */ + BTFIXUPSET_CALL(pgprot_noncached, srmmu_pgprot_noncached, BTFIXUPCALL_NORM); #ifndef CONFIG_SMP BTFIXUPSET_CALL(___xchg32, ___xchg32_sun4md, BTFIXUPCALL_SWAPG1G2); #endif diff --git a/arch/sparc/mm/sun4c.c b/arch/sparc/mm/sun4c.c index 731f19603cad..49f28c1bdc6d 100644 --- a/arch/sparc/mm/sun4c.c +++ b/arch/sparc/mm/sun4c.c @@ -1589,7 +1589,10 @@ static void sun4c_flush_tlb_page(struct vm_area_struct *vma, unsigned long page) static inline void sun4c_mapioaddr(unsigned long physaddr, unsigned long virt_addr) { - unsigned long page_entry; + unsigned long page_entry, pg_iobits; + + pg_iobits = _SUN4C_PAGE_PRESENT | _SUN4C_READABLE | _SUN4C_WRITEABLE | + _SUN4C_PAGE_IO | _SUN4C_PAGE_NOCACHE; page_entry = ((physaddr >> PAGE_SHIFT) & SUN4C_PFN_MASK); page_entry |= ((pg_iobits | _SUN4C_PAGE_PRIV) & ~(_SUN4C_PAGE_PRESENT)); @@ -2134,6 +2137,13 @@ void __init sun4c_paging_init(void) printk("SUN4C: %d mmu entries for the kernel\n", cnt); } +static pgprot_t sun4c_pgprot_noncached(pgprot_t prot) +{ + prot |= __pgprot(_SUN4C_PAGE_IO | _SUN4C_PAGE_NOCACHE); + + return prot; +} + /* Load up routines and constants for sun4c mmu */ void __init ld_mmu_sun4c(void) { @@ -2156,10 +2166,9 @@ void __init ld_mmu_sun4c(void) BTFIXUPSET_INT(page_readonly, pgprot_val(SUN4C_PAGE_READONLY)); BTFIXUPSET_INT(page_kernel, pgprot_val(SUN4C_PAGE_KERNEL)); page_kernel = pgprot_val(SUN4C_PAGE_KERNEL); - pg_iobits = _SUN4C_PAGE_PRESENT | _SUN4C_READABLE | _SUN4C_WRITEABLE | - _SUN4C_PAGE_IO | _SUN4C_PAGE_NOCACHE; /* Functions */ + BTFIXUPSET_CALL(pgprot_noncached, sun4c_pgprot_noncached, BTFIXUPCALL_NORM); BTFIXUPSET_CALL(___xchg32, ___xchg32_sun4c, BTFIXUPCALL_NORM); BTFIXUPSET_CALL(do_check_pgt_cache, sun4c_check_pgt_cache, BTFIXUPCALL_NORM); diff --git a/arch/sparc64/Kconfig b/arch/sparc64/Kconfig index c3685b314d71..267afddf63cf 100644 --- a/arch/sparc64/Kconfig +++ b/arch/sparc64/Kconfig @@ -175,11 +175,11 @@ config HUGETLB_PAGE_SIZE_4MB bool "4MB" config HUGETLB_PAGE_SIZE_512K - depends on !SPARC64_PAGE_SIZE_4MB + depends on !SPARC64_PAGE_SIZE_4MB && !SPARC64_PAGE_SIZE_512KB bool "512K" config HUGETLB_PAGE_SIZE_64K - depends on !SPARC64_PAGE_SIZE_4MB && !SPARC64_PAGE_SIZE_512KB + depends on !SPARC64_PAGE_SIZE_4MB && !SPARC64_PAGE_SIZE_512KB && !SPARC64_PAGE_SIZE_64K bool "64K" endchoice diff --git a/arch/sparc64/kernel/pci.c b/arch/sparc64/kernel/pci.c index 95ffa9418620..dfccff29e182 100644 --- a/arch/sparc64/kernel/pci.c +++ b/arch/sparc64/kernel/pci.c @@ -656,6 +656,7 @@ int pci_mmap_page_range(struct pci_dev *dev, struct vm_area_struct *vma, __pci_mmap_set_flags(dev, vma, mmap_state); __pci_mmap_set_pgprot(dev, vma, mmap_state); + vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot); ret = io_remap_pfn_range(vma, vma->vm_start, vma->vm_pgoff, vma->vm_end - vma->vm_start, @@ -663,7 +664,6 @@ int pci_mmap_page_range(struct pci_dev *dev, struct vm_area_struct *vma, if (ret) return ret; - vma->vm_flags |= VM_IO; return 0; } diff --git a/arch/sparc64/kernel/sun4v_tlb_miss.S b/arch/sparc64/kernel/sun4v_tlb_miss.S index ab23ddb7116e..b731881224e8 100644 --- a/arch/sparc64/kernel/sun4v_tlb_miss.S +++ b/arch/sparc64/kernel/sun4v_tlb_miss.S @@ -29,15 +29,15 @@ * * index_mask = (512 << (tsb_reg & 0x7UL)) - 1UL; * tsb_base = tsb_reg & ~0x7UL; - * tsb_index = ((vaddr >> PAGE_SHIFT) & tsb_mask); + * tsb_index = ((vaddr >> HASH_SHIFT) & tsb_mask); * tsb_ptr = tsb_base + (tsb_index * 16); */ -#define COMPUTE_TSB_PTR(TSB_PTR, VADDR, TMP1, TMP2) \ +#define COMPUTE_TSB_PTR(TSB_PTR, VADDR, HASH_SHIFT, TMP1, TMP2) \ and TSB_PTR, 0x7, TMP1; \ mov 512, TMP2; \ andn TSB_PTR, 0x7, TSB_PTR; \ sllx TMP2, TMP1, TMP2; \ - srlx VADDR, PAGE_SHIFT, TMP1; \ + srlx VADDR, HASH_SHIFT, TMP1; \ sub TMP2, 1, TMP2; \ and TMP1, TMP2, TMP1; \ sllx TMP1, 4, TMP1; \ @@ -53,7 +53,7 @@ sun4v_itlb_miss: LOAD_ITLB_INFO(%g2, %g4, %g5) COMPUTE_TAG_TARGET(%g6, %g4, %g5, kvmap_itlb_4v) - COMPUTE_TSB_PTR(%g1, %g4, %g3, %g7) + COMPUTE_TSB_PTR(%g1, %g4, PAGE_SHIFT, %g3, %g7) /* Load TSB tag/pte into %g2/%g3 and compare the tag. */ ldda [%g1] ASI_QUAD_LDD_PHYS_4V, %g2 @@ -99,7 +99,7 @@ sun4v_dtlb_miss: LOAD_DTLB_INFO(%g2, %g4, %g5) COMPUTE_TAG_TARGET(%g6, %g4, %g5, kvmap_dtlb_4v) - COMPUTE_TSB_PTR(%g1, %g4, %g3, %g7) + COMPUTE_TSB_PTR(%g1, %g4, PAGE_SHIFT, %g3, %g7) /* Load TSB tag/pte into %g2/%g3 and compare the tag. */ ldda [%g1] ASI_QUAD_LDD_PHYS_4V, %g2 @@ -171,21 +171,26 @@ sun4v_dtsb_miss: /* fallthrough */ - /* Create TSB pointer into %g1. This is something like: - * - * index_mask = (512 << (tsb_reg & 0x7UL)) - 1UL; - * tsb_base = tsb_reg & ~0x7UL; - * tsb_index = ((vaddr >> PAGE_SHIFT) & tsb_mask); - * tsb_ptr = tsb_base + (tsb_index * 16); - */ sun4v_tsb_miss_common: - COMPUTE_TSB_PTR(%g1, %g4, %g5, %g7) + COMPUTE_TSB_PTR(%g1, %g4, PAGE_SHIFT, %g5, %g7) - /* Branch directly to page table lookup. We have SCRATCHPAD_MMU_MISS - * still in %g2, so it's quite trivial to get at the PGD PHYS value - * so we can preload it into %g7. - */ sub %g2, TRAP_PER_CPU_FAULT_INFO, %g2 + +#ifdef CONFIG_HUGETLB_PAGE + mov SCRATCHPAD_UTSBREG2, %g5 + ldxa [%g5] ASI_SCRATCHPAD, %g5 + cmp %g5, -1 + be,pt %xcc, 80f + nop + COMPUTE_TSB_PTR(%g5, %g4, HPAGE_SHIFT, %g2, %g7) + + /* That clobbered %g2, reload it. */ + ldxa [%g0] ASI_SCRATCHPAD, %g2 + sub %g2, TRAP_PER_CPU_FAULT_INFO, %g2 + +80: stx %g5, [%g2 + TRAP_PER_CPU_TSB_HUGE_TEMP] +#endif + ba,pt %xcc, tsb_miss_page_table_walk_sun4v_fastpath ldx [%g2 + TRAP_PER_CPU_PGD_PADDR], %g7 diff --git a/arch/sparc64/kernel/traps.c b/arch/sparc64/kernel/traps.c index 7f7dba0ca96a..df612e4f75f9 100644 --- a/arch/sparc64/kernel/traps.c +++ b/arch/sparc64/kernel/traps.c @@ -2482,6 +2482,7 @@ void init_cur_cpu_trap(struct thread_info *t) extern void thread_info_offsets_are_bolixed_dave(void); extern void trap_per_cpu_offsets_are_bolixed_dave(void); +extern void tsb_config_offsets_are_bolixed_dave(void); /* Only invoked on boot processor. */ void __init trap_init(void) @@ -2535,9 +2536,27 @@ void __init trap_init(void) (TRAP_PER_CPU_CPU_MONDO_BLOCK_PA != offsetof(struct trap_per_cpu, cpu_mondo_block_pa)) || (TRAP_PER_CPU_CPU_LIST_PA != - offsetof(struct trap_per_cpu, cpu_list_pa))) + offsetof(struct trap_per_cpu, cpu_list_pa)) || + (TRAP_PER_CPU_TSB_HUGE != + offsetof(struct trap_per_cpu, tsb_huge)) || + (TRAP_PER_CPU_TSB_HUGE_TEMP != + offsetof(struct trap_per_cpu, tsb_huge_temp))) trap_per_cpu_offsets_are_bolixed_dave(); + if ((TSB_CONFIG_TSB != + offsetof(struct tsb_config, tsb)) || + (TSB_CONFIG_RSS_LIMIT != + offsetof(struct tsb_config, tsb_rss_limit)) || + (TSB_CONFIG_NENTRIES != + offsetof(struct tsb_config, tsb_nentries)) || + (TSB_CONFIG_REG_VAL != + offsetof(struct tsb_config, tsb_reg_val)) || + (TSB_CONFIG_MAP_VADDR != + offsetof(struct tsb_config, tsb_map_vaddr)) || + (TSB_CONFIG_MAP_PTE != + offsetof(struct tsb_config, tsb_map_pte))) + tsb_config_offsets_are_bolixed_dave(); + /* Attach to the address space of init_task. On SMP we * do this in smp.c:smp_callin for other cpus. */ diff --git a/arch/sparc64/kernel/tsb.S b/arch/sparc64/kernel/tsb.S index 118baea44f69..a0c8ba58920b 100644 --- a/arch/sparc64/kernel/tsb.S +++ b/arch/sparc64/kernel/tsb.S @@ -3,8 +3,13 @@ * Copyright (C) 2006 David S. Miller <davem@davemloft.net> */ +#include <linux/config.h> + #include <asm/tsb.h> #include <asm/hypervisor.h> +#include <asm/page.h> +#include <asm/cpudata.h> +#include <asm/mmu.h> .text .align 32 @@ -34,34 +39,124 @@ tsb_miss_itlb: ldxa [%g4] ASI_IMMU, %g4 /* At this point we have: - * %g1 -- TSB entry address + * %g1 -- PAGE_SIZE TSB entry address * %g3 -- FAULT_CODE_{D,I}TLB * %g4 -- missing virtual address * %g6 -- TAG TARGET (vaddr >> 22) */ tsb_miss_page_table_walk: - TRAP_LOAD_PGD_PHYS(%g7, %g5) + TRAP_LOAD_TRAP_BLOCK(%g7, %g5) - /* And now we have the PGD base physical address in %g7. */ -tsb_miss_page_table_walk_sun4v_fastpath: - USER_PGTABLE_WALK_TL1(%g4, %g7, %g5, %g2, tsb_do_fault) + /* Before committing to a full page table walk, + * check the huge page TSB. + */ +#ifdef CONFIG_HUGETLB_PAGE + +661: ldx [%g7 + TRAP_PER_CPU_TSB_HUGE], %g5 + nop + .section .sun4v_2insn_patch, "ax" + .word 661b + mov SCRATCHPAD_UTSBREG2, %g5 + ldxa [%g5] ASI_SCRATCHPAD, %g5 + .previous + + cmp %g5, -1 + be,pt %xcc, 80f + nop + + /* We need an aligned pair of registers containing 2 values + * which can be easily rematerialized. %g6 and %g7 foot the + * bill just nicely. We'll save %g6 away into %g2 for the + * huge page TSB TAG comparison. + * + * Perform a huge page TSB lookup. + */ + mov %g6, %g2 + and %g5, 0x7, %g6 + mov 512, %g7 + andn %g5, 0x7, %g5 + sllx %g7, %g6, %g7 + srlx %g4, HPAGE_SHIFT, %g6 + sub %g7, 1, %g7 + and %g6, %g7, %g6 + sllx %g6, 4, %g6 + add %g5, %g6, %g5 + + TSB_LOAD_QUAD(%g5, %g6) + cmp %g6, %g2 + be,a,pt %xcc, tsb_tlb_reload + mov %g7, %g5 + + /* No match, remember the huge page TSB entry address, + * and restore %g6 and %g7. + */ + TRAP_LOAD_TRAP_BLOCK(%g7, %g6) + srlx %g4, 22, %g6 +80: stx %g5, [%g7 + TRAP_PER_CPU_TSB_HUGE_TEMP] + +#endif + + ldx [%g7 + TRAP_PER_CPU_PGD_PADDR], %g7 /* At this point we have: * %g1 -- TSB entry address * %g3 -- FAULT_CODE_{D,I}TLB - * %g5 -- physical address of PTE in Linux page tables + * %g4 -- missing virtual address * %g6 -- TAG TARGET (vaddr >> 22) + * %g7 -- page table physical address + * + * We know that both the base PAGE_SIZE TSB and the HPAGE_SIZE + * TSB both lack a matching entry. */ -tsb_reload: - TSB_LOCK_TAG(%g1, %g2, %g7) +tsb_miss_page_table_walk_sun4v_fastpath: + USER_PGTABLE_WALK_TL1(%g4, %g7, %g5, %g2, tsb_do_fault) /* Load and check PTE. */ ldxa [%g5] ASI_PHYS_USE_EC, %g5 - mov 1, %g7 - sllx %g7, TSB_TAG_INVALID_BIT, %g7 - brgez,a,pn %g5, tsb_do_fault - TSB_STORE(%g1, %g7) + brgez,pn %g5, tsb_do_fault + nop + +#ifdef CONFIG_HUGETLB_PAGE +661: sethi %uhi(_PAGE_SZALL_4U), %g7 + sllx %g7, 32, %g7 + .section .sun4v_2insn_patch, "ax" + .word 661b + mov _PAGE_SZALL_4V, %g7 + nop + .previous + + and %g5, %g7, %g2 + +661: sethi %uhi(_PAGE_SZHUGE_4U), %g7 + sllx %g7, 32, %g7 + .section .sun4v_2insn_patch, "ax" + .word 661b + mov _PAGE_SZHUGE_4V, %g7 + nop + .previous + + cmp %g2, %g7 + bne,pt %xcc, 60f + nop + + /* It is a huge page, use huge page TSB entry address we + * calculated above. + */ + TRAP_LOAD_TRAP_BLOCK(%g7, %g2) + ldx [%g7 + TRAP_PER_CPU_TSB_HUGE_TEMP], %g2 + cmp %g2, -1 + movne %xcc, %g2, %g1 +60: +#endif + /* At this point we have: + * %g1 -- TSB entry address + * %g3 -- FAULT_CODE_{D,I}TLB + * %g5 -- valid PTE + * %g6 -- TAG TARGET (vaddr >> 22) + */ +tsb_reload: + TSB_LOCK_TAG(%g1, %g2, %g7) TSB_WRITE(%g1, %g5, %g6) /* Finally, load TLB and return from trap. */ @@ -240,10 +335,9 @@ tsb_flush: * schedule() time. * * %o0: page table physical address - * %o1: TSB register value - * %o2: TSB virtual address - * %o3: TSB mapping locked PTE - * %o4: Hypervisor TSB descriptor physical address + * %o1: TSB base config pointer + * %o2: TSB huge config pointer, or NULL if none + * %o3: Hypervisor TSB descriptor physical address * * We have to run this whole thing with interrupts * disabled so that the current cpu doesn't change @@ -253,63 +347,79 @@ tsb_flush: .globl __tsb_context_switch .type __tsb_context_switch,#function __tsb_context_switch: - rdpr %pstate, %o5 - wrpr %o5, PSTATE_IE, %pstate + rdpr %pstate, %g1 + wrpr %g1, PSTATE_IE, %pstate + + TRAP_LOAD_TRAP_BLOCK(%g2, %g3) - ldub [%g6 + TI_CPU], %g1 - sethi %hi(trap_block), %g2 - sllx %g1, TRAP_BLOCK_SZ_SHIFT, %g1 - or %g2, %lo(trap_block), %g2 - add %g2, %g1, %g2 stx %o0, [%g2 + TRAP_PER_CPU_PGD_PADDR] - sethi %hi(tlb_type), %g1 - lduw [%g1 + %lo(tlb_type)], %g1 - cmp %g1, 3 - bne,pt %icc, 1f + ldx [%o1 + TSB_CONFIG_REG_VAL], %o0 + brz,pt %o2, 1f + mov -1, %g3 + + ldx [%o2 + TSB_CONFIG_REG_VAL], %g3 + +1: stx %g3, [%g2 + TRAP_PER_CPU_TSB_HUGE] + + sethi %hi(tlb_type), %g2 + lduw [%g2 + %lo(tlb_type)], %g2 + cmp %g2, 3 + bne,pt %icc, 50f nop /* Hypervisor TSB switch. */ - mov SCRATCHPAD_UTSBREG1, %g1 - stxa %o1, [%g1] ASI_SCRATCHPAD - mov -1, %g2 - mov SCRATCHPAD_UTSBREG2, %g1 - stxa %g2, [%g1] ASI_SCRATCHPAD - - /* Save away %o5's %pstate, we have to use %o5 for - * the hypervisor call. - */ - mov %o5, %g1 + mov SCRATCHPAD_UTSBREG1, %o5 + stxa %o0, [%o5] ASI_SCRATCHPAD + mov SCRATCHPAD_UTSBREG2, %o5 + stxa %g3, [%o5] ASI_SCRATCHPAD + + mov 2, %o0 + cmp %g3, -1 + move %xcc, 1, %o0 mov HV_FAST_MMU_TSB_CTXNON0, %o5 - mov 1, %o0 - mov %o4, %o1 + mov %o3, %o1 ta HV_FAST_TRAP - /* Finish up and restore %o5. */ + /* Finish up. */ ba,pt %xcc, 9f - mov %g1, %o5 + nop /* SUN4U TSB switch. */ -1: mov TSB_REG, %g1 - stxa %o1, [%g1] ASI_DMMU +50: mov TSB_REG, %o5 + stxa %o0, [%o5] ASI_DMMU membar #Sync - stxa %o1, [%g1] ASI_IMMU + stxa %o0, [%o5] ASI_IMMU membar #Sync -2: brz %o2, 9f - nop +2: ldx [%o1 + TSB_CONFIG_MAP_VADDR], %o4 + brz %o4, 9f + ldx [%o1 + TSB_CONFIG_MAP_PTE], %o5 sethi %hi(sparc64_highest_unlocked_tlb_ent), %g2 - mov TLB_TAG_ACCESS, %g1 + mov TLB_TAG_ACCESS, %g3 lduw [%g2 + %lo(sparc64_highest_unlocked_tlb_ent)], %g2 - stxa %o2, [%g1] ASI_DMMU + stxa %o4, [%g3] ASI_DMMU membar #Sync sllx %g2, 3, %g2 - stxa %o3, [%g2] ASI_DTLB_DATA_ACCESS + stxa %o5, [%g2] ASI_DTLB_DATA_ACCESS + membar #Sync + + brz,pt %o2, 9f + nop + + ldx [%o2 + TSB_CONFIG_MAP_VADDR], %o4 + ldx [%o2 + TSB_CONFIG_MAP_PTE], %o5 + mov TLB_TAG_ACCESS, %g3 + stxa %o4, [%g3] ASI_DMMU + membar #Sync + sub %g2, (1 << 3), %g2 + stxa %o5, [%g2] ASI_DTLB_DATA_ACCESS membar #Sync + 9: - wrpr %o5, %pstate + wrpr %g1, %pstate retl nop diff --git a/arch/sparc64/mm/fault.c b/arch/sparc64/mm/fault.c index 63b6cc0cd5d5..d21ff3230c02 100644 --- a/arch/sparc64/mm/fault.c +++ b/arch/sparc64/mm/fault.c @@ -410,9 +410,18 @@ good_area: up_read(&mm->mmap_sem); mm_rss = get_mm_rss(mm); - if (unlikely(mm_rss >= mm->context.tsb_rss_limit)) - tsb_grow(mm, mm_rss); - +#ifdef CONFIG_HUGETLB_PAGE + mm_rss -= (mm->context.huge_pte_count * (HPAGE_SIZE / PAGE_SIZE)); +#endif + if (unlikely(mm_rss >= + mm->context.tsb_block[MM_TSB_BASE].tsb_rss_limit)) + tsb_grow(mm, MM_TSB_BASE, mm_rss); +#ifdef CONFIG_HUGETLB_PAGE + mm_rss = mm->context.huge_pte_count; + if (unlikely(mm_rss >= + mm->context.tsb_block[MM_TSB_HUGE].tsb_rss_limit)) + tsb_grow(mm, MM_TSB_HUGE, mm_rss); +#endif return; /* diff --git a/arch/sparc64/mm/generic.c b/arch/sparc64/mm/generic.c index 5fc5c579e35e..8cb06205d265 100644 --- a/arch/sparc64/mm/generic.c +++ b/arch/sparc64/mm/generic.c @@ -140,7 +140,6 @@ int io_remap_pfn_range(struct vm_area_struct *vma, unsigned long from, vma->vm_flags |= VM_IO | VM_RESERVED | VM_PFNMAP; vma->vm_pgoff = phys_base >> PAGE_SHIFT; - prot = __pgprot(pg_iobits); offset -= from; dir = pgd_offset(mm, from); flush_cache_range(vma, beg, end); diff --git a/arch/sparc64/mm/hugetlbpage.c b/arch/sparc64/mm/hugetlbpage.c index a7a24869d045..074620d413d4 100644 --- a/arch/sparc64/mm/hugetlbpage.c +++ b/arch/sparc64/mm/hugetlbpage.c @@ -199,13 +199,11 @@ pte_t *huge_pte_alloc(struct mm_struct *mm, unsigned long addr) pte_t *pte = NULL; pgd = pgd_offset(mm, addr); - if (pgd) { - pud = pud_offset(pgd, addr); - if (pud) { - pmd = pmd_alloc(mm, pud, addr); - if (pmd) - pte = pte_alloc_map(mm, pmd, addr); - } + pud = pud_alloc(mm, pgd, addr); + if (pud) { + pmd = pmd_alloc(mm, pud, addr); + if (pmd) + pte = pte_alloc_map(mm, pmd, addr); } return pte; } @@ -231,13 +229,14 @@ pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr) return pte; } -#define mk_pte_huge(entry) do { pte_val(entry) |= _PAGE_SZHUGE; } while (0) - void set_huge_pte_at(struct mm_struct *mm, unsigned long addr, pte_t *ptep, pte_t entry) { int i; + if (!pte_present(*ptep) && pte_present(entry)) + mm->context.huge_pte_count++; + for (i = 0; i < (1 << HUGETLB_PAGE_ORDER); i++) { set_pte_at(mm, addr, ptep, entry); ptep++; @@ -253,6 +252,8 @@ pte_t huge_ptep_get_and_clear(struct mm_struct *mm, unsigned long addr, int i; entry = *ptep; + if (pte_present(entry)) + mm->context.huge_pte_count--; for (i = 0; i < (1 << HUGETLB_PAGE_ORDER); i++) { pte_clear(mm, addr, ptep); @@ -263,18 +264,6 @@ pte_t huge_ptep_get_and_clear(struct mm_struct *mm, unsigned long addr, return entry; } -/* - * This function checks for proper alignment of input addr and len parameters. - */ -int is_aligned_hugepage_range(unsigned long addr, unsigned long len) -{ - if (len & ~HPAGE_MASK) - return -EINVAL; - if (addr & ~HPAGE_MASK) - return -EINVAL; - return 0; -} - struct page *follow_huge_addr(struct mm_struct *mm, unsigned long address, int write) { @@ -302,6 +291,15 @@ static void context_reload(void *__data) void hugetlb_prefault_arch_hook(struct mm_struct *mm) { + struct tsb_config *tp = &mm->context.tsb_block[MM_TSB_HUGE]; + + if (likely(tp->tsb != NULL)) + return; + + tsb_grow(mm, MM_TSB_HUGE, 0); + tsb_context_switch(mm); + smp_tsb_sync(mm); + /* On UltraSPARC-III+ and later, configure the second half of * the Data-TLB for huge pages. */ diff --git a/arch/sparc64/mm/init.c b/arch/sparc64/mm/init.c index c2b556106fc1..ded63ee9c4fd 100644 --- a/arch/sparc64/mm/init.c +++ b/arch/sparc64/mm/init.c @@ -283,6 +283,7 @@ void update_mmu_cache(struct vm_area_struct *vma, unsigned long address, pte_t p struct mm_struct *mm; struct tsb *tsb; unsigned long tag, flags; + unsigned long tsb_index, tsb_hash_shift; if (tlb_type != hypervisor) { unsigned long pfn = pte_pfn(pte); @@ -312,10 +313,26 @@ void update_mmu_cache(struct vm_area_struct *vma, unsigned long address, pte_t p mm = vma->vm_mm; + tsb_index = MM_TSB_BASE; + tsb_hash_shift = PAGE_SHIFT; + spin_lock_irqsave(&mm->context.lock, flags); - tsb = &mm->context.tsb[(address >> PAGE_SHIFT) & - (mm->context.tsb_nentries - 1UL)]; +#ifdef CONFIG_HUGETLB_PAGE + if (mm->context.tsb_block[MM_TSB_HUGE].tsb != NULL) { + if ((tlb_type == hypervisor && + (pte_val(pte) & _PAGE_SZALL_4V) == _PAGE_SZHUGE_4V) || + (tlb_type != hypervisor && + (pte_val(pte) & _PAGE_SZALL_4U) == _PAGE_SZHUGE_4U)) { + tsb_index = MM_TSB_HUGE; + tsb_hash_shift = HPAGE_SHIFT; + } + } +#endif + + tsb = mm->context.tsb_block[tsb_index].tsb; + tsb += ((address >> tsb_hash_shift) & + (mm->context.tsb_block[tsb_index].tsb_nentries - 1UL)); tag = (address >> 22UL); tsb_insert(tsb, tag, pte_val(pte)); @@ -1461,7 +1478,7 @@ void free_initmem(void) p = virt_to_page(page); ClearPageReserved(p); - set_page_count(p, 1); + init_page_count(p); __free_page(p); num_physpages++; totalram_pages++; @@ -1477,7 +1494,7 @@ void free_initrd_mem(unsigned long start, unsigned long end) struct page *p = virt_to_page(start); ClearPageReserved(p); - set_page_count(p, 1); + init_page_count(p); __free_page(p); num_physpages++; totalram_pages++; diff --git a/arch/sparc64/mm/tsb.c b/arch/sparc64/mm/tsb.c index b2064e2a44d6..beaa02810f0e 100644 --- a/arch/sparc64/mm/tsb.c +++ b/arch/sparc64/mm/tsb.c @@ -15,9 +15,9 @@ extern struct tsb swapper_tsb[KERNEL_TSB_NENTRIES]; -static inline unsigned long tsb_hash(unsigned long vaddr, unsigned long nentries) +static inline unsigned long tsb_hash(unsigned long vaddr, unsigned long hash_shift, unsigned long nentries) { - vaddr >>= PAGE_SHIFT; + vaddr >>= hash_shift; return vaddr & (nentries - 1); } @@ -36,7 +36,8 @@ void flush_tsb_kernel_range(unsigned long start, unsigned long end) unsigned long v; for (v = start; v < end; v += PAGE_SIZE) { - unsigned long hash = tsb_hash(v, KERNEL_TSB_NENTRIES); + unsigned long hash = tsb_hash(v, PAGE_SHIFT, + KERNEL_TSB_NENTRIES); struct tsb *ent = &swapper_tsb[hash]; if (tag_compare(ent->tag, v)) { @@ -46,49 +47,91 @@ void flush_tsb_kernel_range(unsigned long start, unsigned long end) } } -void flush_tsb_user(struct mmu_gather *mp) +static void __flush_tsb_one(struct mmu_gather *mp, unsigned long hash_shift, unsigned long tsb, unsigned long nentries) { - struct mm_struct *mm = mp->mm; - unsigned long nentries, base, flags; - struct tsb *tsb; - int i; - - spin_lock_irqsave(&mm->context.lock, flags); - - tsb = mm->context.tsb; - nentries = mm->context.tsb_nentries; + unsigned long i; - if (tlb_type == cheetah_plus || tlb_type == hypervisor) - base = __pa(tsb); - else - base = (unsigned long) tsb; - for (i = 0; i < mp->tlb_nr; i++) { unsigned long v = mp->vaddrs[i]; unsigned long tag, ent, hash; v &= ~0x1UL; - hash = tsb_hash(v, nentries); - ent = base + (hash * sizeof(struct tsb)); + hash = tsb_hash(v, hash_shift, nentries); + ent = tsb + (hash * sizeof(struct tsb)); tag = (v >> 22UL); tsb_flush(ent, tag); } +} + +void flush_tsb_user(struct mmu_gather *mp) +{ + struct mm_struct *mm = mp->mm; + unsigned long nentries, base, flags; + + spin_lock_irqsave(&mm->context.lock, flags); + base = (unsigned long) mm->context.tsb_block[MM_TSB_BASE].tsb; + nentries = mm->context.tsb_block[MM_TSB_BASE].tsb_nentries; + if (tlb_type == cheetah_plus || tlb_type == hypervisor) + base = __pa(base); + __flush_tsb_one(mp, PAGE_SHIFT, base, nentries); + +#ifdef CONFIG_HUGETLB_PAGE + if (mm->context.tsb_block[MM_TSB_HUGE].tsb) { + base = (unsigned long) mm->context.tsb_block[MM_TSB_HUGE].tsb; + nentries = mm->context.tsb_block[MM_TSB_HUGE].tsb_nentries; + if (tlb_type == cheetah_plus || tlb_type == hypervisor) + base = __pa(base); + __flush_tsb_one(mp, HPAGE_SHIFT, base, nentries); + } +#endif spin_unlock_irqrestore(&mm->context.lock, flags); } -static void setup_tsb_params(struct mm_struct *mm, unsigned long tsb_bytes) +#if defined(CONFIG_SPARC64_PAGE_SIZE_8KB) +#define HV_PGSZ_IDX_BASE HV_PGSZ_IDX_8K +#define HV_PGSZ_MASK_BASE HV_PGSZ_MASK_8K +#elif defined(CONFIG_SPARC64_PAGE_SIZE_64KB) +#define HV_PGSZ_IDX_BASE HV_PGSZ_IDX_64K +#define HV_PGSZ_MASK_BASE HV_PGSZ_MASK_64K +#elif defined(CONFIG_SPARC64_PAGE_SIZE_512KB) +#define HV_PGSZ_IDX_BASE HV_PGSZ_IDX_512K +#define HV_PGSZ_MASK_BASE HV_PGSZ_MASK_512K +#elif defined(CONFIG_SPARC64_PAGE_SIZE_4MB) +#define HV_PGSZ_IDX_BASE HV_PGSZ_IDX_4MB +#define HV_PGSZ_MASK_BASE HV_PGSZ_MASK_4MB +#else +#error Broken base page size setting... +#endif + +#ifdef CONFIG_HUGETLB_PAGE +#if defined(CONFIG_HUGETLB_PAGE_SIZE_64K) +#define HV_PGSZ_IDX_HUGE HV_PGSZ_IDX_64K +#define HV_PGSZ_MASK_HUGE HV_PGSZ_MASK_64K +#elif defined(CONFIG_HUGETLB_PAGE_SIZE_512K) +#define HV_PGSZ_IDX_HUGE HV_PGSZ_IDX_512K +#define HV_PGSZ_MASK_HUGE HV_PGSZ_MASK_512K +#elif defined(CONFIG_HUGETLB_PAGE_SIZE_4MB) +#define HV_PGSZ_IDX_HUGE HV_PGSZ_IDX_4MB +#define HV_PGSZ_MASK_HUGE HV_PGSZ_MASK_4MB +#else +#error Broken huge page size setting... +#endif +#endif + +static void setup_tsb_params(struct mm_struct *mm, unsigned long tsb_idx, unsigned long tsb_bytes) { unsigned long tsb_reg, base, tsb_paddr; unsigned long page_sz, tte; - mm->context.tsb_nentries = tsb_bytes / sizeof(struct tsb); + mm->context.tsb_block[tsb_idx].tsb_nentries = + tsb_bytes / sizeof(struct tsb); base = TSBMAP_BASE; tte = pgprot_val(PAGE_KERNEL_LOCKED); - tsb_paddr = __pa(mm->context.tsb); + tsb_paddr = __pa(mm->context.tsb_block[tsb_idx].tsb); BUG_ON(tsb_paddr & (tsb_bytes - 1UL)); /* Use the smallest page size that can map the whole TSB @@ -147,61 +190,49 @@ static void setup_tsb_params(struct mm_struct *mm, unsigned long tsb_bytes) /* Physical mapping, no locked TLB entry for TSB. */ tsb_reg |= tsb_paddr; - mm->context.tsb_reg_val = tsb_reg; - mm->context.tsb_map_vaddr = 0; - mm->context.tsb_map_pte = 0; + mm->context.tsb_block[tsb_idx].tsb_reg_val = tsb_reg; + mm->context.tsb_block[tsb_idx].tsb_map_vaddr = 0; + mm->context.tsb_block[tsb_idx].tsb_map_pte = 0; } else { tsb_reg |= base; tsb_reg |= (tsb_paddr & (page_sz - 1UL)); tte |= (tsb_paddr & ~(page_sz - 1UL)); - mm->context.tsb_reg_val = tsb_reg; - mm->context.tsb_map_vaddr = base; - mm->context.tsb_map_pte = tte; + mm->context.tsb_block[tsb_idx].tsb_reg_val = tsb_reg; + mm->context.tsb_block[tsb_idx].tsb_map_vaddr = base; + mm->context.tsb_block[tsb_idx].tsb_map_pte = tte; } /* Setup the Hypervisor TSB descriptor. */ if (tlb_type == hypervisor) { - struct hv_tsb_descr *hp = &mm->context.tsb_descr; + struct hv_tsb_descr *hp = &mm->context.tsb_descr[tsb_idx]; - switch (PAGE_SIZE) { - case 8192: - default: - hp->pgsz_idx = HV_PGSZ_IDX_8K; + switch (tsb_idx) { + case MM_TSB_BASE: + hp->pgsz_idx = HV_PGSZ_IDX_BASE; break; - - case 64 * 1024: - hp->pgsz_idx = HV_PGSZ_IDX_64K; - break; - - case 512 * 1024: - hp->pgsz_idx = HV_PGSZ_IDX_512K; - break; - - case 4 * 1024 * 1024: - hp->pgsz_idx = HV_PGSZ_IDX_4MB; +#ifdef CONFIG_HUGETLB_PAGE + case MM_TSB_HUGE: + hp->pgsz_idx = HV_PGSZ_IDX_HUGE; break; +#endif + default: + BUG(); }; hp->assoc = 1; hp->num_ttes = tsb_bytes / 16; hp->ctx_idx = 0; - switch (PAGE_SIZE) { - case 8192: - default: - hp->pgsz_mask = HV_PGSZ_MASK_8K; - break; - - case 64 * 1024: - hp->pgsz_mask = HV_PGSZ_MASK_64K; - break; - - case 512 * 1024: - hp->pgsz_mask = HV_PGSZ_MASK_512K; + switch (tsb_idx) { + case MM_TSB_BASE: + hp->pgsz_mask = HV_PGSZ_MASK_BASE; break; - - case 4 * 1024 * 1024: - hp->pgsz_mask = HV_PGSZ_MASK_4MB; +#ifdef CONFIG_HUGETLB_PAGE + case MM_TSB_HUGE: + hp->pgsz_mask = HV_PGSZ_MASK_HUGE; break; +#endif + default: + BUG(); }; hp->tsb_base = tsb_paddr; hp->resv = 0; @@ -241,11 +272,11 @@ void __init tsb_cache_init(void) } } -/* When the RSS of an address space exceeds mm->context.tsb_rss_limit, - * do_sparc64_fault() invokes this routine to try and grow the TSB. +/* When the RSS of an address space exceeds tsb_rss_limit for a TSB, + * do_sparc64_fault() invokes this routine to try and grow it. * * When we reach the maximum TSB size supported, we stick ~0UL into - * mm->context.tsb_rss_limit so the grow checks in update_mmu_cache() + * tsb_rss_limit for that TSB so the grow checks in do_sparc64_fault() * will not trigger any longer. * * The TSB can be anywhere from 8K to 1MB in size, in increasing powers @@ -257,7 +288,7 @@ void __init tsb_cache_init(void) * the number of entries that the current TSB can hold at once. Currently, * we trigger when the RSS hits 3/4 of the TSB capacity. */ -void tsb_grow(struct mm_struct *mm, unsigned long rss) +void tsb_grow(struct mm_struct *mm, unsigned long tsb_index, unsigned long rss) { unsigned long max_tsb_size = 1 * 1024 * 1024; unsigned long new_size, old_size, flags; @@ -297,7 +328,8 @@ retry_tsb_alloc: * down to a 0-order allocation and force no TSB * growing for this address space. */ - if (mm->context.tsb == NULL && new_cache_index > 0) { + if (mm->context.tsb_block[tsb_index].tsb == NULL && + new_cache_index > 0) { new_cache_index = 0; new_size = 8192; new_rss_limit = ~0UL; @@ -307,8 +339,8 @@ retry_tsb_alloc: /* If we failed on a TSB grow, we are under serious * memory pressure so don't try to grow any more. */ - if (mm->context.tsb != NULL) - mm->context.tsb_rss_limit = ~0UL; + if (mm->context.tsb_block[tsb_index].tsb != NULL) + mm->context.tsb_block[tsb_index].tsb_rss_limit = ~0UL; return; } @@ -339,23 +371,26 @@ retry_tsb_alloc: */ spin_lock_irqsave(&mm->context.lock, flags); - old_tsb = mm->context.tsb; - old_cache_index = (mm->context.tsb_reg_val & 0x7UL); - old_size = mm->context.tsb_nentries * sizeof(struct tsb); + old_tsb = mm->context.tsb_block[tsb_index].tsb; + old_cache_index = + (mm->context.tsb_block[tsb_index].tsb_reg_val & 0x7UL); + old_size = (mm->context.tsb_block[tsb_index].tsb_nentries * + sizeof(struct tsb)); /* Handle multiple threads trying to grow the TSB at the same time. * One will get in here first, and bump the size and the RSS limit. * The others will get in here next and hit this check. */ - if (unlikely(old_tsb && (rss < mm->context.tsb_rss_limit))) { + if (unlikely(old_tsb && + (rss < mm->context.tsb_block[tsb_index].tsb_rss_limit))) { spin_unlock_irqrestore(&mm->context.lock, flags); kmem_cache_free(tsb_caches[new_cache_index], new_tsb); return; } - mm->context.tsb_rss_limit = new_rss_limit; + mm->context.tsb_block[tsb_index].tsb_rss_limit = new_rss_limit; if (old_tsb) { extern void copy_tsb(unsigned long old_tsb_base, @@ -372,8 +407,8 @@ retry_tsb_alloc: copy_tsb(old_tsb_base, old_size, new_tsb_base, new_size); } - mm->context.tsb = new_tsb; - setup_tsb_params(mm, new_size); + mm->context.tsb_block[tsb_index].tsb = new_tsb; + setup_tsb_params(mm, tsb_index, new_size); spin_unlock_irqrestore(&mm->context.lock, flags); @@ -394,40 +429,65 @@ retry_tsb_alloc: int init_new_context(struct task_struct *tsk, struct mm_struct *mm) { +#ifdef CONFIG_HUGETLB_PAGE + unsigned long huge_pte_count; +#endif + unsigned int i; + spin_lock_init(&mm->context.lock); mm->context.sparc64_ctx_val = 0UL; +#ifdef CONFIG_HUGETLB_PAGE + /* We reset it to zero because the fork() page copying + * will re-increment the counters as the parent PTEs are + * copied into the child address space. + */ + huge_pte_count = mm->context.huge_pte_count; + mm->context.huge_pte_count = 0; +#endif + /* copy_mm() copies over the parent's mm_struct before calling * us, so we need to zero out the TSB pointer or else tsb_grow() * will be confused and think there is an older TSB to free up. */ - mm->context.tsb = NULL; + for (i = 0; i < MM_NUM_TSBS; i++) + mm->context.tsb_block[i].tsb = NULL; /* If this is fork, inherit the parent's TSB size. We would * grow it to that size on the first page fault anyways. */ - tsb_grow(mm, get_mm_rss(mm)); + tsb_grow(mm, MM_TSB_BASE, get_mm_rss(mm)); - if (unlikely(!mm->context.tsb)) +#ifdef CONFIG_HUGETLB_PAGE + if (unlikely(huge_pte_count)) + tsb_grow(mm, MM_TSB_HUGE, huge_pte_count); +#endif + + if (unlikely(!mm->context.tsb_block[MM_TSB_BASE].tsb)) return -ENOMEM; return 0; } -void destroy_context(struct mm_struct *mm) +static void tsb_destroy_one(struct tsb_config *tp) { - unsigned long flags, cache_index; + unsigned long cache_index; - cache_index = (mm->context.tsb_reg_val & 0x7UL); - kmem_cache_free(tsb_caches[cache_index], mm->context.tsb); + if (!tp->tsb) + return; + cache_index = tp->tsb_reg_val & 0x7UL; + kmem_cache_free(tsb_caches[cache_index], tp->tsb); + tp->tsb = NULL; + tp->tsb_reg_val = 0UL; +} - /* We can remove these later, but for now it's useful - * to catch any bogus post-destroy_context() references - * to the TSB. - */ - mm->context.tsb = NULL; - mm->context.tsb_reg_val = 0UL; +void destroy_context(struct mm_struct *mm) +{ + unsigned long flags, i; + + for (i = 0; i < MM_NUM_TSBS; i++) + tsb_destroy_one(&mm->context.tsb_block[i]); spin_lock_irqsave(&ctx_alloc_lock, flags); diff --git a/arch/um/kernel/mem.c b/arch/um/kernel/mem.c index fa4f915be5c5..92cce96b5e24 100644 --- a/arch/um/kernel/mem.c +++ b/arch/um/kernel/mem.c @@ -57,7 +57,7 @@ static void setup_highmem(unsigned long highmem_start, for(i = 0; i < highmem_len >> PAGE_SHIFT; i++){ page = &mem_map[highmem_pfn + i]; ClearPageReserved(page); - set_page_count(page, 1); + init_page_count(page); __free_page(page); } } @@ -296,7 +296,7 @@ void free_initrd_mem(unsigned long start, unsigned long end) (end - start) >> 10); for (; start < end; start += PAGE_SIZE) { ClearPageReserved(virt_to_page(start)); - set_page_count(virt_to_page(start), 1); + init_page_count(virt_to_page(start)); free_page(start); totalram_pages++; } diff --git a/arch/um/kernel/physmem.c b/arch/um/kernel/physmem.c index 544665e04513..0e65340eee33 100644 --- a/arch/um/kernel/physmem.c +++ b/arch/um/kernel/physmem.c @@ -279,7 +279,7 @@ int init_maps(unsigned long physmem, unsigned long iomem, unsigned long highmem) for(i = 0; i < total_pages; i++){ p = &map[i]; - set_page_count(p, 0); + memset(p, 0, sizeof(struct page)); SetPageReserved(p); INIT_LIST_HEAD(&p->lru); } diff --git a/arch/x86_64/kernel/time.c b/arch/x86_64/kernel/time.c index 3080f84bf7b7..ee5ce3d3cbc3 100644 --- a/arch/x86_64/kernel/time.c +++ b/arch/x86_64/kernel/time.c @@ -477,7 +477,7 @@ static irqreturn_t timer_interrupt(int irq, void *dev_id, struct pt_regs *regs) return IRQ_HANDLED; } -static unsigned int cyc2ns_scale; +static unsigned int cyc2ns_scale __read_mostly; #define CYC2NS_SCALE_FACTOR 10 /* 2^10, carefully chosen */ static inline void set_cyc2ns_scale(unsigned long cpu_khz) diff --git a/arch/x86_64/kernel/x8664_ksyms.c b/arch/x86_64/kernel/x8664_ksyms.c index 3496abc8d372..c9dc7e46731e 100644 --- a/arch/x86_64/kernel/x8664_ksyms.c +++ b/arch/x86_64/kernel/x8664_ksyms.c @@ -124,6 +124,7 @@ extern void * __memcpy(void *,const void *,__kernel_size_t); EXPORT_SYMBOL(memset); EXPORT_SYMBOL(strlen); +EXPORT_SYMBOL(strpbrk); EXPORT_SYMBOL(memmove); EXPORT_SYMBOL(memcpy); EXPORT_SYMBOL(__memcpy); diff --git a/arch/x86_64/mm/init.c b/arch/x86_64/mm/init.c index 7af1742aa958..40ed13d263cd 100644 --- a/arch/x86_64/mm/init.c +++ b/arch/x86_64/mm/init.c @@ -486,7 +486,7 @@ void __init clear_kernel_mapping(unsigned long address, unsigned long size) void online_page(struct page *page) { ClearPageReserved(page); - set_page_count(page, 1); + init_page_count(page); __free_page(page); totalram_pages++; num_physpages++; @@ -592,7 +592,7 @@ void free_initmem(void) addr = (unsigned long)(&__init_begin); for (; addr < (unsigned long)(&__init_end); addr += PAGE_SIZE) { ClearPageReserved(virt_to_page(addr)); - set_page_count(virt_to_page(addr), 1); + init_page_count(virt_to_page(addr)); memset((void *)(addr & ~(PAGE_SIZE-1)), 0xcc, PAGE_SIZE); free_page(addr); totalram_pages++; @@ -632,7 +632,7 @@ void free_initrd_mem(unsigned long start, unsigned long end) printk ("Freeing initrd memory: %ldk freed\n", (end - start) >> 10); for (; start < end; start += PAGE_SIZE) { ClearPageReserved(virt_to_page(start)); - set_page_count(virt_to_page(start), 1); + init_page_count(virt_to_page(start)); free_page(start); totalram_pages++; } diff --git a/arch/x86_64/mm/pageattr.c b/arch/x86_64/mm/pageattr.c index 35f1f1aab063..531ad21447b1 100644 --- a/arch/x86_64/mm/pageattr.c +++ b/arch/x86_64/mm/pageattr.c @@ -45,6 +45,13 @@ static struct page *split_large_page(unsigned long address, pgprot_t prot, pte_t *pbase; if (!base) return NULL; + /* + * page_private is used to track the number of entries in + * the page table page have non standard attributes. + */ + SetPagePrivate(base); + page_private(base) = 0; + address = __pa(address); addr = address & LARGE_PAGE_MASK; pbase = (pte_t *)page_address(base); @@ -77,26 +84,12 @@ static inline void flush_map(unsigned long address) on_each_cpu(flush_kernel_map, (void *)address, 1, 1); } -struct deferred_page { - struct deferred_page *next; - struct page *fpage; - unsigned long address; -}; -static struct deferred_page *df_list; /* protected by init_mm.mmap_sem */ +static struct page *deferred_pages; /* protected by init_mm.mmap_sem */ -static inline void save_page(unsigned long address, struct page *fpage) +static inline void save_page(struct page *fpage) { - struct deferred_page *df; - df = kmalloc(sizeof(struct deferred_page), GFP_KERNEL); - if (!df) { - flush_map(address); - __free_page(fpage); - } else { - df->next = df_list; - df->fpage = fpage; - df->address = address; - df_list = df; - } + fpage->lru.next = (struct list_head *)deferred_pages; + deferred_pages = fpage; } /* @@ -138,8 +131,8 @@ __change_page_attr(unsigned long address, unsigned long pfn, pgprot_t prot, set_pte(kpte, pfn_pte(pfn, prot)); } else { /* - * split_large_page will take the reference for this change_page_attr - * on the split page. + * split_large_page will take the reference for this + * change_page_attr on the split page. */ struct page *split; @@ -151,23 +144,20 @@ __change_page_attr(unsigned long address, unsigned long pfn, pgprot_t prot, set_pte(kpte,mk_pte(split, ref_prot2)); kpte_page = split; } - get_page(kpte_page); + page_private(kpte_page)++; } else if ((kpte_flags & _PAGE_PSE) == 0) { set_pte(kpte, pfn_pte(pfn, ref_prot)); - __put_page(kpte_page); + BUG_ON(page_private(kpte_page) == 0); + page_private(kpte_page)--; } else BUG(); /* on x86-64 the direct mapping set at boot is not using 4k pages */ BUG_ON(PageReserved(kpte_page)); - switch (page_count(kpte_page)) { - case 1: - save_page(address, kpte_page); + if (page_private(kpte_page) == 0) { + save_page(kpte_page); revert_page(address, ref_prot); - break; - case 0: - BUG(); /* memleak and failed 2M page regeneration */ } return 0; } @@ -220,17 +210,18 @@ int change_page_attr(struct page *page, int numpages, pgprot_t prot) void global_flush_tlb(void) { - struct deferred_page *df, *next_df; + struct page *dpage; down_read(&init_mm.mmap_sem); - df = xchg(&df_list, NULL); + dpage = xchg(&deferred_pages, NULL); up_read(&init_mm.mmap_sem); - flush_map((df && !df->next) ? df->address : 0); - for (; df; df = next_df) { - next_df = df->next; - if (df->fpage) - __free_page(df->fpage); - kfree(df); + + flush_map((dpage && !dpage->lru.next) ? (unsigned long)page_address(dpage) : 0); + while (dpage) { + struct page *tmp = dpage; + dpage = (struct page *)dpage->lru.next; + ClearPagePrivate(tmp); + __free_page(tmp); } } diff --git a/arch/xtensa/mm/init.c b/arch/xtensa/mm/init.c index 5a91d6c9e66d..e1be4235f367 100644 --- a/arch/xtensa/mm/init.c +++ b/arch/xtensa/mm/init.c @@ -272,7 +272,7 @@ free_reserved_mem(void *start, void *end) { for (; start < end; start += PAGE_SIZE) { ClearPageReserved(virt_to_page(start)); - set_page_count(virt_to_page(start), 1); + init_page_count(virt_to_page(start)); free_page((unsigned long)start); totalram_pages++; } diff --git a/arch/xtensa/mm/pgtable.c b/arch/xtensa/mm/pgtable.c index e5e119c820e4..7d28914d11cb 100644 --- a/arch/xtensa/mm/pgtable.c +++ b/arch/xtensa/mm/pgtable.c @@ -14,25 +14,21 @@ pte_t* pte_alloc_one_kernel(struct mm_struct *mm, unsigned long address) { - pte_t *pte, p; + pte_t *pte = NULL, *p; int color = ADDR_COLOR(address); int i; p = (pte_t*) __get_free_pages(GFP_KERNEL|__GFP_REPEAT, COLOR_ORDER); if (likely(p)) { - struct page *page; - - for (i = 0; i < COLOR_SIZE; i++, p++) { - page = virt_to_page(pte); - - set_page_count(page, 1); - ClearPageCompound(page); + split_page(virt_to_page(p), COLOR_ORDER); + for (i = 0; i < COLOR_SIZE; i++) { if (ADDR_COLOR(p) == color) pte = p; else free_page(p); + p += PTRS_PER_PTE; } clear_page(pte); } @@ -49,20 +45,20 @@ int flush; struct page* pte_alloc_one(struct mm_struct *mm, unsigned long address) { - struct page *page, p; + struct page *page = NULL, *p; int color = ADDR_COLOR(address); p = alloc_pages(GFP_KERNEL | __GFP_REPEAT, PTE_ORDER); if (likely(p)) { - for (i = 0; i < PAGE_ORDER; i++) { - set_page_count(p, 1); - ClearPageCompound(p); + split_page(p, COLOR_ORDER); - if (PADDR_COLOR(page_address(pg)) == color) + for (i = 0; i < PAGE_ORDER; i++) { + if (PADDR_COLOR(page_address(p)) == color) page = p; else - free_page(p); + __free_page(p); + p++; } clear_highpage(page); } |