diff options
author | Chris Metcalf <cmetcalf@tilera.com> | 2013-08-16 00:23:24 +0400 |
---|---|---|
committer | Chris Metcalf <cmetcalf@tilera.com> | 2013-09-03 22:53:29 +0400 |
commit | d7c9661115fd23b4dabb710b3080dd9919dfa891 (patch) | |
tree | 5eaeb8c4aab296f39d6aa896ec9408419ec17441 /arch/tile/lib | |
parent | d6a0aa314c06743b702931cb468f400b7615c5c9 (diff) | |
download | linux-d7c9661115fd23b4dabb710b3080dd9919dfa891.tar.xz |
tile: remove support for TILE64
This chip is no longer being actively developed for (it was superceded
by the TILEPro64 in 2008), and in any case the existing compiler and
toolchain in the community do not support it. It's unlikely that the
kernel works with TILE64 at this point as the configuration has not been
tested in years. The support is also awkward as it requires maintaining
a significant number of ifdefs. So, just remove it altogether.
Signed-off-by: Chris Metcalf <cmetcalf@tilera.com>
Diffstat (limited to 'arch/tile/lib')
-rw-r--r-- | arch/tile/lib/Makefile | 2 | ||||
-rw-r--r-- | arch/tile/lib/atomic_32.c | 90 | ||||
-rw-r--r-- | arch/tile/lib/memcpy_32.S | 61 | ||||
-rw-r--r-- | arch/tile/lib/memcpy_tile64.c | 280 | ||||
-rw-r--r-- | arch/tile/lib/memset_32.c | 105 |
5 files changed, 1 insertions, 537 deletions
diff --git a/arch/tile/lib/Makefile b/arch/tile/lib/Makefile index 9adfd76fbdd8..c4211cbb2021 100644 --- a/arch/tile/lib/Makefile +++ b/arch/tile/lib/Makefile @@ -7,7 +7,7 @@ lib-y = cacheflush.o checksum.o cpumask.o delay.o uaccess.o \ strchr_$(BITS).o strlen_$(BITS).o strnlen_$(BITS).o lib-$(CONFIG_TILEGX) += memcpy_user_64.o -lib-$(CONFIG_TILEPRO) += atomic_32.o atomic_asm_32.o memcpy_tile64.o +lib-$(CONFIG_TILEPRO) += atomic_32.o atomic_asm_32.o lib-$(CONFIG_SMP) += spinlock_$(BITS).o usercopy_$(BITS).o obj-$(CONFIG_MODULES) += exports.o diff --git a/arch/tile/lib/atomic_32.c b/arch/tile/lib/atomic_32.c index 42eacb1f737a..5d91d1860640 100644 --- a/arch/tile/lib/atomic_32.c +++ b/arch/tile/lib/atomic_32.c @@ -20,50 +20,12 @@ #include <linux/atomic.h> #include <arch/chip.h> -/* See <asm/atomic_32.h> */ -#if ATOMIC_LOCKS_FOUND_VIA_TABLE() - -/* - * A block of memory containing locks for atomic ops. Each instance of this - * struct will be homed on a different CPU. - */ -struct atomic_locks_on_cpu { - int lock[ATOMIC_HASH_L2_SIZE]; -} __attribute__((aligned(ATOMIC_HASH_L2_SIZE * 4))); - -static DEFINE_PER_CPU(struct atomic_locks_on_cpu, atomic_lock_pool); - -/* The locks we'll use until __init_atomic_per_cpu is called. */ -static struct atomic_locks_on_cpu __initdata initial_atomic_locks; - -/* Hash into this vector to get a pointer to lock for the given atomic. */ -struct atomic_locks_on_cpu *atomic_lock_ptr[ATOMIC_HASH_L1_SIZE] - __write_once = { - [0 ... ATOMIC_HASH_L1_SIZE-1] (&initial_atomic_locks) -}; - -#else /* ATOMIC_LOCKS_FOUND_VIA_TABLE() */ - /* This page is remapped on startup to be hash-for-home. */ int atomic_locks[PAGE_SIZE / sizeof(int)] __page_aligned_bss; -#endif /* ATOMIC_LOCKS_FOUND_VIA_TABLE() */ - int *__atomic_hashed_lock(volatile void *v) { /* NOTE: this code must match "sys_cmpxchg" in kernel/intvec_32.S */ -#if ATOMIC_LOCKS_FOUND_VIA_TABLE() - unsigned long i = - (unsigned long) v & ((PAGE_SIZE-1) & -sizeof(long long)); - unsigned long n = __insn_crc32_32(0, i); - - /* Grab high bits for L1 index. */ - unsigned long l1_index = n >> ((sizeof(n) * 8) - ATOMIC_HASH_L1_SHIFT); - /* Grab low bits for L2 index. */ - unsigned long l2_index = n & (ATOMIC_HASH_L2_SIZE - 1); - - return &atomic_lock_ptr[l1_index]->lock[l2_index]; -#else /* * Use bits [3, 3 + ATOMIC_HASH_SHIFT) as the lock index. * Using mm works here because atomic_locks is page aligned. @@ -72,26 +34,13 @@ int *__atomic_hashed_lock(volatile void *v) (unsigned long)atomic_locks, 2, (ATOMIC_HASH_SHIFT + 2) - 1); return (int *)ptr; -#endif } #ifdef CONFIG_SMP /* Return whether the passed pointer is a valid atomic lock pointer. */ static int is_atomic_lock(int *p) { -#if ATOMIC_LOCKS_FOUND_VIA_TABLE() - int i; - for (i = 0; i < ATOMIC_HASH_L1_SIZE; ++i) { - - if (p >= &atomic_lock_ptr[i]->lock[0] && - p < &atomic_lock_ptr[i]->lock[ATOMIC_HASH_L2_SIZE]) { - return 1; - } - } - return 0; -#else return p >= &atomic_locks[0] && p < &atomic_locks[ATOMIC_HASH_SIZE]; -#endif } void __atomic_fault_unlock(int *irqlock_word) @@ -210,43 +159,6 @@ struct __get_user __atomic_bad_address(int __user *addr) void __init __init_atomic_per_cpu(void) { -#if ATOMIC_LOCKS_FOUND_VIA_TABLE() - - unsigned int i; - int actual_cpu; - - /* - * Before this is called from setup, we just have one lock for - * all atomic objects/operations. Here we replace the - * elements of atomic_lock_ptr so that they point at per_cpu - * integers. This seemingly over-complex approach stems from - * the fact that DEFINE_PER_CPU defines an entry for each cpu - * in the grid, not each cpu from 0..ATOMIC_HASH_SIZE-1. But - * for efficient hashing of atomics to their locks we want a - * compile time constant power of 2 for the size of this - * table, so we use ATOMIC_HASH_SIZE. - * - * Here we populate atomic_lock_ptr from the per cpu - * atomic_lock_pool, interspersing by actual cpu so that - * subsequent elements are homed on consecutive cpus. - */ - - actual_cpu = cpumask_first(cpu_possible_mask); - - for (i = 0; i < ATOMIC_HASH_L1_SIZE; ++i) { - /* - * Preincrement to slightly bias against using cpu 0, - * which has plenty of stuff homed on it already. - */ - actual_cpu = cpumask_next(actual_cpu, cpu_possible_mask); - if (actual_cpu >= nr_cpu_ids) - actual_cpu = cpumask_first(cpu_possible_mask); - - atomic_lock_ptr[i] = &per_cpu(atomic_lock_pool, actual_cpu); - } - -#else /* ATOMIC_LOCKS_FOUND_VIA_TABLE() */ - /* Validate power-of-two and "bigger than cpus" assumption */ BUILD_BUG_ON(ATOMIC_HASH_SIZE & (ATOMIC_HASH_SIZE-1)); BUG_ON(ATOMIC_HASH_SIZE < nr_cpu_ids); @@ -270,6 +182,4 @@ void __init __init_atomic_per_cpu(void) * That should not produce more indices than ATOMIC_HASH_SIZE. */ BUILD_BUG_ON((PAGE_SIZE >> 3) > ATOMIC_HASH_SIZE); - -#endif /* ATOMIC_LOCKS_FOUND_VIA_TABLE() */ } diff --git a/arch/tile/lib/memcpy_32.S b/arch/tile/lib/memcpy_32.S index 8ba7626cfeb1..a2771ae5da53 100644 --- a/arch/tile/lib/memcpy_32.S +++ b/arch/tile/lib/memcpy_32.S @@ -22,14 +22,6 @@ #include <linux/linkage.h> -/* On TILE64, we wrap these functions via arch/tile/lib/memcpy_tile64.c */ -#if !CHIP_HAS_COHERENT_LOCAL_CACHE() -#define memcpy __memcpy_asm -#define __copy_to_user_inatomic __copy_to_user_inatomic_asm -#define __copy_from_user_inatomic __copy_from_user_inatomic_asm -#define __copy_from_user_zeroing __copy_from_user_zeroing_asm -#endif - #define IS_MEMCPY 0 #define IS_COPY_FROM_USER 1 #define IS_COPY_FROM_USER_ZEROING 2 @@ -159,12 +151,9 @@ EX: { sw r0, r3; addi r0, r0, 4; addi r2, r2, -4 } { addi r3, r1, 60; andi r9, r9, -64 } -#if CHIP_HAS_WH64() /* No need to prefetch dst, we'll just do the wh64 * right before we copy a line. */ -#endif - EX: { lw r5, r3; addi r3, r3, 64; movei r4, 1 } /* Intentionally stall for a few cycles to leave L2 cache alone. */ { bnzt zero, .; move r27, lr } @@ -172,21 +161,6 @@ EX: { lw r6, r3; addi r3, r3, 64 } /* Intentionally stall for a few cycles to leave L2 cache alone. */ { bnzt zero, . } EX: { lw r7, r3; addi r3, r3, 64 } -#if !CHIP_HAS_WH64() - /* Prefetch the dest */ - /* Intentionally stall for a few cycles to leave L2 cache alone. */ - { bnzt zero, . } - /* Use a real load to cause a TLB miss if necessary. We aren't using - * r28, so this should be fine. - */ -EX: { lw r28, r9; addi r9, r9, 64 } - /* Intentionally stall for a few cycles to leave L2 cache alone. */ - { bnzt zero, . } - { prefetch r9; addi r9, r9, 64 } - /* Intentionally stall for a few cycles to leave L2 cache alone. */ - { bnzt zero, . } - { prefetch r9; addi r9, r9, 64 } -#endif /* Intentionally stall for a few cycles to leave L2 cache alone. */ { bz zero, .Lbig_loop2 } @@ -287,13 +261,8 @@ EX: { lw r7, r3; addi r3, r3, 64 } /* Fill second L1D line. */ EX: { lw r17, r17; addi r1, r1, 48; mvz r3, r13, r1 } /* r17 = WORD_4 */ -#if CHIP_HAS_WH64() /* Prepare destination line for writing. */ EX: { wh64 r9; addi r9, r9, 64 } -#else - /* Prefetch dest line */ - { prefetch r9; addi r9, r9, 64 } -#endif /* Load seven words that are L1D hits to cover wh64 L2 usage. */ /* Load the three remaining words from the last L1D line, which @@ -331,16 +300,7 @@ EX: { lw r18, r1; addi r1, r1, 4 } /* r18 = WORD_8 */ EX: { sw r0, r16; addi r0, r0, 4; add r16, r0, r2 } /* store(WORD_0) */ EX: { sw r0, r13; addi r0, r0, 4; andi r16, r16, -64 } /* store(WORD_1) */ EX: { sw r0, r14; addi r0, r0, 4; slt_u r16, r9, r16 } /* store(WORD_2) */ -#if CHIP_HAS_WH64() EX: { sw r0, r15; addi r0, r0, 4; addi r13, sp, -64 } /* store(WORD_3) */ -#else - /* Back up the r9 to a cache line we are already storing to - * if it gets past the end of the dest vector. Strictly speaking, - * we don't need to back up to the start of a cache line, but it's free - * and tidy, so why not? - */ -EX: { sw r0, r15; addi r0, r0, 4; andi r13, r0, -64 } /* store(WORD_3) */ -#endif /* Store second L1D line. */ EX: { sw r0, r17; addi r0, r0, 4; mvz r9, r16, r13 }/* store(WORD_4) */ EX: { sw r0, r19; addi r0, r0, 4 } /* store(WORD_5) */ @@ -404,7 +364,6 @@ EX: { sb r0, r3; addi r0, r0, 1; addi r2, r2, -1 } .Ldest_is_word_aligned: -#if CHIP_HAS_DWORD_ALIGN() EX: { andi r8, r0, 63; lwadd_na r6, r1, 4} { slti_u r9, r2, 64; bz r8, .Ldest_is_L2_line_aligned } @@ -512,26 +471,6 @@ EX: { swadd r0, r13, 4; addi r2, r2, -32 } /* Move r1 back to the point where it corresponds to r0. */ { addi r1, r1, -4 } -#else /* !CHIP_HAS_DWORD_ALIGN() */ - - /* Compute right/left shift counts and load initial source words. */ - { andi r5, r1, -4; andi r3, r1, 3 } -EX: { lw r6, r5; addi r5, r5, 4; shli r3, r3, 3 } -EX: { lw r7, r5; addi r5, r5, 4; sub r4, zero, r3 } - - /* Load and store one word at a time, using shifts and ORs - * to correct for the misaligned src. - */ -.Lcopy_unaligned_src_loop: - { shr r6, r6, r3; shl r8, r7, r4 } -EX: { lw r7, r5; or r8, r8, r6; move r6, r7 } -EX: { sw r0, r8; addi r0, r0, 4; addi r2, r2, -4 } - { addi r5, r5, 4; slti_u r8, r2, 8 } - { bzt r8, .Lcopy_unaligned_src_loop; addi r1, r1, 4 } - - { bz r2, .Lcopy_unaligned_done } -#endif /* !CHIP_HAS_DWORD_ALIGN() */ - /* Fall through */ /* diff --git a/arch/tile/lib/memcpy_tile64.c b/arch/tile/lib/memcpy_tile64.c deleted file mode 100644 index 0290c222847b..000000000000 --- a/arch/tile/lib/memcpy_tile64.c +++ /dev/null @@ -1,280 +0,0 @@ -/* - * Copyright 2010 Tilera Corporation. All Rights Reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation, version 2. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or - * NON INFRINGEMENT. See the GNU General Public License for - * more details. - */ - -#include <linux/string.h> -#include <linux/smp.h> -#include <linux/module.h> -#include <linux/uaccess.h> -#include <asm/fixmap.h> -#include <asm/kmap_types.h> -#include <asm/tlbflush.h> -#include <hv/hypervisor.h> -#include <arch/chip.h> - - -#if !CHIP_HAS_COHERENT_LOCAL_CACHE() - -/* Defined in memcpy.S */ -extern unsigned long __memcpy_asm(void *to, const void *from, unsigned long n); -extern unsigned long __copy_to_user_inatomic_asm( - void __user *to, const void *from, unsigned long n); -extern unsigned long __copy_from_user_inatomic_asm( - void *to, const void __user *from, unsigned long n); -extern unsigned long __copy_from_user_zeroing_asm( - void *to, const void __user *from, unsigned long n); - -typedef unsigned long (*memcpy_t)(void *, const void *, unsigned long); - -/* Size above which to consider TLB games for performance */ -#define LARGE_COPY_CUTOFF 2048 - -/* Communicate to the simulator what we are trying to do. */ -#define sim_allow_multiple_caching(b) \ - __insn_mtspr(SPR_SIM_CONTROL, \ - SIM_CONTROL_ALLOW_MULTIPLE_CACHING | ((b) << _SIM_CONTROL_OPERATOR_BITS)) - -/* - * Copy memory by briefly enabling incoherent cacheline-at-a-time mode. - * - * We set up our own source and destination PTEs that we fully control. - * This is the only way to guarantee that we don't race with another - * thread that is modifying the PTE; we can't afford to try the - * copy_{to,from}_user() technique of catching the interrupt, since - * we must run with interrupts disabled to avoid the risk of some - * other code seeing the incoherent data in our cache. (Recall that - * our cache is indexed by PA, so even if the other code doesn't use - * our kmap_atomic virtual addresses, they'll still hit in cache using - * the normal VAs that aren't supposed to hit in cache.) - */ -static void memcpy_multicache(void *dest, const void *source, - pte_t dst_pte, pte_t src_pte, int len) -{ - int idx; - unsigned long flags, newsrc, newdst; - pmd_t *pmdp; - pte_t *ptep; - int type0, type1; - int cpu = smp_processor_id(); - - /* - * Disable interrupts so that we don't recurse into memcpy() - * in an interrupt handler, nor accidentally reference - * the PA of the source from an interrupt routine. Also - * notify the simulator that we're playing games so we don't - * generate spurious coherency warnings. - */ - local_irq_save(flags); - sim_allow_multiple_caching(1); - - /* Set up the new dest mapping */ - type0 = kmap_atomic_idx_push(); - idx = FIX_KMAP_BEGIN + (KM_TYPE_NR * cpu) + type0; - newdst = __fix_to_virt(idx) + ((unsigned long)dest & (PAGE_SIZE-1)); - pmdp = pmd_offset(pud_offset(pgd_offset_k(newdst), newdst), newdst); - ptep = pte_offset_kernel(pmdp, newdst); - if (pte_val(*ptep) != pte_val(dst_pte)) { - set_pte(ptep, dst_pte); - local_flush_tlb_page(NULL, newdst, PAGE_SIZE); - } - - /* Set up the new source mapping */ - type1 = kmap_atomic_idx_push(); - idx += (type0 - type1); - src_pte = hv_pte_set_nc(src_pte); - src_pte = hv_pte_clear_writable(src_pte); /* be paranoid */ - newsrc = __fix_to_virt(idx) + ((unsigned long)source & (PAGE_SIZE-1)); - pmdp = pmd_offset(pud_offset(pgd_offset_k(newsrc), newsrc), newsrc); - ptep = pte_offset_kernel(pmdp, newsrc); - __set_pte(ptep, src_pte); /* set_pte() would be confused by this */ - local_flush_tlb_page(NULL, newsrc, PAGE_SIZE); - - /* Actually move the data. */ - __memcpy_asm((void *)newdst, (const void *)newsrc, len); - - /* - * Remap the source as locally-cached and not OLOC'ed so that - * we can inval without also invaling the remote cpu's cache. - * This also avoids known errata with inv'ing cacheable oloc data. - */ - src_pte = hv_pte_set_mode(src_pte, HV_PTE_MODE_CACHE_NO_L3); - src_pte = hv_pte_set_writable(src_pte); /* need write access for inv */ - __set_pte(ptep, src_pte); /* set_pte() would be confused by this */ - local_flush_tlb_page(NULL, newsrc, PAGE_SIZE); - - /* - * Do the actual invalidation, covering the full L2 cache line - * at the end since __memcpy_asm() is somewhat aggressive. - */ - __inv_buffer((void *)newsrc, len); - - /* - * We're done: notify the simulator that all is back to normal, - * and re-enable interrupts and pre-emption. - */ - kmap_atomic_idx_pop(); - kmap_atomic_idx_pop(); - sim_allow_multiple_caching(0); - local_irq_restore(flags); -} - -/* - * Identify large copies from remotely-cached memory, and copy them - * via memcpy_multicache() if they look good, otherwise fall back - * to the particular kind of copying passed as the memcpy_t function. - */ -static unsigned long fast_copy(void *dest, const void *source, int len, - memcpy_t func) -{ - int cpu = get_cpu(); - unsigned long retval; - - /* - * Check if it's big enough to bother with. We may end up doing a - * small copy via TLB manipulation if we're near a page boundary, - * but presumably we'll make it up when we hit the second page. - */ - while (len >= LARGE_COPY_CUTOFF) { - int copy_size, bytes_left_on_page; - pte_t *src_ptep, *dst_ptep; - pte_t src_pte, dst_pte; - struct page *src_page, *dst_page; - - /* Is the source page oloc'ed to a remote cpu? */ -retry_source: - src_ptep = virt_to_pte(current->mm, (unsigned long)source); - if (src_ptep == NULL) - break; - src_pte = *src_ptep; - if (!hv_pte_get_present(src_pte) || - !hv_pte_get_readable(src_pte) || - hv_pte_get_mode(src_pte) != HV_PTE_MODE_CACHE_TILE_L3) - break; - if (get_remote_cache_cpu(src_pte) == cpu) - break; - src_page = pfn_to_page(pte_pfn(src_pte)); - get_page(src_page); - if (pte_val(src_pte) != pte_val(*src_ptep)) { - put_page(src_page); - goto retry_source; - } - if (pte_huge(src_pte)) { - /* Adjust the PTE to correspond to a small page */ - int pfn = pte_pfn(src_pte); - pfn += (((unsigned long)source & (HPAGE_SIZE-1)) - >> PAGE_SHIFT); - src_pte = pfn_pte(pfn, src_pte); - src_pte = pte_mksmall(src_pte); - } - - /* Is the destination page writable? */ -retry_dest: - dst_ptep = virt_to_pte(current->mm, (unsigned long)dest); - if (dst_ptep == NULL) { - put_page(src_page); - break; - } - dst_pte = *dst_ptep; - if (!hv_pte_get_present(dst_pte) || - !hv_pte_get_writable(dst_pte)) { - put_page(src_page); - break; - } - dst_page = pfn_to_page(pte_pfn(dst_pte)); - if (dst_page == src_page) { - /* - * Source and dest are on the same page; this - * potentially exposes us to incoherence if any - * part of src and dest overlap on a cache line. - * Just give up rather than trying to be precise. - */ - put_page(src_page); - break; - } - get_page(dst_page); - if (pte_val(dst_pte) != pte_val(*dst_ptep)) { - put_page(dst_page); - goto retry_dest; - } - if (pte_huge(dst_pte)) { - /* Adjust the PTE to correspond to a small page */ - int pfn = pte_pfn(dst_pte); - pfn += (((unsigned long)dest & (HPAGE_SIZE-1)) - >> PAGE_SHIFT); - dst_pte = pfn_pte(pfn, dst_pte); - dst_pte = pte_mksmall(dst_pte); - } - - /* All looks good: create a cachable PTE and copy from it */ - copy_size = len; - bytes_left_on_page = - PAGE_SIZE - (((int)source) & (PAGE_SIZE-1)); - if (copy_size > bytes_left_on_page) - copy_size = bytes_left_on_page; - bytes_left_on_page = - PAGE_SIZE - (((int)dest) & (PAGE_SIZE-1)); - if (copy_size > bytes_left_on_page) - copy_size = bytes_left_on_page; - memcpy_multicache(dest, source, dst_pte, src_pte, copy_size); - - /* Release the pages */ - put_page(dst_page); - put_page(src_page); - - /* Continue on the next page */ - dest += copy_size; - source += copy_size; - len -= copy_size; - } - - retval = func(dest, source, len); - put_cpu(); - return retval; -} - -void *memcpy(void *to, const void *from, __kernel_size_t n) -{ - if (n < LARGE_COPY_CUTOFF) - return (void *)__memcpy_asm(to, from, n); - else - return (void *)fast_copy(to, from, n, __memcpy_asm); -} - -unsigned long __copy_to_user_inatomic(void __user *to, const void *from, - unsigned long n) -{ - if (n < LARGE_COPY_CUTOFF) - return __copy_to_user_inatomic_asm(to, from, n); - else - return fast_copy(to, from, n, __copy_to_user_inatomic_asm); -} - -unsigned long __copy_from_user_inatomic(void *to, const void __user *from, - unsigned long n) -{ - if (n < LARGE_COPY_CUTOFF) - return __copy_from_user_inatomic_asm(to, from, n); - else - return fast_copy(to, from, n, __copy_from_user_inatomic_asm); -} - -unsigned long __copy_from_user_zeroing(void *to, const void __user *from, - unsigned long n) -{ - if (n < LARGE_COPY_CUTOFF) - return __copy_from_user_zeroing_asm(to, from, n); - else - return fast_copy(to, from, n, __copy_from_user_zeroing_asm); -} - -#endif /* !CHIP_HAS_COHERENT_LOCAL_CACHE() */ diff --git a/arch/tile/lib/memset_32.c b/arch/tile/lib/memset_32.c index 9a7837d11f7d..2042bfe6595f 100644 --- a/arch/tile/lib/memset_32.c +++ b/arch/tile/lib/memset_32.c @@ -23,11 +23,7 @@ void *memset(void *s, int c, size_t n) int n32; uint32_t v16, v32; uint8_t *out8 = s; -#if !CHIP_HAS_WH64() - int ahead32; -#else int to_align32; -#endif /* Experimentation shows that a trivial tight loop is a win up until * around a size of 20, where writing a word at a time starts to win. @@ -58,21 +54,6 @@ void *memset(void *s, int c, size_t n) return s; } -#if !CHIP_HAS_WH64() - /* Use a spare issue slot to start prefetching the first cache - * line early. This instruction is free as the store can be buried - * in otherwise idle issue slots doing ALU ops. - */ - __insn_prefetch(out8); - - /* We prefetch the end so that a short memset that spans two cache - * lines gets some prefetching benefit. Again we believe this is free - * to issue. - */ - __insn_prefetch(&out8[n - 1]); -#endif /* !CHIP_HAS_WH64() */ - - /* Align 'out8'. We know n >= 3 so this won't write past the end. */ while (((uintptr_t) out8 & 3) != 0) { *out8++ = c; @@ -93,90 +74,6 @@ void *memset(void *s, int c, size_t n) /* This must be at least 8 or the following loop doesn't work. */ #define CACHE_LINE_SIZE_IN_WORDS (CHIP_L2_LINE_SIZE() / 4) -#if !CHIP_HAS_WH64() - - ahead32 = CACHE_LINE_SIZE_IN_WORDS; - - /* We already prefetched the first and last cache lines, so - * we only need to do more prefetching if we are storing - * to more than two cache lines. - */ - if (n32 > CACHE_LINE_SIZE_IN_WORDS * 2) { - int i; - - /* Prefetch the next several cache lines. - * This is the setup code for the software-pipelined - * loop below. - */ -#define MAX_PREFETCH 5 - ahead32 = n32 & -CACHE_LINE_SIZE_IN_WORDS; - if (ahead32 > MAX_PREFETCH * CACHE_LINE_SIZE_IN_WORDS) - ahead32 = MAX_PREFETCH * CACHE_LINE_SIZE_IN_WORDS; - - for (i = CACHE_LINE_SIZE_IN_WORDS; - i < ahead32; i += CACHE_LINE_SIZE_IN_WORDS) - __insn_prefetch(&out32[i]); - } - - if (n32 > ahead32) { - while (1) { - int j; - - /* Prefetch by reading one word several cache lines - * ahead. Since loads are non-blocking this will - * cause the full cache line to be read while we are - * finishing earlier cache lines. Using a store - * here causes microarchitectural performance - * problems where a victimizing store miss goes to - * the head of the retry FIFO and locks the pipe for - * a few cycles. So a few subsequent stores in this - * loop go into the retry FIFO, and then later - * stores see other stores to the same cache line - * are already in the retry FIFO and themselves go - * into the retry FIFO, filling it up and grinding - * to a halt waiting for the original miss to be - * satisfied. - */ - __insn_prefetch(&out32[ahead32]); - -#if CACHE_LINE_SIZE_IN_WORDS % 4 != 0 -#error "Unhandled CACHE_LINE_SIZE_IN_WORDS" -#endif - - n32 -= CACHE_LINE_SIZE_IN_WORDS; - - /* Save icache space by only partially unrolling - * this loop. - */ - for (j = CACHE_LINE_SIZE_IN_WORDS / 4; j > 0; j--) { - *out32++ = v32; - *out32++ = v32; - *out32++ = v32; - *out32++ = v32; - } - - /* To save compiled code size, reuse this loop even - * when we run out of prefetching to do by dropping - * ahead32 down. - */ - if (n32 <= ahead32) { - /* Not even a full cache line left, - * so stop now. - */ - if (n32 < CACHE_LINE_SIZE_IN_WORDS) - break; - - /* Choose a small enough value that we don't - * prefetch past the end. There's no sense - * in touching cache lines we don't have to. - */ - ahead32 = CACHE_LINE_SIZE_IN_WORDS - 1; - } - } - } - -#else /* CHIP_HAS_WH64() */ - /* Determine how many words we need to emit before the 'out32' * pointer becomes aligned modulo the cache line size. */ @@ -233,8 +130,6 @@ void *memset(void *s, int c, size_t n) n32 &= CACHE_LINE_SIZE_IN_WORDS - 1; } -#endif /* CHIP_HAS_WH64() */ - /* Now handle any leftover values. */ if (n32 != 0) { do { |