diff options
Diffstat (limited to 'arch/parisc/lib/memcpy.c')
-rw-r--r-- | arch/parisc/lib/memcpy.c | 461 |
1 files changed, 3 insertions, 458 deletions
diff --git a/arch/parisc/lib/memcpy.c b/arch/parisc/lib/memcpy.c index f82ff10ed974..b3d47ec1d80a 100644 --- a/arch/parisc/lib/memcpy.c +++ b/arch/parisc/lib/memcpy.c @@ -2,7 +2,7 @@ * Optimized memory copy routines. * * Copyright (C) 2004 Randolph Chung <tausq@debian.org> - * Copyright (C) 2013 Helge Deller <deller@gmx.de> + * Copyright (C) 2013-2017 Helge Deller <deller@gmx.de> * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by @@ -21,474 +21,21 @@ * Portions derived from the GNU C Library * Copyright (C) 1991, 1997, 2003 Free Software Foundation, Inc. * - * Several strategies are tried to try to get the best performance for various - * conditions. In the optimal case, we copy 64-bytes in an unrolled loop using - * fp regs. This is followed by loops that copy 32- or 16-bytes at a time using - * general registers. Unaligned copies are handled either by aligning the - * destination and then using shift-and-write method, or in a few cases by - * falling back to a byte-at-a-time copy. - * - * I chose to implement this in C because it is easier to maintain and debug, - * and in my experiments it appears that the C code generated by gcc (3.3/3.4 - * at the time of writing) is fairly optimal. Unfortunately some of the - * semantics of the copy routine (exception handling) is difficult to express - * in C, so we have to play some tricks to get it to work. - * - * All the loads and stores are done via explicit asm() code in order to use - * the right space registers. - * - * Testing with various alignments and buffer sizes shows that this code is - * often >10x faster than a simple byte-at-a-time copy, even for strangely - * aligned operands. It is interesting to note that the glibc version - * of memcpy (written in C) is actually quite fast already. This routine is - * able to beat it by 30-40% for aligned copies because of the loop unrolling, - * but in some cases the glibc version is still slightly faster. This lends - * more credibility that gcc can generate very good code as long as we are - * careful. - * - * TODO: - * - cache prefetching needs more experimentation to get optimal settings - * - try not to use the post-increment address modifiers; they create additional - * interlocks - * - replace byte-copy loops with stybs sequences */ -#ifdef __KERNEL__ #include <linux/module.h> #include <linux/compiler.h> #include <linux/uaccess.h> -#define s_space "%%sr1" -#define d_space "%%sr2" -#else -#include "memcpy.h" -#define s_space "%%sr0" -#define d_space "%%sr0" -#define pa_memcpy new2_copy -#endif DECLARE_PER_CPU(struct exception_data, exception_data); -#define preserve_branch(label) do { \ - volatile int dummy = 0; \ - /* The following branch is never taken, it's just here to */ \ - /* prevent gcc from optimizing away our exception code. */ \ - if (unlikely(dummy != dummy)) \ - goto label; \ -} while (0) - #define get_user_space() (segment_eq(get_fs(), KERNEL_DS) ? 0 : mfsp(3)) #define get_kernel_space() (0) -#define MERGE(w0, sh_1, w1, sh_2) ({ \ - unsigned int _r; \ - asm volatile ( \ - "mtsar %3\n" \ - "shrpw %1, %2, %%sar, %0\n" \ - : "=r"(_r) \ - : "r"(w0), "r"(w1), "r"(sh_2) \ - ); \ - _r; \ -}) -#define THRESHOLD 16 - -#ifdef DEBUG_MEMCPY -#define DPRINTF(fmt, args...) do { printk(KERN_DEBUG "%s:%d:%s ", __FILE__, __LINE__, __func__ ); printk(KERN_DEBUG fmt, ##args ); } while (0) -#else -#define DPRINTF(fmt, args...) -#endif - -#define def_load_ai_insn(_insn,_sz,_tt,_s,_a,_t,_e) \ - __asm__ __volatile__ ( \ - "1:\t" #_insn ",ma " #_sz "(" _s ",%1), %0\n\t" \ - ASM_EXCEPTIONTABLE_ENTRY(1b,_e) \ - : _tt(_t), "+r"(_a) \ - : \ - : "r8") - -#define def_store_ai_insn(_insn,_sz,_tt,_s,_a,_t,_e) \ - __asm__ __volatile__ ( \ - "1:\t" #_insn ",ma %1, " #_sz "(" _s ",%0)\n\t" \ - ASM_EXCEPTIONTABLE_ENTRY(1b,_e) \ - : "+r"(_a) \ - : _tt(_t) \ - : "r8") - -#define ldbma(_s, _a, _t, _e) def_load_ai_insn(ldbs,1,"=r",_s,_a,_t,_e) -#define stbma(_s, _t, _a, _e) def_store_ai_insn(stbs,1,"r",_s,_a,_t,_e) -#define ldwma(_s, _a, _t, _e) def_load_ai_insn(ldw,4,"=r",_s,_a,_t,_e) -#define stwma(_s, _t, _a, _e) def_store_ai_insn(stw,4,"r",_s,_a,_t,_e) -#define flddma(_s, _a, _t, _e) def_load_ai_insn(fldd,8,"=f",_s,_a,_t,_e) -#define fstdma(_s, _t, _a, _e) def_store_ai_insn(fstd,8,"f",_s,_a,_t,_e) - -#define def_load_insn(_insn,_tt,_s,_o,_a,_t,_e) \ - __asm__ __volatile__ ( \ - "1:\t" #_insn " " #_o "(" _s ",%1), %0\n\t" \ - ASM_EXCEPTIONTABLE_ENTRY(1b,_e) \ - : _tt(_t) \ - : "r"(_a) \ - : "r8") - -#define def_store_insn(_insn,_tt,_s,_t,_o,_a,_e) \ - __asm__ __volatile__ ( \ - "1:\t" #_insn " %0, " #_o "(" _s ",%1)\n\t" \ - ASM_EXCEPTIONTABLE_ENTRY(1b,_e) \ - : \ - : _tt(_t), "r"(_a) \ - : "r8") - -#define ldw(_s,_o,_a,_t,_e) def_load_insn(ldw,"=r",_s,_o,_a,_t,_e) -#define stw(_s,_t,_o,_a,_e) def_store_insn(stw,"r",_s,_t,_o,_a,_e) - -#ifdef CONFIG_PREFETCH -static inline void prefetch_src(const void *addr) -{ - __asm__("ldw 0(" s_space ",%0), %%r0" : : "r" (addr)); -} - -static inline void prefetch_dst(const void *addr) -{ - __asm__("ldd 0(" d_space ",%0), %%r0" : : "r" (addr)); -} -#else -#define prefetch_src(addr) do { } while(0) -#define prefetch_dst(addr) do { } while(0) -#endif - -#define PA_MEMCPY_OK 0 -#define PA_MEMCPY_LOAD_ERROR 1 -#define PA_MEMCPY_STORE_ERROR 2 - -/* Copy from a not-aligned src to an aligned dst, using shifts. Handles 4 words - * per loop. This code is derived from glibc. - */ -static noinline unsigned long copy_dstaligned(unsigned long dst, - unsigned long src, unsigned long len) -{ - /* gcc complains that a2 and a3 may be uninitialized, but actually - * they cannot be. Initialize a2/a3 to shut gcc up. - */ - register unsigned int a0, a1, a2 = 0, a3 = 0; - int sh_1, sh_2; - - /* prefetch_src((const void *)src); */ - - /* Calculate how to shift a word read at the memory operation - aligned srcp to make it aligned for copy. */ - sh_1 = 8 * (src % sizeof(unsigned int)); - sh_2 = 8 * sizeof(unsigned int) - sh_1; - - /* Make src aligned by rounding it down. */ - src &= -sizeof(unsigned int); - - switch (len % 4) - { - case 2: - /* a1 = ((unsigned int *) src)[0]; - a2 = ((unsigned int *) src)[1]; */ - ldw(s_space, 0, src, a1, cda_ldw_exc); - ldw(s_space, 4, src, a2, cda_ldw_exc); - src -= 1 * sizeof(unsigned int); - dst -= 3 * sizeof(unsigned int); - len += 2; - goto do1; - case 3: - /* a0 = ((unsigned int *) src)[0]; - a1 = ((unsigned int *) src)[1]; */ - ldw(s_space, 0, src, a0, cda_ldw_exc); - ldw(s_space, 4, src, a1, cda_ldw_exc); - src -= 0 * sizeof(unsigned int); - dst -= 2 * sizeof(unsigned int); - len += 1; - goto do2; - case 0: - if (len == 0) - return PA_MEMCPY_OK; - /* a3 = ((unsigned int *) src)[0]; - a0 = ((unsigned int *) src)[1]; */ - ldw(s_space, 0, src, a3, cda_ldw_exc); - ldw(s_space, 4, src, a0, cda_ldw_exc); - src -=-1 * sizeof(unsigned int); - dst -= 1 * sizeof(unsigned int); - len += 0; - goto do3; - case 1: - /* a2 = ((unsigned int *) src)[0]; - a3 = ((unsigned int *) src)[1]; */ - ldw(s_space, 0, src, a2, cda_ldw_exc); - ldw(s_space, 4, src, a3, cda_ldw_exc); - src -=-2 * sizeof(unsigned int); - dst -= 0 * sizeof(unsigned int); - len -= 1; - if (len == 0) - goto do0; - goto do4; /* No-op. */ - } - - do - { - /* prefetch_src((const void *)(src + 4 * sizeof(unsigned int))); */ -do4: - /* a0 = ((unsigned int *) src)[0]; */ - ldw(s_space, 0, src, a0, cda_ldw_exc); - /* ((unsigned int *) dst)[0] = MERGE (a2, sh_1, a3, sh_2); */ - stw(d_space, MERGE (a2, sh_1, a3, sh_2), 0, dst, cda_stw_exc); -do3: - /* a1 = ((unsigned int *) src)[1]; */ - ldw(s_space, 4, src, a1, cda_ldw_exc); - /* ((unsigned int *) dst)[1] = MERGE (a3, sh_1, a0, sh_2); */ - stw(d_space, MERGE (a3, sh_1, a0, sh_2), 4, dst, cda_stw_exc); -do2: - /* a2 = ((unsigned int *) src)[2]; */ - ldw(s_space, 8, src, a2, cda_ldw_exc); - /* ((unsigned int *) dst)[2] = MERGE (a0, sh_1, a1, sh_2); */ - stw(d_space, MERGE (a0, sh_1, a1, sh_2), 8, dst, cda_stw_exc); -do1: - /* a3 = ((unsigned int *) src)[3]; */ - ldw(s_space, 12, src, a3, cda_ldw_exc); - /* ((unsigned int *) dst)[3] = MERGE (a1, sh_1, a2, sh_2); */ - stw(d_space, MERGE (a1, sh_1, a2, sh_2), 12, dst, cda_stw_exc); - - src += 4 * sizeof(unsigned int); - dst += 4 * sizeof(unsigned int); - len -= 4; - } - while (len != 0); - -do0: - /* ((unsigned int *) dst)[0] = MERGE (a2, sh_1, a3, sh_2); */ - stw(d_space, MERGE (a2, sh_1, a3, sh_2), 0, dst, cda_stw_exc); - - preserve_branch(handle_load_error); - preserve_branch(handle_store_error); - - return PA_MEMCPY_OK; - -handle_load_error: - __asm__ __volatile__ ("cda_ldw_exc:\n"); - return PA_MEMCPY_LOAD_ERROR; - -handle_store_error: - __asm__ __volatile__ ("cda_stw_exc:\n"); - return PA_MEMCPY_STORE_ERROR; -} - - -/* Returns PA_MEMCPY_OK, PA_MEMCPY_LOAD_ERROR or PA_MEMCPY_STORE_ERROR. - * In case of an access fault the faulty address can be read from the per_cpu - * exception data struct. */ -static noinline unsigned long pa_memcpy_internal(void *dstp, const void *srcp, - unsigned long len) -{ - register unsigned long src, dst, t1, t2, t3; - register unsigned char *pcs, *pcd; - register unsigned int *pws, *pwd; - register double *pds, *pdd; - unsigned long ret; - - src = (unsigned long)srcp; - dst = (unsigned long)dstp; - pcs = (unsigned char *)srcp; - pcd = (unsigned char *)dstp; - - /* prefetch_src((const void *)srcp); */ - - if (len < THRESHOLD) - goto byte_copy; - - /* Check alignment */ - t1 = (src ^ dst); - if (unlikely(t1 & (sizeof(double)-1))) - goto unaligned_copy; - - /* src and dst have same alignment. */ - - /* Copy bytes till we are double-aligned. */ - t2 = src & (sizeof(double) - 1); - if (unlikely(t2 != 0)) { - t2 = sizeof(double) - t2; - while (t2 && len) { - /* *pcd++ = *pcs++; */ - ldbma(s_space, pcs, t3, pmc_load_exc); - len--; - stbma(d_space, t3, pcd, pmc_store_exc); - t2--; - } - } - - pds = (double *)pcs; - pdd = (double *)pcd; - -#if 0 - /* Copy 8 doubles at a time */ - while (len >= 8*sizeof(double)) { - register double r1, r2, r3, r4, r5, r6, r7, r8; - /* prefetch_src((char *)pds + L1_CACHE_BYTES); */ - flddma(s_space, pds, r1, pmc_load_exc); - flddma(s_space, pds, r2, pmc_load_exc); - flddma(s_space, pds, r3, pmc_load_exc); - flddma(s_space, pds, r4, pmc_load_exc); - fstdma(d_space, r1, pdd, pmc_store_exc); - fstdma(d_space, r2, pdd, pmc_store_exc); - fstdma(d_space, r3, pdd, pmc_store_exc); - fstdma(d_space, r4, pdd, pmc_store_exc); - -#if 0 - if (L1_CACHE_BYTES <= 32) - prefetch_src((char *)pds + L1_CACHE_BYTES); -#endif - flddma(s_space, pds, r5, pmc_load_exc); - flddma(s_space, pds, r6, pmc_load_exc); - flddma(s_space, pds, r7, pmc_load_exc); - flddma(s_space, pds, r8, pmc_load_exc); - fstdma(d_space, r5, pdd, pmc_store_exc); - fstdma(d_space, r6, pdd, pmc_store_exc); - fstdma(d_space, r7, pdd, pmc_store_exc); - fstdma(d_space, r8, pdd, pmc_store_exc); - len -= 8*sizeof(double); - } -#endif - - pws = (unsigned int *)pds; - pwd = (unsigned int *)pdd; - -word_copy: - while (len >= 8*sizeof(unsigned int)) { - register unsigned int r1,r2,r3,r4,r5,r6,r7,r8; - /* prefetch_src((char *)pws + L1_CACHE_BYTES); */ - ldwma(s_space, pws, r1, pmc_load_exc); - ldwma(s_space, pws, r2, pmc_load_exc); - ldwma(s_space, pws, r3, pmc_load_exc); - ldwma(s_space, pws, r4, pmc_load_exc); - stwma(d_space, r1, pwd, pmc_store_exc); - stwma(d_space, r2, pwd, pmc_store_exc); - stwma(d_space, r3, pwd, pmc_store_exc); - stwma(d_space, r4, pwd, pmc_store_exc); - - ldwma(s_space, pws, r5, pmc_load_exc); - ldwma(s_space, pws, r6, pmc_load_exc); - ldwma(s_space, pws, r7, pmc_load_exc); - ldwma(s_space, pws, r8, pmc_load_exc); - stwma(d_space, r5, pwd, pmc_store_exc); - stwma(d_space, r6, pwd, pmc_store_exc); - stwma(d_space, r7, pwd, pmc_store_exc); - stwma(d_space, r8, pwd, pmc_store_exc); - len -= 8*sizeof(unsigned int); - } - - while (len >= 4*sizeof(unsigned int)) { - register unsigned int r1,r2,r3,r4; - ldwma(s_space, pws, r1, pmc_load_exc); - ldwma(s_space, pws, r2, pmc_load_exc); - ldwma(s_space, pws, r3, pmc_load_exc); - ldwma(s_space, pws, r4, pmc_load_exc); - stwma(d_space, r1, pwd, pmc_store_exc); - stwma(d_space, r2, pwd, pmc_store_exc); - stwma(d_space, r3, pwd, pmc_store_exc); - stwma(d_space, r4, pwd, pmc_store_exc); - len -= 4*sizeof(unsigned int); - } - - pcs = (unsigned char *)pws; - pcd = (unsigned char *)pwd; - -byte_copy: - while (len) { - /* *pcd++ = *pcs++; */ - ldbma(s_space, pcs, t3, pmc_load_exc); - stbma(d_space, t3, pcd, pmc_store_exc); - len--; - } - - return PA_MEMCPY_OK; - -unaligned_copy: - /* possibly we are aligned on a word, but not on a double... */ - if (likely((t1 & (sizeof(unsigned int)-1)) == 0)) { - t2 = src & (sizeof(unsigned int) - 1); - - if (unlikely(t2 != 0)) { - t2 = sizeof(unsigned int) - t2; - while (t2) { - /* *pcd++ = *pcs++; */ - ldbma(s_space, pcs, t3, pmc_load_exc); - stbma(d_space, t3, pcd, pmc_store_exc); - len--; - t2--; - } - } - - pws = (unsigned int *)pcs; - pwd = (unsigned int *)pcd; - goto word_copy; - } - - /* Align the destination. */ - if (unlikely((dst & (sizeof(unsigned int) - 1)) != 0)) { - t2 = sizeof(unsigned int) - (dst & (sizeof(unsigned int) - 1)); - while (t2) { - /* *pcd++ = *pcs++; */ - ldbma(s_space, pcs, t3, pmc_load_exc); - stbma(d_space, t3, pcd, pmc_store_exc); - len--; - t2--; - } - dst = (unsigned long)pcd; - src = (unsigned long)pcs; - } - - ret = copy_dstaligned(dst, src, len / sizeof(unsigned int)); - if (ret) - return ret; - - pcs += (len & -sizeof(unsigned int)); - pcd += (len & -sizeof(unsigned int)); - len %= sizeof(unsigned int); - - preserve_branch(handle_load_error); - preserve_branch(handle_store_error); - - goto byte_copy; - -handle_load_error: - __asm__ __volatile__ ("pmc_load_exc:\n"); - return PA_MEMCPY_LOAD_ERROR; - -handle_store_error: - __asm__ __volatile__ ("pmc_store_exc:\n"); - return PA_MEMCPY_STORE_ERROR; -} - - /* Returns 0 for success, otherwise, returns number of bytes not transferred. */ -static unsigned long pa_memcpy(void *dstp, const void *srcp, unsigned long len) -{ - unsigned long ret, fault_addr, reference; - struct exception_data *d; - - ret = pa_memcpy_internal(dstp, srcp, len); - if (likely(ret == PA_MEMCPY_OK)) - return 0; - - /* if a load or store fault occured we can get the faulty addr */ - d = this_cpu_ptr(&exception_data); - fault_addr = d->fault_addr; - - /* error in load or store? */ - if (ret == PA_MEMCPY_LOAD_ERROR) - reference = (unsigned long) srcp; - else - reference = (unsigned long) dstp; +extern unsigned long pa_memcpy(void *dst, const void *src, + unsigned long len); - DPRINTF("pa_memcpy: fault type = %lu, len=%lu fault_addr=%lu ref=%lu\n", - ret, len, fault_addr, reference); - - if (fault_addr >= reference) - return len - (fault_addr - reference); - else - return len; -} - -#ifdef __KERNEL__ unsigned long __copy_to_user(void __user *dst, const void *src, unsigned long len) { @@ -537,5 +84,3 @@ long probe_kernel_read(void *dst, const void *src, size_t size) return __probe_kernel_read(dst, src, size); } - -#endif |