author    | Will Newton <will.newton@linaro.org> | 2013-03-26 10:19:35 +0000
committer | Will Newton <will.newton@linaro.org> | 2013-03-26 10:19:35 +0000
commit    | 99a65574c6435888205e4d3f12019c3865a9f9db (patch)
tree      | 81cd6d32ce722502c6f0801d76bb51cfe9eb83f7
parent    | 3f92e22221aff7771cf4f6e257b8aea30fa68424 (diff)
Integrate NEON/VFP/ARM optimised memcpy implementation.
Add --with-vfp configure option to allow testing VFP code.
-rw-r--r-- | Makefile.am                   |  20
-rw-r--r-- | configure.ac                  |   8
-rw-r--r-- | src/linaro-a9/memcpy-hybrid.S | 152
-rw-r--r-- | src/linaro-a9/memcpy.S        | 807
4 files changed, 596 insertions(+), 391 deletions(-)
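With this change the build picks a single set of FPU flags from two configure switches: NEON builds keep -mfpu=neon, a VFP-only build (the new --with-vfp option, default yes) gets -mfpu=vfp, and disabling both falls back to -msoft-float, with the same flags now also passed at link time via AM_LDFLAGS. A minimal sketch of exercising each combination follows; the cross-toolchain triplet is an assumption, not part of this commit:

    # Sketch only: the --host triplet is assumed, adjust for your toolchain.
    ./configure --host=arm-linux-gnueabihf                                # default: NEON routines (-mfpu=neon)
    ./configure --host=arm-linux-gnueabihf --with-neon=no                 # VFP path (-mfpu=vfp)
    ./configure --host=arm-linux-gnueabihf --with-neon=no --with-vfp=no   # integer-only fallback (-msoft-float)
    make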
diff --git a/Makefile.am b/Makefile.am index bb4ec36..4a87f07 100644 --- a/Makefile.am +++ b/Makefile.am @@ -171,15 +171,15 @@ if HOST_AARCH32 if WITH_NEON # Pull in the NEON specific files -neon_sources = \ - src/linaro-a9/memcpy-hybrid.S neon_bionic_sources = \ reference/bionic/memcpy.S -neon_cppflags = -mfpu=neon -neon_dirs = neon +fpu_flags = -mfpu=neon else -alternate_sources = \ - src/linaro-a9/memcpy.S +if WITH_VFP +fpu_flags = -mfpu=vfp +else +fpu_flags = -msoft-float +endif endif # Benchmarks and example programs @@ -200,13 +200,12 @@ noinst_LIBRARIES += \ # Main library libcortex_strings_la_SOURCES = \ - $(neon_sources) \ - $(alternate_sources) \ src/thumb-2/strcpy.c \ src/linaro-a9/memchr.S \ src/linaro-a9/strchr.S \ src/linaro-a9/strlen.S \ - src/linaro-a9/memset.S + src/linaro-a9/memset.S \ + src/linaro-a9/memcpy.S # Libraries containing the difference reference versions libbionic_a_SOURCES = \ @@ -259,7 +258,8 @@ try_newlib_LDADD = libmulti.a libnewlib.a -lrt try_newlib_xscale_SOURCES = try_newlib_xscale_LDADD = libmulti.a libnewlib-xscale.a -lrt -AM_CPPFLAGS = $(neon_cppflags) +AM_CPPFLAGS = $(fpu_flags) +AM_LDFLAGS = $(fpu_flags) endif diff --git a/configure.ac b/configure.ac index 498d98c..56f1ced 100644 --- a/configure.ac +++ b/configure.ac @@ -77,4 +77,12 @@ AC_ARG_WITH([neon], AC_SUBST(with_neon) AM_CONDITIONAL(WITH_NEON, test x$with_neon = xyes) +AC_ARG_WITH([vfp], + AC_HELP_STRING([--with-vfp], + [include VFP specific routines @<:@default=yes@:>@]), + [with_vfp=$withval], + [with_vfp=yes]) +AC_SUBST(with_vfp) +AM_CONDITIONAL(WITH_VFP, test x$with_vfp = xyes) + AC_OUTPUT diff --git a/src/linaro-a9/memcpy-hybrid.S b/src/linaro-a9/memcpy-hybrid.S deleted file mode 100644 index 3be24ca..0000000 --- a/src/linaro-a9/memcpy-hybrid.S +++ /dev/null @@ -1,152 +0,0 @@ -/* Copyright (c) 2010-2011, Linaro Limited - All rights reserved. - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions - are met: - - * Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - - * Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. - - * Neither the name of Linaro Limited nor the names of its - contributors may be used to endorse or promote products derived - from this software without specific prior written permission. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - - Written by Dave Gilbert <david.gilbert@linaro.org> - - This memcpy routine is optimised on a Cortex-A9 and should work on - all ARMv7 processors with NEON. 
*/ - -@ 2011-09-01 david.gilbert@linaro.org -@ Extracted from local git 2f11b436 - - .syntax unified - .arch armv7-a - -@ this lets us check a flag in a 00/ff byte easily in either endianness -#ifdef __ARMEB__ -#define CHARTSTMASK(c) 1<<(31-(c*8)) -#else -#define CHARTSTMASK(c) 1<<(c*8) -#endif - .text - .thumb - -@ --------------------------------------------------------------------------- - .thumb_func - .align 2 - .p2align 4,,15 - .global memcpy - .type memcpy,%function -memcpy: - @ r0 = dest - @ r1 = source - @ r2 = count - @ returns dest in r0 - @ Overlaps of source/dest not allowed according to spec - @ Note this routine relies on v7 misaligned loads/stores - pld [r1] - mov r12, r0 @ stash original r0 - cmp r2,#32 - blt 10f @ take the small copy case separately - - @ test for either source or destination being misaligned - @ (We only rely on word align) - tst r0,#3 - it eq - tsteq r1,#3 - bne 30f @ misaligned case - -4: - @ at this point we are word (or better) aligned and have at least - @ 32 bytes to play with - - @ If it's a huge copy, try Neon - cmp r2, #128*1024 - bge 35f @ Sharing general non-aligned case here, aligned could be faster - - push {r3,r4,r5,r6,r7,r8,r10,r11} -5: - ldmia r1!,{r3,r4,r5,r6,r7,r8,r10,r11} - sub r2,r2,#32 - pld [r1,#96] - cmp r2,#32 - stmia r0!,{r3,r4,r5,r6,r7,r8,r10,r11} - bge 5b - - pop {r3,r4,r5,r6,r7,r8,r10,r11} - @ We are now down to less than 32 bytes - cbz r2,15f @ quick exit for the case where we copied a multiple of 32 - -10: @ small copies (not necessarily aligned - note might be slightly more than 32bytes) - cmp r2,#4 - blt 12f -11: - sub r2,r2,#4 - cmp r2,#4 - ldr r3, [r1],#4 - str r3, [r0],#4 - bge 11b -12: - tst r2,#2 - itt ne - ldrhne r3, [r1],#2 - strhne r3, [r0],#2 - - tst r2,#1 - itt ne - ldrbne r3, [r1],#1 - strbne r3, [r0],#1 - -15: @ exit - mov r0,r12 @ restore r0 - bx lr - - .align 2 - .p2align 4,,15 -30: @ non-aligned - at least 32 bytes to play with - @ Test for co-misalignment - eor r3, r0, r1 - tst r3,#3 - beq 50f - - @ Use Neon for misaligned -35: - vld1.8 {d0,d1,d2,d3}, [r1]! - sub r2,r2,#32 - cmp r2,#32 - pld [r1,#96] - vst1.8 {d0,d1,d2,d3}, [r0]! - bge 35b - b 10b @ TODO: Probably a bad idea to switch to ARM at this point - - .align 2 - .p2align 4,,15 -50: @ Co-misaligned - @ At this point we've got at least 32 bytes -51: - ldrb r3,[r1],#1 - sub r2,r2,#1 - strb r3,[r0],#1 - tst r0,#7 - bne 51b - - cmp r2,#32 - blt 10b - b 4b diff --git a/src/linaro-a9/memcpy.S b/src/linaro-a9/memcpy.S index a98a627..4faec18 100644 --- a/src/linaro-a9/memcpy.S +++ b/src/linaro-a9/memcpy.S @@ -1,4 +1,4 @@ -/* Copyright (c) 2010-2011, Linaro Limited +/* Copyright (c) 2013, Linaro Limited All rights reserved. Redistribution and use in source and binary forms, with or without @@ -28,241 +28,590 @@ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - Written by Dave Gilbert <david.gilbert@linaro.org> + This memcpy routine is optimised for Cortex-A cores and takes advantage + of VFP or NEON when built with the appropriate flags. - This memcpy routine is optimised on a Cortex-A9 and should work on - all ARMv7 processors. */ + Assumptions: -@ 2011-09-01 david.gilbert@linaro.org -@ Extracted from local git 2f11b436 + ARMv6 (ARMv7-a if using Neon) + ARM state + Unaligned accesses + LDRD/STRD support unaligned word accesses + Not tested on big-endian + + */ .syntax unified - .arch armv7-a + /* This implementation requires ARM state. 
*/ + .arm + +#ifdef __ARM_NEON__ + + .fpu neon + .arch armv7-a +# define FRAME_SIZE 4 +# define USE_VFP +# define USE_NEON + +#elif !defined (__SOFTFP__) + + .arch armv6 + .fpu vfpv2 +# define FRAME_SIZE 32 +# define USE_VFP + +#else + .arch armv6 +# define FRAME_SIZE 32 + +#endif -@ this lets us check a flag in a 00/ff byte easily in either endianness -#ifdef __ARMEB__ -#define CHARTSTMASK(c) 1<<(31-(c*8)) +/* Old versions of GAS incorrectly implement the NEON align semantics. */ +#ifdef BROKEN_ASM_NEON_ALIGN +#define ALIGN(addr, align) addr,:align #else -#define CHARTSTMASK(c) 1<<(c*8) +#define ALIGN(addr, align) addr:align #endif + +#define PC_OFFSET 8 /* PC pipeline compensation. */ +#define INSN_SIZE 4 + +/* Call parameters. */ +#define dstin r0 +#define src r1 +#define count r2 + +/* Locals. */ +#define tmp1 r3 +#define dst ip +#define tmp2 r10 + +#ifndef USE_NEON +/* For bulk copies using GP registers. */ +#define A_l r2 /* Call-clobbered. */ +#define A_h r3 /* Call-clobbered. */ +#define B_l r4 +#define B_h r5 +#define C_l r6 +#define C_h r7 +#define D_l r8 +#define D_h r9 +#endif + +/* Number of lines ahead to pre-fetch data. If you change this the code + below will need adjustment to compensate. */ + +#define prefetch_lines 5 + +#ifdef USE_VFP + .macro cpy_line_vfp vreg, base + vstr \vreg, [dst, #\base] + vldr \vreg, [src, #\base] + vstr d0, [dst, #\base + 8] + vldr d0, [src, #\base + 8] + vstr d1, [dst, #\base + 16] + vldr d1, [src, #\base + 16] + vstr d2, [dst, #\base + 24] + vldr d2, [src, #\base + 24] + vstr \vreg, [dst, #\base + 32] + vldr \vreg, [src, #\base + prefetch_lines * 64 - 32] + vstr d0, [dst, #\base + 40] + vldr d0, [src, #\base + 40] + vstr d1, [dst, #\base + 48] + vldr d1, [src, #\base + 48] + vstr d2, [dst, #\base + 56] + vldr d2, [src, #\base + 56] + .endm + + .macro cpy_tail_vfp vreg, base + vstr \vreg, [dst, #\base] + vldr \vreg, [src, #\base] + vstr d0, [dst, #\base + 8] + vldr d0, [src, #\base + 8] + vstr d1, [dst, #\base + 16] + vldr d1, [src, #\base + 16] + vstr d2, [dst, #\base + 24] + vldr d2, [src, #\base + 24] + vstr \vreg, [dst, #\base + 32] + vstr d0, [dst, #\base + 40] + vldr d0, [src, #\base + 40] + vstr d1, [dst, #\base + 48] + vldr d1, [src, #\base + 48] + vstr d2, [dst, #\base + 56] + vldr d2, [src, #\base + 56] + .endm +#endif + + .macro def_fn f p2align=0 .text - .thumb - -@ --------------------------------------------------------------------------- - .thumb_func - .align 2 - .p2align 4,,15 - .global memcpy - .type memcpy,%function -memcpy: - @ r0 = dest - @ r1 = source - @ r2 = count - @ returns dest in r0 - @ Overlaps of source/dest not allowed according to spec - @ Note this routine relies on v7 misaligned loads/stores - pld [r1] - mov r12, r0 @ stash original r0 - cmp r2,#32 - blt 10f @ take the small copy case separately - - @ test for either source or destination being misaligned - @ (We only rely on word align) - @ TODO: Test for co-misalignment - tst r0,#3 - it eq - tsteq r1,#3 - bne 30f @ misaligned case - -4: - @ at this point we are word (or better) aligned and have at least - @ 32 bytes to play with - push {r3,r4,r5,r6,r7,r8,r10,r11} -5: - ldmia r1!,{r3,r4,r5,r6,r7,r8,r10,r11} - pld [r1,#96] - sub r2,r2,#32 - cmp r2,#32 - stmia r0!,{r3,r4,r5,r6,r7,r8,r10,r11} - bge 5b - - pop {r3,r4,r5,r6,r7,r8,r10,r11} - @ We are now down to less than 32 bytes - cbz r2,15f @ quick exit for the case where we copied a multiple of 32 - -10: @ small copies (not necessarily aligned - note might be slightly more than 32bytes) - cmp r2,#4 - blt 12f 
-11: - sub r2,r2,#4 - cmp r2,#4 - ldr r3, [r1],#4 - str r3, [r0],#4 - bge 11b -12: - tst r2,#2 - itt ne - ldrhne r3, [r1],#2 - strhne r3, [r0],#2 - - tst r2,#1 - itt ne - ldrbne r3, [r1],#1 - strbne r3, [r0],#1 - -15: @ exit - mov r0,r12 @ restore r0 + .p2align \p2align + .global \f + .type \f, %function +\f: + .endm + +def_fn memcpy p2align=6 + + mov dst, dstin /* Preserve dstin, we need to return it. */ + cmp count, #64 + bge .Lcpy_not_short + /* Deal with small copies quickly by dropping straight into the + exit block. */ + +.Ltail63unaligned: +#ifdef USE_NEON + and tmp1, count, #0x38 + rsb tmp1, tmp1, #(56 - PC_OFFSET + INSN_SIZE) + add pc, pc, tmp1 + vld1.8 {d0}, [src]! /* 14 words to go. */ + vst1.8 {d0}, [dst]! + vld1.8 {d0}, [src]! /* 12 words to go. */ + vst1.8 {d0}, [dst]! + vld1.8 {d0}, [src]! /* 10 words to go. */ + vst1.8 {d0}, [dst]! + vld1.8 {d0}, [src]! /* 8 words to go. */ + vst1.8 {d0}, [dst]! + vld1.8 {d0}, [src]! /* 6 words to go. */ + vst1.8 {d0}, [dst]! + vld1.8 {d0}, [src]! /* 4 words to go. */ + vst1.8 {d0}, [dst]! + vld1.8 {d0}, [src]! /* 2 words to go. */ + vst1.8 {d0}, [dst]! + + tst count, #4 + ldrne tmp1, [src], #4 + strne tmp1, [dst], #4 +#else + /* Copy up to 15 full words of data. May not be aligned. */ + /* Cannot use VFP for unaligned data. */ + and tmp1, count, #0x3c + add dst, dst, tmp1 + add src, src, tmp1 + rsb tmp1, tmp1, #(60 - PC_OFFSET/2 + INSN_SIZE/2) + /* Jump directly into the sequence below at the correct offset. */ + add pc, pc, tmp1, lsl #1 + + ldr tmp1, [src, #-60] /* 15 words to go. */ + str tmp1, [dst, #-60] + + ldr tmp1, [src, #-56] /* 14 words to go. */ + str tmp1, [dst, #-56] + ldr tmp1, [src, #-52] + str tmp1, [dst, #-52] + + ldr tmp1, [src, #-48] /* 12 words to go. */ + str tmp1, [dst, #-48] + ldr tmp1, [src, #-44] + str tmp1, [dst, #-44] + + ldr tmp1, [src, #-40] /* 10 words to go. */ + str tmp1, [dst, #-40] + ldr tmp1, [src, #-36] + str tmp1, [dst, #-36] + + ldr tmp1, [src, #-32] /* 8 words to go. */ + str tmp1, [dst, #-32] + ldr tmp1, [src, #-28] + str tmp1, [dst, #-28] + + ldr tmp1, [src, #-24] /* 6 words to go. */ + str tmp1, [dst, #-24] + ldr tmp1, [src, #-20] + str tmp1, [dst, #-20] + + ldr tmp1, [src, #-16] /* 4 words to go. */ + str tmp1, [dst, #-16] + ldr tmp1, [src, #-12] + str tmp1, [dst, #-12] + + ldr tmp1, [src, #-8] /* 2 words to go. */ + str tmp1, [dst, #-8] + ldr tmp1, [src, #-4] + str tmp1, [dst, #-4] +#endif + + lsls count, count, #31 + ldrhcs tmp1, [src], #2 + ldrbne src, [src] /* Src is dead, use as a scratch. */ + strhcs tmp1, [dst], #2 + strbne src, [dst] + bx lr + +.Lcpy_not_short: + /* At least 64 bytes to copy, but don't know the alignment yet. */ + str tmp2, [sp, #-FRAME_SIZE]! + and tmp2, src, #3 + and tmp1, dst, #3 + cmp tmp1, tmp2 + bne .Lcpy_notaligned + +#ifdef USE_VFP + /* Magic dust alert! Force VFP on Cortex-A9. Experiments show + that the FP pipeline is much better at streaming loads and + stores. This is outside the critical loop. */ + vmov.f32 s0, s0 +#endif + + /* SRC and DST have the same mutual 32-bit alignment, but we may + still need to pre-copy some bytes to get to natural alignment. + We bring DST into full 64-bit alignment. */ + lsls tmp2, dst, #29 + beq 1f + rsbs tmp2, tmp2, #0 + sub count, count, tmp2, lsr #29 + ldrmi tmp1, [src], #4 + strmi tmp1, [dst], #4 + lsls tmp2, tmp2, #2 + ldrhcs tmp1, [src], #2 + ldrbne tmp2, [src], #1 + strhcs tmp1, [dst], #2 + strbne tmp2, [dst], #1 + +1: + subs tmp2, count, #64 /* Use tmp2 for count. 
*/ + blt .Ltail63aligned + + cmp tmp2, #512 + bge .Lcpy_body_long + +.Lcpy_body_medium: /* Count in tmp2. */ +#ifdef USE_VFP +1: + vldr d0, [src, #0] + subs tmp2, tmp2, #64 + vldr d1, [src, #8] + vstr d0, [dst, #0] + vldr d0, [src, #16] + vstr d1, [dst, #8] + vldr d1, [src, #24] + vstr d0, [dst, #16] + vldr d0, [src, #32] + vstr d1, [dst, #24] + vldr d1, [src, #40] + vstr d0, [dst, #32] + vldr d0, [src, #48] + vstr d1, [dst, #40] + vldr d1, [src, #56] + vstr d0, [dst, #48] + add src, src, #64 + vstr d1, [dst, #56] + add dst, dst, #64 + bge 1b + tst tmp2, #0x3f + beq .Ldone + +.Ltail63aligned: /* Count in tmp2. */ + and tmp1, tmp2, #0x38 + add dst, dst, tmp1 + add src, src, tmp1 + rsb tmp1, tmp1, #(56 - PC_OFFSET + INSN_SIZE) + add pc, pc, tmp1 + + vldr d0, [src, #-56] /* 14 words to go. */ + vstr d0, [dst, #-56] + vldr d0, [src, #-48] /* 12 words to go. */ + vstr d0, [dst, #-48] + vldr d0, [src, #-40] /* 10 words to go. */ + vstr d0, [dst, #-40] + vldr d0, [src, #-32] /* 8 words to go. */ + vstr d0, [dst, #-32] + vldr d0, [src, #-24] /* 6 words to go. */ + vstr d0, [dst, #-24] + vldr d0, [src, #-16] /* 4 words to go. */ + vstr d0, [dst, #-16] + vldr d0, [src, #-8] /* 2 words to go. */ + vstr d0, [dst, #-8] +#else + sub src, src, #8 + sub dst, dst, #8 +1: + ldrd A_l, A_h, [src, #8] + strd A_l, A_h, [dst, #8] + ldrd A_l, A_h, [src, #16] + strd A_l, A_h, [dst, #16] + ldrd A_l, A_h, [src, #24] + strd A_l, A_h, [dst, #24] + ldrd A_l, A_h, [src, #32] + strd A_l, A_h, [dst, #32] + ldrd A_l, A_h, [src, #40] + strd A_l, A_h, [dst, #40] + ldrd A_l, A_h, [src, #48] + strd A_l, A_h, [dst, #48] + ldrd A_l, A_h, [src, #56] + strd A_l, A_h, [dst, #56] + ldrd A_l, A_h, [src, #64]! + strd A_l, A_h, [dst, #64]! + subs tmp2, tmp2, #64 + bge 1b + tst tmp2, #0x3f + bne 1f + ldr tmp2,[sp], #FRAME_SIZE + bx lr +1: + add src, src, #8 + add dst, dst, #8 + +.Ltail63aligned: /* Count in tmp2. */ + /* Copy up to 7 d-words of data. Similar to Ltail63unaligned, but + we know that the src and dest are 32-bit aligned so we can use + LDRD/STRD to improve efficiency. */ + /* TMP2 is now negative, but we don't care about that. The bottom + six bits still tell us how many bytes are left to copy. */ + + and tmp1, tmp2, #0x38 + add dst, dst, tmp1 + add src, src, tmp1 + rsb tmp1, tmp1, #(56 - PC_OFFSET + INSN_SIZE) + add pc, pc, tmp1 + ldrd A_l, A_h, [src, #-56] /* 14 words to go. */ + strd A_l, A_h, [dst, #-56] + ldrd A_l, A_h, [src, #-48] /* 12 words to go. */ + strd A_l, A_h, [dst, #-48] + ldrd A_l, A_h, [src, #-40] /* 10 words to go. */ + strd A_l, A_h, [dst, #-40] + ldrd A_l, A_h, [src, #-32] /* 8 words to go. */ + strd A_l, A_h, [dst, #-32] + ldrd A_l, A_h, [src, #-24] /* 6 words to go. */ + strd A_l, A_h, [dst, #-24] + ldrd A_l, A_h, [src, #-16] /* 4 words to go. */ + strd A_l, A_h, [dst, #-16] + ldrd A_l, A_h, [src, #-8] /* 2 words to go. */ + strd A_l, A_h, [dst, #-8] + +#endif + tst tmp2, #4 + ldrne tmp1, [src], #4 + strne tmp1, [dst], #4 + lsls tmp2, tmp2, #31 /* Count (tmp2) now dead. */ + ldrhcs tmp1, [src], #2 + ldrbne tmp2, [src] + strhcs tmp1, [dst], #2 + strbne tmp2, [dst] + +.Ldone: + ldr tmp2, [sp], #FRAME_SIZE + bx lr + +.Lcpy_body_long: /* Count in tmp2. */ + + /* Long copy. We know that there's at least (prefetch_lines * 64) + bytes to go. */ +#ifdef USE_VFP + /* Don't use PLD. Instead, read some data in advance of the current + copy position into a register. This should act like a PLD + operation but we won't have to repeat the transfer. 
*/ + + vldr d3, [src, #0] + vldr d4, [src, #64] + vldr d5, [src, #128] + vldr d6, [src, #192] + vldr d7, [src, #256] + + vldr d0, [src, #8] + vldr d1, [src, #16] + vldr d2, [src, #24] + add src, src, #32 + + subs tmp2, tmp2, #prefetch_lines * 64 * 2 + blt 2f +1: + cpy_line_vfp d3, 0 + cpy_line_vfp d4, 64 + cpy_line_vfp d5, 128 + add dst, dst, #3 * 64 + add src, src, #3 * 64 + cpy_line_vfp d6, 0 + cpy_line_vfp d7, 64 + add dst, dst, #2 * 64 + add src, src, #2 * 64 + subs tmp2, tmp2, #prefetch_lines * 64 + bge 1b + +2: + cpy_tail_vfp d3, 0 + cpy_tail_vfp d4, 64 + cpy_tail_vfp d5, 128 + add src, src, #3 * 64 + add dst, dst, #3 * 64 + cpy_tail_vfp d6, 0 + vstr d7, [dst, #64] + vldr d7, [src, #64] + vstr d0, [dst, #64 + 8] + vldr d0, [src, #64 + 8] + vstr d1, [dst, #64 + 16] + vldr d1, [src, #64 + 16] + vstr d2, [dst, #64 + 24] + vldr d2, [src, #64 + 24] + vstr d7, [dst, #64 + 32] + add src, src, #96 + vstr d0, [dst, #64 + 40] + vstr d1, [dst, #64 + 48] + vstr d2, [dst, #64 + 56] + add dst, dst, #128 + add tmp2, tmp2, #prefetch_lines * 64 + b .Lcpy_body_medium +#else + /* Long copy. Use an SMS style loop to maximize the I/O + bandwidth of the core. We don't have enough spare registers + to synthesise prefetching, so use PLD operations. */ + /* Pre-bias src and dst. */ + sub src, src, #8 + sub dst, dst, #8 + pld [src, #8] + pld [src, #72] + subs tmp2, tmp2, #64 + pld [src, #136] + ldrd A_l, A_h, [src, #8] + strd B_l, B_h, [sp, #8] + ldrd B_l, B_h, [src, #16] + strd C_l, C_h, [sp, #16] + ldrd C_l, C_h, [src, #24] + strd D_l, D_h, [sp, #24] + pld [src, #200] + ldrd D_l, D_h, [src, #32]! + b 1f + .p2align 6 +2: + pld [src, #232] + strd A_l, A_h, [dst, #40] + ldrd A_l, A_h, [src, #40] + strd B_l, B_h, [dst, #48] + ldrd B_l, B_h, [src, #48] + strd C_l, C_h, [dst, #56] + ldrd C_l, C_h, [src, #56] + strd D_l, D_h, [dst, #64]! + ldrd D_l, D_h, [src, #64]! + subs tmp2, tmp2, #64 +1: + strd A_l, A_h, [dst, #8] + ldrd A_l, A_h, [src, #8] + strd B_l, B_h, [dst, #16] + ldrd B_l, B_h, [src, #16] + strd C_l, C_h, [dst, #24] + ldrd C_l, C_h, [src, #24] + strd D_l, D_h, [dst, #32] + ldrd D_l, D_h, [src, #32] + bcs 2b + /* Save the remaining bytes and restore the callee-saved regs. */ + strd A_l, A_h, [dst, #40] + add src, src, #40 + strd B_l, B_h, [dst, #48] + ldrd B_l, B_h, [sp, #8] + strd C_l, C_h, [dst, #56] + ldrd C_l, C_h, [sp, #16] + strd D_l, D_h, [dst, #64] + ldrd D_l, D_h, [sp, #24] + add dst, dst, #72 + tst tmp2, #0x3f + bne .Ltail63aligned + ldr tmp2, [sp], #FRAME_SIZE + bx lr +#endif + +.Lcpy_notaligned: + pld [src] + pld [src, #64] + /* There's at least 64 bytes to copy, but there is no mutual + alignment. */ + /* Bring DST to 64-bit alignment. */ + lsls tmp2, dst, #29 + pld [src, #(2 * 64)] + beq 1f + rsbs tmp2, tmp2, #0 + sub count, count, tmp2, lsr #29 + ldrmi tmp1, [src], #4 + strmi tmp1, [dst], #4 + lsls tmp2, tmp2, #2 + ldrbne tmp1, [src], #1 + ldrhcs tmp2, [src], #2 + strbne tmp1, [dst], #1 + strhcs tmp2, [dst], #2 +1: + pld [src, #(3 * 64)] + subs count, count, #64 + ldrmi tmp2, [sp], #FRAME_SIZE + bmi .Ltail63unaligned + pld [src, #(4 * 64)] + +#ifdef USE_NEON + vld1.8 {d0-d3}, [src]! + vld1.8 {d4-d7}, [src]! + subs count, count, #64 + bmi 2f +1: + pld [src, #(4 * 64)] + vst1.8 {d0-d3}, [ALIGN (dst, 64)]! + vld1.8 {d0-d3}, [src]! + vst1.8 {d4-d7}, [ALIGN (dst, 64)]! + vld1.8 {d4-d7}, [src]! + subs count, count, #64 + bpl 1b +2: + vst1.8 {d0-d3}, [ALIGN (dst, 64)]! + vst1.8 {d4-d7}, [ALIGN (dst, 64)]! + ands count, count, #0x3f +#else + /* Use an SMS style loop to maximize the I/O bandwidth. 
*/ + sub src, src, #4 + sub dst, dst, #8 + subs tmp2, count, #64 /* Use tmp2 for count. */ + ldr A_l, [src, #4] + ldr A_h, [src, #8] + strd B_l, B_h, [sp, #8] + ldr B_l, [src, #12] + ldr B_h, [src, #16] + strd C_l, C_h, [sp, #16] + ldr C_l, [src, #20] + ldr C_h, [src, #24] + strd D_l, D_h, [sp, #24] + ldr D_l, [src, #28] + ldr D_h, [src, #32]! + b 1f + .p2align 6 +2: + pld [src, #(5 * 64) - (32 - 4)] + strd A_l, A_h, [dst, #40] + ldr A_l, [src, #36] + ldr A_h, [src, #40] + strd B_l, B_h, [dst, #48] + ldr B_l, [src, #44] + ldr B_h, [src, #48] + strd C_l, C_h, [dst, #56] + ldr C_l, [src, #52] + ldr C_h, [src, #56] + strd D_l, D_h, [dst, #64]! + ldr D_l, [src, #60] + ldr D_h, [src, #64]! + subs tmp2, tmp2, #64 +1: + strd A_l, A_h, [dst, #8] + ldr A_l, [src, #4] + ldr A_h, [src, #8] + strd B_l, B_h, [dst, #16] + ldr B_l, [src, #12] + ldr B_h, [src, #16] + strd C_l, C_h, [dst, #24] + ldr C_l, [src, #20] + ldr C_h, [src, #24] + strd D_l, D_h, [dst, #32] + ldr D_l, [src, #28] + ldr D_h, [src, #32] + bcs 2b + + /* Save the remaining bytes and restore the callee-saved regs. */ + strd A_l, A_h, [dst, #40] + add src, src, #36 + strd B_l, B_h, [dst, #48] + ldrd B_l, B_h, [sp, #8] + strd C_l, C_h, [dst, #56] + ldrd C_l, C_h, [sp, #16] + strd D_l, D_h, [dst, #64] + ldrd D_l, D_h, [sp, #24] + add dst, dst, #72 + ands count, tmp2, #0x3f +#endif + ldr tmp2, [sp], #FRAME_SIZE + bne .Ltail63unaligned bx lr -30: @ non-aligned - at least 32 bytes to play with - @ On v7 we're allowed to do ldr's and str's from arbitrary alignments - @ but not ldrd/strd or ldm/stm - @ Note Neon is often a better choice misaligned using vld1 - - @ copy a byte at a time until the point where we have an aligned destination - @ we know we have enough bytes to go to know we won't run out in this phase - tst r0,#7 - beq 35f - -31: - ldrb r3,[r1],#1 - sub r2,r2,#1 - strb r3,[r0],#1 - tst r0,#7 - bne 31b - - cmp r2,#32 @ Lets get back to knowing we have 32 bytes to play with - blt 11b - - @ Now the store address is aligned -35: - push {r3,r4,r5,r6,r7,r8,r10,r11,r12,r14} - and r6,r1,#3 @ how misaligned we are - cmp r6,#2 - cbz r6, 100f @ Go there if we're actually aligned - bge 120f @ And here if it's aligned on 2 or 3 byte - @ Note might be worth splitting to bgt and a separate beq - @ if the branches are well separated - - @ At this point dest is aligned, source is 1 byte forward -110: - ldr r3,[r1] @ Misaligned load - but it gives the first 4 bytes to store - sub r2,r2,#3 @ Number of bytes left in whole words we can load - add r1,r1,#3 @ To aligned load address - bic r3,r3,#0xff000000 - -112: - ldmia r1!,{r5,r6,r7,r8} - sub r2,r2,#32 - cmp r2,#32 - pld [r1,#96] - - orr r3,r3,r5,lsl#24 - mov r4,r5,lsr#8 - mov r5,r6,lsr#8 - orr r4,r4,r6,lsl#24 - mov r6,r7,lsr#8 - ldmia r1!,{r10,r11,r12,r14} - orr r5,r5,r7,lsl#24 - mov r7,r8,lsr#8 - orr r6,r6,r8,lsl#24 - mov r8,r10,lsr#8 - orr r7,r7,r10,lsl#24 - mov r10,r11,lsr#8 - orr r8,r8,r11,lsl#24 - orr r10,r10,r12,lsl#24 - mov r11,r12,lsr#8 - orr r11,r11,r14,lsl#24 - stmia r0!,{r3,r4,r5,r6,r7,r8,r10,r11} - mov r3,r14,lsr#8 - - bge 112b - - @ Deal with the stragglers - add r2,r2,#3 - sub r1,r1,#3 - pop {r3,r4,r5,r6,r7,r8,r10,r11,r12,r14} - b 10b - -100: @ Dest and source aligned - must have been originally co-misaligned - @ Fallback to main aligned case if still big enough - pop {r3,r4,r5,r6,r7,r8,r10,r11,r12,r14} - b 4b @ Big copies (32 bytes or more) - -120: @ Dest is aligned, source is align+2 or 3 - bgt 130f @ Now split off for 3 byte offset - - ldrh r3,[r1] - sub r2,r2,#2 @ Number of bytes left 
in whole words we can load - add r1,r1,#2 @ To aligned load address - -122: - ldmia r1!,{r5,r6,r7,r8} - sub r2,r2,#32 - cmp r2,#32 - pld [r1,#96] - - orr r3,r3,r5,lsl#16 - mov r4,r5,lsr#16 - mov r5,r6,lsr#16 - orr r4,r4,r6,lsl#16 - mov r6,r7,lsr#16 - ldmia r1!,{r10,r11,r12,r14} - orr r5,r5,r7,lsl#16 - orr r6,r6,r8,lsl#16 - mov r7,r8,lsr#16 - orr r7,r7,r10,lsl#16 - mov r8,r10,lsr#16 - orr r8,r8,r11,lsl#16 - mov r10,r11,lsr#16 - orr r10,r10,r12,lsl#16 - mov r11,r12,lsr#16 - orr r11,r11,r14,lsl#16 - stmia r0!,{r3,r4,r5,r6,r7,r8,r10,r11} - mov r3,r14,lsr#16 - - bge 122b - - @ Deal with the stragglers - add r2,r2,#2 - sub r1,r1,#2 - pop {r3,r4,r5,r6,r7,r8,r10,r11,r12,r14} - b 10b - -130: @ Dest is aligned, source is align+3 - ldrb r3,[r1] - sub r2,r2,#1 @ Number of bytes left in whole words we can load - add r1,r1,#1 @ To aligned load address - -132: - ldmia r1!,{r5,r6,r7,r8} - sub r2,r2,#32 - cmp r2,#32 - pld [r1,#96] - - orr r3,r3,r5,lsl#8 - mov r4,r5,lsr#24 - mov r5,r6,lsr#24 - orr r4,r4,r6,lsl#8 - mov r6,r7,lsr#24 - ldmia r1!,{r10,r11,r12,r14} - orr r5,r5,r7,lsl#8 - mov r7,r8,lsr#24 - orr r6,r6,r8,lsl#8 - mov r8,r10,lsr#24 - orr r7,r7,r10,lsl#8 - orr r8,r8,r11,lsl#8 - mov r10,r11,lsr#24 - orr r10,r10,r12,lsl#8 - mov r11,r12,lsr#24 - orr r11,r11,r14,lsl#8 - stmia r0!,{r3,r4,r5,r6,r7,r8,r10,r11} - mov r3,r14,lsr#24 - - bge 132b - - @ Deal with the stragglers - add r2,r2,#1 - sub r1,r1,#1 - pop {r3,r4,r5,r6,r7,r8,r10,r11,r12,r14} - b 10b + .size memcpy, . - memcpy |
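The new memcpy.S selects its body at assembly time from the compiler's predefined macros: __ARM_NEON__ enables the NEON path, no __SOFTFP__ and no NEON selects the VFP (vfpv2) path, and a soft-float build gets the plain ARMv6 LDRD/STRD path. A hedged way to check which macros a given fpu_flags value produces is to dump the preprocessor defines; the compiler name, its default float ABI, and whether it accepts -msoft-float for preprocessing are assumptions:

    # Sketch: show the predefines memcpy.S keys off for each fpu_flags setting.
    # No output for a command means neither macro is defined, i.e. the VFP path.
    arm-linux-gnueabihf-gcc -mfpu=neon   -dM -E - </dev/null | grep -E '__ARM_NEON__|__SOFTFP__'
    arm-linux-gnueabihf-gcc -mfpu=vfp    -dM -E - </dev/null | grep -E '__ARM_NEON__|__SOFTFP__'
    arm-linux-gnueabihf-gcc -msoft-float -dM -E - </dev/null | grep -E '__ARM_NEON__|__SOFTFP__'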