author     Will Newton <will.newton@linaro.org>    2013-03-26 10:19:35 +0000
committer  Will Newton <will.newton@linaro.org>    2013-03-26 10:19:35 +0000
commit     99a65574c6435888205e4d3f12019c3865a9f9db (patch)
tree       81cd6d32ce722502c6f0801d76bb51cfe9eb83f7
parent     3f92e22221aff7771cf4f6e257b8aea30fa68424 (diff)
Integrate NEON/VFP/ARM optimised memcpy implementation.
Add --with-vfp configure option to allow testing VFP code.
-rw-r--r--  Makefile.am                      20
-rw-r--r--  configure.ac                      8
-rw-r--r--  src/linaro-a9/memcpy-hybrid.S   152
-rw-r--r--  src/linaro-a9/memcpy.S          807
4 files changed, 596 insertions, 391 deletions
diff --git a/Makefile.am b/Makefile.am
index bb4ec36..4a87f07 100644
--- a/Makefile.am
+++ b/Makefile.am
@@ -171,15 +171,15 @@ if HOST_AARCH32
if WITH_NEON
# Pull in the NEON specific files
-neon_sources = \
- src/linaro-a9/memcpy-hybrid.S
neon_bionic_sources = \
reference/bionic/memcpy.S
-neon_cppflags = -mfpu=neon
-neon_dirs = neon
+fpu_flags = -mfpu=neon
else
-alternate_sources = \
- src/linaro-a9/memcpy.S
+if WITH_VFP
+fpu_flags = -mfpu=vfp
+else
+fpu_flags = -msoft-float
+endif
endif
# Benchmarks and example programs
@@ -200,13 +200,12 @@ noinst_LIBRARIES += \
# Main library
libcortex_strings_la_SOURCES = \
- $(neon_sources) \
- $(alternate_sources) \
src/thumb-2/strcpy.c \
src/linaro-a9/memchr.S \
src/linaro-a9/strchr.S \
src/linaro-a9/strlen.S \
- src/linaro-a9/memset.S
+ src/linaro-a9/memset.S \
+ src/linaro-a9/memcpy.S
# Libraries containing the difference reference versions
libbionic_a_SOURCES = \
@@ -259,7 +258,8 @@ try_newlib_LDADD = libmulti.a libnewlib.a -lrt
try_newlib_xscale_SOURCES =
try_newlib_xscale_LDADD = libmulti.a libnewlib-xscale.a -lrt
-AM_CPPFLAGS = $(neon_cppflags)
+AM_CPPFLAGS = $(fpu_flags)
+AM_LDFLAGS = $(fpu_flags)
endif
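
The fpu_flags value selected above is what drives the preprocessor conditionals in the rewritten src/linaro-a9/memcpy.S further down (#ifdef __ARM_NEON__ / #elif !defined (__SOFTFP__) / #else). As a rough illustration only, not part of this patch, and assuming a GCC-style ARM toolchain where -mfpu=neon together with a softfp or hard float ABI predefines __ARM_NEON__ and -msoft-float predefines __SOFTFP__, a small C probe shows which of the three paths each flag setting selects:

    #include <stdio.h>

    int main(void)
    {
        /* Mirrors the conditional structure at the top of the new memcpy.S. */
    #if defined (__ARM_NEON__)
        puts("NEON path: memcpy.S defines USE_NEON and USE_VFP");
    #elif !defined (__SOFTFP__)
        puts("VFP path: memcpy.S defines USE_VFP only");
    #else
        puts("soft-float path: memcpy.S uses core registers (LDRD/STRD) only");
    #endif
        return 0;
    }

Compiling this probe with each of the three fpu_flags settings (plus the toolchain's usual -mfloat-abi) should report the path that the corresponding memcpy build will assemble.
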
diff --git a/configure.ac b/configure.ac
index 498d98c..56f1ced 100644
--- a/configure.ac
+++ b/configure.ac
@@ -77,4 +77,12 @@ AC_ARG_WITH([neon],
AC_SUBST(with_neon)
AM_CONDITIONAL(WITH_NEON, test x$with_neon = xyes)
+AC_ARG_WITH([vfp],
+ AC_HELP_STRING([--with-vfp],
+ [include VFP specific routines @<:@default=yes@:>@]),
+ [with_vfp=$withval],
+ [with_vfp=yes])
+AC_SUBST(with_vfp)
+AM_CONDITIONAL(WITH_VFP, test x$with_vfp = xyes)
+
AC_OUTPUT
diff --git a/src/linaro-a9/memcpy-hybrid.S b/src/linaro-a9/memcpy-hybrid.S
deleted file mode 100644
index 3be24ca..0000000
--- a/src/linaro-a9/memcpy-hybrid.S
+++ /dev/null
@@ -1,152 +0,0 @@
-/* Copyright (c) 2010-2011, Linaro Limited
- All rights reserved.
-
- Redistribution and use in source and binary forms, with or without
- modification, are permitted provided that the following conditions
- are met:
-
- * Redistributions of source code must retain the above copyright
- notice, this list of conditions and the following disclaimer.
-
- * Redistributions in binary form must reproduce the above copyright
- notice, this list of conditions and the following disclaimer in the
- documentation and/or other materials provided with the distribution.
-
- * Neither the name of Linaro Limited nor the names of its
- contributors may be used to endorse or promote products derived
- from this software without specific prior written permission.
-
- THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
- A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
- HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
- SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
- LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
- DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
- THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
- OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-
- Written by Dave Gilbert <david.gilbert@linaro.org>
-
- This memcpy routine is optimised on a Cortex-A9 and should work on
- all ARMv7 processors with NEON. */
-
-@ 2011-09-01 david.gilbert@linaro.org
-@ Extracted from local git 2f11b436
-
- .syntax unified
- .arch armv7-a
-
-@ this lets us check a flag in a 00/ff byte easily in either endianness
-#ifdef __ARMEB__
-#define CHARTSTMASK(c) 1<<(31-(c*8))
-#else
-#define CHARTSTMASK(c) 1<<(c*8)
-#endif
- .text
- .thumb
-
-@ ---------------------------------------------------------------------------
- .thumb_func
- .align 2
- .p2align 4,,15
- .global memcpy
- .type memcpy,%function
-memcpy:
- @ r0 = dest
- @ r1 = source
- @ r2 = count
- @ returns dest in r0
- @ Overlaps of source/dest not allowed according to spec
- @ Note this routine relies on v7 misaligned loads/stores
- pld [r1]
- mov r12, r0 @ stash original r0
- cmp r2,#32
- blt 10f @ take the small copy case separately
-
- @ test for either source or destination being misaligned
- @ (We only rely on word align)
- tst r0,#3
- it eq
- tsteq r1,#3
- bne 30f @ misaligned case
-
-4:
- @ at this point we are word (or better) aligned and have at least
- @ 32 bytes to play with
-
- @ If it's a huge copy, try Neon
- cmp r2, #128*1024
- bge 35f @ Sharing general non-aligned case here, aligned could be faster
-
- push {r3,r4,r5,r6,r7,r8,r10,r11}
-5:
- ldmia r1!,{r3,r4,r5,r6,r7,r8,r10,r11}
- sub r2,r2,#32
- pld [r1,#96]
- cmp r2,#32
- stmia r0!,{r3,r4,r5,r6,r7,r8,r10,r11}
- bge 5b
-
- pop {r3,r4,r5,r6,r7,r8,r10,r11}
- @ We are now down to less than 32 bytes
- cbz r2,15f @ quick exit for the case where we copied a multiple of 32
-
-10: @ small copies (not necessarily aligned - note might be slightly more than 32bytes)
- cmp r2,#4
- blt 12f
-11:
- sub r2,r2,#4
- cmp r2,#4
- ldr r3, [r1],#4
- str r3, [r0],#4
- bge 11b
-12:
- tst r2,#2
- itt ne
- ldrhne r3, [r1],#2
- strhne r3, [r0],#2
-
- tst r2,#1
- itt ne
- ldrbne r3, [r1],#1
- strbne r3, [r0],#1
-
-15: @ exit
- mov r0,r12 @ restore r0
- bx lr
-
- .align 2
- .p2align 4,,15
-30: @ non-aligned - at least 32 bytes to play with
- @ Test for co-misalignment
- eor r3, r0, r1
- tst r3,#3
- beq 50f
-
- @ Use Neon for misaligned
-35:
- vld1.8 {d0,d1,d2,d3}, [r1]!
- sub r2,r2,#32
- cmp r2,#32
- pld [r1,#96]
- vst1.8 {d0,d1,d2,d3}, [r0]!
- bge 35b
- b 10b @ TODO: Probably a bad idea to switch to ARM at this point
-
- .align 2
- .p2align 4,,15
-50: @ Co-misaligned
- @ At this point we've got at least 32 bytes
-51:
- ldrb r3,[r1],#1
- sub r2,r2,#1
- strb r3,[r0],#1
- tst r0,#7
- bne 51b
-
- cmp r2,#32
- blt 10b
- b 4b
diff --git a/src/linaro-a9/memcpy.S b/src/linaro-a9/memcpy.S
index a98a627..4faec18 100644
--- a/src/linaro-a9/memcpy.S
+++ b/src/linaro-a9/memcpy.S
@@ -1,4 +1,4 @@
-/* Copyright (c) 2010-2011, Linaro Limited
+/* Copyright (c) 2013, Linaro Limited
All rights reserved.
Redistribution and use in source and binary forms, with or without
@@ -28,241 +28,590 @@
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
- Written by Dave Gilbert <david.gilbert@linaro.org>
+ This memcpy routine is optimised for Cortex-A cores and takes advantage
+ of VFP or NEON when built with the appropriate flags.
- This memcpy routine is optimised on a Cortex-A9 and should work on
- all ARMv7 processors. */
+ Assumptions:
-@ 2011-09-01 david.gilbert@linaro.org
-@ Extracted from local git 2f11b436
+ ARMv6 (ARMv7-a if using Neon)
+ ARM state
+ Unaligned accesses
+ LDRD/STRD support unaligned word accesses
+ Not tested on big-endian
+
+ */
.syntax unified
- .arch armv7-a
+ /* This implementation requires ARM state. */
+ .arm
+
+#ifdef __ARM_NEON__
+
+ .fpu neon
+ .arch armv7-a
+# define FRAME_SIZE 4
+# define USE_VFP
+# define USE_NEON
+
+#elif !defined (__SOFTFP__)
+
+ .arch armv6
+ .fpu vfpv2
+# define FRAME_SIZE 32
+# define USE_VFP
+
+#else
+ .arch armv6
+# define FRAME_SIZE 32
+
+#endif
-@ this lets us check a flag in a 00/ff byte easily in either endianness
-#ifdef __ARMEB__
-#define CHARTSTMASK(c) 1<<(31-(c*8))
+/* Old versions of GAS incorrectly implement the NEON align semantics. */
+#ifdef BROKEN_ASM_NEON_ALIGN
+#define ALIGN(addr, align) addr,:align
#else
-#define CHARTSTMASK(c) 1<<(c*8)
+#define ALIGN(addr, align) addr:align
#endif
+
+#define PC_OFFSET 8 /* PC pipeline compensation. */
+#define INSN_SIZE 4
+
+/* Call parameters. */
+#define dstin r0
+#define src r1
+#define count r2
+
+/* Locals. */
+#define tmp1 r3
+#define dst ip
+#define tmp2 r10
+
+#ifndef USE_NEON
+/* For bulk copies using GP registers. */
+#define A_l r2 /* Call-clobbered. */
+#define A_h r3 /* Call-clobbered. */
+#define B_l r4
+#define B_h r5
+#define C_l r6
+#define C_h r7
+#define D_l r8
+#define D_h r9
+#endif
+
+/* Number of lines ahead to pre-fetch data. If you change this the code
+ below will need adjustment to compensate. */
+
+#define prefetch_lines 5
+
+#ifdef USE_VFP
+ .macro cpy_line_vfp vreg, base
+ vstr \vreg, [dst, #\base]
+ vldr \vreg, [src, #\base]
+ vstr d0, [dst, #\base + 8]
+ vldr d0, [src, #\base + 8]
+ vstr d1, [dst, #\base + 16]
+ vldr d1, [src, #\base + 16]
+ vstr d2, [dst, #\base + 24]
+ vldr d2, [src, #\base + 24]
+ vstr \vreg, [dst, #\base + 32]
+ vldr \vreg, [src, #\base + prefetch_lines * 64 - 32]
+ vstr d0, [dst, #\base + 40]
+ vldr d0, [src, #\base + 40]
+ vstr d1, [dst, #\base + 48]
+ vldr d1, [src, #\base + 48]
+ vstr d2, [dst, #\base + 56]
+ vldr d2, [src, #\base + 56]
+ .endm
+
+ .macro cpy_tail_vfp vreg, base
+ vstr \vreg, [dst, #\base]
+ vldr \vreg, [src, #\base]
+ vstr d0, [dst, #\base + 8]
+ vldr d0, [src, #\base + 8]
+ vstr d1, [dst, #\base + 16]
+ vldr d1, [src, #\base + 16]
+ vstr d2, [dst, #\base + 24]
+ vldr d2, [src, #\base + 24]
+ vstr \vreg, [dst, #\base + 32]
+ vstr d0, [dst, #\base + 40]
+ vldr d0, [src, #\base + 40]
+ vstr d1, [dst, #\base + 48]
+ vldr d1, [src, #\base + 48]
+ vstr d2, [dst, #\base + 56]
+ vldr d2, [src, #\base + 56]
+ .endm
+#endif
+
+ .macro def_fn f p2align=0
.text
- .thumb
-
-@ ---------------------------------------------------------------------------
- .thumb_func
- .align 2
- .p2align 4,,15
- .global memcpy
- .type memcpy,%function
-memcpy:
- @ r0 = dest
- @ r1 = source
- @ r2 = count
- @ returns dest in r0
- @ Overlaps of source/dest not allowed according to spec
- @ Note this routine relies on v7 misaligned loads/stores
- pld [r1]
- mov r12, r0 @ stash original r0
- cmp r2,#32
- blt 10f @ take the small copy case separately
-
- @ test for either source or destination being misaligned
- @ (We only rely on word align)
- @ TODO: Test for co-misalignment
- tst r0,#3
- it eq
- tsteq r1,#3
- bne 30f @ misaligned case
-
-4:
- @ at this point we are word (or better) aligned and have at least
- @ 32 bytes to play with
- push {r3,r4,r5,r6,r7,r8,r10,r11}
-5:
- ldmia r1!,{r3,r4,r5,r6,r7,r8,r10,r11}
- pld [r1,#96]
- sub r2,r2,#32
- cmp r2,#32
- stmia r0!,{r3,r4,r5,r6,r7,r8,r10,r11}
- bge 5b
-
- pop {r3,r4,r5,r6,r7,r8,r10,r11}
- @ We are now down to less than 32 bytes
- cbz r2,15f @ quick exit for the case where we copied a multiple of 32
-
-10: @ small copies (not necessarily aligned - note might be slightly more than 32bytes)
- cmp r2,#4
- blt 12f
-11:
- sub r2,r2,#4
- cmp r2,#4
- ldr r3, [r1],#4
- str r3, [r0],#4
- bge 11b
-12:
- tst r2,#2
- itt ne
- ldrhne r3, [r1],#2
- strhne r3, [r0],#2
-
- tst r2,#1
- itt ne
- ldrbne r3, [r1],#1
- strbne r3, [r0],#1
-
-15: @ exit
- mov r0,r12 @ restore r0
+ .p2align \p2align
+ .global \f
+ .type \f, %function
+\f:
+ .endm
+
+def_fn memcpy p2align=6
+
+ mov dst, dstin /* Preserve dstin, we need to return it. */
+ cmp count, #64
+ bge .Lcpy_not_short
+ /* Deal with small copies quickly by dropping straight into the
+ exit block. */
+
+.Ltail63unaligned:
+#ifdef USE_NEON
+ and tmp1, count, #0x38
+ rsb tmp1, tmp1, #(56 - PC_OFFSET + INSN_SIZE)
+ add pc, pc, tmp1
+ vld1.8 {d0}, [src]! /* 14 words to go. */
+ vst1.8 {d0}, [dst]!
+ vld1.8 {d0}, [src]! /* 12 words to go. */
+ vst1.8 {d0}, [dst]!
+ vld1.8 {d0}, [src]! /* 10 words to go. */
+ vst1.8 {d0}, [dst]!
+ vld1.8 {d0}, [src]! /* 8 words to go. */
+ vst1.8 {d0}, [dst]!
+ vld1.8 {d0}, [src]! /* 6 words to go. */
+ vst1.8 {d0}, [dst]!
+ vld1.8 {d0}, [src]! /* 4 words to go. */
+ vst1.8 {d0}, [dst]!
+ vld1.8 {d0}, [src]! /* 2 words to go. */
+ vst1.8 {d0}, [dst]!
+
+ tst count, #4
+ ldrne tmp1, [src], #4
+ strne tmp1, [dst], #4
+#else
+ /* Copy up to 15 full words of data. May not be aligned. */
+ /* Cannot use VFP for unaligned data. */
+ and tmp1, count, #0x3c
+ add dst, dst, tmp1
+ add src, src, tmp1
+ rsb tmp1, tmp1, #(60 - PC_OFFSET/2 + INSN_SIZE/2)
+ /* Jump directly into the sequence below at the correct offset. */
+ add pc, pc, tmp1, lsl #1
+
+ ldr tmp1, [src, #-60] /* 15 words to go. */
+ str tmp1, [dst, #-60]
+
+ ldr tmp1, [src, #-56] /* 14 words to go. */
+ str tmp1, [dst, #-56]
+ ldr tmp1, [src, #-52]
+ str tmp1, [dst, #-52]
+
+ ldr tmp1, [src, #-48] /* 12 words to go. */
+ str tmp1, [dst, #-48]
+ ldr tmp1, [src, #-44]
+ str tmp1, [dst, #-44]
+
+ ldr tmp1, [src, #-40] /* 10 words to go. */
+ str tmp1, [dst, #-40]
+ ldr tmp1, [src, #-36]
+ str tmp1, [dst, #-36]
+
+ ldr tmp1, [src, #-32] /* 8 words to go. */
+ str tmp1, [dst, #-32]
+ ldr tmp1, [src, #-28]
+ str tmp1, [dst, #-28]
+
+ ldr tmp1, [src, #-24] /* 6 words to go. */
+ str tmp1, [dst, #-24]
+ ldr tmp1, [src, #-20]
+ str tmp1, [dst, #-20]
+
+ ldr tmp1, [src, #-16] /* 4 words to go. */
+ str tmp1, [dst, #-16]
+ ldr tmp1, [src, #-12]
+ str tmp1, [dst, #-12]
+
+ ldr tmp1, [src, #-8] /* 2 words to go. */
+ str tmp1, [dst, #-8]
+ ldr tmp1, [src, #-4]
+ str tmp1, [dst, #-4]
+#endif
+
+ lsls count, count, #31
+ ldrhcs tmp1, [src], #2
+ ldrbne src, [src] /* Src is dead, use as a scratch. */
+ strhcs tmp1, [dst], #2
+ strbne src, [dst]
+ bx lr
+
+.Lcpy_not_short:
+ /* At least 64 bytes to copy, but don't know the alignment yet. */
+ str tmp2, [sp, #-FRAME_SIZE]!
+ and tmp2, src, #3
+ and tmp1, dst, #3
+ cmp tmp1, tmp2
+ bne .Lcpy_notaligned
+
+#ifdef USE_VFP
+ /* Magic dust alert! Force VFP on Cortex-A9. Experiments show
+ that the FP pipeline is much better at streaming loads and
+ stores. This is outside the critical loop. */
+ vmov.f32 s0, s0
+#endif
+
+ /* SRC and DST have the same mutual 32-bit alignment, but we may
+ still need to pre-copy some bytes to get to natural alignment.
+ We bring DST into full 64-bit alignment. */
+ lsls tmp2, dst, #29
+ beq 1f
+ rsbs tmp2, tmp2, #0
+ sub count, count, tmp2, lsr #29
+ ldrmi tmp1, [src], #4
+ strmi tmp1, [dst], #4
+ lsls tmp2, tmp2, #2
+ ldrhcs tmp1, [src], #2
+ ldrbne tmp2, [src], #1
+ strhcs tmp1, [dst], #2
+ strbne tmp2, [dst], #1
+
+1:
+ subs tmp2, count, #64 /* Use tmp2 for count. */
+ blt .Ltail63aligned
+
+ cmp tmp2, #512
+ bge .Lcpy_body_long
+
+.Lcpy_body_medium: /* Count in tmp2. */
+#ifdef USE_VFP
+1:
+ vldr d0, [src, #0]
+ subs tmp2, tmp2, #64
+ vldr d1, [src, #8]
+ vstr d0, [dst, #0]
+ vldr d0, [src, #16]
+ vstr d1, [dst, #8]
+ vldr d1, [src, #24]
+ vstr d0, [dst, #16]
+ vldr d0, [src, #32]
+ vstr d1, [dst, #24]
+ vldr d1, [src, #40]
+ vstr d0, [dst, #32]
+ vldr d0, [src, #48]
+ vstr d1, [dst, #40]
+ vldr d1, [src, #56]
+ vstr d0, [dst, #48]
+ add src, src, #64
+ vstr d1, [dst, #56]
+ add dst, dst, #64
+ bge 1b
+ tst tmp2, #0x3f
+ beq .Ldone
+
+.Ltail63aligned: /* Count in tmp2. */
+ and tmp1, tmp2, #0x38
+ add dst, dst, tmp1
+ add src, src, tmp1
+ rsb tmp1, tmp1, #(56 - PC_OFFSET + INSN_SIZE)
+ add pc, pc, tmp1
+
+ vldr d0, [src, #-56] /* 14 words to go. */
+ vstr d0, [dst, #-56]
+ vldr d0, [src, #-48] /* 12 words to go. */
+ vstr d0, [dst, #-48]
+ vldr d0, [src, #-40] /* 10 words to go. */
+ vstr d0, [dst, #-40]
+ vldr d0, [src, #-32] /* 8 words to go. */
+ vstr d0, [dst, #-32]
+ vldr d0, [src, #-24] /* 6 words to go. */
+ vstr d0, [dst, #-24]
+ vldr d0, [src, #-16] /* 4 words to go. */
+ vstr d0, [dst, #-16]
+ vldr d0, [src, #-8] /* 2 words to go. */
+ vstr d0, [dst, #-8]
+#else
+ sub src, src, #8
+ sub dst, dst, #8
+1:
+ ldrd A_l, A_h, [src, #8]
+ strd A_l, A_h, [dst, #8]
+ ldrd A_l, A_h, [src, #16]
+ strd A_l, A_h, [dst, #16]
+ ldrd A_l, A_h, [src, #24]
+ strd A_l, A_h, [dst, #24]
+ ldrd A_l, A_h, [src, #32]
+ strd A_l, A_h, [dst, #32]
+ ldrd A_l, A_h, [src, #40]
+ strd A_l, A_h, [dst, #40]
+ ldrd A_l, A_h, [src, #48]
+ strd A_l, A_h, [dst, #48]
+ ldrd A_l, A_h, [src, #56]
+ strd A_l, A_h, [dst, #56]
+ ldrd A_l, A_h, [src, #64]!
+ strd A_l, A_h, [dst, #64]!
+ subs tmp2, tmp2, #64
+ bge 1b
+ tst tmp2, #0x3f
+ bne 1f
+ ldr tmp2,[sp], #FRAME_SIZE
+ bx lr
+1:
+ add src, src, #8
+ add dst, dst, #8
+
+.Ltail63aligned: /* Count in tmp2. */
+ /* Copy up to 7 d-words of data. Similar to Ltail63unaligned, but
+ we know that the src and dest are 32-bit aligned so we can use
+ LDRD/STRD to improve efficiency. */
+ /* TMP2 is now negative, but we don't care about that. The bottom
+ six bits still tell us how many bytes are left to copy. */
+
+ and tmp1, tmp2, #0x38
+ add dst, dst, tmp1
+ add src, src, tmp1
+ rsb tmp1, tmp1, #(56 - PC_OFFSET + INSN_SIZE)
+ add pc, pc, tmp1
+ ldrd A_l, A_h, [src, #-56] /* 14 words to go. */
+ strd A_l, A_h, [dst, #-56]
+ ldrd A_l, A_h, [src, #-48] /* 12 words to go. */
+ strd A_l, A_h, [dst, #-48]
+ ldrd A_l, A_h, [src, #-40] /* 10 words to go. */
+ strd A_l, A_h, [dst, #-40]
+ ldrd A_l, A_h, [src, #-32] /* 8 words to go. */
+ strd A_l, A_h, [dst, #-32]
+ ldrd A_l, A_h, [src, #-24] /* 6 words to go. */
+ strd A_l, A_h, [dst, #-24]
+ ldrd A_l, A_h, [src, #-16] /* 4 words to go. */
+ strd A_l, A_h, [dst, #-16]
+ ldrd A_l, A_h, [src, #-8] /* 2 words to go. */
+ strd A_l, A_h, [dst, #-8]
+
+#endif
+ tst tmp2, #4
+ ldrne tmp1, [src], #4
+ strne tmp1, [dst], #4
+ lsls tmp2, tmp2, #31 /* Count (tmp2) now dead. */
+ ldrhcs tmp1, [src], #2
+ ldrbne tmp2, [src]
+ strhcs tmp1, [dst], #2
+ strbne tmp2, [dst]
+
+.Ldone:
+ ldr tmp2, [sp], #FRAME_SIZE
+ bx lr
+
+.Lcpy_body_long: /* Count in tmp2. */
+
+ /* Long copy. We know that there's at least (prefetch_lines * 64)
+ bytes to go. */
+#ifdef USE_VFP
+ /* Don't use PLD. Instead, read some data in advance of the current
+ copy position into a register. This should act like a PLD
+ operation but we won't have to repeat the transfer. */
+
+ vldr d3, [src, #0]
+ vldr d4, [src, #64]
+ vldr d5, [src, #128]
+ vldr d6, [src, #192]
+ vldr d7, [src, #256]
+
+ vldr d0, [src, #8]
+ vldr d1, [src, #16]
+ vldr d2, [src, #24]
+ add src, src, #32
+
+ subs tmp2, tmp2, #prefetch_lines * 64 * 2
+ blt 2f
+1:
+ cpy_line_vfp d3, 0
+ cpy_line_vfp d4, 64
+ cpy_line_vfp d5, 128
+ add dst, dst, #3 * 64
+ add src, src, #3 * 64
+ cpy_line_vfp d6, 0
+ cpy_line_vfp d7, 64
+ add dst, dst, #2 * 64
+ add src, src, #2 * 64
+ subs tmp2, tmp2, #prefetch_lines * 64
+ bge 1b
+
+2:
+ cpy_tail_vfp d3, 0
+ cpy_tail_vfp d4, 64
+ cpy_tail_vfp d5, 128
+ add src, src, #3 * 64
+ add dst, dst, #3 * 64
+ cpy_tail_vfp d6, 0
+ vstr d7, [dst, #64]
+ vldr d7, [src, #64]
+ vstr d0, [dst, #64 + 8]
+ vldr d0, [src, #64 + 8]
+ vstr d1, [dst, #64 + 16]
+ vldr d1, [src, #64 + 16]
+ vstr d2, [dst, #64 + 24]
+ vldr d2, [src, #64 + 24]
+ vstr d7, [dst, #64 + 32]
+ add src, src, #96
+ vstr d0, [dst, #64 + 40]
+ vstr d1, [dst, #64 + 48]
+ vstr d2, [dst, #64 + 56]
+ add dst, dst, #128
+ add tmp2, tmp2, #prefetch_lines * 64
+ b .Lcpy_body_medium
+#else
+ /* Long copy. Use an SMS style loop to maximize the I/O
+ bandwidth of the core. We don't have enough spare registers
+ to synthesise prefetching, so use PLD operations. */
+ /* Pre-bias src and dst. */
+ sub src, src, #8
+ sub dst, dst, #8
+ pld [src, #8]
+ pld [src, #72]
+ subs tmp2, tmp2, #64
+ pld [src, #136]
+ ldrd A_l, A_h, [src, #8]
+ strd B_l, B_h, [sp, #8]
+ ldrd B_l, B_h, [src, #16]
+ strd C_l, C_h, [sp, #16]
+ ldrd C_l, C_h, [src, #24]
+ strd D_l, D_h, [sp, #24]
+ pld [src, #200]
+ ldrd D_l, D_h, [src, #32]!
+ b 1f
+ .p2align 6
+2:
+ pld [src, #232]
+ strd A_l, A_h, [dst, #40]
+ ldrd A_l, A_h, [src, #40]
+ strd B_l, B_h, [dst, #48]
+ ldrd B_l, B_h, [src, #48]
+ strd C_l, C_h, [dst, #56]
+ ldrd C_l, C_h, [src, #56]
+ strd D_l, D_h, [dst, #64]!
+ ldrd D_l, D_h, [src, #64]!
+ subs tmp2, tmp2, #64
+1:
+ strd A_l, A_h, [dst, #8]
+ ldrd A_l, A_h, [src, #8]
+ strd B_l, B_h, [dst, #16]
+ ldrd B_l, B_h, [src, #16]
+ strd C_l, C_h, [dst, #24]
+ ldrd C_l, C_h, [src, #24]
+ strd D_l, D_h, [dst, #32]
+ ldrd D_l, D_h, [src, #32]
+ bcs 2b
+ /* Save the remaining bytes and restore the callee-saved regs. */
+ strd A_l, A_h, [dst, #40]
+ add src, src, #40
+ strd B_l, B_h, [dst, #48]
+ ldrd B_l, B_h, [sp, #8]
+ strd C_l, C_h, [dst, #56]
+ ldrd C_l, C_h, [sp, #16]
+ strd D_l, D_h, [dst, #64]
+ ldrd D_l, D_h, [sp, #24]
+ add dst, dst, #72
+ tst tmp2, #0x3f
+ bne .Ltail63aligned
+ ldr tmp2, [sp], #FRAME_SIZE
+ bx lr
+#endif
+
+.Lcpy_notaligned:
+ pld [src]
+ pld [src, #64]
+ /* There's at least 64 bytes to copy, but there is no mutual
+ alignment. */
+ /* Bring DST to 64-bit alignment. */
+ lsls tmp2, dst, #29
+ pld [src, #(2 * 64)]
+ beq 1f
+ rsbs tmp2, tmp2, #0
+ sub count, count, tmp2, lsr #29
+ ldrmi tmp1, [src], #4
+ strmi tmp1, [dst], #4
+ lsls tmp2, tmp2, #2
+ ldrbne tmp1, [src], #1
+ ldrhcs tmp2, [src], #2
+ strbne tmp1, [dst], #1
+ strhcs tmp2, [dst], #2
+1:
+ pld [src, #(3 * 64)]
+ subs count, count, #64
+ ldrmi tmp2, [sp], #FRAME_SIZE
+ bmi .Ltail63unaligned
+ pld [src, #(4 * 64)]
+
+#ifdef USE_NEON
+ vld1.8 {d0-d3}, [src]!
+ vld1.8 {d4-d7}, [src]!
+ subs count, count, #64
+ bmi 2f
+1:
+ pld [src, #(4 * 64)]
+ vst1.8 {d0-d3}, [ALIGN (dst, 64)]!
+ vld1.8 {d0-d3}, [src]!
+ vst1.8 {d4-d7}, [ALIGN (dst, 64)]!
+ vld1.8 {d4-d7}, [src]!
+ subs count, count, #64
+ bpl 1b
+2:
+ vst1.8 {d0-d3}, [ALIGN (dst, 64)]!
+ vst1.8 {d4-d7}, [ALIGN (dst, 64)]!
+ ands count, count, #0x3f
+#else
+ /* Use an SMS style loop to maximize the I/O bandwidth. */
+ sub src, src, #4
+ sub dst, dst, #8
+ subs tmp2, count, #64 /* Use tmp2 for count. */
+ ldr A_l, [src, #4]
+ ldr A_h, [src, #8]
+ strd B_l, B_h, [sp, #8]
+ ldr B_l, [src, #12]
+ ldr B_h, [src, #16]
+ strd C_l, C_h, [sp, #16]
+ ldr C_l, [src, #20]
+ ldr C_h, [src, #24]
+ strd D_l, D_h, [sp, #24]
+ ldr D_l, [src, #28]
+ ldr D_h, [src, #32]!
+ b 1f
+ .p2align 6
+2:
+ pld [src, #(5 * 64) - (32 - 4)]
+ strd A_l, A_h, [dst, #40]
+ ldr A_l, [src, #36]
+ ldr A_h, [src, #40]
+ strd B_l, B_h, [dst, #48]
+ ldr B_l, [src, #44]
+ ldr B_h, [src, #48]
+ strd C_l, C_h, [dst, #56]
+ ldr C_l, [src, #52]
+ ldr C_h, [src, #56]
+ strd D_l, D_h, [dst, #64]!
+ ldr D_l, [src, #60]
+ ldr D_h, [src, #64]!
+ subs tmp2, tmp2, #64
+1:
+ strd A_l, A_h, [dst, #8]
+ ldr A_l, [src, #4]
+ ldr A_h, [src, #8]
+ strd B_l, B_h, [dst, #16]
+ ldr B_l, [src, #12]
+ ldr B_h, [src, #16]
+ strd C_l, C_h, [dst, #24]
+ ldr C_l, [src, #20]
+ ldr C_h, [src, #24]
+ strd D_l, D_h, [dst, #32]
+ ldr D_l, [src, #28]
+ ldr D_h, [src, #32]
+ bcs 2b
+
+ /* Save the remaining bytes and restore the callee-saved regs. */
+ strd A_l, A_h, [dst, #40]
+ add src, src, #36
+ strd B_l, B_h, [dst, #48]
+ ldrd B_l, B_h, [sp, #8]
+ strd C_l, C_h, [dst, #56]
+ ldrd C_l, C_h, [sp, #16]
+ strd D_l, D_h, [dst, #64]
+ ldrd D_l, D_h, [sp, #24]
+ add dst, dst, #72
+ ands count, tmp2, #0x3f
+#endif
+ ldr tmp2, [sp], #FRAME_SIZE
+ bne .Ltail63unaligned
bx lr
-30: @ non-aligned - at least 32 bytes to play with
- @ On v7 we're allowed to do ldr's and str's from arbitrary alignments
- @ but not ldrd/strd or ldm/stm
- @ Note Neon is often a better choice misaligned using vld1
-
- @ copy a byte at a time until the point where we have an aligned destination
- @ we know we have enough bytes to go to know we won't run out in this phase
- tst r0,#7
- beq 35f
-
-31:
- ldrb r3,[r1],#1
- sub r2,r2,#1
- strb r3,[r0],#1
- tst r0,#7
- bne 31b
-
- cmp r2,#32 @ Lets get back to knowing we have 32 bytes to play with
- blt 11b
-
- @ Now the store address is aligned
-35:
- push {r3,r4,r5,r6,r7,r8,r10,r11,r12,r14}
- and r6,r1,#3 @ how misaligned we are
- cmp r6,#2
- cbz r6, 100f @ Go there if we're actually aligned
- bge 120f @ And here if it's aligned on 2 or 3 byte
- @ Note might be worth splitting to bgt and a separate beq
- @ if the branches are well separated
-
- @ At this point dest is aligned, source is 1 byte forward
-110:
- ldr r3,[r1] @ Misaligned load - but it gives the first 4 bytes to store
- sub r2,r2,#3 @ Number of bytes left in whole words we can load
- add r1,r1,#3 @ To aligned load address
- bic r3,r3,#0xff000000
-
-112:
- ldmia r1!,{r5,r6,r7,r8}
- sub r2,r2,#32
- cmp r2,#32
- pld [r1,#96]
-
- orr r3,r3,r5,lsl#24
- mov r4,r5,lsr#8
- mov r5,r6,lsr#8
- orr r4,r4,r6,lsl#24
- mov r6,r7,lsr#8
- ldmia r1!,{r10,r11,r12,r14}
- orr r5,r5,r7,lsl#24
- mov r7,r8,lsr#8
- orr r6,r6,r8,lsl#24
- mov r8,r10,lsr#8
- orr r7,r7,r10,lsl#24
- mov r10,r11,lsr#8
- orr r8,r8,r11,lsl#24
- orr r10,r10,r12,lsl#24
- mov r11,r12,lsr#8
- orr r11,r11,r14,lsl#24
- stmia r0!,{r3,r4,r5,r6,r7,r8,r10,r11}
- mov r3,r14,lsr#8
-
- bge 112b
-
- @ Deal with the stragglers
- add r2,r2,#3
- sub r1,r1,#3
- pop {r3,r4,r5,r6,r7,r8,r10,r11,r12,r14}
- b 10b
-
-100: @ Dest and source aligned - must have been originally co-misaligned
- @ Fallback to main aligned case if still big enough
- pop {r3,r4,r5,r6,r7,r8,r10,r11,r12,r14}
- b 4b @ Big copies (32 bytes or more)
-
-120: @ Dest is aligned, source is align+2 or 3
- bgt 130f @ Now split off for 3 byte offset
-
- ldrh r3,[r1]
- sub r2,r2,#2 @ Number of bytes left in whole words we can load
- add r1,r1,#2 @ To aligned load address
-
-122:
- ldmia r1!,{r5,r6,r7,r8}
- sub r2,r2,#32
- cmp r2,#32
- pld [r1,#96]
-
- orr r3,r3,r5,lsl#16
- mov r4,r5,lsr#16
- mov r5,r6,lsr#16
- orr r4,r4,r6,lsl#16
- mov r6,r7,lsr#16
- ldmia r1!,{r10,r11,r12,r14}
- orr r5,r5,r7,lsl#16
- orr r6,r6,r8,lsl#16
- mov r7,r8,lsr#16
- orr r7,r7,r10,lsl#16
- mov r8,r10,lsr#16
- orr r8,r8,r11,lsl#16
- mov r10,r11,lsr#16
- orr r10,r10,r12,lsl#16
- mov r11,r12,lsr#16
- orr r11,r11,r14,lsl#16
- stmia r0!,{r3,r4,r5,r6,r7,r8,r10,r11}
- mov r3,r14,lsr#16
-
- bge 122b
-
- @ Deal with the stragglers
- add r2,r2,#2
- sub r1,r1,#2
- pop {r3,r4,r5,r6,r7,r8,r10,r11,r12,r14}
- b 10b
-
-130: @ Dest is aligned, source is align+3
- ldrb r3,[r1]
- sub r2,r2,#1 @ Number of bytes left in whole words we can load
- add r1,r1,#1 @ To aligned load address
-
-132:
- ldmia r1!,{r5,r6,r7,r8}
- sub r2,r2,#32
- cmp r2,#32
- pld [r1,#96]
-
- orr r3,r3,r5,lsl#8
- mov r4,r5,lsr#24
- mov r5,r6,lsr#24
- orr r4,r4,r6,lsl#8
- mov r6,r7,lsr#24
- ldmia r1!,{r10,r11,r12,r14}
- orr r5,r5,r7,lsl#8
- mov r7,r8,lsr#24
- orr r6,r6,r8,lsl#8
- mov r8,r10,lsr#24
- orr r7,r7,r10,lsl#8
- orr r8,r8,r11,lsl#8
- mov r10,r11,lsr#24
- orr r10,r10,r12,lsl#8
- mov r11,r12,lsr#24
- orr r11,r11,r14,lsl#8
- stmia r0!,{r3,r4,r5,r6,r7,r8,r10,r11}
- mov r3,r14,lsr#24
-
- bge 132b
-
- @ Deal with the stragglers
- add r2,r2,#1
- sub r1,r1,#1
- pop {r3,r4,r5,r6,r7,r8,r10,r11,r12,r14}
- b 10b
+ .size memcpy, . - memcpy
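
Both the deleted memcpy-hybrid.S (eor r3, r0, r1 / tst r3, #3) and the new .Lcpy_not_short prologue take the fast path only when source and destination share the same 32-bit misalignment, then pre-copy a few bytes to bring the destination to 64-bit alignment. A rough C equivalent of those two checks, offered as a sketch rather than anything in the patch:

    #include <stdint.h>
    #include <stddef.h>

    /* Same test as "and tmp2, src, #3 / and tmp1, dst, #3 / cmp tmp1, tmp2"
       (or the XOR form in the deleted hybrid): true when both pointers can
       be word-aligned by copying the same short prefix. */
    static int mutually_aligned(const void *dst, const void *src)
    {
        return (((uintptr_t)dst ^ (uintptr_t)src) & 3) == 0;
    }

    /* Pre-copy up to 7 bytes so dst becomes 8-byte aligned, mirroring the
       "lsls tmp2, dst, #29" prologue; count is reduced accordingly. */
    static void align_dst_to_8(unsigned char **dst, const unsigned char **src,
                               size_t *count)
    {
        size_t pre = (size_t)(-(uintptr_t)*dst & 7);   /* 0..7 bytes */
        for (size_t i = 0; i < pre; i++)
            (*dst)[i] = (*src)[i];
        *dst += pre;
        *src += pre;
        *count -= pre;
    }

After this step the destination is 8-byte aligned while the source may still be only word aligned, which is why the assembly lists "LDRD/STRD support unaligned word accesses" among its assumptions.
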
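The .Ltail63aligned block in the new memcpy.S advances src and dst past the remaining whole doublewords and then uses "add pc, pc, tmp1" to jump into the middle of an unrolled LDRD/STRD sequence, so exactly the right number of copies execute. A minimal C sketch of the same dispatch idea, not part of the patch, using a fall-through switch in place of the computed branch and assuming for simplicity that both pointers are 8-byte aligned (the assembly only needs dst aligned, relying on LDRD unaligned-word support):

    #include <stdint.h>
    #include <stddef.h>

    /* Copy the final (bytes & 0x38) bytes as 8-byte chunks; the last
       (bytes & 7) bytes are handled separately, as in the assembly. */
    static void tail63_aligned(uint64_t *dst, const uint64_t *src, size_t bytes)
    {
        size_t dwords = (bytes & 0x38) >> 3;   /* 0..7 doublewords left */
        dst += dwords;                         /* like "add dst, dst, tmp1" */
        src += dwords;
        switch (dwords) {                      /* computed entry point */
        case 7: dst[-7] = src[-7];             /* fall through */
        case 6: dst[-6] = src[-6];             /* fall through */
        case 5: dst[-5] = src[-5];             /* fall through */
        case 4: dst[-4] = src[-4];             /* fall through */
        case 3: dst[-3] = src[-3];             /* fall through */
        case 2: dst[-2] = src[-2];             /* fall through */
        case 1: dst[-1] = src[-1];             /* fall through */
        case 0: break;
        }
    }

The unaligned variant (.Ltail63unaligned, non-NEON case) plays the same trick with up to 15 LDR/STR word copies, scaling the computed offset by two (add pc, pc, tmp1, lsl #1) because each remaining word needs both a load and a store instruction.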