diff options
Diffstat (limited to 'android-tools/static-binary/src/memcpy_impl.S')
-rw-r--r-- | android-tools/static-binary/src/memcpy_impl.S | 627 |
1 files changed, 0 insertions, 627 deletions
diff --git a/android-tools/static-binary/src/memcpy_impl.S b/android-tools/static-binary/src/memcpy_impl.S deleted file mode 100644 index f6e06f4..0000000 --- a/android-tools/static-binary/src/memcpy_impl.S +++ /dev/null @@ -1,627 +0,0 @@ - /* Copyright (c) 2013, Linaro Limited - All rights reserved. - - Redistribution and use in source and binary forms, with or without - modification, are permitted provided that the following conditions - are met: - - * Redistributions of source code must retain the above copyright - notice, this list of conditions and the following disclaimer. - - * Redistributions in binary form must reproduce the above copyright - notice, this list of conditions and the following disclaimer in the - documentation and/or other materials provided with the distribution. - - * Neither the name of Linaro Limited nor the names of its - contributors may be used to endorse or promote products derived - from this software without specific prior written permission. - - THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - - */ - - /* - This memcpy routine is optimised for Cortex-A15 cores and takes advantage - of VFP or NEON when built with the appropriate flags. - - Assumptions: - - ARMv6 (ARMv7-a if using Neon) - ARM state - Unaligned accesses - - */ - -/* Define an entry point visible from C. */ -#define C_LABEL(name) name: - -#define ENTRY(name) \ - .global name; \ - .type name, %function; \ - .align 4; \ - C_LABEL(name) \ - .cfi_sections .debug_frame; \ - .cfi_startproc; - -#undef END -#define END(name) \ - .cfi_endproc; - - - .syntax unified - /* This implementation requires ARM state. */ - .arm - -#ifdef MEMCPY_NEON - - .fpu neon - .arch armv7-a - # define FRAME_SIZE 4 - # define USE_VFP - # define USE_NEON - -#elif defined (MEMCPY_VFP) - - .arch armv6 - .fpu vfpv2 - # define FRAME_SIZE 32 - # define USE_VFP - -#else - .arch armv6 - # define FRAME_SIZE 32 - -#endif - - /* Old versions of GAS incorrectly implement the NEON align semantics. */ - #ifdef BROKEN_ASM_NEON_ALIGN - #define ALIGN(addr, align) addr,:align - #else - #define ALIGN(addr, align) addr:align - #endif - - #define PC_OFFSET 8 /* PC pipeline compensation. */ - #define INSN_SIZE 4 - - /* Call parameters. */ - #define dstin r0 - #define src r1 - #define count r2 - - /* Locals. */ - #define tmp1 r3 - #define dst ip - #define tmp2 r10 - - #ifndef USE_NEON - /* For bulk copies using GP registers. */ - #define A_l r2 /* Call-clobbered. */ - #define A_h r3 /* Call-clobbered. */ - #define B_l r4 - #define B_h r5 - #define C_l r6 - #define C_h r7 - #define D_l r8 - #define D_h r9 - #endif - - /* Number of lines ahead to pre-fetch data. If you change this the code - below will need adjustment to compensate. */ - - #define prefetch_lines 5 - - #ifdef USE_VFP - .macro cpy_line_vfp vreg, base - vstr \vreg, [dst, #\base] - vldr \vreg, [src, #\base] - vstr d0, [dst, #\base + 8] - vldr d0, [src, #\base + 8] - vstr d1, [dst, #\base + 16] - vldr d1, [src, #\base + 16] - vstr d2, [dst, #\base + 24] - vldr d2, [src, #\base + 24] - vstr \vreg, [dst, #\base + 32] - vldr \vreg, [src, #\base + prefetch_lines * 64 - 32] - vstr d0, [dst, #\base + 40] - vldr d0, [src, #\base + 40] - vstr d1, [dst, #\base + 48] - vldr d1, [src, #\base + 48] - vstr d2, [dst, #\base + 56] - vldr d2, [src, #\base + 56] - .endm - - .macro cpy_tail_vfp vreg, base - vstr \vreg, [dst, #\base] - vldr \vreg, [src, #\base] - vstr d0, [dst, #\base + 8] - vldr d0, [src, #\base + 8] - vstr d1, [dst, #\base + 16] - vldr d1, [src, #\base + 16] - vstr d2, [dst, #\base + 24] - vldr d2, [src, #\base + 24] - vstr \vreg, [dst, #\base + 32] - vstr d0, [dst, #\base + 40] - vldr d0, [src, #\base + 40] - vstr d1, [dst, #\base + 48] - vldr d1, [src, #\base + 48] - vstr d2, [dst, #\base + 56] - vldr d2, [src, #\base + 56] - .endm - #endif - - .p2align 6 -ENTRY(memcpy) - - mov dst, dstin /* Preserve dstin, we need to return it. */ - cmp count, #64 - bge .Lcpy_not_short - /* Deal with small copies quickly by dropping straight into the - exit block. */ - - .Ltail63unaligned: - #ifdef USE_NEON - and tmp1, count, #0x38 - rsb tmp1, tmp1, #(56 - PC_OFFSET + INSN_SIZE) - add pc, pc, tmp1 - vld1.8 {d0}, [src]! /* 14 words to go. */ - vst1.8 {d0}, [dst]! - vld1.8 {d0}, [src]! /* 12 words to go. */ - vst1.8 {d0}, [dst]! - vld1.8 {d0}, [src]! /* 10 words to go. */ - vst1.8 {d0}, [dst]! - vld1.8 {d0}, [src]! /* 8 words to go. */ - vst1.8 {d0}, [dst]! - vld1.8 {d0}, [src]! /* 6 words to go. */ - vst1.8 {d0}, [dst]! - vld1.8 {d0}, [src]! /* 4 words to go. */ - vst1.8 {d0}, [dst]! - vld1.8 {d0}, [src]! /* 2 words to go. */ - vst1.8 {d0}, [dst]! - - tst count, #4 - ldrne tmp1, [src], #4 - strne tmp1, [dst], #4 - #else - /* Copy up to 15 full words of data. May not be aligned. */ - /* Cannot use VFP for unaligned data. */ - and tmp1, count, #0x3c - add dst, dst, tmp1 - add src, src, tmp1 - rsb tmp1, tmp1, #(60 - PC_OFFSET/2 + INSN_SIZE/2) - /* Jump directly into the sequence below at the correct offset. */ - add pc, pc, tmp1, lsl #1 - - ldr tmp1, [src, #-60] /* 15 words to go. */ - str tmp1, [dst, #-60] - - ldr tmp1, [src, #-56] /* 14 words to go. */ - str tmp1, [dst, #-56] - ldr tmp1, [src, #-52] - str tmp1, [dst, #-52] - - ldr tmp1, [src, #-48] /* 12 words to go. */ - str tmp1, [dst, #-48] - ldr tmp1, [src, #-44] - str tmp1, [dst, #-44] - - ldr tmp1, [src, #-40] /* 10 words to go. */ - str tmp1, [dst, #-40] - ldr tmp1, [src, #-36] - str tmp1, [dst, #-36] - - ldr tmp1, [src, #-32] /* 8 words to go. */ - str tmp1, [dst, #-32] - ldr tmp1, [src, #-28] - str tmp1, [dst, #-28] - - ldr tmp1, [src, #-24] /* 6 words to go. */ - str tmp1, [dst, #-24] - ldr tmp1, [src, #-20] - str tmp1, [dst, #-20] - - ldr tmp1, [src, #-16] /* 4 words to go. */ - str tmp1, [dst, #-16] - ldr tmp1, [src, #-12] - str tmp1, [dst, #-12] - - ldr tmp1, [src, #-8] /* 2 words to go. */ - str tmp1, [dst, #-8] - ldr tmp1, [src, #-4] - str tmp1, [dst, #-4] - #endif - - lsls count, count, #31 - ldrhcs tmp1, [src], #2 - ldrbne src, [src] /* Src is dead, use as a scratch. */ - strhcs tmp1, [dst], #2 - strbne src, [dst] - bx lr - - .Lcpy_not_short: - /* At least 64 bytes to copy, but don't know the alignment yet. */ - str tmp2, [sp, #-FRAME_SIZE]! - and tmp2, src, #7 - and tmp1, dst, #7 - cmp tmp1, tmp2 - bne .Lcpy_notaligned - - #ifdef USE_VFP - /* Magic dust alert! Force VFP on Cortex-A9. Experiments show - that the FP pipeline is much better at streaming loads and - stores. This is outside the critical loop. */ - vmov.f32 s0, s0 - #endif - - /* SRC and DST have the same mutual 64-bit alignment, but we may - still need to pre-copy some bytes to get to natural alignment. - We bring SRC and DST into full 64-bit alignment. */ - lsls tmp2, dst, #29 - beq 1f - rsbs tmp2, tmp2, #0 - sub count, count, tmp2, lsr #29 - ldrmi tmp1, [src], #4 - strmi tmp1, [dst], #4 - lsls tmp2, tmp2, #2 - ldrhcs tmp1, [src], #2 - ldrbne tmp2, [src], #1 - strhcs tmp1, [dst], #2 - strbne tmp2, [dst], #1 - - 1: - subs tmp2, count, #64 /* Use tmp2 for count. */ - blt .Ltail63aligned - - cmp tmp2, #512 - bge .Lcpy_body_long - - .Lcpy_body_medium: /* Count in tmp2. */ - #ifdef USE_VFP - 1: - vldr d0, [src, #0] - subs tmp2, tmp2, #64 - vldr d1, [src, #8] - vstr d0, [dst, #0] - vldr d0, [src, #16] - vstr d1, [dst, #8] - vldr d1, [src, #24] - vstr d0, [dst, #16] - vldr d0, [src, #32] - vstr d1, [dst, #24] - vldr d1, [src, #40] - vstr d0, [dst, #32] - vldr d0, [src, #48] - vstr d1, [dst, #40] - vldr d1, [src, #56] - vstr d0, [dst, #48] - add src, src, #64 - vstr d1, [dst, #56] - add dst, dst, #64 - bge 1b - tst tmp2, #0x3f - beq .Ldone - - .Ltail63aligned: /* Count in tmp2. */ - and tmp1, tmp2, #0x38 - add dst, dst, tmp1 - add src, src, tmp1 - rsb tmp1, tmp1, #(56 - PC_OFFSET + INSN_SIZE) - add pc, pc, tmp1 - - vldr d0, [src, #-56] /* 14 words to go. */ - vstr d0, [dst, #-56] - vldr d0, [src, #-48] /* 12 words to go. */ - vstr d0, [dst, #-48] - vldr d0, [src, #-40] /* 10 words to go. */ - vstr d0, [dst, #-40] - vldr d0, [src, #-32] /* 8 words to go. */ - vstr d0, [dst, #-32] - vldr d0, [src, #-24] /* 6 words to go. */ - vstr d0, [dst, #-24] - vldr d0, [src, #-16] /* 4 words to go. */ - vstr d0, [dst, #-16] - vldr d0, [src, #-8] /* 2 words to go. */ - vstr d0, [dst, #-8] - #else - sub src, src, #8 - sub dst, dst, #8 - 1: - ldrd A_l, A_h, [src, #8] - strd A_l, A_h, [dst, #8] - ldrd A_l, A_h, [src, #16] - strd A_l, A_h, [dst, #16] - ldrd A_l, A_h, [src, #24] - strd A_l, A_h, [dst, #24] - ldrd A_l, A_h, [src, #32] - strd A_l, A_h, [dst, #32] - ldrd A_l, A_h, [src, #40] - strd A_l, A_h, [dst, #40] - ldrd A_l, A_h, [src, #48] - strd A_l, A_h, [dst, #48] - ldrd A_l, A_h, [src, #56] - strd A_l, A_h, [dst, #56] - ldrd A_l, A_h, [src, #64]! - strd A_l, A_h, [dst, #64]! - subs tmp2, tmp2, #64 - bge 1b - tst tmp2, #0x3f - bne 1f - ldr tmp2,[sp], #FRAME_SIZE - bx lr - 1: - add src, src, #8 - add dst, dst, #8 - - .Ltail63aligned: /* Count in tmp2. */ - /* Copy up to 7 d-words of data. Similar to Ltail63unaligned, but - we know that the src and dest are 64-bit aligned so we can use - LDRD/STRD to improve efficiency. */ - /* TMP2 is now negative, but we don't care about that. The bottom - six bits still tell us how many bytes are left to copy. */ - - and tmp1, tmp2, #0x38 - add dst, dst, tmp1 - add src, src, tmp1 - rsb tmp1, tmp1, #(56 - PC_OFFSET + INSN_SIZE) - add pc, pc, tmp1 - ldrd A_l, A_h, [src, #-56] /* 14 words to go. */ - strd A_l, A_h, [dst, #-56] - ldrd A_l, A_h, [src, #-48] /* 12 words to go. */ - strd A_l, A_h, [dst, #-48] - ldrd A_l, A_h, [src, #-40] /* 10 words to go. */ - strd A_l, A_h, [dst, #-40] - ldrd A_l, A_h, [src, #-32] /* 8 words to go. */ - strd A_l, A_h, [dst, #-32] - ldrd A_l, A_h, [src, #-24] /* 6 words to go. */ - strd A_l, A_h, [dst, #-24] - ldrd A_l, A_h, [src, #-16] /* 4 words to go. */ - strd A_l, A_h, [dst, #-16] - ldrd A_l, A_h, [src, #-8] /* 2 words to go. */ - strd A_l, A_h, [dst, #-8] - - #endif - tst tmp2, #4 - ldrne tmp1, [src], #4 - strne tmp1, [dst], #4 - lsls tmp2, tmp2, #31 /* Count (tmp2) now dead. */ - ldrhcs tmp1, [src], #2 - ldrbne tmp2, [src] - strhcs tmp1, [dst], #2 - strbne tmp2, [dst] - - .Ldone: - ldr tmp2, [sp], #FRAME_SIZE - bx lr - - .Lcpy_body_long: /* Count in tmp2. */ - - /* Long copy. We know that there's at least (prefetch_lines * 64) - bytes to go. */ - #ifdef USE_VFP - /* Don't use PLD. Instead, read some data in advance of the current - copy position into a register. This should act like a PLD - operation but we won't have to repeat the transfer. */ - - vldr d3, [src, #0] - vldr d4, [src, #64] - vldr d5, [src, #128] - vldr d6, [src, #192] - vldr d7, [src, #256] - - vldr d0, [src, #8] - vldr d1, [src, #16] - vldr d2, [src, #24] - add src, src, #32 - - subs tmp2, tmp2, #prefetch_lines * 64 * 2 - blt 2f - 1: - cpy_line_vfp d3, 0 - cpy_line_vfp d4, 64 - cpy_line_vfp d5, 128 - add dst, dst, #3 * 64 - add src, src, #3 * 64 - cpy_line_vfp d6, 0 - cpy_line_vfp d7, 64 - add dst, dst, #2 * 64 - add src, src, #2 * 64 - subs tmp2, tmp2, #prefetch_lines * 64 - bge 1b - - 2: - cpy_tail_vfp d3, 0 - cpy_tail_vfp d4, 64 - cpy_tail_vfp d5, 128 - add src, src, #3 * 64 - add dst, dst, #3 * 64 - cpy_tail_vfp d6, 0 - vstr d7, [dst, #64] - vldr d7, [src, #64] - vstr d0, [dst, #64 + 8] - vldr d0, [src, #64 + 8] - vstr d1, [dst, #64 + 16] - vldr d1, [src, #64 + 16] - vstr d2, [dst, #64 + 24] - vldr d2, [src, #64 + 24] - vstr d7, [dst, #64 + 32] - add src, src, #96 - vstr d0, [dst, #64 + 40] - vstr d1, [dst, #64 + 48] - vstr d2, [dst, #64 + 56] - add dst, dst, #128 - add tmp2, tmp2, #prefetch_lines * 64 - b .Lcpy_body_medium - #else - /* Long copy. Use an SMS style loop to maximize the I/O - bandwidth of the core. We don't have enough spare registers - to synthesise prefetching, so use PLD operations. */ - /* Pre-bias src and dst. */ - sub src, src, #8 - sub dst, dst, #8 - pld [src, #8] - pld [src, #72] - subs tmp2, tmp2, #64 - pld [src, #136] - ldrd A_l, A_h, [src, #8] - strd B_l, B_h, [sp, #8] - ldrd B_l, B_h, [src, #16] - strd C_l, C_h, [sp, #16] - ldrd C_l, C_h, [src, #24] - strd D_l, D_h, [sp, #24] - pld [src, #200] - ldrd D_l, D_h, [src, #32]! - b 1f - .p2align 6 - 2: - pld [src, #232] - strd A_l, A_h, [dst, #40] - ldrd A_l, A_h, [src, #40] - strd B_l, B_h, [dst, #48] - ldrd B_l, B_h, [src, #48] - strd C_l, C_h, [dst, #56] - ldrd C_l, C_h, [src, #56] - strd D_l, D_h, [dst, #64]! - ldrd D_l, D_h, [src, #64]! - subs tmp2, tmp2, #64 - 1: - strd A_l, A_h, [dst, #8] - ldrd A_l, A_h, [src, #8] - strd B_l, B_h, [dst, #16] - ldrd B_l, B_h, [src, #16] - strd C_l, C_h, [dst, #24] - ldrd C_l, C_h, [src, #24] - strd D_l, D_h, [dst, #32] - ldrd D_l, D_h, [src, #32] - bcs 2b - /* Save the remaining bytes and restore the callee-saved regs. */ - strd A_l, A_h, [dst, #40] - add src, src, #40 - strd B_l, B_h, [dst, #48] - ldrd B_l, B_h, [sp, #8] - strd C_l, C_h, [dst, #56] - ldrd C_l, C_h, [sp, #16] - strd D_l, D_h, [dst, #64] - ldrd D_l, D_h, [sp, #24] - add dst, dst, #72 - tst tmp2, #0x3f - bne .Ltail63aligned - ldr tmp2, [sp], #FRAME_SIZE - bx lr - #endif - - .Lcpy_notaligned: - pld [src] - pld [src, #64] - /* There's at least 64 bytes to copy, but there is no mutual - alignment. */ - /* Bring DST to 64-bit alignment. */ - lsls tmp2, dst, #29 - pld [src, #(2 * 64)] - beq 1f - rsbs tmp2, tmp2, #0 - sub count, count, tmp2, lsr #29 - ldrmi tmp1, [src], #4 - strmi tmp1, [dst], #4 - lsls tmp2, tmp2, #2 - ldrbne tmp1, [src], #1 - ldrhcs tmp2, [src], #2 - strbne tmp1, [dst], #1 - strhcs tmp2, [dst], #2 - 1: - pld [src, #(3 * 64)] - subs count, count, #64 - ldrmi tmp2, [sp], #FRAME_SIZE - bmi .Ltail63unaligned - pld [src, #(4 * 64)] - - #ifdef USE_NEON - vld1.8 {d0-d3}, [src]! - vld1.8 {d4-d7}, [src]! - subs count, count, #64 - bmi 2f - 1: - pld [src, #(4 * 64)] - vst1.8 {d0-d3}, [ALIGN (dst, 64)]! - vld1.8 {d0-d3}, [src]! - vst1.8 {d4-d7}, [ALIGN (dst, 64)]! - vld1.8 {d4-d7}, [src]! - subs count, count, #64 - bpl 1b - 2: - vst1.8 {d0-d3}, [ALIGN (dst, 64)]! - vst1.8 {d4-d7}, [ALIGN (dst, 64)]! - ands count, count, #0x3f - #else - /* Use an SMS style loop to maximize the I/O bandwidth. */ - sub src, src, #4 - sub dst, dst, #8 - subs tmp2, count, #64 /* Use tmp2 for count. */ - ldr A_l, [src, #4] - ldr A_h, [src, #8] - strd B_l, B_h, [sp, #8] - ldr B_l, [src, #12] - ldr B_h, [src, #16] - strd C_l, C_h, [sp, #16] - ldr C_l, [src, #20] - ldr C_h, [src, #24] - strd D_l, D_h, [sp, #24] - ldr D_l, [src, #28] - ldr D_h, [src, #32]! - b 1f - .p2align 6 - 2: - pld [src, #(5 * 64) - (32 - 4)] - strd A_l, A_h, [dst, #40] - ldr A_l, [src, #36] - ldr A_h, [src, #40] - strd B_l, B_h, [dst, #48] - ldr B_l, [src, #44] - ldr B_h, [src, #48] - strd C_l, C_h, [dst, #56] - ldr C_l, [src, #52] - ldr C_h, [src, #56] - strd D_l, D_h, [dst, #64]! - ldr D_l, [src, #60] - ldr D_h, [src, #64]! - subs tmp2, tmp2, #64 - 1: - strd A_l, A_h, [dst, #8] - ldr A_l, [src, #4] - ldr A_h, [src, #8] - strd B_l, B_h, [dst, #16] - ldr B_l, [src, #12] - ldr B_h, [src, #16] - strd C_l, C_h, [dst, #24] - ldr C_l, [src, #20] - ldr C_h, [src, #24] - strd D_l, D_h, [dst, #32] - ldr D_l, [src, #28] - ldr D_h, [src, #32] - bcs 2b - - /* Save the remaining bytes and restore the callee-saved regs. */ - strd A_l, A_h, [dst, #40] - add src, src, #36 - strd B_l, B_h, [dst, #48] - ldrd B_l, B_h, [sp, #8] - strd C_l, C_h, [dst, #56] - ldrd C_l, C_h, [sp, #16] - strd D_l, D_h, [dst, #64] - ldrd D_l, D_h, [sp, #24] - add dst, dst, #72 - ands count, tmp2, #0x3f - #endif - ldr tmp2, [sp], #FRAME_SIZE - bne .Ltail63unaligned - bx lr - - END(memcpy) |