path: root/libc/sysdeps/powerpc/powerpc32
author     joseph <joseph@7b3dc134-2b1b-0410-93df-9e9f96275f8d>  2013-10-18 21:33:25 +0000
committer  joseph <joseph@7b3dc134-2b1b-0410-93df-9e9f96275f8d>  2013-10-18 21:33:25 +0000
commit     fe2ed5aaa408e1ab996a9fe1595a05634208a79c (patch)
tree       e1027fbc9d8a4a8c33f8149b2b42e8cde89c74f6 /libc/sysdeps/powerpc/powerpc32
parent     571c782b982d888565e7d06bfc2f3d47582fe829 (diff)
Merge changes between r23946 and r24305 from /fsf/trunk.
git-svn-id: svn://svn.eglibc.org/trunk@24306 7b3dc134-2b1b-0410-93df-9e9f96275f8d
Diffstat (limited to 'libc/sysdeps/powerpc/powerpc32')
-rw-r--r--  libc/sysdeps/powerpc/powerpc32/405/memcmp.S | 128
-rw-r--r--  libc/sysdeps/powerpc/powerpc32/405/memcpy.S | 130
-rw-r--r--  libc/sysdeps/powerpc/powerpc32/405/memset.S | 152
-rw-r--r--  libc/sysdeps/powerpc/powerpc32/405/strcmp.S | 134
-rw-r--r--  libc/sysdeps/powerpc/powerpc32/405/strcpy.S | 107
-rw-r--r--  libc/sysdeps/powerpc/powerpc32/405/strlen.S | 75
-rw-r--r--  libc/sysdeps/powerpc/powerpc32/405/strncmp.S | 128
-rw-r--r--  libc/sysdeps/powerpc/powerpc32/440/Implies | 2
-rw-r--r--  libc/sysdeps/powerpc/powerpc32/464/Implies | 2
-rw-r--r--  libc/sysdeps/powerpc/powerpc32/476/Implies | 2
-rw-r--r--  libc/sysdeps/powerpc/powerpc32/476/memset.S | 152
-rw-r--r--  libc/sysdeps/powerpc/powerpc32/Makefile | 10
-rw-r--r--  libc/sysdeps/powerpc/powerpc32/__longjmp-common.S | 42
-rw-r--r--  libc/sysdeps/powerpc/powerpc32/bsd-_setjmp.S | 4
-rw-r--r--  libc/sysdeps/powerpc/powerpc32/bsd-setjmp.S | 4
-rw-r--r--  libc/sysdeps/powerpc/powerpc32/dl-machine.c | 14
-rw-r--r--  libc/sysdeps/powerpc/powerpc32/e500/nofpu/Makefile | 9
-rw-r--r--  libc/sysdeps/powerpc/powerpc32/e500/nofpu/fclrexcpt.c | 53
-rw-r--r--  libc/sysdeps/powerpc/powerpc32/e500/nofpu/fe_note_change.c | 39
-rw-r--r--  libc/sysdeps/powerpc/powerpc32/e500/nofpu/fedisblxcpt.c | 54
-rw-r--r--  libc/sysdeps/powerpc/powerpc32/e500/nofpu/feenablxcpt.c | 54
-rw-r--r--  libc/sysdeps/powerpc/powerpc32/e500/nofpu/fegetenv.c | 47
-rw-r--r--  libc/sysdeps/powerpc/powerpc32/e500/nofpu/fegetexcept.c | 36
-rw-r--r--  libc/sysdeps/powerpc/powerpc32/e500/nofpu/fegetround.c | 29
-rw-r--r--  libc/sysdeps/powerpc/powerpc32/e500/nofpu/feholdexcpt.c | 57
-rw-r--r--  libc/sysdeps/powerpc/powerpc32/e500/nofpu/fenv_const.c | 41
-rw-r--r--  libc/sysdeps/powerpc/powerpc32/e500/nofpu/fenv_libc.h | 96
-rw-r--r--  libc/sysdeps/powerpc/powerpc32/e500/nofpu/fesetenv.c | 49
-rw-r--r--  libc/sysdeps/powerpc/powerpc32/e500/nofpu/fesetround.c | 35
-rw-r--r--  libc/sysdeps/powerpc/powerpc32/e500/nofpu/feupdateenv.c | 47
-rw-r--r--  libc/sysdeps/powerpc/powerpc32/e500/nofpu/fexcepts_from_prctl.c | 42
-rw-r--r--  libc/sysdeps/powerpc/powerpc32/e500/nofpu/fexcepts_from_spe.c | 41
-rw-r--r--  libc/sysdeps/powerpc/powerpc32/e500/nofpu/fexcepts_to_prctl.c | 42
-rw-r--r--  libc/sysdeps/powerpc/powerpc32/e500/nofpu/fexcepts_to_spe.c | 41
-rw-r--r--  libc/sysdeps/powerpc/powerpc32/e500/nofpu/fgetexcptflg.c | 41
-rw-r--r--  libc/sysdeps/powerpc/powerpc32/e500/nofpu/fraiseexcept-soft.c | 28
-rw-r--r--  libc/sysdeps/powerpc/powerpc32/e500/nofpu/fraiseexcpt.c | 40
-rw-r--r--  libc/sysdeps/powerpc/powerpc32/e500/nofpu/fsetexcptflg.c | 55
-rw-r--r--  libc/sysdeps/powerpc/powerpc32/e500/nofpu/ftestexcept.c | 31
-rw-r--r--  libc/sysdeps/powerpc/powerpc32/e500/nofpu/get-rounding-mode.h | 4
-rw-r--r--  libc/sysdeps/powerpc/powerpc32/e500/nofpu/s_fabsf.S | 27
-rw-r--r--  libc/sysdeps/powerpc/powerpc32/e500/nofpu/spe-raise.c | 53
-rw-r--r--  libc/sysdeps/powerpc/powerpc32/fpu/__longjmp-common.S | 8
-rw-r--r--  libc/sysdeps/powerpc/powerpc32/fpu/__longjmp.S | 4
-rw-r--r--  libc/sysdeps/powerpc/powerpc32/fpu/s_copysign.S | 2
-rw-r--r--  libc/sysdeps/powerpc/powerpc32/fpu/s_copysignl.S | 2
-rw-r--r--  libc/sysdeps/powerpc/powerpc32/fpu/s_lrint.S | 4
-rw-r--r--  libc/sysdeps/powerpc/powerpc32/fpu/s_lround.S | 2
-rw-r--r--  libc/sysdeps/powerpc/powerpc32/fpu/s_roundf.S | 3
-rw-r--r--  libc/sysdeps/powerpc/powerpc32/fpu/setjmp-common.S | 73
-rw-r--r--  libc/sysdeps/powerpc/powerpc32/fpu/setjmp.S | 4
-rw-r--r--  libc/sysdeps/powerpc/powerpc32/mcount.c | 2
-rw-r--r--  libc/sysdeps/powerpc/powerpc32/power4/fpu/s_llrint.S | 4
-rw-r--r--  libc/sysdeps/powerpc/powerpc32/power4/fpu/s_llrintf.S | 4
-rw-r--r--  libc/sysdeps/powerpc/powerpc32/power4/fpu/s_llround.S | 20
-rw-r--r--  libc/sysdeps/powerpc/powerpc32/power4/hp-timing.h | 21
-rw-r--r--  libc/sysdeps/powerpc/powerpc32/power4/memcmp.S | 1064
-rw-r--r--  libc/sysdeps/powerpc/powerpc32/power4/memcpy.S | 58
-rw-r--r--  libc/sysdeps/powerpc/powerpc32/power4/memset.S | 4
-rw-r--r--  libc/sysdeps/powerpc/powerpc32/power4/strncmp.S | 56
-rw-r--r--  libc/sysdeps/powerpc/powerpc32/power5+/fpu/s_llround.S | 4
-rw-r--r--  libc/sysdeps/powerpc/powerpc32/power5+/fpu/s_lround.S | 2
-rw-r--r--  libc/sysdeps/powerpc/powerpc32/power5/fpu/s_isnan.S | 4
-rw-r--r--  libc/sysdeps/powerpc/powerpc32/power6/fpu/s_isnan.S | 4
-rw-r--r--  libc/sysdeps/powerpc/powerpc32/power6/fpu/s_llrint.S | 4
-rw-r--r--  libc/sysdeps/powerpc/powerpc32/power6/fpu/s_llrintf.S | 4
-rw-r--r--  libc/sysdeps/powerpc/powerpc32/power6/fpu/s_llround.S | 4
-rw-r--r--  libc/sysdeps/powerpc/powerpc32/power6/memcpy.S | 81
-rw-r--r--  libc/sysdeps/powerpc/powerpc32/power6/memset.S | 4
-rw-r--r--  libc/sysdeps/powerpc/powerpc32/power7/fpu/s_finite.S | 5
-rw-r--r--  libc/sysdeps/powerpc/powerpc32/power7/fpu/s_isinf.S | 7
-rw-r--r--  libc/sysdeps/powerpc/powerpc32/power7/fpu/s_isnan.S | 4
-rw-r--r--  libc/sysdeps/powerpc/powerpc32/power7/fpu/s_logbl.c | 8
-rw-r--r--  libc/sysdeps/powerpc/powerpc32/power7/memchr.S | 185
-rw-r--r--  libc/sysdeps/powerpc/powerpc32/power7/memcmp.S | 1626
-rw-r--r--  libc/sysdeps/powerpc/powerpc32/power7/memcpy.S | 24
-rw-r--r--  libc/sysdeps/powerpc/powerpc32/power7/mempcpy.S | 28
-rw-r--r--  libc/sysdeps/powerpc/powerpc32/power7/memrchr.S | 187
-rw-r--r--  libc/sysdeps/powerpc/powerpc32/power7/memset.S | 4
-rw-r--r--  libc/sysdeps/powerpc/powerpc32/power7/rawmemchr.S | 17
-rw-r--r--  libc/sysdeps/powerpc/powerpc32/power7/strchr.S | 51
-rw-r--r--  libc/sysdeps/powerpc/powerpc32/power7/strchrnul.S | 27
-rw-r--r--  libc/sysdeps/powerpc/powerpc32/power7/strlen.S | 17
-rw-r--r--  libc/sysdeps/powerpc/powerpc32/power7/strncmp.S | 55
-rw-r--r--  libc/sysdeps/powerpc/powerpc32/power7/strnlen.S | 106
-rw-r--r--  libc/sysdeps/powerpc/powerpc32/setjmp-common.S | 41
-rw-r--r--  libc/sysdeps/powerpc/powerpc32/setjmp.S | 4
-rw-r--r--  libc/sysdeps/powerpc/powerpc32/stackguard-macros.h | 10
-rw-r--r--  libc/sysdeps/powerpc/powerpc32/stpcpy.S | 18
-rw-r--r--  libc/sysdeps/powerpc/powerpc32/strchr.S | 71
-rw-r--r--  libc/sysdeps/powerpc/powerpc32/strcmp.S | 42
-rw-r--r--  libc/sysdeps/powerpc/powerpc32/strcpy.S | 18
-rw-r--r--  libc/sysdeps/powerpc/powerpc32/strlen.S | 69
-rw-r--r--  libc/sysdeps/powerpc/powerpc32/strncmp.S | 56
94 files changed, 4816 insertions, 1487 deletions
diff --git a/libc/sysdeps/powerpc/powerpc32/405/memcmp.S b/libc/sysdeps/powerpc/powerpc32/405/memcmp.S
new file mode 100644
index 000000000..2849461cd
--- /dev/null
+++ b/libc/sysdeps/powerpc/powerpc32/405/memcmp.S
@@ -0,0 +1,128 @@
+/* Optimized memcmp implementation for PowerPC476.
+ Copyright (C) 2010-2013 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library. If not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <sysdep.h>
+
+/* memcmp
+
+ r3:source1 address, return equality
+ r4:source2 address
+ r5:byte count
+
+ Compare two words at a time from src1 and src2. If they differ, jump to
+ the end and return src1 > src2 or src1 < src2.
+ When the count reaches zero, compare the remaining bytes, then jump to
+ the end and return src1 > src2, src1 < src2 or src1 = src2.
+ If the words are equal and the count is not exhausted, repeat. */
+
+EALIGN (memcmp, 5, 0)
+ srwi. r6,r5,5
+ beq L(preword2_count_loop)
+ mtctr r6
+ clrlwi r5,r5,27
+
+L(word8_compare_loop):
+ lwz r10,0(r3)
+ lwz r6,4(r3)
+ lwz r8,0(r4)
+ lwz r9,4(r4)
+ cmplw cr5,r8,r10
+ cmplw cr1,r9,r6
+ bne cr5,L(st2)
+ bne cr1,L(st1)
+ lwz r10,8(r3)
+ lwz r6,12(r3)
+ lwz r8,8(r4)
+ lwz r9,12(r4)
+ cmplw cr5,r8,r10
+ cmplw cr1,r9,r6
+ bne cr5,L(st2)
+ bne cr1,L(st1)
+ lwz r10,16(r3)
+ lwz r6,20(r3)
+ lwz r8,16(r4)
+ lwz r9,20(r4)
+ cmplw cr5,r8,r10
+ cmplw cr1,r9,r6
+ bne cr5,L(st2)
+ bne cr1,L(st1)
+ lwz r10,24(r3)
+ lwz r6,28(r3)
+ addi r3,r3,0x20
+ lwz r8,24(r4)
+ lwz r9,28(r4)
+ addi r4,r4,0x20
+ cmplw cr5,r8,r10
+ cmplw cr1,r9,r6
+ bne cr5,L(st2)
+ bne cr1,L(st1)
+ bdnz L(word8_compare_loop)
+
+L(preword2_count_loop):
+ srwi. r6,r5,3
+ beq L(prebyte_count_loop)
+ mtctr r6
+ clrlwi r5,r5,29
+
+L(word2_count_loop):
+ lwz r10,0(r3)
+ lwz r6,4(r3)
+ addi r3,r3,0x08
+ lwz r8,0(r4)
+ lwz r9,4(r4)
+ addi r4,r4,0x08
+ cmplw cr5,r8,r10
+ cmplw cr1,r9,r6
+ bne cr5,L(st2)
+ bne cr1,L(st1)
+ bdnz L(word2_count_loop)
+
+L(prebyte_count_loop):
+ addi r5,r5,1
+ mtctr r5
+ bdz L(end_memcmp)
+
+L(byte_count_loop):
+ lbz r6,0(r3)
+ addi r3,r3,0x01
+ lbz r8,0(r4)
+ addi r4,r4,0x01
+ cmplw cr5,r8,r6
+ bne cr5,L(st2)
+ bdnz L(byte_count_loop)
+
+L(end_memcmp):
+ addi r3,r0,0
+ blr
+
+L(l_r):
+ addi r3,r0,1
+ blr
+
+L(st1):
+ blt cr1,L(l_r)
+ addi r3,r0,-1
+ blr
+
+L(st2):
+ blt cr5,L(l_r)
+ addi r3,r0,-1
+ blr
+END (memcmp)
+libc_hidden_builtin_def (memcmp)
+weak_alias (memcmp,bcmp)
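
The word-at-a-time strategy above maps onto a small C model. The sketch below is only an illustration of the same idea and is not part of the patch; all names are invented here. It compares aligned 4-byte words while at least a word remains, then falls back to bytes both for the tail and for resolving the first differing word, which keeps the model endian-safe.

    #include <stddef.h>
    #include <stdint.h>
    #include <string.h>

    /* Reference model: compare aligned words, resolve differences and the
       tail byte by byte, as the 405 assembly does in larger unrolled blocks. */
    static int
    memcmp_ref (const void *s1, const void *s2, size_t n)
    {
      const unsigned char *a = s1, *b = s2;

      while (n >= 4 && (uintptr_t) a % 4 == 0 && (uintptr_t) b % 4 == 0)
        {
          uint32_t wa, wb;
          memcpy (&wa, a, 4);
          memcpy (&wb, b, 4);
          if (wa != wb)
            break;                      /* let the byte loop find the difference */
          a += 4; b += 4; n -= 4;
        }

      for (; n > 0; n--, a++, b++)
        if (*a != *b)
          return *a < *b ? -1 : 1;
      return 0;
    }
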
diff --git a/libc/sysdeps/powerpc/powerpc32/405/memcpy.S b/libc/sysdeps/powerpc/powerpc32/405/memcpy.S
new file mode 100644
index 000000000..b01d53920
--- /dev/null
+++ b/libc/sysdeps/powerpc/powerpc32/405/memcpy.S
@@ -0,0 +1,130 @@
+/* Optimized memcpy implementation for PowerPC476.
+ Copyright (C) 2010-2013 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library. If not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <sysdep.h>
+
+/* memcpy
+
+ r0:return address
+ r3:destination address
+ r4:source address
+ r5:byte count
+
+ Save return address in r0.
+ If the destination and source are unaligned and the copy count is greater than 256,
+ then copy 0-3 bytes to make the destination aligned.
+ If there are 32 or more bytes to copy, use a 32-byte copy loop.
+ Finally copy the 0-31 extra bytes. */
+
+EALIGN (memcpy, 5, 0)
+/* Check if bytes to copy are greater than 256 and if
+ source and destination are unaligned */
+ cmpwi r5,0x0100
+ addi r0,r3,0
+ ble L(string_count_loop)
+ neg r6,r3
+ clrlwi. r6,r6,30
+ beq L(string_count_loop)
+ neg r6,r4
+ clrlwi. r6,r6,30
+ beq L(string_count_loop)
+ mtctr r6
+ subf r5,r6,r5
+
+L(unaligned_bytecopy_loop): /* Align destination by copying 0-3 bytes */
+ lbz r8,0x0(r4)
+ addi r4,r4,1
+ stb r8,0x0(r3)
+ addi r3,r3,1
+ bdnz L(unaligned_bytecopy_loop)
+ srwi. r7,r5,5
+ beq L(preword2_count_loop)
+ mtctr r7
+
+L(word8_count_loop_no_dcbt): /* Copy 32 bytes at a time */
+ lwz r6,0(r4)
+ lwz r7,4(r4)
+ lwz r8,8(r4)
+ lwz r9,12(r4)
+ subi r5,r5,0x20
+ stw r6,0(r3)
+ stw r7,4(r3)
+ stw r8,8(r3)
+ stw r9,12(r3)
+ lwz r6,16(r4)
+ lwz r7,20(r4)
+ lwz r8,24(r4)
+ lwz r9,28(r4)
+ addi r4,r4,0x20
+ stw r6,16(r3)
+ stw r7,20(r3)
+ stw r8,24(r3)
+ stw r9,28(r3)
+ addi r3,r3,0x20
+ bdnz L(word8_count_loop_no_dcbt)
+
+L(preword2_count_loop): /* Copy remaining 0-31 bytes */
+ clrlwi. r12,r5,27
+ beq L(end_memcpy)
+ mtxer r12
+ lswx r5,0,r4
+ stswx r5,0,r3
+ mr r3,r0
+ blr
+
+L(string_count_loop): /* Copy odd 0-31 bytes */
+ clrlwi. r12,r5,28
+ add r3,r3,r5
+ add r4,r4,r5
+ beq L(pre_string_copy)
+ mtxer r12
+ subf r4,r12,r4
+ subf r3,r12,r3
+ lswx r6,0,r4
+ stswx r6,0,r3
+
+L(pre_string_copy): /* Check how many 32 byte chunks to copy */
+ srwi. r7,r5,4
+ beq L(end_memcpy)
+ mtctr r7
+
+L(word4_count_loop_no_dcbt): /* Copy 32 bytes at a time */
+ lwz r6,-4(r4)
+ lwz r7,-8(r4)
+ lwz r8,-12(r4)
+ lwzu r9,-16(r4)
+ stw r6,-4(r3)
+ stw r7,-8(r3)
+ stw r8,-12(r3)
+ stwu r9,-16(r3)
+ bdz L(end_memcpy)
+ lwz r6,-4(r4)
+ lwz r7,-8(r4)
+ lwz r8,-12(r4)
+ lwzu r9,-16(r4)
+ stw r6,-4(r3)
+ stw r7,-8(r3)
+ stw r8,-12(r3)
+ stwu r9,-16(r3)
+ bdnz L(word4_count_loop_no_dcbt)
+
+L(end_memcpy):
+ mr r3,r0
+ blr
+END (memcpy)
+libc_hidden_builtin_def (memcpy)
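
The copy strategy in memcpy.S follows the usual align-then-block pattern. A minimal C sketch of that pattern, with invented names and no claim to match the assembly's exact thresholds or register usage:

    #include <stddef.h>
    #include <stdint.h>
    #include <string.h>

    /* Sketch: byte-copy until the destination is word aligned, move 32-byte
       blocks, then copy the 0-31 byte tail (the assembly uses lswx/stswx for
       the tail and only aligns when the copy is larger than 256 bytes).  */
    static void *
    memcpy_ref (void *dst, const void *src, size_t n)
    {
      unsigned char *d = dst;
      const unsigned char *s = src;

      while (n > 0 && (uintptr_t) d % 4 != 0)
        {
          *d++ = *s++;
          n--;
        }
      while (n >= 32)
        {
          memcpy (d, s, 32);            /* one unrolled 8-word iteration */
          d += 32; s += 32; n -= 32;
        }
      memcpy (d, s, n);                 /* trailing bytes */
      return dst;
    }
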
diff --git a/libc/sysdeps/powerpc/powerpc32/405/memset.S b/libc/sysdeps/powerpc/powerpc32/405/memset.S
new file mode 100644
index 000000000..b73dba887
--- /dev/null
+++ b/libc/sysdeps/powerpc/powerpc32/405/memset.S
@@ -0,0 +1,152 @@
+/* Optimized memset for PowerPC405,440,464 (32-byte cacheline).
+ Copyright (C) 2012-2013 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library. If not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <sysdep.h>
+
+/* memset
+
+ r3:destination address and return address
+ r4:source integer to copy
+ r5:byte count
+ r11:source integer copied into all 32 bits of reg
+ r12:temp return address
+
+ Save return address in r12
+ If the destination is unaligned and the count is greater than 255 bytes,
+ set 0-3 bytes to make the destination aligned.
+ If the count is greater than 255 bytes and memory is being set to zero,
+ use dcbz to set memory where we can;
+ otherwise do the following:
+ If there are 16 or more words to set, use the 16-word set loop.
+ Finally set the 0-15 extra bytes with a string store. */
+
+EALIGN (memset, 5, 0)
+ rlwinm r11,r4,0,24,31
+ rlwimi r11,r4,8,16,23
+ rlwimi r11,r11,16,0,15
+ addi r12,r3,0
+ cmpwi r5,0x00FF
+ ble L(preword8_count_loop)
+ cmpwi r4,0x00
+ beq L(use_dcbz)
+ neg r6,r3
+ clrlwi. r6,r6,30
+ beq L(preword8_count_loop)
+ addi r8,0,1
+ mtctr r6
+ subi r3,r3,1
+
+L(unaligned_bytecopy_loop):
+ stbu r11,0x1(r3)
+ subf. r5,r8,r5
+ beq L(end_memset)
+ bdnz L(unaligned_bytecopy_loop)
+ addi r3,r3,1
+
+L(preword8_count_loop):
+ srwi. r6,r5,4
+ beq L(preword2_count_loop)
+ mtctr r6
+ addi r3,r3,-4
+ mr r8,r11
+ mr r9,r11
+ mr r10,r11
+
+L(word8_count_loop_no_dcbt):
+ stwu r8,4(r3)
+ stwu r9,4(r3)
+ subi r5,r5,0x10
+ stwu r10,4(r3)
+ stwu r11,4(r3)
+ bdnz L(word8_count_loop_no_dcbt)
+ addi r3,r3,4
+
+L(preword2_count_loop):
+ clrlwi. r7,r5,28
+ beq L(end_memset)
+ mr r8,r11
+ mr r9,r11
+ mr r10,r11
+ mtxer r7
+ stswx r8,0,r3
+
+L(end_memset):
+ addi r3,r12,0
+ blr
+
+L(use_dcbz):
+ neg r6,r3
+ clrlwi. r7,r6,28
+ beq L(skip_string_loop)
+ mr r8,r11
+ mr r9,r11
+ mr r10,r11
+ subf r5,r7,r5
+ mtxer r7
+ stswx r8,0,r3
+ add r3,r3,r7
+
+L(skip_string_loop):
+ clrlwi r8,r6,27
+ srwi. r8,r8,4
+ beq L(dcbz_pre_loop)
+ mtctr r8
+
+L(word_loop):
+ stw r11,0(r3)
+ subi r5,r5,0x10
+ stw r11,4(r3)
+ stw r11,8(r3)
+ stw r11,12(r3)
+ addi r3,r3,0x10
+ bdnz L(word_loop)
+
+L(dcbz_pre_loop):
+ srwi r6,r5,5
+ mtctr r6
+ addi r7,0,0
+
+L(dcbz_loop):
+ dcbz r3,r7
+ addi r3,r3,0x20
+ subi r5,r5,0x20
+ bdnz L(dcbz_loop)
+ srwi. r6,r5,4
+ beq L(postword2_count_loop)
+ mtctr r6
+
+L(postword8_count_loop):
+ stw r11,0(r3)
+ subi r5,r5,0x10
+ stw r11,4(r3)
+ stw r11,8(r3)
+ stw r11,12(r3)
+ addi r3,r3,0x10
+ bdnz L(postword8_count_loop)
+
+L(postword2_count_loop):
+ clrlwi. r7,r5,28
+ beq L(end_memset)
+ mr r8,r11
+ mr r9,r11
+ mr r10,r11
+ mtxer r7
+ stswx r8,0,r3
+ b L(end_memset)
+END (memset)
+libc_hidden_builtin_def (memset)
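
The first three instructions of memset.S replicate the fill byte into every byte of a register; the rest of the routine stores that word, or clears whole 32-byte cache lines with dcbz when the fill value is zero. A hedged C sketch of the word-splat approach (invented names, no dcbz equivalent):

    #include <stddef.h>
    #include <stdint.h>
    #include <string.h>

    /* Sketch: splat the fill byte into a 32-bit word (the rlwinm/rlwimi pair
       above does this), align the destination, store whole words, then finish
       the remaining bytes.  */
    static void *
    memset_ref (void *dst, int c, size_t n)
    {
      unsigned char *d = dst;
      uint32_t word = (unsigned char) c;
      word |= word << 8;
      word |= word << 16;               /* all four bytes now hold C */

      while (n > 0 && (uintptr_t) d % 4 != 0)
        {
          *d++ = (unsigned char) c;
          n--;
        }
      while (n >= 4)
        {
          memcpy (d, &word, 4);
          d += 4; n -= 4;
        }
      while (n-- > 0)
        *d++ = (unsigned char) c;
      return dst;
    }
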
diff --git a/libc/sysdeps/powerpc/powerpc32/405/strcmp.S b/libc/sysdeps/powerpc/powerpc32/405/strcmp.S
new file mode 100644
index 000000000..c0b21907b
--- /dev/null
+++ b/libc/sysdeps/powerpc/powerpc32/405/strcmp.S
@@ -0,0 +1,134 @@
+/* Optimized strcmp implementation for PowerPC476.
+ Copyright (C) 2010-2013 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library. If not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <sysdep.h>
+
+/* strcmp
+
+ Register Use
+ r0:temp return equality
+ r3:source1 address, return equality
+ r4:source2 address
+
+ Implementation description
+ Compare two words at a time from src1 and src2. If they differ, jump to
+ the end and return src1 > src2 or src1 < src2.
+ If a null byte is found, compare the bytes up to the null, then jump to
+ the end and return src1 > src2, src1 < src2 or src1 = src2.
+ If src1 = src2 and no null is found, repeat. */
+
+EALIGN (strcmp,5,0)
+ neg r7,r3
+ clrlwi r7,r7,20
+ neg r8,r4
+ clrlwi r8,r8,20
+ srwi. r7,r7,5
+ beq L(byte_loop)
+ srwi. r8,r8,5
+ beq L(byte_loop)
+ cmplw r7,r8
+ mtctr r7
+ ble L(big_loop)
+ mtctr r8
+
+L(big_loop):
+ lwz r5,0(r3)
+ lwz r6,4(r3)
+ lwz r8,0(r4)
+ lwz r9,4(r4)
+ dlmzb. r12,r5,r6
+ bne L(end_check)
+ cmplw r5,r8
+ bne L(st1)
+ cmplw r6,r9
+ bne L(st1)
+ lwz r5,8(r3)
+ lwz r6,12(r3)
+ lwz r8,8(r4)
+ lwz r9,12(r4)
+ dlmzb. r12,r5,r6
+ bne L(end_check)
+ cmplw r5,r8
+ bne L(st1)
+ cmplw r6,r9
+ bne L(st1)
+ lwz r5,16(r3)
+ lwz r6,20(r3)
+ lwz r8,16(r4)
+ lwz r9,20(r4)
+ dlmzb. r12,r5,r6
+ bne L(end_check)
+ cmplw r5,r8
+ bne L(st1)
+ cmplw r6,r9
+ bne L(st1)
+ lwz r5,24(r3)
+ lwz r6,28(r3)
+ addi r3,r3,0x20
+ lwz r8,24(r4)
+ lwz r9,28(r4)
+ addi r4,r4,0x20
+ dlmzb. r12,r5,r6
+ bne L(end_check)
+ cmplw r5,r8
+ bne L(st1)
+ cmplw r6,r9
+ bne L(st1)
+ bdnz L(big_loop)
+ b L(byte_loop)
+
+L(end_check):
+ subfic r12,r12,4
+ blt L(end_check2)
+ rlwinm r12,r12,3,0,31
+ srw r5,r5,r12
+ srw r8,r8,r12
+ cmplw r5,r8
+ bne L(st1)
+ b L(end_strcmp)
+
+L(end_check2):
+ addi r12,r12,4
+ cmplw r5,r8
+ rlwinm r12,r12,3,0,31
+ bne L(st1)
+ srw r6,r6,r12
+ srw r9,r9,r12
+ cmplw r6,r9
+ bne L(st1)
+
+L(end_strcmp):
+ addi r3,r0,0
+ blr
+
+L(st1):
+ mfcr r3
+ blr
+
+L(byte_loop):
+ lbz r5,0(r3)
+ addi r3,r3,1
+ lbz r6,0(r4)
+ addi r4,r4,1
+ cmplw r5,r6
+ bne L(st1)
+ cmpwi r5,0
+ beq L(end_strcmp)
+ b L(byte_loop)
+END (strcmp)
+libc_hidden_builtin_def (strcmp)
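
The dlmzb instruction used throughout these 405/476 string routines scans the eight bytes held in two registers for a zero byte. The sketch below is a simplified model inferred from how the result is used here (strlen.S below adds the result and finally subtracts one, so the count appears to include the terminating zero byte); consult the PowerPC 405 manual for the instruction's exact definition, including its CR0 side effects.

    #include <stdint.h>

    /* Hypothetical model: HI and LO hold 8 consecutive string bytes in
       big-endian order; return how many bytes the string occupies in them,
       counting the zero byte itself, or 8 when no zero byte is present.  */
    static int
    dlmzb_model (uint32_t hi, uint32_t lo)
    {
      uint64_t v = ((uint64_t) hi << 32) | lo;
      for (int i = 0; i < 8; i++)
        if (((v >> (56 - 8 * i)) & 0xff) == 0)
          return i + 1;                 /* count includes the zero byte */
      return 8;
    }
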
diff --git a/libc/sysdeps/powerpc/powerpc32/405/strcpy.S b/libc/sysdeps/powerpc/powerpc32/405/strcpy.S
new file mode 100644
index 000000000..d7c84569d
--- /dev/null
+++ b/libc/sysdeps/powerpc/powerpc32/405/strcpy.S
@@ -0,0 +1,107 @@
+/* Optimized strcpy implementation for PowerPC476.
+ Copyright (C) 2010-2013 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library. If not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <sysdep.h>
+
+/* strcpy
+
+ Register Use
+ r3:destination and return address
+ r4:source address
+ r10:temp destination address
+
+ Implementation description
+ Loop by checking 2 words at a time with dlmzb, testing whether there is a
+ null in the 2 words. If there is a null, jump to the end checks to determine
+ where in the last 8 bytes it lies, and copy the appropriate bytes of the
+ last 8 according to the null position. */
+
+EALIGN (strcpy, 5, 0)
+ neg r7,r4
+ subi r4,r4,1
+ clrlwi. r8,r7,29
+ subi r10,r3,1
+ beq L(pre_word8_loop)
+ mtctr r8
+
+L(loop):
+ lbzu r5,0x01(r4)
+ cmpi cr5,r5,0x0
+ stbu r5,0x01(r10)
+ beq cr5,L(end_strcpy)
+ bdnz L(loop)
+
+L(pre_word8_loop):
+ subi r4,r4,3
+ subi r10,r10,3
+
+L(word8_loop):
+ lwzu r5,0x04(r4)
+ lwzu r6,0x04(r4)
+ dlmzb. r11,r5,r6
+ bne L(byte_copy)
+ stwu r5,0x04(r10)
+ stwu r6,0x04(r10)
+ lwzu r5,0x04(r4)
+ lwzu r6,0x04(r4)
+ dlmzb. r11,r5,r6
+ bne L(byte_copy)
+ stwu r5,0x04(r10)
+ stwu r6,0x04(r10)
+ lwzu r5,0x04(r4)
+ lwzu r6,0x04(r4)
+ dlmzb. r11,r5,r6
+ bne L(byte_copy)
+ stwu r5,0x04(r10)
+ stwu r6,0x04(r10)
+ lwzu r5,0x04(r4)
+ lwzu r6,0x04(r4)
+ dlmzb. r11,r5,r6
+ bne L(byte_copy)
+ stwu r5,0x04(r10)
+ stwu r6,0x04(r10)
+ b L(word8_loop)
+
+L(last_bytes_copy):
+ stwu r5,0x04(r10)
+ subi r11,r11,4
+ mtctr r11
+ addi r10,r10,3
+ subi r4,r4,1
+
+L(last_bytes_copy_loop):
+ lbzu r5,0x01(r4)
+ stbu r5,0x01(r10)
+ bdnz L(last_bytes_copy_loop)
+ blr
+
+L(byte_copy):
+ blt L(last_bytes_copy)
+ mtctr r11
+ addi r10,r10,3
+ subi r4,r4,5
+
+L(last_bytes_copy_loop2):
+ lbzu r5,0x01(r4)
+ stbu r5,0x01(r10)
+ bdnz L(last_bytes_copy_loop2)
+
+L(end_strcpy):
+ blr
+END (strcpy)
+libc_hidden_builtin_def (strcpy)
diff --git a/libc/sysdeps/powerpc/powerpc32/405/strlen.S b/libc/sysdeps/powerpc/powerpc32/405/strlen.S
new file mode 100644
index 000000000..77d22ea67
--- /dev/null
+++ b/libc/sysdeps/powerpc/powerpc32/405/strlen.S
@@ -0,0 +1,75 @@
+/* Optimized strlen implementation for PowerPC476.
+ Copyright (C) 2010-2013 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library. If not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <sysdep.h>
+
+/* strlen
+
+ Register Use
+ r3:source address and return length of string
+ r4:byte counter
+
+ Implementation description
+ Load 2 words at a time and count the bytes. When the null is found, subtract
+ one from the count and return it; the subtraction is needed because the
+ null character itself must not be counted. */
+
+EALIGN (strlen,5,0)
+ neg r7,r3
+ clrlwi. r8,r7,29
+ addi r4,0,0
+ beq L(byte_count_loop)
+ mtctr r8
+
+L(loop):
+ lbz r5,0(r3)
+ cmpi cr5,r5,0x0
+ addi r3,r3,0x1
+ addi r4,r4,0x1
+ beq cr5,L(end_strlen)
+ bdnz L(loop)
+
+L(byte_count_loop):
+ lwz r5,0(r3)
+ lwz r6,4(r3)
+ dlmzb. r12,r5,r6
+ add r4,r4,r12
+ bne L(end_strlen)
+ lwz r5,8(r3)
+ lwz r6,12(r3)
+ dlmzb. r12,r5,r6
+ add r4,r4,r12
+ bne L(end_strlen)
+ lwz r5,16(r3)
+ lwz r6,20(r3)
+ dlmzb. r12,r5,r6
+ add r4,r4,r12
+ bne L(end_strlen)
+ lwz r5,24(r3)
+ lwz r6,28(r3)
+ addi r3,r3,0x20
+ dlmzb. r12,r5,r6
+ add r4,r4,r12
+ bne L(end_strlen)
+ b L(byte_count_loop)
+
+L(end_strlen):
+ addi r3,r4,-1
+ blr
+END (strlen)
+libc_hidden_builtin_def (strlen)
diff --git a/libc/sysdeps/powerpc/powerpc32/405/strncmp.S b/libc/sysdeps/powerpc/powerpc32/405/strncmp.S
new file mode 100644
index 000000000..3e2ba5f85
--- /dev/null
+++ b/libc/sysdeps/powerpc/powerpc32/405/strncmp.S
@@ -0,0 +1,128 @@
+/* Optimized strncmp implementation for PowerPC476.
+ Copyright (C) 2010-2013 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library. If not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <sysdep.h>
+
+/* strncmp
+
+ Register Use
+ r0:temp return equality
+ r3:source1 address, return equality
+ r4:source2 address
+ r5:byte count
+
+ Implementation description
+ Touch in 3 lines of D-cache.
+ If source1 or source2 is unaligned, compare 0-3 bytes first so that source1 is aligned.
+ Compare two words at a time from src1 and src2. If they differ, jump to
+ the end and return src1 > src2 or src1 < src2.
+ If a null byte is found, compare the bytes up to the null, then jump to
+ the end and return src1 > src2, src1 < src2 or src1 = src2.
+ When the count is exhausted, compare the remaining bytes, then jump to
+ the end and return src1 > src2, src1 < src2 or src1 = src2.
+ If src1 = src2 and neither a null nor the end of the count is reached, repeat. */
+
+EALIGN (strncmp,5,0)
+ neg r7,r3
+ clrlwi r7,r7,20
+ neg r8,r4
+ clrlwi r8,r8,20
+ srwi. r7,r7,3
+ beq L(prebyte_count_loop)
+ srwi. r8,r8,3
+ beq L(prebyte_count_loop)
+ cmplw r7,r8
+ mtctr r7
+ ble L(preword2_count_loop)
+ mtctr r8
+
+L(preword2_count_loop):
+ srwi. r6,r5,3
+ beq L(prebyte_count_loop)
+ mfctr r7
+ cmplw r6,r7
+ bgt L(set_count_loop)
+ mtctr r6
+ clrlwi r5,r5,29
+
+L(word2_count_loop):
+ lwz r10,0(r3)
+ lwz r6,4(r3)
+ addi r3,r3,0x08
+ lwz r8,0(r4)
+ lwz r9,4(r4)
+ addi r4,r4,0x08
+ dlmzb. r12,r10,r6
+ bne L(end_check)
+ cmplw r10,r8
+ bne L(st1)
+ cmplw r6,r9
+ bne L(st1)
+ bdnz L(word2_count_loop)
+
+L(prebyte_count_loop):
+ addi r5,r5,1
+ mtctr r5
+ bdz L(end_strncmp)
+
+L(byte_count_loop):
+ lbz r6,0(r3)
+ addi r3,r3,1
+ lbz r7,0(r4)
+ addi r4,r4,1
+ cmplw r6,r7
+ bne L(st1)
+ cmpwi r6,0
+ beq L(end_strncmp)
+ bdnz L(byte_count_loop)
+ b L(end_strncmp)
+
+L(set_count_loop):
+ slwi r7,r7,3
+ subf r5,r7,r5
+ b L(word2_count_loop)
+
+L(end_check):
+ subfic r12,r12,4
+ blt L(end_check2)
+ rlwinm r12,r12,3,0,31
+ srw r10,r10,r12
+ srw r8,r8,r12
+ cmplw r10,r8
+ bne L(st1)
+ b L(end_strncmp)
+
+L(end_check2):
+ addi r12,r12,4
+ cmplw r10,r8
+ rlwinm r12,r12,3,0,31
+ bne L(st1)
+ srw r6,r6,r12
+ srw r9,r9,r12
+ cmplw r6,r9
+ bne L(st1)
+
+L(end_strncmp):
+ addi r3,r0,0
+ blr
+
+L(st1):
+ mfcr r3
+ blr
+END (strncmp)
+libc_hidden_builtin_def (strncmp)
diff --git a/libc/sysdeps/powerpc/powerpc32/440/Implies b/libc/sysdeps/powerpc/powerpc32/440/Implies
new file mode 100644
index 000000000..70c0d2eda
--- /dev/null
+++ b/libc/sysdeps/powerpc/powerpc32/440/Implies
@@ -0,0 +1,2 @@
+powerpc/powerpc32/405/fpu
+powerpc/powerpc32/405
diff --git a/libc/sysdeps/powerpc/powerpc32/464/Implies b/libc/sysdeps/powerpc/powerpc32/464/Implies
new file mode 100644
index 000000000..c3e52c550
--- /dev/null
+++ b/libc/sysdeps/powerpc/powerpc32/464/Implies
@@ -0,0 +1,2 @@
+powerpc/powerpc32/440/fpu
+powerpc/powerpc32/440
diff --git a/libc/sysdeps/powerpc/powerpc32/476/Implies b/libc/sysdeps/powerpc/powerpc32/476/Implies
new file mode 100644
index 000000000..2829f9cca
--- /dev/null
+++ b/libc/sysdeps/powerpc/powerpc32/476/Implies
@@ -0,0 +1,2 @@
+powerpc/powerpc32/464/fpu
+powerpc/powerpc32/464
diff --git a/libc/sysdeps/powerpc/powerpc32/476/memset.S b/libc/sysdeps/powerpc/powerpc32/476/memset.S
new file mode 100644
index 000000000..48c21d620
--- /dev/null
+++ b/libc/sysdeps/powerpc/powerpc32/476/memset.S
@@ -0,0 +1,152 @@
+/* Optimized memset for PowerPC476 (128-byte cacheline).
+ Copyright (C) 2010-2013 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library. If not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <sysdep.h>
+
+/* memset
+
+ r3:destination address and return address
+ r4:source integer to copy
+ r5:byte count
+ r11:source integer copied into all 32 bits of reg
+ r12:temp return address
+
+ Save return address in r12
+ If the destination is unaligned and the count is greater than 255 bytes,
+ set 0-3 bytes to make the destination aligned.
+ If the count is greater than 255 bytes and memory is being set to zero,
+ use dcbz to set memory where we can;
+ otherwise do the following:
+ If there are 16 or more words to set, use the 16-word set loop.
+ Finally set the 0-15 extra bytes with a string store. */
+
+EALIGN (memset, 5, 0)
+ rlwinm r11,r4,0,24,31
+ rlwimi r11,r4,8,16,23
+ rlwimi r11,r11,16,0,15
+ addi r12,r3,0
+ cmpwi r5,0x00FF
+ ble L(preword8_count_loop)
+ cmpwi r4,0x00
+ beq L(use_dcbz)
+ neg r6,r3
+ clrlwi. r6,r6,30
+ beq L(preword8_count_loop)
+ addi r8,0,1
+ mtctr r6
+ subi r3,r3,1
+
+L(unaligned_bytecopy_loop):
+ stbu r11,0x1(r3)
+ subf. r5,r8,r5
+ beq L(end_memset)
+ bdnz L(unaligned_bytecopy_loop)
+ addi r3,r3,1
+
+L(preword8_count_loop):
+ srwi. r6,r5,4
+ beq L(preword2_count_loop)
+ mtctr r6
+ addi r3,r3,-4
+ mr r8,r11
+ mr r9,r11
+ mr r10,r11
+
+L(word8_count_loop_no_dcbt):
+ stwu r8,4(r3)
+ stwu r9,4(r3)
+ subi r5,r5,0x10
+ stwu r10,4(r3)
+ stwu r11,4(r3)
+ bdnz L(word8_count_loop_no_dcbt)
+ addi r3,r3,4
+
+L(preword2_count_loop):
+ clrlwi. r7,r5,28
+ beq L(end_memset)
+ mr r8,r11
+ mr r9,r11
+ mr r10,r11
+ mtxer r7
+ stswx r8,0,r3
+
+L(end_memset):
+ addi r3,r12,0
+ blr
+
+L(use_dcbz):
+ neg r6,r3
+ clrlwi. r7,r6,28
+ beq L(skip_string_loop)
+ mr r8,r11
+ mr r9,r11
+ mr r10,r11
+ subf r5,r7,r5
+ mtxer r7
+ stswx r8,0,r3
+ add r3,r3,r7
+
+L(skip_string_loop):
+ clrlwi r8,r6,25
+ srwi. r8,r8,4
+ beq L(dcbz_pre_loop)
+ mtctr r8
+
+L(word_loop):
+ stw r11,0(r3)
+ subi r5,r5,0x10
+ stw r11,4(r3)
+ stw r11,8(r3)
+ stw r11,12(r3)
+ addi r3,r3,0x10
+ bdnz L(word_loop)
+
+L(dcbz_pre_loop):
+ srwi r6,r5,7
+ mtctr r6
+ addi r7,0,0
+
+L(dcbz_loop):
+ dcbz r3,r7
+ addi r3,r3,0x80
+ subi r5,r5,0x80
+ bdnz L(dcbz_loop)
+ srwi. r6,r5,4
+ beq L(postword2_count_loop)
+ mtctr r6
+
+L(postword8_count_loop):
+ stw r11,0(r3)
+ subi r5,r5,0x10
+ stw r11,4(r3)
+ stw r11,8(r3)
+ stw r11,12(r3)
+ addi r3,r3,0x10
+ bdnz L(postword8_count_loop)
+
+L(postword2_count_loop):
+ clrlwi. r7,r5,28
+ beq L(end_memset)
+ mr r8,r11
+ mr r9,r11
+ mr r10,r11
+ mtxer r7
+ stswx r8,0,r3
+ b L(end_memset)
+END (memset)
+libc_hidden_builtin_def (memset)
diff --git a/libc/sysdeps/powerpc/powerpc32/Makefile b/libc/sysdeps/powerpc/powerpc32/Makefile
index 64f79003a..cf620c826 100644
--- a/libc/sysdeps/powerpc/powerpc32/Makefile
+++ b/libc/sysdeps/powerpc/powerpc32/Makefile
@@ -1,8 +1,12 @@
# Powerpc32 specific build options.
-ifeq ($(with-fp),no)
-+cflags += -msoft-float
-sysdep-LDFLAGS += -msoft-float
+# Some Powerpc32 variants assume soft-fp is the default even though there is
+# an fp variant so provide -mhard-float if --with-fp is explicitly passed.
+
+ifeq ($(with-fp),yes)
++cflags += -mhard-float
+ASFLAGS += -mhard-float
+sysdep-LDFLAGS += -mhard-float
endif
ifeq ($(subdir),gmon)
diff --git a/libc/sysdeps/powerpc/powerpc32/__longjmp-common.S b/libc/sysdeps/powerpc/powerpc32/__longjmp-common.S
index 787447363..df1d5195f 100644
--- a/libc/sysdeps/powerpc/powerpc32/__longjmp-common.S
+++ b/libc/sysdeps/powerpc/powerpc32/__longjmp-common.S
@@ -24,6 +24,12 @@
# include <jmpbuf-offsets.h>
#endif
+#if defined __SPE__ || (defined __NO_FPRS__ && !defined _SOFT_FLOAT)
+# define LOAD_GP(N) evldd r##N,((JB_FPRS+((N)-14)*2)*4)(r3)
+#else
+# define LOAD_GP(N) lwz r##N,((JB_GPRS+(N)-14)*4)(r3)
+#endif
+
ENTRY (__longjmp)
#if defined PTR_DEMANGLE || defined CHECK_SP
@@ -39,13 +45,13 @@ ENTRY (__longjmp)
lwz r1,(JB_GPR1*4)(r3)
#endif
lwz r0,(JB_LR*4)(r3)
- lwz r14,((JB_GPRS+0)*4)(r3)
- lwz r15,((JB_GPRS+1)*4)(r3)
- lwz r16,((JB_GPRS+2)*4)(r3)
- lwz r17,((JB_GPRS+3)*4)(r3)
- lwz r18,((JB_GPRS+4)*4)(r3)
- lwz r19,((JB_GPRS+5)*4)(r3)
- lwz r20,((JB_GPRS+6)*4)(r3)
+ LOAD_GP (14)
+ LOAD_GP (15)
+ LOAD_GP (16)
+ LOAD_GP (17)
+ LOAD_GP (18)
+ LOAD_GP (19)
+ LOAD_GP (20)
#ifdef PTR_DEMANGLE
# ifndef CHECK_SP
PTR_DEMANGLE3 (r1, r24, r25)
@@ -53,19 +59,19 @@ ENTRY (__longjmp)
PTR_DEMANGLE2 (r0, r25)
#endif
mtlr r0
- lwz r21,((JB_GPRS+7)*4)(r3)
- lwz r22,((JB_GPRS+8)*4)(r3)
+ LOAD_GP (21)
+ LOAD_GP (22)
lwz r0,(JB_CR*4)(r3)
- lwz r23,((JB_GPRS+9)*4)(r3)
- lwz r24,((JB_GPRS+10)*4)(r3)
- lwz r25,((JB_GPRS+11)*4)(r3)
+ LOAD_GP (23)
+ LOAD_GP (24)
+ LOAD_GP (25)
mtcrf 0xFF,r0
- lwz r26,((JB_GPRS+12)*4)(r3)
- lwz r27,((JB_GPRS+13)*4)(r3)
- lwz r28,((JB_GPRS+14)*4)(r3)
- lwz r29,((JB_GPRS+15)*4)(r3)
- lwz r30,((JB_GPRS+16)*4)(r3)
- lwz r31,((JB_GPRS+17)*4)(r3)
+ LOAD_GP (26)
+ LOAD_GP (27)
+ LOAD_GP (28)
+ LOAD_GP (29)
+ LOAD_GP (30)
+ LOAD_GP (31)
mr r3,r4
blr
END (__longjmp)
diff --git a/libc/sysdeps/powerpc/powerpc32/bsd-_setjmp.S b/libc/sysdeps/powerpc/powerpc32/bsd-_setjmp.S
index 95e8a5aa1..ad2b5ffdb 100644
--- a/libc/sysdeps/powerpc/powerpc32/bsd-_setjmp.S
+++ b/libc/sysdeps/powerpc/powerpc32/bsd-_setjmp.S
@@ -30,7 +30,7 @@ libc_hidden_def (_setjmp)
/* Build a versioned object for libc. */
# if defined SHARED && SHLIB_COMPAT (libc, GLIBC_2_0, GLIBC_2_3_4)
-symbol_version (__novmx_setjmp,_setjmp,GLIBC_2.0);
+compat_symbol (libc, __novmx_setjmp, _setjmp, GLIBC_2_0);
ENTRY (__novmx_setjmp)
li r4,0 /* Set second argument to 0. */
@@ -39,7 +39,7 @@ END (__novmx_setjmp)
libc_hidden_def (__novmx_setjmp)
# endif /* defined SHARED && SHLIB_COMPAT (libc, GLIBC_2_0, GLIBC_2_3_4) */
-default_symbol_version (__vmx_setjmp,_setjmp,GLIBC_2.3.4)
+versioned_symbol (libc, __vmx_setjmp, _setjmp, GLIBC_2_3_4)
/* __GI__setjmp prototype is needed for ntpl i.e. _setjmp is defined
as a libc_hidden_proto & is used in sysdeps/generic/libc-start.c
if HAVE_CLEANUP_JMP_BUF is defined */
diff --git a/libc/sysdeps/powerpc/powerpc32/bsd-setjmp.S b/libc/sysdeps/powerpc/powerpc32/bsd-setjmp.S
index 1113ea533..5e1e860d8 100644
--- a/libc/sysdeps/powerpc/powerpc32/bsd-setjmp.S
+++ b/libc/sysdeps/powerpc/powerpc32/bsd-setjmp.S
@@ -26,7 +26,7 @@ ENTRY (__novmxsetjmp)
b __novmx__sigsetjmp@local
END (__novmxsetjmp)
strong_alias (__novmxsetjmp, __novmx__setjmp)
-symbol_version (__novmxsetjmp, setjmp, GLIBC_2.0)
+compat_symbol (libc, __novmxsetjmp, setjmp, GLIBC_2_0)
#endif /* defined SHARED && SHLIB_COMPAT (libc, GLIBC_2_0, GLIBC_2_3_4) ) */
@@ -36,4 +36,4 @@ ENTRY (__vmxsetjmp)
END (__vmxsetjmp)
strong_alias (__vmxsetjmp, __vmx__setjmp)
strong_alias (__vmx__setjmp, __setjmp)
-default_symbol_version (__vmxsetjmp,setjmp,GLIBC_2.3.4)
+versioned_symbol (libc, __vmxsetjmp, setjmp, GLIBC_2_3_4)
diff --git a/libc/sysdeps/powerpc/powerpc32/dl-machine.c b/libc/sysdeps/powerpc/powerpc32/dl-machine.c
index 3e7202d86..aba361856 100644
--- a/libc/sysdeps/powerpc/powerpc32/dl-machine.c
+++ b/libc/sysdeps/powerpc/powerpc32/dl-machine.c
@@ -416,6 +416,12 @@ __process_machine_rela (struct link_map *map,
Elf32_Addr const finaladdr,
int rinfo)
{
+ union unaligned
+ {
+ uint16_t u2;
+ uint32_t u4;
+ } __attribute__((__packed__));
+
switch (rinfo)
{
case R_PPC_NONE:
@@ -432,10 +438,7 @@ __process_machine_rela (struct link_map *map,
return;
case R_PPC_UADDR32:
- ((char *) reloc_addr)[0] = finaladdr >> 24;
- ((char *) reloc_addr)[1] = finaladdr >> 16;
- ((char *) reloc_addr)[2] = finaladdr >> 8;
- ((char *) reloc_addr)[3] = finaladdr;
+ ((union unaligned *) reloc_addr)->u4 = finaladdr;
break;
case R_PPC_ADDR24:
@@ -453,8 +456,7 @@ __process_machine_rela (struct link_map *map,
case R_PPC_UADDR16:
if (__builtin_expect (finaladdr > 0x7fff && finaladdr < 0xffff8000, 0))
_dl_reloc_overflow (map, "R_PPC_UADDR16", reloc_addr, refsym);
- ((char *) reloc_addr)[0] = finaladdr >> 8;
- ((char *) reloc_addr)[1] = finaladdr;
+ ((union unaligned *) reloc_addr)->u2 = finaladdr;
break;
case R_PPC_ADDR16_LO:
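
The packed union introduced above lets the compiler emit a single (possibly unaligned) store for R_PPC_UADDR32 and R_PPC_UADDR16 instead of the hand-written byte stores it replaces. A stand-alone sketch of the same technique, with invented names, not taken from the patch:

    #include <stdint.h>

    /* The packed attribute tells the compiler the access may be unaligned,
       so it emits whatever store sequence the target needs.  */
    union unaligned_word
    {
      uint16_t u2;
      uint32_t u4;
    } __attribute__ ((__packed__));

    static void
    store_unaligned_32 (void *addr, uint32_t value)
    {
      ((union unaligned_word *) addr)->u4 = value;
    }
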
diff --git a/libc/sysdeps/powerpc/powerpc32/e500/nofpu/Makefile b/libc/sysdeps/powerpc/powerpc32/e500/nofpu/Makefile
new file mode 100644
index 000000000..adf556870
--- /dev/null
+++ b/libc/sysdeps/powerpc/powerpc32/e500/nofpu/Makefile
@@ -0,0 +1,9 @@
+ifeq ($(subdir),math)
+libm-routines += fexcepts_to_spe fexcepts_from_spe
+libm-routines += fexcepts_to_prctl fexcepts_from_prctl
+libm-routines += fe_note_change
+endif
+
+ifeq ($(subdir),soft-fp)
+sysdep_routines += fraiseexcept-soft
+endif
diff --git a/libc/sysdeps/powerpc/powerpc32/e500/nofpu/fclrexcpt.c b/libc/sysdeps/powerpc/powerpc32/e500/nofpu/fclrexcpt.c
new file mode 100644
index 000000000..92a7dd1e0
--- /dev/null
+++ b/libc/sysdeps/powerpc/powerpc32/e500/nofpu/fclrexcpt.c
@@ -0,0 +1,53 @@
+/* Clear given exceptions in current floating-point environment. e500 version.
+ Copyright (C) 2004-2013 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <fenv_libc.h>
+
+#undef feclearexcept
+int
+__feclearexcept (int excepts)
+{
+ unsigned int fpescr;
+ int excepts_spe = __fexcepts_to_spe (excepts);
+
+ /* Get the current state. */
+ fpescr = fegetenv_register ();
+
+ /* Clear the relevant bits. */
+ fpescr &= ~excepts_spe;
+
+ /* Put the new state in effect. */
+ fesetenv_register (fpescr);
+
+ /* Let the kernel know if the "invalid" or "underflow" bit was
+ cleared. */
+ if (excepts & (FE_INVALID | FE_UNDERFLOW))
+ __fe_note_change ();
+
+ /* Success. */
+ return 0;
+}
+
+#include <shlib-compat.h>
+#if SHLIB_COMPAT (libm, GLIBC_2_1, GLIBC_2_2)
+strong_alias (__feclearexcept, __old_feclearexcept)
+compat_symbol (libm, __old_feclearexcept, feclearexcept, GLIBC_2_1);
+#endif
+
+libm_hidden_ver (__feclearexcept, feclearexcept)
+versioned_symbol (libm, __feclearexcept, feclearexcept, GLIBC_2_2);
diff --git a/libc/sysdeps/powerpc/powerpc32/e500/nofpu/fe_note_change.c b/libc/sysdeps/powerpc/powerpc32/e500/nofpu/fe_note_change.c
new file mode 100644
index 000000000..43a570626
--- /dev/null
+++ b/libc/sysdeps/powerpc/powerpc32/e500/nofpu/fe_note_change.c
@@ -0,0 +1,39 @@
+/* Note a change to floating-point exceptions.
+ Copyright (C) 2013 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <fenv_libc.h>
+#include <sysdep.h>
+#include <sys/prctl.h>
+
+/* Inform the kernel of a change to floating-point exceptions. */
+
+void
+__fe_note_change (void)
+{
+ int pflags, r;
+ INTERNAL_SYSCALL_DECL (err);
+
+ r = INTERNAL_SYSCALL (prctl, err, 2, PR_GET_FPEXC, &pflags);
+ if (INTERNAL_SYSCALL_ERROR_P (r, err))
+ return;
+ if ((pflags & PR_FP_EXC_SW_ENABLE) == 0)
+ INTERNAL_SYSCALL (prctl, err, 2, PR_SET_FPEXC,
+ pflags | PR_FP_EXC_SW_ENABLE);
+}
+
+libm_hidden_def (__fe_note_change)
diff --git a/libc/sysdeps/powerpc/powerpc32/e500/nofpu/fedisblxcpt.c b/libc/sysdeps/powerpc/powerpc32/e500/nofpu/fedisblxcpt.c
new file mode 100644
index 000000000..7cc963c01
--- /dev/null
+++ b/libc/sysdeps/powerpc/powerpc32/e500/nofpu/fedisblxcpt.c
@@ -0,0 +1,54 @@
+/* Disable floating-point exceptions. e500 version.
+ Copyright (C) 2004-2013 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <fenv_libc.h>
+#include <sysdep.h>
+#include <sys/prctl.h>
+
+int
+fedisableexcept (int excepts)
+{
+ int result = 0, pflags, r;
+ INTERNAL_SYSCALL_DECL (err);
+
+ r = INTERNAL_SYSCALL (prctl, err, 2, PR_GET_FPEXC, &pflags);
+ if (INTERNAL_SYSCALL_ERROR_P (r, err))
+ return -1;
+
+ /* Save old enable bits. */
+ result = __fexcepts_from_prctl (pflags);
+
+ pflags &= ~__fexcepts_to_prctl (excepts);
+ r = INTERNAL_SYSCALL (prctl, err, 2, PR_SET_FPEXC,
+ pflags | PR_FP_EXC_SW_ENABLE);
+ if (INTERNAL_SYSCALL_ERROR_P (r, err))
+ return -1;
+
+ /* If disabling signals for "inexact", also disable trapping to the
+ kernel. */
+ if ((excepts & FE_INEXACT) != 0)
+ {
+ unsigned long fpescr;
+
+ fpescr = fegetenv_register ();
+ fpescr &= ~SPEFSCR_FINXE;
+ fesetenv_register (fpescr);
+ }
+
+ return result;
+}
diff --git a/libc/sysdeps/powerpc/powerpc32/e500/nofpu/feenablxcpt.c b/libc/sysdeps/powerpc/powerpc32/e500/nofpu/feenablxcpt.c
new file mode 100644
index 000000000..133dde7b3
--- /dev/null
+++ b/libc/sysdeps/powerpc/powerpc32/e500/nofpu/feenablxcpt.c
@@ -0,0 +1,54 @@
+/* Enable floating-point exceptions. e500 version.
+ Copyright (C) 2004-2013 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <fenv_libc.h>
+#include <sysdep.h>
+#include <sys/prctl.h>
+
+int
+feenableexcept (int excepts)
+{
+ unsigned int result = 0, pflags, r;
+ INTERNAL_SYSCALL_DECL (err);
+
+ r = INTERNAL_SYSCALL (prctl, err, 2, PR_GET_FPEXC, &pflags);
+ if (INTERNAL_SYSCALL_ERROR_P (r, err))
+ return -1;
+
+ /* Save old enable bits. */
+ result = __fexcepts_from_prctl (pflags);
+
+ pflags |= __fexcepts_to_prctl (excepts);
+ r = INTERNAL_SYSCALL (prctl, err, 2, PR_SET_FPEXC,
+ pflags | PR_FP_EXC_SW_ENABLE);
+ if (INTERNAL_SYSCALL_ERROR_P (r, err))
+ return -1;
+
+ /* If enabling signals for "inexact", also enable trapping to the
+ kernel. */
+ if ((excepts & FE_INEXACT) != 0)
+ {
+ unsigned long fpescr;
+
+ fpescr = fegetenv_register ();
+ fpescr |= SPEFSCR_FINXE;
+ fesetenv_register (fpescr);
+ }
+
+ return result;
+}
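
feenableexcept and fedisableexcept are the usual GNU extensions to <fenv.h>; a minimal, hypothetical caller of the e500 implementation above might look like the following. On this target the call updates the prctl enable bits, and enabling FE_INEXACT would additionally set the SPEFSCR trap-enable bit as shown in the code.

    #define _GNU_SOURCE
    #include <fenv.h>

    /* Illustrative only: enable traps for divide-by-zero and invalid
       operations, reporting failure if the kernel refuses.  */
    static int
    enable_fp_traps (void)
    {
      if (feenableexcept (FE_DIVBYZERO | FE_INVALID) == -1)
        return -1;
      return 0;
    }
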
diff --git a/libc/sysdeps/powerpc/powerpc32/e500/nofpu/fegetenv.c b/libc/sysdeps/powerpc/powerpc32/e500/nofpu/fegetenv.c
new file mode 100644
index 000000000..bfcbca2ad
--- /dev/null
+++ b/libc/sysdeps/powerpc/powerpc32/e500/nofpu/fegetenv.c
@@ -0,0 +1,47 @@
+/* Store current floating-point environment. e500 version.
+ Copyright (C) 2004-2013 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <fenv_libc.h>
+#include <sysdep.h>
+#include <sys/prctl.h>
+
+int
+__fegetenv (fenv_t *envp)
+{
+ fenv_union_t u;
+ INTERNAL_SYSCALL_DECL (err);
+ int r;
+
+ r = INTERNAL_SYSCALL (prctl, err, 2, PR_GET_FPEXC, &u.l[0]);
+ if (INTERNAL_SYSCALL_ERROR_P (r, err))
+ return -1;
+
+ u.l[1] = fegetenv_register ();
+ *envp = u.fenv;
+
+ /* Success. */
+ return 0;
+}
+
+#include <shlib-compat.h>
+#if SHLIB_COMPAT (libm, GLIBC_2_1, GLIBC_2_2)
+strong_alias (__fegetenv, __old_fegetenv)
+compat_symbol (libm, __old_fegetenv, fegetenv, GLIBC_2_1);
+#endif
+
+versioned_symbol (libm, __fegetenv, fegetenv, GLIBC_2_2);
diff --git a/libc/sysdeps/powerpc/powerpc32/e500/nofpu/fegetexcept.c b/libc/sysdeps/powerpc/powerpc32/e500/nofpu/fegetexcept.c
new file mode 100644
index 000000000..9c7afc74f
--- /dev/null
+++ b/libc/sysdeps/powerpc/powerpc32/e500/nofpu/fegetexcept.c
@@ -0,0 +1,36 @@
+/* Get floating-point exceptions. e500 version.
+ Copyright (C) 2004-2013 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <fenv_libc.h>
+#include <sysdep.h>
+#include <sys/prctl.h>
+
+int
+fegetexcept (void)
+{
+ int result = 0, pflags, r;
+ INTERNAL_SYSCALL_DECL (err);
+
+ r = INTERNAL_SYSCALL (prctl, err, 2, PR_GET_FPEXC, &pflags);
+ if (INTERNAL_SYSCALL_ERROR_P (r, err))
+ return -1;
+
+ result = __fexcepts_from_prctl (pflags);
+
+ return result;
+}
diff --git a/libc/sysdeps/powerpc/powerpc32/e500/nofpu/fegetround.c b/libc/sysdeps/powerpc/powerpc32/e500/nofpu/fegetround.c
new file mode 100644
index 000000000..f69e9a5bd
--- /dev/null
+++ b/libc/sysdeps/powerpc/powerpc32/e500/nofpu/fegetround.c
@@ -0,0 +1,29 @@
+/* Return current rounding direction. e500 version.
+ Copyright (C) 2004-2013 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <fenv_libc.h>
+
+#undef fegetround
+int
+fegetround (void)
+{
+ unsigned long fpescr;
+
+ fpescr = fegetenv_register ();
+ return fpescr & 3;
+}
diff --git a/libc/sysdeps/powerpc/powerpc32/e500/nofpu/feholdexcpt.c b/libc/sysdeps/powerpc/powerpc32/e500/nofpu/feholdexcpt.c
new file mode 100644
index 000000000..bd05ebd3c
--- /dev/null
+++ b/libc/sysdeps/powerpc/powerpc32/e500/nofpu/feholdexcpt.c
@@ -0,0 +1,57 @@
+/* Store current floating-point environment and clear exceptions.
+ e500 version.
+ Copyright (C) 2004-2013 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <fenv_libc.h>
+#include <sysdep.h>
+#include <sys/prctl.h>
+
+int
+feholdexcept (fenv_t *envp)
+{
+ fenv_union_t u;
+ INTERNAL_SYSCALL_DECL (err);
+ int r;
+
+ /* Get the current state. */
+ r = INTERNAL_SYSCALL (prctl, err, 2, PR_GET_FPEXC, &u.l[0]);
+ if (INTERNAL_SYSCALL_ERROR_P (r, err))
+ return -1;
+
+ u.l[1] = fegetenv_register ();
+ *envp = u.fenv;
+
+ /* Clear everything except for the rounding mode and trapping to the
+ kernel. */
+ u.l[0] &= ~(PR_FP_EXC_DIV
+ | PR_FP_EXC_OVF
+ | PR_FP_EXC_UND
+ | PR_FP_EXC_RES
+ | PR_FP_EXC_INV);
+ u.l[1] &= SPEFSCR_FRMC | (SPEFSCR_ALL_EXCEPT_ENABLE & ~SPEFSCR_FINXE);
+
+ /* Put the new state in effect. */
+ fesetenv_register (u.l[1]);
+ r = INTERNAL_SYSCALL (prctl, err, 2, PR_SET_FPEXC,
+ u.l[0] | PR_FP_EXC_SW_ENABLE);
+ if (INTERNAL_SYSCALL_ERROR_P (r, err))
+ return -1;
+
+ return 0;
+}
+libm_hidden_def (feholdexcept)
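
feholdexcept is normally paired with fesetenv or feupdateenv around code that may raise spurious exceptions. A small illustrative caller, not part of the patch:

    #include <fenv.h>

    /* Run a computation with exceptions held, then restore the saved
       environment, discarding any flags raised in between.  */
    static double
    quiet_div (double a, double b)
    {
      fenv_t env;
      double r;

      feholdexcept (&env);              /* save state and mask exceptions */
      r = a / b;
      feclearexcept (FE_ALL_EXCEPT);    /* drop flags raised meanwhile */
      fesetenv (&env);                  /* restore the saved environment */
      return r;
    }
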
diff --git a/libc/sysdeps/powerpc/powerpc32/e500/nofpu/fenv_const.c b/libc/sysdeps/powerpc/powerpc32/e500/nofpu/fenv_const.c
new file mode 100644
index 000000000..3a85f1810
--- /dev/null
+++ b/libc/sysdeps/powerpc/powerpc32/e500/nofpu/fenv_const.c
@@ -0,0 +1,41 @@
+/* Constant floating-point environments for e500.
+ Copyright (C) 2004-2013 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+/* The use of "unsigned long long" as the type to define the
+ bit-pattern explicitly, rather than the type "double" used in
+ <bits/fenv.h>, means that we cannot include <fenv_libc.h> here to
+ get the enum constants for the SPEFSCR bits to enable
+ exceptions. */
+
+#include <sys/prctl.h>
+
+/* If the default argument is used we use this value. */
+const unsigned long long __fe_dfl_env __attribute__ ((aligned (8))) =
+ 0x3cULL;
+
+/* Floating-point environment where none of the exceptions are masked. */
+const unsigned long long __fe_enabled_env __attribute__ ((aligned (8))) =
+ (((unsigned long long) (PR_FP_EXC_DIV
+ | PR_FP_EXC_OVF
+ | PR_FP_EXC_UND
+ | PR_FP_EXC_RES
+ | PR_FP_EXC_INV)) << 32) | 0x7cULL;
+
+/* Non-IEEE mode. */
+const unsigned long long __fe_nonieee_env __attribute__ ((aligned (8))) =
+ 0x0ULL;
diff --git a/libc/sysdeps/powerpc/powerpc32/e500/nofpu/fenv_libc.h b/libc/sysdeps/powerpc/powerpc32/e500/nofpu/fenv_libc.h
new file mode 100644
index 000000000..96375808d
--- /dev/null
+++ b/libc/sysdeps/powerpc/powerpc32/e500/nofpu/fenv_libc.h
@@ -0,0 +1,96 @@
+/* Internal libc stuff for floating point environment routines. e500 version.
+ Copyright (C) 2004-2013 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#ifndef _FENV_LIBC_H
+#define _FENV_LIBC_H 1
+
+#include <fenv.h>
+
+int __feraiseexcept_spe (int);
+libm_hidden_proto (__feraiseexcept_spe)
+
+int __fexcepts_to_spe (int);
+libm_hidden_proto (__fexcepts_to_spe)
+
+int __fexcepts_from_spe (int);
+libm_hidden_proto (__fexcepts_from_spe)
+
+int __fexcepts_to_prctl (int);
+libm_hidden_proto (__fexcepts_to_prctl)
+
+int __fexcepts_from_prctl (int);
+libm_hidden_proto (__fexcepts_from_prctl)
+
+void __fe_note_change (void);
+libm_hidden_proto (__fe_note_change)
+
+/* Equivalent to fegetenv, but returns an unsigned int instead of
+ taking a pointer. */
+#define fegetenv_register() \
+ ({ unsigned int fscr; asm volatile ("mfspefscr %0" : "=r" (fscr)); fscr; })
+
+/* Equivalent to fesetenv, but takes an unsigned int instead of a
+ pointer. */
+#define fesetenv_register(fscr) \
+ ({ asm volatile ("mtspefscr %0" : : "r" (fscr)); })
+
+typedef union
+{
+ fenv_t fenv;
+ unsigned int l[2];
+} fenv_union_t;
+
+/* Definitions of all the SPEFSCR bit numbers. */
+enum {
+ SPEFSCR_SOVH = 0x80000000,
+ SPEFSCR_OVH = 0x40000000,
+ SPEFSCR_FGH = 0x20000000,
+ SPEFSCR_FXH = 0x10000000,
+ SPEFSCR_FINVH = 0x08000000,
+ SPEFSCR_FDBZH = 0x04000000,
+ SPEFSCR_FUNFH = 0x02000000,
+ SPEFSCR_FOVFH = 0x01000000,
+ /* 2 unused bits. */
+ SPEFSCR_FINXS = 0x00200000,
+ SPEFSCR_FINVS = 0x00100000,
+ SPEFSCR_FDBZS = 0x00080000,
+ SPEFSCR_FUNFS = 0x00040000,
+ SPEFSCR_FOVFS = 0x00020000,
+ /* Combination of the exception bits. */
+ SPEFSCR_ALL_EXCEPT = 0x003e0000,
+ SPEFSCR_MODE = 0x00010000,
+ SPEFSCR_SOV = 0x00008000,
+ SPEFSCR_OV = 0x00004000,
+ SPEFSCR_FG = 0x00002000,
+ SPEFSCR_FX = 0x00001000,
+ SPEFSCR_FINV = 0x00000800,
+ SPEFSCR_FDBZ = 0x00000400,
+ SPEFSCR_FUNF = 0x00000200,
+ SPEFSCR_FOVF = 0x00000100,
+ /* 1 unused bit. */
+ SPEFSCR_FINXE = 0x00000040,
+ SPEFSCR_FINVE = 0x00000020,
+ SPEFSCR_FDBZE = 0x00000010,
+ SPEFSCR_FUNFE = 0x00000008,
+ SPEFSCR_FOVFE = 0x00000004,
+ /* Combination of the exception trap enable bits. */
+ SPEFSCR_ALL_EXCEPT_ENABLE = 0x0000007c,
+ SPEFSCR_FRMC = 0x00000003
+};
+
+#endif /* fenv_libc.h */
diff --git a/libc/sysdeps/powerpc/powerpc32/e500/nofpu/fesetenv.c b/libc/sysdeps/powerpc/powerpc32/e500/nofpu/fesetenv.c
new file mode 100644
index 000000000..411e6be8d
--- /dev/null
+++ b/libc/sysdeps/powerpc/powerpc32/e500/nofpu/fesetenv.c
@@ -0,0 +1,49 @@
+/* Install given floating-point environment. e500 version.
+ Copyright (C) 1997-2013 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <fenv_libc.h>
+#include <sysdep.h>
+#include <sys/prctl.h>
+
+int
+__fesetenv (const fenv_t *envp)
+{
+ fenv_union_t u;
+ INTERNAL_SYSCALL_DECL (err);
+ int r;
+
+ u.fenv = *envp;
+
+ fesetenv_register (u.l[1]);
+ r = INTERNAL_SYSCALL (prctl, err, 2, PR_SET_FPEXC,
+ u.l[0] | PR_FP_EXC_SW_ENABLE);
+ if (INTERNAL_SYSCALL_ERROR_P (r, err))
+ return -1;
+
+ /* Success. */
+ return 0;
+}
+
+#include <shlib-compat.h>
+#if SHLIB_COMPAT (libm, GLIBC_2_1, GLIBC_2_2)
+strong_alias (__fesetenv, __old_fesetenv)
+compat_symbol (libm, __old_fesetenv, fesetenv, GLIBC_2_1);
+#endif
+
+libm_hidden_ver (__fesetenv, fesetenv)
+versioned_symbol (libm, __fesetenv, fesetenv, GLIBC_2_2);
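
__fesetenv splits the saved environment in two: the SPEFSCR word is written directly with mtspefscr, while the kernel's floating-point exception mode is restored through prctl (PR_SET_FPEXC). From the caller's side this split is invisible; a hedged usage sketch (not part of the patch):

/* Usage sketch only: save and restore the full environment, which
   exercises both halves handled above.  */
#include <fenv.h>

void
run_in_default_env (void (*fn) (void))
{
  fenv_t saved;
  fegetenv (&saved);
  fesetenv (FE_DFL_ENV);   /* default rounding, no traps, clear status */
  fn ();
  fesetenv (&saved);
}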
diff --git a/libc/sysdeps/powerpc/powerpc32/e500/nofpu/fesetround.c b/libc/sysdeps/powerpc/powerpc32/e500/nofpu/fesetround.c
new file mode 100644
index 000000000..805008e0c
--- /dev/null
+++ b/libc/sysdeps/powerpc/powerpc32/e500/nofpu/fesetround.c
@@ -0,0 +1,35 @@
+/* Set current rounding direction. e500 version.
+ Copyright (C) 2004-2013 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <fenv_libc.h>
+
+int
+fesetround (int round)
+{
+ unsigned long fpescr;
+
+ if ((unsigned int) round > 3)
+ return 1;
+
+ fpescr = fegetenv_register ();
+ fpescr = (fpescr & ~SPEFSCR_FRMC) | (round & 3);
+ fesetenv_register (fpescr);
+
+ return 0;
+}
+libm_hidden_def (fesetround)
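
fesetround only touches the two SPEFSCR_FRMC bits and follows C99 in returning nonzero when the requested mode is out of range. A hedged usage sketch (not part of the patch; assumes the target defines FE_TOWARDZERO):

/* Usage sketch only.  */
#include <fenv.h>

int
with_truncation (double x, double y, double *out)
{
  int old = fegetround ();
  if (fesetround (FE_TOWARDZERO) != 0)
    return -1;                      /* mode not supported */
  *out = x / y;                     /* rounded toward zero */
  fesetround (old);
  return 0;
}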
diff --git a/libc/sysdeps/powerpc/powerpc32/e500/nofpu/feupdateenv.c b/libc/sysdeps/powerpc/powerpc32/e500/nofpu/feupdateenv.c
new file mode 100644
index 000000000..505c92363
--- /dev/null
+++ b/libc/sysdeps/powerpc/powerpc32/e500/nofpu/feupdateenv.c
@@ -0,0 +1,47 @@
+/* Install given floating-point environment and raise exceptions.
+ e500 version.
+ Copyright (C) 2004-2013 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <fenv_libc.h>
+
+int
+__feupdateenv (const fenv_t *envp)
+{
+ int exc;
+
+ /* Save the currently set exceptions. */
+ exc = fegetenv_register () & SPEFSCR_ALL_EXCEPT;
+
+ /* Install new environment. */
+ fesetenv (envp);
+
+ /* Raise (if appropriate) saved exceptions. */
+ __feraiseexcept_spe (exc);
+
+ /* Success. */
+ return 0;
+}
+
+#include <shlib-compat.h>
+#if SHLIB_COMPAT (libm, GLIBC_2_1, GLIBC_2_2)
+strong_alias (__feupdateenv, __old_feupdateenv)
+compat_symbol (libm, __old_feupdateenv, feupdateenv, GLIBC_2_1);
+#endif
+
+libm_hidden_ver (__feupdateenv, feupdateenv)
+versioned_symbol (libm, __feupdateenv, feupdateenv, GLIBC_2_2);
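
__feupdateenv captures the currently raised SPEFSCR status bits, installs the new environment, then re-raises what was pending. That is exactly the contract needed by the standard feholdexcept/feupdateenv pairing; a hedged sketch of a caller (not part of the patch):

/* Usage sketch only.  */
#include <fenv.h>

double
guarded_div (double x, double y)
{
  fenv_t env;
  feholdexcept (&env);     /* save env, clear status, go non-stop */
  double r = x / y;        /* may set FE_DIVBYZERO, FE_INVALID, ... */
  feupdateenv (&env);      /* restore env, then re-raise saved status */
  return r;
}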
diff --git a/libc/sysdeps/powerpc/powerpc32/e500/nofpu/fexcepts_from_prctl.c b/libc/sysdeps/powerpc/powerpc32/e500/nofpu/fexcepts_from_prctl.c
new file mode 100644
index 000000000..c094a04cb
--- /dev/null
+++ b/libc/sysdeps/powerpc/powerpc32/e500/nofpu/fexcepts_from_prctl.c
@@ -0,0 +1,42 @@
+/* Convert floating-point exceptions from prctl form.
+ Copyright (C) 2013 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <fenv_libc.h>
+#include <sys/prctl.h>
+
+/* Convert EXCEPTS from prctl bits to FE_* form, returning the
+ converted value. */
+
+int
+__fexcepts_from_prctl (int excepts)
+{
+ int result = 0;
+ if (excepts & PR_FP_EXC_OVF)
+ result |= FE_OVERFLOW;
+ if (excepts & PR_FP_EXC_UND)
+ result |= FE_UNDERFLOW;
+ if (excepts & PR_FP_EXC_INV)
+ result |= FE_INVALID;
+ if (excepts & PR_FP_EXC_DIV)
+ result |= FE_DIVBYZERO;
+ if (excepts & PR_FP_EXC_RES)
+ result |= FE_INEXACT;
+ return result;
+}
+
+libm_hidden_def (__fexcepts_from_prctl)
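
Together with the three sibling files below, this gives all four conversions between FE_* bits, SPEFSCR status bits and prctl PR_FP_EXC_* bits. A hedged sketch of the round-trip property the rest of the port relies on (conceptual only, and not part of the patch; these helpers are internal to libc/libm):

/* Sketch only: converting a single FE_* bit to prctl form and back
   should be the identity.  */
#include <assert.h>
#include <fenv.h>

extern int __fexcepts_to_prctl (int);
extern int __fexcepts_from_prctl (int);

static void
check_roundtrip (void)
{
  static const int bits[] = { FE_INEXACT, FE_DIVBYZERO, FE_UNDERFLOW,
                              FE_OVERFLOW, FE_INVALID };
  for (unsigned int i = 0; i < sizeof bits / sizeof bits[0]; i++)
    assert (__fexcepts_from_prctl (__fexcepts_to_prctl (bits[i])) == bits[i]);
}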
diff --git a/libc/sysdeps/powerpc/powerpc32/e500/nofpu/fexcepts_from_spe.c b/libc/sysdeps/powerpc/powerpc32/e500/nofpu/fexcepts_from_spe.c
new file mode 100644
index 000000000..3ec939d18
--- /dev/null
+++ b/libc/sysdeps/powerpc/powerpc32/e500/nofpu/fexcepts_from_spe.c
@@ -0,0 +1,41 @@
+/* Convert floating-point exceptions from SPEFSCR form.
+ Copyright (C) 2013 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <fenv_libc.h>
+
+/* Convert EXCEPTS from SPEFSCR bits to FE_* form, returning the
+ converted value. */
+
+int
+__fexcepts_from_spe (int excepts)
+{
+ int result = 0;
+ if (excepts & SPEFSCR_FINXS)
+ result |= FE_INEXACT;
+ if (excepts & SPEFSCR_FDBZS)
+ result |= FE_DIVBYZERO;
+ if (excepts & SPEFSCR_FUNFS)
+ result |= FE_UNDERFLOW;
+ if (excepts & SPEFSCR_FOVFS)
+ result |= FE_OVERFLOW;
+ if (excepts & SPEFSCR_FINVS)
+ result |= FE_INVALID;
+ return result;
+}
+
+libm_hidden_def (__fexcepts_from_spe)
diff --git a/libc/sysdeps/powerpc/powerpc32/e500/nofpu/fexcepts_to_prctl.c b/libc/sysdeps/powerpc/powerpc32/e500/nofpu/fexcepts_to_prctl.c
new file mode 100644
index 000000000..b9c51b125
--- /dev/null
+++ b/libc/sysdeps/powerpc/powerpc32/e500/nofpu/fexcepts_to_prctl.c
@@ -0,0 +1,42 @@
+/* Convert floating-point exceptions to prctl form.
+ Copyright (C) 2013 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <fenv_libc.h>
+#include <sys/prctl.h>
+
+/* Convert EXCEPTS from FE_* form to prctl bits, returning the
+ converted value. */
+
+int
+__fexcepts_to_prctl (int excepts)
+{
+ int result = 0;
+ if (excepts & FE_INEXACT)
+ result |= PR_FP_EXC_RES;
+ if (excepts & FE_DIVBYZERO)
+ result |= PR_FP_EXC_DIV;
+ if (excepts & FE_UNDERFLOW)
+ result |= PR_FP_EXC_UND;
+ if (excepts & FE_OVERFLOW)
+ result |= PR_FP_EXC_OVF;
+ if (excepts & FE_INVALID)
+ result |= PR_FP_EXC_INV;
+ return result;
+}
+
+libm_hidden_def (__fexcepts_to_prctl)
diff --git a/libc/sysdeps/powerpc/powerpc32/e500/nofpu/fexcepts_to_spe.c b/libc/sysdeps/powerpc/powerpc32/e500/nofpu/fexcepts_to_spe.c
new file mode 100644
index 000000000..570934d15
--- /dev/null
+++ b/libc/sysdeps/powerpc/powerpc32/e500/nofpu/fexcepts_to_spe.c
@@ -0,0 +1,41 @@
+/* Convert floating-point exceptions to SPEFSCR form.
+ Copyright (C) 2013 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <fenv_libc.h>
+
+/* Convert EXCEPTS from FE_* form to SPEFSCR bits, returning the
+ converted value. */
+
+int
+__fexcepts_to_spe (int excepts)
+{
+ int result = 0;
+ if (excepts & FE_INEXACT)
+ result |= SPEFSCR_FINXS;
+ if (excepts & FE_DIVBYZERO)
+ result |= SPEFSCR_FDBZS;
+ if (excepts & FE_UNDERFLOW)
+ result |= SPEFSCR_FUNFS;
+ if (excepts & FE_OVERFLOW)
+ result |= SPEFSCR_FOVFS;
+ if (excepts & FE_INVALID)
+ result |= SPEFSCR_FINVS;
+ return result;
+}
+
+libm_hidden_def (__fexcepts_to_spe)
diff --git a/libc/sysdeps/powerpc/powerpc32/e500/nofpu/fgetexcptflg.c b/libc/sysdeps/powerpc/powerpc32/e500/nofpu/fgetexcptflg.c
new file mode 100644
index 000000000..b01cadeff
--- /dev/null
+++ b/libc/sysdeps/powerpc/powerpc32/e500/nofpu/fgetexcptflg.c
@@ -0,0 +1,41 @@
+/* Store current representation for exceptions. e500 version.
+ Copyright (C) 2004-2013 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <fenv_libc.h>
+
+int
+__fegetexceptflag (fexcept_t *flagp, int excepts)
+{
+ unsigned long fpescr;
+
+ /* Get the current state. */
+ fpescr = fegetenv_register ();
+
+ *flagp = fpescr & SPEFSCR_ALL_EXCEPT;
+
+ /* Success. */
+ return 0;
+}
+
+#include <shlib-compat.h>
+#if SHLIB_COMPAT (libm, GLIBC_2_1, GLIBC_2_2)
+strong_alias (__fegetexceptflag, __old_fegetexceptflag)
+compat_symbol (libm, __old_fegetexceptflag, fegetexceptflag, GLIBC_2_1);
+#endif
+
+versioned_symbol (libm, __fegetexceptflag, fegetexceptflag, GLIBC_2_2);
diff --git a/libc/sysdeps/powerpc/powerpc32/e500/nofpu/fraiseexcept-soft.c b/libc/sysdeps/powerpc/powerpc32/e500/nofpu/fraiseexcept-soft.c
new file mode 100644
index 000000000..0aed72ff3
--- /dev/null
+++ b/libc/sysdeps/powerpc/powerpc32/e500/nofpu/fraiseexcept-soft.c
@@ -0,0 +1,28 @@
+/* Raise given exceptions. e500 version for use from soft-fp.
+ Copyright (C) 2004-2013 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+ Contributed by Aldy Hernandez <aldyh@redhat.com>, 2004.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <fenv_libc.h>
+#include <libc-symbols.h>
+
+int __feraiseexcept_soft (int);
+libc_hidden_proto (__feraiseexcept_soft)
+
+#define __FERAISEEXCEPT_INTERNAL __feraiseexcept_soft
+#include "spe-raise.c"
+libc_hidden_def (__feraiseexcept_soft)
diff --git a/libc/sysdeps/powerpc/powerpc32/e500/nofpu/fraiseexcpt.c b/libc/sysdeps/powerpc/powerpc32/e500/nofpu/fraiseexcpt.c
new file mode 100644
index 000000000..0eca9ffff
--- /dev/null
+++ b/libc/sysdeps/powerpc/powerpc32/e500/nofpu/fraiseexcpt.c
@@ -0,0 +1,40 @@
+/* Raise given exceptions. e500 version.
+ Copyright (C) 2004-2013 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <fenv_libc.h>
+
+#define __FERAISEEXCEPT_INTERNAL __feraiseexcept_spe
+#include "spe-raise.c"
+
+libm_hidden_def (__feraiseexcept_spe)
+
+#undef feraiseexcept
+int
+__feraiseexcept (int excepts)
+{
+ return __feraiseexcept_spe (__fexcepts_to_spe (excepts));
+}
+
+#include <shlib-compat.h>
+#if SHLIB_COMPAT (libm, GLIBC_2_1, GLIBC_2_2)
+strong_alias (__feraiseexcept, __old_feraiseexcept)
+compat_symbol (libm, __old_feraiseexcept, feraiseexcept, GLIBC_2_1);
+#endif
+
+libm_hidden_ver (__feraiseexcept, feraiseexcept)
+versioned_symbol (libm, __feraiseexcept, feraiseexcept, GLIBC_2_2);
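
__feraiseexcept is a thin wrapper: it converts the FE_* mask to SPEFSCR form and hands it to __feraiseexcept_spe from spe-raise.c, which forces operations that set the corresponding status bits. A hedged sketch of the observable behaviour (not part of the patch):

/* Usage sketch only.  */
#include <fenv.h>

int
raise_and_observe (void)
{
  feclearexcept (FE_ALL_EXCEPT);
  feraiseexcept (FE_DIVBYZERO);
  return fetestexcept (FE_DIVBYZERO) != 0;   /* expected: 1 */
}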
diff --git a/libc/sysdeps/powerpc/powerpc32/e500/nofpu/fsetexcptflg.c b/libc/sysdeps/powerpc/powerpc32/e500/nofpu/fsetexcptflg.c
new file mode 100644
index 000000000..43f2d19d1
--- /dev/null
+++ b/libc/sysdeps/powerpc/powerpc32/e500/nofpu/fsetexcptflg.c
@@ -0,0 +1,55 @@
+/* Set floating-point environment exception handling. e500 version.
+ Copyright (C) 1997-2013 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <fenv_libc.h>
+
+int
+__fesetexceptflag (const fexcept_t *flagp, int excepts)
+{
+ unsigned long old_spefscr, spefscr;
+ fexcept_t flag;
+ int excepts_spe = __fexcepts_to_spe (excepts);
+
+ /* Get the current state. */
+ old_spefscr = fegetenv_register ();
+
+ /* Ignore exceptions not listed in 'excepts'. */
+ flag = *flagp & excepts_spe;
+
+ /* Replace the exception status. */
+ spefscr = (old_spefscr & ~excepts_spe) | flag;
+
+ /* Store the new status word (along with the rest of the environment). */
+ fesetenv_register (spefscr);
+
+ /* If the state of the "invalid" or "underflow" flag has changed,
+ inform the kernel. */
+ if (((spefscr ^ old_spefscr) & (SPEFSCR_FINVS | SPEFSCR_FUNFS)) != 0)
+ __fe_note_change ();
+
+ /* Success. */
+ return 0;
+}
+
+#include <shlib-compat.h>
+#if SHLIB_COMPAT (libm, GLIBC_2_1, GLIBC_2_2)
+strong_alias (__fesetexceptflag, __old_fesetexceptflag)
+compat_symbol (libm, __old_fesetexceptflag, fesetexceptflag, GLIBC_2_1);
+#endif
+
+versioned_symbol (libm, __fesetexceptflag, fesetexceptflag, GLIBC_2_2);
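
__fesetexceptflag restores only the flags selected by 'excepts' and, as the comment notes, tells the kernel via __fe_note_change when the invalid or underflow flag changed. The caller-side pattern it supports, as a hedged sketch (not part of the patch):

/* Usage sketch only: save selected flags, run code that may clobber
   them, then put them back.  */
#include <fenv.h>

void
preserve_flags (void (*fn) (void))
{
  fexcept_t saved;
  fegetexceptflag (&saved, FE_INVALID | FE_UNDERFLOW);
  fn ();
  fesetexceptflag (&saved, FE_INVALID | FE_UNDERFLOW);
}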
diff --git a/libc/sysdeps/powerpc/powerpc32/e500/nofpu/ftestexcept.c b/libc/sysdeps/powerpc/powerpc32/e500/nofpu/ftestexcept.c
new file mode 100644
index 000000000..f4f547d5f
--- /dev/null
+++ b/libc/sysdeps/powerpc/powerpc32/e500/nofpu/ftestexcept.c
@@ -0,0 +1,31 @@
+/* Test exception in current environment. e500 version.
+ Copyright (C) 2004-2013 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <fenv_libc.h>
+
+int
+fetestexcept (int excepts)
+{
+ unsigned long f;
+
+ /* Get the current state. */
+ f = fegetenv_register ();
+
+ return __fexcepts_from_spe (f) & excepts;
+}
+libm_hidden_def (fetestexcept)
diff --git a/libc/sysdeps/powerpc/powerpc32/e500/nofpu/get-rounding-mode.h b/libc/sysdeps/powerpc/powerpc32/e500/nofpu/get-rounding-mode.h
new file mode 100644
index 000000000..117e7331e
--- /dev/null
+++ b/libc/sysdeps/powerpc/powerpc32/e500/nofpu/get-rounding-mode.h
@@ -0,0 +1,4 @@
+/* The generic version of get-rounding-mode.h using fpu_control.h, not
+ the one using the software rounding mode, is correct for e500. */
+
+#include <sysdeps/generic/get-rounding-mode.h>
diff --git a/libc/sysdeps/powerpc/powerpc32/e500/nofpu/s_fabsf.S b/libc/sysdeps/powerpc/powerpc32/e500/nofpu/s_fabsf.S
new file mode 100644
index 000000000..823f748ba
--- /dev/null
+++ b/libc/sysdeps/powerpc/powerpc32/e500/nofpu/s_fabsf.S
@@ -0,0 +1,27 @@
+/* Floating-point absolute value. e500 version.
+ Copyright (C) 2004-2013 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <sysdep.h>
+
+ENTRY (__fabsf)
+/* float [r3] fabsf (float [r3] x) ; */
+ efsabs r3,r3
+ blr
+END (__fabsf)
+
+weak_alias (__fabsf, fabsf)
diff --git a/libc/sysdeps/powerpc/powerpc32/e500/nofpu/spe-raise.c b/libc/sysdeps/powerpc/powerpc32/e500/nofpu/spe-raise.c
new file mode 100644
index 000000000..4394ddc7c
--- /dev/null
+++ b/libc/sysdeps/powerpc/powerpc32/e500/nofpu/spe-raise.c
@@ -0,0 +1,53 @@
+/* Raise given exceptions, given the SPEFSCR bits for those exceptions.
+ Copyright (C) 1997-2013 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <fenv_libc.h>
+
+int
+__FERAISEEXCEPT_INTERNAL (int excepts)
+{
+ unsigned long f;
+
+ f = fegetenv_register ();
+ f |= (excepts & SPEFSCR_ALL_EXCEPT);
+ fesetenv_register (f);
+
+ /* Force the operations that cause the exceptions. */
+ if ((SPEFSCR_FINVS & excepts) != 0)
+ /* 0 / 0 */
+ asm volatile ("efsdiv %0,%0,%1" : : "r" (0), "r" (0));
+
+ if ((SPEFSCR_FDBZS & excepts) != 0)
+ /* 1.0 / 0.0 */
+ asm volatile ("efsdiv %0,%0,%1" : : "r" (1.0F), "r" (0));
+
+ if ((SPEFSCR_FOVFS & excepts) != 0)
+ /* Largest normalized number plus itself. */
+ asm volatile ("efsadd %0,%0,%1" : : "r" (0x7f7fffff), "r" (0x7f7fffff));
+
+ if ((SPEFSCR_FUNFS & excepts) != 0)
+ /* Smallest normalized number times itself. */
+ asm volatile ("efsmul %0,%0,%1" : : "r" (0x800000), "r" (0x800000));
+
+ if ((SPEFSCR_FINXS & excepts) != 0)
+ /* Smallest normalized minus 1.0 raises the inexact flag. */
+ asm volatile ("efssub %0,%0,%1" : : "r" (0x00800000), "r" (1.0F));
+
+ /* Success. */
+ return 0;
+}
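
The operands above are integer bit patterns that the efs* instructions interpret as IEEE single-precision values: 0x7f7fffff is FLT_MAX and 0x00800000 is FLT_MIN (the smallest normalized float), so adding or multiplying them forces the overflow and underflow status bits. A hedged sketch confirming the correspondence the comments assume (not part of the patch):

/* Sketch only: the integer constants above are single-precision bit
   patterns (assumes IEEE binary32 float).  */
#include <assert.h>
#include <float.h>
#include <string.h>

static unsigned int
float_bits (float f)
{
  unsigned int u;
  memcpy (&u, &f, sizeof u);
  return u;
}

static void
check_constants (void)
{
  assert (float_bits (FLT_MAX) == 0x7f7fffff);   /* overflow operand */
  assert (float_bits (FLT_MIN) == 0x00800000);   /* underflow operand */
}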
diff --git a/libc/sysdeps/powerpc/powerpc32/fpu/__longjmp-common.S b/libc/sysdeps/powerpc/powerpc32/fpu/__longjmp-common.S
index 9d34cd916..d02aa5754 100644
--- a/libc/sysdeps/powerpc/powerpc32/fpu/__longjmp-common.S
+++ b/libc/sysdeps/powerpc/powerpc32/fpu/__longjmp-common.S
@@ -43,16 +43,16 @@ ENTRY (__longjmp)
# endif
mtlr r6
cfi_same_value (lr)
- lwz r5,RTLD_GLOBAL_RO_DL_HWCAP_OFFSET+4(r5)
+ lwz r5,RTLD_GLOBAL_RO_DL_HWCAP_OFFSET+LOWORD(r5)
# else
lwz r5,_dl_hwcap@got(r5)
mtlr r6
cfi_same_value (lr)
- lwz r5,4(r5)
+ lwz r5,LOWORD(r5)
# endif
# else
- lis r5,(_dl_hwcap+4)@ha
- lwz r5,(_dl_hwcap+4)@l(r5)
+ lis r5,(_dl_hwcap+LOWORD)@ha
+ lwz r5,(_dl_hwcap+LOWORD)@l(r5)
# endif
andis. r5,r5,(PPC_FEATURE_HAS_ALTIVEC >> 16)
beq L(no_vmx)
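
This hunk, like several below, replaces hard-coded "+4" offsets with HIWORD/LOWORD so the 32-bit halves of a 64-bit value (_dl_hwcap here) are picked up correctly on both endiannesses. The macros are presumably defined along these lines in the powerpc32 sysdep headers; a hedged C sketch of the idea (not part of the patch):

/* Sketch only: how HIWORD/LOWORD select the byte offset of each 32-bit
   half of a 64-bit value in memory.  */
#include <string.h>

#ifdef __LITTLE_ENDIAN__
# define HIWORD 4
# define LOWORD 0
#else
# define HIWORD 0
# define LOWORD 4
#endif

static unsigned int
low_half (const unsigned long long *p)   /* e.g. the low word of _dl_hwcap */
{
  unsigned int w;
  memcpy (&w, (const char *) p + LOWORD, sizeof w);
  return w;
}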
diff --git a/libc/sysdeps/powerpc/powerpc32/fpu/__longjmp.S b/libc/sysdeps/powerpc/powerpc32/fpu/__longjmp.S
index 96e50de37..27166c454 100644
--- a/libc/sysdeps/powerpc/powerpc32/fpu/__longjmp.S
+++ b/libc/sysdeps/powerpc/powerpc32/fpu/__longjmp.S
@@ -26,14 +26,14 @@
#else /* !NOT_IN_libc */
/* Build a versioned object for libc. */
-default_symbol_version (__vmx__longjmp,__longjmp,GLIBC_2.3.4);
+versioned_symbol (libc, __vmx__longjmp, __longjmp, GLIBC_2_3_4);
# define __longjmp __vmx__longjmp
# include "__longjmp-common.S"
# if defined SHARED && SHLIB_COMPAT (libc, GLIBC_2_0, GLIBC_2_3_4)
# define __NO_VMX__
# undef JB_SIZE
-symbol_version (__novmx__longjmp,__longjmp,GLIBC_2.0);
+compat_symbol (libc, __novmx__longjmp, __longjmp, GLIBC_2_0);
# undef __longjmp
# define __longjmp __novmx__longjmp
# include "__longjmp-common.S"
diff --git a/libc/sysdeps/powerpc/powerpc32/fpu/s_copysign.S b/libc/sysdeps/powerpc/powerpc32/fpu/s_copysign.S
index 840891f1c..1da24f492 100644
--- a/libc/sysdeps/powerpc/powerpc32/fpu/s_copysign.S
+++ b/libc/sysdeps/powerpc/powerpc32/fpu/s_copysign.S
@@ -29,7 +29,7 @@ ENTRY(__copysign)
stwu r1,-16(r1)
cfi_adjust_cfa_offset (16)
stfd fp2,8(r1)
- lwz r3,8(r1)
+ lwz r3,8+HIWORD(r1)
cmpwi r3,0
addi r1,r1,16
cfi_adjust_cfa_offset (-16)
diff --git a/libc/sysdeps/powerpc/powerpc32/fpu/s_copysignl.S b/libc/sysdeps/powerpc/powerpc32/fpu/s_copysignl.S
index 4ec8389b5..2ad6de273 100644
--- a/libc/sysdeps/powerpc/powerpc32/fpu/s_copysignl.S
+++ b/libc/sysdeps/powerpc/powerpc32/fpu/s_copysignl.S
@@ -30,7 +30,7 @@ ENTRY(__copysignl)
fmr fp0,fp1
fabs fp1,fp1
fcmpu cr7,fp0,fp1
- lwz r3,8(r1)
+ lwz r3,8+HIWORD(r1)
cmpwi cr6,r3,0
addi r1,r1,16
cfi_adjust_cfa_offset (-16)
diff --git a/libc/sysdeps/powerpc/powerpc32/fpu/s_lrint.S b/libc/sysdeps/powerpc/powerpc32/fpu/s_lrint.S
index 27881f8cc..249fda501 100644
--- a/libc/sysdeps/powerpc/powerpc32/fpu/s_lrint.S
+++ b/libc/sysdeps/powerpc/powerpc32/fpu/s_lrint.S
@@ -24,10 +24,10 @@ ENTRY (__lrint)
stwu r1,-16(r1)
fctiw fp13,fp1
stfd fp13,8(r1)
- nop /* Insure the following load is in a different dispatch group */
+ nop /* Ensure the following load is in a different dispatch group */
nop /* to avoid pipe stall on POWER4&5. */
nop
- lwz r3,12(r1)
+ lwz r3,8+LOWORD(r1)
addi r1,r1,16
blr
END (__lrint)
diff --git a/libc/sysdeps/powerpc/powerpc32/fpu/s_lround.S b/libc/sysdeps/powerpc/powerpc32/fpu/s_lround.S
index 92dc3787d..6309f864b 100644
--- a/libc/sysdeps/powerpc/powerpc32/fpu/s_lround.S
+++ b/libc/sysdeps/powerpc/powerpc32/fpu/s_lround.S
@@ -67,7 +67,7 @@ ENTRY (__lround)
nop /* Ensure the following load is in a different dispatch */
nop /* group to avoid pipe stall on POWER4&5. */
nop
- lwz r3,12(r1) /* Load return as integer. */
+ lwz r3,8+LOWORD(r1) /* Load return as integer. */
.Lout:
addi r1,r1,16
blr
diff --git a/libc/sysdeps/powerpc/powerpc32/fpu/s_roundf.S b/libc/sysdeps/powerpc/powerpc32/fpu/s_roundf.S
index 2ed9ca7b4..8cff1563a 100644
--- a/libc/sysdeps/powerpc/powerpc32/fpu/s_roundf.S
+++ b/libc/sysdeps/powerpc/powerpc32/fpu/s_roundf.S
@@ -19,7 +19,7 @@
#include <sysdep.h>
.section .rodata.cst8,"aM",@progbits,8
- .align 2
+ .align 3
.LC0: /* 2**23 */
.long 0x4b000000
.LC1: /* 0.5 */
@@ -60,7 +60,6 @@ ENTRY (__roundf )
#ifdef SHARED
lfs fp10,.LC1-.LC0(r9)
#else
- lis r9,.LC1@ha
lfs fp10,.LC1@l(r9)
#endif
ble- cr6,.L4
diff --git a/libc/sysdeps/powerpc/powerpc32/fpu/setjmp-common.S b/libc/sysdeps/powerpc/powerpc32/fpu/setjmp-common.S
index 46ea2b00f..f3244060e 100644
--- a/libc/sysdeps/powerpc/powerpc32/fpu/setjmp-common.S
+++ b/libc/sysdeps/powerpc/powerpc32/fpu/setjmp-common.S
@@ -94,14 +94,14 @@ ENTRY (__sigsetjmp)
# else
lwz r5,_rtld_global_ro@got(r5)
# endif
- lwz r5,RTLD_GLOBAL_RO_DL_HWCAP_OFFSET+4(r5)
+ lwz r5,RTLD_GLOBAL_RO_DL_HWCAP_OFFSET+LOWORD(r5)
# else
lwz r5,_dl_hwcap@got(r5)
- lwz r5,4(r5)
+ lwz r5,LOWORD(r5)
# endif
# else
- lis r6,(_dl_hwcap+4)@ha
- lwz r5,(_dl_hwcap+4)@l(r6)
+ lis r6,(_dl_hwcap+LOWORD)@ha
+ lwz r5,(_dl_hwcap+LOWORD)@l(r6)
# endif
andis. r5,r5,(PPC_FEATURE_HAS_ALTIVEC >> 16)
beq L(no_vmx)
@@ -111,44 +111,43 @@ ENTRY (__sigsetjmp)
stw r0,((JB_VRSAVE)*4)(3)
addi r6,r5,16
beq+ L(aligned_save_vmx)
- lvsr v0,0,r5
- vspltisb v1,-1 /* set v1 to all 1's */
- vspltisb v2,0 /* set v2 to all 0's */
- vperm v3,v2,v1,v0 /* v3 contains shift mask with num all 1 bytes on left = misalignment */
+ lvsr v0,0,r5
+ lvsl v1,0,r5
+ addi r6,r5,-16
- /* Special case for v20 we need to preserve what is in save area below v20 before obliterating it */
- lvx v5,0,r5
- vperm v20,v20,v20,v0
- vsel v5,v5,v20,v3
- vsel v20,v20,v2,v3
- stvx v5,0,r5
+# define save_misaligned_vmx(savevr,prevvr,shiftvr,tmpvr,savegpr,addgpr) \
+ addi addgpr,addgpr,32; \
+ vperm tmpvr,prevvr,savevr,shiftvr; \
+ stvx tmpvr,0,savegpr
-#define save_2vmx_partial(savevr,prev_savevr,hivr,shiftvr,maskvr,savegpr,addgpr) \
- addi addgpr,addgpr,32; \
- vperm savevr,savevr,savevr,shiftvr; \
- vsel hivr,prev_savevr,savevr,maskvr; \
- stvx hivr,0,savegpr;
+ /*
+ * We have to be careful not to corrupt the data below v20 and
+ * above v31. To keep things simple we just rotate both ends in
+ * the opposite direction to our main permute so we can use
+ * the common macro.
+ */
- save_2vmx_partial(v21,v20,v5,v0,v3,r6,r5)
- save_2vmx_partial(v22,v21,v5,v0,v3,r5,r6)
- save_2vmx_partial(v23,v22,v5,v0,v3,r6,r5)
- save_2vmx_partial(v24,v23,v5,v0,v3,r5,r6)
- save_2vmx_partial(v25,v24,v5,v0,v3,r6,r5)
- save_2vmx_partial(v26,v25,v5,v0,v3,r5,r6)
- save_2vmx_partial(v27,v26,v5,v0,v3,r6,r5)
- save_2vmx_partial(v28,v27,v5,v0,v3,r5,r6)
- save_2vmx_partial(v29,v28,v5,v0,v3,r6,r5)
- save_2vmx_partial(v30,v29,v5,v0,v3,r5,r6)
+ /* load and rotate data below v20 */
+ lvx v2,0,r5
+ vperm v2,v2,v2,v1
+ save_misaligned_vmx(v20,v2,v0,v3,r5,r6)
+ save_misaligned_vmx(v21,v20,v0,v3,r6,r5)
+ save_misaligned_vmx(v22,v21,v0,v3,r5,r6)
+ save_misaligned_vmx(v23,v22,v0,v3,r6,r5)
+ save_misaligned_vmx(v24,v23,v0,v3,r5,r6)
+ save_misaligned_vmx(v25,v24,v0,v3,r6,r5)
+ save_misaligned_vmx(v26,v25,v0,v3,r5,r6)
+ save_misaligned_vmx(v27,v26,v0,v3,r6,r5)
+ save_misaligned_vmx(v28,v27,v0,v3,r5,r6)
+ save_misaligned_vmx(v29,v28,v0,v3,r6,r5)
+ save_misaligned_vmx(v30,v29,v0,v3,r5,r6)
+ save_misaligned_vmx(v31,v30,v0,v3,r6,r5)
+ /* load and rotate data above v31 */
+ lvx v2,0,r6
+ vperm v2,v2,v2,v1
+ save_misaligned_vmx(v2,v31,v0,v3,r5,r6)
- /* Special case for r31 we need to preserve what is in save area above v31 before obliterating it */
- addi r5,r5,32
- vperm v31,v31,v31,v0
- lvx v4,0,r5
- vsel v5,v30,v31,v3
- stvx v5,0,r6
- vsel v4,v31,v4,v3
- stvx v4,0,r5
b L(no_vmx)
L(aligned_save_vmx):
diff --git a/libc/sysdeps/powerpc/powerpc32/fpu/setjmp.S b/libc/sysdeps/powerpc/powerpc32/fpu/setjmp.S
index 60cd35052..92acff1e6 100644
--- a/libc/sysdeps/powerpc/powerpc32/fpu/setjmp.S
+++ b/libc/sysdeps/powerpc/powerpc32/fpu/setjmp.S
@@ -26,7 +26,7 @@
#else /* !NOT_IN_libc */
/* Build a versioned object for libc. */
-default_symbol_version (__vmx__sigsetjmp,__sigsetjmp,GLIBC_2.3.4)
+versioned_symbol (libc, __vmx__sigsetjmp, __sigsetjmp, GLIBC_2_3_4)
# define __sigsetjmp __vmx__sigsetjmp
# define __sigjmp_save __vmx__sigjmp_save
# include "setjmp-common.S"
@@ -36,7 +36,7 @@ default_symbol_version (__vmx__sigsetjmp,__sigsetjmp,GLIBC_2.3.4)
# undef __sigsetjmp
# undef __sigjmp_save
# undef JB_SIZE
-symbol_version (__novmx__sigsetjmp,__sigsetjmp,GLIBC_2.0)
+compat_symbol (libc, __novmx__sigsetjmp, __sigsetjmp, GLIBC_2_0)
# define __sigsetjmp __novmx__sigsetjmp
# define __sigjmp_save __novmx__sigjmp_save
# include "setjmp-common.S"
diff --git a/libc/sysdeps/powerpc/powerpc32/mcount.c b/libc/sysdeps/powerpc/powerpc32/mcount.c
index 0476bf61d..d8c063222 100644
--- a/libc/sysdeps/powerpc/powerpc32/mcount.c
+++ b/libc/sysdeps/powerpc/powerpc32/mcount.c
@@ -9,7 +9,7 @@
/* __mcount_internal was added in glibc 2.15 with version GLIBC_PRIVATE,
but it should have been put in version GLIBC_2.15. Mark the
GLIBC_PRIVATE version obsolete and add it to GLIBC_2.16 instead. */
-default_symbol_version (___mcount_internal, __mcount_internal, GLIBC_2.16);
+versioned_symbol (libc, ___mcount_internal, __mcount_internal, GLIBC_2_16);
#if SHLIB_COMPAT (libc, GLIBC_2_15, GLIBC_2_16)
strong_alias (___mcount_internal, ___mcount_internal_private);
diff --git a/libc/sysdeps/powerpc/powerpc32/power4/fpu/s_llrint.S b/libc/sysdeps/powerpc/powerpc32/power4/fpu/s_llrint.S
index 55b2850fd..e7a88feb4 100644
--- a/libc/sysdeps/powerpc/powerpc32/power4/fpu/s_llrint.S
+++ b/libc/sysdeps/powerpc/powerpc32/power4/fpu/s_llrint.S
@@ -29,8 +29,8 @@ ENTRY (__llrint)
nop /* Ensure the following load is in a different dispatch group */
nop /* to avoid pipe stall on POWER4&5. */
nop
- lwz r3,8(r1)
- lwz r4,12(r1)
+ lwz r3,8+HIWORD(r1)
+ lwz r4,8+LOWORD(r1)
addi r1,r1,16
blr
END (__llrint)
diff --git a/libc/sysdeps/powerpc/powerpc32/power4/fpu/s_llrintf.S b/libc/sysdeps/powerpc/powerpc32/power4/fpu/s_llrintf.S
index cc80fcb02..da24ad38d 100644
--- a/libc/sysdeps/powerpc/powerpc32/power4/fpu/s_llrintf.S
+++ b/libc/sysdeps/powerpc/powerpc32/power4/fpu/s_llrintf.S
@@ -28,8 +28,8 @@ ENTRY (__llrintf)
nop /* Ensure the following load is in a different dispatch group */
nop /* to avoid pipe stall on POWER4&5. */
nop
- lwz r3,8(r1)
- lwz r4,12(r1)
+ lwz r3,8+HIWORD(r1)
+ lwz r4,8+LOWORD(r1)
addi r1,r1,16
blr
END (__llrintf)
diff --git a/libc/sysdeps/powerpc/powerpc32/power4/fpu/s_llround.S b/libc/sysdeps/powerpc/powerpc32/power4/fpu/s_llround.S
index 631180f07..7246ca4d1 100644
--- a/libc/sysdeps/powerpc/powerpc32/power4/fpu/s_llround.S
+++ b/libc/sysdeps/powerpc/powerpc32/power4/fpu/s_llround.S
@@ -19,12 +19,10 @@
#include <sysdep.h>
#include <math_ldbl_opt.h>
- .section .rodata.cst12,"aM",@progbits,12
+ .section .rodata.cst8,"aM",@progbits,8
.align 3
- .LC0: /* 0x1.0000000000000p+52 == 2^52 */
- .long 0x43300000
- .long 0x00000000
- .long 0x3f000000 /* Use this for 0.5 */
+ .LC0: .long (52+127)<<23 /* 0x1p+52 */
+ .long (-1+127)<<23 /* 0.5 */
.section ".text"
@@ -57,12 +55,12 @@ ENTRY (__llround)
addi r9,r9,.LC0-got_label@l
mtlr r11
cfi_same_value (lr)
- lfd fp9,0(r9)
- lfs fp10,8(r9)
+ lfs fp9,0(r9)
+ lfs fp10,4(r9)
#else
lis r9,.LC0@ha
- lfd fp9,.LC0@l(r9) /* Load 2^52 into fpr9. */
- lfs fp10,.LC0@l+8(r9) /* Load 0.5 into fpr10. */
+ lfs fp9,.LC0@l(r9) /* Load 2^52 into fpr9. */
+ lfs fp10,.LC0@l+4(r9) /* Load 0.5 into fpr10. */
#endif
fabs fp2,fp1 /* Get the absolute value of x. */
fsub fp12,fp10,fp10 /* Compute 0.0 into fpr12. */
@@ -80,8 +78,8 @@ ENTRY (__llround)
nop
nop
nop
- lwz r4,12(r1) /* Load return as integer. */
- lwz r3,8(r1)
+ lwz r3,8+HIWORD(r1) /* Load return as integer. */
+ lwz r4,8+LOWORD(r1)
.Lout:
addi r1,r1,16
blr
diff --git a/libc/sysdeps/powerpc/powerpc32/power4/hp-timing.h b/libc/sysdeps/powerpc/powerpc32/power4/hp-timing.h
index 7d6c96e9e..4e42374ea 100644
--- a/libc/sysdeps/powerpc/powerpc32/power4/hp-timing.h
+++ b/libc/sysdeps/powerpc/powerpc32/power4/hp-timing.h
@@ -87,18 +87,15 @@ typedef unsigned long long int hp_timing_t;
#define HP_TIMING_NOW(Var) \
do { \
- union { long long ll; long ii[2]; } _var; \
- long tmp; \
- __asm__ __volatile__ ( \
- "1: mfspr %0,269;" \
- " mfspr %1,268;" \
- " mfspr %2,269;" \
- " cmpw %0,%2;" \
- " bne 1b;" \
- : "=r" (_var.ii[0]), "=r" (_var.ii[1]) , "=r" (tmp) \
- : : "cr0" \
- ); \
- Var = _var.ll; \
+ unsigned int hi, lo, tmp; \
+ __asm__ __volatile__ ("1: mfspr %0,269;" \
+ " mfspr %1,268;" \
+ " mfspr %2,269;" \
+ " cmpw %0,%2;" \
+ " bne 1b;" \
+ : "=&r" (hi), "=&r" (lo), "=&r" (tmp) \
+ : : "cr0"); \
+ Var = ((hp_timing_t) hi << 32) | lo; \
} while (0)
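
The rewritten HP_TIMING_NOW keeps the same retry loop (SPR 269 is the time base upper half, SPR 268 the lower half) but builds the 64-bit result from two explicit 32-bit temporaries instead of a union, and marks the asm outputs early-clobber. The loop in C form, as a hedged sketch (not part of the patch); read_tbu/read_tbl are hypothetical stand-ins for the two mfspr reads:

/* Sketch only: the torn-read retry loop above.  read_tbu()/read_tbl()
   are hypothetical wrappers for mfspr 269/268, not real glibc calls.  */
extern unsigned int read_tbu (void);
extern unsigned int read_tbl (void);

static unsigned long long
read_timebase (void)
{
  unsigned int hi, lo, again;
  do
    {
      hi = read_tbu ();
      lo = read_tbl ();
      again = read_tbu ();   /* re-read: TBL may have carried into TBU */
    }
  while (hi != again);
  return ((unsigned long long) hi << 32) | lo;
}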
diff --git a/libc/sysdeps/powerpc/powerpc32/power4/memcmp.S b/libc/sysdeps/powerpc/powerpc32/power4/memcmp.S
index 9a455a3c6..35e162667 100644
--- a/libc/sysdeps/powerpc/powerpc32/power4/memcmp.S
+++ b/libc/sysdeps/powerpc/powerpc32/power4/memcmp.S
@@ -1,4 +1,4 @@
-/* Optimized strcmp implementation for PowerPC64.
+/* Optimized strcmp implementation for PowerPC32.
Copyright (C) 2003-2013 Free Software Foundation, Inc.
This file is part of the GNU C Library.
@@ -18,13 +18,14 @@
#include <sysdep.h>
-/* int [r3] memcmp (const char *s1 [r3], const char *s2 [r4], size_t size [r5]) */
+/* int [r3] memcmp (const char *s1 [r3],
+ const char *s2 [r4],
+ size_t size [r5]) */
.machine power4
EALIGN (memcmp, 4, 0)
CALL_MCOUNT
-#define rTMP r0
#define rRTN r3
#define rSTR1 r3 /* first string arg */
#define rSTR2 r4 /* second string arg */
@@ -35,33 +36,32 @@ EALIGN (memcmp, 4, 0)
#define rWORD4 r9 /* next word in s2 */
#define rWORD5 r10 /* next word in s1 */
#define rWORD6 r11 /* next word in s2 */
-#define rBITDIF r12 /* bits that differ in s1 & s2 words */
#define rWORD7 r30 /* next word in s1 */
#define rWORD8 r31 /* next word in s2 */
- xor rTMP, rSTR2, rSTR1
+ xor r0, rSTR2, rSTR1
cmplwi cr6, rN, 0
cmplwi cr1, rN, 12
- clrlwi. rTMP, rTMP, 30
- clrlwi rBITDIF, rSTR1, 30
- cmplwi cr5, rBITDIF, 0
+ clrlwi. r0, r0, 30
+ clrlwi r12, rSTR1, 30
+ cmplwi cr5, r12, 0
beq- cr6, L(zeroLength)
- dcbt 0,rSTR1
- dcbt 0,rSTR2
+ dcbt 0, rSTR1
+ dcbt 0, rSTR2
/* If less than 8 bytes or not aligned, use the unaligned
byte loop. */
blt cr1, L(bytealigned)
- stwu 1,-64(1)
+ stwu 1, -64(r1)
cfi_adjust_cfa_offset(64)
- stw r31,48(1)
- cfi_offset(31,(48-64))
- stw r30,44(1)
- cfi_offset(30,(44-64))
+ stw rWORD8, 48(r1)
+ cfi_offset(rWORD8, (48-64))
+ stw rWORD7, 44(r1)
+ cfi_offset(rWORD7, (44-64))
bne L(unaligned)
/* At this point we know both strings have the same alignment and the
- compare length is at least 8 bytes. rBITDIF contains the low order
+ compare length is at least 8 bytes. r12 contains the low order
2 bits of rSTR1 and cr5 contains the result of the logical compare
- of rBITDIF to 0. If rBITDIF == 0 then we are already word
+ of r12 to 0. If r12 == 0 then we are already word
aligned and can perform the word aligned loop.
Otherwise we know the two strings have the same alignment (but not
@@ -70,74 +70,95 @@ EALIGN (memcmp, 4, 0)
eliminate bits preceding the first byte. Since we want to join the
normal (word aligned) compare loop, starting at the second word,
we need to adjust the length (rN) and special case the loop
- versioning for the first word. This insures that the loop count is
+ versioning for the first word. This ensures that the loop count is
correct and the first word (shifted) is in the expected register pair. */
- .align 4
+ .align 4
L(samealignment):
clrrwi rSTR1, rSTR1, 2
clrrwi rSTR2, rSTR2, 2
beq cr5, L(Waligned)
- add rN, rN, rBITDIF
- slwi r11, rBITDIF, 3
- srwi rTMP, rN, 4 /* Divide by 16 */
- andi. rBITDIF, rN, 12 /* Get the word remainder */
+ add rN, rN, r12
+ slwi rWORD6, r12, 3
+ srwi r0, rN, 4 /* Divide by 16 */
+ andi. r12, rN, 12 /* Get the word remainder */
+#ifdef __LITTLE_ENDIAN__
+ lwbrx rWORD1, 0, rSTR1
+ lwbrx rWORD2, 0, rSTR2
+ addi rSTR1, rSTR1, 4
+ addi rSTR2, rSTR2, 4
+#else
lwz rWORD1, 0(rSTR1)
lwz rWORD2, 0(rSTR2)
- cmplwi cr1, rBITDIF, 8
+#endif
+ cmplwi cr1, r12, 8
cmplwi cr7, rN, 16
clrlwi rN, rN, 30
beq L(dPs4)
- mtctr rTMP /* Power4 wants mtctr 1st in dispatch group */
+ mtctr r0 /* Power4 wants mtctr 1st in dispatch group */
bgt cr1, L(dPs3)
beq cr1, L(dPs2)
/* Remainder is 4 */
- .align 3
+ .align 3
L(dsP1):
- slw rWORD5, rWORD1, r11
- slw rWORD6, rWORD2, r11
+ slw rWORD5, rWORD1, rWORD6
+ slw rWORD6, rWORD2, rWORD6
cmplw cr5, rWORD5, rWORD6
blt cr7, L(dP1x)
/* Do something useful in this cycle since we have to branch anyway. */
+#ifdef __LITTLE_ENDIAN__
+ lwbrx rWORD1, 0, rSTR1
+ lwbrx rWORD2, 0, rSTR2
+ addi rSTR1, rSTR1, 4
+ addi rSTR2, rSTR2, 4
+#else
lwz rWORD1, 4(rSTR1)
lwz rWORD2, 4(rSTR2)
- cmplw cr0, rWORD1, rWORD2
+#endif
+ cmplw cr7, rWORD1, rWORD2
b L(dP1e)
/* Remainder is 8 */
- .align 4
+ .align 4
L(dPs2):
- slw rWORD5, rWORD1, r11
- slw rWORD6, rWORD2, r11
+ slw rWORD5, rWORD1, rWORD6
+ slw rWORD6, rWORD2, rWORD6
cmplw cr6, rWORD5, rWORD6
blt cr7, L(dP2x)
/* Do something useful in this cycle since we have to branch anyway. */
+#ifdef __LITTLE_ENDIAN__
+ lwbrx rWORD7, 0, rSTR1
+ lwbrx rWORD8, 0, rSTR2
+ addi rSTR1, rSTR1, 4
+ addi rSTR2, rSTR2, 4
+#else
lwz rWORD7, 4(rSTR1)
lwz rWORD8, 4(rSTR2)
+#endif
cmplw cr5, rWORD7, rWORD8
b L(dP2e)
/* Remainder is 12 */
- .align 4
+ .align 4
L(dPs3):
- slw rWORD3, rWORD1, r11
- slw rWORD4, rWORD2, r11
+ slw rWORD3, rWORD1, rWORD6
+ slw rWORD4, rWORD2, rWORD6
cmplw cr1, rWORD3, rWORD4
b L(dP3e)
/* Count is a multiple of 16, remainder is 0 */
- .align 4
+ .align 4
L(dPs4):
- mtctr rTMP /* Power4 wants mtctr 1st in dispatch group */
- slw rWORD1, rWORD1, r11
- slw rWORD2, rWORD2, r11
- cmplw cr0, rWORD1, rWORD2
+ mtctr r0 /* Power4 wants mtctr 1st in dispatch group */
+ slw rWORD1, rWORD1, rWORD6
+ slw rWORD2, rWORD2, rWORD6
+ cmplw cr7, rWORD1, rWORD2
b L(dP4e)
/* At this point we know both strings are word aligned and the
compare length is at least 8 bytes. */
- .align 4
+ .align 4
L(Waligned):
- andi. rBITDIF, rN, 12 /* Get the word remainder */
- srwi rTMP, rN, 4 /* Divide by 16 */
- cmplwi cr1, rBITDIF, 8
+ andi. r12, rN, 12 /* Get the word remainder */
+ srwi r0, rN, 4 /* Divide by 16 */
+ cmplwi cr1, r12, 8
cmplwi cr7, rN, 16
clrlwi rN, rN, 30
beq L(dP4)
@@ -145,177 +166,352 @@ L(Waligned):
beq cr1, L(dP2)
/* Remainder is 4 */
- .align 4
+ .align 4
L(dP1):
- mtctr rTMP /* Power4 wants mtctr 1st in dispatch group */
+ mtctr r0 /* Power4 wants mtctr 1st in dispatch group */
/* Normally we'd use rWORD7/rWORD8 here, but since we might exit early
(8-15 byte compare), we want to use only volatile registers. This
means we can avoid restoring non-volatile registers since we did not
change any on the early exit path. The key here is the non-early
exit path only cares about the condition code (cr5), not about which
register pair was used. */
+#ifdef __LITTLE_ENDIAN__
+ lwbrx rWORD5, 0, rSTR1
+ lwbrx rWORD6, 0, rSTR2
+ addi rSTR1, rSTR1, 4
+ addi rSTR2, rSTR2, 4
+#else
lwz rWORD5, 0(rSTR1)
lwz rWORD6, 0(rSTR2)
+#endif
cmplw cr5, rWORD5, rWORD6
blt cr7, L(dP1x)
+#ifdef __LITTLE_ENDIAN__
+ lwbrx rWORD1, 0, rSTR1
+ lwbrx rWORD2, 0, rSTR2
+ addi rSTR1, rSTR1, 4
+ addi rSTR2, rSTR2, 4
+#else
lwz rWORD1, 4(rSTR1)
lwz rWORD2, 4(rSTR2)
- cmplw cr0, rWORD1, rWORD2
+#endif
+ cmplw cr7, rWORD1, rWORD2
L(dP1e):
+#ifdef __LITTLE_ENDIAN__
+ lwbrx rWORD3, 0, rSTR1
+ lwbrx rWORD4, 0, rSTR2
+ addi rSTR1, rSTR1, 4
+ addi rSTR2, rSTR2, 4
+#else
lwz rWORD3, 8(rSTR1)
lwz rWORD4, 8(rSTR2)
+#endif
cmplw cr1, rWORD3, rWORD4
+#ifdef __LITTLE_ENDIAN__
+ lwbrx rWORD5, 0, rSTR1
+ lwbrx rWORD6, 0, rSTR2
+ addi rSTR1, rSTR1, 4
+ addi rSTR2, rSTR2, 4
+#else
lwz rWORD5, 12(rSTR1)
lwz rWORD6, 12(rSTR2)
+#endif
cmplw cr6, rWORD5, rWORD6
- bne cr5, L(dLcr5)
- bne cr0, L(dLcr0)
+ bne cr5, L(dLcr5x)
+ bne cr7, L(dLcr7x)
+#ifdef __LITTLE_ENDIAN__
+ lwbrx rWORD7, 0, rSTR1
+ lwbrx rWORD8, 0, rSTR2
+ addi rSTR1, rSTR1, 4
+ addi rSTR2, rSTR2, 4
+#else
lwzu rWORD7, 16(rSTR1)
lwzu rWORD8, 16(rSTR2)
+#endif
bne cr1, L(dLcr1)
cmplw cr5, rWORD7, rWORD8
bdnz L(dLoop)
bne cr6, L(dLcr6)
- lwz r30,44(1)
- lwz r31,48(1)
- .align 3
+ lwz rWORD7, 44(r1)
+ lwz rWORD8, 48(r1)
+ .align 3
L(dP1x):
slwi. r12, rN, 3
- bne cr5, L(dLcr5)
+ bne cr5, L(dLcr5x)
subfic rN, r12, 32 /* Shift count is 32 - (rN * 8). */
- lwz 1,0(1)
+ addi 1, 1, 64
+ cfi_adjust_cfa_offset(-64)
bne L(d00)
li rRTN, 0
blr
/* Remainder is 8 */
- .align 4
+ .align 4
+ cfi_adjust_cfa_offset(64)
L(dP2):
- mtctr rTMP /* Power4 wants mtctr 1st in dispatch group */
+ mtctr r0 /* Power4 wants mtctr 1st in dispatch group */
+#ifdef __LITTLE_ENDIAN__
+ lwbrx rWORD5, 0, rSTR1
+ lwbrx rWORD6, 0, rSTR2
+ addi rSTR1, rSTR1, 4
+ addi rSTR2, rSTR2, 4
+#else
lwz rWORD5, 0(rSTR1)
lwz rWORD6, 0(rSTR2)
+#endif
cmplw cr6, rWORD5, rWORD6
blt cr7, L(dP2x)
+#ifdef __LITTLE_ENDIAN__
+ lwbrx rWORD7, 0, rSTR1
+ lwbrx rWORD8, 0, rSTR2
+ addi rSTR1, rSTR1, 4
+ addi rSTR2, rSTR2, 4
+#else
lwz rWORD7, 4(rSTR1)
lwz rWORD8, 4(rSTR2)
+#endif
cmplw cr5, rWORD7, rWORD8
L(dP2e):
+#ifdef __LITTLE_ENDIAN__
+ lwbrx rWORD1, 0, rSTR1
+ lwbrx rWORD2, 0, rSTR2
+ addi rSTR1, rSTR1, 4
+ addi rSTR2, rSTR2, 4
+#else
lwz rWORD1, 8(rSTR1)
lwz rWORD2, 8(rSTR2)
- cmplw cr0, rWORD1, rWORD2
+#endif
+ cmplw cr7, rWORD1, rWORD2
+#ifdef __LITTLE_ENDIAN__
+ lwbrx rWORD3, 0, rSTR1
+ lwbrx rWORD4, 0, rSTR2
+ addi rSTR1, rSTR1, 4
+ addi rSTR2, rSTR2, 4
+#else
lwz rWORD3, 12(rSTR1)
lwz rWORD4, 12(rSTR2)
+#endif
cmplw cr1, rWORD3, rWORD4
+#ifndef __LITTLE_ENDIAN__
addi rSTR1, rSTR1, 4
addi rSTR2, rSTR2, 4
+#endif
bne cr6, L(dLcr6)
bne cr5, L(dLcr5)
b L(dLoop2)
/* Again we are on an early exit path (16-23 byte compare), we want to
only use volatile registers and avoid restoring non-volatile
registers. */
- .align 4
+ .align 4
L(dP2x):
+#ifdef __LITTLE_ENDIAN__
+ lwbrx rWORD3, 0, rSTR1
+ lwbrx rWORD4, 0, rSTR2
+ addi rSTR1, rSTR1, 4
+ addi rSTR2, rSTR2, 4
+#else
lwz rWORD3, 4(rSTR1)
lwz rWORD4, 4(rSTR2)
- cmplw cr5, rWORD3, rWORD4
+#endif
+ cmplw cr1, rWORD3, rWORD4
slwi. r12, rN, 3
- bne cr6, L(dLcr6)
+ bne cr6, L(dLcr6x)
+#ifndef __LITTLE_ENDIAN__
addi rSTR1, rSTR1, 4
addi rSTR2, rSTR2, 4
- bne cr5, L(dLcr5)
+#endif
+ bne cr1, L(dLcr1x)
subfic rN, r12, 32 /* Shift count is 32 - (rN * 8). */
- lwz 1,0(1)
+ addi 1, 1, 64
+ cfi_adjust_cfa_offset(-64)
bne L(d00)
li rRTN, 0
blr
/* Remainder is 12 */
- .align 4
+ .align 4
+ cfi_adjust_cfa_offset(64)
L(dP3):
- mtctr rTMP /* Power4 wants mtctr 1st in dispatch group */
+ mtctr r0 /* Power4 wants mtctr 1st in dispatch group */
+#ifdef __LITTLE_ENDIAN__
+ lwbrx rWORD3, 0, rSTR1
+ lwbrx rWORD4, 0, rSTR2
+ addi rSTR1, rSTR1, 4
+ addi rSTR2, rSTR2, 4
+#else
lwz rWORD3, 0(rSTR1)
lwz rWORD4, 0(rSTR2)
+#endif
cmplw cr1, rWORD3, rWORD4
L(dP3e):
+#ifdef __LITTLE_ENDIAN__
+ lwbrx rWORD5, 0, rSTR1
+ lwbrx rWORD6, 0, rSTR2
+ addi rSTR1, rSTR1, 4
+ addi rSTR2, rSTR2, 4
+#else
lwz rWORD5, 4(rSTR1)
lwz rWORD6, 4(rSTR2)
+#endif
cmplw cr6, rWORD5, rWORD6
blt cr7, L(dP3x)
+#ifdef __LITTLE_ENDIAN__
+ lwbrx rWORD7, 0, rSTR1
+ lwbrx rWORD8, 0, rSTR2
+ addi rSTR1, rSTR1, 4
+ addi rSTR2, rSTR2, 4
+#else
lwz rWORD7, 8(rSTR1)
lwz rWORD8, 8(rSTR2)
+#endif
cmplw cr5, rWORD7, rWORD8
+#ifdef __LITTLE_ENDIAN__
+ lwbrx rWORD1, 0, rSTR1
+ lwbrx rWORD2, 0, rSTR2
+ addi rSTR1, rSTR1, 4
+ addi rSTR2, rSTR2, 4
+#else
lwz rWORD1, 12(rSTR1)
lwz rWORD2, 12(rSTR2)
- cmplw cr0, rWORD1, rWORD2
+#endif
+ cmplw cr7, rWORD1, rWORD2
+#ifndef __LITTLE_ENDIAN__
addi rSTR1, rSTR1, 8
addi rSTR2, rSTR2, 8
+#endif
bne cr1, L(dLcr1)
bne cr6, L(dLcr6)
b L(dLoop1)
/* Again we are on an early exit path (24-31 byte compare), we want to
only use volatile registers and avoid restoring non-volatile
registers. */
- .align 4
+ .align 4
L(dP3x):
+#ifdef __LITTLE_ENDIAN__
+ lwbrx rWORD1, 0, rSTR1
+ lwbrx rWORD2, 0, rSTR2
+ addi rSTR1, rSTR1, 4
+ addi rSTR2, rSTR2, 4
+#else
lwz rWORD1, 8(rSTR1)
lwz rWORD2, 8(rSTR2)
- cmplw cr5, rWORD1, rWORD2
+#endif
+ cmplw cr7, rWORD1, rWORD2
slwi. r12, rN, 3
- bne cr1, L(dLcr1)
+ bne cr1, L(dLcr1x)
+#ifndef __LITTLE_ENDIAN__
addi rSTR1, rSTR1, 8
addi rSTR2, rSTR2, 8
- bne cr6, L(dLcr6)
+#endif
+ bne cr6, L(dLcr6x)
subfic rN, r12, 32 /* Shift count is 32 - (rN * 8). */
- bne cr5, L(dLcr5)
- lwz 1,0(1)
+ bne cr7, L(dLcr7x)
+ addi 1, 1, 64
+ cfi_adjust_cfa_offset(-64)
bne L(d00)
li rRTN, 0
blr
/* Count is a multiple of 16, remainder is 0 */
- .align 4
+ .align 4
+ cfi_adjust_cfa_offset(64)
L(dP4):
- mtctr rTMP /* Power4 wants mtctr 1st in dispatch group */
+ mtctr r0 /* Power4 wants mtctr 1st in dispatch group */
+#ifdef __LITTLE_ENDIAN__
+ lwbrx rWORD1, 0, rSTR1
+ lwbrx rWORD2, 0, rSTR2
+ addi rSTR1, rSTR1, 4
+ addi rSTR2, rSTR2, 4
+#else
lwz rWORD1, 0(rSTR1)
lwz rWORD2, 0(rSTR2)
- cmplw cr0, rWORD1, rWORD2
+#endif
+ cmplw cr7, rWORD1, rWORD2
L(dP4e):
+#ifdef __LITTLE_ENDIAN__
+ lwbrx rWORD3, 0, rSTR1
+ lwbrx rWORD4, 0, rSTR2
+ addi rSTR1, rSTR1, 4
+ addi rSTR2, rSTR2, 4
+#else
lwz rWORD3, 4(rSTR1)
lwz rWORD4, 4(rSTR2)
+#endif
cmplw cr1, rWORD3, rWORD4
+#ifdef __LITTLE_ENDIAN__
+ lwbrx rWORD5, 0, rSTR1
+ lwbrx rWORD6, 0, rSTR2
+ addi rSTR1, rSTR1, 4
+ addi rSTR2, rSTR2, 4
+#else
lwz rWORD5, 8(rSTR1)
lwz rWORD6, 8(rSTR2)
+#endif
cmplw cr6, rWORD5, rWORD6
+#ifdef __LITTLE_ENDIAN__
+ lwbrx rWORD7, 0, rSTR1
+ lwbrx rWORD8, 0, rSTR2
+ addi rSTR1, rSTR1, 4
+ addi rSTR2, rSTR2, 4
+#else
lwzu rWORD7, 12(rSTR1)
lwzu rWORD8, 12(rSTR2)
+#endif
cmplw cr5, rWORD7, rWORD8
- bne cr0, L(dLcr0)
+ bne cr7, L(dLcr7)
bne cr1, L(dLcr1)
bdz- L(d24) /* Adjust CTR as we start with +4 */
/* This is the primary loop */
- .align 4
+ .align 4
L(dLoop):
+#ifdef __LITTLE_ENDIAN__
+ lwbrx rWORD1, 0, rSTR1
+ lwbrx rWORD2, 0, rSTR2
+ addi rSTR1, rSTR1, 4
+ addi rSTR2, rSTR2, 4
+#else
lwz rWORD1, 4(rSTR1)
lwz rWORD2, 4(rSTR2)
+#endif
cmplw cr1, rWORD3, rWORD4
bne cr6, L(dLcr6)
L(dLoop1):
+#ifdef __LITTLE_ENDIAN__
+ lwbrx rWORD3, 0, rSTR1
+ lwbrx rWORD4, 0, rSTR2
+ addi rSTR1, rSTR1, 4
+ addi rSTR2, rSTR2, 4
+#else
lwz rWORD3, 8(rSTR1)
lwz rWORD4, 8(rSTR2)
+#endif
cmplw cr6, rWORD5, rWORD6
bne cr5, L(dLcr5)
L(dLoop2):
+#ifdef __LITTLE_ENDIAN__
+ lwbrx rWORD5, 0, rSTR1
+ lwbrx rWORD6, 0, rSTR2
+ addi rSTR1, rSTR1, 4
+ addi rSTR2, rSTR2, 4
+#else
lwz rWORD5, 12(rSTR1)
lwz rWORD6, 12(rSTR2)
+#endif
cmplw cr5, rWORD7, rWORD8
- bne cr0, L(dLcr0)
+ bne cr7, L(dLcr7)
L(dLoop3):
+#ifdef __LITTLE_ENDIAN__
+ lwbrx rWORD7, 0, rSTR1
+ lwbrx rWORD8, 0, rSTR2
+ addi rSTR1, rSTR1, 4
+ addi rSTR2, rSTR2, 4
+#else
lwzu rWORD7, 16(rSTR1)
lwzu rWORD8, 16(rSTR2)
+#endif
bne- cr1, L(dLcr1)
- cmplw cr0, rWORD1, rWORD2
+ cmplw cr7, rWORD1, rWORD2
bdnz+ L(dLoop)
L(dL4):
@@ -325,7 +521,7 @@ L(dL4):
bne cr5, L(dLcr5)
cmplw cr5, rWORD7, rWORD8
L(d44):
- bne cr0, L(dLcr0)
+ bne cr7, L(dLcr7)
L(d34):
bne cr1, L(dLcr1)
L(d24):
@@ -334,69 +530,82 @@ L(d14):
slwi. r12, rN, 3
bne cr5, L(dLcr5)
L(d04):
- lwz r30,44(1)
- lwz r31,48(1)
- lwz 1,0(1)
+ lwz rWORD7, 44(r1)
+ lwz rWORD8, 48(r1)
+ addi 1, 1, 64
+ cfi_adjust_cfa_offset(-64)
subfic rN, r12, 32 /* Shift count is 32 - (rN * 8). */
beq L(zeroLength)
/* At this point we have a remainder of 1 to 3 bytes to compare. Since
we are aligned it is safe to load the whole word, and use
- shift right to eliminate bits beyond the compare length. */
+ shift right to eliminate bits beyond the compare length. */
L(d00):
+#ifdef __LITTLE_ENDIAN__
+ lwbrx rWORD1, 0, rSTR1
+ lwbrx rWORD2, 0, rSTR2
+ addi rSTR1, rSTR1, 4
+ addi rSTR2, rSTR2, 4
+#else
lwz rWORD1, 4(rSTR1)
lwz rWORD2, 4(rSTR2)
+#endif
srw rWORD1, rWORD1, rN
srw rWORD2, rWORD2, rN
- cmplw rWORD1,rWORD2
- li rRTN,0
- beqlr
- li rRTN,1
- bgtlr
- li rRTN,-1
- blr
-
- .align 4
-L(dLcr0):
- lwz r30,44(1)
- lwz r31,48(1)
+ sub rRTN, rWORD1, rWORD2
+ blr
+
+ .align 4
+ cfi_adjust_cfa_offset(64)
+L(dLcr7):
+ lwz rWORD7, 44(r1)
+ lwz rWORD8, 48(r1)
+L(dLcr7x):
li rRTN, 1
- lwz 1,0(1)
- bgtlr cr0
+ addi 1, 1, 64
+ cfi_adjust_cfa_offset(-64)
+ bgtlr cr7
li rRTN, -1
blr
- .align 4
+ .align 4
+ cfi_adjust_cfa_offset(64)
L(dLcr1):
- lwz r30,44(1)
- lwz r31,48(1)
+ lwz rWORD7, 44(r1)
+ lwz rWORD8, 48(r1)
+L(dLcr1x):
li rRTN, 1
- lwz 1,0(1)
+ addi 1, 1, 64
+ cfi_adjust_cfa_offset(-64)
bgtlr cr1
li rRTN, -1
blr
- .align 4
+ .align 4
+ cfi_adjust_cfa_offset(64)
L(dLcr6):
- lwz r30,44(1)
- lwz r31,48(1)
+ lwz rWORD7, 44(r1)
+ lwz rWORD8, 48(r1)
+L(dLcr6x):
li rRTN, 1
- lwz 1,0(1)
+ addi 1, 1, 64
+ cfi_adjust_cfa_offset(-64)
bgtlr cr6
li rRTN, -1
blr
- .align 4
+ .align 4
+ cfi_adjust_cfa_offset(64)
L(dLcr5):
- lwz r30,44(1)
- lwz r31,48(1)
+ lwz rWORD7, 44(r1)
+ lwz rWORD8, 48(r1)
L(dLcr5x):
li rRTN, 1
- lwz 1,0(1)
+ addi 1, 1, 64
+ cfi_adjust_cfa_offset(-64)
bgtlr cr5
li rRTN, -1
blr
- .align 4
+ .align 4
L(bytealigned):
- cfi_adjust_cfa_offset(-64)
- mtctr rN /* Power4 wants mtctr 1st in dispatch group */
+ mtctr rN /* Power4 wants mtctr 1st in dispatch group */
/* We need to prime this loop. This loop is swing modulo scheduled
to avoid pipe delays. The dependent instruction latencies (load to
@@ -411,7 +620,7 @@ L(bytealigned):
lbz rWORD1, 0(rSTR1)
lbz rWORD2, 0(rSTR2)
bdz- L(b11)
- cmplw cr0, rWORD1, rWORD2
+ cmplw cr7, rWORD1, rWORD2
lbz rWORD3, 1(rSTR1)
lbz rWORD4, 1(rSTR2)
bdz- L(b12)
@@ -419,11 +628,11 @@ L(bytealigned):
lbzu rWORD5, 2(rSTR1)
lbzu rWORD6, 2(rSTR2)
bdz- L(b13)
- .align 4
+ .align 4
L(bLoop):
lbzu rWORD1, 1(rSTR1)
lbzu rWORD2, 1(rSTR2)
- bne- cr0, L(bLcr0)
+ bne- cr7, L(bLcr7)
cmplw cr6, rWORD5, rWORD6
bdz- L(b3i)
@@ -432,7 +641,7 @@ L(bLoop):
lbzu rWORD4, 1(rSTR2)
bne- cr1, L(bLcr1)
- cmplw cr0, rWORD1, rWORD2
+ cmplw cr7, rWORD1, rWORD2
bdz- L(b2i)
lbzu rWORD5, 1(rSTR1)
@@ -449,23 +658,23 @@ L(bLoop):
tested. In this case we must complete the pending operations
before returning. */
L(b1i):
- bne- cr0, L(bLcr0)
+ bne- cr7, L(bLcr7)
bne- cr1, L(bLcr1)
b L(bx56)
- .align 4
+ .align 4
L(b2i):
bne- cr6, L(bLcr6)
- bne- cr0, L(bLcr0)
+ bne- cr7, L(bLcr7)
b L(bx34)
- .align 4
+ .align 4
L(b3i):
bne- cr1, L(bLcr1)
bne- cr6, L(bLcr6)
b L(bx12)
- .align 4
-L(bLcr0):
+ .align 4
+L(bLcr7):
li rRTN, 1
- bgtlr cr0
+ bgtlr cr7
li rRTN, -1
blr
L(bLcr1):
@@ -480,36 +689,31 @@ L(bLcr6):
blr
L(b13):
- bne- cr0, L(bx12)
+ bne- cr7, L(bx12)
bne- cr1, L(bx34)
L(bx56):
sub rRTN, rWORD5, rWORD6
blr
nop
L(b12):
- bne- cr0, L(bx12)
+ bne- cr7, L(bx12)
L(bx34):
sub rRTN, rWORD3, rWORD4
blr
-
L(b11):
L(bx12):
sub rRTN, rWORD1, rWORD2
blr
-
- .align 4
-L(zeroLengthReturn):
-
+ .align 4
L(zeroLength):
li rRTN, 0
blr
- cfi_adjust_cfa_offset(64)
- .align 4
+ .align 4
/* At this point we know the strings have different alignment and the
- compare length is at least 8 bytes. rBITDIF contains the low order
+ compare length is at least 8 bytes. r12 contains the low order
2 bits of rSTR1 and cr5 contains the result of the logical compare
- of rBITDIF to 0. If rBITDIF == 0 then rStr1 is word aligned and can
+ of r12 to 0. If r12 == 0 then rStr1 is word aligned and can
perform the Wunaligned loop.
Otherwise we know that rSTR1 is not already word aligned yet.
@@ -518,79 +722,88 @@ L(zeroLength):
eliminate bits preceding the first byte. Since we want to join the
normal (Wunaligned) compare loop, starting at the second word,
we need to adjust the length (rN) and special case the loop
- versioning for the first W. This insures that the loop count is
+ versioning for the first W. This ensures that the loop count is
correct and the first W (shifted) is in the expected register pair. */
#define rSHL r29 /* Unaligned shift left count. */
#define rSHR r28 /* Unaligned shift right count. */
-#define rB r27 /* Left rotation temp for rWORD2. */
-#define rD r26 /* Left rotation temp for rWORD4. */
-#define rF r25 /* Left rotation temp for rWORD6. */
-#define rH r24 /* Left rotation temp for rWORD8. */
-#define rA r0 /* Right rotation temp for rWORD2. */
-#define rC r12 /* Right rotation temp for rWORD4. */
-#define rE r0 /* Right rotation temp for rWORD6. */
-#define rG r12 /* Right rotation temp for rWORD8. */
+#define rWORD8_SHIFT r27 /* Left rotation temp for rWORD2. */
+#define rWORD2_SHIFT r26 /* Left rotation temp for rWORD4. */
+#define rWORD4_SHIFT r25 /* Left rotation temp for rWORD6. */
+#define rWORD6_SHIFT r24 /* Left rotation temp for rWORD8. */
+ cfi_adjust_cfa_offset(64)
L(unaligned):
- stw r29,40(r1)
- cfi_offset(r29,(40-64))
+ stw rSHL, 40(r1)
+ cfi_offset(rSHL, (40-64))
clrlwi rSHL, rSTR2, 30
- stw r28,36(r1)
- cfi_offset(r28,(36-64))
+ stw rSHR, 36(r1)
+ cfi_offset(rSHR, (36-64))
beq cr5, L(Wunaligned)
- stw r27,32(r1)
- cfi_offset(r27,(32-64))
+ stw rWORD8_SHIFT, 32(r1)
+ cfi_offset(rWORD8_SHIFT, (32-64))
/* Adjust the logical start of rSTR2 to compensate for the extra bits
in the 1st rSTR1 W. */
- sub r27, rSTR2, rBITDIF
+ sub rWORD8_SHIFT, rSTR2, r12
/* But do not attempt to address the W before that W that contains
the actual start of rSTR2. */
clrrwi rSTR2, rSTR2, 2
- stw r26,28(r1)
- cfi_offset(r26,(28-64))
-/* Compute the left/right shift counts for the unalign rSTR2,
+ stw rWORD2_SHIFT, 28(r1)
+ cfi_offset(rWORD2_SHIFT, (28-64))
+/* Compute the left/right shift counts for the unaligned rSTR2,
compensating for the logical (W aligned) start of rSTR1. */
- clrlwi rSHL, r27, 30
+ clrlwi rSHL, rWORD8_SHIFT, 30
clrrwi rSTR1, rSTR1, 2
- stw r25,24(r1)
- cfi_offset(r25,(24-64))
+ stw rWORD4_SHIFT, 24(r1)
+ cfi_offset(rWORD4_SHIFT, (24-64))
slwi rSHL, rSHL, 3
- cmplw cr5, r27, rSTR2
- add rN, rN, rBITDIF
- slwi r11, rBITDIF, 3
- stw r24,20(r1)
- cfi_offset(r24,(20-64))
+ cmplw cr5, rWORD8_SHIFT, rSTR2
+ add rN, rN, r12
+ slwi rWORD6, r12, 3
+ stw rWORD6_SHIFT, 20(r1)
+ cfi_offset(rWORD6_SHIFT, (20-64))
subfic rSHR, rSHL, 32
- srwi rTMP, rN, 4 /* Divide by 16 */
- andi. rBITDIF, rN, 12 /* Get the W remainder */
+ srwi r0, rN, 4 /* Divide by 16 */
+ andi. r12, rN, 12 /* Get the W remainder */
/* We normally need to load 2 Ws to start the unaligned rSTR2, but in
this special case those bits may be discarded anyway. Also we
must avoid loading a W where none of the bits are part of rSTR2 as
this may cross a page boundary and cause a page fault. */
li rWORD8, 0
blt cr5, L(dus0)
+#ifdef __LITTLE_ENDIAN__
+ lwbrx rWORD8, 0, rSTR2
+ addi rSTR2, rSTR2, 4
+#else
lwz rWORD8, 0(rSTR2)
- la rSTR2, 4(rSTR2)
+ addi rSTR2, rSTR2, 4
+#endif
slw rWORD8, rWORD8, rSHL
L(dus0):
+#ifdef __LITTLE_ENDIAN__
+ lwbrx rWORD1, 0, rSTR1
+ lwbrx rWORD2, 0, rSTR2
+ addi rSTR1, rSTR1, 4
+ addi rSTR2, rSTR2, 4
+#else
lwz rWORD1, 0(rSTR1)
lwz rWORD2, 0(rSTR2)
- cmplwi cr1, rBITDIF, 8
+#endif
+ cmplwi cr1, r12, 8
cmplwi cr7, rN, 16
- srw rG, rWORD2, rSHR
+ srw r12, rWORD2, rSHR
clrlwi rN, rN, 30
beq L(duPs4)
- mtctr rTMP /* Power4 wants mtctr 1st in dispatch group */
- or rWORD8, rG, rWORD8
+ mtctr r0 /* Power4 wants mtctr 1st in dispatch group */
+ or rWORD8, r12, rWORD8
bgt cr1, L(duPs3)
beq cr1, L(duPs2)
/* Remainder is 4 */
- .align 4
+ .align 4
L(dusP1):
- slw rB, rWORD2, rSHL
- slw rWORD7, rWORD1, r11
- slw rWORD8, rWORD8, r11
+ slw rWORD8_SHIFT, rWORD2, rSHL
+ slw rWORD7, rWORD1, rWORD6
+ slw rWORD8, rWORD8, rWORD6
bge cr7, L(duP1e)
/* At this point we exit early with the first word compare
complete and remainder of 0 to 3 bytes. See L(du14) for details on
@@ -600,95 +813,133 @@ L(dusP1):
bne cr5, L(duLcr5)
cmplw cr7, rN, rSHR
beq L(duZeroReturn)
- li rA, 0
+ li r0, 0
ble cr7, L(dutrim)
+#ifdef __LITTLE_ENDIAN__
+ lwbrx rWORD2, 0, rSTR2
+ addi rSTR2, rSTR2, 4
+#else
lwz rWORD2, 4(rSTR2)
- srw rA, rWORD2, rSHR
+#endif
+ srw r0, rWORD2, rSHR
b L(dutrim)
/* Remainder is 8 */
- .align 4
+ .align 4
L(duPs2):
- slw rH, rWORD2, rSHL
- slw rWORD5, rWORD1, r11
- slw rWORD6, rWORD8, r11
+ slw rWORD6_SHIFT, rWORD2, rSHL
+ slw rWORD5, rWORD1, rWORD6
+ slw rWORD6, rWORD8, rWORD6
b L(duP2e)
/* Remainder is 12 */
- .align 4
+ .align 4
L(duPs3):
- slw rF, rWORD2, rSHL
- slw rWORD3, rWORD1, r11
- slw rWORD4, rWORD8, r11
+ slw rWORD4_SHIFT, rWORD2, rSHL
+ slw rWORD3, rWORD1, rWORD6
+ slw rWORD4, rWORD8, rWORD6
b L(duP3e)
/* Count is a multiple of 16, remainder is 0 */
- .align 4
+ .align 4
L(duPs4):
- mtctr rTMP /* Power4 wants mtctr 1st in dispatch group */
- or rWORD8, rG, rWORD8
- slw rD, rWORD2, rSHL
- slw rWORD1, rWORD1, r11
- slw rWORD2, rWORD8, r11
+ mtctr r0 /* Power4 wants mtctr 1st in dispatch group */
+ or rWORD8, r12, rWORD8
+ slw rWORD2_SHIFT, rWORD2, rSHL
+ slw rWORD1, rWORD1, rWORD6
+ slw rWORD2, rWORD8, rWORD6
b L(duP4e)
/* At this point we know rSTR1 is word aligned and the
compare length is at least 8 bytes. */
- .align 4
+ .align 4
L(Wunaligned):
- stw r27,32(r1)
- cfi_offset(r27,(32-64))
+ stw rWORD8_SHIFT, 32(r1)
+ cfi_offset(rWORD8_SHIFT, (32-64))
clrrwi rSTR2, rSTR2, 2
- stw r26,28(r1)
- cfi_offset(r26,(28-64))
- srwi rTMP, rN, 4 /* Divide by 16 */
- stw r25,24(r1)
- cfi_offset(r25,(24-64))
- andi. rBITDIF, rN, 12 /* Get the W remainder */
- stw r24,20(r1)
- cfi_offset(r24,(20-64))
+ stw rWORD2_SHIFT, 28(r1)
+ cfi_offset(rWORD2_SHIFT, (28-64))
+ srwi r0, rN, 4 /* Divide by 16 */
+ stw rWORD4_SHIFT, 24(r1)
+ cfi_offset(rWORD4_SHIFT, (24-64))
+ andi. r12, rN, 12 /* Get the W remainder */
+ stw rWORD6_SHIFT, 20(r1)
+ cfi_offset(rWORD6_SHIFT, (20-64))
slwi rSHL, rSHL, 3
+#ifdef __LITTLE_ENDIAN__
+ lwbrx rWORD6, 0, rSTR2
+ addi rSTR2, rSTR2, 4
+ lwbrx rWORD8, 0, rSTR2
+ addi rSTR2, rSTR2, 4
+#else
lwz rWORD6, 0(rSTR2)
lwzu rWORD8, 4(rSTR2)
- cmplwi cr1, rBITDIF, 8
+#endif
+ cmplwi cr1, r12, 8
cmplwi cr7, rN, 16
clrlwi rN, rN, 30
subfic rSHR, rSHL, 32
- slw rH, rWORD6, rSHL
+ slw rWORD6_SHIFT, rWORD6, rSHL
beq L(duP4)
- mtctr rTMP /* Power4 wants mtctr 1st in dispatch group */
+ mtctr r0 /* Power4 wants mtctr 1st in dispatch group */
bgt cr1, L(duP3)
beq cr1, L(duP2)
/* Remainder is 4 */
- .align 4
+ .align 4
L(duP1):
- srw rG, rWORD8, rSHR
+ srw r12, rWORD8, rSHR
+#ifdef __LITTLE_ENDIAN__
+ lwbrx rWORD7, 0, rSTR1
+ addi rSTR1, rSTR1, 4
+#else
lwz rWORD7, 0(rSTR1)
- slw rB, rWORD8, rSHL
- or rWORD8, rG, rH
+#endif
+ slw rWORD8_SHIFT, rWORD8, rSHL
+ or rWORD8, r12, rWORD6_SHIFT
blt cr7, L(duP1x)
L(duP1e):
+#ifdef __LITTLE_ENDIAN__
+ lwbrx rWORD1, 0, rSTR1
+ lwbrx rWORD2, 0, rSTR2
+ addi rSTR1, rSTR1, 4
+ addi rSTR2, rSTR2, 4
+#else
lwz rWORD1, 4(rSTR1)
lwz rWORD2, 4(rSTR2)
+#endif
cmplw cr5, rWORD7, rWORD8
- srw rA, rWORD2, rSHR
- slw rD, rWORD2, rSHL
- or rWORD2, rA, rB
+ srw r0, rWORD2, rSHR
+ slw rWORD2_SHIFT, rWORD2, rSHL
+ or rWORD2, r0, rWORD8_SHIFT
+#ifdef __LITTLE_ENDIAN__
+ lwbrx rWORD3, 0, rSTR1
+ lwbrx rWORD4, 0, rSTR2
+ addi rSTR1, rSTR1, 4
+ addi rSTR2, rSTR2, 4
+#else
lwz rWORD3, 8(rSTR1)
lwz rWORD4, 8(rSTR2)
- cmplw cr0, rWORD1, rWORD2
- srw rC, rWORD4, rSHR
- slw rF, rWORD4, rSHL
+#endif
+ cmplw cr7, rWORD1, rWORD2
+ srw r12, rWORD4, rSHR
+ slw rWORD4_SHIFT, rWORD4, rSHL
bne cr5, L(duLcr5)
- or rWORD4, rC, rD
+ or rWORD4, r12, rWORD2_SHIFT
+#ifdef __LITTLE_ENDIAN__
+ lwbrx rWORD5, 0, rSTR1
+ lwbrx rWORD6, 0, rSTR2
+ addi rSTR1, rSTR1, 4
+ addi rSTR2, rSTR2, 4
+#else
lwz rWORD5, 12(rSTR1)
lwz rWORD6, 12(rSTR2)
+#endif
cmplw cr1, rWORD3, rWORD4
- srw rE, rWORD6, rSHR
- slw rH, rWORD6, rSHL
- bne cr0, L(duLcr0)
- or rWORD6, rE, rF
+ srw r0, rWORD6, rSHR
+ slw rWORD6_SHIFT, rWORD6, rSHL
+ bne cr7, L(duLcr7)
+ or rWORD6, r0, rWORD4_SHIFT
cmplw cr6, rWORD5, rWORD6
b L(duLoop3)
- .align 4
+ .align 4
/* At this point we exit early with the first word compare
complete and remainder of 0 to 3 bytes. See L(du14) for details on
how we handle the remaining bytes. */
@@ -698,186 +949,321 @@ L(duP1x):
bne cr5, L(duLcr5)
cmplw cr7, rN, rSHR
beq L(duZeroReturn)
- li rA, 0
+ li r0, 0
ble cr7, L(dutrim)
- ld rWORD2, 8(rSTR2)
- srw rA, rWORD2, rSHR
+#ifdef __LITTLE_ENDIAN__
+ lwbrx rWORD2, 0, rSTR2
+ addi rSTR2, rSTR2, 4
+#else
+ lwz rWORD2, 8(rSTR2)
+#endif
+ srw r0, rWORD2, rSHR
b L(dutrim)
/* Remainder is 8 */
- .align 4
+ .align 4
L(duP2):
- srw rE, rWORD8, rSHR
+ srw r0, rWORD8, rSHR
+#ifdef __LITTLE_ENDIAN__
+ lwbrx rWORD5, 0, rSTR1
+ addi rSTR1, rSTR1, 4
+#else
lwz rWORD5, 0(rSTR1)
- or rWORD6, rE, rH
- slw rH, rWORD8, rSHL
+#endif
+ or rWORD6, r0, rWORD6_SHIFT
+ slw rWORD6_SHIFT, rWORD8, rSHL
L(duP2e):
+#ifdef __LITTLE_ENDIAN__
+ lwbrx rWORD7, 0, rSTR1
+ lwbrx rWORD8, 0, rSTR2
+ addi rSTR1, rSTR1, 4
+ addi rSTR2, rSTR2, 4
+#else
lwz rWORD7, 4(rSTR1)
lwz rWORD8, 4(rSTR2)
+#endif
cmplw cr6, rWORD5, rWORD6
- srw rG, rWORD8, rSHR
- slw rB, rWORD8, rSHL
- or rWORD8, rG, rH
+ srw r12, rWORD8, rSHR
+ slw rWORD8_SHIFT, rWORD8, rSHL
+ or rWORD8, r12, rWORD6_SHIFT
blt cr7, L(duP2x)
+#ifdef __LITTLE_ENDIAN__
+ lwbrx rWORD1, 0, rSTR1
+ lwbrx rWORD2, 0, rSTR2
+ addi rSTR1, rSTR1, 4
+ addi rSTR2, rSTR2, 4
+#else
lwz rWORD1, 8(rSTR1)
lwz rWORD2, 8(rSTR2)
+#endif
cmplw cr5, rWORD7, rWORD8
bne cr6, L(duLcr6)
- srw rA, rWORD2, rSHR
- slw rD, rWORD2, rSHL
- or rWORD2, rA, rB
+ srw r0, rWORD2, rSHR
+ slw rWORD2_SHIFT, rWORD2, rSHL
+ or rWORD2, r0, rWORD8_SHIFT
+#ifdef __LITTLE_ENDIAN__
+ lwbrx rWORD3, 0, rSTR1
+ lwbrx rWORD4, 0, rSTR2
+ addi rSTR1, rSTR1, 4
+ addi rSTR2, rSTR2, 4
+#else
lwz rWORD3, 12(rSTR1)
lwz rWORD4, 12(rSTR2)
- cmplw cr0, rWORD1, rWORD2
+#endif
+ cmplw cr7, rWORD1, rWORD2
bne cr5, L(duLcr5)
- srw rC, rWORD4, rSHR
- slw rF, rWORD4, rSHL
- or rWORD4, rC, rD
+ srw r12, rWORD4, rSHR
+ slw rWORD4_SHIFT, rWORD4, rSHL
+ or rWORD4, r12, rWORD2_SHIFT
+#ifndef __LITTLE_ENDIAN__
addi rSTR1, rSTR1, 4
addi rSTR2, rSTR2, 4
+#endif
cmplw cr1, rWORD3, rWORD4
b L(duLoop2)
- .align 4
+ .align 4
L(duP2x):
cmplw cr5, rWORD7, rWORD8
+#ifndef __LITTLE_ENDIAN__
addi rSTR1, rSTR1, 4
addi rSTR2, rSTR2, 4
+#endif
bne cr6, L(duLcr6)
slwi. rN, rN, 3
bne cr5, L(duLcr5)
cmplw cr7, rN, rSHR
beq L(duZeroReturn)
- li rA, 0
+ li r0, 0
ble cr7, L(dutrim)
+#ifdef __LITTLE_ENDIAN__
+ lwbrx rWORD2, 0, rSTR2
+ addi rSTR2, rSTR2, 4
+#else
lwz rWORD2, 4(rSTR2)
- srw rA, rWORD2, rSHR
+#endif
+ srw r0, rWORD2, rSHR
b L(dutrim)
/* Remainder is 12 */
- .align 4
+ .align 4
L(duP3):
- srw rC, rWORD8, rSHR
+ srw r12, rWORD8, rSHR
+#ifdef __LITTLE_ENDIAN__
+ lwbrx rWORD3, 0, rSTR1
+ addi rSTR1, rSTR1, 4
+#else
lwz rWORD3, 0(rSTR1)
- slw rF, rWORD8, rSHL
- or rWORD4, rC, rH
+#endif
+ slw rWORD4_SHIFT, rWORD8, rSHL
+ or rWORD4, r12, rWORD6_SHIFT
L(duP3e):
+#ifdef __LITTLE_ENDIAN__
+ lwbrx rWORD5, 0, rSTR1
+ lwbrx rWORD6, 0, rSTR2
+ addi rSTR1, rSTR1, 4
+ addi rSTR2, rSTR2, 4
+#else
lwz rWORD5, 4(rSTR1)
lwz rWORD6, 4(rSTR2)
+#endif
cmplw cr1, rWORD3, rWORD4
- srw rE, rWORD6, rSHR
- slw rH, rWORD6, rSHL
- or rWORD6, rE, rF
+ srw r0, rWORD6, rSHR
+ slw rWORD6_SHIFT, rWORD6, rSHL
+ or rWORD6, r0, rWORD4_SHIFT
+#ifdef __LITTLE_ENDIAN__
+ lwbrx rWORD7, 0, rSTR1
+ lwbrx rWORD8, 0, rSTR2
+ addi rSTR1, rSTR1, 4
+ addi rSTR2, rSTR2, 4
+#else
lwz rWORD7, 8(rSTR1)
lwz rWORD8, 8(rSTR2)
+#endif
cmplw cr6, rWORD5, rWORD6
bne cr1, L(duLcr1)
- srw rG, rWORD8, rSHR
- slw rB, rWORD8, rSHL
- or rWORD8, rG, rH
+ srw r12, rWORD8, rSHR
+ slw rWORD8_SHIFT, rWORD8, rSHL
+ or rWORD8, r12, rWORD6_SHIFT
blt cr7, L(duP3x)
+#ifdef __LITTLE_ENDIAN__
+ lwbrx rWORD1, 0, rSTR1
+ lwbrx rWORD2, 0, rSTR2
+ addi rSTR1, rSTR1, 4
+ addi rSTR2, rSTR2, 4
+#else
lwz rWORD1, 12(rSTR1)
lwz rWORD2, 12(rSTR2)
+#endif
cmplw cr5, rWORD7, rWORD8
bne cr6, L(duLcr6)
- srw rA, rWORD2, rSHR
- slw rD, rWORD2, rSHL
- or rWORD2, rA, rB
+ srw r0, rWORD2, rSHR
+ slw rWORD2_SHIFT, rWORD2, rSHL
+ or rWORD2, r0, rWORD8_SHIFT
+#ifndef __LITTLE_ENDIAN__
addi rSTR1, rSTR1, 8
addi rSTR2, rSTR2, 8
- cmplw cr0, rWORD1, rWORD2
+#endif
+ cmplw cr7, rWORD1, rWORD2
b L(duLoop1)
- .align 4
+ .align 4
L(duP3x):
+#ifndef __LITTLE_ENDIAN__
addi rSTR1, rSTR1, 8
addi rSTR2, rSTR2, 8
+#endif
+#if 0
+/* Huh? We've already branched on cr1! */
bne cr1, L(duLcr1)
+#endif
cmplw cr5, rWORD7, rWORD8
bne cr6, L(duLcr6)
slwi. rN, rN, 3
bne cr5, L(duLcr5)
cmplw cr7, rN, rSHR
beq L(duZeroReturn)
- li rA, 0
+ li r0, 0
ble cr7, L(dutrim)
+#ifdef __LITTLE_ENDIAN__
+ lwbrx rWORD2, 0, rSTR2
+ addi rSTR2, rSTR2, 4
+#else
lwz rWORD2, 4(rSTR2)
- srw rA, rWORD2, rSHR
+#endif
+ srw r0, rWORD2, rSHR
b L(dutrim)
/* Count is a multiple of 16, remainder is 0 */
- .align 4
+ .align 4
L(duP4):
- mtctr rTMP /* Power4 wants mtctr 1st in dispatch group */
- srw rA, rWORD8, rSHR
+ mtctr r0 /* Power4 wants mtctr 1st in dispatch group */
+ srw r0, rWORD8, rSHR
+#ifdef __LITTLE_ENDIAN__
+ lwbrx rWORD1, 0, rSTR1
+ addi rSTR1, rSTR1, 4
+#else
lwz rWORD1, 0(rSTR1)
- slw rD, rWORD8, rSHL
- or rWORD2, rA, rH
+#endif
+ slw rWORD2_SHIFT, rWORD8, rSHL
+ or rWORD2, r0, rWORD6_SHIFT
L(duP4e):
+#ifdef __LITTLE_ENDIAN__
+ lwbrx rWORD3, 0, rSTR1
+ lwbrx rWORD4, 0, rSTR2
+ addi rSTR1, rSTR1, 4
+ addi rSTR2, rSTR2, 4
+#else
lwz rWORD3, 4(rSTR1)
lwz rWORD4, 4(rSTR2)
- cmplw cr0, rWORD1, rWORD2
- srw rC, rWORD4, rSHR
- slw rF, rWORD4, rSHL
- or rWORD4, rC, rD
+#endif
+ cmplw cr7, rWORD1, rWORD2
+ srw r12, rWORD4, rSHR
+ slw rWORD4_SHIFT, rWORD4, rSHL
+ or rWORD4, r12, rWORD2_SHIFT
+#ifdef __LITTLE_ENDIAN__
+ lwbrx rWORD5, 0, rSTR1
+ lwbrx rWORD6, 0, rSTR2
+ addi rSTR1, rSTR1, 4
+ addi rSTR2, rSTR2, 4
+#else
lwz rWORD5, 8(rSTR1)
lwz rWORD6, 8(rSTR2)
+#endif
cmplw cr1, rWORD3, rWORD4
- bne cr0, L(duLcr0)
- srw rE, rWORD6, rSHR
- slw rH, rWORD6, rSHL
- or rWORD6, rE, rF
+ bne cr7, L(duLcr7)
+ srw r0, rWORD6, rSHR
+ slw rWORD6_SHIFT, rWORD6, rSHL
+ or rWORD6, r0, rWORD4_SHIFT
+#ifdef __LITTLE_ENDIAN__
+ lwbrx rWORD7, 0, rSTR1
+ lwbrx rWORD8, 0, rSTR2
+ addi rSTR1, rSTR1, 4
+ addi rSTR2, rSTR2, 4
+#else
lwzu rWORD7, 12(rSTR1)
lwzu rWORD8, 12(rSTR2)
+#endif
cmplw cr6, rWORD5, rWORD6
bne cr1, L(duLcr1)
- srw rG, rWORD8, rSHR
- slw rB, rWORD8, rSHL
- or rWORD8, rG, rH
+ srw r12, rWORD8, rSHR
+ slw rWORD8_SHIFT, rWORD8, rSHL
+ or rWORD8, r12, rWORD6_SHIFT
cmplw cr5, rWORD7, rWORD8
bdz- L(du24) /* Adjust CTR as we start with +4 */
/* This is the primary loop */
- .align 4
+ .align 4
L(duLoop):
+#ifdef __LITTLE_ENDIAN__
+ lwbrx rWORD1, 0, rSTR1
+ lwbrx rWORD2, 0, rSTR2
+ addi rSTR1, rSTR1, 4
+ addi rSTR2, rSTR2, 4
+#else
lwz rWORD1, 4(rSTR1)
lwz rWORD2, 4(rSTR2)
+#endif
cmplw cr1, rWORD3, rWORD4
bne cr6, L(duLcr6)
- srw rA, rWORD2, rSHR
- slw rD, rWORD2, rSHL
- or rWORD2, rA, rB
+ srw r0, rWORD2, rSHR
+ slw rWORD2_SHIFT, rWORD2, rSHL
+ or rWORD2, r0, rWORD8_SHIFT
L(duLoop1):
+#ifdef __LITTLE_ENDIAN__
+ lwbrx rWORD3, 0, rSTR1
+ lwbrx rWORD4, 0, rSTR2
+ addi rSTR1, rSTR1, 4
+ addi rSTR2, rSTR2, 4
+#else
lwz rWORD3, 8(rSTR1)
lwz rWORD4, 8(rSTR2)
+#endif
cmplw cr6, rWORD5, rWORD6
bne cr5, L(duLcr5)
- srw rC, rWORD4, rSHR
- slw rF, rWORD4, rSHL
- or rWORD4, rC, rD
+ srw r12, rWORD4, rSHR
+ slw rWORD4_SHIFT, rWORD4, rSHL
+ or rWORD4, r12, rWORD2_SHIFT
L(duLoop2):
+#ifdef __LITTLE_ENDIAN__
+ lwbrx rWORD5, 0, rSTR1
+ lwbrx rWORD6, 0, rSTR2
+ addi rSTR1, rSTR1, 4
+ addi rSTR2, rSTR2, 4
+#else
lwz rWORD5, 12(rSTR1)
lwz rWORD6, 12(rSTR2)
+#endif
cmplw cr5, rWORD7, rWORD8
- bne cr0, L(duLcr0)
- srw rE, rWORD6, rSHR
- slw rH, rWORD6, rSHL
- or rWORD6, rE, rF
+ bne cr7, L(duLcr7)
+ srw r0, rWORD6, rSHR
+ slw rWORD6_SHIFT, rWORD6, rSHL
+ or rWORD6, r0, rWORD4_SHIFT
L(duLoop3):
+#ifdef __LITTLE_ENDIAN__
+ lwbrx rWORD7, 0, rSTR1
+ lwbrx rWORD8, 0, rSTR2
+ addi rSTR1, rSTR1, 4
+ addi rSTR2, rSTR2, 4
+#else
lwzu rWORD7, 16(rSTR1)
lwzu rWORD8, 16(rSTR2)
- cmplw cr0, rWORD1, rWORD2
+#endif
+ cmplw cr7, rWORD1, rWORD2
bne- cr1, L(duLcr1)
- srw rG, rWORD8, rSHR
- slw rB, rWORD8, rSHL
- or rWORD8, rG, rH
+ srw r12, rWORD8, rSHR
+ slw rWORD8_SHIFT, rWORD8, rSHL
+ or rWORD8, r12, rWORD6_SHIFT
bdnz+ L(duLoop)
L(duL4):
+#if 0
+/* Huh? We've already branched on cr1! */
bne cr1, L(duLcr1)
+#endif
cmplw cr1, rWORD3, rWORD4
bne cr6, L(duLcr6)
cmplw cr6, rWORD5, rWORD6
bne cr5, L(duLcr5)
cmplw cr5, rWORD7, rWORD8
L(du44):
- bne cr0, L(duLcr0)
+ bne cr7, L(duLcr7)
L(du34):
bne cr1, L(duLcr1)
L(du24):
@@ -887,95 +1273,101 @@ L(du14):
bne cr5, L(duLcr5)
/* At this point we have a remainder of 1 to 3 bytes to compare. We use
shift right to eliminate bits beyond the compare length.
+ This allows the use of word subtract to compute the final result.
However it may not be safe to load rWORD2 which may be beyond the
string length. So we compare the bit length of the remainder to
the right shift count (rSHR). If the bit count is less than or equal
we do not need to load rWORD2 (all significant bits are already in
- rB). */
+ rWORD8_SHIFT). */
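The trim step that follows is worth restating in C: both words are shifted right so that only the 1 to 3 bytes still inside the compare length participate, after which a plain subtract is safe because at most 24 significant bits remain. A sketch of that logic (not the installed code):

#include <stdint.h>

/* w1/w2 hold the remaining bytes in their high-order positions
   (big-endian word images); rem is the leftover length, 1..3.  */
static int trim_and_compare (uint32_t w1, uint32_t w2, unsigned rem)
{
  unsigned sh = 32 - 8 * rem;   /* subfic rN, rN, 32 (rN already *8) */
  w1 >>= sh;                    /* srw rWORD1, rWORD1, rN            */
  w2 >>= sh;                    /* srw rWORD2, rWORD2, rN            */
  return (int) (w1 - w2);       /* cannot overflow: <= 24 bits each  */
}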
cmplw cr7, rN, rSHR
beq L(duZeroReturn)
- li rA, 0
+ li r0, 0
ble cr7, L(dutrim)
+#ifdef __LITTLE_ENDIAN__
+ lwbrx rWORD2, 0, rSTR2
+ addi rSTR2, rSTR2, 4
+#else
lwz rWORD2, 4(rSTR2)
- srw rA, rWORD2, rSHR
- .align 4
+#endif
+ srw r0, rWORD2, rSHR
+ .align 4
L(dutrim):
+#ifdef __LITTLE_ENDIAN__
+ lwbrx rWORD1, 0, rSTR1
+#else
lwz rWORD1, 4(rSTR1)
- lwz r31,48(1)
+#endif
+ lwz rWORD8, 48(r1)
subfic rN, rN, 32 /* Shift count is 32 - (rN * 8). */
- or rWORD2, rA, rB
- lwz r30,44(1)
- lwz r29,40(r1)
+ or rWORD2, r0, rWORD8_SHIFT
+ lwz rWORD7, 44(r1)
+ lwz rSHL, 40(r1)
srw rWORD1, rWORD1, rN
srw rWORD2, rWORD2, rN
- lwz r28,36(r1)
- lwz r27,32(r1)
- cmplw rWORD1,rWORD2
- li rRTN,0
- beq L(dureturn26)
- li rRTN,1
- bgt L(dureturn26)
- li rRTN,-1
- b L(dureturn26)
- .align 4
-L(duLcr0):
- lwz r31,48(1)
- lwz r30,44(1)
+ lwz rSHR, 36(r1)
+ lwz rWORD8_SHIFT, 32(r1)
+ sub rRTN, rWORD1, rWORD2
+ b L(dureturn26)
+ .align 4
+L(duLcr7):
+ lwz rWORD8, 48(r1)
+ lwz rWORD7, 44(r1)
li rRTN, 1
- bgt cr0, L(dureturn29)
- lwz r29,40(r1)
- lwz r28,36(r1)
+ bgt cr7, L(dureturn29)
+ lwz rSHL, 40(r1)
+ lwz rSHR, 36(r1)
li rRTN, -1
b L(dureturn27)
- .align 4
+ .align 4
L(duLcr1):
- lwz r31,48(1)
- lwz r30,44(1)
+ lwz rWORD8, 48(r1)
+ lwz rWORD7, 44(r1)
li rRTN, 1
bgt cr1, L(dureturn29)
- lwz r29,40(r1)
- lwz r28,36(r1)
+ lwz rSHL, 40(r1)
+ lwz rSHR, 36(r1)
li rRTN, -1
b L(dureturn27)
- .align 4
+ .align 4
L(duLcr6):
- lwz r31,48(1)
- lwz r30,44(1)
+ lwz rWORD8, 48(r1)
+ lwz rWORD7, 44(r1)
li rRTN, 1
bgt cr6, L(dureturn29)
- lwz r29,40(r1)
- lwz r28,36(r1)
+ lwz rSHL, 40(r1)
+ lwz rSHR, 36(r1)
li rRTN, -1
b L(dureturn27)
- .align 4
+ .align 4
L(duLcr5):
- lwz r31,48(1)
- lwz r30,44(1)
+ lwz rWORD8, 48(r1)
+ lwz rWORD7, 44(r1)
li rRTN, 1
bgt cr5, L(dureturn29)
- lwz r29,40(r1)
- lwz r28,36(r1)
+ lwz rSHL, 40(r1)
+ lwz rSHR, 36(r1)
li rRTN, -1
b L(dureturn27)
.align 3
L(duZeroReturn):
- li rRTN,0
+ li rRTN, 0
.align 4
L(dureturn):
- lwz r31,48(1)
- lwz r30,44(1)
+ lwz rWORD8, 48(r1)
+ lwz rWORD7, 44(r1)
L(dureturn29):
- lwz r29,40(r1)
- lwz r28,36(r1)
+ lwz rSHL, 40(r1)
+ lwz rSHR, 36(r1)
L(dureturn27):
- lwz r27,32(r1)
+ lwz rWORD8_SHIFT, 32(r1)
L(dureturn26):
- lwz r26,28(r1)
+ lwz rWORD2_SHIFT, 28(r1)
L(dureturn25):
- lwz r25,24(r1)
- lwz r24,20(r1)
- lwz 1,0(1)
+ lwz rWORD4_SHIFT, 24(r1)
+ lwz rWORD6_SHIFT, 20(r1)
+ addi 1, 1, 64
+ cfi_adjust_cfa_offset(-64)
blr
END (memcmp)
diff --git a/libc/sysdeps/powerpc/powerpc32/power4/memcpy.S b/libc/sysdeps/powerpc/powerpc32/power4/memcpy.S
index d9146631e..338d3cce3 100644
--- a/libc/sysdeps/powerpc/powerpc32/power4/memcpy.S
+++ b/libc/sysdeps/powerpc/powerpc32/power4/memcpy.S
@@ -203,15 +203,28 @@ EALIGN (memcpy, 5, 0)
blt cr6,5f
srwi 7,6,16
bgt cr6,3f
+#ifdef __LITTLE_ENDIAN__
+ sth 7,0(3)
+#else
sth 6,0(3)
+#endif
b 7f
.align 4
3:
+#ifdef __LITTLE_ENDIAN__
+ rotlwi 6,6,24
+ stb 6,0(3)
+ sth 7,1(3)
+#else
stb 7,0(3)
sth 6,1(3)
+#endif
b 7f
.align 4
5:
+#ifdef __LITTLE_ENDIAN__
+ rotlwi 6,6,8
+#endif
stb 6,0(3)
7:
cmplwi cr1,10,16
@@ -339,13 +352,23 @@ EALIGN (memcpy, 5, 0)
bf 30,1f
/* there are at least two words to copy, so copy them */
+#ifdef __LITTLE_ENDIAN__
+ srw 0,6,10
+ slw 8,7,9
+#else
slw 0,6,10 /* shift 1st src word to left align it in R0 */
srw 8,7,9 /* shift 2nd src word to right align it in R8 */
+#endif
or 0,0,8 /* or them to get word to store */
lwz 6,8(5) /* load the 3rd src word */
stw 0,0(4) /* store the 1st dst word */
+#ifdef __LITTLE_ENDIAN__
+ srw 0,7,10
+ slw 8,6,9
+#else
slw 0,7,10 /* now left align 2nd src word into R0 */
srw 8,6,9 /* shift 3rd src word to right align it in R8 */
+#endif
or 0,0,8 /* or them to get word to store */
lwz 7,12(5)
stw 0,4(4) /* store the 2nd dst word */
@@ -353,8 +376,13 @@ EALIGN (memcpy, 5, 0)
addi 5,5,16
bf 31,4f
/* there is a third word to copy, so copy it */
+#ifdef __LITTLE_ENDIAN__
+ srw 0,6,10
+ slw 8,7,9
+#else
slw 0,6,10 /* shift 3rd src word to left align it in R0 */
srw 8,7,9 /* shift 4th src word to right align it in R8 */
+#endif
or 0,0,8 /* or them to get word to store */
stw 0,0(4) /* store 3rd dst word */
mr 6,7
@@ -364,8 +392,13 @@ EALIGN (memcpy, 5, 0)
b 4f
.align 4
1:
+#ifdef __LITTLE_ENDIAN__
+ srw 0,6,10
+ slw 8,7,9
+#else
slw 0,6,10 /* shift 1st src word to left align it in R0 */
srw 8,7,9 /* shift 2nd src word to right align it in R8 */
+#endif
addi 5,5,8
or 0,0,8 /* or them to get word to store */
bf 31,4f
@@ -378,23 +411,43 @@ EALIGN (memcpy, 5, 0)
.align 4
4:
/* copy 16 bytes at a time */
+#ifdef __LITTLE_ENDIAN__
+ srw 0,6,10
+ slw 8,7,9
+#else
slw 0,6,10
srw 8,7,9
+#endif
or 0,0,8
lwz 6,0(5)
stw 0,0(4)
+#ifdef __LITTLE_ENDIAN__
+ srw 0,7,10
+ slw 8,6,9
+#else
slw 0,7,10
srw 8,6,9
+#endif
or 0,0,8
lwz 7,4(5)
stw 0,4(4)
+#ifdef __LITTLE_ENDIAN__
+ srw 0,6,10
+ slw 8,7,9
+#else
slw 0,6,10
srw 8,7,9
+#endif
or 0,0,8
lwz 6,8(5)
stw 0,8(4)
+#ifdef __LITTLE_ENDIAN__
+ srw 0,7,10
+ slw 8,6,9
+#else
slw 0,7,10
srw 8,6,9
+#endif
or 0,0,8
lwz 7,12(5)
stw 0,12(4)
@@ -403,8 +456,13 @@ EALIGN (memcpy, 5, 0)
bdnz+ 4b
8:
/* calculate and store the final word */
+#ifdef __LITTLE_ENDIAN__
+ srw 0,6,10
+ slw 8,7,9
+#else
slw 0,6,10
srw 8,7,9
+#endif
or 0,0,8
stw 0,0(4)
3:
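The pattern being wrapped in __LITTLE_ENDIAN__ above is the usual two-word merge for a source that is not word aligned; on big-endian the earlier bytes sit in the high half of a word (shift the first word left), on little-endian they sit in the low half, so the two shifts swap direction. A rough C sketch of one such copy loop, assuming the caller guarantees nwords + 1 aligned source words are readable, as the assembly arranges:

#include <stdint.h>
#include <stddef.h>

static void copy_from_unaligned_src (uint32_t *dst, const uint32_t *src_w,
                                     unsigned mis /* 1..3 */, size_t nwords)
{
  unsigned sh1 = 8 * mis, sh2 = 32 - sh1;
  uint32_t a = src_w[0];
  for (size_t i = 0; i < nwords; i++)
    {
      uint32_t b = src_w[i + 1];
#ifdef __LITTLE_ENDIAN__
      dst[i] = (a >> sh1) | (b << sh2);   /* srw / slw variant */
#else
      dst[i] = (a << sh1) | (b >> sh2);   /* slw / srw variant */
#endif
      a = b;
    }
}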
diff --git a/libc/sysdeps/powerpc/powerpc32/power4/memset.S b/libc/sysdeps/powerpc/powerpc32/power4/memset.S
index c2d288b38..4fd9d8cb4 100644
--- a/libc/sysdeps/powerpc/powerpc32/power4/memset.S
+++ b/libc/sysdeps/powerpc/powerpc32/power4/memset.S
@@ -50,7 +50,7 @@ L(_memset):
/* Align to word boundary. */
cmplwi cr5, rLEN, 31
- rlwimi rCHR, rCHR, 8, 16, 23 /* Replicate byte to halfword. */
+ insrdi rCHR, rCHR, 8, 48 /* Replicate byte to halfword. */
beq+ L(aligned)
mtcrf 0x01, rMEMP0
subfic rALIGN, rALIGN, 4
@@ -65,7 +65,7 @@ L(g0):
/* Handle the case of size < 31. */
L(aligned):
mtcrf 0x01, rLEN
- rlwimi rCHR, rCHR, 16, 0, 15 /* Replicate halfword to word. */
+ insrdi rCHR, rCHR, 16, 32 /* Replicate halfword to word. */
ble cr5, L(medium)
/* Align to 32-byte boundary. */
andi. rALIGN, rMEMP, 0x1C
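Whether written with rlwimi or insrdi, the replication sequence above is the standard byte-splat idiom; in C it is two shifted ORs:

#include <stdint.h>

/* Splat the low byte of c across a 32-bit word, as the two
   insert instructions above do.  */
static uint32_t splat_byte (uint32_t c)
{
  c &= 0xff;
  c |= c << 8;    /* byte     -> halfword */
  c |= c << 16;   /* halfword -> word     */
  return c;
}

For example, splat_byte (0x2a) yields 0x2a2a2a2a.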
diff --git a/libc/sysdeps/powerpc/powerpc32/power4/strncmp.S b/libc/sysdeps/powerpc/powerpc32/power4/strncmp.S
index 724d9084a..89b961e78 100644
--- a/libc/sysdeps/powerpc/powerpc32/power4/strncmp.S
+++ b/libc/sysdeps/powerpc/powerpc32/power4/strncmp.S
@@ -24,7 +24,7 @@
EALIGN (strncmp, 4, 0)
-#define rTMP r0
+#define rTMP2 r0
#define rRTN r3
#define rSTR1 r3 /* first string arg */
#define rSTR2 r4 /* second string arg */
@@ -37,6 +37,7 @@ EALIGN (strncmp, 4, 0)
#define r7F7F r9 /* constant 0x7f7f7f7f */
#define rNEG r10 /* ~(word in s1 | 0x7f7f7f7f) */
#define rBITDIF r11 /* bits that differ in s1 & s2 words */
+#define rTMP r12
dcbt 0,rSTR1
or rTMP, rSTR2, rSTR1
@@ -75,12 +76,45 @@ L(g1): add rTMP, rFEFE, rWORD1
we don't compare two strings as different because of gunk beyond
the end of the strings... */
+#ifdef __LITTLE_ENDIAN__
+L(endstring):
+ slwi rTMP, rTMP, 1
+ addi rTMP2, rTMP, -1
+ andc rTMP2, rTMP2, rTMP
+ and rWORD2, rWORD2, rTMP2 /* Mask off gunk. */
+ and rWORD1, rWORD1, rTMP2
+ rlwinm rTMP2, rWORD2, 8, 0xffffffff /* Byte reverse word. */
+ rlwinm rTMP, rWORD1, 8, 0xffffffff
+ rldimi rTMP2, rWORD2, 24, 32
+ rldimi rTMP, rWORD1, 24, 32
+ rlwimi rTMP2, rWORD2, 24, 16, 23
+ rlwimi rTMP, rWORD1, 24, 16, 23
+ xor. rBITDIF, rTMP, rTMP2
+ sub rRTN, rTMP, rTMP2
+ bgelr+
+ ori rRTN, rTMP2, 1
+ blr
+
+L(different):
+ lwz rWORD1, -4(rSTR1)
+ rlwinm rTMP2, rWORD2, 8, 0xffffffff /* Byte reverse word. */
+ rlwinm rTMP, rWORD1, 8, 0xffffffff
+ rldimi rTMP2, rWORD2, 24, 32
+ rldimi rTMP, rWORD1, 24, 32
+ rlwimi rTMP2, rWORD2, 24, 16, 23
+ rlwimi rTMP, rWORD1, 24, 16, 23
+ xor. rBITDIF, rTMP, rTMP2
+ sub rRTN, rTMP, rTMP2
+ bgelr+
+ ori rRTN, rTMP2, 1
+ blr
+
+#else
L(endstring):
and rTMP, r7F7F, rWORD1
beq cr1, L(equal)
add rTMP, rTMP, r7F7F
xor. rBITDIF, rWORD1, rWORD2
-
andc rNEG, rNEG, rTMP
blt- L(highbit)
cntlzw rBITDIF, rBITDIF
@@ -88,28 +122,20 @@ L(endstring):
addi rNEG, rNEG, 7
cmpw cr1, rNEG, rBITDIF
sub rRTN, rWORD1, rWORD2
- blt- cr1, L(equal)
- srawi rRTN, rRTN, 31
- ori rRTN, rRTN, 1
- blr
+ bgelr+ cr1
L(equal):
li rRTN, 0
blr
L(different):
- lwzu rWORD1, -4(rSTR1)
+ lwz rWORD1, -4(rSTR1)
xor. rBITDIF, rWORD1, rWORD2
sub rRTN, rWORD1, rWORD2
- blt- L(highbit)
- srawi rRTN, rRTN, 31
- ori rRTN, rRTN, 1
- blr
+ bgelr+
L(highbit):
- srwi rWORD2, rWORD2, 24
- srwi rWORD1, rWORD1, 24
- sub rRTN, rWORD1, rWORD2
+ ori rRTN, rWORD2, 1
blr
-
+#endif
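The little-endian L(endstring) path above can be read as: build a mask covering the bytes up to and including the first NUL flagged in the detection word, mask both words, byte-reverse them so an ordinary unsigned compare reflects string order, and take the difference. A rough C equivalent is sketched below; it returns only the sign (the assembly returns a different nonzero magnitude) and uses a portable byte-swap helper in place of the rlwinm/rldimi/rlwimi sequence:

#include <stdint.h>

static uint32_t bswap32 (uint32_t x)
{
  return (x >> 24) | ((x >> 8) & 0x0000ff00u)
         | ((x << 8) & 0x00ff0000u) | (x << 24);
}

/* zmask has 0x80 set in (at least) the first zero byte of w1; on
   little-endian that byte is the lowest-order one.  When zmask is 0
   (the count ran out instead), the mask keeps the whole word.  */
static int endstring_le (uint32_t w1, uint32_t w2, uint32_t zmask)
{
  uint32_t t = zmask << 1;               /* slwi rTMP, rTMP, 1          */
  uint32_t keep = (t - 1) & ~t;          /* addi/andc: bytes <= 1st NUL */
  uint32_t a = bswap32 (w1 & keep);
  uint32_t b = bswap32 (w2 & keep);
  return a == b ? 0 : (a > b ? 1 : -1);
}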
/* Oh well. In this case, we just do a byte-by-byte comparison. */
.align 4
diff --git a/libc/sysdeps/powerpc/powerpc32/power5+/fpu/s_llround.S b/libc/sysdeps/powerpc/powerpc32/power5+/fpu/s_llround.S
index ecd37c3cd..49c8a0866 100644
--- a/libc/sysdeps/powerpc/powerpc32/power5+/fpu/s_llround.S
+++ b/libc/sysdeps/powerpc/powerpc32/power5+/fpu/s_llround.S
@@ -39,8 +39,8 @@ ENTRY (__llround)
nop /* Ensure the following load is in a different dispatch */
nop /* group to avoid pipe stall on POWER4&5. */
nop
- lwz r4,12(r1)
- lwz r3,8(r1)
+ lwz r3,8+HIWORD(r1)
+ lwz r4,8+LOWORD(r1)
addi r1,r1,16
blr
END (__llround)
diff --git a/libc/sysdeps/powerpc/powerpc32/power5+/fpu/s_lround.S b/libc/sysdeps/powerpc/powerpc32/power5+/fpu/s_lround.S
index d4da625bb..780dd9ca4 100644
--- a/libc/sysdeps/powerpc/powerpc32/power5+/fpu/s_lround.S
+++ b/libc/sysdeps/powerpc/powerpc32/power5+/fpu/s_lround.S
@@ -38,7 +38,7 @@ ENTRY (__lround)
nop /* Ensure the following load is in a different dispatch */
nop /* group to avoid pipe stall on POWER4&5. */
nop
- lwz r3,12(r1)
+ lwz r3,8+LOWORD(r1)
addi r1,r1,16
blr
END (__lround)
diff --git a/libc/sysdeps/powerpc/powerpc32/power5/fpu/s_isnan.S b/libc/sysdeps/powerpc/powerpc32/power5/fpu/s_isnan.S
index f2417fdf4..5f7ba43a2 100644
--- a/libc/sysdeps/powerpc/powerpc32/power5/fpu/s_isnan.S
+++ b/libc/sysdeps/powerpc/powerpc32/power5/fpu/s_isnan.S
@@ -27,8 +27,8 @@ EALIGN (__isnan, 4, 0)
ori r1,r1,0
stfd fp1,24(r1) /* copy FPR to GPR */
ori r1,r1,0
- lwz r4,24(r1)
- lwz r5,28(r1)
+ lwz r4,24+HIWORD(r1)
+ lwz r5,24+LOWORD(r1)
lis r0,0x7ff0 /* const long r0 0x7ff00000 00000000 */
clrlwi r4,r4,1 /* x = fabs(x) */
cmpw cr7,r4,r0 /* if (fabs(x) <= inf) */
diff --git a/libc/sysdeps/powerpc/powerpc32/power6/fpu/s_isnan.S b/libc/sysdeps/powerpc/powerpc32/power6/fpu/s_isnan.S
index 2c095db1d..3ea18589c 100644
--- a/libc/sysdeps/powerpc/powerpc32/power6/fpu/s_isnan.S
+++ b/libc/sysdeps/powerpc/powerpc32/power6/fpu/s_isnan.S
@@ -27,8 +27,8 @@ EALIGN (__isnan, 4, 0)
ori r1,r1,0
stfd fp1,24(r1) /* copy FPR to GPR */
ori r1,r1,0
- lwz r4,24(r1)
- lwz r5,28(r1)
+ lwz r4,24+HIWORD(r1)
+ lwz r5,24+LOWORD(r1)
lis r0,0x7ff0 /* const long r0 0x7ff00000 00000000 */
clrlwi r4,r4,1 /* x = fabs(x) */
cmpw cr7,r4,r0 /* if (fabs(x) =< inf) */
diff --git a/libc/sysdeps/powerpc/powerpc32/power6/fpu/s_llrint.S b/libc/sysdeps/powerpc/powerpc32/power6/fpu/s_llrint.S
index 3344b312e..c0660cf6e 100644
--- a/libc/sysdeps/powerpc/powerpc32/power6/fpu/s_llrint.S
+++ b/libc/sysdeps/powerpc/powerpc32/power6/fpu/s_llrint.S
@@ -29,8 +29,8 @@ ENTRY (__llrint)
/* Ensure the following load is in a different dispatch group by
inserting "group ending nop". */
ori r1,r1,0
- lwz r3,8(r1)
- lwz r4,12(r1)
+ lwz r3,8+HIWORD(r1)
+ lwz r4,8+LOWORD(r1)
addi r1,r1,16
blr
END (__llrint)
diff --git a/libc/sysdeps/powerpc/powerpc32/power6/fpu/s_llrintf.S b/libc/sysdeps/powerpc/powerpc32/power6/fpu/s_llrintf.S
index 7f64f8d12..ce298905c 100644
--- a/libc/sysdeps/powerpc/powerpc32/power6/fpu/s_llrintf.S
+++ b/libc/sysdeps/powerpc/powerpc32/power6/fpu/s_llrintf.S
@@ -28,8 +28,8 @@ ENTRY (__llrintf)
/* Ensure the following load is in a different dispatch group by
inserting "group ending nop". */
ori r1,r1,0
- lwz r3,8(r1)
- lwz r4,12(r1)
+ lwz r3,8+HIWORD(r1)
+ lwz r4,8+LOWORD(r1)
addi r1,r1,16
blr
END (__llrintf)
diff --git a/libc/sysdeps/powerpc/powerpc32/power6/fpu/s_llround.S b/libc/sysdeps/powerpc/powerpc32/power6/fpu/s_llround.S
index 0ff04cb71..abb0840d1 100644
--- a/libc/sysdeps/powerpc/powerpc32/power6/fpu/s_llround.S
+++ b/libc/sysdeps/powerpc/powerpc32/power6/fpu/s_llround.S
@@ -39,8 +39,8 @@ ENTRY (__llround)
/* Ensure the following load is in a different dispatch group by
inserting "group ending nop". */
ori r1,r1,0
- lwz r4,12(r1)
- lwz r3,8(r1)
+ lwz r3,8+HIWORD(r1)
+ lwz r4,8+LOWORD(r1)
addi r1,r1,16
blr
END (__llround)
diff --git a/libc/sysdeps/powerpc/powerpc32/power6/memcpy.S b/libc/sysdeps/powerpc/powerpc32/power6/memcpy.S
index a76f71e04..f58114a0c 100644
--- a/libc/sysdeps/powerpc/powerpc32/power6/memcpy.S
+++ b/libc/sysdeps/powerpc/powerpc32/power6/memcpy.S
@@ -219,15 +219,28 @@ L(word_unaligned_short):
blt cr6,5f
srwi 7,6,16
bgt cr6,3f
+#ifdef __LITTLE_ENDIAN__
+ sth 7,0(3)
+#else
sth 6,0(3)
+#endif
b 7f
.align 4
3:
+#ifdef __LITTLE_ENDIAN__
+ rotlwi 6,6,24
+ stb 6,0(3)
+ sth 7,1(3)
+#else
stb 7,0(3)
sth 6,1(3)
+#endif
b 7f
.align 4
5:
+#ifdef __LITTLE_ENDIAN__
+ rotlwi 6,6,8
+#endif
stb 6,0(3)
7:
cmplwi cr1,10,16
@@ -577,7 +590,11 @@ L(wdu1_32):
lwz 6,-1(4)
cmplwi cr6,31,4
srwi 8,31,5 /* calculate the 32 byte loop count */
+#ifdef __LITTLE_ENDIAN__
+ srwi 6,6,8
+#else
slwi 6,6,8
+#endif
clrlwi 31,31,27 /* The remaining bytes, < 32. */
blt cr5,L(wdu1_32tail)
mtctr 8
@@ -585,8 +602,12 @@ L(wdu1_32):
lwz 8,3(4)
lwz 7,4(4)
+#ifdef __LITTLE_ENDIAN__
+ rldimi 6,8,24,32
+#else
/* Equivalent to: srwi 8,8,32-8; or 6,6,8 */
rlwimi 6,8,8,(32-8),31
+#endif
b L(wdu1_loop32x)
.align 4
L(wdu1_loop32):
@@ -595,8 +616,12 @@ L(wdu1_loop32):
lwz 7,4(4)
stw 10,-8(3)
stw 11,-4(3)
+#ifdef __LITTLE_ENDIAN__
+ rldimi 6,8,24,32
+#else
/* Equivalent to srwi 8,8,32-8; or 6,6,8 */
rlwimi 6,8,8,(32-8),31
+#endif
L(wdu1_loop32x):
lwz 10,8(4)
lwz 11,12(4)
@@ -613,7 +638,11 @@ L(wdu1_loop32x):
stw 6,16(3)
stw 7,20(3)
addi 3,3,32
+#ifdef __LITTLE_ENDIAN__
+ srwi 6,8,8
+#else
slwi 6,8,8
+#endif
bdnz+ L(wdu1_loop32)
stw 10,-8(3)
stw 11,-4(3)
@@ -624,8 +653,12 @@ L(wdu1_32tail):
blt cr6,L(wdu_4tail)
/* calculate and store the final word */
lwz 8,3(4)
-/* Equivalent to: srwi 8,8,32-9; or 6,6,8 */
+#ifdef __LITTLE_ENDIAN__
+ rldimi 6,8,24,32
+#else
+/* Equivalent to: srwi 8,8,32-8; or 6,6,8 */
rlwimi 6,8,8,(32-8),31
+#endif
b L(wdu_32tailx)
L(wdu2_32):
@@ -633,7 +666,11 @@ L(wdu2_32):
lwz 6,-2(4)
cmplwi cr6,31,4
srwi 8,31,5 /* calculate the 32 byte loop count */
+#ifdef __LITTLE_ENDIAN__
+ srwi 6,6,16
+#else
slwi 6,6,16
+#endif
clrlwi 31,31,27 /* The remaining bytes, < 32. */
blt cr5,L(wdu2_32tail)
mtctr 8
@@ -641,8 +678,11 @@ L(wdu2_32):
lwz 8,2(4)
lwz 7,4(4)
-/* Equivalent to: srwi 8,8,32-8; or 6,6,8 */
+#ifdef __LITTLE_ENDIAN__
+ rldimi 6,8,16,32
+#else
rlwimi 6,8,16,(32-16),31
+#endif
b L(wdu2_loop32x)
.align 4
L(wdu2_loop32):
@@ -651,8 +691,11 @@ L(wdu2_loop32):
lwz 7,4(4)
stw 10,-8(3)
stw 11,-4(3)
-/* Equivalent to srwi 8,8,32-8; or 6,6,8 */
+#ifdef __LITTLE_ENDIAN__
+ rldimi 6,8,16,32
+#else
rlwimi 6,8,16,(32-16),31
+#endif
L(wdu2_loop32x):
lwz 10,8(4)
lwz 11,12(4)
@@ -670,7 +713,11 @@ L(wdu2_loop32x):
stw 6,16(3)
stw 7,20(3)
addi 3,3,32
+#ifdef __LITTLE_ENDIAN__
+ srwi 6,8,16
+#else
slwi 6,8,16
+#endif
bdnz+ L(wdu2_loop32)
stw 10,-8(3)
stw 11,-4(3)
@@ -681,8 +728,11 @@ L(wdu2_32tail):
blt cr6,L(wdu_4tail)
/* calculate and store the final word */
lwz 8,2(4)
-/* Equivalent to: srwi 8,8,32-9; or 6,6,8 */
+#ifdef __LITTLE_ENDIAN__
+ rldimi 6,8,16,32
+#else
rlwimi 6,8,16,(32-16),31
+#endif
b L(wdu_32tailx)
L(wdu3_32):
@@ -690,7 +740,11 @@ L(wdu3_32):
lwz 6,-3(4)
cmplwi cr6,31,4
srwi 8,31,5 /* calculate the 32 byte loop count */
+#ifdef __LITTLE_ENDIAN__
+ srwi 6,6,24
+#else
slwi 6,6,24
+#endif
clrlwi 31,31,27 /* The remaining bytes, < 32. */
blt cr5,L(wdu3_32tail)
mtctr 8
@@ -698,8 +752,11 @@ L(wdu3_32):
lwz 8,1(4)
lwz 7,4(4)
-/* Equivalent to: srwi 8,8,32-8; or 6,6,8 */
+#ifdef __LITTLE_ENDIAN__
+ rldimi 6,8,8,32
+#else
rlwimi 6,8,24,(32-24),31
+#endif
b L(wdu3_loop32x)
.align 4
L(wdu3_loop32):
@@ -708,8 +765,11 @@ L(wdu3_loop32):
lwz 7,4(4)
stw 10,-8(3)
stw 11,-4(3)
-/* Equivalent to srwi 8,8,32-8; or 6,6,8 */
+#ifdef __LITTLE_ENDIAN__
+ rldimi 6,8,8,32
+#else
rlwimi 6,8,24,(32-24),31
+#endif
L(wdu3_loop32x):
lwz 10,8(4)
lwz 11,12(4)
@@ -726,7 +786,11 @@ L(wdu3_loop32x):
stw 6,16(3)
stw 7,20(3)
addi 3,3,32
+#ifdef __LITTLE_ENDIAN__
+ srwi 6,8,24
+#else
slwi 6,8,24
+#endif
bdnz+ L(wdu3_loop32)
stw 10,-8(3)
stw 11,-4(3)
@@ -737,8 +801,11 @@ L(wdu3_32tail):
blt cr6,L(wdu_4tail)
/* calculate and store the final word */
lwz 8,1(4)
-/* Equivalent to: srwi 8,8,32-9; or 6,6,8 */
+#ifdef __LITTLE_ENDIAN__
+ rldimi 6,8,8,32
+#else
rlwimi 6,8,24,(32-24),31
+#endif
b L(wdu_32tailx)
.align 4
L(wdu_32tailx):
diff --git a/libc/sysdeps/powerpc/powerpc32/power6/memset.S b/libc/sysdeps/powerpc/powerpc32/power6/memset.S
index 8c23c8d13..a4b002a96 100644
--- a/libc/sysdeps/powerpc/powerpc32/power6/memset.S
+++ b/libc/sysdeps/powerpc/powerpc32/power6/memset.S
@@ -48,7 +48,7 @@ L(_memset):
ble- cr1, L(small)
/* Align to word boundary. */
cmplwi cr5, rLEN, 31
- rlwimi rCHR, rCHR, 8, 16, 23 /* Replicate byte to halfword. */
+ insrdi rCHR, rCHR, 8, 48 /* Replicate byte to halfword. */
beq+ L(aligned)
mtcrf 0x01, rMEMP0
subfic rALIGN, rALIGN, 4
@@ -64,7 +64,7 @@ L(g0):
/* Handle the case of size < 31. */
L(aligned):
mtcrf 0x01, rLEN
- rlwimi rCHR, rCHR, 16, 0, 15 /* Replicate halfword to word. */
+ insrdi rCHR, rCHR, 16, 32 /* Replicate halfword to word. */
ble cr5, L(medium)
/* Align to 32-byte boundary. */
andi. rALIGN, rMEMP, 0x1C
diff --git a/libc/sysdeps/powerpc/powerpc32/power7/fpu/s_finite.S b/libc/sysdeps/powerpc/powerpc32/power7/fpu/s_finite.S
index b2ab5bfe7..095c15547 100644
--- a/libc/sysdeps/powerpc/powerpc32/power7/fpu/s_finite.S
+++ b/libc/sysdeps/powerpc/powerpc32/power7/fpu/s_finite.S
@@ -54,9 +54,8 @@ ENTRY (__finite)
stfd fp1,8(r1) /* Transfer FP to GPR's. */
ori 2,2,0 /* Force a new dispatch group. */
- lhz r0,8(r1) /* Fetch the upper portion of the high word of
- the FP value (where the exponent and sign bits
- are). */
+ lhz r0,8+HISHORT(r1) /* Fetch the upper 16 bits of the FP value
+ (biased exponent and sign bit). */
clrlwi r0,r0,17 /* r0 = abs(r0). */
addi r1,r1,16 /* Reset the stack pointer. */
cmpwi cr7,r0,0x7ff0 /* r4 == 0x7ff0?. */
diff --git a/libc/sysdeps/powerpc/powerpc32/power7/fpu/s_isinf.S b/libc/sysdeps/powerpc/powerpc32/power7/fpu/s_isinf.S
index 3f8af60a5..0101c8fa1 100644
--- a/libc/sysdeps/powerpc/powerpc32/power7/fpu/s_isinf.S
+++ b/libc/sysdeps/powerpc/powerpc32/power7/fpu/s_isinf.S
@@ -48,14 +48,13 @@ ENTRY (__isinf)
li r3,0
bflr 29 /* If not INF, return. */
- /* Either we have -INF/+INF or a denormal. */
+ /* Either we have +INF or -INF. */
stwu r1,-16(r1) /* Allocate stack space. */
stfd fp1,8(r1) /* Transfer FP to GPR's. */
ori 2,2,0 /* Force a new dispatch group. */
- lhz r4,8(r1) /* Fetch the upper portion of the high word of
- the FP value (where the exponent and sign bits
- are). */
+ lhz r4,8+HISHORT(r1) /* Fetch the upper 16 bits of the FP value
+ (biased exponent and sign bit). */
addi r1,r1,16 /* Reset the stack pointer. */
cmpwi cr7,r4,0x7ff0 /* r4 == 0x7ff0? */
li r3,1
diff --git a/libc/sysdeps/powerpc/powerpc32/power7/fpu/s_isnan.S b/libc/sysdeps/powerpc/powerpc32/power7/fpu/s_isnan.S
index 99ff12696..0ad1dcf1f 100644
--- a/libc/sysdeps/powerpc/powerpc32/power7/fpu/s_isnan.S
+++ b/libc/sysdeps/powerpc/powerpc32/power7/fpu/s_isnan.S
@@ -53,8 +53,8 @@ ENTRY (__isnan)
stwu r1,-16(r1) /* Allocate stack space. */
stfd fp1,8(r1) /* Transfer FP to GPR's. */
ori 2,2,0 /* Force a new dispatch group. */
- lwz r4,8(r1) /* Load the upper half of the FP value. */
- lwz r5,12(r1) /* Load the lower half of the FP value. */
+ lwz r4,8+HIWORD(r1) /* Load the upper half of the FP value. */
+ lwz r5,8+LOWORD(r1) /* Load the lower half of the FP value. */
addi r1,r1,16 /* Reset the stack pointer. */
lis r0,0x7ff0 /* Load the upper portion for an INF/NaN. */
clrlwi r4,r4,1 /* r4 = abs(r4). */
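HIWORD/LOWORD (defined elsewhere in the sysdeps headers) pick out whichever 32-bit half of the spilled double holds the sign and exponent, so the same code works on both byte orders. A C sketch of the classification test these routines perform, using a union instead of the stack store (a sketch, not the installed implementation):

#include <stdint.h>

static int isnan_sketch (double x)
{
  union { double d; uint32_t w[2]; } u = { .d = x };
#ifdef __LITTLE_ENDIAN__
  uint32_t hi = u.w[1], lo = u.w[0];   /* low word comes first in memory */
#else
  uint32_t hi = u.w[0], lo = u.w[1];
#endif
  hi &= 0x7fffffffu;                   /* clrlwi r4,r4,1: drop the sign  */
  /* NaN: exponent field all ones and a nonzero mantissa.  */
  return hi > 0x7ff00000u || (hi == 0x7ff00000u && lo != 0);
}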
diff --git a/libc/sysdeps/powerpc/powerpc32/power7/fpu/s_logbl.c b/libc/sysdeps/powerpc/powerpc32/power7/fpu/s_logbl.c
index e008ed0c3..1c82577f5 100644
--- a/libc/sysdeps/powerpc/powerpc32/power7/fpu/s_logbl.c
+++ b/libc/sysdeps/powerpc/powerpc32/power7/fpu/s_logbl.c
@@ -35,14 +35,14 @@ static const union {
long double
__logbl (long double x)
{
- double xh, xl;
+ double xh;
double ret;
if (__builtin_expect (x == 0.0L, 0))
/* Raise FE_DIVBYZERO and return -HUGE_VAL[LF]. */
return -1.0L / __builtin_fabsl (x);
- ldbl_unpack (x, &xh, &xl);
+ xh = ldbl_high (x);
/* ret = x & 0x7ff0000000000000; */
asm (
"xxland %x0,%x1,%x2\n"
@@ -58,9 +58,9 @@ __logbl (long double x)
{
/* POSIX specifies that a denormal number is treated as
though it were normalized. */
- int64_t lx, hx;
+ int64_t hx;
- GET_LDOUBLE_WORDS64 (hx, lx, x);
+ EXTRACT_WORDS64 (hx, xh);
return (long double) (-1023 - (__builtin_clzll (hx) - 12));
}
/* Test to avoid logb_downward (0.0) == -0.0. */
diff --git a/libc/sysdeps/powerpc/powerpc32/power7/memchr.S b/libc/sysdeps/powerpc/powerpc32/power7/memchr.S
index 369e5e048..85754f3f1 100644
--- a/libc/sysdeps/powerpc/powerpc32/power7/memchr.S
+++ b/libc/sysdeps/powerpc/powerpc32/power7/memchr.S
@@ -25,107 +25,111 @@ ENTRY (__memchr)
CALL_MCOUNT
dcbt 0,r3
clrrwi r8,r3,2
- rlwimi r4,r4,8,16,23
- rlwimi r4,r4,16,0,15
+ insrdi r4,r4,8,48
add r7,r3,r5 /* Calculate the last acceptable address. */
+ insrdi r4,r4,16,32
cmplwi r5,16
+ li r9, -1
+ rlwinm r6,r3,3,27,28 /* Calculate padding. */
+ addi r7,r7,-1
+#ifdef __LITTLE_ENDIAN__
+ slw r9,r9,r6
+#else
+ srw r9,r9,r6
+#endif
ble L(small_range)
- cmplw cr7,r3,r7 /* Compare the starting address (r3) with the
- ending address (r7). If (r3 >= r7), the size
- passed in is zero or negative. */
- ble cr7,L(proceed)
-
- li r7,-1 /* Artificially set our ending address (r7)
- such that we will exit early. */
-L(proceed):
- rlwinm r6,r3,3,27,28 /* Calculate padding. */
- cmpli cr6,r6,0 /* cr6 == Do we have padding? */
lwz r12,0(r8) /* Load word from memory. */
- cmpb r10,r12,r4 /* Check for BYTEs in WORD1. */
- beq cr6,L(proceed_no_padding)
- slw r10,r10,r6
- srw r10,r10,r6
-L(proceed_no_padding):
- cmplwi cr7,r10,0 /* If r10 == 0, no BYTEs have been found. */
+ cmpb r3,r12,r4 /* Check for BYTEs in WORD1. */
+ and r3,r3,r9
+ clrlwi r5,r7,30 /* Byte count - 1 in last word. */
+ clrrwi r7,r7,2 /* Address of last word. */
+ cmplwi cr7,r3,0 /* If r3 == 0, no BYTEs have been found. */
bne cr7,L(done)
- /* Are we done already? */
- addi r9,r8,4
- cmplw cr6,r9,r7
- bge cr6,L(null)
-
mtcrf 0x01,r8
/* Are we now aligned to a doubleword boundary? If so, skip to
the main loop. Otherwise, go through the alignment code. */
-
bt 29,L(loop_setup)
/* Handle WORD2 of pair. */
lwzu r12,4(r8)
- cmpb r10,r12,r4
- cmplwi cr7,r10,0
+ cmpb r3,r12,r4
+ cmplwi cr7,r3,0
bne cr7,L(done)
- /* Are we done already? */
- addi r9,r8,4
- cmplw cr6,r9,r7
- bge cr6,L(null)
-
L(loop_setup):
- sub r5,r7,r9
- srwi r6,r5,3 /* Number of loop iterations. */
+ /* The last word we want to read in the loop below is the one
+ containing the last byte of the string, ie. the word at
+ (s + size - 1) & ~3, or r7. The first word read is at
+ r8 + 4, we read 2 * cnt words, so the last word read will
+ be at r8 + 4 + 8 * cnt - 4. Solving for cnt gives
+ cnt = (r7 - r8) / 8 */
+ sub r6,r7,r8
+ srwi r6,r6,3 /* Number of loop iterations. */
mtctr r6 /* Setup the counter. */
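The same count can be checked with ordinary pointer arithmetic; a small sketch under the comment's assumptions (r8 is the word-aligned read position, r7 the address of the word holding the last byte):

#include <stdint.h>
#include <stddef.h>

/* Two words are read per iteration starting at r8 + 4, so the last
   word read inside the loop is r8 + 8*cnt; choosing cnt = (r7 - r8)/8
   (rounded down) means at most one extra word remains afterwards,
   which is the "one more dword" case handled after the loop.  */
static size_t loop_count (uintptr_t r8, uintptr_t r7)
{
  return (size_t) (r7 - r8) >> 3;   /* sub r6,r7,r8 ; srwi r6,r6,3 */
}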
- b L(loop)
- /* Main loop to look for BYTE backwards in the string. Since
- it's a small loop (< 8 instructions), align it to 32-bytes. */
- .p2align 5
+
+ /* Main loop to look for BYTE in the string. Since
+ it's a small loop (8 instructions), align it to 32-bytes. */
+ .align 5
L(loop):
/* Load two words, compare and merge in a
single register for speed. This is an attempt
to speed up the byte-checking process for bigger strings. */
lwz r12,4(r8)
lwzu r11,8(r8)
- cmpb r10,r12,r4
+ cmpb r3,r12,r4
cmpb r9,r11,r4
- or r5,r9,r10 /* Merge everything in one word. */
- cmplwi cr7,r5,0
+ or r6,r9,r3 /* Merge everything in one word. */
+ cmplwi cr7,r6,0
bne cr7,L(found)
bdnz L(loop)
- /* We're here because the counter reached 0, and that means we
- didn't have any matches for BYTE in the whole range. */
- subi r11,r7,4
- cmplw cr6,r8,r11
- blt cr6,L(loop_small)
- b L(null)
+ /* We may have one more dword to read. */
+ cmplw r8,r7
+ beqlr
+
+ lwzu r12,4(r8)
+ cmpb r3,r12,r4
+ cmplwi cr6,r3,0
+ bne cr6,L(done)
+ blr
+ .align 4
+L(found):
/* OK, one (or both) of the words contains BYTE. Check
the first word and decrement the address in case the first
word really contains BYTE. */
- .align 4
-L(found):
- cmplwi cr6,r10,0
+ cmplwi cr6,r3,0
addi r8,r8,-4
bne cr6,L(done)
/* BYTE must be in the second word. Adjust the address
- again and move the result of cmpb to r10 so we can calculate the
+ again and move the result of cmpb to r3 so we can calculate the
pointer. */
- mr r10,r9
+ mr r3,r9
addi r8,r8,4
- /* r10 has the output of the cmpb instruction, that is, it contains
+ /* r3 has the output of the cmpb instruction, that is, it contains
0xff in the same position as BYTE in the original
word from the string. Use that to calculate the pointer.
We need to make sure BYTE is *before* the end of the range. */
L(done):
- cntlzw r0,r10 /* Count leading zeroes before the match. */
- srwi r0,r0,3 /* Convert leading zeroes to bytes. */
+#ifdef __LITTLE_ENDIAN__
+ addi r0,r3,-1
+ andc r0,r0,r3
+ popcntw r0,r0 /* Count trailing zeros. */
+#else
+ cntlzw r0,r3 /* Count leading zeros before the match. */
+#endif
+ cmplw r8,r7 /* Are we on the last word? */
+ srwi r0,r0,3 /* Convert leading/trailing zeros to bytes. */
add r3,r8,r0
- cmplw r3,r7
- bge L(null)
+ cmplw cr7,r0,r5 /* If on the last dword, check byte offset. */
+ bnelr
+ blelr cr7
+ li r3,0
blr
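The L(done) path converts the cmpb result (0xff in every matching byte lane) into a byte offset: leading zeros on big-endian, trailing zeros on little-endian, where the addi/andc/popcntw triple is a count-trailing-zeros that is also well defined for a zero input. A sketch of both halves, with cmpb modelled by a hypothetical helper:

#include <stdint.h>

/* Model of cmpb: 0xff in every byte of w that equals the byte c.  */
static uint32_t cmpb32 (uint32_t w, unsigned char c)
{
  uint32_t r = 0;
  for (int i = 0; i < 4; i++)
    if (((w >> (8 * i)) & 0xff) == c)
      r |= 0xffu << (8 * i);
  return r;
}

/* Byte offset (in string order) of the first match; 'match' must be
   nonzero, which holds whenever L(done) is reached after a hit.  */
static unsigned first_match_offset (uint32_t match)
{
#ifdef __LITTLE_ENDIAN__
  return (unsigned) __builtin_ctz (match) >> 3;   /* first byte = low lane  */
#else
  return (unsigned) __builtin_clz (match) >> 3;   /* first byte = high lane */
#endif
}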
.align 4
@@ -137,67 +141,42 @@ L(null):
.align 4
L(small_range):
cmplwi r5,0
- rlwinm r6,r3,3,27,28 /* Calculate padding. */
- beq L(null) /* This branch is for the cmplwi r5,0 above */
+ beq L(null)
lwz r12,0(r8) /* Load word from memory. */
- cmplwi cr6,r6,0 /* cr6 == Do we have padding? */
- cmpb r10,r12,r4 /* Check for BYTE in DWORD1. */
- beq cr6,L(small_no_padding)
- slw r10,r10,r6
- srw r10,r10,r6
-L(small_no_padding):
- cmplwi cr7,r10,0
+ cmpb r3,r12,r4 /* Check for BYTE in DWORD1. */
+ and r3,r3,r9
+ cmplwi cr7,r3,0
+ clrlwi r5,r7,30 /* Byte count - 1 in last word. */
+ clrrwi r7,r7,2 /* Address of last word. */
+ cmplw r8,r7 /* Are we done already? */
bne cr7,L(done)
+ beqlr
- /* Are we done already? */
- addi r9,r8,4
- cmplw r9,r7
- bge L(null)
-
-L(loop_small): /* loop_small has been unrolled. */
lwzu r12,4(r8)
- cmpb r10,r12,r4
- addi r9,r8,4
- cmplwi cr6,r10,0
- cmplw r9,r7
+ cmpb r3,r12,r4
+ cmplwi cr6,r3,0
+ cmplw r8,r7
bne cr6,L(done)
- bge L(null)
+ beqlr
lwzu r12,4(r8)
- cmpb r10,r12,r4
- addi r9,r8,4
- cmplwi cr6,r10,0
- cmplw r9,r7
+ cmpb r3,r12,r4
+ cmplwi cr6,r3,0
+ cmplw r8,r7
bne cr6,L(done)
- bge L(null)
+ beqlr
lwzu r12,4(r8)
- cmpb r10,r12,r4
- addi r9,r8,4
- cmplwi cr6,r10,0
- cmplw r9,r7
+ cmpb r3,r12,r4
+ cmplwi cr6,r3,0
+ cmplw r8,r7
bne cr6,L(done)
- bge L(null)
+ beqlr
lwzu r12,4(r8)
- cmpb r10,r12,r4
- addi r9,r8,4
- cmplwi cr6,r10,0
- cmplw r9,r7
+ cmpb r3,r12,r4
+ cmplwi cr6,r3,0
bne cr6,L(done)
- bge L(null)
-
- /* For most cases we will never get here. Under some combinations of
- padding + length there is a leftover word that still needs to be
- checked. */
- lwzu r12,4(r8)
- cmpb r10,r12,r4
- addi r9,r8,4
- cmplwi cr6,r10,0
- bne cr6,L(done)
-
- /* save a branch and exit directly */
- li r3,0
blr
END (__memchr)
diff --git a/libc/sysdeps/powerpc/powerpc32/power7/memcmp.S b/libc/sysdeps/powerpc/powerpc32/power7/memcmp.S
index 075e19f14..f160ddebf 100644
--- a/libc/sysdeps/powerpc/powerpc32/power7/memcmp.S
+++ b/libc/sysdeps/powerpc/powerpc32/power7/memcmp.S
@@ -23,10 +23,9 @@
size_t size [r5]) */
.machine power7
-EALIGN (memcmp,4,0)
+EALIGN (memcmp, 4, 0)
CALL_MCOUNT
-#define rTMP r0
#define rRTN r3
#define rSTR1 r3 /* first string arg */
#define rSTR2 r4 /* second string arg */
@@ -37,35 +36,32 @@ EALIGN (memcmp,4,0)
#define rWORD4 r9 /* next word in s2 */
#define rWORD5 r10 /* next word in s1 */
#define rWORD6 r11 /* next word in s2 */
-#define rBITDIF r12 /* bits that differ in s1 & s2 words */
#define rWORD7 r30 /* next word in s1 */
#define rWORD8 r31 /* next word in s2 */
- xor rTMP,rSTR2,rSTR1
- cmplwi cr6,rN,0
- cmplwi cr1,rN,12
- clrlwi. rTMP,rTMP,30
- clrlwi rBITDIF,rSTR1,30
- cmplwi cr5,rBITDIF,0
- beq- cr6,L(zeroLength)
- dcbt 0,rSTR1
- dcbt 0,rSTR2
-
- /* If less than 8 bytes or not aligned, use the unaligned
- byte loop. */
-
- blt cr1,L(bytealigned)
- stwu 1,-64(1)
+ xor r0, rSTR2, rSTR1
+ cmplwi cr6, rN, 0
+ cmplwi cr1, rN, 12
+ clrlwi. r0, r0, 30
+ clrlwi r12, rSTR1, 30
+ cmplwi cr5, r12, 0
+ beq- cr6, L(zeroLength)
+ dcbt 0, rSTR1
+ dcbt 0, rSTR2
+/* If less than 8 bytes or not aligned, use the unaligned
+ byte loop. */
+ blt cr1, L(bytealigned)
+ stwu 1, -64(r1)
cfi_adjust_cfa_offset(64)
- stw r31,48(1)
- cfi_offset(31,(48-64))
- stw r30,44(1)
- cfi_offset(30,(44-64))
+ stw rWORD8, 48(r1)
+ cfi_offset(rWORD8, (48-64))
+ stw rWORD7, 44(r1)
+ cfi_offset(rWORD7, (44-64))
bne L(unaligned)
/* At this point we know both strings have the same alignment and the
- compare length is at least 8 bytes. rBITDIF contains the low order
+ compare length is at least 8 bytes. r12 contains the low order
2 bits of rSTR1 and cr5 contains the result of the logical compare
- of rBITDIF to 0. If rBITDIF == 0 then we are already word
+ of r12 to 0. If r12 == 0 then we are already word
aligned and can perform the word aligned loop.
Otherwise we know the two strings have the same alignment (but not
@@ -74,332 +70,541 @@ EALIGN (memcmp,4,0)
eliminate bits preceding the first byte. Since we want to join the
normal (word aligned) compare loop, starting at the second word,
we need to adjust the length (rN) and special case the loop
- versioning for the first word. This insures that the loop count is
+ versioning for the first word. This ensures that the loop count is
correct and the first word (shifted) is in the expected register pair. */
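The reason each load below gains an lwbrx twin: memcmp order is the lexicographic order of bytes, which agrees with unsigned integer order only when the byte at the lower address sits in the most significant position. Plain lwz gives that on big-endian; on little-endian the byte-reversing load restores it. A tiny illustration (a sketch, not library code):

#include <stdint.h>

/* Compare one aligned 4-byte block the way the word loop does: build
   both words most-significant-byte-first (lwz on BE, lwbrx on LE) and
   compare them as unsigned integers.  */
static int cmp_word_block (const unsigned char *a, const unsigned char *b)
{
  uint32_t wa = ((uint32_t) a[0] << 24) | ((uint32_t) a[1] << 16)
                | ((uint32_t) a[2] << 8) | a[3];
  uint32_t wb = ((uint32_t) b[0] << 24) | ((uint32_t) b[1] << 16)
                | ((uint32_t) b[2] << 8) | b[3];
  return wa == wb ? 0 : (wa > wb ? 1 : -1);   /* same sign as memcmp */
}

For example, cmp_word_block applied to "abcd" and "abce" is negative, matching memcmp of the same four bytes.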
.align 4
L(samealignment):
- clrrwi rSTR1,rSTR1,2
- clrrwi rSTR2,rSTR2,2
- beq cr5,L(Waligned)
- add rN,rN,rBITDIF
- slwi r11,rBITDIF,3
- srwi rTMP,rN,4 /* Divide by 16 */
- andi. rBITDIF,rN,12 /* Get the word remainder */
- lwz rWORD1,0(rSTR1)
- lwz rWORD2,0(rSTR2)
- cmplwi cr1,rBITDIF,8
- cmplwi cr7,rN,16
- clrlwi rN,rN,30
+ clrrwi rSTR1, rSTR1, 2
+ clrrwi rSTR2, rSTR2, 2
+ beq cr5, L(Waligned)
+ add rN, rN, r12
+ slwi rWORD6, r12, 3
+ srwi r0, rN, 4 /* Divide by 16 */
+ andi. r12, rN, 12 /* Get the word remainder */
+#ifdef __LITTLE_ENDIAN__
+ lwbrx rWORD1, 0, rSTR1
+ lwbrx rWORD2, 0, rSTR2
+ addi rSTR1, rSTR1, 4
+ addi rSTR2, rSTR2, 4
+#else
+ lwz rWORD1, 0(rSTR1)
+ lwz rWORD2, 0(rSTR2)
+#endif
+ cmplwi cr1, r12, 8
+ cmplwi cr7, rN, 16
+ clrlwi rN, rN, 30
beq L(dPs4)
- mtctr rTMP
- bgt cr1,L(dPs3)
- beq cr1,L(dPs2)
+ mtctr r0
+ bgt cr1, L(dPs3)
+ beq cr1, L(dPs2)
/* Remainder is 4 */
.align 3
L(dsP1):
- slw rWORD5,rWORD1,r11
- slw rWORD6,rWORD2,r11
- cmplw cr5,rWORD5,rWORD6
- blt cr7,L(dP1x)
+ slw rWORD5, rWORD1, rWORD6
+ slw rWORD6, rWORD2, rWORD6
+ cmplw cr5, rWORD5, rWORD6
+ blt cr7, L(dP1x)
/* Do something useful in this cycle since we have to branch anyway. */
- lwz rWORD1,4(rSTR1)
- lwz rWORD2,4(rSTR2)
- cmplw cr0,rWORD1,rWORD2
+#ifdef __LITTLE_ENDIAN__
+ lwbrx rWORD1, 0, rSTR1
+ lwbrx rWORD2, 0, rSTR2
+ addi rSTR1, rSTR1, 4
+ addi rSTR2, rSTR2, 4
+#else
+ lwz rWORD1, 4(rSTR1)
+ lwz rWORD2, 4(rSTR2)
+#endif
+ cmplw cr7, rWORD1, rWORD2
b L(dP1e)
/* Remainder is 8 */
.align 4
L(dPs2):
- slw rWORD5,rWORD1,r11
- slw rWORD6,rWORD2,r11
- cmplw cr6,rWORD5,rWORD6
- blt cr7,L(dP2x)
+ slw rWORD5, rWORD1, rWORD6
+ slw rWORD6, rWORD2, rWORD6
+ cmplw cr6, rWORD5, rWORD6
+ blt cr7, L(dP2x)
/* Do something useful in this cycle since we have to branch anyway. */
- lwz rWORD7,4(rSTR1)
- lwz rWORD8,4(rSTR2)
- cmplw cr5,rWORD7,rWORD8
+#ifdef __LITTLE_ENDIAN__
+ lwbrx rWORD7, 0, rSTR1
+ lwbrx rWORD8, 0, rSTR2
+ addi rSTR1, rSTR1, 4
+ addi rSTR2, rSTR2, 4
+#else
+ lwz rWORD7, 4(rSTR1)
+ lwz rWORD8, 4(rSTR2)
+#endif
+ cmplw cr5, rWORD7, rWORD8
b L(dP2e)
/* Remainder is 12 */
.align 4
L(dPs3):
- slw rWORD3,rWORD1,r11
- slw rWORD4,rWORD2,r11
- cmplw cr1,rWORD3,rWORD4
+ slw rWORD3, rWORD1, rWORD6
+ slw rWORD4, rWORD2, rWORD6
+ cmplw cr1, rWORD3, rWORD4
b L(dP3e)
/* Count is a multiple of 16, remainder is 0 */
.align 4
L(dPs4):
- mtctr rTMP
- slw rWORD1,rWORD1,r11
- slw rWORD2,rWORD2,r11
- cmplw cr0,rWORD1,rWORD2
+ mtctr r0
+ slw rWORD1, rWORD1, rWORD6
+ slw rWORD2, rWORD2, rWORD6
+ cmplw cr7, rWORD1, rWORD2
b L(dP4e)
/* At this point we know both strings are word aligned and the
compare length is at least 8 bytes. */
.align 4
L(Waligned):
- andi. rBITDIF,rN,12 /* Get the word remainder */
- srwi rTMP,rN,4 /* Divide by 16 */
- cmplwi cr1,rBITDIF,8
- cmplwi cr7,rN,16
- clrlwi rN,rN,30
+ andi. r12, rN, 12 /* Get the word remainder */
+ srwi r0, rN, 4 /* Divide by 16 */
+ cmplwi cr1, r12, 8
+ cmplwi cr7, rN, 16
+ clrlwi rN, rN, 30
beq L(dP4)
- bgt cr1,L(dP3)
- beq cr1,L(dP2)
+ bgt cr1, L(dP3)
+ beq cr1, L(dP2)
/* Remainder is 4 */
.align 4
L(dP1):
- mtctr rTMP
+ mtctr r0
/* Normally we'd use rWORD7/rWORD8 here, but since we might exit early
(8-15 byte compare), we want to use only volatile registers. This
means we can avoid restoring non-volatile registers since we did not
change any on the early exit path. The key here is that the non-early
exit path only cares about the condition code (cr5), not about which
register pair was used. */
- lwz rWORD5,0(rSTR1)
- lwz rWORD6,0(rSTR2)
- cmplw cr5,rWORD5,rWORD6
- blt cr7,L(dP1x)
- lwz rWORD1,4(rSTR1)
- lwz rWORD2,4(rSTR2)
- cmplw cr0,rWORD1,rWORD2
+#ifdef __LITTLE_ENDIAN__
+ lwbrx rWORD5, 0, rSTR1
+ lwbrx rWORD6, 0, rSTR2
+ addi rSTR1, rSTR1, 4
+ addi rSTR2, rSTR2, 4
+#else
+ lwz rWORD5, 0(rSTR1)
+ lwz rWORD6, 0(rSTR2)
+#endif
+ cmplw cr5, rWORD5, rWORD6
+ blt cr7, L(dP1x)
+#ifdef __LITTLE_ENDIAN__
+ lwbrx rWORD1, 0, rSTR1
+ lwbrx rWORD2, 0, rSTR2
+ addi rSTR1, rSTR1, 4
+ addi rSTR2, rSTR2, 4
+#else
+ lwz rWORD1, 4(rSTR1)
+ lwz rWORD2, 4(rSTR2)
+#endif
+ cmplw cr7, rWORD1, rWORD2
L(dP1e):
- lwz rWORD3,8(rSTR1)
- lwz rWORD4,8(rSTR2)
- cmplw cr1,rWORD3,rWORD4
- lwz rWORD5,12(rSTR1)
- lwz rWORD6,12(rSTR2)
- cmplw cr6,rWORD5,rWORD6
- bne cr5,L(dLcr5)
- bne cr0,L(dLcr0)
-
- lwzu rWORD7,16(rSTR1)
- lwzu rWORD8,16(rSTR2)
- bne cr1,L(dLcr1)
- cmplw cr5,rWORD7,rWORD8
+#ifdef __LITTLE_ENDIAN__
+ lwbrx rWORD3, 0, rSTR1
+ lwbrx rWORD4, 0, rSTR2
+ addi rSTR1, rSTR1, 4
+ addi rSTR2, rSTR2, 4
+#else
+ lwz rWORD3, 8(rSTR1)
+ lwz rWORD4, 8(rSTR2)
+#endif
+ cmplw cr1, rWORD3, rWORD4
+#ifdef __LITTLE_ENDIAN__
+ lwbrx rWORD5, 0, rSTR1
+ lwbrx rWORD6, 0, rSTR2
+ addi rSTR1, rSTR1, 4
+ addi rSTR2, rSTR2, 4
+#else
+ lwz rWORD5, 12(rSTR1)
+ lwz rWORD6, 12(rSTR2)
+#endif
+ cmplw cr6, rWORD5, rWORD6
+ bne cr5, L(dLcr5x)
+ bne cr7, L(dLcr7x)
+
+#ifdef __LITTLE_ENDIAN__
+ lwbrx rWORD7, 0, rSTR1
+ lwbrx rWORD8, 0, rSTR2
+ addi rSTR1, rSTR1, 4
+ addi rSTR2, rSTR2, 4
+#else
+ lwzu rWORD7, 16(rSTR1)
+ lwzu rWORD8, 16(rSTR2)
+#endif
+ bne cr1, L(dLcr1)
+ cmplw cr5, rWORD7, rWORD8
bdnz L(dLoop)
- bne cr6,L(dLcr6)
- lwz r30,44(1)
- lwz r31,48(1)
+ bne cr6, L(dLcr6)
+ lwz rWORD7, 44(r1)
+ lwz rWORD8, 48(r1)
.align 3
L(dP1x):
- slwi. r12,rN,3
- bne cr5,L(dLcr5)
- subfic rN,r12,32 /* Shift count is 32 - (rN * 8). */
- lwz 1,0(1)
+ slwi. r12, rN, 3
+ bne cr5, L(dLcr5x)
+ subfic rN, r12, 32 /* Shift count is 32 - (rN * 8). */
+ addi r1, r1, 64
+ cfi_adjust_cfa_offset(-64)
bne L(d00)
- li rRTN,0
+ li rRTN, 0
blr
/* Remainder is 8 */
.align 4
+ cfi_adjust_cfa_offset(64)
L(dP2):
- mtctr rTMP
- lwz rWORD5,0(rSTR1)
- lwz rWORD6,0(rSTR2)
- cmplw cr6,rWORD5,rWORD6
- blt cr7,L(dP2x)
- lwz rWORD7,4(rSTR1)
- lwz rWORD8,4(rSTR2)
- cmplw cr5,rWORD7,rWORD8
+ mtctr r0
+#ifdef __LITTLE_ENDIAN__
+ lwbrx rWORD5, 0, rSTR1
+ lwbrx rWORD6, 0, rSTR2
+ addi rSTR1, rSTR1, 4
+ addi rSTR2, rSTR2, 4
+#else
+ lwz rWORD5, 0(rSTR1)
+ lwz rWORD6, 0(rSTR2)
+#endif
+ cmplw cr6, rWORD5, rWORD6
+ blt cr7, L(dP2x)
+#ifdef __LITTLE_ENDIAN__
+ lwbrx rWORD7, 0, rSTR1
+ lwbrx rWORD8, 0, rSTR2
+ addi rSTR1, rSTR1, 4
+ addi rSTR2, rSTR2, 4
+#else
+ lwz rWORD7, 4(rSTR1)
+ lwz rWORD8, 4(rSTR2)
+#endif
+ cmplw cr5, rWORD7, rWORD8
L(dP2e):
- lwz rWORD1,8(rSTR1)
- lwz rWORD2,8(rSTR2)
- cmplw cr0,rWORD1,rWORD2
- lwz rWORD3,12(rSTR1)
- lwz rWORD4,12(rSTR2)
- cmplw cr1,rWORD3,rWORD4
- addi rSTR1,rSTR1,4
- addi rSTR2,rSTR2,4
- bne cr6,L(dLcr6)
- bne cr5,L(dLcr5)
+#ifdef __LITTLE_ENDIAN__
+ lwbrx rWORD1, 0, rSTR1
+ lwbrx rWORD2, 0, rSTR2
+ addi rSTR1, rSTR1, 4
+ addi rSTR2, rSTR2, 4
+#else
+ lwz rWORD1, 8(rSTR1)
+ lwz rWORD2, 8(rSTR2)
+#endif
+ cmplw cr7, rWORD1, rWORD2
+#ifdef __LITTLE_ENDIAN__
+ lwbrx rWORD3, 0, rSTR1
+ lwbrx rWORD4, 0, rSTR2
+ addi rSTR1, rSTR1, 4
+ addi rSTR2, rSTR2, 4
+#else
+ lwz rWORD3, 12(rSTR1)
+ lwz rWORD4, 12(rSTR2)
+#endif
+ cmplw cr1, rWORD3, rWORD4
+#ifndef __LITTLE_ENDIAN__
+ addi rSTR1, rSTR1, 4
+ addi rSTR2, rSTR2, 4
+#endif
+ bne cr6, L(dLcr6)
+ bne cr5, L(dLcr5)
b L(dLoop2)
/* Again we are on an early exit path (16-23 byte compare), so we want to
only use volatile registers and avoid restoring non-volatile
registers. */
.align 4
L(dP2x):
- lwz rWORD3,4(rSTR1)
- lwz rWORD4,4(rSTR2)
- cmplw cr5,rWORD3,rWORD4
- slwi. r12,rN,3
- bne cr6,L(dLcr6)
- addi rSTR1,rSTR1,4
- addi rSTR2,rSTR2,4
- bne cr5,L(dLcr5)
- subfic rN,r12,32 /* Shift count is 32 - (rN * 8). */
- lwz 1,0(1)
+#ifdef __LITTLE_ENDIAN__
+ lwbrx rWORD3, 0, rSTR1
+ lwbrx rWORD4, 0, rSTR2
+ addi rSTR1, rSTR1, 4
+ addi rSTR2, rSTR2, 4
+#else
+ lwz rWORD3, 4(rSTR1)
+ lwz rWORD4, 4(rSTR2)
+#endif
+ cmplw cr1, rWORD3, rWORD4
+ slwi. r12, rN, 3
+ bne cr6, L(dLcr6x)
+#ifndef __LITTLE_ENDIAN__
+ addi rSTR1, rSTR1, 4
+ addi rSTR2, rSTR2, 4
+#endif
+ bne cr1, L(dLcr1x)
+ subfic rN, r12, 32 /* Shift count is 32 - (rN * 8). */
+ addi r1, r1, 64
+ cfi_adjust_cfa_offset(-64)
bne L(d00)
- li rRTN,0
+ li rRTN, 0
blr
/* Remainder is 12 */
.align 4
+ cfi_adjust_cfa_offset(64)
L(dP3):
- mtctr rTMP
- lwz rWORD3,0(rSTR1)
- lwz rWORD4,0(rSTR2)
- cmplw cr1,rWORD3,rWORD4
+ mtctr r0
+#ifdef __LITTLE_ENDIAN__
+ lwbrx rWORD3, 0, rSTR1
+ lwbrx rWORD4, 0, rSTR2
+ addi rSTR1, rSTR1, 4
+ addi rSTR2, rSTR2, 4
+#else
+ lwz rWORD3, 0(rSTR1)
+ lwz rWORD4, 0(rSTR2)
+#endif
+ cmplw cr1, rWORD3, rWORD4
L(dP3e):
- lwz rWORD5,4(rSTR1)
- lwz rWORD6,4(rSTR2)
- cmplw cr6,rWORD5,rWORD6
- blt cr7,L(dP3x)
- lwz rWORD7,8(rSTR1)
- lwz rWORD8,8(rSTR2)
- cmplw cr5,rWORD7,rWORD8
- lwz rWORD1,12(rSTR1)
- lwz rWORD2,12(rSTR2)
- cmplw cr0,rWORD1,rWORD2
- addi rSTR1,rSTR1,8
- addi rSTR2,rSTR2,8
- bne cr1,L(dLcr1)
- bne cr6,L(dLcr6)
+#ifdef __LITTLE_ENDIAN__
+ lwbrx rWORD5, 0, rSTR1
+ lwbrx rWORD6, 0, rSTR2
+ addi rSTR1, rSTR1, 4
+ addi rSTR2, rSTR2, 4
+#else
+ lwz rWORD5, 4(rSTR1)
+ lwz rWORD6, 4(rSTR2)
+#endif
+ cmplw cr6, rWORD5, rWORD6
+ blt cr7, L(dP3x)
+#ifdef __LITTLE_ENDIAN__
+ lwbrx rWORD7, 0, rSTR1
+ lwbrx rWORD8, 0, rSTR2
+ addi rSTR1, rSTR1, 4
+ addi rSTR2, rSTR2, 4
+#else
+ lwz rWORD7, 8(rSTR1)
+ lwz rWORD8, 8(rSTR2)
+#endif
+ cmplw cr5, rWORD7, rWORD8
+#ifdef __LITTLE_ENDIAN__
+ lwbrx rWORD1, 0, rSTR1
+ lwbrx rWORD2, 0, rSTR2
+ addi rSTR1, rSTR1, 4
+ addi rSTR2, rSTR2, 4
+#else
+ lwz rWORD1, 12(rSTR1)
+ lwz rWORD2, 12(rSTR2)
+#endif
+ cmplw cr7, rWORD1, rWORD2
+#ifndef __LITTLE_ENDIAN__
+ addi rSTR1, rSTR1, 8
+ addi rSTR2, rSTR2, 8
+#endif
+ bne cr1, L(dLcr1)
+ bne cr6, L(dLcr6)
b L(dLoop1)
/* Again we are on an early exit path (24-31 byte compare), so we want to
only use volatile registers and avoid restoring non-volatile
registers. */
.align 4
L(dP3x):
- lwz rWORD1,8(rSTR1)
- lwz rWORD2,8(rSTR2)
- cmplw cr5,rWORD1,rWORD2
- slwi. r12,rN,3
- bne cr1,L(dLcr1)
- addi rSTR1,rSTR1,8
- addi rSTR2,rSTR2,8
- bne cr6,L(dLcr6)
- subfic rN,r12,32 /* Shift count is 32 - (rN * 8). */
- bne cr5,L(dLcr5)
- lwz 1,0(1)
+#ifdef __LITTLE_ENDIAN__
+ lwbrx rWORD1, 0, rSTR1
+ lwbrx rWORD2, 0, rSTR2
+ addi rSTR1, rSTR1, 4
+ addi rSTR2, rSTR2, 4
+#else
+ lwz rWORD1, 8(rSTR1)
+ lwz rWORD2, 8(rSTR2)
+#endif
+ cmplw cr7, rWORD1, rWORD2
+ slwi. r12, rN, 3
+ bne cr1, L(dLcr1x)
+#ifndef __LITTLE_ENDIAN__
+ addi rSTR1, rSTR1, 8
+ addi rSTR2, rSTR2, 8
+#endif
+ bne cr6, L(dLcr6x)
+ subfic rN, r12, 32 /* Shift count is 32 - (rN * 8). */
+ bne cr7, L(dLcr7x)
+ addi r1, r1, 64
+ cfi_adjust_cfa_offset(-64)
bne L(d00)
- li rRTN,0
+ li rRTN, 0
blr
/* Count is a multiple of 16, remainder is 0 */
.align 4
+ cfi_adjust_cfa_offset(64)
L(dP4):
- mtctr rTMP
- lwz rWORD1,0(rSTR1)
- lwz rWORD2,0(rSTR2)
- cmplw cr0,rWORD1,rWORD2
+ mtctr r0
+#ifdef __LITTLE_ENDIAN__
+ lwbrx rWORD1, 0, rSTR1
+ lwbrx rWORD2, 0, rSTR2
+ addi rSTR1, rSTR1, 4
+ addi rSTR2, rSTR2, 4
+#else
+ lwz rWORD1, 0(rSTR1)
+ lwz rWORD2, 0(rSTR2)
+#endif
+ cmplw cr7, rWORD1, rWORD2
L(dP4e):
- lwz rWORD3,4(rSTR1)
- lwz rWORD4,4(rSTR2)
- cmplw cr1,rWORD3,rWORD4
- lwz rWORD5,8(rSTR1)
- lwz rWORD6,8(rSTR2)
- cmplw cr6,rWORD5,rWORD6
- lwzu rWORD7,12(rSTR1)
- lwzu rWORD8,12(rSTR2)
- cmplw cr5,rWORD7,rWORD8
- bne cr0,L(dLcr0)
- bne cr1,L(dLcr1)
+#ifdef __LITTLE_ENDIAN__
+ lwbrx rWORD3, 0, rSTR1
+ lwbrx rWORD4, 0, rSTR2
+ addi rSTR1, rSTR1, 4
+ addi rSTR2, rSTR2, 4
+#else
+ lwz rWORD3, 4(rSTR1)
+ lwz rWORD4, 4(rSTR2)
+#endif
+ cmplw cr1, rWORD3, rWORD4
+#ifdef __LITTLE_ENDIAN__
+ lwbrx rWORD5, 0, rSTR1
+ lwbrx rWORD6, 0, rSTR2
+ addi rSTR1, rSTR1, 4
+ addi rSTR2, rSTR2, 4
+#else
+ lwz rWORD5, 8(rSTR1)
+ lwz rWORD6, 8(rSTR2)
+#endif
+ cmplw cr6, rWORD5, rWORD6
+#ifdef __LITTLE_ENDIAN__
+ lwbrx rWORD7, 0, rSTR1
+ lwbrx rWORD8, 0, rSTR2
+ addi rSTR1, rSTR1, 4
+ addi rSTR2, rSTR2, 4
+#else
+ lwzu rWORD7, 12(rSTR1)
+ lwzu rWORD8, 12(rSTR2)
+#endif
+ cmplw cr5, rWORD7, rWORD8
+ bne cr7, L(dLcr7)
+ bne cr1, L(dLcr1)
bdz- L(d24) /* Adjust CTR as we start with +4 */
/* This is the primary loop */
.align 4
L(dLoop):
- lwz rWORD1,4(rSTR1)
- lwz rWORD2,4(rSTR2)
- cmplw cr1,rWORD3,rWORD4
- bne cr6,L(dLcr6)
+#ifdef __LITTLE_ENDIAN__
+ lwbrx rWORD1, 0, rSTR1
+ lwbrx rWORD2, 0, rSTR2
+ addi rSTR1, rSTR1, 4
+ addi rSTR2, rSTR2, 4
+#else
+ lwz rWORD1, 4(rSTR1)
+ lwz rWORD2, 4(rSTR2)
+#endif
+ cmplw cr1, rWORD3, rWORD4
+ bne cr6, L(dLcr6)
L(dLoop1):
- lwz rWORD3,8(rSTR1)
- lwz rWORD4,8(rSTR2)
- cmplw cr6,rWORD5,rWORD6
- bne cr5,L(dLcr5)
+#ifdef __LITTLE_ENDIAN__
+ lwbrx rWORD3, 0, rSTR1
+ lwbrx rWORD4, 0, rSTR2
+ addi rSTR1, rSTR1, 4
+ addi rSTR2, rSTR2, 4
+#else
+ lwz rWORD3, 8(rSTR1)
+ lwz rWORD4, 8(rSTR2)
+#endif
+ cmplw cr6, rWORD5, rWORD6
+ bne cr5, L(dLcr5)
L(dLoop2):
- lwz rWORD5,12(rSTR1)
- lwz rWORD6,12(rSTR2)
- cmplw cr5,rWORD7,rWORD8
- bne cr0,L(dLcr0)
+#ifdef __LITTLE_ENDIAN__
+ lwbrx rWORD5, 0, rSTR1
+ lwbrx rWORD6, 0, rSTR2
+ addi rSTR1, rSTR1, 4
+ addi rSTR2, rSTR2, 4
+#else
+ lwz rWORD5, 12(rSTR1)
+ lwz rWORD6, 12(rSTR2)
+#endif
+ cmplw cr5, rWORD7, rWORD8
+ bne cr7, L(dLcr7)
L(dLoop3):
- lwzu rWORD7,16(rSTR1)
- lwzu rWORD8,16(rSTR2)
- bne cr1,L(dLcr1)
- cmplw cr0,rWORD1,rWORD2
+#ifdef __LITTLE_ENDIAN__
+ lwbrx rWORD7, 0, rSTR1
+ lwbrx rWORD8, 0, rSTR2
+ addi rSTR1, rSTR1, 4
+ addi rSTR2, rSTR2, 4
+#else
+ lwzu rWORD7, 16(rSTR1)
+ lwzu rWORD8, 16(rSTR2)
+#endif
+ bne cr1, L(dLcr1)
+ cmplw cr7, rWORD1, rWORD2
bdnz L(dLoop)
L(dL4):
- cmplw cr1,rWORD3,rWORD4
- bne cr6,L(dLcr6)
- cmplw cr6,rWORD5,rWORD6
- bne cr5,L(dLcr5)
- cmplw cr5,rWORD7,rWORD8
+ cmplw cr1, rWORD3, rWORD4
+ bne cr6, L(dLcr6)
+ cmplw cr6, rWORD5, rWORD6
+ bne cr5, L(dLcr5)
+ cmplw cr5, rWORD7, rWORD8
L(d44):
- bne cr0,L(dLcr0)
+ bne cr7, L(dLcr7)
L(d34):
- bne cr1,L(dLcr1)
+ bne cr1, L(dLcr1)
L(d24):
- bne cr6,L(dLcr6)
+ bne cr6, L(dLcr6)
L(d14):
- slwi. r12,rN,3
- bne cr5,L(dLcr5)
+ slwi. r12, rN, 3
+ bne cr5, L(dLcr5)
L(d04):
- lwz r30,44(1)
- lwz r31,48(1)
- lwz 1,0(1)
- subfic rN,r12,32 /* Shift count is 32 - (rN * 8). */
+ lwz rWORD7, 44(r1)
+ lwz rWORD8, 48(r1)
+ addi r1, r1, 64
+ cfi_adjust_cfa_offset(-64)
+ subfic rN, r12, 32 /* Shift count is 32 - (rN * 8). */
beq L(zeroLength)
/* At this point we have a remainder of 1 to 3 bytes to compare. Since
we are aligned it is safe to load the whole word, and use
- shift right to eliminate bits beyond the compare length. */
+ shift right to eliminate bits beyond the compare length. */
L(d00):
- lwz rWORD1,4(rSTR1)
- lwz rWORD2,4(rSTR2)
- srw rWORD1,rWORD1,rN
- srw rWORD2,rWORD2,rN
- cmplw rWORD1,rWORD2
- li rRTN,0
- beqlr
- li rRTN,1
- bgtlr
- li rRTN,-1
+#ifdef __LITTLE_ENDIAN__
+ lwbrx rWORD1, 0, rSTR1
+ lwbrx rWORD2, 0, rSTR2
+ addi rSTR1, rSTR1, 4
+ addi rSTR2, rSTR2, 4
+#else
+ lwz rWORD1, 4(rSTR1)
+ lwz rWORD2, 4(rSTR2)
+#endif
+ srw rWORD1, rWORD1, rN
+ srw rWORD2, rWORD2, rN
+ sub rRTN, rWORD1, rWORD2
blr
.align 4
-L(dLcr0):
- lwz r30,44(1)
- lwz r31,48(1)
- li rRTN,1
- lwz 1,0(1)
- bgtlr cr0
- li rRTN,-1
+ cfi_adjust_cfa_offset(64)
+L(dLcr7):
+ lwz rWORD7, 44(r1)
+ lwz rWORD8, 48(r1)
+L(dLcr7x):
+ li rRTN, 1
+ addi r1, r1, 64
+ cfi_adjust_cfa_offset(-64)
+ bgtlr cr7
+ li rRTN, -1
blr
.align 4
+ cfi_adjust_cfa_offset(64)
L(dLcr1):
- lwz r30,44(1)
- lwz r31,48(1)
- li rRTN,1
- lwz 1,0(1)
+ lwz rWORD7, 44(r1)
+ lwz rWORD8, 48(r1)
+L(dLcr1x):
+ li rRTN, 1
+ addi r1, r1, 64
+ cfi_adjust_cfa_offset(-64)
bgtlr cr1
- li rRTN,-1
+ li rRTN, -1
blr
.align 4
+ cfi_adjust_cfa_offset(64)
L(dLcr6):
- lwz r30,44(1)
- lwz r31,48(1)
- li rRTN,1
- lwz 1,0(1)
+ lwz rWORD7, 44(r1)
+ lwz rWORD8, 48(r1)
+L(dLcr6x):
+ li rRTN, 1
+ addi r1, r1, 64
+ cfi_adjust_cfa_offset(-64)
bgtlr cr6
- li rRTN,-1
+ li rRTN, -1
blr
.align 4
+ cfi_adjust_cfa_offset(64)
L(dLcr5):
- lwz r30,44(1)
- lwz r31,48(1)
+ lwz rWORD7, 44(r1)
+ lwz rWORD8, 48(r1)
L(dLcr5x):
- li rRTN,1
- lwz 1,0(1)
+ li rRTN, 1
+ addi r1, r1, 64
+ cfi_adjust_cfa_offset(-64)
bgtlr cr5
- li rRTN,-1
+ li rRTN, -1
blr
.align 4
L(bytealigned):
- cfi_adjust_cfa_offset(-64)
mtctr rN
/* We need to prime this loop. This loop is swing modulo scheduled
@@ -411,38 +616,39 @@ L(bytealigned):
So we must precondition some registers and condition codes so that
we don't exit the loop early on the first iteration. */
- lbz rWORD1,0(rSTR1)
- lbz rWORD2,0(rSTR2)
+
+ lbz rWORD1, 0(rSTR1)
+ lbz rWORD2, 0(rSTR2)
bdz L(b11)
- cmplw cr0,rWORD1,rWORD2
- lbz rWORD3,1(rSTR1)
- lbz rWORD4,1(rSTR2)
+ cmplw cr7, rWORD1, rWORD2
+ lbz rWORD3, 1(rSTR1)
+ lbz rWORD4, 1(rSTR2)
bdz L(b12)
- cmplw cr1,rWORD3,rWORD4
- lbzu rWORD5,2(rSTR1)
- lbzu rWORD6,2(rSTR2)
+ cmplw cr1, rWORD3, rWORD4
+ lbzu rWORD5, 2(rSTR1)
+ lbzu rWORD6, 2(rSTR2)
bdz L(b13)
.align 4
L(bLoop):
- lbzu rWORD1,1(rSTR1)
- lbzu rWORD2,1(rSTR2)
- bne cr0,L(bLcr0)
+ lbzu rWORD1, 1(rSTR1)
+ lbzu rWORD2, 1(rSTR2)
+ bne cr7, L(bLcr7)
- cmplw cr6,rWORD5,rWORD6
+ cmplw cr6, rWORD5, rWORD6
bdz L(b3i)
- lbzu rWORD3,1(rSTR1)
- lbzu rWORD4,1(rSTR2)
- bne cr1,L(bLcr1)
+ lbzu rWORD3, 1(rSTR1)
+ lbzu rWORD4, 1(rSTR2)
+ bne cr1, L(bLcr1)
- cmplw cr0,rWORD1,rWORD2
+ cmplw cr7, rWORD1, rWORD2
bdz L(b2i)
- lbzu rWORD5,1(rSTR1)
- lbzu rWORD6,1(rSTR2)
- bne cr6,L(bLcr6)
+ lbzu rWORD5, 1(rSTR1)
+ lbzu rWORD6, 1(rSTR2)
+ bne cr6, L(bLcr6)
- cmplw cr1,rWORD3,rWORD4
+ cmplw cr1, rWORD3, rWORD4
bdnz L(bLoop)
/* We speculatively load bytes before we have tested the previous
@@ -452,67 +658,62 @@ L(bLoop):
tested. In this case we must complete the pending operations
before returning. */
L(b1i):
- bne cr0,L(bLcr0)
- bne cr1,L(bLcr1)
+ bne cr7, L(bLcr7)
+ bne cr1, L(bLcr1)
b L(bx56)
.align 4
L(b2i):
- bne cr6,L(bLcr6)
- bne cr0,L(bLcr0)
+ bne cr6, L(bLcr6)
+ bne cr7, L(bLcr7)
b L(bx34)
.align 4
L(b3i):
- bne cr1,L(bLcr1)
- bne cr6,L(bLcr6)
+ bne cr1, L(bLcr1)
+ bne cr6, L(bLcr6)
b L(bx12)
.align 4
-L(bLcr0):
- li rRTN,1
- bgtlr cr0
- li rRTN,-1
+L(bLcr7):
+ li rRTN, 1
+ bgtlr cr7
+ li rRTN, -1
blr
L(bLcr1):
- li rRTN,1
+ li rRTN, 1
bgtlr cr1
- li rRTN,-1
+ li rRTN, -1
blr
L(bLcr6):
- li rRTN,1
+ li rRTN, 1
bgtlr cr6
- li rRTN,-1
+ li rRTN, -1
blr
L(b13):
- bne cr0,L(bx12)
- bne cr1,L(bx34)
+ bne cr7, L(bx12)
+ bne cr1, L(bx34)
L(bx56):
- sub rRTN,rWORD5,rWORD6
+ sub rRTN, rWORD5, rWORD6
blr
nop
L(b12):
- bne cr0,L(bx12)
+ bne cr7, L(bx12)
L(bx34):
- sub rRTN,rWORD3,rWORD4
+ sub rRTN, rWORD3, rWORD4
blr
-
L(b11):
L(bx12):
- sub rRTN,rWORD1,rWORD2
+ sub rRTN, rWORD1, rWORD2
blr
-
.align 4
-L(zeroLengthReturn):
-
L(zeroLength):
- li rRTN,0
+ li rRTN, 0
blr
- cfi_adjust_cfa_offset(64)
.align 4
/* At this point we know the strings have different alignment and the
- compare length is at least 8 bytes. rBITDIF contains the low order
+ compare length is at least 8 bytes. r12 contains the low order
2 bits of rSTR1 and cr5 contains the result of the logical compare
- of rBITDIF to 0. If rBITDIF == 0 then rStr1 is word aligned and can
+ of r12 to 0. If r12 == 0 then rStr1 is word aligned and can
perform the Wunaligned loop.
   Otherwise we know that rSTR1 is not yet word aligned.
@@ -521,465 +722,654 @@ L(zeroLength):
eliminate bits preceding the first byte. Since we want to join the
   normal (Wunaligned) compare loop, starting at the second word,
we need to adjust the length (rN) and special case the loop
- versioning for the first W. This insures that the loop count is
+ versioning for the first W. This ensures that the loop count is
   correct and the first W (shifted) is in the expected register pair.  */
#define rSHL r29 /* Unaligned shift left count. */
#define rSHR r28 /* Unaligned shift right count. */
-#define rB r27 /* Left rotation temp for rWORD2. */
-#define rD r26 /* Left rotation temp for rWORD4. */
-#define rF r25 /* Left rotation temp for rWORD6. */
-#define rH r24 /* Left rotation temp for rWORD8. */
-#define rA r0 /* Right rotation temp for rWORD2. */
-#define rC r12 /* Right rotation temp for rWORD4. */
-#define rE r0 /* Right rotation temp for rWORD6. */
-#define rG r12 /* Right rotation temp for rWORD8. */
+#define rWORD8_SHIFT r27 /* Left rotation temp for rWORD2. */
+#define rWORD2_SHIFT r26 /* Left rotation temp for rWORD4. */
+#define rWORD4_SHIFT r25 /* Left rotation temp for rWORD6. */
+#define rWORD6_SHIFT r24 /* Left rotation temp for rWORD8. */
+ cfi_adjust_cfa_offset(64)
L(unaligned):
- stw r29,40(r1)
- cfi_offset(r29,(40-64))
- clrlwi rSHL,rSTR2,30
- stw r28,36(r1)
- cfi_offset(r28,(36-64))
- beq cr5,L(Wunaligned)
- stw r27,32(r1)
- cfi_offset(r27,(32-64))
+ stw rSHL, 40(r1)
+ cfi_offset(rSHL, (40-64))
+ clrlwi rSHL, rSTR2, 30
+ stw rSHR, 36(r1)
+ cfi_offset(rSHR, (36-64))
+ beq cr5, L(Wunaligned)
+ stw rWORD8_SHIFT, 32(r1)
+ cfi_offset(rWORD8_SHIFT, (32-64))
/* Adjust the logical start of rSTR2 to compensate for the extra bits
in the 1st rSTR1 W. */
- sub r27,rSTR2,rBITDIF
+ sub rWORD8_SHIFT, rSTR2, r12
/* But do not attempt to address the W before that W that contains
the actual start of rSTR2. */
- clrrwi rSTR2,rSTR2,2
- stw r26,28(r1)
- cfi_offset(r26,(28-64))
-/* Compute the left/right shift counts for the unalign rSTR2,
+ clrrwi rSTR2, rSTR2, 2
+ stw rWORD2_SHIFT, 28(r1)
+ cfi_offset(rWORD2_SHIFT, (28-64))
+/* Compute the left/right shift counts for the unaligned rSTR2,
compensating for the logical (W aligned) start of rSTR1. */
- clrlwi rSHL,r27,30
- clrrwi rSTR1,rSTR1,2
- stw r25,24(r1)
- cfi_offset(r25,(24-64))
- slwi rSHL,rSHL,3
- cmplw cr5,r27,rSTR2
- add rN,rN,rBITDIF
- slwi r11,rBITDIF,3
- stw r24,20(r1)
- cfi_offset(r24,(20-64))
- subfic rSHR,rSHL,32
- srwi rTMP,rN,4 /* Divide by 16 */
- andi. rBITDIF,rN,12 /* Get the W remainder */
+ clrlwi rSHL, rWORD8_SHIFT, 30
+ clrrwi rSTR1, rSTR1, 2
+ stw rWORD4_SHIFT, 24(r1)
+ cfi_offset(rWORD4_SHIFT, (24-64))
+ slwi rSHL, rSHL, 3
+ cmplw cr5, rWORD8_SHIFT, rSTR2
+ add rN, rN, r12
+ slwi rWORD6, r12, 3
+ stw rWORD6_SHIFT, 20(r1)
+ cfi_offset(rWORD6_SHIFT, (20-64))
+ subfic rSHR, rSHL, 32
+ srwi r0, rN, 4 /* Divide by 16 */
+ andi. r12, rN, 12 /* Get the W remainder */
/* We normally need to load 2 Ws to start the unaligned rSTR2, but in
this special case those bits may be discarded anyway. Also we
must avoid loading a W where none of the bits are part of rSTR2 as
this may cross a page boundary and cause a page fault. */
- li rWORD8,0
- blt cr5,L(dus0)
- lwz rWORD8,0(rSTR2)
- la rSTR2,4(rSTR2)
- slw rWORD8,rWORD8,rSHL
+ li rWORD8, 0
+ blt cr5, L(dus0)
+#ifdef __LITTLE_ENDIAN__
+ lwbrx rWORD8, 0, rSTR2
+ addi rSTR2, rSTR2, 4
+#else
+ lwz rWORD8, 0(rSTR2)
+ addi rSTR2, rSTR2, 4
+#endif
+ slw rWORD8, rWORD8, rSHL
L(dus0):
- lwz rWORD1,0(rSTR1)
- lwz rWORD2,0(rSTR2)
- cmplwi cr1,rBITDIF,8
- cmplwi cr7,rN,16
- srw rG,rWORD2,rSHR
- clrlwi rN,rN,30
+#ifdef __LITTLE_ENDIAN__
+ lwbrx rWORD1, 0, rSTR1
+ lwbrx rWORD2, 0, rSTR2
+ addi rSTR1, rSTR1, 4
+ addi rSTR2, rSTR2, 4
+#else
+ lwz rWORD1, 0(rSTR1)
+ lwz rWORD2, 0(rSTR2)
+#endif
+ cmplwi cr1, r12, 8
+ cmplwi cr7, rN, 16
+ srw r12, rWORD2, rSHR
+ clrlwi rN, rN, 30
beq L(duPs4)
- mtctr rTMP
- or rWORD8,rG,rWORD8
- bgt cr1,L(duPs3)
- beq cr1,L(duPs2)
+ mtctr r0
+ or rWORD8, r12, rWORD8
+ bgt cr1, L(duPs3)
+ beq cr1, L(duPs2)
/* Remainder is 4 */
.align 4
L(dusP1):
- slw rB,rWORD2,rSHL
- slw rWORD7,rWORD1,r11
- slw rWORD8,rWORD8,r11
- bge cr7,L(duP1e)
+ slw rWORD8_SHIFT, rWORD2, rSHL
+ slw rWORD7, rWORD1, rWORD6
+ slw rWORD8, rWORD8, rWORD6
+ bge cr7, L(duP1e)
/* At this point we exit early with the first word compare
complete and remainder of 0 to 3 bytes. See L(du14) for details on
how we handle the remaining bytes. */
- cmplw cr5,rWORD7,rWORD8
- slwi. rN,rN,3
- bne cr5,L(duLcr5)
- cmplw cr7,rN,rSHR
+ cmplw cr5, rWORD7, rWORD8
+ slwi. rN, rN, 3
+ bne cr5, L(duLcr5)
+ cmplw cr7, rN, rSHR
beq L(duZeroReturn)
- li rA,0
- ble cr7,L(dutrim)
- lwz rWORD2,4(rSTR2)
- srw rA,rWORD2,rSHR
+ li r0, 0
+ ble cr7, L(dutrim)
+#ifdef __LITTLE_ENDIAN__
+ lwbrx rWORD2, 0, rSTR2
+ addi rSTR2, rSTR2, 4
+#else
+ lwz rWORD2, 4(rSTR2)
+#endif
+ srw r0, rWORD2, rSHR
b L(dutrim)
/* Remainder is 8 */
.align 4
L(duPs2):
- slw rH,rWORD2,rSHL
- slw rWORD5,rWORD1,r11
- slw rWORD6,rWORD8,r11
+ slw rWORD6_SHIFT, rWORD2, rSHL
+ slw rWORD5, rWORD1, rWORD6
+ slw rWORD6, rWORD8, rWORD6
b L(duP2e)
/* Remainder is 12 */
.align 4
L(duPs3):
- slw rF,rWORD2,rSHL
- slw rWORD3,rWORD1,r11
- slw rWORD4,rWORD8,r11
+ slw rWORD4_SHIFT, rWORD2, rSHL
+ slw rWORD3, rWORD1, rWORD6
+ slw rWORD4, rWORD8, rWORD6
b L(duP3e)
/* Count is a multiple of 16, remainder is 0 */
.align 4
L(duPs4):
- mtctr rTMP
- or rWORD8,rG,rWORD8
- slw rD,rWORD2,rSHL
- slw rWORD1,rWORD1,r11
- slw rWORD2,rWORD8,r11
+ mtctr r0
+ or rWORD8, r12, rWORD8
+ slw rWORD2_SHIFT, rWORD2, rSHL
+ slw rWORD1, rWORD1, rWORD6
+ slw rWORD2, rWORD8, rWORD6
b L(duP4e)
/* At this point we know rSTR1 is word aligned and the
compare length is at least 8 bytes. */
.align 4
L(Wunaligned):
- stw r27,32(r1)
- cfi_offset(r27,(32-64))
- clrrwi rSTR2,rSTR2,2
- stw r26,28(r1)
- cfi_offset(r26,(28-64))
- srwi rTMP,rN,4 /* Divide by 16 */
- stw r25,24(r1)
- cfi_offset(r25,(24-64))
- andi. rBITDIF,rN,12 /* Get the W remainder */
- stw r24,20(r1)
- cfi_offset(r24,(24-64))
- slwi rSHL,rSHL,3
- lwz rWORD6,0(rSTR2)
- lwzu rWORD8,4(rSTR2)
- cmplwi cr1,rBITDIF,8
- cmplwi cr7,rN,16
- clrlwi rN,rN,30
- subfic rSHR,rSHL,32
- slw rH,rWORD6,rSHL
+ stw rWORD8_SHIFT, 32(r1)
+ cfi_offset(rWORD8_SHIFT, (32-64))
+ clrrwi rSTR2, rSTR2, 2
+ stw rWORD2_SHIFT, 28(r1)
+ cfi_offset(rWORD2_SHIFT, (28-64))
+ srwi r0, rN, 4 /* Divide by 16 */
+ stw rWORD4_SHIFT, 24(r1)
+ cfi_offset(rWORD4_SHIFT, (24-64))
+ andi. r12, rN, 12 /* Get the W remainder */
+ stw rWORD6_SHIFT, 20(r1)
+ cfi_offset(rWORD6_SHIFT, (20-64))
+ slwi rSHL, rSHL, 3
+#ifdef __LITTLE_ENDIAN__
+ lwbrx rWORD6, 0, rSTR2
+ addi rSTR2, rSTR2, 4
+ lwbrx rWORD8, 0, rSTR2
+ addi rSTR2, rSTR2, 4
+#else
+ lwz rWORD6, 0(rSTR2)
+ lwzu rWORD8, 4(rSTR2)
+#endif
+ cmplwi cr1, r12, 8
+ cmplwi cr7, rN, 16
+ clrlwi rN, rN, 30
+ subfic rSHR, rSHL, 32
+ slw rWORD6_SHIFT, rWORD6, rSHL
beq L(duP4)
- mtctr rTMP
- bgt cr1,L(duP3)
- beq cr1,L(duP2)
+ mtctr r0
+ bgt cr1, L(duP3)
+ beq cr1, L(duP2)
/* Remainder is 4 */
.align 4
L(duP1):
- srw rG,rWORD8,rSHR
- lwz rWORD7,0(rSTR1)
- slw rB,rWORD8,rSHL
- or rWORD8,rG,rH
- blt cr7,L(duP1x)
+ srw r12, rWORD8, rSHR
+#ifdef __LITTLE_ENDIAN__
+ lwbrx rWORD7, 0, rSTR1
+ addi rSTR1, rSTR1, 4
+#else
+ lwz rWORD7, 0(rSTR1)
+#endif
+ slw rWORD8_SHIFT, rWORD8, rSHL
+ or rWORD8, r12, rWORD6_SHIFT
+ blt cr7, L(duP1x)
L(duP1e):
- lwz rWORD1,4(rSTR1)
- lwz rWORD2,4(rSTR2)
- cmplw cr5,rWORD7,rWORD8
- srw rA,rWORD2,rSHR
- slw rD,rWORD2,rSHL
- or rWORD2,rA,rB
- lwz rWORD3,8(rSTR1)
- lwz rWORD4,8(rSTR2)
- cmplw cr0,rWORD1,rWORD2
- srw rC,rWORD4,rSHR
- slw rF,rWORD4,rSHL
- bne cr5,L(duLcr5)
- or rWORD4,rC,rD
- lwz rWORD5,12(rSTR1)
- lwz rWORD6,12(rSTR2)
- cmplw cr1,rWORD3,rWORD4
- srw rE,rWORD6,rSHR
- slw rH,rWORD6,rSHL
- bne cr0,L(duLcr0)
- or rWORD6,rE,rF
- cmplw cr6,rWORD5,rWORD6
+#ifdef __LITTLE_ENDIAN__
+ lwbrx rWORD1, 0, rSTR1
+ lwbrx rWORD2, 0, rSTR2
+ addi rSTR1, rSTR1, 4
+ addi rSTR2, rSTR2, 4
+#else
+ lwz rWORD1, 4(rSTR1)
+ lwz rWORD2, 4(rSTR2)
+#endif
+ cmplw cr5, rWORD7, rWORD8
+ srw r0, rWORD2, rSHR
+ slw rWORD2_SHIFT, rWORD2, rSHL
+ or rWORD2, r0, rWORD8_SHIFT
+#ifdef __LITTLE_ENDIAN__
+ lwbrx rWORD3, 0, rSTR1
+ lwbrx rWORD4, 0, rSTR2
+ addi rSTR1, rSTR1, 4
+ addi rSTR2, rSTR2, 4
+#else
+ lwz rWORD3, 8(rSTR1)
+ lwz rWORD4, 8(rSTR2)
+#endif
+ cmplw cr7, rWORD1, rWORD2
+ srw r12, rWORD4, rSHR
+ slw rWORD4_SHIFT, rWORD4, rSHL
+ bne cr5, L(duLcr5)
+ or rWORD4, r12, rWORD2_SHIFT
+#ifdef __LITTLE_ENDIAN__
+ lwbrx rWORD5, 0, rSTR1
+ lwbrx rWORD6, 0, rSTR2
+ addi rSTR1, rSTR1, 4
+ addi rSTR2, rSTR2, 4
+#else
+ lwz rWORD5, 12(rSTR1)
+ lwz rWORD6, 12(rSTR2)
+#endif
+ cmplw cr1, rWORD3, rWORD4
+ srw r0, rWORD6, rSHR
+ slw rWORD6_SHIFT, rWORD6, rSHL
+ bne cr7, L(duLcr7)
+ or rWORD6, r0, rWORD4_SHIFT
+ cmplw cr6, rWORD5, rWORD6
b L(duLoop3)
.align 4
/* At this point we exit early with the first word compare
complete and remainder of 0 to 3 bytes. See L(du14) for details on
how we handle the remaining bytes. */
L(duP1x):
- cmplw cr5,rWORD7,rWORD8
- slwi. rN,rN,3
- bne cr5,L(duLcr5)
- cmplw cr7,rN,rSHR
+ cmplw cr5, rWORD7, rWORD8
+ slwi. rN, rN, 3
+ bne cr5, L(duLcr5)
+ cmplw cr7, rN, rSHR
beq L(duZeroReturn)
- li rA,0
- ble cr7,L(dutrim)
- ld rWORD2,8(rSTR2)
- srw rA,rWORD2,rSHR
+ li r0, 0
+ ble cr7, L(dutrim)
+#ifdef __LITTLE_ENDIAN__
+ lwbrx rWORD2, 0, rSTR2
+ addi rSTR2, rSTR2, 4
+#else
+ lwz rWORD2, 8(rSTR2)
+#endif
+ srw r0, rWORD2, rSHR
b L(dutrim)
/* Remainder is 8 */
.align 4
L(duP2):
- srw rE,rWORD8,rSHR
- lwz rWORD5,0(rSTR1)
- or rWORD6,rE,rH
- slw rH,rWORD8,rSHL
+ srw r0, rWORD8, rSHR
+#ifdef __LITTLE_ENDIAN__
+ lwbrx rWORD5, 0, rSTR1
+ addi rSTR1, rSTR1, 4
+#else
+ lwz rWORD5, 0(rSTR1)
+#endif
+ or rWORD6, r0, rWORD6_SHIFT
+ slw rWORD6_SHIFT, rWORD8, rSHL
L(duP2e):
- lwz rWORD7,4(rSTR1)
- lwz rWORD8,4(rSTR2)
- cmplw cr6,rWORD5,rWORD6
- srw rG,rWORD8,rSHR
- slw rB,rWORD8,rSHL
- or rWORD8,rG,rH
- blt cr7,L(duP2x)
- lwz rWORD1,8(rSTR1)
- lwz rWORD2,8(rSTR2)
- cmplw cr5,rWORD7,rWORD8
- bne cr6,L(duLcr6)
- srw rA,rWORD2,rSHR
- slw rD,rWORD2,rSHL
- or rWORD2,rA,rB
- lwz rWORD3,12(rSTR1)
- lwz rWORD4,12(rSTR2)
- cmplw cr0,rWORD1,rWORD2
- bne cr5,L(duLcr5)
- srw rC,rWORD4,rSHR
- slw rF,rWORD4,rSHL
- or rWORD4,rC,rD
- addi rSTR1,rSTR1,4
- addi rSTR2,rSTR2,4
- cmplw cr1,rWORD3,rWORD4
+#ifdef __LITTLE_ENDIAN__
+ lwbrx rWORD7, 0, rSTR1
+ lwbrx rWORD8, 0, rSTR2
+ addi rSTR1, rSTR1, 4
+ addi rSTR2, rSTR2, 4
+#else
+ lwz rWORD7, 4(rSTR1)
+ lwz rWORD8, 4(rSTR2)
+#endif
+ cmplw cr6, rWORD5, rWORD6
+ srw r12, rWORD8, rSHR
+ slw rWORD8_SHIFT, rWORD8, rSHL
+ or rWORD8, r12, rWORD6_SHIFT
+ blt cr7, L(duP2x)
+#ifdef __LITTLE_ENDIAN__
+ lwbrx rWORD1, 0, rSTR1
+ lwbrx rWORD2, 0, rSTR2
+ addi rSTR1, rSTR1, 4
+ addi rSTR2, rSTR2, 4
+#else
+ lwz rWORD1, 8(rSTR1)
+ lwz rWORD2, 8(rSTR2)
+#endif
+ cmplw cr5, rWORD7, rWORD8
+ bne cr6, L(duLcr6)
+ srw r0, rWORD2, rSHR
+ slw rWORD2_SHIFT, rWORD2, rSHL
+ or rWORD2, r0, rWORD8_SHIFT
+#ifdef __LITTLE_ENDIAN__
+ lwbrx rWORD3, 0, rSTR1
+ lwbrx rWORD4, 0, rSTR2
+ addi rSTR1, rSTR1, 4
+ addi rSTR2, rSTR2, 4
+#else
+ lwz rWORD3, 12(rSTR1)
+ lwz rWORD4, 12(rSTR2)
+#endif
+ cmplw cr7, rWORD1, rWORD2
+ bne cr5, L(duLcr5)
+ srw r12, rWORD4, rSHR
+ slw rWORD4_SHIFT, rWORD4, rSHL
+ or rWORD4, r12, rWORD2_SHIFT
+#ifndef __LITTLE_ENDIAN__
+ addi rSTR1, rSTR1, 4
+ addi rSTR2, rSTR2, 4
+#endif
+ cmplw cr1, rWORD3, rWORD4
b L(duLoop2)
.align 4
L(duP2x):
- cmplw cr5,rWORD7,rWORD8
- addi rSTR1,rSTR1,4
- addi rSTR2,rSTR2,4
- bne cr6,L(duLcr6)
- slwi. rN,rN,3
- bne cr5,L(duLcr5)
- cmplw cr7,rN,rSHR
+ cmplw cr5, rWORD7, rWORD8
+#ifndef __LITTLE_ENDIAN__
+ addi rSTR1, rSTR1, 4
+ addi rSTR2, rSTR2, 4
+#endif
+ bne cr6, L(duLcr6)
+ slwi. rN, rN, 3
+ bne cr5, L(duLcr5)
+ cmplw cr7, rN, rSHR
beq L(duZeroReturn)
- li rA,0
- ble cr7,L(dutrim)
- lwz rWORD2,4(rSTR2)
- srw rA,rWORD2,rSHR
+ li r0, 0
+ ble cr7, L(dutrim)
+#ifdef __LITTLE_ENDIAN__
+ lwbrx rWORD2, 0, rSTR2
+ addi rSTR2, rSTR2, 4
+#else
+ lwz rWORD2, 4(rSTR2)
+#endif
+ srw r0, rWORD2, rSHR
b L(dutrim)
/* Remainder is 12 */
.align 4
L(duP3):
- srw rC,rWORD8,rSHR
- lwz rWORD3,0(rSTR1)
- slw rF,rWORD8,rSHL
- or rWORD4,rC,rH
+ srw r12, rWORD8, rSHR
+#ifdef __LITTLE_ENDIAN__
+ lwbrx rWORD3, 0, rSTR1
+ addi rSTR1, rSTR1, 4
+#else
+ lwz rWORD3, 0(rSTR1)
+#endif
+ slw rWORD4_SHIFT, rWORD8, rSHL
+ or rWORD4, r12, rWORD6_SHIFT
L(duP3e):
- lwz rWORD5,4(rSTR1)
- lwz rWORD6,4(rSTR2)
- cmplw cr1,rWORD3,rWORD4
- srw rE,rWORD6,rSHR
- slw rH,rWORD6,rSHL
- or rWORD6,rE,rF
- lwz rWORD7,8(rSTR1)
- lwz rWORD8,8(rSTR2)
- cmplw cr6,rWORD5,rWORD6
- bne cr1,L(duLcr1)
- srw rG,rWORD8,rSHR
- slw rB,rWORD8,rSHL
- or rWORD8,rG,rH
- blt cr7,L(duP3x)
- lwz rWORD1,12(rSTR1)
- lwz rWORD2,12(rSTR2)
- cmplw cr5,rWORD7,rWORD8
- bne cr6,L(duLcr6)
- srw rA,rWORD2,rSHR
- slw rD,rWORD2,rSHL
- or rWORD2,rA,rB
- addi rSTR1,rSTR1,8
- addi rSTR2,rSTR2,8
- cmplw cr0,rWORD1,rWORD2
+#ifdef __LITTLE_ENDIAN__
+ lwbrx rWORD5, 0, rSTR1
+ lwbrx rWORD6, 0, rSTR2
+ addi rSTR1, rSTR1, 4
+ addi rSTR2, rSTR2, 4
+#else
+ lwz rWORD5, 4(rSTR1)
+ lwz rWORD6, 4(rSTR2)
+#endif
+ cmplw cr1, rWORD3, rWORD4
+ srw r0, rWORD6, rSHR
+ slw rWORD6_SHIFT, rWORD6, rSHL
+ or rWORD6, r0, rWORD4_SHIFT
+#ifdef __LITTLE_ENDIAN__
+ lwbrx rWORD7, 0, rSTR1
+ lwbrx rWORD8, 0, rSTR2
+ addi rSTR1, rSTR1, 4
+ addi rSTR2, rSTR2, 4
+#else
+ lwz rWORD7, 8(rSTR1)
+ lwz rWORD8, 8(rSTR2)
+#endif
+ cmplw cr6, rWORD5, rWORD6
+ bne cr1, L(duLcr1)
+ srw r12, rWORD8, rSHR
+ slw rWORD8_SHIFT, rWORD8, rSHL
+ or rWORD8, r12, rWORD6_SHIFT
+ blt cr7, L(duP3x)
+#ifdef __LITTLE_ENDIAN__
+ lwbrx rWORD1, 0, rSTR1
+ lwbrx rWORD2, 0, rSTR2
+ addi rSTR1, rSTR1, 4
+ addi rSTR2, rSTR2, 4
+#else
+ lwz rWORD1, 12(rSTR1)
+ lwz rWORD2, 12(rSTR2)
+#endif
+ cmplw cr5, rWORD7, rWORD8
+ bne cr6, L(duLcr6)
+ srw r0, rWORD2, rSHR
+ slw rWORD2_SHIFT, rWORD2, rSHL
+ or rWORD2, r0, rWORD8_SHIFT
+#ifndef __LITTLE_ENDIAN__
+ addi rSTR1, rSTR1, 8
+ addi rSTR2, rSTR2, 8
+#endif
+ cmplw cr7, rWORD1, rWORD2
b L(duLoop1)
.align 4
L(duP3x):
- addi rSTR1,rSTR1,8
- addi rSTR2,rSTR2,8
- bne cr1,L(duLcr1)
- cmplw cr5,rWORD7,rWORD8
- bne cr6,L(duLcr6)
- slwi. rN,rN,3
- bne cr5,L(duLcr5)
- cmplw cr7,rN,rSHR
+#ifndef __LITTLE_ENDIAN__
+ addi rSTR1, rSTR1, 8
+ addi rSTR2, rSTR2, 8
+#endif
+#if 0
+/* Huh? We've already branched on cr1! */
+ bne cr1, L(duLcr1)
+#endif
+ cmplw cr5, rWORD7, rWORD8
+ bne cr6, L(duLcr6)
+ slwi. rN, rN, 3
+ bne cr5, L(duLcr5)
+ cmplw cr7, rN, rSHR
beq L(duZeroReturn)
- li rA,0
- ble cr7,L(dutrim)
- lwz rWORD2,4(rSTR2)
- srw rA,rWORD2,rSHR
+ li r0, 0
+ ble cr7, L(dutrim)
+#ifdef __LITTLE_ENDIAN__
+ lwbrx rWORD2, 0, rSTR2
+ addi rSTR2, rSTR2, 4
+#else
+ lwz rWORD2, 4(rSTR2)
+#endif
+ srw r0, rWORD2, rSHR
b L(dutrim)
/* Count is a multiple of 16, remainder is 0 */
.align 4
L(duP4):
- mtctr rTMP
- srw rA,rWORD8,rSHR
- lwz rWORD1,0(rSTR1)
- slw rD,rWORD8,rSHL
- or rWORD2,rA,rH
+ mtctr r0
+ srw r0, rWORD8, rSHR
+#ifdef __LITTLE_ENDIAN__
+ lwbrx rWORD1, 0, rSTR1
+ addi rSTR1, rSTR1, 4
+#else
+ lwz rWORD1, 0(rSTR1)
+#endif
+ slw rWORD2_SHIFT, rWORD8, rSHL
+ or rWORD2, r0, rWORD6_SHIFT
L(duP4e):
- lwz rWORD3,4(rSTR1)
- lwz rWORD4,4(rSTR2)
- cmplw cr0,rWORD1,rWORD2
- srw rC,rWORD4,rSHR
- slw rF,rWORD4,rSHL
- or rWORD4,rC,rD
- lwz rWORD5,8(rSTR1)
- lwz rWORD6,8(rSTR2)
- cmplw cr1,rWORD3,rWORD4
- bne cr0,L(duLcr0)
- srw rE,rWORD6,rSHR
- slw rH,rWORD6,rSHL
- or rWORD6,rE,rF
- lwzu rWORD7,12(rSTR1)
- lwzu rWORD8,12(rSTR2)
- cmplw cr6,rWORD5,rWORD6
- bne cr1,L(duLcr1)
- srw rG,rWORD8,rSHR
- slw rB,rWORD8,rSHL
- or rWORD8,rG,rH
- cmplw cr5,rWORD7,rWORD8
+#ifdef __LITTLE_ENDIAN__
+ lwbrx rWORD3, 0, rSTR1
+ lwbrx rWORD4, 0, rSTR2
+ addi rSTR1, rSTR1, 4
+ addi rSTR2, rSTR2, 4
+#else
+ lwz rWORD3, 4(rSTR1)
+ lwz rWORD4, 4(rSTR2)
+#endif
+ cmplw cr7, rWORD1, rWORD2
+ srw r12, rWORD4, rSHR
+ slw rWORD4_SHIFT, rWORD4, rSHL
+ or rWORD4, r12, rWORD2_SHIFT
+#ifdef __LITTLE_ENDIAN__
+ lwbrx rWORD5, 0, rSTR1
+ lwbrx rWORD6, 0, rSTR2
+ addi rSTR1, rSTR1, 4
+ addi rSTR2, rSTR2, 4
+#else
+ lwz rWORD5, 8(rSTR1)
+ lwz rWORD6, 8(rSTR2)
+#endif
+ cmplw cr1, rWORD3, rWORD4
+ bne cr7, L(duLcr7)
+ srw r0, rWORD6, rSHR
+ slw rWORD6_SHIFT, rWORD6, rSHL
+ or rWORD6, r0, rWORD4_SHIFT
+#ifdef __LITTLE_ENDIAN__
+ lwbrx rWORD7, 0, rSTR1
+ lwbrx rWORD8, 0, rSTR2
+ addi rSTR1, rSTR1, 4
+ addi rSTR2, rSTR2, 4
+#else
+ lwzu rWORD7, 12(rSTR1)
+ lwzu rWORD8, 12(rSTR2)
+#endif
+ cmplw cr6, rWORD5, rWORD6
+ bne cr1, L(duLcr1)
+ srw r12, rWORD8, rSHR
+ slw rWORD8_SHIFT, rWORD8, rSHL
+ or rWORD8, r12, rWORD6_SHIFT
+ cmplw cr5, rWORD7, rWORD8
bdz L(du24) /* Adjust CTR as we start with +4 */
/* This is the primary loop */
.align 4
L(duLoop):
- lwz rWORD1,4(rSTR1)
- lwz rWORD2,4(rSTR2)
- cmplw cr1,rWORD3,rWORD4
- bne cr6,L(duLcr6)
- srw rA,rWORD2,rSHR
- slw rD,rWORD2,rSHL
- or rWORD2,rA,rB
+#ifdef __LITTLE_ENDIAN__
+ lwbrx rWORD1, 0, rSTR1
+ lwbrx rWORD2, 0, rSTR2
+ addi rSTR1, rSTR1, 4
+ addi rSTR2, rSTR2, 4
+#else
+ lwz rWORD1, 4(rSTR1)
+ lwz rWORD2, 4(rSTR2)
+#endif
+ cmplw cr1, rWORD3, rWORD4
+ bne cr6, L(duLcr6)
+ srw r0, rWORD2, rSHR
+ slw rWORD2_SHIFT, rWORD2, rSHL
+ or rWORD2, r0, rWORD8_SHIFT
L(duLoop1):
- lwz rWORD3,8(rSTR1)
- lwz rWORD4,8(rSTR2)
- cmplw cr6,rWORD5,rWORD6
- bne cr5,L(duLcr5)
- srw rC,rWORD4,rSHR
- slw rF,rWORD4,rSHL
- or rWORD4,rC,rD
+#ifdef __LITTLE_ENDIAN__
+ lwbrx rWORD3, 0, rSTR1
+ lwbrx rWORD4, 0, rSTR2
+ addi rSTR1, rSTR1, 4
+ addi rSTR2, rSTR2, 4
+#else
+ lwz rWORD3, 8(rSTR1)
+ lwz rWORD4, 8(rSTR2)
+#endif
+ cmplw cr6, rWORD5, rWORD6
+ bne cr5, L(duLcr5)
+ srw r12, rWORD4, rSHR
+ slw rWORD4_SHIFT, rWORD4, rSHL
+ or rWORD4, r12, rWORD2_SHIFT
L(duLoop2):
- lwz rWORD5,12(rSTR1)
- lwz rWORD6,12(rSTR2)
- cmplw cr5,rWORD7,rWORD8
- bne cr0,L(duLcr0)
- srw rE,rWORD6,rSHR
- slw rH,rWORD6,rSHL
- or rWORD6,rE,rF
+#ifdef __LITTLE_ENDIAN__
+ lwbrx rWORD5, 0, rSTR1
+ lwbrx rWORD6, 0, rSTR2
+ addi rSTR1, rSTR1, 4
+ addi rSTR2, rSTR2, 4
+#else
+ lwz rWORD5, 12(rSTR1)
+ lwz rWORD6, 12(rSTR2)
+#endif
+ cmplw cr5, rWORD7, rWORD8
+ bne cr7, L(duLcr7)
+ srw r0, rWORD6, rSHR
+ slw rWORD6_SHIFT, rWORD6, rSHL
+ or rWORD6, r0, rWORD4_SHIFT
L(duLoop3):
- lwzu rWORD7,16(rSTR1)
- lwzu rWORD8,16(rSTR2)
- cmplw cr0,rWORD1,rWORD2
- bne cr1,L(duLcr1)
- srw rG,rWORD8,rSHR
- slw rB,rWORD8,rSHL
- or rWORD8,rG,rH
+#ifdef __LITTLE_ENDIAN__
+ lwbrx rWORD7, 0, rSTR1
+ lwbrx rWORD8, 0, rSTR2
+ addi rSTR1, rSTR1, 4
+ addi rSTR2, rSTR2, 4
+#else
+ lwzu rWORD7, 16(rSTR1)
+ lwzu rWORD8, 16(rSTR2)
+#endif
+ cmplw cr7, rWORD1, rWORD2
+ bne cr1, L(duLcr1)
+ srw r12, rWORD8, rSHR
+ slw rWORD8_SHIFT, rWORD8, rSHL
+ or rWORD8, r12, rWORD6_SHIFT
bdnz L(duLoop)
L(duL4):
- bne cr1,L(duLcr1)
- cmplw cr1,rWORD3,rWORD4
- bne cr6,L(duLcr6)
- cmplw cr6,rWORD5,rWORD6
- bne cr5,L(duLcr5)
- cmplw cr5,rWORD7,rWORD8
+#if 0
+/* Huh? We've already branched on cr1! */
+ bne cr1, L(duLcr1)
+#endif
+ cmplw cr1, rWORD3, rWORD4
+ bne cr6, L(duLcr6)
+ cmplw cr6, rWORD5, rWORD6
+ bne cr5, L(duLcr5)
+ cmplw cr5, rWORD7, rWORD8
L(du44):
- bne cr0,L(duLcr0)
+ bne cr7, L(duLcr7)
L(du34):
- bne cr1,L(duLcr1)
+ bne cr1, L(duLcr1)
L(du24):
- bne cr6,L(duLcr6)
+ bne cr6, L(duLcr6)
L(du14):
- slwi. rN,rN,3
- bne cr5,L(duLcr5)
+ slwi. rN, rN, 3
+ bne cr5, L(duLcr5)
/* At this point we have a remainder of 1 to 3 bytes to compare. We use
shift right to eliminate bits beyond the compare length.
+ This allows the use of word subtract to compute the final result.
However it may not be safe to load rWORD2 which may be beyond the
string length. So we compare the bit length of the remainder to
the right shift count (rSHR). If the bit count is less than or equal
we do not need to load rWORD2 (all significant bits are already in
- rB). */
- cmplw cr7,rN,rSHR
+ rWORD8_SHIFT). */
+ cmplw cr7, rN, rSHR
beq L(duZeroReturn)
- li rA,0
- ble cr7,L(dutrim)
- lwz rWORD2,4(rSTR2)
- srw rA,rWORD2,rSHR
+ li r0, 0
+ ble cr7, L(dutrim)
+#ifdef __LITTLE_ENDIAN__
+ lwbrx rWORD2, 0, rSTR2
+ addi rSTR2, rSTR2, 4
+#else
+ lwz rWORD2, 4(rSTR2)
+#endif
+ srw r0, rWORD2, rSHR
.align 4
L(dutrim):
- lwz rWORD1,4(rSTR1)
- lwz r31,48(1)
- subfic rN,rN,32 /* Shift count is 32 - (rN * 8). */
- or rWORD2,rA,rB
- lwz r30,44(1)
- lwz r29,40(r1)
- srw rWORD1,rWORD1,rN
- srw rWORD2,rWORD2,rN
- lwz r28,36(r1)
- lwz r27,32(r1)
- cmplw rWORD1,rWORD2
- li rRTN,0
- beq L(dureturn26)
- li rRTN,1
- bgt L(dureturn26)
- li rRTN,-1
+#ifdef __LITTLE_ENDIAN__
+ lwbrx rWORD1, 0, rSTR1
+#else
+ lwz rWORD1, 4(rSTR1)
+#endif
+ lwz rWORD8, 48(r1)
+ subfic rN, rN, 32 /* Shift count is 32 - (rN * 8). */
+ or rWORD2, r0, rWORD8_SHIFT
+ lwz rWORD7, 44(r1)
+ lwz rSHL, 40(r1)
+ srw rWORD1, rWORD1, rN
+ srw rWORD2, rWORD2, rN
+ lwz rSHR, 36(r1)
+ lwz rWORD8_SHIFT, 32(r1)
+ sub rRTN, rWORD1, rWORD2
b L(dureturn26)
.align 4
-L(duLcr0):
- lwz r31,48(1)
- lwz r30,44(1)
- li rRTN,1
- bgt cr0,L(dureturn29)
- lwz r29,40(r1)
- lwz r28,36(r1)
- li rRTN,-1
+L(duLcr7):
+ lwz rWORD8, 48(r1)
+ lwz rWORD7, 44(r1)
+ li rRTN, 1
+ bgt cr7, L(dureturn29)
+ lwz rSHL, 40(r1)
+ lwz rSHR, 36(r1)
+ li rRTN, -1
b L(dureturn27)
.align 4
L(duLcr1):
- lwz r31,48(1)
- lwz r30,44(1)
- li rRTN,1
- bgt cr1,L(dureturn29)
- lwz r29,40(r1)
- lwz r28,36(r1)
- li rRTN,-1
+ lwz rWORD8, 48(r1)
+ lwz rWORD7, 44(r1)
+ li rRTN, 1
+ bgt cr1, L(dureturn29)
+ lwz rSHL, 40(r1)
+ lwz rSHR, 36(r1)
+ li rRTN, -1
b L(dureturn27)
.align 4
L(duLcr6):
- lwz r31,48(1)
- lwz r30,44(1)
- li rRTN,1
- bgt cr6,L(dureturn29)
- lwz r29,40(r1)
- lwz r28,36(r1)
- li rRTN,-1
+ lwz rWORD8, 48(r1)
+ lwz rWORD7, 44(r1)
+ li rRTN, 1
+ bgt cr6, L(dureturn29)
+ lwz rSHL, 40(r1)
+ lwz rSHR, 36(r1)
+ li rRTN, -1
b L(dureturn27)
.align 4
L(duLcr5):
- lwz r31,48(1)
- lwz r30,44(1)
- li rRTN,1
- bgt cr5,L(dureturn29)
- lwz r29,40(r1)
- lwz r28,36(r1)
- li rRTN,-1
+ lwz rWORD8, 48(r1)
+ lwz rWORD7, 44(r1)
+ li rRTN, 1
+ bgt cr5, L(dureturn29)
+ lwz rSHL, 40(r1)
+ lwz rSHR, 36(r1)
+ li rRTN, -1
b L(dureturn27)
.align 3
L(duZeroReturn):
- li rRTN,0
+ li rRTN, 0
.align 4
L(dureturn):
- lwz r31,48(1)
- lwz r30,44(1)
+ lwz rWORD8, 48(r1)
+ lwz rWORD7, 44(r1)
L(dureturn29):
- lwz r29,40(r1)
- lwz r28,36(r1)
+ lwz rSHL, 40(r1)
+ lwz rSHR, 36(r1)
L(dureturn27):
- lwz r27,32(r1)
+ lwz rWORD8_SHIFT, 32(r1)
L(dureturn26):
- lwz r26,28(r1)
+ lwz rWORD2_SHIFT, 28(r1)
L(dureturn25):
- lwz r25,24(r1)
- lwz r24,20(r1)
- lwz 1,0(1)
+ lwz rWORD4_SHIFT, 24(r1)
+ lwz rWORD6_SHIFT, 20(r1)
+ addi r1, r1, 64
+ cfi_adjust_cfa_offset(-64)
blr
END (memcmp)
+
libc_hidden_builtin_def (memcmp)
-weak_alias (memcmp,bcmp)
+weak_alias (memcmp, bcmp)
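The little-endian paths added above replace each lwz/lwzu with lwbrx plus an explicit addi, so every word is loaded with its first memory byte in the most significant position; a single unsigned word compare (or, for the short shifted tails, a plain subtract) then agrees with byte-wise memcmp ordering on either endianness.  A minimal C sketch of that idea, not the glibc code, with an invented helper name and assuming GCC/Clang builtins plus 4-byte-aligned buffers whose length is a multiple of 4:

#include <stddef.h>
#include <stdint.h>
#include <string.h>

/* Hypothetical helper illustrating the byte-reversed-load trick.  */
static int
wordwise_memcmp (const void *a, const void *b, size_t n)
{
  const unsigned char *p = a, *q = b;
  for (; n >= 4; n -= 4, p += 4, q += 4)
    {
      uint32_t w1, w2;
      memcpy (&w1, p, 4);
      memcpy (&w2, q, 4);
#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
      /* What lwbrx achieves: the first byte in memory becomes the most
         significant byte of the register.  */
      w1 = __builtin_bswap32 (w1);
      w2 = __builtin_bswap32 (w2);
#endif
      if (w1 != w2)
        return w1 > w2 ? 1 : -1;
    }
  return 0;
}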
diff --git a/libc/sysdeps/powerpc/powerpc32/power7/memcpy.S b/libc/sysdeps/powerpc/powerpc32/power7/memcpy.S
index 7f0077823..acf3c1019 100644
--- a/libc/sysdeps/powerpc/powerpc32/power7/memcpy.S
+++ b/libc/sysdeps/powerpc/powerpc32/power7/memcpy.S
@@ -383,7 +383,7 @@ L(copy_GE_32_unaligned):
beq L(copy_GE_32_unaligned_cont)
- /* SRC is not quadword aligned, get it aligned. */
+ /* DST is not quadword aligned, get it aligned. */
mtcrf 0x01,0
subf 31,0,5
@@ -435,13 +435,21 @@ L(copy_GE_32_unaligned_cont):
mr 11,12
mtcrf 0x01,9
cmplwi cr6,9,1
+#ifdef __LITTLE_ENDIAN__
+ lvsr 5,0,12
+#else
lvsl 5,0,12
+#endif
lvx 3,0,12
bf 31,L(setup_unaligned_loop)
	/* Copy another 16 bytes to align to 32 bytes due to the loop.  */
lvx 4,12,6
+#ifdef __LITTLE_ENDIAN__
+ vperm 6,4,3,5
+#else
vperm 6,3,4,5
+#endif
addi 11,12,16
addi 10,3,16
stvx 6,0,3
@@ -461,11 +469,17 @@ L(unaligned_loop):
vector instructions though. */
lvx 4,11,6 /* vr4 = r11+16. */
- vperm 6,3,4,5 /* Merge the correctly-aligned portions
- of vr3/vr4 into vr6. */
+#ifdef __LITTLE_ENDIAN__
+ vperm 6,4,3,5
+#else
+ vperm 6,3,4,5
+#endif
lvx 3,11,7 /* vr3 = r11+32. */
- vperm 10,4,3,5 /* Merge the correctly-aligned portions
- of vr3/vr4 into vr10. */
+#ifdef __LITTLE_ENDIAN__
+ vperm 10,3,4,5
+#else
+ vperm 10,4,3,5
+#endif
addi 11,11,32
stvx 6,0,10
stvx 10,10,6
diff --git a/libc/sysdeps/powerpc/powerpc32/power7/mempcpy.S b/libc/sysdeps/powerpc/powerpc32/power7/mempcpy.S
index 5ad4edb58..4610ec5b5 100644
--- a/libc/sysdeps/powerpc/powerpc32/power7/mempcpy.S
+++ b/libc/sysdeps/powerpc/powerpc32/power7/mempcpy.S
@@ -325,7 +325,7 @@ L(copy_GE_32_unaligned):
beq L(copy_GE_32_unaligned_cont)
- /* SRC is not quadword aligned, get it aligned. */
+ /* DST is not quadword aligned, get it aligned. */
mtcrf 0x01,0
subf 31,0,5
@@ -377,13 +377,21 @@ L(copy_GE_32_unaligned_cont):
mr 11,12
mtcrf 0x01,9
cmplwi cr6,9,1
- lvsl 5,0,12
+#ifdef __LITTLE_ENDIAN__
+ lvsr 5,0,12
+#else
+ lvsl 5,0,12
+#endif
lvx 3,0,12
bf 31,L(setup_unaligned_loop)
	/* Copy another 16 bytes to align to 32 bytes due to the loop.  */
lvx 4,12,6
- vperm 6,3,4,5
+#ifdef __LITTLE_ENDIAN__
+ vperm 6,4,3,5
+#else
+ vperm 6,3,4,5
+#endif
addi 11,12,16
addi 10,3,16
stvx 6,0,3
@@ -403,11 +411,17 @@ L(unaligned_loop):
vector instructions though. */
lvx 4,11,6 /* vr4 = r11+16. */
- vperm 6,3,4,5 /* Merge the correctly-aligned portions
- of vr3/vr4 into vr6. */
+#ifdef __LITTLE_ENDIAN__
+ vperm 6,4,3,5
+#else
+ vperm 6,3,4,5
+#endif
lvx 3,11,7 /* vr3 = r11+32. */
- vperm 10,4,3,5 /* Merge the correctly-aligned portions
- of vr3/vr4 into vr10. */
+#ifdef __LITTLE_ENDIAN__
+ vperm 10,3,4,5
+#else
+ vperm 10,4,3,5
+#endif
addi 11,11,32
stvx 6,0,10
stvx 10,10,6
diff --git a/libc/sysdeps/powerpc/powerpc32/power7/memrchr.S b/libc/sysdeps/powerpc/powerpc32/power7/memrchr.S
index defd832b0..9601aa799 100644
--- a/libc/sysdeps/powerpc/powerpc32/power7/memrchr.S
+++ b/libc/sysdeps/powerpc/powerpc32/power7/memrchr.S
@@ -23,117 +23,131 @@
.machine power7
ENTRY (__memrchr)
CALL_MCOUNT
- dcbt 0,r3
- mr r7,r3
- add r3,r7,r5 /* Calculate the last acceptable address. */
- cmplw cr7,r3,r7 /* Is the address equal or less than r3? */
+ add r7,r3,r5 /* Calculate the last acceptable address. */
+ neg r0,r7
+ addi r7,r7,-1
+ mr r10,r3
+ clrrwi r6,r7,7
+ li r9,3<<5
+ dcbt r9,r6,16 /* Stream hint, decreasing addresses. */
/* Replicate BYTE to word. */
- rlwimi r4,r4,8,16,23
- rlwimi r4,r4,16,0,15
- bge cr7,L(proceed)
-
- li r3,-1 /* Make r11 the biggest if r4 <= 0. */
-L(proceed):
+ rldimi r4,r4,8,48
+ rldimi r4,r4,16,32
li r6,-4
- addi r9,r3,-1
- clrrwi r8,r9,2
- addi r8,r8,4
- neg r0,r3
+ li r9,-1
rlwinm r0,r0,3,27,28 /* Calculate padding. */
-
+ clrrwi r8,r7,2
+ srw r9,r9,r0
cmplwi r5,16
+ clrrwi r0,r10,2
ble L(small_range)
- lwbrx r12,r8,r6 /* Load reversed word from memory. */
- cmpb r10,r12,r4 /* Check for BYTE in WORD1. */
- slw r10,r10,r0
- srw r10,r10,r0
- cmplwi cr7,r10,0 /* If r10 == 0, no BYTEs have been found. */
+#ifdef __LITTLE_ENDIAN__
+ lwzx r12,0,r8
+#else
+ lwbrx r12,0,r8 /* Load reversed word from memory. */
+#endif
+ cmpb r3,r12,r4 /* Check for BYTE in WORD1. */
+ and r3,r3,r9
+ cmplwi cr7,r3,0 /* If r3 == 0, no BYTEs have been found. */
bne cr7,L(done)
- /* Are we done already? */
- addi r9,r8,-4
- cmplw cr6,r9,r7
- ble cr6,L(null)
-
mtcrf 0x01,r8
/* Are we now aligned to a doubleword boundary? If so, skip to
the main loop. Otherwise, go through the alignment code. */
- mr r8,r9
- bt 29,L(loop_setup)
+ bf 29,L(loop_setup)
/* Handle WORD2 of pair. */
+#ifdef __LITTLE_ENDIAN__
+ lwzx r12,r8,r6
+#else
lwbrx r12,r8,r6
- cmpb r10,r12,r4
- cmplwi cr7,r10,0
- bne cr7,L(done)
-
- /* Are we done already? */
+#endif
addi r8,r8,-4
- cmplw cr6,r8,r7
- ble cr6,L(null)
+ cmpb r3,r12,r4
+ cmplwi cr7,r3,0
+ bne cr7,L(done)
L(loop_setup):
- li r0,-8
- sub r5,r8,r7
- srwi r9,r5,3 /* Number of loop iterations. */
+ /* The last word we want to read in the loop below is the one
+ containing the first byte of the string, ie. the word at
+ s & ~3, or r0. The first word read is at r8 - 4, we
+ read 2 * cnt words, so the last word read will be at
+ r8 - 4 - 8 * cnt + 4. Solving for cnt gives
+ cnt = (r8 - r0) / 8 */
+ sub r5,r8,r0
+ addi r8,r8,-4
+ srwi r9,r5,3 /* Number of loop iterations. */
mtctr r9 /* Setup the counter. */
- b L(loop)
- /* Main loop to look for BYTE backwards in the string. Since it's a
- small loop (< 8 instructions), align it to 32-bytes. */
- .p2align 5
+
+ /* Main loop to look for BYTE backwards in the string.
+ FIXME: Investigate whether 32 byte align helps with this
+ 9 instruction loop. */
+ .align 5
L(loop):
/* Load two words, compare and merge in a
single register for speed. This is an attempt
to speed up the byte-checking process for bigger strings. */
- lwbrx r12,r8,r6
- lwbrx r11,r8,r0
- addi r8,r8,-4
- cmpb r10,r12,r4
+#ifdef __LITTLE_ENDIAN__
+ lwzx r12,0,r8
+ lwzx r11,r8,r6
+#else
+ lwbrx r12,0,r8
+ lwbrx r11,r8,r6
+#endif
+ cmpb r3,r12,r4
cmpb r9,r11,r4
- or r5,r9,r10 /* Merge everything in one word. */
+ or r5,r9,r3 /* Merge everything in one word. */
cmplwi cr7,r5,0
bne cr7,L(found)
- addi r8,r8,-4
+ addi r8,r8,-8
bdnz L(loop)
- /* We're here because the counter reached 0, and that means we
- didn't have any matches for BYTE in the whole range. Just return
- the original range. */
- addi r8,r8,4
- cmplw cr6,r8,r7
- bgt cr6,L(loop_small)
- b L(null)
- /* OK, one (or both) of the words contains BYTE. Check
- the first word and decrement the address in case the first
- word really contains BYTE. */
+ /* We may have one more word to read. */
+ cmplw r8,r0
+ bnelr
+
+#ifdef __LITTLE_ENDIAN__
+ lwzx r12,0,r8
+#else
+ lwbrx r12,0,r8
+#endif
+ cmpb r3,r12,r4
+ cmplwi cr7,r3,0
+ bne cr7,L(done)
+ blr
+
.align 4
L(found):
- cmplwi cr6,r10,0
- addi r8,r8,4
+ /* OK, one (or both) of the words contains BYTE. Check
+ the first word. */
+ cmplwi cr6,r3,0
bne cr6,L(done)
/* BYTE must be in the second word. Adjust the address
- again and move the result of cmpb to r10 so we can calculate the
+ again and move the result of cmpb to r3 so we can calculate the
pointer. */
- mr r10,r9
+ mr r3,r9
addi r8,r8,-4
- /* r10 has the output of the cmpb instruction, that is, it contains
+ /* r3 has the output of the cmpb instruction, that is, it contains
0xff in the same position as BYTE in the original
word from the string. Use that to calculate the pointer.
We need to make sure BYTE is *before* the end of the
range. */
L(done):
- cntlzw r0,r10 /* Count leading zeroes before the match. */
- srwi r6,r0,3 /* Convert leading zeroes to bytes. */
- addi r0,r6,1
+ cntlzw r9,r3 /* Count leading zeros before the match. */
+ cmplw r8,r0 /* Are we on the last word? */
+ srwi r6,r9,3 /* Convert leading zeros to bytes. */
+ addi r0,r6,-3
sub r3,r8,r0
- cmplw r3,r7
- blt L(null)
+ cmplw cr7,r3,r10
+ bnelr
+ bgelr cr7
+ li r3,0
blr
.align 4
@@ -147,28 +161,35 @@ L(small_range):
cmplwi r5,0
beq L(null)
- lwbrx r12,r8,r6 /* Load reversed word from memory. */
- cmpb r10,r12,r4 /* Check for null bytes in WORD1. */
- slw r10,r10,r0
- srw r10,r10,r0
- cmplwi cr7,r10,0
+#ifdef __LITTLE_ENDIAN__
+ lwzx r12,0,r8
+#else
+ lwbrx r12,0,r8 /* Load reversed word from memory. */
+#endif
+ cmpb r3,r12,r4 /* Check for BYTE in WORD1. */
+ and r3,r3,r9
+ cmplwi cr7,r3,0
bne cr7,L(done)
+ /* Are we done already? */
+ cmplw r8,r0
addi r8,r8,-4
- cmplw r8,r7
- ble L(null)
- b L(loop_small)
+ beqlr
- .p2align 5
+ .align 5
L(loop_small):
- lwbrx r12,r8,r6
- cmpb r10,r12,r4
- cmplwi cr6,r10,0
- bne cr6,L(done)
+#ifdef __LITTLE_ENDIAN__
+ lwzx r12,0,r8
+#else
+ lwbrx r12,0,r8
+#endif
+ cmpb r3,r12,r4
+ cmplw r8,r0
+ cmplwi cr7,r3,0
+ bne cr7,L(done)
addi r8,r8,-4
- cmplw r8,r7
- ble L(null)
- b L(loop_small)
+ bne L(loop_small)
+ blr
END (__memrchr)
weak_alias (__memrchr, memrchr)
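Both endian variants of __memrchr reach L(done) with a cmpb-style mask holding 0xff in every matching byte position and use cntlzw to turn it into an offset; only the loads differ (lwbrx on big-endian, plain lwzx on little-endian) so that the highest-addressed byte always ends up most significant.  A rough, hypothetical C equivalent, assuming GCC builtins (the per-byte loop merely stands in for cmpb):

#include <stdint.h>
#include <string.h>

/* Hypothetical helper: address of the last occurrence of C in the
   4-byte word at P, or NULL if there is none.  */
static const unsigned char *
last_byte_in_word (const unsigned char *p, unsigned char c)
{
  uint32_t w;
  memcpy (&w, p, 4);
#if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
  w = __builtin_bswap32 (w);    /* what lwbrx does on the big-endian path */
#endif
  /* Emulate cmpb: 0xff in every byte position equal to C.  */
  uint32_t mask = 0;
  for (int i = 0; i < 4; i++)
    if (((w >> (8 * i)) & 0xff) == c)
      mask |= 0xffu << (8 * i);
  if (mask == 0)
    return 0;
  /* Leading zeros, in bytes, measure the distance from the end of the
     word back to the match nearest the end.  */
  return p + 3 - __builtin_clz (mask) / 8;
}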
diff --git a/libc/sysdeps/powerpc/powerpc32/power7/memset.S b/libc/sysdeps/powerpc/powerpc32/power7/memset.S
index 360ea717f..aadda2558 100644
--- a/libc/sysdeps/powerpc/powerpc32/power7/memset.S
+++ b/libc/sysdeps/powerpc/powerpc32/power7/memset.S
@@ -35,8 +35,8 @@ L(_memset):
cfi_offset(31,-8)
/* Replicate byte to word. */
- rlwimi 4,4,8,16,23
- rlwimi 4,4,16,0,15
+ insrdi 4,4,8,48
+ insrdi 4,4,16,32
ble cr6,L(small) /* If length <= 8, use short copy code. */
diff --git a/libc/sysdeps/powerpc/powerpc32/power7/rawmemchr.S b/libc/sysdeps/powerpc/powerpc32/power7/rawmemchr.S
index a80c74a09..c2d8c4b7b 100644
--- a/libc/sysdeps/powerpc/powerpc32/power7/rawmemchr.S
+++ b/libc/sysdeps/powerpc/powerpc32/power7/rawmemchr.S
@@ -27,16 +27,21 @@ ENTRY (__rawmemchr)
clrrwi r8,r3,2 /* Align the address to word boundary. */
/* Replicate byte to word. */
- rlwimi r4,r4,8,16,23
- rlwimi r4,r4,16,0,15
+ rldimi r4,r4,8,48
+ rldimi r4,r4,16,32
/* Now r4 has a word of c bytes. */
rlwinm r6,r3,3,27,28 /* Calculate padding. */
lwz r12,0(r8) /* Load word from memory. */
cmpb r5,r12,r4 /* Compare each byte against c byte. */
+#ifdef __LITTLE_ENDIAN__
+ srw r5,r5,r6
+ slw r5,r5,r6
+#else
slw r5,r5,r6 /* Move left to discard ignored bits. */
srw r5,r5,r6 /* Bring the bits back as zeros. */
+#endif
cmpwi cr7,r5,0 /* If r5 == 0, no c bytes have been found. */
bne cr7,L(done)
@@ -90,8 +95,14 @@ L(loop):
word from the string. Use that fact to find out what is
the position of the byte inside the string. */
L(done):
+#ifdef __LITTLE_ENDIAN__
+ addi r0,r5,-1
+ andc r0,r0,r5
+ popcntw r0,r0
+#else
cntlzw r0,r5 /* Count leading zeros before the match. */
- srwi r0,r0,3 /* Convert leading zeroes to bytes. */
+#endif
+ srwi r0,r0,3 /* Convert leading zeros to bytes. */
add r3,r8,r0 /* Return address of the matching char. */
blr
END (__rawmemchr)
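The endian-dependent pieces of __rawmemchr are small: the padding shifts swap direction because the bytes before the start of the buffer sit at the most significant end of a big-endian load but at the least significant end of a little-endian one, and the final mask-to-index step switches from leading zeros to the popcount idiom shown for strlen below.  The padding half, as a hedged C sketch with the same builtin assumptions and an invented helper name:

#include <stdint.h>

/* Hypothetical helper: clear the match bits belonging to the PAD/8
   bytes that precede the real start of the buffer in the first,
   rounded-down aligned word.  PAD is 8 * (s & 3), so at most 24.  */
static uint32_t
drop_leading_bytes (uint32_t mask, unsigned pad)
{
#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
  return (mask >> pad) << pad;  /* srw then slw in the patch */
#else
  return (mask << pad) >> pad;  /* slw then srw */
#endif
}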
diff --git a/libc/sysdeps/powerpc/powerpc32/power7/strchr.S b/libc/sysdeps/powerpc/powerpc32/power7/strchr.S
index 0ecadb271..b66265967 100644
--- a/libc/sysdeps/powerpc/powerpc32/power7/strchr.S
+++ b/libc/sysdeps/powerpc/powerpc32/power7/strchr.S
@@ -35,8 +35,8 @@ ENTRY (strchr)
beq cr7,L(null_match)
/* Replicate byte to word. */
- rlwimi r4,r4,8,16,23
- rlwimi r4,r4,16,0,15
+ insrdi r4,r4,8,48
+ insrdi r4,r4,16,32
/* Now r4 has a word of c bytes and r0 has
a word of null bytes. */
@@ -46,11 +46,17 @@ ENTRY (strchr)
/* Move the words left and right to discard the bits that are
not part of the string and to bring them back as zeros. */
-
+#ifdef __LITTLE_ENDIAN__
+ srw r10,r10,r6
+ srw r11,r11,r6
+ slw r10,r10,r6
+ slw r11,r11,r6
+#else
slw r10,r10,r6
slw r11,r11,r6
srw r10,r10,r6
srw r11,r11,r6
+#endif
or r5,r10,r11 /* OR the results to speed things up. */
cmpwi cr7,r5,0 /* If r5 == 0, no c or null bytes
have been found. */
@@ -65,7 +71,7 @@ ENTRY (strchr)
/* Handle WORD2 of pair. */
lwzu r12,4(r8)
- cmpb r10,r12,r4
+ cmpb r10,r12,r4
cmpb r11,r12,r0
or r5,r10,r11
cmpwi cr7,r5,0
@@ -100,22 +106,31 @@ L(loop):
bne cr6,L(done)
/* The c/null byte must be in the second word. Adjust the address
- again and move the result of cmpb to r10 so we can calculate the
- pointer. */
+ again and move the result of cmpb to r10/r11 so we can calculate
+ the pointer. */
mr r10,r6
mr r11,r7
addi r8,r8,4
- /* r5 has the output of the cmpb instruction, that is, it contains
+ /* r10/r11 have the output of the cmpb instructions, that is,
0xff in the same position as the c/null byte in the original
word from the string. Use that to calculate the pointer. */
L(done):
- cntlzw r4,r10 /* Count leading zeroes before c matches. */
- cntlzw r0,r11 /* Count leading zeroes before null matches. */
- cmplw cr7,r4,r0
+#ifdef __LITTLE_ENDIAN__
+ addi r3,r10,-1
+ andc r3,r3,r10
+ popcntw r0,r3
+ addi r4,r11,-1
+ andc r4,r4,r11
+ cmplw cr7,r3,r4
+ bgt cr7,L(no_match)
+#else
+ cntlzw r0,r10 /* Count leading zeros before c matches. */
+ cmplw cr7,r11,r10
bgt cr7,L(no_match)
- srwi r0,r4,3 /* Convert leading zeroes to bytes. */
+#endif
+ srwi r0,r0,3 /* Convert leading zeros to bytes. */
add r3,r8,r0 /* Return address of the matching c byte
or null in case c was not found. */
blr
@@ -133,10 +148,14 @@ L(null_match):
cmpb r5,r12,r0 /* Compare each byte against null bytes. */
/* Move the words left and right to discard the bits that are
- not part of the string and to bring them back as zeros. */
-
+ not part of the string and bring them back as zeros. */
+#ifdef __LITTLE_ENDIAN__
+ srw r5,r5,r6
+ slw r5,r5,r6
+#else
slw r5,r5,r6
srw r5,r5,r6
+#endif
	cmpwi	cr7,r5,0	/* If r5 == 0, no c or null bytes
have been found. */
bne cr7,L(done_null)
@@ -191,7 +210,13 @@ L(loop_null):
0xff in the same position as the null byte in the original
word from the string. Use that to calculate the pointer. */
L(done_null):
+#ifdef __LITTLE_ENDIAN__
+ addi r0,r5,-1
+ andc r0,r0,r5
+ popcntw r0,r0
+#else
cntlzw r0,r5 /* Count leading zeros before the match. */
+#endif
srwi r0,r0,3 /* Convert leading zeros to bytes. */
add r3,r8,r0 /* Return address of the matching null byte. */
blr
diff --git a/libc/sysdeps/powerpc/powerpc32/power7/strchrnul.S b/libc/sysdeps/powerpc/powerpc32/power7/strchrnul.S
index d4cacab60..f5d24d434 100644
--- a/libc/sysdeps/powerpc/powerpc32/power7/strchrnul.S
+++ b/libc/sysdeps/powerpc/powerpc32/power7/strchrnul.S
@@ -27,8 +27,8 @@ ENTRY (__strchrnul)
clrrwi r8,r3,2 /* Align the address to word boundary. */
/* Replicate byte to word. */
- rlwimi r4,r4,8,16,23
- rlwimi r4,r4,16,0,15
+ insrdi r4,r4,8,48
+ insrdi r4,r4,16,32
rlwinm r6,r3,3,27,28 /* Calculate padding. */
lwz r12,0(r8) /* Load word from memory. */
@@ -43,10 +43,17 @@ ENTRY (__strchrnul)
/* Move the words left and right to discard the bits that are
not part of the string and bring them back as zeros. */
+#ifdef __LITTLE_ENDIAN__
+ srw r10,r10,r6
+ srw r9,r9,r6
+ slw r10,r10,r6
+ slw r9,r9,r6
+#else
slw r10,r10,r6
slw r9,r9,r6
srw r10,r10,r6
srw r9,r9,r6
+#endif
or r5,r9,r10 /* OR the results to speed things up. */
cmpwi cr7,r5,0 /* If r5 == 0, no c or null bytes
have been found. */
@@ -54,7 +61,7 @@ ENTRY (__strchrnul)
mtcrf 0x01,r8
- /* Are we now aligned to a quadword boundary? If so, skip to
+ /* Are we now aligned to a doubleword boundary? If so, skip to
the main loop. Otherwise, go through the alignment code. */
bt 29,L(loop)
@@ -76,7 +83,7 @@ L(loop):
single register for speed. This is an attempt
to speed up the null-checking process for bigger strings. */
lwz r12,4(r8)
- lwzu r11,8(r8)
+ lwzu r11,8(r8)
cmpb r10,r12,r0
cmpb r9,r12,r4
cmpb r6,r11,r0
@@ -95,9 +102,9 @@ L(loop):
addi r8,r8,-4
bne cr6,L(done)
- /* The c/null byte must be in the second word. Adjust the
- address again and move the result of cmpb to r10 so we can calculate
- the pointer. */
+ /* The c/null byte must be in the second word. Adjust the address
+ again and move the result of cmpb to r5 so we can calculate the
+ pointer. */
mr r5,r10
addi r8,r8,4
@@ -105,7 +112,13 @@ L(loop):
0xff in the same position as the c/null byte in the original
word from the string. Use that to calculate the pointer. */
L(done):
+#ifdef __LITTLE_ENDIAN__
+ addi r0,r5,-1
+ andc r0,r0,r5
+ popcntw r0,r0
+#else
cntlzw r0,r5 /* Count leading zeros before the match. */
+#endif
srwi r0,r0,3 /* Convert leading zeros to bytes. */
add r3,r8,r0 /* Return address of matching c/null byte. */
blr
diff --git a/libc/sysdeps/powerpc/powerpc32/power7/strlen.S b/libc/sysdeps/powerpc/powerpc32/power7/strlen.S
index b71a10f5c..b08d6c028 100644
--- a/libc/sysdeps/powerpc/powerpc32/power7/strlen.S
+++ b/libc/sysdeps/powerpc/powerpc32/power7/strlen.S
@@ -29,7 +29,11 @@ ENTRY (strlen)
li r0,0 /* Word with null chars to use with cmpb. */
li r5,-1 /* MASK = 0xffffffffffffffff. */
lwz r12,0(r4) /* Load word from memory. */
+#ifdef __LITTLE_ENDIAN__
+ slw r5,r5,r6
+#else
srw r5,r5,r6 /* MASK = MASK >> padding. */
+#endif
orc r9,r12,r5 /* Mask bits that are not part of the string. */
cmpb r10,r9,r0 /* Check for null bytes in WORD1. */
cmpwi cr7,r10,0 /* If r10 == 0, no null's have been found. */
@@ -47,9 +51,6 @@ ENTRY (strlen)
cmpb r10,r12,r0
cmpwi cr7,r10,0
bne cr7,L(done)
- b L(loop) /* We branch here (rather than falling through)
- to skip the nops due to heavy alignment
- of the loop below. */
/* Main loop to look for the end of the string. Since it's a
small loop (< 8 instructions), align it to 32-bytes. */
@@ -86,9 +87,15 @@ L(loop):
0xff in the same position as the null byte in the original
word from the string. Use that to calculate the length. */
L(done):
- cntlzw r0,r10 /* Count leading zeroes before the match. */
+#ifdef __LITTLE_ENDIAN__
+ addi r9, r10, -1 /* Form a mask from trailing zeros. */
+ andc r9, r9, r10
+ popcntw r0, r9 /* Count the bits in the mask. */
+#else
+ cntlzw r0,r10 /* Count leading zeros before the match. */
+#endif
subf r5,r3,r4
- srwi r0,r0,3 /* Convert leading zeroes to bytes. */
+ srwi r0,r0,3 /* Convert leading zeros to bytes. */
add r3,r5,r0 /* Compute final length. */
blr
END (strlen)
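The L(done) change here is the standard mask-to-index conversion: in a big-endian word the first byte in memory is the most significant, so counting leading zeros of the null mask gives its offset, while on little-endian it is the least significant byte, so trailing zeros are needed; since these cores lack a count-trailing-zeros instruction, the code forms (m - 1) & ~m and popcounts it.  A small sketch of the two equivalent forms, assuming GCC builtins and a non-zero mask:

#include <stdint.h>

/* Hypothetical helper: byte offset of the first zero byte, given a
   cmpb-style mask with 0xff in each zero-byte position (mask != 0).  */
static unsigned
first_zero_byte (uint32_t mask)
{
#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
  uint32_t t = (mask - 1) & ~mask;      /* ones below the lowest set bit */
  return __builtin_popcount (t) / 8;    /* == trailing zeros / 8 */
#else
  return __builtin_clz (mask) / 8;      /* leading zeros / 8 */
#endif
}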
diff --git a/libc/sysdeps/powerpc/powerpc32/power7/strncmp.S b/libc/sysdeps/powerpc/powerpc32/power7/strncmp.S
index fdae44d26..10c9d251b 100644
--- a/libc/sysdeps/powerpc/powerpc32/power7/strncmp.S
+++ b/libc/sysdeps/powerpc/powerpc32/power7/strncmp.S
@@ -26,7 +26,7 @@
EALIGN (strncmp,5,0)
-#define rTMP r0
+#define rTMP2 r0
#define rRTN r3
#define rSTR1 r3 /* first string arg */
#define rSTR2 r4 /* second string arg */
@@ -39,6 +39,7 @@ EALIGN (strncmp,5,0)
#define r7F7F r9 /* constant 0x7f7f7f7f */
#define rNEG r10 /* ~(word in s1 | 0x7f7f7f7f) */
#define rBITDIF r11 /* bits that differ in s1 & s2 words */
+#define rTMP r12
dcbt 0,rSTR1
nop
@@ -78,13 +79,45 @@ L(g1): add rTMP,rFEFE,rWORD1
/* OK. We've hit the end of the string. We need to be careful that
we don't compare two strings as different because of gunk beyond
the end of the strings... */
+#ifdef __LITTLE_ENDIAN__
+L(endstring):
+ slwi rTMP, rTMP, 1
+ addi rTMP2, rTMP, -1
+ andc rTMP2, rTMP2, rTMP
+ and rWORD2, rWORD2, rTMP2 /* Mask off gunk. */
+ and rWORD1, rWORD1, rTMP2
+ rlwinm rTMP2, rWORD2, 8, 0xffffffff /* Byte reverse word. */
+ rlwinm rTMP, rWORD1, 8, 0xffffffff
+ rldimi rTMP2, rWORD2, 24, 32
+ rldimi rTMP, rWORD1, 24, 32
+ rlwimi rTMP2, rWORD2, 24, 16, 23
+ rlwimi rTMP, rWORD1, 24, 16, 23
+ xor. rBITDIF, rTMP, rTMP2
+ sub rRTN, rTMP, rTMP2
+ bgelr
+ ori rRTN, rTMP2, 1
+ blr
+
+L(different):
+ lwz rWORD1, -4(rSTR1)
+ rlwinm rTMP2, rWORD2, 8, 0xffffffff /* Byte reverse word. */
+ rlwinm rTMP, rWORD1, 8, 0xffffffff
+ rldimi rTMP2, rWORD2, 24, 32
+ rldimi rTMP, rWORD1, 24, 32
+ rlwimi rTMP2, rWORD2, 24, 16, 23
+ rlwimi rTMP, rWORD1, 24, 16, 23
+ xor. rBITDIF, rTMP, rTMP2
+ sub rRTN, rTMP, rTMP2
+ bgelr
+ ori rRTN, rTMP2, 1
+ blr
+#else
L(endstring):
and rTMP,r7F7F,rWORD1
beq cr1,L(equal)
add rTMP,rTMP,r7F7F
xor. rBITDIF,rWORD1,rWORD2
-
andc rNEG,rNEG,rTMP
blt L(highbit)
cntlzw rBITDIF,rBITDIF
@@ -92,28 +125,20 @@ L(endstring):
addi rNEG,rNEG,7
cmpw cr1,rNEG,rBITDIF
sub rRTN,rWORD1,rWORD2
- blt cr1,L(equal)
- srawi rRTN,rRTN,31
- ori rRTN,rRTN,1
- blr
+ bgelr cr1
L(equal):
li rRTN,0
blr
L(different):
- lwzu rWORD1,-4(rSTR1)
+ lwz rWORD1,-4(rSTR1)
xor. rBITDIF,rWORD1,rWORD2
sub rRTN,rWORD1,rWORD2
- blt L(highbit)
- srawi rRTN,rRTN,31
- ori rRTN,rRTN,1
- blr
+ bgelr
L(highbit):
- srwi rWORD2,rWORD2,24
- srwi rWORD1,rWORD1,24
- sub rRTN,rWORD1,rWORD2
+ ori rRTN, rWORD2, 1
blr
-
+#endif
/* Oh well. In this case, we just do a byte-by-byte comparison. */
.align 4
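The new little-endian L(endstring) path keeps the compare word-sized even for the word containing the terminator: the zero-byte indicator is shifted up one bit and turned into a mask keeping only the bytes up to and including the NUL, then both words are byte-reversed so that memory order becomes numeric order before the subtract; L(different) does the same byte-reversal without the masking.  A C rendering of the end-of-string case, offered as a sketch only and assuming GCC builtins, where z is the usual (w1 - 0x01010101) & ~w1 & 0x80808080 indicator for w1:

#include <stdint.h>

/* Hypothetical helper for the little-endian end-of-string compare.  */
static int
end_of_string_cmp (uint32_t w1, uint32_t w2, uint32_t z)
{
  uint32_t t = z << 1;              /* bit just above the first NUL byte */
  uint32_t keep = (t - 1) & ~t;     /* bytes up to and including the NUL
                                       (all ones if the NUL is the last
                                       byte, since t wraps to zero) */
  uint32_t a = __builtin_bswap32 (w1 & keep);
  uint32_t b = __builtin_bswap32 (w2 & keep);
  return a == b ? 0 : (a > b ? 1 : -1);
}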
diff --git a/libc/sysdeps/powerpc/powerpc32/power7/strnlen.S b/libc/sysdeps/powerpc/powerpc32/power7/strnlen.S
index ed088366a..eb52afd1a 100644
--- a/libc/sysdeps/powerpc/powerpc32/power7/strnlen.S
+++ b/libc/sysdeps/powerpc/powerpc32/power7/strnlen.S
@@ -28,51 +28,47 @@ ENTRY (__strnlen)
add r7,r3,r4 /* Calculate the last acceptable address. */
cmplwi r4,16
li r0,0 /* Word with null chars. */
+ addi r7,r7,-1
ble L(small_range)
- cmplw cr7,r3,r7 /* Is the address equal or less than r3? If
- it's equal or less, it means size is either 0
- or a negative number. */
- ble cr7,L(proceed)
-
- li r7,-1 /* Make r11 the biggest if r4 <= 0. */
-L(proceed):
rlwinm r6,r3,3,27,28 /* Calculate padding. */
lwz r12,0(r8) /* Load word from memory. */
	cmpb	r10,r12,r0	/* Check for null bytes in WORD1.  */
+#ifdef __LITTLE_ENDIAN__
+ srw r10,r10,r6
+ slw r10,r10,r6
+#else
slw r10,r10,r6
srw r10,r10,r6
+#endif
cmplwi cr7,r10,0 /* If r10 == 0, no null's have been found. */
bne cr7,L(done)
- /* Are we done already? */
- addi r9,r8,4
- cmplw cr6,r9,r7
- bge cr6,L(end_max)
-
+ clrrwi r7,r7,2 /* Address of last word. */
mtcrf 0x01,r8
/* Are we now aligned to a doubleword boundary? If so, skip to
the main loop. Otherwise, go through the alignment code. */
bt 29,L(loop_setup)
- /* Handle DWORD2 of pair. */
+ /* Handle WORD2 of pair. */
lwzu r12,4(r8)
cmpb r10,r12,r0
cmplwi cr7,r10,0
bne cr7,L(done)
- /* Are we done already? */
- addi r9,r8,4
- cmplw cr6,r9,r7
- bge cr6,L(end_max)
-
L(loop_setup):
- sub r5,r7,r9
+ /* The last word we want to read in the loop below is the one
+ containing the last byte of the string, ie. the word at
+ (s + size - 1) & ~3, or r7. The first word read is at
+ r8 + 4, we read 2 * cnt words, so the last word read will
+ be at r8 + 4 + 8 * cnt - 4. Solving for cnt gives
+ cnt = (r7 - r8) / 8 */
+ sub r5,r7,r8
srwi r6,r5,3 /* Number of loop iterations. */
mtctr r6 /* Setup the counter. */
- b L(loop)
- /* Main loop to look for the null byte backwards in the string. Since
+
+ /* Main loop to look for the null byte in the string. Since
it's a small loop (< 8 instructions), align it to 32-bytes. */
.p2align 5
L(loop):
@@ -88,15 +84,18 @@ L(loop):
cmplwi cr7,r5,0
bne cr7,L(found)
bdnz L(loop)
- /* We're here because the counter reached 0, and that means we
- didn't have any matches for null in the whole range. Just return
- the original size. */
- addi r9,r8,4
- cmplw cr6,r9,r7
- blt cr6,L(loop_small)
+
+ /* We may have one more word to read. */
+ cmplw cr6,r8,r7
+ beq cr6,L(end_max)
+
+ lwzu r12,4(r8)
+ cmpb r10,r12,r0
+ cmplwi cr6,r10,0
+ bne cr6,L(done)
L(end_max):
- sub r3,r7,r3
+ mr r3,r4
blr
/* OK, one (or both) of the words contains a null byte. Check
@@ -121,49 +120,56 @@ L(found):
We need to make sure the null char is *before* the end of the
range. */
L(done):
- cntlzw r0,r10 /* Count leading zeroes before the match. */
- srwi r0,r0,3 /* Convert leading zeroes to bytes. */
- add r9,r8,r0
- sub r6,r9,r3 /* Length until the match. */
- cmplw r9,r7
- bgt L(end_max)
- mr r3,r6
- blr
-
- .align 4
-L(zero):
- li r3,0
+#ifdef __LITTLE_ENDIAN__
+ addi r0,r10,-1
+ andc r0,r0,r10
+ popcntw r0,r0
+#else
+ cntlzw r0,r10 /* Count leading zeros before the match. */
+#endif
+ sub r3,r8,r3
+ srwi r0,r0,3 /* Convert leading/trailing zeros to bytes. */
+ add r3,r3,r0 /* Length until the match. */
+ cmplw r3,r4
+ blelr
+ mr r3,r4
blr
-/* Deals with size <= 32. */
+/* Deals with size <= 16. */
.align 4
L(small_range):
cmplwi r4,0
- beq L(zero)
+ beq L(end_max)
+
+ clrrwi r7,r7,2 /* Address of last word. */
rlwinm r6,r3,3,27,28 /* Calculate padding. */
lwz r12,0(r8) /* Load word from memory. */
cmpb r10,r12,r0 /* Check for null bytes in WORD1. */
+#ifdef __LITTLE_ENDIAN__
+ srw r10,r10,r6
+ slw r10,r10,r6
+#else
slw r10,r10,r6
srw r10,r10,r6
+#endif
cmplwi cr7,r10,0
bne cr7,L(done)
- addi r9,r8,4
- cmplw r9,r7
- bge L(end_max)
- b L(loop_small)
+ cmplw r8,r7
+ beq L(end_max)
.p2align 5
L(loop_small):
lwzu r12,4(r8)
cmpb r10,r12,r0
- addi r9,r8,4
cmplwi cr6,r10,0
bne cr6,L(done)
- cmplw r9,r7
- bge L(end_max)
- b L(loop_small)
+ cmplw r8,r7
+ bne L(loop_small)
+ mr r3,r4
+ blr
+
END (__strnlen)
weak_alias (__strnlen, strnlen)
libc_hidden_builtin_def (strnlen)
diff --git a/libc/sysdeps/powerpc/powerpc32/setjmp-common.S b/libc/sysdeps/powerpc/powerpc32/setjmp-common.S
index 60b0026fa..3fb65b5f7 100644
--- a/libc/sysdeps/powerpc/powerpc32/setjmp-common.S
+++ b/libc/sysdeps/powerpc/powerpc32/setjmp-common.S
@@ -24,6 +24,11 @@
# include <jmpbuf-offsets.h>
#endif
+#if defined __SPE__ || (defined __NO_FPRS__ && !defined _SOFT_FLOAT)
+# define SAVE_GP(N) evstdd r##N,((JB_FPRS+((N)-14)*2)*4)(3)
+#else
+# define SAVE_GP(N) stw r##N,((JB_GPRS+(N)-14)*4)(3)
+#endif
ENTRY (__sigsetjmp)
@@ -35,31 +40,31 @@ ENTRY (__sigsetjmp)
stw r1,(JB_GPR1*4)(3)
#endif
mflr r0
- stw r14,((JB_GPRS+0)*4)(3)
+ SAVE_GP (14)
#ifdef PTR_MANGLE
PTR_MANGLE2 (r0, r10)
li r10,0
#endif
stw r0,(JB_LR*4)(3)
- stw r15,((JB_GPRS+1)*4)(3)
+ SAVE_GP (15)
mfcr r0
- stw r16,((JB_GPRS+2)*4)(3)
+ SAVE_GP (16)
stw r0,(JB_CR*4)(3)
- stw r17,((JB_GPRS+3)*4)(3)
- stw r18,((JB_GPRS+4)*4)(3)
- stw r19,((JB_GPRS+5)*4)(3)
- stw r20,((JB_GPRS+6)*4)(3)
- stw r21,((JB_GPRS+7)*4)(3)
- stw r22,((JB_GPRS+8)*4)(3)
- stw r23,((JB_GPRS+9)*4)(3)
- stw r24,((JB_GPRS+10)*4)(3)
- stw r25,((JB_GPRS+11)*4)(3)
- stw r26,((JB_GPRS+12)*4)(3)
- stw r27,((JB_GPRS+13)*4)(3)
- stw r28,((JB_GPRS+14)*4)(3)
- stw r29,((JB_GPRS+15)*4)(3)
- stw r30,((JB_GPRS+16)*4)(3)
- stw r31,((JB_GPRS+17)*4)(3)
+ SAVE_GP (17)
+ SAVE_GP (18)
+ SAVE_GP (19)
+ SAVE_GP (20)
+ SAVE_GP (21)
+ SAVE_GP (22)
+ SAVE_GP (23)
+ SAVE_GP (24)
+ SAVE_GP (25)
+ SAVE_GP (26)
+ SAVE_GP (27)
+ SAVE_GP (28)
+ SAVE_GP (29)
+ SAVE_GP (30)
+ SAVE_GP (31)
#if defined NOT_IN_libc && defined IS_IN_rtld
li r3,0
blr
diff --git a/libc/sysdeps/powerpc/powerpc32/setjmp.S b/libc/sysdeps/powerpc/powerpc32/setjmp.S
index 8a8cf0d6e..49b64ecf0 100644
--- a/libc/sysdeps/powerpc/powerpc32/setjmp.S
+++ b/libc/sysdeps/powerpc/powerpc32/setjmp.S
@@ -25,7 +25,7 @@
#else /* !NOT_IN_libc */
/* Build a versioned object for libc. */
-default_symbol_version (__vmx__sigsetjmp,__sigsetjmp,GLIBC_2.3.4)
+versioned_symbol (libc, __vmx__sigsetjmp, __sigsetjmp, GLIBC_2_3_4)
# define __sigsetjmp __vmx__sigsetjmp
# define __sigjmp_save __vmx__sigjmp_save
# include "setjmp-common.S"
@@ -35,7 +35,7 @@ default_symbol_version (__vmx__sigsetjmp,__sigsetjmp,GLIBC_2.3.4)
# undef __sigsetjmp
# undef __sigjmp_save
# undef JB_SIZE
-symbol_version (__novmx__sigsetjmp,__sigsetjmp,GLIBC_2.0)
+compat_symbol (libc, __novmx__sigsetjmp, __sigsetjmp, GLIBC_2_0)
# define __sigsetjmp __novmx__sigsetjmp
# define __sigjmp_save __novmx__sigjmp_save
# include "setjmp-common.S"
diff --git a/libc/sysdeps/powerpc/powerpc32/stackguard-macros.h b/libc/sysdeps/powerpc/powerpc32/stackguard-macros.h
index 839f6a4b9..b3d0af830 100644
--- a/libc/sysdeps/powerpc/powerpc32/stackguard-macros.h
+++ b/libc/sysdeps/powerpc/powerpc32/stackguard-macros.h
@@ -2,3 +2,13 @@
#define STACK_CHK_GUARD \
({ uintptr_t x; asm ("lwz %0,-28680(2)" : "=r" (x)); x; })
+
+#define POINTER_CHK_GUARD \
+ ({ \
+ uintptr_t x; \
+ asm ("lwz %0,%1(2)" \
+ : "=r" (x) \
+ : "i" (offsetof (tcbhead_t, pointer_guard) - TLS_TCB_OFFSET - sizeof (tcbhead_t)) \
+ ); \
+ x; \
+ })
diff --git a/libc/sysdeps/powerpc/powerpc32/stpcpy.S b/libc/sysdeps/powerpc/powerpc32/stpcpy.S
index 03c6dddc3..7e106e0e6 100644
--- a/libc/sysdeps/powerpc/powerpc32/stpcpy.S
+++ b/libc/sysdeps/powerpc/powerpc32/stpcpy.S
@@ -62,7 +62,22 @@ L(g2): add rTMP, rFEFE, rWORD
mr rALT, rWORD
/* We've hit the end of the string. Do the rest byte-by-byte. */
-L(g1): rlwinm. rTMP, rALT, 8, 24, 31
+L(g1):
+#ifdef __LITTLE_ENDIAN__
+ rlwinm. rTMP, rALT, 0, 24, 31
+ stbu rALT, 4(rDEST)
+ beqlr-
+ rlwinm. rTMP, rALT, 24, 24, 31
+ stbu rTMP, 1(rDEST)
+ beqlr-
+ rlwinm. rTMP, rALT, 16, 24, 31
+ stbu rTMP, 1(rDEST)
+ beqlr-
+ rlwinm rTMP, rALT, 8, 24, 31
+ stbu rTMP, 1(rDEST)
+ blr
+#else
+ rlwinm. rTMP, rALT, 8, 24, 31
stbu rTMP, 4(rDEST)
beqlr-
rlwinm. rTMP, rALT, 16, 24, 31
@@ -73,6 +88,7 @@ L(g1): rlwinm. rTMP, rALT, 8, 24, 31
beqlr-
stbu rALT, 1(rDEST)
blr
+#endif
/* Oh well. In this case, we just do a byte-by-byte copy. */
.align 4
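The added little-endian branch of L(g1) writes the final word out one byte at a time starting from the low-order byte, because on little-endian that is the byte with the lowest address; the pre-existing sequence keeps starting from the high-order byte for big-endian.  The same tail as a hedged C sketch with an invented helper name (dst points one byte before where the first byte goes, mirroring the stbu pre-increment; the assembly's first stbu uses a 4-byte displacement only because rDEST still trails by a word):

#include <stdint.h>

/* Hypothetical helper: store the bytes of W in ascending address order
   until the NUL, returning a pointer to the stored NUL as stpcpy does.
   W is assumed to contain a zero byte.  */
static char *
store_tail (char *dst, uint32_t w)
{
  for (int i = 0; i < 4; i++)
    {
#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
      unsigned char b = (w >> (8 * i)) & 0xff;          /* low byte first */
#else
      unsigned char b = (w >> (8 * (3 - i))) & 0xff;    /* high byte first */
#endif
      *++dst = b;
      if (b == 0)
        break;
    }
  return dst;
}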
diff --git a/libc/sysdeps/powerpc/powerpc32/strchr.S b/libc/sysdeps/powerpc/powerpc32/strchr.S
index c9952eecc..605056577 100644
--- a/libc/sysdeps/powerpc/powerpc32/strchr.S
+++ b/libc/sysdeps/powerpc/powerpc32/strchr.S
@@ -36,6 +36,8 @@ ENTRY (strchr)
#define rIGN r10 /* number of bits we should ignore in the first word */
#define rMASK r11 /* mask with the bits to ignore set to 0 */
#define rTMP3 r12
+#define rTMP4 rIGN
+#define rTMP5 rMASK
rlwimi rCHR, rCHR, 8, 16, 23
@@ -49,64 +51,93 @@ ENTRY (strchr)
addi r7F7F, r7F7F, 0x7f7f
/* Test the first (partial?) word. */
lwz rWORD, 0(rSTR)
+#ifdef __LITTLE_ENDIAN__
+ slw rMASK, rMASK, rIGN
+#else
srw rMASK, rMASK, rIGN
+#endif
orc rWORD, rWORD, rMASK
add rTMP1, rFEFE, rWORD
nor rTMP2, r7F7F, rWORD
- and. rTMP1, rTMP1, rTMP2
+ and. rTMP4, rTMP1, rTMP2
xor rTMP3, rCHR, rWORD
orc rTMP3, rTMP3, rMASK
b L(loopentry)
/* The loop. */
-L(loop):lwzu rWORD, 4(rSTR)
- and. rTMP1, rTMP1, rTMP2
+L(loop):
+ lwzu rWORD, 4(rSTR)
+ and. rTMP5, rTMP1, rTMP2
/* Test for 0. */
- add rTMP1, rFEFE, rWORD
- nor rTMP2, r7F7F, rWORD
+ add rTMP1, rFEFE, rWORD /* x - 0x01010101. */
+ nor rTMP2, r7F7F, rWORD /* ~(x | 0x7f7f7f7f) == ~x & 0x80808080. */
bne L(foundit)
- and. rTMP1, rTMP1, rTMP2
+ and. rTMP4, rTMP1, rTMP2 /* (x - 0x01010101) & ~x & 0x80808080. */
/* Start test for the bytes we're looking for. */
xor rTMP3, rCHR, rWORD
L(loopentry):
add rTMP1, rFEFE, rTMP3
nor rTMP2, r7F7F, rTMP3
beq L(loop)
+
/* There is a zero byte in the word, but may also be a matching byte (either
before or after the zero byte). In fact, we may be looking for a
- zero byte, in which case we return a match. We guess that this hasn't
- happened, though. */
-L(missed):
- and. rTMP1, rTMP1, rTMP2
+ zero byte, in which case we return a match. */
+ and. rTMP5, rTMP1, rTMP2
li rRTN, 0
beqlr
-/* It did happen. Decide which one was first...
- I'm not sure if this is actually faster than a sequence of
- rotates, compares, and branches (we use it anyway because it's shorter). */
+/* At this point:
+ rTMP5 bytes are 0x80 for each match of c, 0 otherwise.
+ rTMP4 bytes are 0x80 for each match of 0, 0 otherwise.
+   But a false match may also appear in the byte immediately more
+   significant than a true match, due to carries.  This means the
+   matches must be recalculated with a longer method for big-endian.  */
+#ifdef __LITTLE_ENDIAN__
+ addi rTMP1, rTMP5, -1
+ andc rTMP1, rTMP1, rTMP5
+ cntlzw rCLZB, rTMP1
+ addi rTMP2, rTMP4, -1
+ andc rTMP2, rTMP2, rTMP4
+ cmplw rTMP1, rTMP2
+ bgtlr
+ subfic rCLZB, rCLZB, 32-7
+#else
+/* I think we could reduce this by two instructions by keeping the "nor"
+ results from the loop for reuse here. See strlen.S tail. Similarly
+ one instruction could be pruned from L(foundit). */
and rFEFE, r7F7F, rWORD
- or rMASK, r7F7F, rWORD
+ or rTMP5, r7F7F, rWORD
and rTMP1, r7F7F, rTMP3
- or rIGN, r7F7F, rTMP3
+ or rTMP4, r7F7F, rTMP3
add rFEFE, rFEFE, r7F7F
add rTMP1, rTMP1, r7F7F
- nor rWORD, rMASK, rFEFE
- nor rTMP2, rIGN, rTMP1
+ nor rWORD, rTMP5, rFEFE
+ nor rTMP2, rTMP4, rTMP1
+ cntlzw rCLZB, rTMP2
cmplw rWORD, rTMP2
bgtlr
- cntlzw rCLZB, rTMP2
+#endif
srwi rCLZB, rCLZB, 3
add rRTN, rSTR, rCLZB
blr
L(foundit):
+#ifdef __LITTLE_ENDIAN__
+ addi rTMP1, rTMP5, -1
+ andc rTMP1, rTMP1, rTMP5
+ cntlzw rCLZB, rTMP1
+ subfic rCLZB, rCLZB, 32-7-32
+ srawi rCLZB, rCLZB, 3
+#else
and rTMP1, r7F7F, rTMP3
- or rIGN, r7F7F, rTMP3
+ or rTMP4, r7F7F, rTMP3
add rTMP1, rTMP1, r7F7F
- nor rTMP2, rIGN, rTMP1
+ nor rTMP2, rTMP4, rTMP1
cntlzw rCLZB, rTMP2
subi rSTR, rSTR, 4
srwi rCLZB, rCLZB, 3
+#endif
add rRTN, rSTR, rCLZB
blr
END (strchr)
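For reference, a C restatement (not the asm itself) of the word-at-a-time tests the strchr loop performs: rFEFE holds 0xfefefeff (= -0x01010101) and r7F7F holds 0x7f7f7f7f.  The carry-based test can set the 0x80 bit of the byte just above a real match (for example, when that byte is 0x01), which is why the big-endian path recomputes the matches exactly while the little-endian path only trusts the lowest set bit.

#include <stdint.h>
#include <stdio.h>
#include <string.h>

static uint32_t
zero_bytes_approx (uint32_t x)
{
  /* (x + 0xfefefeff) & ~(x | 0x7f7f7f7f): nonzero iff x has a 0x00 byte.  */
  return (x + 0xfefefeffu) & ~(x | 0x7f7f7f7fu);
}

static uint32_t
char_bytes_approx (uint32_t x, unsigned char c)
{
  uint32_t rep = c * 0x01010101u;   /* the replicated-character value built in rCHR */
  return zero_bytes_approx (x ^ rep);
}

int
main (void)
{
  uint32_t w;
  memcpy (&w, "ab\0d", 4);
  printf ("zero mask %#x, 'b' mask %#x, 'z' mask %#x\n",
	  zero_bytes_approx (w), char_bytes_approx (w, 'b'),
	  char_bytes_approx (w, 'z'));
  return 0;
}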
diff --git a/libc/sysdeps/powerpc/powerpc32/strcmp.S b/libc/sysdeps/powerpc/powerpc32/strcmp.S
index 297ca3c1b..91d60c905 100644
--- a/libc/sysdeps/powerpc/powerpc32/strcmp.S
+++ b/libc/sysdeps/powerpc/powerpc32/strcmp.S
@@ -24,7 +24,7 @@
EALIGN (strcmp, 4, 0)
-#define rTMP r0
+#define rTMP2 r0
#define rRTN r3
#define rSTR1 r3 /* first string arg */
#define rSTR2 r4 /* second string arg */
@@ -34,6 +34,7 @@ EALIGN (strcmp, 4, 0)
#define r7F7F r8 /* constant 0x7f7f7f7f */
#define rNEG r9 /* ~(word in s1 | 0x7f7f7f7f) */
#define rBITDIF r10 /* bits that differ in s1 & s2 words */
+#define rTMP r11
or rTMP, rSTR2, rSTR1
@@ -56,10 +57,45 @@ L(g1): add rTMP, rFEFE, rWORD1
and. rTMP, rTMP, rNEG
cmpw cr1, rWORD1, rWORD2
beq+ L(g0)
-L(endstring):
+
/* OK. We've hit the end of the string. We need to be careful that
we don't compare two strings as different because of gunk beyond
the end of the strings... */
+#ifdef __LITTLE_ENDIAN__
+L(endstring):
+ addi rTMP2, rTMP, -1
+ andc rTMP2, rTMP2, rTMP
+ rlwimi rTMP2, rTMP2, 1, 0, 30
+ and rWORD2, rWORD2, rTMP2 /* Mask off gunk. */
+ and rWORD1, rWORD1, rTMP2
+ rlwinm rTMP2, rWORD2, 8, 0xffffffff /* Byte reverse word. */
+ rlwinm rTMP, rWORD1, 8, 0xffffffff
+ rlwimi rTMP2, rWORD2, 24, 0, 7
+ rlwimi rTMP, rWORD1, 24, 0, 7
+ rlwimi rTMP2, rWORD2, 24, 16, 23
+ rlwimi rTMP, rWORD1, 24, 16, 23
+ xor. rBITDIF, rTMP, rTMP2
+ sub rRTN, rTMP, rTMP2
+ bgelr+
+ ori rRTN, rTMP2, 1
+ blr
+
+L(different):
+ lwz rWORD1, -4(rSTR1)
+ rlwinm rTMP2, rWORD2, 8, 0xffffffff /* Byte reverse word. */
+ rlwinm rTMP, rWORD1, 8, 0xffffffff
+ rlwimi rTMP2, rWORD2, 24, 0, 7
+ rlwimi rTMP, rWORD1, 24, 0, 7
+ rlwimi rTMP2, rWORD2, 24, 16, 23
+ rlwimi rTMP, rWORD1, 24, 16, 23
+ xor. rBITDIF, rTMP, rTMP2
+ sub rRTN, rTMP, rTMP2
+ bgelr+
+ ori rRTN, rTMP2, 1
+ blr
+
+#else
+L(endstring):
and rTMP, r7F7F, rWORD1
beq cr1, L(equal)
add rTMP, rTMP, r7F7F
@@ -84,7 +120,7 @@ L(different):
L(highbit):
ori rRTN, rWORD2, 1
blr
-
+#endif
/* Oh well. In this case, we just do a byte-by-byte comparison. */
.align 4
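A C sketch of the little-endian L(endstring) logic above (a restatement, not the asm): z is the carry-based zero-detect mask for the word from s1, both words are clipped to the bytes up to and including the NUL, and they are byte-reversed (the rlwinm/rlwimi sequence; __builtin_bswap32 here) so that an ordinary unsigned word comparison orders the strings byte 0 first.  The asm returns a word difference with a sign fix-up rather than the -1/0/1 used here, and the test in main assumes a little-endian host so the memcpy layout matches the hard-coded z.

#include <stdint.h>
#include <stdio.h>
#include <string.h>

static int
compare_last_words (uint32_t w1, uint32_t w2, uint32_t z)
{
  uint32_t keep = z ^ (z - 1);    /* bytes up to and including the NUL; the asm
				     builds this with addi/andc plus an rlwimi
				     that extends the mask by one bit */
  uint32_t a = __builtin_bswap32 (w1 & keep);
  uint32_t b = __builtin_bswap32 (w2 & keep);
  return (a > b) - (a < b);
}

int
main (void)
{
  uint32_t w1, w2, w3;
  memcpy (&w1, "ab\0x", 4);
  memcpy (&w2, "ab\0y", 4);       /* differs only past the NUL */
  memcpy (&w3, "ac\0x", 4);
  uint32_t z = 0x00800000u;       /* NUL in byte 2 of the little-endian word */
  printf ("%d %d\n", compare_last_words (w1, w2, z),
	  compare_last_words (w1, w3, z));   /* prints 0, then a negative value */
  return 0;
}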
diff --git a/libc/sysdeps/powerpc/powerpc32/strcpy.S b/libc/sysdeps/powerpc/powerpc32/strcpy.S
index 4ae577dbb..e938cc42a 100644
--- a/libc/sysdeps/powerpc/powerpc32/strcpy.S
+++ b/libc/sysdeps/powerpc/powerpc32/strcpy.S
@@ -62,7 +62,22 @@ L(g2): add rTMP, rFEFE, rWORD
mr rALT, rWORD
/* We've hit the end of the string. Do the rest byte-by-byte. */
-L(g1): rlwinm. rTMP, rALT, 8, 24, 31
+L(g1):
+#ifdef __LITTLE_ENDIAN__
+ rlwinm. rTMP, rALT, 0, 24, 31
+ stb rALT, 4(rDEST)
+ beqlr-
+ rlwinm. rTMP, rALT, 24, 24, 31
+ stb rTMP, 5(rDEST)
+ beqlr-
+ rlwinm. rTMP, rALT, 16, 24, 31
+ stb rTMP, 6(rDEST)
+ beqlr-
+ rlwinm rTMP, rALT, 8, 24, 31
+ stb rTMP, 7(rDEST)
+ blr
+#else
+ rlwinm. rTMP, rALT, 8, 24, 31
stb rTMP, 4(rDEST)
beqlr-
rlwinm. rTMP, rALT, 16, 24, 31
@@ -73,6 +88,7 @@ L(g1): rlwinm. rTMP, rALT, 8, 24, 31
beqlr-
stb rALT, 7(rDEST)
blr
+#endif
/* Oh well. In this case, we just do a byte-by-byte copy. */
.align 4
diff --git a/libc/sysdeps/powerpc/powerpc32/strlen.S b/libc/sysdeps/powerpc/powerpc32/strlen.S
index 9a6eafc38..a7153ed7a 100644
--- a/libc/sysdeps/powerpc/powerpc32/strlen.S
+++ b/libc/sysdeps/powerpc/powerpc32/strlen.S
@@ -29,7 +29,12 @@
1 is subtracted you get a value in the range 0x00-0x7f, none of which
have their high bit set. The expression here is
(x + 0xfefefeff) & ~(x | 0x7f7f7f7f), which gives 0x00000000 when
- there were no 0x00 bytes in the word.
+   there were no 0x00 bytes in the word.  You get 0x80 in bytes that
+   match, but possibly a false 0x80 in the byte immediately more
+   significant than a true match, due to carries.  For little-endian
+   this is of no consequence, since the least significant match is
+   the one we're interested in, but big-endian needs method 2 to
+   find which byte matches.
2) Given a word 'x', we can test to see _which_ byte was zero by
calculating ~(((x & 0x7f7f7f7f) + 0x7f7f7f7f) | x | 0x7f7f7f7f).
@@ -72,7 +77,7 @@
ENTRY (strlen)
-#define rTMP1 r0
+#define rTMP4 r0
#define rRTN r3 /* incoming STR arg, outgoing result */
#define rSTR r4 /* current string position */
#define rPADN r5 /* number of padding bits we prepend to the
@@ -82,9 +87,9 @@ ENTRY (strlen)
#define rWORD1 r8 /* current string word */
#define rWORD2 r9 /* next string word */
#define rMASK r9 /* mask for first string word */
-#define rTMP2 r10
-#define rTMP3 r11
-#define rTMP4 r12
+#define rTMP1 r10
+#define rTMP2 r11
+#define rTMP3 r12
clrrwi rSTR, rRTN, 2
@@ -93,15 +98,20 @@ ENTRY (strlen)
lwz rWORD1, 0(rSTR)
li rMASK, -1
addi r7F7F, r7F7F, 0x7f7f
-/* That's the setup done, now do the first pair of words.
- We make an exception and use method (2) on the first two words, to reduce
- overhead. */
+/* We use method (2) on the first two words because rFEFE isn't
+   required, which reduces setup overhead.  It also gives a faster
+   return for small strings on big-endian, which would otherwise
+   need to recalculate with method (2) anyway.  */
+#ifdef __LITTLE_ENDIAN__
+ slw rMASK, rMASK, rPADN
+#else
srw rMASK, rMASK, rPADN
+#endif
and rTMP1, r7F7F, rWORD1
or rTMP2, r7F7F, rWORD1
add rTMP1, rTMP1, r7F7F
- nor rTMP1, rTMP2, rTMP1
- and. rWORD1, rTMP1, rMASK
+ nor rTMP3, rTMP2, rTMP1
+ and. rTMP3, rTMP3, rMASK
mtcrf 0x01, rRTN
bne L(done0)
lis rFEFE, -0x101
@@ -110,11 +120,12 @@ ENTRY (strlen)
bt 29, L(loop)
/* Handle second word of pair. */
+/* Perhaps use method (1) here for little-endian, saving one instruction? */
lwzu rWORD1, 4(rSTR)
and rTMP1, r7F7F, rWORD1
or rTMP2, r7F7F, rWORD1
add rTMP1, rTMP1, r7F7F
- nor. rWORD1, rTMP2, rTMP1
+ nor. rTMP3, rTMP2, rTMP1
bne L(done0)
/* The loop. */
@@ -128,28 +139,52 @@ L(loop):
add rTMP3, rFEFE, rWORD2
nor rTMP4, r7F7F, rWORD2
bne L(done1)
- and. rTMP1, rTMP3, rTMP4
+ and. rTMP3, rTMP3, rTMP4
beq L(loop)
+#ifndef __LITTLE_ENDIAN__
and rTMP1, r7F7F, rWORD2
add rTMP1, rTMP1, r7F7F
- andc rWORD1, rTMP4, rTMP1
+ andc rTMP3, rTMP4, rTMP1
b L(done0)
L(done1):
and rTMP1, r7F7F, rWORD1
subi rSTR, rSTR, 4
add rTMP1, rTMP1, r7F7F
- andc rWORD1, rTMP2, rTMP1
+ andc rTMP3, rTMP2, rTMP1
/* When we get to here, rSTR points to the first word in the string that
- contains a zero byte, and the most significant set bit in rWORD1 is in that
- byte. */
+ contains a zero byte, and rTMP3 has 0x80 for bytes that are zero,
+ and 0x00 otherwise. */
L(done0):
- cntlzw rTMP3, rWORD1
+ cntlzw rTMP3, rTMP3
subf rTMP1, rRTN, rSTR
srwi rTMP3, rTMP3, 3
add rRTN, rTMP1, rTMP3
blr
+#else
+
+L(done0):
+ addi rTMP1, rTMP3, -1 /* Form a mask from trailing zeros. */
+ andc rTMP1, rTMP1, rTMP3
+ cntlzw rTMP1, rTMP1 /* Count bits not in the mask. */
+ subf rTMP3, rRTN, rSTR
+ subfic rTMP1, rTMP1, 32-7
+ srwi rTMP1, rTMP1, 3
+ add rRTN, rTMP1, rTMP3
+ blr
+
+L(done1):
+ addi rTMP3, rTMP1, -1
+ andc rTMP3, rTMP3, rTMP1
+ cntlzw rTMP3, rTMP3
+ subf rTMP1, rRTN, rSTR
+ subfic rTMP3, rTMP3, 32-7-32
+ srawi rTMP3, rTMP3, 3
+ add rRTN, rTMP1, rTMP3
+ blr
+#endif
+
END (strlen)
libc_hidden_builtin_def (strlen)
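A C restatement of the two pieces described in the strlen comments above (not the asm).  Method (2) pinpoints zero bytes exactly, with no carry between bytes, which is why the big-endian paths can feed its result straight to cntlzw.  The little-endian L(done0) tail instead needs the index of the *lowest* set 0x80 bit; PowerPC has no count-trailing-zeros instruction, so it isolates the bits below that 0x80 bit and applies cntlzw to them (the addi/andc/cntlzw/subfic/srwi sequence).

#include <stdint.h>
#include <stdio.h>

static uint32_t
zero_bytes_exact (uint32_t x)
{
  /* ~(((x & 0x7f7f7f7f) + 0x7f7f7f7f) | x | 0x7f7f7f7f):
     0x80 exactly in each zero byte, 0x00 elsewhere.  */
  return ~(((x & 0x7f7f7f7fu) + 0x7f7f7f7fu) | x | 0x7f7f7f7fu);
}

static unsigned int
first_zero_byte_index_le (uint32_t z)
{
  uint32_t below = (z - 1) & ~z;        /* addi + andc: bits below the lowest 0x80 */
  unsigned int clz = below ? __builtin_clz (below) : 32;   /* cntlzw */
  return ((32 - 7) - clz) >> 3;         /* subfic 32-7, then srwi 3 */
}

int
main (void)
{
  /* 'a','b',NUL,'d' viewed as a little-endian word: the NUL is byte 2.  */
  uint32_t z = zero_bytes_exact (0x64006261u);
  printf ("mask %#x, first zero byte %u\n", z, first_zero_byte_index_le (z));
  return 0;
}

Working through the index formula: if the first zero byte is byte i (from the low end), its 0x80 flag sits at bit 8*i+7, the bits below it form 2^(8*i+7)-1, whose cntlzw is 32-(8*i+7); subtracting that from 32-7 gives 8*i, and the shift by 3 yields i.  L(done1) uses 32-7-32 and an arithmetic shift for the same reason, because rSTR there points one word past the word being examined.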
diff --git a/libc/sysdeps/powerpc/powerpc32/strncmp.S b/libc/sysdeps/powerpc/powerpc32/strncmp.S
index fa345d293..e36a160a8 100644
--- a/libc/sysdeps/powerpc/powerpc32/strncmp.S
+++ b/libc/sysdeps/powerpc/powerpc32/strncmp.S
@@ -24,7 +24,7 @@
EALIGN (strncmp, 4, 0)
-#define rTMP r0
+#define rTMP2 r0
#define rRTN r3
#define rSTR1 r3 /* first string arg */
#define rSTR2 r4 /* second string arg */
@@ -35,6 +35,7 @@ EALIGN (strncmp, 4, 0)
#define r7F7F r9 /* constant 0x7f7f7f7f */
#define rNEG r10 /* ~(word in s1 | 0x7f7f7f7f) */
#define rBITDIF r11 /* bits that differ in s1 & s2 words */
+#define rTMP r12
dcbt 0,rSTR1
or rTMP, rSTR2, rSTR1
@@ -73,12 +74,45 @@ L(g1): add rTMP, rFEFE, rWORD1
we don't compare two strings as different because of gunk beyond
the end of the strings... */
+#ifdef __LITTLE_ENDIAN__
+L(endstring):
+ slwi rTMP, rTMP, 1
+ addi rTMP2, rTMP, -1
+ andc rTMP2, rTMP2, rTMP
+ and rWORD2, rWORD2, rTMP2 /* Mask off gunk. */
+ and rWORD1, rWORD1, rTMP2
+ rlwinm rTMP2, rWORD2, 8, 0xffffffff /* Byte reverse word. */
+ rlwinm rTMP, rWORD1, 8, 0xffffffff
+ rlwimi rTMP2, rWORD2, 24, 0, 7
+ rlwimi rTMP, rWORD1, 24, 0, 7
+ rlwimi rTMP2, rWORD2, 24, 16, 23
+ rlwimi rTMP, rWORD1, 24, 16, 23
+ xor. rBITDIF, rTMP, rTMP2
+ sub rRTN, rTMP, rTMP2
+ bgelr+
+ ori rRTN, rTMP2, 1
+ blr
+
+L(different):
+ lwz rWORD1, -4(rSTR1)
+ rlwinm rTMP2, rWORD2, 8, 0xffffffff /* Byte reverse word. */
+ rlwinm rTMP, rWORD1, 8, 0xffffffff
+ rlwimi rTMP2, rWORD2, 24, 0, 7
+ rlwimi rTMP, rWORD1, 24, 0, 7
+ rlwimi rTMP2, rWORD2, 24, 16, 23
+ rlwimi rTMP, rWORD1, 24, 16, 23
+ xor. rBITDIF, rTMP, rTMP2
+ sub rRTN, rTMP, rTMP2
+ bgelr+
+ ori rRTN, rTMP2, 1
+ blr
+
+#else
L(endstring):
and rTMP, r7F7F, rWORD1
beq cr1, L(equal)
add rTMP, rTMP, r7F7F
xor. rBITDIF, rWORD1, rWORD2
-
andc rNEG, rNEG, rTMP
blt- L(highbit)
cntlzw rBITDIF, rBITDIF
@@ -86,28 +120,20 @@ L(endstring):
addi rNEG, rNEG, 7
cmpw cr1, rNEG, rBITDIF
sub rRTN, rWORD1, rWORD2
- blt- cr1, L(equal)
- srawi rRTN, rRTN, 31
- ori rRTN, rRTN, 1
- blr
+ bgelr+ cr1
L(equal):
li rRTN, 0
blr
L(different):
- lwzu rWORD1, -4(rSTR1)
+ lwz rWORD1, -4(rSTR1)
xor. rBITDIF, rWORD1, rWORD2
sub rRTN, rWORD1, rWORD2
- blt- L(highbit)
- srawi rRTN, rRTN, 31
- ori rRTN, rRTN, 1
- blr
+ bgelr+
L(highbit):
- srwi rWORD2, rWORD2, 24
- srwi rWORD1, rWORD1, 24
- sub rRTN, rWORD1, rWORD2
+ ori rRTN, rWORD2, 1
blr
-
+#endif
/* Oh well. In this case, we just do a byte-by-byte comparison. */
.align 4
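A small C check of the "Mask off gunk" step in the little-endian L(endstring) above (illustration only): z is the carry-based zero-detect mask, of which only the lowest set 0x80 bit matters, and the slwi/addi/andc sequence builds a mask that keeps every byte up to and including the first NUL while clearing whatever follows it in the word, so trailing garbage cannot affect the comparison.

#include <assert.h>
#include <stdint.h>

static uint32_t
gunk_mask (uint32_t z)
{
  uint32_t t = z << 1;    /* slwi rTMP, rTMP, 1 */
  return (t - 1) & ~t;    /* addi + andc: bits below the lowest set bit of t */
}

int
main (void)
{
  /* NUL in byte 1: keep bytes 0 and 1, drop bytes 2 and 3.  */
  assert (gunk_mask (0x00008000u) == 0x0000ffffu);
  /* NUL in byte 3 (the last byte of the word): keep everything.  */
  assert (gunk_mask (0x80000000u) == 0xffffffffu);
  return 0;
}

This is the same mask the strcmp.S change builds with addi/andc plus an rlwimi extension; shifting z left by one first simply moves the lowest set bit so that the plain below-the-bit mask already covers the NUL byte.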