path: root/libc/sysdeps/powerpc/powerpc32
author     joseph <joseph@7b3dc134-2b1b-0410-93df-9e9f96275f8d>  2013-10-18 21:33:25 +0000
committer  joseph <joseph@7b3dc134-2b1b-0410-93df-9e9f96275f8d>  2013-10-18 21:33:25 +0000
commit     fe2ed5aaa408e1ab996a9fe1595a05634208a79c (patch)
tree       e1027fbc9d8a4a8c33f8149b2b42e8cde89c74f6 /libc/sysdeps/powerpc/powerpc32
parent     571c782b982d888565e7d06bfc2f3d47582fe829 (diff)
Merge changes between r23946 and r24305 from /fsf/trunk.
git-svn-id: svn://svn.eglibc.org/trunk@24306 7b3dc134-2b1b-0410-93df-9e9f96275f8d
Diffstat (limited to 'libc/sysdeps/powerpc/powerpc32')
-rw-r--r--  libc/sysdeps/powerpc/powerpc32/405/memcmp.S | 128
-rw-r--r--  libc/sysdeps/powerpc/powerpc32/405/memcpy.S | 130
-rw-r--r--  libc/sysdeps/powerpc/powerpc32/405/memset.S | 152
-rw-r--r--  libc/sysdeps/powerpc/powerpc32/405/strcmp.S | 134
-rw-r--r--  libc/sysdeps/powerpc/powerpc32/405/strcpy.S | 107
-rw-r--r--  libc/sysdeps/powerpc/powerpc32/405/strlen.S | 75
-rw-r--r--  libc/sysdeps/powerpc/powerpc32/405/strncmp.S | 128
-rw-r--r--  libc/sysdeps/powerpc/powerpc32/440/Implies | 2
-rw-r--r--  libc/sysdeps/powerpc/powerpc32/464/Implies | 2
-rw-r--r--  libc/sysdeps/powerpc/powerpc32/476/Implies | 2
-rw-r--r--  libc/sysdeps/powerpc/powerpc32/476/memset.S | 152
-rw-r--r--  libc/sysdeps/powerpc/powerpc32/Makefile | 10
-rw-r--r--  libc/sysdeps/powerpc/powerpc32/__longjmp-common.S | 42
-rw-r--r--  libc/sysdeps/powerpc/powerpc32/bsd-_setjmp.S | 4
-rw-r--r--  libc/sysdeps/powerpc/powerpc32/bsd-setjmp.S | 4
-rw-r--r--  libc/sysdeps/powerpc/powerpc32/dl-machine.c | 14
-rw-r--r--  libc/sysdeps/powerpc/powerpc32/e500/nofpu/Makefile | 9
-rw-r--r--  libc/sysdeps/powerpc/powerpc32/e500/nofpu/fclrexcpt.c | 53
-rw-r--r--  libc/sysdeps/powerpc/powerpc32/e500/nofpu/fe_note_change.c | 39
-rw-r--r--  libc/sysdeps/powerpc/powerpc32/e500/nofpu/fedisblxcpt.c | 54
-rw-r--r--  libc/sysdeps/powerpc/powerpc32/e500/nofpu/feenablxcpt.c | 54
-rw-r--r--  libc/sysdeps/powerpc/powerpc32/e500/nofpu/fegetenv.c | 47
-rw-r--r--  libc/sysdeps/powerpc/powerpc32/e500/nofpu/fegetexcept.c | 36
-rw-r--r--  libc/sysdeps/powerpc/powerpc32/e500/nofpu/fegetround.c | 29
-rw-r--r--  libc/sysdeps/powerpc/powerpc32/e500/nofpu/feholdexcpt.c | 57
-rw-r--r--  libc/sysdeps/powerpc/powerpc32/e500/nofpu/fenv_const.c | 41
-rw-r--r--  libc/sysdeps/powerpc/powerpc32/e500/nofpu/fenv_libc.h | 96
-rw-r--r--  libc/sysdeps/powerpc/powerpc32/e500/nofpu/fesetenv.c | 49
-rw-r--r--  libc/sysdeps/powerpc/powerpc32/e500/nofpu/fesetround.c | 35
-rw-r--r--  libc/sysdeps/powerpc/powerpc32/e500/nofpu/feupdateenv.c | 47
-rw-r--r--  libc/sysdeps/powerpc/powerpc32/e500/nofpu/fexcepts_from_prctl.c | 42
-rw-r--r--  libc/sysdeps/powerpc/powerpc32/e500/nofpu/fexcepts_from_spe.c | 41
-rw-r--r--  libc/sysdeps/powerpc/powerpc32/e500/nofpu/fexcepts_to_prctl.c | 42
-rw-r--r--  libc/sysdeps/powerpc/powerpc32/e500/nofpu/fexcepts_to_spe.c | 41
-rw-r--r--  libc/sysdeps/powerpc/powerpc32/e500/nofpu/fgetexcptflg.c | 41
-rw-r--r--  libc/sysdeps/powerpc/powerpc32/e500/nofpu/fraiseexcept-soft.c | 28
-rw-r--r--  libc/sysdeps/powerpc/powerpc32/e500/nofpu/fraiseexcpt.c | 40
-rw-r--r--  libc/sysdeps/powerpc/powerpc32/e500/nofpu/fsetexcptflg.c | 55
-rw-r--r--  libc/sysdeps/powerpc/powerpc32/e500/nofpu/ftestexcept.c | 31
-rw-r--r--  libc/sysdeps/powerpc/powerpc32/e500/nofpu/get-rounding-mode.h | 4
-rw-r--r--  libc/sysdeps/powerpc/powerpc32/e500/nofpu/s_fabsf.S | 27
-rw-r--r--  libc/sysdeps/powerpc/powerpc32/e500/nofpu/spe-raise.c | 53
-rw-r--r--  libc/sysdeps/powerpc/powerpc32/fpu/__longjmp-common.S | 8
-rw-r--r--  libc/sysdeps/powerpc/powerpc32/fpu/__longjmp.S | 4
-rw-r--r--  libc/sysdeps/powerpc/powerpc32/fpu/s_copysign.S | 2
-rw-r--r--  libc/sysdeps/powerpc/powerpc32/fpu/s_copysignl.S | 2
-rw-r--r--  libc/sysdeps/powerpc/powerpc32/fpu/s_lrint.S | 4
-rw-r--r--  libc/sysdeps/powerpc/powerpc32/fpu/s_lround.S | 2
-rw-r--r--  libc/sysdeps/powerpc/powerpc32/fpu/s_roundf.S | 3
-rw-r--r--  libc/sysdeps/powerpc/powerpc32/fpu/setjmp-common.S | 73
-rw-r--r--  libc/sysdeps/powerpc/powerpc32/fpu/setjmp.S | 4
-rw-r--r--  libc/sysdeps/powerpc/powerpc32/mcount.c | 2
-rw-r--r--  libc/sysdeps/powerpc/powerpc32/power4/fpu/s_llrint.S | 4
-rw-r--r--  libc/sysdeps/powerpc/powerpc32/power4/fpu/s_llrintf.S | 4
-rw-r--r--  libc/sysdeps/powerpc/powerpc32/power4/fpu/s_llround.S | 20
-rw-r--r--  libc/sysdeps/powerpc/powerpc32/power4/hp-timing.h | 21
-rw-r--r--  libc/sysdeps/powerpc/powerpc32/power4/memcmp.S | 1064
-rw-r--r--  libc/sysdeps/powerpc/powerpc32/power4/memcpy.S | 58
-rw-r--r--  libc/sysdeps/powerpc/powerpc32/power4/memset.S | 4
-rw-r--r--  libc/sysdeps/powerpc/powerpc32/power4/strncmp.S | 56
-rw-r--r--  libc/sysdeps/powerpc/powerpc32/power5+/fpu/s_llround.S | 4
-rw-r--r--  libc/sysdeps/powerpc/powerpc32/power5+/fpu/s_lround.S | 2
-rw-r--r--  libc/sysdeps/powerpc/powerpc32/power5/fpu/s_isnan.S | 4
-rw-r--r--  libc/sysdeps/powerpc/powerpc32/power6/fpu/s_isnan.S | 4
-rw-r--r--  libc/sysdeps/powerpc/powerpc32/power6/fpu/s_llrint.S | 4
-rw-r--r--  libc/sysdeps/powerpc/powerpc32/power6/fpu/s_llrintf.S | 4
-rw-r--r--  libc/sysdeps/powerpc/powerpc32/power6/fpu/s_llround.S | 4
-rw-r--r--  libc/sysdeps/powerpc/powerpc32/power6/memcpy.S | 81
-rw-r--r--  libc/sysdeps/powerpc/powerpc32/power6/memset.S | 4
-rw-r--r--  libc/sysdeps/powerpc/powerpc32/power7/fpu/s_finite.S | 5
-rw-r--r--  libc/sysdeps/powerpc/powerpc32/power7/fpu/s_isinf.S | 7
-rw-r--r--  libc/sysdeps/powerpc/powerpc32/power7/fpu/s_isnan.S | 4
-rw-r--r--  libc/sysdeps/powerpc/powerpc32/power7/fpu/s_logbl.c | 8
-rw-r--r--  libc/sysdeps/powerpc/powerpc32/power7/memchr.S | 185
-rw-r--r--  libc/sysdeps/powerpc/powerpc32/power7/memcmp.S | 1626
-rw-r--r--  libc/sysdeps/powerpc/powerpc32/power7/memcpy.S | 24
-rw-r--r--  libc/sysdeps/powerpc/powerpc32/power7/mempcpy.S | 28
-rw-r--r--  libc/sysdeps/powerpc/powerpc32/power7/memrchr.S | 187
-rw-r--r--  libc/sysdeps/powerpc/powerpc32/power7/memset.S | 4
-rw-r--r--  libc/sysdeps/powerpc/powerpc32/power7/rawmemchr.S | 17
-rw-r--r--  libc/sysdeps/powerpc/powerpc32/power7/strchr.S | 51
-rw-r--r--  libc/sysdeps/powerpc/powerpc32/power7/strchrnul.S | 27
-rw-r--r--  libc/sysdeps/powerpc/powerpc32/power7/strlen.S | 17
-rw-r--r--  libc/sysdeps/powerpc/powerpc32/power7/strncmp.S | 55
-rw-r--r--  libc/sysdeps/powerpc/powerpc32/power7/strnlen.S | 106
-rw-r--r--  libc/sysdeps/powerpc/powerpc32/setjmp-common.S | 41
-rw-r--r--  libc/sysdeps/powerpc/powerpc32/setjmp.S | 4
-rw-r--r--  libc/sysdeps/powerpc/powerpc32/stackguard-macros.h | 10
-rw-r--r--  libc/sysdeps/powerpc/powerpc32/stpcpy.S | 18
-rw-r--r--  libc/sysdeps/powerpc/powerpc32/strchr.S | 71
-rw-r--r--  libc/sysdeps/powerpc/powerpc32/strcmp.S | 42
-rw-r--r--  libc/sysdeps/powerpc/powerpc32/strcpy.S | 18
-rw-r--r--  libc/sysdeps/powerpc/powerpc32/strlen.S | 69
-rw-r--r--  libc/sysdeps/powerpc/powerpc32/strncmp.S | 56
94 files changed, 4816 insertions, 1487 deletions
diff --git a/libc/sysdeps/powerpc/powerpc32/405/memcmp.S b/libc/sysdeps/powerpc/powerpc32/405/memcmp.S
new file mode 100644
index 000000000..2849461cd
--- /dev/null
+++ b/libc/sysdeps/powerpc/powerpc32/405/memcmp.S
@@ -0,0 +1,128 @@
+/* Optimized memcmp implementation for PowerPC476.
+ Copyright (C) 2010-2013 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library. If not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <sysdep.h>
+
+/* memcmp
+
+ r3:source1 address, return equality
+ r4:source2 address
+ r5:byte count
+
+ Compare two words at a time from src1 and src2. If they differ, jump to
+ the end and return src1 > src2 or src1 < src2.
+ When the count reaches zero, compare the remaining bytes, then jump to
+ the end and return src1 > src2, src1 < src2 or src1 = src2.
+ If the words are equal and the count is not exhausted, repeat. */
+
+EALIGN (memcmp, 5, 0)
+ srwi. r6,r5,5
+ beq L(preword2_count_loop)
+ mtctr r6
+ clrlwi r5,r5,27
+
+L(word8_compare_loop):
+ lwz r10,0(r3)
+ lwz r6,4(r3)
+ lwz r8,0(r4)
+ lwz r9,4(r4)
+ cmplw cr5,r8,r10
+ cmplw cr1,r9,r6
+ bne cr5,L(st2)
+ bne cr1,L(st1)
+ lwz r10,8(r3)
+ lwz r6,12(r3)
+ lwz r8,8(r4)
+ lwz r9,12(r4)
+ cmplw cr5,r8,r10
+ cmplw cr1,r9,r6
+ bne cr5,L(st2)
+ bne cr1,L(st1)
+ lwz r10,16(r3)
+ lwz r6,20(r3)
+ lwz r8,16(r4)
+ lwz r9,20(r4)
+ cmplw cr5,r8,r10
+ cmplw cr1,r9,r6
+ bne cr5,L(st2)
+ bne cr1,L(st1)
+ lwz r10,24(r3)
+ lwz r6,28(r3)
+ addi r3,r3,0x20
+ lwz r8,24(r4)
+ lwz r9,28(r4)
+ addi r4,r4,0x20
+ cmplw cr5,r8,r10
+ cmplw cr1,r9,r6
+ bne cr5,L(st2)
+ bne cr1,L(st1)
+ bdnz L(word8_compare_loop)
+
+L(preword2_count_loop):
+ srwi. r6,r5,3
+ beq L(prebyte_count_loop)
+ mtctr r6
+ clrlwi r5,r5,29
+
+L(word2_count_loop):
+ lwz r10,0(r3)
+ lwz r6,4(r3)
+ addi r3,r3,0x08
+ lwz r8,0(r4)
+ lwz r9,4(r4)
+ addi r4,r4,0x08
+ cmplw cr5,r8,r10
+ cmplw cr1,r9,r6
+ bne cr5,L(st2)
+ bne cr1,L(st1)
+ bdnz L(word2_count_loop)
+
+L(prebyte_count_loop):
+ addi r5,r5,1
+ mtctr r5
+ bdz L(end_memcmp)
+
+L(byte_count_loop):
+ lbz r6,0(r3)
+ addi r3,r3,0x01
+ lbz r8,0(r4)
+ addi r4,r4,0x01
+ cmplw cr5,r8,r6
+ bne cr5,L(st2)
+ bdnz L(byte_count_loop)
+
+L(end_memcmp):
+ addi r3,r0,0
+ blr
+
+L(l_r):
+ addi r3,r0,1
+ blr
+
+L(st1):
+ blt cr1,L(l_r)
+ addi r3,r0,-1
+ blr
+
+L(st2):
+ blt cr5,L(l_r)
+ addi r3,r0,-1
+ blr
+END (memcmp)
+libc_hidden_builtin_def (memcmp)
+weak_alias (memcmp,bcmp)
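
The word-at-a-time strategy above maps onto a small C model. The sketch below is only an illustration of the same idea and is not part of the patch; all names are invented here. It compares aligned 4-byte words while at least a word remains, then falls back to bytes both for the tail and for resolving the first differing word, which keeps the model endian-safe.

    #include <stddef.h>
    #include <stdint.h>
    #include <string.h>

    /* Reference model: compare aligned words, resolve differences and the
       tail byte by byte, as the 405 assembly does in larger unrolled blocks. */
    static int
    memcmp_ref (const void *s1, const void *s2, size_t n)
    {
      const unsigned char *a = s1, *b = s2;

      while (n >= 4 && (uintptr_t) a % 4 == 0 && (uintptr_t) b % 4 == 0)
        {
          uint32_t wa, wb;
          memcpy (&wa, a, 4);
          memcpy (&wb, b, 4);
          if (wa != wb)
            break;                      /* let the byte loop find the difference */
          a += 4; b += 4; n -= 4;
        }

      for (; n > 0; n--, a++, b++)
        if (*a != *b)
          return *a < *b ? -1 : 1;
      return 0;
    }
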
diff --git a/libc/sysdeps/powerpc/powerpc32/405/memcpy.S b/libc/sysdeps/powerpc/powerpc32/405/memcpy.S
new file mode 100644
index 000000000..b01d53920
--- /dev/null
+++ b/libc/sysdeps/powerpc/powerpc32/405/memcpy.S
@@ -0,0 +1,130 @@
+/* Optimized memcpy implementation for PowerPC476.
+ Copyright (C) 2010-2013 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library. If not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <sysdep.h>
+
+/* memcpy
+
+ r0:return address
+ r3:destination address
+ r4:source address
+ r5:byte count
+
+ Save return address in r0.
+ If the destination and source are unaligned and the copy count is greater than 256,
+ then copy 0-3 bytes to make the destination aligned.
+ If there are 32 or more bytes to copy, use a 32-byte copy loop.
+ Finally copy the 0-31 extra bytes. */
+
+EALIGN (memcpy, 5, 0)
+/* Check if bytes to copy are greater than 256 and if
+ source and destination are unaligned */
+ cmpwi r5,0x0100
+ addi r0,r3,0
+ ble L(string_count_loop)
+ neg r6,r3
+ clrlwi. r6,r6,30
+ beq L(string_count_loop)
+ neg r6,r4
+ clrlwi. r6,r6,30
+ beq L(string_count_loop)
+ mtctr r6
+ subf r5,r6,r5
+
+L(unaligned_bytecopy_loop): /* Align destination by copying 0-3 bytes */
+ lbz r8,0x0(r4)
+ addi r4,r4,1
+ stb r8,0x0(r3)
+ addi r3,r3,1
+ bdnz L(unaligned_bytecopy_loop)
+ srwi. r7,r5,5
+ beq L(preword2_count_loop)
+ mtctr r7
+
+L(word8_count_loop_no_dcbt): /* Copy 32 bytes at a time */
+ lwz r6,0(r4)
+ lwz r7,4(r4)
+ lwz r8,8(r4)
+ lwz r9,12(r4)
+ subi r5,r5,0x20
+ stw r6,0(r3)
+ stw r7,4(r3)
+ stw r8,8(r3)
+ stw r9,12(r3)
+ lwz r6,16(r4)
+ lwz r7,20(r4)
+ lwz r8,24(r4)
+ lwz r9,28(r4)
+ addi r4,r4,0x20
+ stw r6,16(r3)
+ stw r7,20(r3)
+ stw r8,24(r3)
+ stw r9,28(r3)
+ addi r3,r3,0x20
+ bdnz L(word8_count_loop_no_dcbt)
+
+L(preword2_count_loop): /* Copy remaining 0-31 bytes */
+ clrlwi. r12,r5,27
+ beq L(end_memcpy)
+ mtxer r12
+ lswx r5,0,r4
+ stswx r5,0,r3
+ mr r3,r0
+ blr
+
+L(string_count_loop): /* Copy odd 0-31 bytes */
+ clrlwi. r12,r5,28
+ add r3,r3,r5
+ add r4,r4,r5
+ beq L(pre_string_copy)
+ mtxer r12
+ subf r4,r12,r4
+ subf r3,r12,r3
+ lswx r6,0,r4
+ stswx r6,0,r3
+
+L(pre_string_copy): /* Check how many 32 byte chunks to copy */
+ srwi. r7,r5,4
+ beq L(end_memcpy)
+ mtctr r7
+
+L(word4_count_loop_no_dcbt): /* Copy 32 bytes at a time */
+ lwz r6,-4(r4)
+ lwz r7,-8(r4)
+ lwz r8,-12(r4)
+ lwzu r9,-16(r4)
+ stw r6,-4(r3)
+ stw r7,-8(r3)
+ stw r8,-12(r3)
+ stwu r9,-16(r3)
+ bdz L(end_memcpy)
+ lwz r6,-4(r4)
+ lwz r7,-8(r4)
+ lwz r8,-12(r4)
+ lwzu r9,-16(r4)
+ stw r6,-4(r3)
+ stw r7,-8(r3)
+ stw r8,-12(r3)
+ stwu r9,-16(r3)
+ bdnz L(word4_count_loop_no_dcbt)
+
+L(end_memcpy):
+ mr r3,r0
+ blr
+END (memcpy)
+libc_hidden_builtin_def (memcpy)
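
The copy strategy in memcpy.S follows the usual align-then-block pattern. A minimal C sketch of that pattern, with invented names and no claim to match the assembly's exact thresholds or register usage:

    #include <stddef.h>
    #include <stdint.h>
    #include <string.h>

    /* Sketch: byte-copy until the destination is word aligned, move 32-byte
       blocks, then copy the 0-31 byte tail (the assembly uses lswx/stswx for
       the tail and only aligns when the copy is larger than 256 bytes).  */
    static void *
    memcpy_ref (void *dst, const void *src, size_t n)
    {
      unsigned char *d = dst;
      const unsigned char *s = src;

      while (n > 0 && (uintptr_t) d % 4 != 0)
        {
          *d++ = *s++;
          n--;
        }
      while (n >= 32)
        {
          memcpy (d, s, 32);            /* one unrolled 8-word iteration */
          d += 32; s += 32; n -= 32;
        }
      memcpy (d, s, n);                 /* trailing bytes */
      return dst;
    }
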
diff --git a/libc/sysdeps/powerpc/powerpc32/405/memset.S b/libc/sysdeps/powerpc/powerpc32/405/memset.S
new file mode 100644
index 000000000..b73dba887
--- /dev/null
+++ b/libc/sysdeps/powerpc/powerpc32/405/memset.S
@@ -0,0 +1,152 @@
+/* Optimized memset for PowerPC405,440,464 (32-byte cacheline).
+ Copyright (C) 2012-2013 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library. If not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <sysdep.h>
+
+/* memset
+
+ r3:destination address and return address
+ r4:source integer to copy
+ r5:byte count
+ r11:source integer copied into all 32 bits of reg
+ r12:temp return address
+
+ Save return address in r12
+ If the destination is unaligned and the count is greater than 255 bytes,
+ set 0-3 bytes to make the destination aligned.
+ If the count is greater than 255 bytes and memory is being set to zero,
+ use dcbz to set memory where we can;
+ otherwise do the following:
+ If there are 16 or more words to set, use the 16-word set loop.
+ Finally set the 0-15 extra bytes with a string store. */
+
+EALIGN (memset, 5, 0)
+ rlwinm r11,r4,0,24,31
+ rlwimi r11,r4,8,16,23
+ rlwimi r11,r11,16,0,15
+ addi r12,r3,0
+ cmpwi r5,0x00FF
+ ble L(preword8_count_loop)
+ cmpwi r4,0x00
+ beq L(use_dcbz)
+ neg r6,r3
+ clrlwi. r6,r6,30
+ beq L(preword8_count_loop)
+ addi r8,0,1
+ mtctr r6
+ subi r3,r3,1
+
+L(unaligned_bytecopy_loop):
+ stbu r11,0x1(r3)
+ subf. r5,r8,r5
+ beq L(end_memset)
+ bdnz L(unaligned_bytecopy_loop)
+ addi r3,r3,1
+
+L(preword8_count_loop):
+ srwi. r6,r5,4
+ beq L(preword2_count_loop)
+ mtctr r6
+ addi r3,r3,-4
+ mr r8,r11
+ mr r9,r11
+ mr r10,r11
+
+L(word8_count_loop_no_dcbt):
+ stwu r8,4(r3)
+ stwu r9,4(r3)
+ subi r5,r5,0x10
+ stwu r10,4(r3)
+ stwu r11,4(r3)
+ bdnz L(word8_count_loop_no_dcbt)
+ addi r3,r3,4
+
+L(preword2_count_loop):
+ clrlwi. r7,r5,28
+ beq L(end_memset)
+ mr r8,r11
+ mr r9,r11
+ mr r10,r11
+ mtxer r7
+ stswx r8,0,r3
+
+L(end_memset):
+ addi r3,r12,0
+ blr
+
+L(use_dcbz):
+ neg r6,r3
+ clrlwi. r7,r6,28
+ beq L(skip_string_loop)
+ mr r8,r11
+ mr r9,r11
+ mr r10,r11
+ subf r5,r7,r5
+ mtxer r7
+ stswx r8,0,r3
+ add r3,r3,r7
+
+L(skip_string_loop):
+ clrlwi r8,r6,27
+ srwi. r8,r8,4
+ beq L(dcbz_pre_loop)
+ mtctr r8
+
+L(word_loop):
+ stw r11,0(r3)
+ subi r5,r5,0x10
+ stw r11,4(r3)
+ stw r11,8(r3)
+ stw r11,12(r3)
+ addi r3,r3,0x10
+ bdnz L(word_loop)
+
+L(dcbz_pre_loop):
+ srwi r6,r5,5
+ mtctr r6
+ addi r7,0,0
+
+L(dcbz_loop):
+ dcbz r3,r7
+ addi r3,r3,0x20
+ subi r5,r5,0x20
+ bdnz L(dcbz_loop)
+ srwi. r6,r5,4
+ beq L(postword2_count_loop)
+ mtctr r6
+
+L(postword8_count_loop):
+ stw r11,0(r3)
+ subi r5,r5,0x10
+ stw r11,4(r3)
+ stw r11,8(r3)
+ stw r11,12(r3)
+ addi r3,r3,0x10
+ bdnz L(postword8_count_loop)
+
+L(postword2_count_loop):
+ clrlwi. r7,r5,28
+ beq L(end_memset)
+ mr r8,r11
+ mr r9,r11
+ mr r10,r11
+ mtxer r7
+ stswx r8,0,r3
+ b L(end_memset)
+END (memset)
+libc_hidden_builtin_def (memset)
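
The first three instructions of memset.S replicate the fill byte into every byte of a register; the rest of the routine stores that word, or clears whole 32-byte cache lines with dcbz when the fill value is zero. A hedged C sketch of the word-splat approach (invented names, no dcbz equivalent):

    #include <stddef.h>
    #include <stdint.h>
    #include <string.h>

    /* Sketch: splat the fill byte into a 32-bit word (the rlwinm/rlwimi pair
       above does this), align the destination, store whole words, then finish
       the remaining bytes.  */
    static void *
    memset_ref (void *dst, int c, size_t n)
    {
      unsigned char *d = dst;
      uint32_t word = (unsigned char) c;
      word |= word << 8;
      word |= word << 16;               /* all four bytes now hold C */

      while (n > 0 && (uintptr_t) d % 4 != 0)
        {
          *d++ = (unsigned char) c;
          n--;
        }
      while (n >= 4)
        {
          memcpy (d, &word, 4);
          d += 4; n -= 4;
        }
      while (n-- > 0)
        *d++ = (unsigned char) c;
      return dst;
    }
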
diff --git a/libc/sysdeps/powerpc/powerpc32/405/strcmp.S b/libc/sysdeps/powerpc/powerpc32/405/strcmp.S
new file mode 100644
index 000000000..c0b21907b
--- /dev/null
+++ b/libc/sysdeps/powerpc/powerpc32/405/strcmp.S
@@ -0,0 +1,134 @@
+/* Optimized strcmp implementation for PowerPC476.
+ Copyright (C) 2010-2013 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library. If not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <sysdep.h>
+
+/* strcmp
+
+ Register Use
+ r0:temp return equality
+ r3:source1 address, return equality
+ r4:source2 address
+
+ Implementation description
+ Compare two words at a time from src1 and src2. If they differ, jump to
+ the end and return src1 > src2 or src1 < src2.
+ If a null byte is found, compare the bytes up to the null, then jump to
+ the end and return src1 > src2, src1 < src2 or src1 = src2.
+ If src1 = src2 and no null is found, repeat. */
+
+EALIGN (strcmp,5,0)
+ neg r7,r3
+ clrlwi r7,r7,20
+ neg r8,r4
+ clrlwi r8,r8,20
+ srwi. r7,r7,5
+ beq L(byte_loop)
+ srwi. r8,r8,5
+ beq L(byte_loop)
+ cmplw r7,r8
+ mtctr r7
+ ble L(big_loop)
+ mtctr r8
+
+L(big_loop):
+ lwz r5,0(r3)
+ lwz r6,4(r3)
+ lwz r8,0(r4)
+ lwz r9,4(r4)
+ dlmzb. r12,r5,r6
+ bne L(end_check)
+ cmplw r5,r8
+ bne L(st1)
+ cmplw r6,r9
+ bne L(st1)
+ lwz r5,8(r3)
+ lwz r6,12(r3)
+ lwz r8,8(r4)
+ lwz r9,12(r4)
+ dlmzb. r12,r5,r6
+ bne L(end_check)
+ cmplw r5,r8
+ bne L(st1)
+ cmplw r6,r9
+ bne L(st1)
+ lwz r5,16(r3)
+ lwz r6,20(r3)
+ lwz r8,16(r4)
+ lwz r9,20(r4)
+ dlmzb. r12,r5,r6
+ bne L(end_check)
+ cmplw r5,r8
+ bne L(st1)
+ cmplw r6,r9
+ bne L(st1)
+ lwz r5,24(r3)
+ lwz r6,28(r3)
+ addi r3,r3,0x20
+ lwz r8,24(r4)
+ lwz r9,28(r4)
+ addi r4,r4,0x20
+ dlmzb. r12,r5,r6
+ bne L(end_check)
+ cmplw r5,r8
+ bne L(st1)
+ cmplw r6,r9
+ bne L(st1)
+ bdnz L(big_loop)
+ b L(byte_loop)
+
+L(end_check):
+ subfic r12,r12,4
+ blt L(end_check2)
+ rlwinm r12,r12,3,0,31
+ srw r5,r5,r12
+ srw r8,r8,r12
+ cmplw r5,r8
+ bne L(st1)
+ b L(end_strcmp)
+
+L(end_check2):
+ addi r12,r12,4
+ cmplw r5,r8
+ rlwinm r12,r12,3,0,31
+ bne L(st1)
+ srw r6,r6,r12
+ srw r9,r9,r12
+ cmplw r6,r9
+ bne L(st1)
+
+L(end_strcmp):
+ addi r3,r0,0
+ blr
+
+L(st1):
+ mfcr r3
+ blr
+
+L(byte_loop):
+ lbz r5,0(r3)
+ addi r3,r3,1
+ lbz r6,0(r4)
+ addi r4,r4,1
+ cmplw r5,r6
+ bne L(st1)
+ cmpwi r5,0
+ beq L(end_strcmp)
+ b L(byte_loop)
+END (strcmp)
+libc_hidden_builtin_def (strcmp)
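
The dlmzb instruction used throughout these 405/476 string routines scans the eight bytes held in two registers for a zero byte. The sketch below is a simplified model inferred from how the result is used here (strlen.S below adds the result and finally subtracts one, so the count appears to include the terminating zero byte); consult the PowerPC 405 manual for the instruction's exact definition, including its CR0 side effects.

    #include <stdint.h>

    /* Hypothetical model: HI and LO hold 8 consecutive string bytes in
       big-endian order; return how many bytes the string occupies in them,
       counting the zero byte itself, or 8 when no zero byte is present.  */
    static int
    dlmzb_model (uint32_t hi, uint32_t lo)
    {
      uint64_t v = ((uint64_t) hi << 32) | lo;
      for (int i = 0; i < 8; i++)
        if (((v >> (56 - 8 * i)) & 0xff) == 0)
          return i + 1;                 /* count includes the zero byte */
      return 8;
    }
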
diff --git a/libc/sysdeps/powerpc/powerpc32/405/strcpy.S b/libc/sysdeps/powerpc/powerpc32/405/strcpy.S
new file mode 100644
index 000000000..d7c84569d
--- /dev/null
+++ b/libc/sysdeps/powerpc/powerpc32/405/strcpy.S
@@ -0,0 +1,107 @@
+/* Optimized strcpy implementation for PowerPC476.
+ Copyright (C) 2010-2013 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library. If not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <sysdep.h>
+
+/* strcpy
+
+ Register Use
+ r3:destination and return address
+ r4:source address
+ r10:temp destination address
+
+ Implementation description
+ Loop by checking 2 words at a time with dlmzb, testing whether there is a
+ null in the 2 words. If there is a null, jump to the end checks to determine
+ where in the last 8 bytes it lies, and copy the appropriate bytes of the
+ last 8 according to the null position. */
+
+EALIGN (strcpy, 5, 0)
+ neg r7,r4
+ subi r4,r4,1
+ clrlwi. r8,r7,29
+ subi r10,r3,1
+ beq L(pre_word8_loop)
+ mtctr r8
+
+L(loop):
+ lbzu r5,0x01(r4)
+ cmpi cr5,r5,0x0
+ stbu r5,0x01(r10)
+ beq cr5,L(end_strcpy)
+ bdnz L(loop)
+
+L(pre_word8_loop):
+ subi r4,r4,3
+ subi r10,r10,3
+
+L(word8_loop):
+ lwzu r5,0x04(r4)
+ lwzu r6,0x04(r4)
+ dlmzb. r11,r5,r6
+ bne L(byte_copy)
+ stwu r5,0x04(r10)
+ stwu r6,0x04(r10)
+ lwzu r5,0x04(r4)
+ lwzu r6,0x04(r4)
+ dlmzb. r11,r5,r6
+ bne L(byte_copy)
+ stwu r5,0x04(r10)
+ stwu r6,0x04(r10)
+ lwzu r5,0x04(r4)
+ lwzu r6,0x04(r4)
+ dlmzb. r11,r5,r6
+ bne L(byte_copy)
+ stwu r5,0x04(r10)
+ stwu r6,0x04(r10)
+ lwzu r5,0x04(r4)
+ lwzu r6,0x04(r4)
+ dlmzb. r11,r5,r6
+ bne L(byte_copy)
+ stwu r5,0x04(r10)
+ stwu r6,0x04(r10)
+ b L(word8_loop)
+
+L(last_bytes_copy):
+ stwu r5,0x04(r10)
+ subi r11,r11,4
+ mtctr r11
+ addi r10,r10,3
+ subi r4,r4,1
+
+L(last_bytes_copy_loop):
+ lbzu r5,0x01(r4)
+ stbu r5,0x01(r10)
+ bdnz L(last_bytes_copy_loop)
+ blr
+
+L(byte_copy):
+ blt L(last_bytes_copy)
+ mtctr r11
+ addi r10,r10,3
+ subi r4,r4,5
+
+L(last_bytes_copy_loop2):
+ lbzu r5,0x01(r4)
+ stbu r5,0x01(r10)
+ bdnz L(last_bytes_copy_loop2)
+
+L(end_strcpy):
+ blr
+END (strcpy)
+libc_hidden_builtin_def (strcpy)
diff --git a/libc/sysdeps/powerpc/powerpc32/405/strlen.S b/libc/sysdeps/powerpc/powerpc32/405/strlen.S
new file mode 100644
index 000000000..77d22ea67
--- /dev/null
+++ b/libc/sysdeps/powerpc/powerpc32/405/strlen.S
@@ -0,0 +1,75 @@
+/* Optimized strlen implementation for PowerPC476.
+ Copyright (C) 2010-2013 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library. If not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <sysdep.h>
+
+/* strlen
+
+ Register Use
+ r3:source address and return length of string
+ r4:byte counter
+
+ Implementation description
+ Load 2 words at a time and count the bytes. When the null is found, subtract
+ one from the count and return it; the subtraction is needed because the
+ null character itself must not be counted. */
+
+EALIGN (strlen,5,0)
+ neg r7,r3
+ clrlwi. r8,r7,29
+ addi r4,0,0
+ beq L(byte_count_loop)
+ mtctr r8
+
+L(loop):
+ lbz r5,0(r3)
+ cmpi cr5,r5,0x0
+ addi r3,r3,0x1
+ addi r4,r4,0x1
+ beq cr5,L(end_strlen)
+ bdnz L(loop)
+
+L(byte_count_loop):
+ lwz r5,0(r3)
+ lwz r6,4(r3)
+ dlmzb. r12,r5,r6
+ add r4,r4,r12
+ bne L(end_strlen)
+ lwz r5,8(r3)
+ lwz r6,12(r3)
+ dlmzb. r12,r5,r6
+ add r4,r4,r12
+ bne L(end_strlen)
+ lwz r5,16(r3)
+ lwz r6,20(r3)
+ dlmzb. r12,r5,r6
+ add r4,r4,r12
+ bne L(end_strlen)
+ lwz r5,24(r3)
+ lwz r6,28(r3)
+ addi r3,r3,0x20
+ dlmzb. r12,r5,r6
+ add r4,r4,r12
+ bne L(end_strlen)
+ b L(byte_count_loop)
+
+L(end_strlen):
+ addi r3,r4,-1
+ blr
+END (strlen)
+libc_hidden_builtin_def (strlen)
diff --git a/libc/sysdeps/powerpc/powerpc32/405/strncmp.S b/libc/sysdeps/powerpc/powerpc32/405/strncmp.S
new file mode 100644
index 000000000..3e2ba5f85
--- /dev/null
+++ b/libc/sysdeps/powerpc/powerpc32/405/strncmp.S
@@ -0,0 +1,128 @@
+/* Optimized strncmp implementation for PowerPC476.
+ Copyright (C) 2010-2013 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library. If not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <sysdep.h>
+
+/* strncmp
+
+ Register Use
+ r0:temp return equality
+ r3:source1 address, return equality
+ r4:source2 address
+ r5:byte count
+
+ Implementation description
+ Touch in 3 lines of D-cache.
+ If source1 or source2 is unaligned, compare 0-3 bytes first so that source1 is aligned.
+ Compare two words at a time from src1 and src2. If they differ, jump to
+ the end and return src1 > src2 or src1 < src2.
+ If a null byte is found, compare the bytes up to the null, then jump to
+ the end and return src1 > src2, src1 < src2 or src1 = src2.
+ When the count is exhausted, compare the remaining bytes, then jump to
+ the end and return src1 > src2, src1 < src2 or src1 = src2.
+ If src1 = src2 and neither a null nor the end of the count is reached, repeat. */
+
+EALIGN (strncmp,5,0)
+ neg r7,r3
+ clrlwi r7,r7,20
+ neg r8,r4
+ clrlwi r8,r8,20
+ srwi. r7,r7,3
+ beq L(prebyte_count_loop)
+ srwi. r8,r8,3
+ beq L(prebyte_count_loop)
+ cmplw r7,r8
+ mtctr r7
+ ble L(preword2_count_loop)
+ mtctr r8
+
+L(preword2_count_loop):
+ srwi. r6,r5,3
+ beq L(prebyte_count_loop)
+ mfctr r7
+ cmplw r6,r7
+ bgt L(set_count_loop)
+ mtctr r6
+ clrlwi r5,r5,29
+
+L(word2_count_loop):
+ lwz r10,0(r3)
+ lwz r6,4(r3)
+ addi r3,r3,0x08
+ lwz r8,0(r4)
+ lwz r9,4(r4)
+ addi r4,r4,0x08
+ dlmzb. r12,r10,r6
+ bne L(end_check)
+ cmplw r10,r8
+ bne L(st1)
+ cmplw r6,r9
+ bne L(st1)
+ bdnz L(word2_count_loop)
+
+L(prebyte_count_loop):
+ addi r5,r5,1
+ mtctr r5
+ bdz L(end_strncmp)
+
+L(byte_count_loop):
+ lbz r6,0(r3)
+ addi r3,r3,1
+ lbz r7,0(r4)
+ addi r4,r4,1
+ cmplw r6,r7
+ bne L(st1)
+ cmpwi r6,0
+ beq L(end_strncmp)
+ bdnz L(byte_count_loop)
+ b L(end_strncmp)
+
+L(set_count_loop):
+ slwi r7,r7,3
+ subf r5,r7,r5
+ b L(word2_count_loop)
+
+L(end_check):
+ subfic r12,r12,4
+ blt L(end_check2)
+ rlwinm r12,r12,3,0,31
+ srw r10,r10,r12
+ srw r8,r8,r12
+ cmplw r10,r8
+ bne L(st1)
+ b L(end_strncmp)
+
+L(end_check2):
+ addi r12,r12,4
+ cmplw r10,r8
+ rlwinm r12,r12,3,0,31
+ bne L(st1)
+ srw r6,r6,r12
+ srw r9,r9,r12
+ cmplw r6,r9
+ bne L(st1)
+
+L(end_strncmp):
+ addi r3,r0,0
+ blr
+
+L(st1):
+ mfcr r3
+ blr
+END (strncmp)
+libc_hidden_builtin_def (strncmp)
diff --git a/libc/sysdeps/powerpc/powerpc32/440/Implies b/libc/sysdeps/powerpc/powerpc32/440/Implies
new file mode 100644
index 000000000..70c0d2eda
--- /dev/null
+++ b/libc/sysdeps/powerpc/powerpc32/440/Implies
@@ -0,0 +1,2 @@
+powerpc/powerpc32/405/fpu
+powerpc/powerpc32/405
diff --git a/libc/sysdeps/powerpc/powerpc32/464/Implies b/libc/sysdeps/powerpc/powerpc32/464/Implies
new file mode 100644
index 000000000..c3e52c550
--- /dev/null
+++ b/libc/sysdeps/powerpc/powerpc32/464/Implies
@@ -0,0 +1,2 @@
+powerpc/powerpc32/440/fpu
+powerpc/powerpc32/440
diff --git a/libc/sysdeps/powerpc/powerpc32/476/Implies b/libc/sysdeps/powerpc/powerpc32/476/Implies
new file mode 100644
index 000000000..2829f9cca
--- /dev/null
+++ b/libc/sysdeps/powerpc/powerpc32/476/Implies
@@ -0,0 +1,2 @@
+powerpc/powerpc32/464/fpu
+powerpc/powerpc32/464
diff --git a/libc/sysdeps/powerpc/powerpc32/476/memset.S b/libc/sysdeps/powerpc/powerpc32/476/memset.S
new file mode 100644
index 000000000..48c21d620
--- /dev/null
+++ b/libc/sysdeps/powerpc/powerpc32/476/memset.S
@@ -0,0 +1,152 @@
+/* Optimized memset for PowerPC476 (128-byte cacheline).
+ Copyright (C) 2010-2013 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library. If not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <sysdep.h>
+
+/* memset
+
+ r3:destination address and return address
+ r4:source integer to copy
+ r5:byte count
+ r11:source integer copied into all 32 bits of reg
+ r12:temp return address
+
+ Save return address in r12
+ If the destination is unaligned and the count is greater than 255 bytes,
+ set 0-3 bytes to make the destination aligned.
+ If the count is greater than 255 bytes and memory is being set to zero,
+ use dcbz to set memory where we can;
+ otherwise do the following:
+ If there are 16 or more words to set, use the 16-word set loop.
+ Finally set the 0-15 extra bytes with a string store. */
+
+EALIGN (memset, 5, 0)
+ rlwinm r11,r4,0,24,31
+ rlwimi r11,r4,8,16,23
+ rlwimi r11,r11,16,0,15
+ addi r12,r3,0
+ cmpwi r5,0x00FF
+ ble L(preword8_count_loop)
+ cmpwi r4,0x00
+ beq L(use_dcbz)
+ neg r6,r3
+ clrlwi. r6,r6,30
+ beq L(preword8_count_loop)
+ addi r8,0,1
+ mtctr r6
+ subi r3,r3,1
+
+L(unaligned_bytecopy_loop):
+ stbu r11,0x1(r3)
+ subf. r5,r8,r5
+ beq L(end_memset)
+ bdnz L(unaligned_bytecopy_loop)
+ addi r3,r3,1
+
+L(preword8_count_loop):
+ srwi. r6,r5,4
+ beq L(preword2_count_loop)
+ mtctr r6
+ addi r3,r3,-4
+ mr r8,r11
+ mr r9,r11
+ mr r10,r11
+
+L(word8_count_loop_no_dcbt):
+ stwu r8,4(r3)
+ stwu r9,4(r3)
+ subi r5,r5,0x10
+ stwu r10,4(r3)
+ stwu r11,4(r3)
+ bdnz L(word8_count_loop_no_dcbt)
+ addi r3,r3,4
+
+L(preword2_count_loop):
+ clrlwi. r7,r5,28
+ beq L(end_memset)
+ mr r8,r11
+ mr r9,r11
+ mr r10,r11
+ mtxer r7
+ stswx r8,0,r3
+
+L(end_memset):
+ addi r3,r12,0
+ blr
+
+L(use_dcbz):
+ neg r6,r3
+ clrlwi. r7,r6,28
+ beq L(skip_string_loop)
+ mr r8,r11
+ mr r9,r11
+ mr r10,r11
+ subf r5,r7,r5
+ mtxer r7
+ stswx r8,0,r3
+ add r3,r3,r7
+
+L(skip_string_loop):
+ clrlwi r8,r6,25
+ srwi. r8,r8,4
+ beq L(dcbz_pre_loop)
+ mtctr r8
+
+L(word_loop):
+ stw r11,0(r3)
+ subi r5,r5,0x10
+ stw r11,4(r3)
+ stw r11,8(r3)
+ stw r11,12(r3)
+ addi r3,r3,0x10
+ bdnz L(word_loop)
+
+L(dcbz_pre_loop):
+ srwi r6,r5,7
+ mtctr r6
+ addi r7,0,0
+
+L(dcbz_loop):
+ dcbz r3,r7
+ addi r3,r3,0x80
+ subi r5,r5,0x80
+ bdnz L(dcbz_loop)
+ srwi. r6,r5,4
+ beq L(postword2_count_loop)
+ mtctr r6
+
+L(postword8_count_loop):
+ stw r11,0(r3)
+ subi r5,r5,0x10
+ stw r11,4(r3)
+ stw r11,8(r3)
+ stw r11,12(r3)
+ addi r3,r3,0x10
+ bdnz L(postword8_count_loop)
+
+L(postword2_count_loop):
+ clrlwi. r7,r5,28
+ beq L(end_memset)
+ mr r8,r11
+ mr r9,r11
+ mr r10,r11
+ mtxer r7
+ stswx r8,0,r3
+ b L(end_memset)
+END (memset)
+libc_hidden_builtin_def (memset)
diff --git a/libc/sysdeps/powerpc/powerpc32/Makefile b/libc/sysdeps/powerpc/powerpc32/Makefile
index 64f79003a..cf620c826 100644
--- a/libc/sysdeps/powerpc/powerpc32/Makefile
+++ b/libc/sysdeps/powerpc/powerpc32/Makefile
@@ -1,8 +1,12 @@
# Powerpc32 specific build options.
-ifeq ($(with-fp),no)
-+cflags += -msoft-float
-sysdep-LDFLAGS += -msoft-float
+# Some Powerpc32 variants assume soft-fp is the default even though there is
+# an fp variant so provide -mhard-float if --with-fp is explicitly passed.
+
+ifeq ($(with-fp),yes)
++cflags += -mhard-float
+ASFLAGS += -mhard-float
+sysdep-LDFLAGS += -mhard-float
endif
ifeq ($(subdir),gmon)
diff --git a/libc/sysdeps/powerpc/powerpc32/__longjmp-common.S b/libc/sysdeps/powerpc/powerpc32/__longjmp-common.S
index 787447363..df1d5195f 100644
--- a/libc/sysdeps/powerpc/powerpc32/__longjmp-common.S
+++ b/libc/sysdeps/powerpc/powerpc32/__longjmp-common.S
@@ -24,6 +24,12 @@
# include <jmpbuf-offsets.h>
#endif
+#if defined __SPE__ || (defined __NO_FPRS__ && !defined _SOFT_FLOAT)
+# define LOAD_GP(N) evldd r##N,((JB_FPRS+((N)-14)*2)*4)(r3)
+#else
+# define LOAD_GP(N) lwz r##N,((JB_GPRS+(N)-14)*4)(r3)
+#endif
+
ENTRY (__longjmp)
#if defined PTR_DEMANGLE || defined CHECK_SP
@@ -39,13 +45,13 @@ ENTRY (__longjmp)
lwz r1,(JB_GPR1*4)(r3)
#endif
lwz r0,(JB_LR*4)(r3)
- lwz r14,((JB_GPRS+0)*4)(r3)
- lwz r15,((JB_GPRS+1)*4)(r3)
- lwz r16,((JB_GPRS+2)*4)(r3)
- lwz r17,((JB_GPRS+3)*4)(r3)
- lwz r18,((JB_GPRS+4)*4)(r3)
- lwz r19,((JB_GPRS+5)*4)(r3)
- lwz r20,((JB_GPRS+6)*4)(r3)
+ LOAD_GP (14)
+ LOAD_GP (15)
+ LOAD_GP (16)
+ LOAD_GP (17)
+ LOAD_GP (18)
+ LOAD_GP (19)
+ LOAD_GP (20)
#ifdef PTR_DEMANGLE
# ifndef CHECK_SP
PTR_DEMANGLE3 (r1, r24, r25)
@@ -53,19 +59,19 @@ ENTRY (__longjmp)
PTR_DEMANGLE2 (r0, r25)
#endif
mtlr r0
- lwz r21,((JB_GPRS+7)*4)(r3)
- lwz r22,((JB_GPRS+8)*4)(r3)
+ LOAD_GP (21)
+ LOAD_GP (22)
lwz r0,(JB_CR*4)(r3)
- lwz r23,((JB_GPRS+9)*4)(r3)
- lwz r24,((JB_GPRS+10)*4)(r3)
- lwz r25,((JB_GPRS+11)*4)(r3)
+ LOAD_GP (23)
+ LOAD_GP (24)
+ LOAD_GP (25)
mtcrf 0xFF,r0
- lwz r26,((JB_GPRS+12)*4)(r3)
- lwz r27,((JB_GPRS+13)*4)(r3)
- lwz r28,((JB_GPRS+14)*4)(r3)
- lwz r29,((JB_GPRS+15)*4)(r3)
- lwz r30,((JB_GPRS+16)*4)(r3)
- lwz r31,((JB_GPRS+17)*4)(r3)
+ LOAD_GP (26)
+ LOAD_GP (27)
+ LOAD_GP (28)
+ LOAD_GP (29)
+ LOAD_GP (30)
+ LOAD_GP (31)
mr r3,r4
blr
END (__longjmp)
diff --git a/libc/sysdeps/powerpc/powerpc32/bsd-_setjmp.S b/libc/sysdeps/powerpc/powerpc32/bsd-_setjmp.S
index 95e8a5aa1..ad2b5ffdb 100644
--- a/libc/sysdeps/powerpc/powerpc32/bsd-_setjmp.S
+++ b/libc/sysdeps/powerpc/powerpc32/bsd-_setjmp.S
@@ -30,7 +30,7 @@ libc_hidden_def (_setjmp)
/* Build a versioned object for libc. */
# if defined SHARED && SHLIB_COMPAT (libc, GLIBC_2_0, GLIBC_2_3_4)
-symbol_version (__novmx_setjmp,_setjmp,GLIBC_2.0);
+compat_symbol (libc, __novmx_setjmp, _setjmp, GLIBC_2_0);
ENTRY (__novmx_setjmp)
li r4,0 /* Set second argument to 0. */
@@ -39,7 +39,7 @@ END (__novmx_setjmp)
libc_hidden_def (__novmx_setjmp)
# endif /* defined SHARED && SHLIB_COMPAT (libc, GLIBC_2_0, GLIBC_2_3_4) */
-default_symbol_version (__vmx_setjmp,_setjmp,GLIBC_2.3.4)
+versioned_symbol (libc, __vmx_setjmp, _setjmp, GLIBC_2_3_4)
/* __GI__setjmp prototype is needed for ntpl i.e. _setjmp is defined
as a libc_hidden_proto & is used in sysdeps/generic/libc-start.c
if HAVE_CLEANUP_JMP_BUF is defined */
diff --git a/libc/sysdeps/powerpc/powerpc32/bsd-setjmp.S b/libc/sysdeps/powerpc/powerpc32/bsd-setjmp.S
index 1113ea533..5e1e860d8 100644
--- a/libc/sysdeps/powerpc/powerpc32/bsd-setjmp.S
+++ b/libc/sysdeps/powerpc/powerpc32/bsd-setjmp.S
@@ -26,7 +26,7 @@ ENTRY (__novmxsetjmp)
b __novmx__sigsetjmp@local
END (__novmxsetjmp)
strong_alias (__novmxsetjmp, __novmx__setjmp)
-symbol_version (__novmxsetjmp, setjmp, GLIBC_2.0)
+compat_symbol (libc, __novmxsetjmp, setjmp, GLIBC_2_0)
#endif /* defined SHARED && SHLIB_COMPAT (libc, GLIBC_2_0, GLIBC_2_3_4) ) */
@@ -36,4 +36,4 @@ ENTRY (__vmxsetjmp)
END (__vmxsetjmp)
strong_alias (__vmxsetjmp, __vmx__setjmp)
strong_alias (__vmx__setjmp, __setjmp)
-default_symbol_version (__vmxsetjmp,setjmp,GLIBC_2.3.4)
+versioned_symbol (libc, __vmxsetjmp, setjmp, GLIBC_2_3_4)
diff --git a/libc/sysdeps/powerpc/powerpc32/dl-machine.c b/libc/sysdeps/powerpc/powerpc32/dl-machine.c
index 3e7202d86..aba361856 100644
--- a/libc/sysdeps/powerpc/powerpc32/dl-machine.c
+++ b/libc/sysdeps/powerpc/powerpc32/dl-machine.c
@@ -416,6 +416,12 @@ __process_machine_rela (struct link_map *map,
Elf32_Addr const finaladdr,
int rinfo)
{
+ union unaligned
+ {
+ uint16_t u2;
+ uint32_t u4;
+ } __attribute__((__packed__));
+
switch (rinfo)
{
case R_PPC_NONE:
@@ -432,10 +438,7 @@ __process_machine_rela (struct link_map *map,
return;
case R_PPC_UADDR32:
- ((char *) reloc_addr)[0] = finaladdr >> 24;
- ((char *) reloc_addr)[1] = finaladdr >> 16;
- ((char *) reloc_addr)[2] = finaladdr >> 8;
- ((char *) reloc_addr)[3] = finaladdr;
+ ((union unaligned *) reloc_addr)->u4 = finaladdr;
break;
case R_PPC_ADDR24:
@@ -453,8 +456,7 @@ __process_machine_rela (struct link_map *map,
case R_PPC_UADDR16:
if (__builtin_expect (finaladdr > 0x7fff && finaladdr < 0xffff8000, 0))
_dl_reloc_overflow (map, "R_PPC_UADDR16", reloc_addr, refsym);
- ((char *) reloc_addr)[0] = finaladdr >> 8;
- ((char *) reloc_addr)[1] = finaladdr;
+ ((union unaligned *) reloc_addr)->u2 = finaladdr;
break;
case R_PPC_ADDR16_LO:
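
The packed union introduced above lets the compiler emit a single (possibly unaligned) store for R_PPC_UADDR32 and R_PPC_UADDR16 instead of the hand-written byte stores it replaces. A stand-alone sketch of the same technique, with invented names, not taken from the patch:

    #include <stdint.h>

    /* The packed attribute tells the compiler the access may be unaligned,
       so it emits whatever store sequence the target needs.  */
    union unaligned_word
    {
      uint16_t u2;
      uint32_t u4;
    } __attribute__ ((__packed__));

    static void
    store_unaligned_32 (void *addr, uint32_t value)
    {
      ((union unaligned_word *) addr)->u4 = value;
    }
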
diff --git a/libc/sysdeps/powerpc/powerpc32/e500/nofpu/Makefile b/libc/sysdeps/powerpc/powerpc32/e500/nofpu/Makefile
new file mode 100644
index 000000000..adf556870
--- /dev/null
+++ b/libc/sysdeps/powerpc/powerpc32/e500/nofpu/Makefile
@@ -0,0 +1,9 @@
+ifeq ($(subdir),math)
+libm-routines += fexcepts_to_spe fexcepts_from_spe
+libm-routines += fexcepts_to_prctl fexcepts_from_prctl
+libm-routines += fe_note_change
+endif
+
+ifeq ($(subdir),soft-fp)
+sysdep_routines += fraiseexcept-soft
+endif
diff --git a/libc/sysdeps/powerpc/powerpc32/e500/nofpu/fclrexcpt.c b/libc/sysdeps/powerpc/powerpc32/e500/nofpu/fclrexcpt.c
new file mode 100644
index 000000000..92a7dd1e0
--- /dev/null
+++ b/libc/sysdeps/powerpc/powerpc32/e500/nofpu/fclrexcpt.c
@@ -0,0 +1,53 @@
+/* Clear given exceptions in current floating-point environment. e500 version.
+ Copyright (C) 2004-2013 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <fenv_libc.h>
+
+#undef feclearexcept
+int
+__feclearexcept (int excepts)
+{
+ unsigned int fpescr;
+ int excepts_spe = __fexcepts_to_spe (excepts);
+
+ /* Get the current state. */
+ fpescr = fegetenv_register ();
+
+ /* Clear the relevant bits. */
+ fpescr &= ~excepts_spe;
+
+ /* Put the new state in effect. */
+ fesetenv_register (fpescr);
+
+ /* Let the kernel know if the "invalid" or "underflow" bit was
+ cleared. */
+ if (excepts & (FE_INVALID | FE_UNDERFLOW))
+ __fe_note_change ();
+
+ /* Success. */
+ return 0;
+}
+
+#include <shlib-compat.h>
+#if SHLIB_COMPAT (libm, GLIBC_2_1, GLIBC_2_2)
+strong_alias (__feclearexcept, __old_feclearexcept)
+compat_symbol (libm, __old_feclearexcept, feclearexcept, GLIBC_2_1);
+#endif
+
+libm_hidden_ver (__feclearexcept, feclearexcept)
+versioned_symbol (libm, __feclearexcept, feclearexcept, GLIBC_2_2);
diff --git a/libc/sysdeps/powerpc/powerpc32/e500/nofpu/fe_note_change.c b/libc/sysdeps/powerpc/powerpc32/e500/nofpu/fe_note_change.c
new file mode 100644
index 000000000..43a570626
--- /dev/null
+++ b/libc/sysdeps/powerpc/powerpc32/e500/nofpu/fe_note_change.c
@@ -0,0 +1,39 @@
+/* Note a change to floating-point exceptions.
+ Copyright (C) 2013 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <fenv_libc.h>
+#include <sysdep.h>
+#include <sys/prctl.h>
+
+/* Inform the kernel of a change to floating-point exceptions. */
+
+void
+__fe_note_change (void)
+{
+ int pflags, r;
+ INTERNAL_SYSCALL_DECL (err);
+
+ r = INTERNAL_SYSCALL (prctl, err, 2, PR_GET_FPEXC, &pflags);
+ if (INTERNAL_SYSCALL_ERROR_P (r, err))
+ return;
+ if ((pflags & PR_FP_EXC_SW_ENABLE) == 0)
+ INTERNAL_SYSCALL (prctl, err, 2, PR_SET_FPEXC,
+ pflags | PR_FP_EXC_SW_ENABLE);
+}
+
+libm_hidden_def (__fe_note_change)
diff --git a/libc/sysdeps/powerpc/powerpc32/e500/nofpu/fedisblxcpt.c b/libc/sysdeps/powerpc/powerpc32/e500/nofpu/fedisblxcpt.c
new file mode 100644
index 000000000..7cc963c01
--- /dev/null
+++ b/libc/sysdeps/powerpc/powerpc32/e500/nofpu/fedisblxcpt.c
@@ -0,0 +1,54 @@
+/* Disable floating-point exceptions. e500 version.
+ Copyright (C) 2004-2013 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <fenv_libc.h>
+#include <sysdep.h>
+#include <sys/prctl.h>
+
+int
+fedisableexcept (int excepts)
+{
+ int result = 0, pflags, r;
+ INTERNAL_SYSCALL_DECL (err);
+
+ r = INTERNAL_SYSCALL (prctl, err, 2, PR_GET_FPEXC, &pflags);
+ if (INTERNAL_SYSCALL_ERROR_P (r, err))
+ return -1;
+
+ /* Save old enable bits. */
+ result = __fexcepts_from_prctl (pflags);
+
+ pflags &= ~__fexcepts_to_prctl (excepts);
+ r = INTERNAL_SYSCALL (prctl, err, 2, PR_SET_FPEXC,
+ pflags | PR_FP_EXC_SW_ENABLE);
+ if (INTERNAL_SYSCALL_ERROR_P (r, err))
+ return -1;
+
+ /* If disabling signals for "inexact", also disable trapping to the
+ kernel. */
+ if ((excepts & FE_INEXACT) != 0)
+ {
+ unsigned long fpescr;
+
+ fpescr = fegetenv_register ();
+ fpescr &= ~SPEFSCR_FINXE;
+ fesetenv_register (fpescr);
+ }
+
+ return result;
+}
diff --git a/libc/sysdeps/powerpc/powerpc32/e500/nofpu/feenablxcpt.c b/libc/sysdeps/powerpc/powerpc32/e500/nofpu/feenablxcpt.c
new file mode 100644
index 000000000..133dde7b3
--- /dev/null
+++ b/libc/sysdeps/powerpc/powerpc32/e500/nofpu/feenablxcpt.c
@@ -0,0 +1,54 @@
+/* Enable floating-point exceptions. e500 version.
+ Copyright (C) 2004-2013 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <fenv_libc.h>
+#include <sysdep.h>
+#include <sys/prctl.h>
+
+int
+feenableexcept (int excepts)
+{
+ unsigned int result = 0, pflags, r;
+ INTERNAL_SYSCALL_DECL (err);
+
+ r = INTERNAL_SYSCALL (prctl, err, 2, PR_GET_FPEXC, &pflags);
+ if (INTERNAL_SYSCALL_ERROR_P (r, err))
+ return -1;
+
+ /* Save old enable bits. */
+ result = __fexcepts_from_prctl (pflags);
+
+ pflags |= __fexcepts_to_prctl (excepts);
+ r = INTERNAL_SYSCALL (prctl, err, 2, PR_SET_FPEXC,
+ pflags | PR_FP_EXC_SW_ENABLE);
+ if (INTERNAL_SYSCALL_ERROR_P (r, err))
+ return -1;
+
+ /* If enabling signals for "inexact", also enable trapping to the
+ kernel. */
+ if ((excepts & FE_INEXACT) != 0)
+ {
+ unsigned long fpescr;
+
+ fpescr = fegetenv_register ();
+ fpescr |= SPEFSCR_FINXE;
+ fesetenv_register (fpescr);
+ }
+
+ return result;
+}
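
feenableexcept and fedisableexcept are the usual GNU extensions to <fenv.h>; a minimal, hypothetical caller of the e500 implementation above might look like the following. On this target the call updates the prctl enable bits, and enabling FE_INEXACT would additionally set the SPEFSCR trap-enable bit as shown in the code.

    #define _GNU_SOURCE
    #include <fenv.h>

    /* Illustrative only: enable traps for divide-by-zero and invalid
       operations, reporting failure if the kernel refuses.  */
    static int
    enable_fp_traps (void)
    {
      if (feenableexcept (FE_DIVBYZERO | FE_INVALID) == -1)
        return -1;
      return 0;
    }
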
diff --git a/libc/sysdeps/powerpc/powerpc32/e500/nofpu/fegetenv.c b/libc/sysdeps/powerpc/powerpc32/e500/nofpu/fegetenv.c
new file mode 100644
index 000000000..bfcbca2ad
--- /dev/null
+++ b/libc/sysdeps/powerpc/powerpc32/e500/nofpu/fegetenv.c
@@ -0,0 +1,47 @@
+/* Store current floating-point environment. e500 version.
+ Copyright (C) 2004-2013 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <fenv_libc.h>
+#include <sysdep.h>
+#include <sys/prctl.h>
+
+int
+__fegetenv (fenv_t *envp)
+{
+ fenv_union_t u;
+ INTERNAL_SYSCALL_DECL (err);
+ int r;
+
+ r = INTERNAL_SYSCALL (prctl, err, 2, PR_GET_FPEXC, &u.l[0]);
+ if (INTERNAL_SYSCALL_ERROR_P (r, err))
+ return -1;
+
+ u.l[1] = fegetenv_register ();
+ *envp = u.fenv;
+
+ /* Success. */
+ return 0;
+}
+
+#include <shlib-compat.h>
+#if SHLIB_COMPAT (libm, GLIBC_2_1, GLIBC_2_2)
+strong_alias (__fegetenv, __old_fegetenv)
+compat_symbol (libm, __old_fegetenv, fegetenv, GLIBC_2_1);
+#endif
+
+versioned_symbol (libm, __fegetenv, fegetenv, GLIBC_2_2);
diff --git a/libc/sysdeps/powerpc/powerpc32/e500/nofpu/fegetexcept.c b/libc/sysdeps/powerpc/powerpc32/e500/nofpu/fegetexcept.c
new file mode 100644
index 000000000..9c7afc74f
--- /dev/null
+++ b/libc/sysdeps/powerpc/powerpc32/e500/nofpu/fegetexcept.c
@@ -0,0 +1,36 @@
+/* Get floating-point exceptions. e500 version.
+ Copyright (C) 2004-2013 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <fenv_libc.h>
+#include <sysdep.h>
+#include <sys/prctl.h>
+
+int
+fegetexcept (void)
+{
+ int result = 0, pflags, r;
+ INTERNAL_SYSCALL_DECL (err);
+
+ r = INTERNAL_SYSCALL (prctl, err, 2, PR_GET_FPEXC, &pflags);
+ if (INTERNAL_SYSCALL_ERROR_P (r, err))
+ return -1;
+
+ result = __fexcepts_from_prctl (pflags);
+
+ return result;
+}
diff --git a/libc/sysdeps/powerpc/powerpc32/e500/nofpu/fegetround.c b/libc/sysdeps/powerpc/powerpc32/e500/nofpu/fegetround.c
new file mode 100644
index 000000000..f69e9a5bd
--- /dev/null
+++ b/libc/sysdeps/powerpc/powerpc32/e500/nofpu/fegetround.c
@@ -0,0 +1,29 @@
+/* Return current rounding direction. e500 version.
+ Copyright (C) 2004-2013 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <fenv_libc.h>
+
+#undef fegetround
+int
+fegetround (void)
+{
+ unsigned long fpescr;
+
+ fpescr = fegetenv_register ();
+ return fpescr & 3;
+}
diff --git a/libc/sysdeps/powerpc/powerpc32/e500/nofpu/feholdexcpt.c b/libc/sysdeps/powerpc/powerpc32/e500/nofpu/feholdexcpt.c
new file mode 100644
index 000000000..bd05ebd3c
--- /dev/null
+++ b/libc/sysdeps/powerpc/powerpc32/e500/nofpu/feholdexcpt.c
@@ -0,0 +1,57 @@
+/* Store current floating-point environment and clear exceptions.
+ e500 version.
+ Copyright (C) 2004-2013 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <fenv_libc.h>
+#include <sysdep.h>
+#include <sys/prctl.h>
+
+int
+feholdexcept (fenv_t *envp)
+{
+ fenv_union_t u;
+ INTERNAL_SYSCALL_DECL (err);
+ int r;
+
+ /* Get the current state. */
+ r = INTERNAL_SYSCALL (prctl, err, 2, PR_GET_FPEXC, &u.l[0]);
+ if (INTERNAL_SYSCALL_ERROR_P (r, err))
+ return -1;
+
+ u.l[1] = fegetenv_register ();
+ *envp = u.fenv;
+
+ /* Clear everything except for the rounding mode and trapping to the
+ kernel. */
+ u.l[0] &= ~(PR_FP_EXC_DIV
+ | PR_FP_EXC_OVF
+ | PR_FP_EXC_UND
+ | PR_FP_EXC_RES
+ | PR_FP_EXC_INV);
+ u.l[1] &= SPEFSCR_FRMC | (SPEFSCR_ALL_EXCEPT_ENABLE & ~SPEFSCR_FINXE);
+
+ /* Put the new state in effect. */
+ fesetenv_register (u.l[1]);
+ r = INTERNAL_SYSCALL (prctl, err, 2, PR_SET_FPEXC,
+ u.l[0] | PR_FP_EXC_SW_ENABLE);
+ if (INTERNAL_SYSCALL_ERROR_P (r, err))
+ return -1;
+
+ return 0;
+}
+libm_hidden_def (feholdexcept)
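
feholdexcept is normally paired with fesetenv or feupdateenv around code that may raise spurious exceptions. A small illustrative caller, not part of the patch:

    #include <fenv.h>

    /* Run a computation with exceptions held, then restore the saved
       environment, discarding any flags raised in between.  */
    static double
    quiet_div (double a, double b)
    {
      fenv_t env;
      double r;

      feholdexcept (&env);              /* save state and mask exceptions */
      r = a / b;
      feclearexcept (FE_ALL_EXCEPT);    /* drop flags raised meanwhile */
      fesetenv (&env);                  /* restore the saved environment */
      return r;
    }
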
diff --git a/libc/sysdeps/powerpc/powerpc32/e500/nofpu/fenv_const.c b/libc/sysdeps/powerpc/powerpc32/e500/nofpu/fenv_const.c
new file mode 100644
index 000000000..3a85f1810
--- /dev/null
+++ b/libc/sysdeps/powerpc/powerpc32/e500/nofpu/fenv_const.c
@@ -0,0 +1,41 @@
+/* Constant floating-point environments for e500.
+ Copyright (C) 2004-2013 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+/* The use of "unsigned long long" as the type to define the
+ bit-pattern explicitly, rather than the type "double" used in
+ <bits/fenv.h>, means that we cannot include <fenv_libc.h> here to
+ get the enum constants for the SPEFSCR bits to enable
+ exceptions. */
+
+#include <sys/prctl.h>
+
+/* If the default argument is used we use this value. */
+const unsigned long long __fe_dfl_env __attribute__ ((aligned (8))) =
+ 0x3cULL;
+
+/* Floating-point environment where none of the exceptions are masked. */
+const unsigned long long __fe_enabled_env __attribute__ ((aligned (8))) =
+ (((unsigned long long) (PR_FP_EXC_DIV
+ | PR_FP_EXC_OVF
+ | PR_FP_EXC_UND
+ | PR_FP_EXC_RES
+ | PR_FP_EXC_INV)) << 32) | 0x7cULL;
+
+/* Non-IEEE mode. */
+const unsigned long long __fe_nonieee_env __attribute__ ((aligned (8))) =
+ 0x0ULL;
diff --git a/libc/sysdeps/powerpc/powerpc32/e500/nofpu/fenv_libc.h b/libc/sysdeps/powerpc/powerpc32/e500/nofpu/fenv_libc.h
new file mode 100644
index 000000000..96375808d
--- /dev/null
+++ b/libc/sysdeps/powerpc/powerpc32/e500/nofpu/fenv_libc.h
@@ -0,0 +1,96 @@
+/* Internal libc stuff for floating point environment routines. e500 version.
+ Copyright (C) 2004-2013 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#ifndef _FENV_LIBC_H
+#define _FENV_LIBC_H 1
+
+#include <fenv.h>
+
+int __feraiseexcept_spe (int);
+libm_hidden_proto (__feraiseexcept_spe)
+
+int __fexcepts_to_spe (int);
+libm_hidden_proto (__fexcepts_to_spe)
+
+int __fexcepts_from_spe (int);
+libm_hidden_proto (__fexcepts_from_spe)
+
+int __fexcepts_to_prctl (int);
+libm_hidden_proto (__fexcepts_to_prctl)
+
+int __fexcepts_from_prctl (int);
+libm_hidden_proto (__fexcepts_from_prctl)
+
+void __fe_note_change (void);
+libm_hidden_proto (__fe_note_change)
+
+/* Equivalent to fegetenv, but returns an unsigned int instead of
+ taking a pointer. */
+#define fegetenv_register() \
+ ({ unsigned int fscr; asm volatile ("mfspefscr %0" : "=r" (fscr)); fscr; })
+
+/* Equivalent to fesetenv, but takes an unsigned int instead of a
+ pointer. */
+#define fesetenv_register(fscr) \
+ ({ asm volatile ("mtspefscr %0" : : "r" (fscr)); })
+
+typedef union
+{
+ fenv_t fenv;
+ unsigned int l[2];
+} fenv_union_t;
+
+/* Definitions of all the SPEFSCR bit numbers. */
+enum {
+ SPEFSCR_SOVH = 0x80000000,
+ SPEFSCR_OVH = 0x40000000,
+ SPEFSCR_FGH = 0x20000000,
+ SPEFSCR_FXH = 0x10000000,
+ SPEFSCR_FINVH = 0x08000000,
+ SPEFSCR_FDBZH = 0x04000000,
+ SPEFSCR_FUNFH = 0x02000000,
+ SPEFSCR_FOVFH = 0x01000000,
+ /* 2 unused bits. */
+ SPEFSCR_FINXS = 0x00200000,
+ SPEFSCR_FINVS = 0x00100000,
+ SPEFSCR_FDBZS = 0x00080000,
+ SPEFSCR_FUNFS = 0x00040000,
+ SPEFSCR_FOVFS = 0x00020000,
+ /* Combination of the exception bits. */
+ SPEFSCR_ALL_EXCEPT = 0x003e0000,
+ SPEFSCR_MODE = 0x00010000,
+ SPEFSCR_SOV = 0x00008000,
+ SPEFSCR_OV = 0x00004000,
+ SPEFSCR_FG = 0x00002000,
+ SPEFSCR_FX = 0x00001000,
+ SPEFSCR_FINV = 0x00000800,
+ SPEFSCR_FDBZ = 0x00000400,
+ SPEFSCR_FUNF = 0x00000200,
+ SPEFSCR_FOVF = 0x00000100,
+ /* 1 unused bit. */
+ SPEFSCR_FINXE = 0x00000040,
+ SPEFSCR_FINVE = 0x00000020,
+ SPEFSCR_FDBZE = 0x00000010,
+ SPEFSCR_FUNFE = 0x00000008,
+ SPEFSCR_FOVFE = 0x00000004,
+ /* Combination of the exception trap enable bits. */
+ SPEFSCR_ALL_EXCEPT_ENABLE = 0x0000007c,
+ SPEFSCR_FRMC = 0x00000003
+};
+
+#endif /* fenv_libc.h */
diff --git a/libc/sysdeps/powerpc/powerpc32/e500/nofpu/fesetenv.c b/libc/sysdeps/powerpc/powerpc32/e500/nofpu/fesetenv.c
new file mode 100644
index 000000000..411e6be8d
--- /dev/null
+++ b/libc/sysdeps/powerpc/powerpc32/e500/nofpu/fesetenv.c
@@ -0,0 +1,49 @@
+/* Install given floating-point environment. e500 version.
+ Copyright (C) 1997-2013 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <fenv_libc.h>
+#include <sysdep.h>
+#include <sys/prctl.h>
+
+int
+__fesetenv (const fenv_t *envp)
+{
+ fenv_union_t u;
+ INTERNAL_SYSCALL_DECL (err);
+ int r;
+
+ u.fenv = *envp;
+
+ fesetenv_register (u.l[1]);
+ r = INTERNAL_SYSCALL (prctl, err, 2, PR_SET_FPEXC,
+ u.l[0] | PR_FP_EXC_SW_ENABLE);
+ if (INTERNAL_SYSCALL_ERROR_P (r, err))
+ return -1;
+
+ /* Success. */
+ return 0;
+}
+
+#include <shlib-compat.h>
+#if SHLIB_COMPAT (libm, GLIBC_2_1, GLIBC_2_2)
+strong_alias (__fesetenv, __old_fesetenv)
+compat_symbol (libm, __old_fesetenv, fesetenv, GLIBC_2_1);
+#endif
+
+libm_hidden_ver (__fesetenv, fesetenv)
+versioned_symbol (libm, __fesetenv, fesetenv, GLIBC_2_2);
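
__fesetenv splits the saved environment in two: the SPEFSCR word is written directly with mtspefscr, while the kernel's floating-point exception mode is restored through prctl (PR_SET_FPEXC). From the caller's side this split is invisible; a hedged usage sketch (not part of the patch):

/* Usage sketch only: save and restore the full environment, which
   exercises both halves handled above.  */
#include <fenv.h>

void
run_in_default_env (void (*fn) (void))
{
  fenv_t saved;
  fegetenv (&saved);
  fesetenv (FE_DFL_ENV);   /* default rounding, no traps, clear status */
  fn ();
  fesetenv (&saved);
}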
diff --git a/libc/sysdeps/powerpc/powerpc32/e500/nofpu/fesetround.c b/libc/sysdeps/powerpc/powerpc32/e500/nofpu/fesetround.c
new file mode 100644
index 000000000..805008e0c
--- /dev/null
+++ b/libc/sysdeps/powerpc/powerpc32/e500/nofpu/fesetround.c
@@ -0,0 +1,35 @@
+/* Set current rounding direction. e500 version.
+ Copyright (C) 2004-2013 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <fenv_libc.h>
+
+int
+fesetround (int round)
+{
+ unsigned long fpescr;
+
+ if ((unsigned int) round > 3)
+ return 1;
+
+ fpescr = fegetenv_register ();
+ fpescr = (fpescr & ~SPEFSCR_FRMC) | (round & 3);
+ fesetenv_register (fpescr);
+
+ return 0;
+}
+libm_hidden_def (fesetround)
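
fesetround only touches the two SPEFSCR_FRMC bits and follows C99 in returning nonzero when the requested mode is out of range. A hedged usage sketch (not part of the patch; assumes the target defines FE_TOWARDZERO):

/* Usage sketch only.  */
#include <fenv.h>

int
with_truncation (double x, double y, double *out)
{
  int old = fegetround ();
  if (fesetround (FE_TOWARDZERO) != 0)
    return -1;                      /* mode not supported */
  *out = x / y;                     /* rounded toward zero */
  fesetround (old);
  return 0;
}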
diff --git a/libc/sysdeps/powerpc/powerpc32/e500/nofpu/feupdateenv.c b/libc/sysdeps/powerpc/powerpc32/e500/nofpu/feupdateenv.c
new file mode 100644
index 000000000..505c92363
--- /dev/null
+++ b/libc/sysdeps/powerpc/powerpc32/e500/nofpu/feupdateenv.c
@@ -0,0 +1,47 @@
+/* Install given floating-point environment and raise exceptions.
+ e500 version.
+ Copyright (C) 2004-2013 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <fenv_libc.h>
+
+int
+__feupdateenv (const fenv_t *envp)
+{
+ int exc;
+
+ /* Save the currently set exceptions. */
+ exc = fegetenv_register () & SPEFSCR_ALL_EXCEPT;
+
+ /* Install new environment. */
+ fesetenv (envp);
+
+ /* Raise (if appropriate) saved exceptions. */
+ __feraiseexcept_spe (exc);
+
+ /* Success. */
+ return 0;
+}
+
+#include <shlib-compat.h>
+#if SHLIB_COMPAT (libm, GLIBC_2_1, GLIBC_2_2)
+strong_alias (__feupdateenv, __old_feupdateenv)
+compat_symbol (libm, __old_feupdateenv, feupdateenv, GLIBC_2_1);
+#endif
+
+libm_hidden_ver (__feupdateenv, feupdateenv)
+versioned_symbol (libm, __feupdateenv, feupdateenv, GLIBC_2_2);
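
__feupdateenv captures the currently raised SPEFSCR status bits, installs the new environment, then re-raises what was pending. That is exactly the contract needed by the standard feholdexcept/feupdateenv pairing; a hedged sketch of a caller (not part of the patch):

/* Usage sketch only.  */
#include <fenv.h>

double
guarded_div (double x, double y)
{
  fenv_t env;
  feholdexcept (&env);     /* save env, clear status, go non-stop */
  double r = x / y;        /* may set FE_DIVBYZERO, FE_INVALID, ... */
  feupdateenv (&env);      /* restore env, then re-raise saved status */
  return r;
}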
diff --git a/libc/sysdeps/powerpc/powerpc32/e500/nofpu/fexcepts_from_prctl.c b/libc/sysdeps/powerpc/powerpc32/e500/nofpu/fexcepts_from_prctl.c
new file mode 100644
index 000000000..c094a04cb
--- /dev/null
+++ b/libc/sysdeps/powerpc/powerpc32/e500/nofpu/fexcepts_from_prctl.c
@@ -0,0 +1,42 @@
+/* Convert floating-point exceptions from prctl form.
+ Copyright (C) 2013 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <fenv_libc.h>
+#include <sys/prctl.h>
+
+/* Convert EXCEPTS from prctl bits to FE_* form, returning the
+ converted value. */
+
+int
+__fexcepts_from_prctl (int excepts)
+{
+ int result = 0;
+ if (excepts & PR_FP_EXC_OVF)
+ result |= FE_OVERFLOW;
+ if (excepts & PR_FP_EXC_UND)
+ result |= FE_UNDERFLOW;
+ if (excepts & PR_FP_EXC_INV)
+ result |= FE_INVALID;
+ if (excepts & PR_FP_EXC_DIV)
+ result |= FE_DIVBYZERO;
+ if (excepts & PR_FP_EXC_RES)
+ result |= FE_INEXACT;
+ return result;
+}
+
+libm_hidden_def (__fexcepts_from_prctl)
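
Together with the three sibling files below, this gives all four conversions between FE_* bits, SPEFSCR status bits and prctl PR_FP_EXC_* bits. A hedged sketch of the round-trip property the rest of the port relies on (conceptual only, and not part of the patch; these helpers are internal to libc/libm):

/* Sketch only: converting a single FE_* bit to prctl form and back
   should be the identity.  */
#include <assert.h>
#include <fenv.h>

extern int __fexcepts_to_prctl (int);
extern int __fexcepts_from_prctl (int);

static void
check_roundtrip (void)
{
  static const int bits[] = { FE_INEXACT, FE_DIVBYZERO, FE_UNDERFLOW,
                              FE_OVERFLOW, FE_INVALID };
  for (unsigned int i = 0; i < sizeof bits / sizeof bits[0]; i++)
    assert (__fexcepts_from_prctl (__fexcepts_to_prctl (bits[i])) == bits[i]);
}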
diff --git a/libc/sysdeps/powerpc/powerpc32/e500/nofpu/fexcepts_from_spe.c b/libc/sysdeps/powerpc/powerpc32/e500/nofpu/fexcepts_from_spe.c
new file mode 100644
index 000000000..3ec939d18
--- /dev/null
+++ b/libc/sysdeps/powerpc/powerpc32/e500/nofpu/fexcepts_from_spe.c
@@ -0,0 +1,41 @@
+/* Convert floating-point exceptions from SPEFSCR form.
+ Copyright (C) 2013 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <fenv_libc.h>
+
+/* Convert EXCEPTS from SPEFSCR bits to FE_* form, returning the
+ converted value. */
+
+int
+__fexcepts_from_spe (int excepts)
+{
+ int result = 0;
+ if (excepts & SPEFSCR_FINXS)
+ result |= FE_INEXACT;
+ if (excepts & SPEFSCR_FDBZS)
+ result |= FE_DIVBYZERO;
+ if (excepts & SPEFSCR_FUNFS)
+ result |= FE_UNDERFLOW;
+ if (excepts & SPEFSCR_FOVFS)
+ result |= FE_OVERFLOW;
+ if (excepts & SPEFSCR_FINVS)
+ result |= FE_INVALID;
+ return result;
+}
+
+libm_hidden_def (__fexcepts_from_spe)
diff --git a/libc/sysdeps/powerpc/powerpc32/e500/nofpu/fexcepts_to_prctl.c b/libc/sysdeps/powerpc/powerpc32/e500/nofpu/fexcepts_to_prctl.c
new file mode 100644
index 000000000..b9c51b125
--- /dev/null
+++ b/libc/sysdeps/powerpc/powerpc32/e500/nofpu/fexcepts_to_prctl.c
@@ -0,0 +1,42 @@
+/* Convert floating-point exceptions to prctl form.
+ Copyright (C) 2013 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <fenv_libc.h>
+#include <sys/prctl.h>
+
+/* Convert EXCEPTS from FE_* form to prctl bits, returning the
+ converted value. */
+
+int
+__fexcepts_to_prctl (int excepts)
+{
+ int result = 0;
+ if (excepts & FE_INEXACT)
+ result |= PR_FP_EXC_RES;
+ if (excepts & FE_DIVBYZERO)
+ result |= PR_FP_EXC_DIV;
+ if (excepts & FE_UNDERFLOW)
+ result |= PR_FP_EXC_UND;
+ if (excepts & FE_OVERFLOW)
+ result |= PR_FP_EXC_OVF;
+ if (excepts & FE_INVALID)
+ result |= PR_FP_EXC_INV;
+ return result;
+}
+
+libm_hidden_def (__fexcepts_to_prctl)
diff --git a/libc/sysdeps/powerpc/powerpc32/e500/nofpu/fexcepts_to_spe.c b/libc/sysdeps/powerpc/powerpc32/e500/nofpu/fexcepts_to_spe.c
new file mode 100644
index 000000000..570934d15
--- /dev/null
+++ b/libc/sysdeps/powerpc/powerpc32/e500/nofpu/fexcepts_to_spe.c
@@ -0,0 +1,41 @@
+/* Convert floating-point exceptions to SPEFSCR form.
+ Copyright (C) 2013 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <fenv_libc.h>
+
+/* Convert EXCEPTS from FE_* form to SPEFSCR bits, returning the
+ converted value. */
+
+int
+__fexcepts_to_spe (int excepts)
+{
+ int result = 0;
+ if (excepts & FE_INEXACT)
+ result |= SPEFSCR_FINXS;
+ if (excepts & FE_DIVBYZERO)
+ result |= SPEFSCR_FDBZS;
+ if (excepts & FE_UNDERFLOW)
+ result |= SPEFSCR_FUNFS;
+ if (excepts & FE_OVERFLOW)
+ result |= SPEFSCR_FOVFS;
+ if (excepts & FE_INVALID)
+ result |= SPEFSCR_FINVS;
+ return result;
+}
+
+libm_hidden_def (__fexcepts_to_spe)
diff --git a/libc/sysdeps/powerpc/powerpc32/e500/nofpu/fgetexcptflg.c b/libc/sysdeps/powerpc/powerpc32/e500/nofpu/fgetexcptflg.c
new file mode 100644
index 000000000..b01cadeff
--- /dev/null
+++ b/libc/sysdeps/powerpc/powerpc32/e500/nofpu/fgetexcptflg.c
@@ -0,0 +1,41 @@
+/* Store current representation for exceptions. e500 version.
+ Copyright (C) 2004-2013 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <fenv_libc.h>
+
+int
+__fegetexceptflag (fexcept_t *flagp, int excepts)
+{
+ unsigned long fpescr;
+
+ /* Get the current state. */
+ fpescr = fegetenv_register ();
+
+ *flagp = fpescr & SPEFSCR_ALL_EXCEPT;
+
+ /* Success. */
+ return 0;
+}
+
+#include <shlib-compat.h>
+#if SHLIB_COMPAT (libm, GLIBC_2_1, GLIBC_2_2)
+strong_alias (__fegetexceptflag, __old_fegetexceptflag)
+compat_symbol (libm, __old_fegetexceptflag, fegetexceptflag, GLIBC_2_1);
+#endif
+
+versioned_symbol (libm, __fegetexceptflag, fegetexceptflag, GLIBC_2_2);
diff --git a/libc/sysdeps/powerpc/powerpc32/e500/nofpu/fraiseexcept-soft.c b/libc/sysdeps/powerpc/powerpc32/e500/nofpu/fraiseexcept-soft.c
new file mode 100644
index 000000000..0aed72ff3
--- /dev/null
+++ b/libc/sysdeps/powerpc/powerpc32/e500/nofpu/fraiseexcept-soft.c
@@ -0,0 +1,28 @@
+/* Raise given exceptions. e500 version for use from soft-fp.
+ Copyright (C) 2004-2013 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+ Contributed by Aldy Hernandez <aldyh@redhat.com>, 2004.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <fenv_libc.h>
+#include <libc-symbols.h>
+
+int __feraiseexcept_soft (int);
+libc_hidden_proto (__feraiseexcept_soft)
+
+#define __FERAISEEXCEPT_INTERNAL __feraiseexcept_soft
+#include "spe-raise.c"
+libc_hidden_def (__feraiseexcept_soft)
diff --git a/libc/sysdeps/powerpc/powerpc32/e500/nofpu/fraiseexcpt.c b/libc/sysdeps/powerpc/powerpc32/e500/nofpu/fraiseexcpt.c
new file mode 100644
index 000000000..0eca9ffff
--- /dev/null
+++ b/libc/sysdeps/powerpc/powerpc32/e500/nofpu/fraiseexcpt.c
@@ -0,0 +1,40 @@
+/* Raise given exceptions. e500 version.
+ Copyright (C) 2004-2013 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <fenv_libc.h>
+
+#define __FERAISEEXCEPT_INTERNAL __feraiseexcept_spe
+#include "spe-raise.c"
+
+libm_hidden_def (__feraiseexcept_spe)
+
+#undef feraiseexcept
+int
+__feraiseexcept (int excepts)
+{
+ return __feraiseexcept_spe (__fexcepts_to_spe (excepts));
+}
+
+#include <shlib-compat.h>
+#if SHLIB_COMPAT (libm, GLIBC_2_1, GLIBC_2_2)
+strong_alias (__feraiseexcept, __old_feraiseexcept)
+compat_symbol (libm, __old_feraiseexcept, feraiseexcept, GLIBC_2_1);
+#endif
+
+libm_hidden_ver (__feraiseexcept, feraiseexcept)
+versioned_symbol (libm, __feraiseexcept, feraiseexcept, GLIBC_2_2);
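
__feraiseexcept is a thin wrapper: it converts the FE_* mask to SPEFSCR form and hands it to __feraiseexcept_spe from spe-raise.c, which forces operations that set the corresponding status bits. A hedged sketch of the observable behaviour (not part of the patch):

/* Usage sketch only.  */
#include <fenv.h>

int
raise_and_observe (void)
{
  feclearexcept (FE_ALL_EXCEPT);
  feraiseexcept (FE_DIVBYZERO);
  return fetestexcept (FE_DIVBYZERO) != 0;   /* expected: 1 */
}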
diff --git a/libc/sysdeps/powerpc/powerpc32/e500/nofpu/fsetexcptflg.c b/libc/sysdeps/powerpc/powerpc32/e500/nofpu/fsetexcptflg.c
new file mode 100644
index 000000000..43f2d19d1
--- /dev/null
+++ b/libc/sysdeps/powerpc/powerpc32/e500/nofpu/fsetexcptflg.c
@@ -0,0 +1,55 @@
+/* Set floating-point environment exception handling. e500 version.
+ Copyright (C) 1997-2013 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <fenv_libc.h>
+
+int
+__fesetexceptflag (const fexcept_t *flagp, int excepts)
+{
+ unsigned long old_spefscr, spefscr;
+ fexcept_t flag;
+ int excepts_spe = __fexcepts_to_spe (excepts);
+
+ /* Get the current state. */
+ old_spefscr = fegetenv_register ();
+
+ /* Ignore exceptions not listed in 'excepts'. */
+ flag = *flagp & excepts_spe;
+
+ /* Replace the exception status. */
+ spefscr = (old_spefscr & ~excepts_spe) | flag;
+
+ /* Store the new status word (along with the rest of the environment). */
+ fesetenv_register (spefscr);
+
+ /* If the state of the "invalid" or "underflow" flag has changed,
+ inform the kernel. */
+ if (((spefscr ^ old_spefscr) & (SPEFSCR_FINVS | SPEFSCR_FUNFS)) != 0)
+ __fe_note_change ();
+
+ /* Success. */
+ return 0;
+}
+
+#include <shlib-compat.h>
+#if SHLIB_COMPAT (libm, GLIBC_2_1, GLIBC_2_2)
+strong_alias (__fesetexceptflag, __old_fesetexceptflag)
+compat_symbol (libm, __old_fesetexceptflag, fesetexceptflag, GLIBC_2_1);
+#endif
+
+versioned_symbol (libm, __fesetexceptflag, fesetexceptflag, GLIBC_2_2);
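
__fesetexceptflag restores only the flags selected by 'excepts' and, as the comment notes, tells the kernel via __fe_note_change when the invalid or underflow flag changed. The caller-side pattern it supports, as a hedged sketch (not part of the patch):

/* Usage sketch only: save selected flags, run code that may clobber
   them, then put them back.  */
#include <fenv.h>

void
preserve_flags (void (*fn) (void))
{
  fexcept_t saved;
  fegetexceptflag (&saved, FE_INVALID | FE_UNDERFLOW);
  fn ();
  fesetexceptflag (&saved, FE_INVALID | FE_UNDERFLOW);
}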
diff --git a/libc/sysdeps/powerpc/powerpc32/e500/nofpu/ftestexcept.c b/libc/sysdeps/powerpc/powerpc32/e500/nofpu/ftestexcept.c
new file mode 100644
index 000000000..f4f547d5f
--- /dev/null
+++ b/libc/sysdeps/powerpc/powerpc32/e500/nofpu/ftestexcept.c
@@ -0,0 +1,31 @@
+/* Test exception in current environment. e500 version.
+ Copyright (C) 2004-2013 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <fenv_libc.h>
+
+int
+fetestexcept (int excepts)
+{
+ unsigned long f;
+
+ /* Get the current state. */
+ f = fegetenv_register ();
+
+ return __fexcepts_from_spe (f) & excepts;
+}
+libm_hidden_def (fetestexcept)
diff --git a/libc/sysdeps/powerpc/powerpc32/e500/nofpu/get-rounding-mode.h b/libc/sysdeps/powerpc/powerpc32/e500/nofpu/get-rounding-mode.h
new file mode 100644
index 000000000..117e7331e
--- /dev/null
+++ b/libc/sysdeps/powerpc/powerpc32/e500/nofpu/get-rounding-mode.h
@@ -0,0 +1,4 @@
+/* The generic version of get-rounding-mode.h using fpu_control.h, not
+ the one using the software rounding mode, is correct for e500. */
+
+#include <sysdeps/generic/get-rounding-mode.h>
diff --git a/libc/sysdeps/powerpc/powerpc32/e500/nofpu/s_fabsf.S b/libc/sysdeps/powerpc/powerpc32/e500/nofpu/s_fabsf.S
new file mode 100644
index 000000000..823f748ba
--- /dev/null
+++ b/libc/sysdeps/powerpc/powerpc32/e500/nofpu/s_fabsf.S
@@ -0,0 +1,27 @@
+/* Floating-point absolute value. e500 version.
+ Copyright (C) 2004-2013 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <sysdep.h>
+
+ENTRY (__fabsf)
+/* float [r3] fabsf (float [r3] x) ; */
+ efsabs r3,r3
+ blr
+END (__fabsf)
+
+weak_alias (__fabsf, fabsf)
diff --git a/libc/sysdeps/powerpc/powerpc32/e500/nofpu/spe-raise.c b/libc/sysdeps/powerpc/powerpc32/e500/nofpu/spe-raise.c
new file mode 100644
index 000000000..4394ddc7c
--- /dev/null
+++ b/libc/sysdeps/powerpc/powerpc32/e500/nofpu/spe-raise.c
@@ -0,0 +1,53 @@
+/* Raise given exceptions, given the SPEFSCR bits for those exceptions.
+ Copyright (C) 1997-2013 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <fenv_libc.h>
+
+int
+__FERAISEEXCEPT_INTERNAL (int excepts)
+{
+ unsigned long f;
+
+ f = fegetenv_register ();
+ f |= (excepts & SPEFSCR_ALL_EXCEPT);
+ fesetenv_register (f);
+
+ /* Force the operations that cause the exceptions. */
+ if ((SPEFSCR_FINVS & excepts) != 0)
+ /* 0 / 0 */
+ asm volatile ("efsdiv %0,%0,%1" : : "r" (0), "r" (0));
+
+ if ((SPEFSCR_FDBZS & excepts) != 0)
+ /* 1.0 / 0.0 */
+ asm volatile ("efsdiv %0,%0,%1" : : "r" (1.0F), "r" (0));
+
+ if ((SPEFSCR_FOVFS & excepts) != 0)
+ /* Largest normalized number plus itself. */
+ asm volatile ("efsadd %0,%0,%1" : : "r" (0x7f7fffff), "r" (0x7f7fffff));
+
+ if ((SPEFSCR_FUNFS & excepts) != 0)
+ /* Smallest normalized number times itself. */
+ asm volatile ("efsmul %0,%0,%1" : : "r" (0x800000), "r" (0x800000));
+
+ if ((SPEFSCR_FINXS & excepts) != 0)
+ /* Smallest normalized minus 1.0 raises the inexact flag. */
+ asm volatile ("efssub %0,%0,%1" : : "r" (0x00800000), "r" (1.0F));
+
+ /* Success. */
+ return 0;
+}
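
The operands above are integer bit patterns that the efs* instructions interpret as IEEE single-precision values: 0x7f7fffff is FLT_MAX and 0x00800000 is FLT_MIN (the smallest normalized float), so adding or multiplying them forces the overflow and underflow status bits. A hedged sketch confirming the correspondence the comments assume (not part of the patch):

/* Sketch only: the integer constants above are single-precision bit
   patterns (assumes IEEE binary32 float).  */
#include <assert.h>
#include <float.h>
#include <string.h>

static unsigned int
float_bits (float f)
{
  unsigned int u;
  memcpy (&u, &f, sizeof u);
  return u;
}

static void
check_constants (void)
{
  assert (float_bits (FLT_MAX) == 0x7f7fffff);   /* overflow operand */
  assert (float_bits (FLT_MIN) == 0x00800000);   /* underflow operand */
}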
diff --git a/libc/sysdeps/powerpc/powerpc32/fpu/__longjmp-common.S b/libc/sysdeps/powerpc/powerpc32/fpu/__longjmp-common.S
index 9d34cd916..d02aa5754 100644
--- a/libc/sysdeps/powerpc/powerpc32/fpu/__longjmp-common.S
+++ b/libc/sysdeps/powerpc/powerpc32/fpu/__longjmp-common.S
@@ -43,16 +43,16 @@ ENTRY (__longjmp)
# endif
mtlr r6
cfi_same_value (lr)
- lwz r5,RTLD_GLOBAL_RO_DL_HWCAP_OFFSET+4(r5)
+ lwz r5,RTLD_GLOBAL_RO_DL_HWCAP_OFFSET+LOWORD(r5)
# else
lwz r5,_dl_hwcap@got(r5)
mtlr r6
cfi_same_value (lr)
- lwz r5,4(r5)
+ lwz r5,LOWORD(r5)
# endif
# else
- lis r5,(_dl_hwcap+4)@ha
- lwz r5,(_dl_hwcap+4)@l(r5)
+ lis r5,(_dl_hwcap+LOWORD)@ha
+ lwz r5,(_dl_hwcap+LOWORD)@l(r5)
# endif
andis. r5,r5,(PPC_FEATURE_HAS_ALTIVEC >> 16)
beq L(no_vmx)
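
This hunk, like several below, replaces hard-coded "+4" offsets with HIWORD/LOWORD so the 32-bit halves of a 64-bit value (_dl_hwcap here) are picked up correctly on both endiannesses. The macros are presumably defined along these lines in the powerpc32 sysdep headers; a hedged C sketch of the idea (not part of the patch):

/* Sketch only: how HIWORD/LOWORD select the byte offset of each 32-bit
   half of a 64-bit value in memory.  */
#include <string.h>

#ifdef __LITTLE_ENDIAN__
# define HIWORD 4
# define LOWORD 0
#else
# define HIWORD 0
# define LOWORD 4
#endif

static unsigned int
low_half (const unsigned long long *p)   /* e.g. the low word of _dl_hwcap */
{
  unsigned int w;
  memcpy (&w, (const char *) p + LOWORD, sizeof w);
  return w;
}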
diff --git a/libc/sysdeps/powerpc/powerpc32/fpu/__longjmp.S b/libc/sysdeps/powerpc/powerpc32/fpu/__longjmp.S
index 96e50de37..27166c454 100644
--- a/libc/sysdeps/powerpc/powerpc32/fpu/__longjmp.S
+++ b/libc/sysdeps/powerpc/powerpc32/fpu/__longjmp.S
@@ -26,14 +26,14 @@
#else /* !NOT_IN_libc */
/* Build a versioned object for libc. */
-default_symbol_version (__vmx__longjmp,__longjmp,GLIBC_2.3.4);
+versioned_symbol (libc, __vmx__longjmp, __longjmp, GLIBC_2_3_4);
# define __longjmp __vmx__longjmp
# include "__longjmp-common.S"
# if defined SHARED && SHLIB_COMPAT (libc, GLIBC_2_0, GLIBC_2_3_4)
# define __NO_VMX__
# undef JB_SIZE
-symbol_version (__novmx__longjmp,__longjmp,GLIBC_2.0);
+compat_symbol (libc, __novmx__longjmp, __longjmp, GLIBC_2_0);
# undef __longjmp
# define __longjmp __novmx__longjmp
# include "__longjmp-common.S"
diff --git a/libc/sysdeps/powerpc/powerpc32/fpu/s_copysign.S b/libc/sysdeps/powerpc/powerpc32/fpu/s_copysign.S
index 840891f1c..1da24f492 100644
--- a/libc/sysdeps/powerpc/powerpc32/fpu/s_copysign.S
+++ b/libc/sysdeps/powerpc/powerpc32/fpu/s_copysign.S
@@ -29,7 +29,7 @@ ENTRY(__copysign)
stwu r1,-16(r1)
cfi_adjust_cfa_offset (16)
stfd fp2,8(r1)
- lwz r3,8(r1)
+ lwz r3,8+HIWORD(r1)
cmpwi r3,0
addi r1,r1,16
cfi_adjust_cfa_offset (-16)
diff --git a/libc/sysdeps/powerpc/powerpc32/fpu/s_copysignl.S b/libc/sysdeps/powerpc/powerpc32/fpu/s_copysignl.S
index 4ec8389b5..2ad6de273 100644
--- a/libc/sysdeps/powerpc/powerpc32/fpu/s_copysignl.S
+++ b/libc/sysdeps/powerpc/powerpc32/fpu/s_copysignl.S
@@ -30,7 +30,7 @@ ENTRY(__copysignl)
fmr fp0,fp1
fabs fp1,fp1
fcmpu cr7,fp0,fp1
- lwz r3,8(r1)
+ lwz r3,8+HIWORD(r1)
cmpwi cr6,r3,0
addi r1,r1,16
cfi_adjust_cfa_offset (-16)
diff --git a/libc/sysdeps/powerpc/powerpc32/fpu/s_lrint.S b/libc/sysdeps/powerpc/powerpc32/fpu/s_lrint.S
index 27881f8cc..249fda501 100644
--- a/libc/sysdeps/powerpc/powerpc32/fpu/s_lrint.S
+++ b/libc/sysdeps/powerpc/powerpc32/fpu/s_lrint.S
@@ -24,10 +24,10 @@ ENTRY (__lrint)
stwu r1,-16(r1)
fctiw fp13,fp1
stfd fp13,8(r1)
- nop /* Insure the following load is in a different dispatch group */
+ nop /* Ensure the following load is in a different dispatch group */
nop /* to avoid pipe stall on POWER4&5. */
nop
- lwz r3,12(r1)
+ lwz r3,8+LOWORD(r1)
addi r1,r1,16
blr
END (__lrint)
diff --git a/libc/sysdeps/powerpc/powerpc32/fpu/s_lround.S b/libc/sysdeps/powerpc/powerpc32/fpu/s_lround.S
index 92dc3787d..6309f864b 100644
--- a/libc/sysdeps/powerpc/powerpc32/fpu/s_lround.S
+++ b/libc/sysdeps/powerpc/powerpc32/fpu/s_lround.S
@@ -67,7 +67,7 @@ ENTRY (__lround)
nop /* Ensure the following load is in a different dispatch */
nop /* group to avoid pipe stall on POWER4&5. */
nop
- lwz r3,12(r1) /* Load return as integer. */
+ lwz r3,8+LOWORD(r1) /* Load return as integer. */
.Lout:
addi r1,r1,16
blr
diff --git a/libc/sysdeps/powerpc/powerpc32/fpu/s_roundf.S b/libc/sysdeps/powerpc/powerpc32/fpu/s_roundf.S
index 2ed9ca7b4..8cff1563a 100644
--- a/libc/sysdeps/powerpc/powerpc32/fpu/s_roundf.S
+++ b/libc/sysdeps/powerpc/powerpc32/fpu/s_roundf.S
@@ -19,7 +19,7 @@
#include <sysdep.h>
.section .rodata.cst8,"aM",@progbits,8
- .align 2
+ .align 3
.LC0: /* 2**23 */
.long 0x4b000000
.LC1: /* 0.5 */
@@ -60,7 +60,6 @@ ENTRY (__roundf )
#ifdef SHARED
lfs fp10,.LC1-.LC0(r9)
#else
- lis r9,.LC1@ha
lfs fp10,.LC1@l(r9)
#endif
ble- cr6,.L4
diff --git a/libc/sysdeps/powerpc/powerpc32/fpu/setjmp-common.S b/libc/sysdeps/powerpc/powerpc32/fpu/setjmp-common.S
index 46ea2b00f..f3244060e 100644
--- a/libc/sysdeps/powerpc/powerpc32/fpu/setjmp-common.S
+++ b/libc/sysdeps/powerpc/powerpc32/fpu/setjmp-common.S
@@ -94,14 +94,14 @@ ENTRY (__sigsetjmp)
# else
lwz r5,_rtld_global_ro@got(r5)
# endif
- lwz r5,RTLD_GLOBAL_RO_DL_HWCAP_OFFSET+4(r5)
+ lwz r5,RTLD_GLOBAL_RO_DL_HWCAP_OFFSET+LOWORD(r5)
# else
lwz r5,_dl_hwcap@got(r5)
- lwz r5,4(r5)
+ lwz r5,LOWORD(r5)
# endif
# else
- lis r6,(_dl_hwcap+4)@ha
- lwz r5,(_dl_hwcap+4)@l(r6)
+ lis r6,(_dl_hwcap+LOWORD)@ha
+ lwz r5,(_dl_hwcap+LOWORD)@l(r6)
# endif
andis. r5,r5,(PPC_FEATURE_HAS_ALTIVEC >> 16)
beq L(no_vmx)
@@ -111,44 +111,43 @@ ENTRY (__sigsetjmp)
stw r0,((JB_VRSAVE)*4)(3)
addi r6,r5,16
beq+ L(aligned_save_vmx)
- lvsr v0,0,r5
- vspltisb v1,-1 /* set v1 to all 1's */
- vspltisb v2,0 /* set v2 to all 0's */
- vperm v3,v2,v1,v0 /* v3 contains shift mask with num all 1 bytes on left = misalignment */
+ lvsr v0,0,r5
+ lvsl v1,0,r5
+ addi r6,r5,-16
- /* Special case for v20 we need to preserve what is in save area below v20 before obliterating it */
- lvx v5,0,r5
- vperm v20,v20,v20,v0
- vsel v5,v5,v20,v3
- vsel v20,v20,v2,v3
- stvx v5,0,r5
+# define save_misaligned_vmx(savevr,prevvr,shiftvr,tmpvr,savegpr,addgpr) \
+ addi addgpr,addgpr,32; \
+ vperm tmpvr,prevvr,savevr,shiftvr; \
+ stvx tmpvr,0,savegpr
-#define save_2vmx_partial(savevr,prev_savevr,hivr,shiftvr,maskvr,savegpr,addgpr) \
- addi addgpr,addgpr,32; \
- vperm savevr,savevr,savevr,shiftvr; \
- vsel hivr,prev_savevr,savevr,maskvr; \
- stvx hivr,0,savegpr;
+ /*
+ * We have to be careful not to corrupt the data below v20 and
+ * above v31. To keep things simple we just rotate both ends in
+ * the opposite direction to our main permute so we can use
+ * the common macro.
+ */
- save_2vmx_partial(v21,v20,v5,v0,v3,r6,r5)
- save_2vmx_partial(v22,v21,v5,v0,v3,r5,r6)
- save_2vmx_partial(v23,v22,v5,v0,v3,r6,r5)
- save_2vmx_partial(v24,v23,v5,v0,v3,r5,r6)
- save_2vmx_partial(v25,v24,v5,v0,v3,r6,r5)
- save_2vmx_partial(v26,v25,v5,v0,v3,r5,r6)
- save_2vmx_partial(v27,v26,v5,v0,v3,r6,r5)
- save_2vmx_partial(v28,v27,v5,v0,v3,r5,r6)
- save_2vmx_partial(v29,v28,v5,v0,v3,r6,r5)
- save_2vmx_partial(v30,v29,v5,v0,v3,r5,r6)
+ /* load and rotate data below v20 */
+ lvx v2,0,r5
+ vperm v2,v2,v2,v1
+ save_misaligned_vmx(v20,v2,v0,v3,r5,r6)
+ save_misaligned_vmx(v21,v20,v0,v3,r6,r5)
+ save_misaligned_vmx(v22,v21,v0,v3,r5,r6)
+ save_misaligned_vmx(v23,v22,v0,v3,r6,r5)
+ save_misaligned_vmx(v24,v23,v0,v3,r5,r6)
+ save_misaligned_vmx(v25,v24,v0,v3,r6,r5)
+ save_misaligned_vmx(v26,v25,v0,v3,r5,r6)
+ save_misaligned_vmx(v27,v26,v0,v3,r6,r5)
+ save_misaligned_vmx(v28,v27,v0,v3,r5,r6)
+ save_misaligned_vmx(v29,v28,v0,v3,r6,r5)
+ save_misaligned_vmx(v30,v29,v0,v3,r5,r6)
+ save_misaligned_vmx(v31,v30,v0,v3,r6,r5)
+ /* load and rotate data above v31 */
+ lvx v2,0,r6
+ vperm v2,v2,v2,v1
+ save_misaligned_vmx(v2,v31,v0,v3,r5,r6)
- /* Special case for r31 we need to preserve what is in save area above v31 before obliterating it */
- addi r5,r5,32
- vperm v31,v31,v31,v0
- lvx v4,0,r5
- vsel v5,v30,v31,v3
- stvx v5,0,r6
- vsel v4,v31,v4,v3
- stvx v4,0,r5
b L(no_vmx)
L(aligned_save_vmx):
diff --git a/libc/sysdeps/powerpc/powerpc32/fpu/setjmp.S b/libc/sysdeps/powerpc/powerpc32/fpu/setjmp.S
index 60cd35052..92acff1e6 100644
--- a/libc/sysdeps/powerpc/powerpc32/fpu/setjmp.S
+++ b/libc/sysdeps/powerpc/powerpc32/fpu/setjmp.S
@@ -26,7 +26,7 @@
#else /* !NOT_IN_libc */
/* Build a versioned object for libc. */
-default_symbol_version (__vmx__sigsetjmp,__sigsetjmp,GLIBC_2.3.4)
+versioned_symbol (libc, __vmx__sigsetjmp, __sigsetjmp, GLIBC_2_3_4)
# define __sigsetjmp __vmx__sigsetjmp
# define __sigjmp_save __vmx__sigjmp_save
# include "setjmp-common.S"
@@ -36,7 +36,7 @@ default_symbol_version (__vmx__sigsetjmp,__sigsetjmp,GLIBC_2.3.4)
# undef __sigsetjmp
# undef __sigjmp_save
# undef JB_SIZE
-symbol_version (__novmx__sigsetjmp,__sigsetjmp,GLIBC_2.0)
+compat_symbol (libc, __novmx__sigsetjmp, __sigsetjmp, GLIBC_2_0)
# define __sigsetjmp __novmx__sigsetjmp
# define __sigjmp_save __novmx__sigjmp_save
# include "setjmp-common.S"
diff --git a/libc/sysdeps/powerpc/powerpc32/mcount.c b/libc/sysdeps/powerpc/powerpc32/mcount.c
index 0476bf61d..d8c063222 100644
--- a/libc/sysdeps/powerpc/powerpc32/mcount.c
+++ b/libc/sysdeps/powerpc/powerpc32/mcount.c
@@ -9,7 +9,7 @@
/* __mcount_internal was added in glibc 2.15 with version GLIBC_PRIVATE,
but it should have been put in version GLIBC_2.15. Mark the
GLIBC_PRIVATE version obsolete and add it to GLIBC_2.16 instead. */
-default_symbol_version (___mcount_internal, __mcount_internal, GLIBC_2.16);
+versioned_symbol (libc, ___mcount_internal, __mcount_internal, GLIBC_2_16);
#if SHLIB_COMPAT (libc, GLIBC_2_15, GLIBC_2_16)
strong_alias (___mcount_internal, ___mcount_internal_private);
diff --git a/libc/sysdeps/powerpc/powerpc32/power4/fpu/s_llrint.S b/libc/sysdeps/powerpc/powerpc32/power4/fpu/s_llrint.S
index 55b2850fd..e7a88feb4 100644
--- a/libc/sysdeps/powerpc/powerpc32/power4/fpu/s_llrint.S
+++ b/libc/sysdeps/powerpc/powerpc32/power4/fpu/s_llrint.S
@@ -29,8 +29,8 @@ ENTRY (__llrint)
nop /* Ensure the following load is in a different dispatch group */
nop /* to avoid pipe stall on POWER4&5. */
nop
- lwz r3,8(r1)
- lwz r4,12(r1)
+ lwz r3,8+HIWORD(r1)
+ lwz r4,8+LOWORD(r1)
addi r1,r1,16
blr
END (__llrint)
diff --git a/libc/sysdeps/powerpc/powerpc32/power4/fpu/s_llrintf.S b/libc/sysdeps/powerpc/powerpc32/power4/fpu/s_llrintf.S
index cc80fcb02..da24ad38d 100644
--- a/libc/sysdeps/powerpc/powerpc32/power4/fpu/s_llrintf.S
+++ b/libc/sysdeps/powerpc/powerpc32/power4/fpu/s_llrintf.S
@@ -28,8 +28,8 @@ ENTRY (__llrintf)
nop /* Ensure the following load is in a different dispatch group */
nop /* to avoid pipe stall on POWER4&5. */
nop
- lwz r3,8(r1)
- lwz r4,12(r1)
+ lwz r3,8+HIWORD(r1)
+ lwz r4,8+LOWORD(r1)
addi r1,r1,16
blr
END (__llrintf)
diff --git a/libc/sysdeps/powerpc/powerpc32/power4/fpu/s_llround.S b/libc/sysdeps/powerpc/powerpc32/power4/fpu/s_llround.S
index 631180f07..7246ca4d1 100644
--- a/libc/sysdeps/powerpc/powerpc32/power4/fpu/s_llround.S
+++ b/libc/sysdeps/powerpc/powerpc32/power4/fpu/s_llround.S
@@ -19,12 +19,10 @@
#include <sysdep.h>
#include <math_ldbl_opt.h>
- .section .rodata.cst12,"aM",@progbits,12
+ .section .rodata.cst8,"aM",@progbits,8
.align 3
- .LC0: /* 0x1.0000000000000p+52 == 2^52 */
- .long 0x43300000
- .long 0x00000000
- .long 0x3f000000 /* Use this for 0.5 */
+ .LC0: .long (52+127)<<23 /* 0x1p+52 */
+ .long (-1+127)<<23 /* 0.5 */
.section ".text"
@@ -57,12 +55,12 @@ ENTRY (__llround)
addi r9,r9,.LC0-got_label@l
mtlr r11
cfi_same_value (lr)
- lfd fp9,0(r9)
- lfs fp10,8(r9)
+ lfs fp9,0(r9)
+ lfs fp10,4(r9)
#else
lis r9,.LC0@ha
- lfd fp9,.LC0@l(r9) /* Load 2^52 into fpr9. */
- lfs fp10,.LC0@l+8(r9) /* Load 0.5 into fpr10. */
+ lfs fp9,.LC0@l(r9) /* Load 2^52 into fpr9. */
+ lfs fp10,.LC0@l+4(r9) /* Load 0.5 into fpr10. */
#endif
fabs fp2,fp1 /* Get the absolute value of x. */
fsub fp12,fp10,fp10 /* Compute 0.0 into fpr12. */
@@ -80,8 +78,8 @@ ENTRY (__llround)
nop
nop
nop
- lwz r4,12(r1) /* Load return as integer. */
- lwz r3,8(r1)
+ lwz r3,8+HIWORD(r1) /* Load return as integer. */
+ lwz r4,8+LOWORD(r1)
.Lout:
addi r1,r1,16
blr
diff --git a/libc/sysdeps/powerpc/powerpc32/power4/hp-timing.h b/libc/sysdeps/powerpc/powerpc32/power4/hp-timing.h
index 7d6c96e9e..4e42374ea 100644
--- a/libc/sysdeps/powerpc/powerpc32/power4/hp-timing.h
+++ b/libc/sysdeps/powerpc/powerpc32/power4/hp-timing.h
@@ -87,18 +87,15 @@ typedef unsigned long long int hp_timing_t;
#define HP_TIMING_NOW(Var) \
do { \
- union { long long ll; long ii[2]; } _var; \
- long tmp; \
- __asm__ __volatile__ ( \
- "1: mfspr %0,269;" \
- " mfspr %1,268;" \
- " mfspr %2,269;" \
- " cmpw %0,%2;" \
- " bne 1b;" \
- : "=r" (_var.ii[0]), "=r" (_var.ii[1]) , "=r" (tmp) \
- : : "cr0" \
- ); \
- Var = _var.ll; \
+ unsigned int hi, lo, tmp; \
+ __asm__ __volatile__ ("1: mfspr %0,269;" \
+ " mfspr %1,268;" \
+ " mfspr %2,269;" \
+ " cmpw %0,%2;" \
+ " bne 1b;" \
+ : "=&r" (hi), "=&r" (lo), "=&r" (tmp) \
+ : : "cr0"); \
+ Var = ((hp_timing_t) hi << 32) | lo; \
} while (0)
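
The rewritten HP_TIMING_NOW keeps the same retry loop (SPR 269 is the time base upper half, SPR 268 the lower half) but builds the 64-bit result from two explicit 32-bit temporaries instead of a union, and marks the asm outputs early-clobber. The loop in C form, as a hedged sketch (not part of the patch); read_tbu/read_tbl are hypothetical stand-ins for the two mfspr reads:

/* Sketch only: the torn-read retry loop above.  read_tbu()/read_tbl()
   are hypothetical wrappers for mfspr 269/268, not real glibc calls.  */
extern unsigned int read_tbu (void);
extern unsigned int read_tbl (void);

static unsigned long long
read_timebase (void)
{
  unsigned int hi, lo, again;
  do
    {
      hi = read_tbu ();
      lo = read_tbl ();
      again = read_tbu ();   /* re-read: TBL may have carried into TBU */
    }
  while (hi != again);
  return ((unsigned long long) hi << 32) | lo;
}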
diff --git a/libc/sysdeps/powerpc/powerpc32/power4/memcmp.S b/libc/sysdeps/powerpc/powerpc32/power4/memcmp.S
index 9a455a3c6..35e162667 100644
--- a/libc/sysdeps/powerpc/powerpc32/power4/memcmp.S
+++ b/libc/sysdeps/powerpc/powerpc32/power4/memcmp.S
@@ -1,4 +1,4 @@
-/* Optimized strcmp implementation for PowerPC64.
+/* Optimized strcmp implementation for PowerPC32.
Copyright (C) 2003-2013 Free Software Foundation, Inc.
This file is part of the GNU C Library.
@@ -18,13 +18,14 @@
#include <sysdep.h>
-/* int [r3] memcmp (const char *s1 [r3], const char *s2 [r4], size_t size [r5]) */
+/* int [r3] memcmp (const char *s1 [r3],
+ const char *s2 [r4],
+ size_t size [r5]) */
.machine power4
EALIGN (memcmp, 4, 0)
CALL_MCOUNT
-#define rTMP r0
#define rRTN r3
#define rSTR1 r3 /* first string arg */
#define rSTR2 r4 /* second string arg */
@@ -35,33 +36,32 @@ EALIGN (memcmp, 4, 0)
#define rWORD4 r9 /* next word in s2 */
#define rWORD5 r10 /* next word in s1 */
#define rWORD6 r11 /* next word in s2 */
-#define rBITDIF r12 /* bits that differ in s1 & s2 words */
#define rWORD7 r30 /* next word in s1 */
#define rWORD8 r31 /* next word in s2 */
- xor rTMP, rSTR2, rSTR1
+ xor r0, rSTR2, rSTR1
cmplwi cr6, rN, 0
cmplwi cr1, rN, 12
- clrlwi. rTMP, rTMP, 30
- clrlwi rBITDIF, rSTR1, 30
- cmplwi cr5, rBITDIF, 0
+ clrlwi. r0, r0, 30
+ clrlwi r12, rSTR1, 30
+ cmplwi cr5, r12, 0
beq- cr6, L(zeroLength)
- dcbt 0,rSTR1
- dcbt 0,rSTR2
+ dcbt 0, rSTR1
+ dcbt 0, rSTR2
/* If less than 8 bytes or not aligned, use the unaligned
byte loop. */
blt cr1, L(bytealigned)
- stwu 1,-64(1)
+ stwu 1, -64(r1)
cfi_adjust_cfa_offset(64)
- stw r31,48(1)
- cfi_offset(31,(48-64))
- stw r30,44(1)
- cfi_offset(30,(44-64))
+ stw rWORD8, 48(r1)
+ cfi_offset(rWORD8, (48-64))
+ stw rWORD7, 44(r1)
+ cfi_offset(rWORD7, (44-64))
bne L(unaligned)
/* At this point we know both strings have the same alignment and the
- compare length is at least 8 bytes. rBITDIF contains the low order
+ compare length is at least 8 bytes. r12 contains the low order
2 bits of rSTR1 and cr5 contains the result of the logical compare
- of rBITDIF to 0. If rBITDIF == 0 then we are already word
+ of r12 to 0. If r12 == 0 then we are already word
aligned and can perform the word aligned loop.
Otherwise we know the two strings have the same alignment (but not
@@ -70,74 +70,95 @@ EALIGN (memcmp, 4, 0)
eliminate bits preceding the first byte. Since we want to join the
normal (word aligned) compare loop, starting at the second word,
we need to adjust the length (rN) and special case the loop
- versioning for the first word. This insures that the loop count is
+ versioning for the first word. This ensures that the loop count is
correct and the first word (shifted) is in the expected register pair. */
- .align 4
+ .align 4
L(samealignment):
clrrwi rSTR1, rSTR1, 2
clrrwi rSTR2, rSTR2, 2
beq cr5, L(Waligned)
- add rN, rN, rBITDIF
- slwi r11, rBITDIF, 3
- srwi rTMP, rN, 4 /* Divide by 16 */
- andi. rBITDIF, rN, 12 /* Get the word remainder */
+ add rN, rN, r12
+ slwi rWORD6, r12, 3
+ srwi r0, rN, 4 /* Divide by 16 */
+ andi. r12, rN, 12 /* Get the word remainder */
+#ifdef __LITTLE_ENDIAN__
+ lwbrx rWORD1, 0, rSTR1
+ lwbrx rWORD2, 0, rSTR2
+ addi rSTR1, rSTR1, 4
+ addi rSTR2, rSTR2, 4
+#else
lwz rWORD1, 0(rSTR1)
lwz rWORD2, 0(rSTR2)
- cmplwi cr1, rBITDIF, 8
+#endif
+ cmplwi cr1, r12, 8
cmplwi cr7, rN, 16
clrlwi rN, rN, 30
beq L(dPs4)
- mtctr rTMP /* Power4 wants mtctr 1st in dispatch group */
+ mtctr r0 /* Power4 wants mtctr 1st in dispatch group */
bgt cr1, L(dPs3)
beq cr1, L(dPs2)
/* Remainder is 4 */
- .align 3
+ .align 3
L(dsP1):
- slw rWORD5, rWORD1, r11
- slw rWORD6, rWORD2, r11
+ slw rWORD5, rWORD1, rWORD6
+ slw rWORD6, rWORD2, rWORD6
cmplw cr5, rWORD5, rWORD6
blt cr7, L(dP1x)
/* Do something useful in this cycle since we have to branch anyway. */
+#ifdef __LITTLE_ENDIAN__
+ lwbrx rWORD1, 0, rSTR1
+ lwbrx rWORD2, 0, rSTR2
+ addi rSTR1, rSTR1, 4
+ addi rSTR2, rSTR2, 4
+#else
lwz rWORD1, 4(rSTR1)
lwz rWORD2, 4(rSTR2)
- cmplw cr0, rWORD1, rWORD2
+#endif
+ cmplw cr7, rWORD1, rWORD2
b L(dP1e)
/* Remainder is 8 */
- .align 4
+ .align 4
L(dPs2):
- slw rWORD5, rWORD1, r11
- slw rWORD6, rWORD2, r11
+ slw rWORD5, rWORD1, rWORD6
+ slw rWORD6, rWORD2, rWORD6
cmplw cr6, rWORD5, rWORD6
blt cr7, L(dP2x)
/* Do something useful in this cycle since we have to branch anyway. */
+#ifdef __LITTLE_ENDIAN__
+ lwbrx rWORD7, 0, rSTR1
+ lwbrx rWORD8, 0, rSTR2
+ addi rSTR1, rSTR1, 4
+ addi rSTR2, rSTR2, 4
+#else
lwz rWORD7, 4(rSTR1)
lwz rWORD8, 4(rSTR2)
+#endif
cmplw cr5, rWORD7, rWORD8
b L(dP2e)
/* Remainder is 12 */
- .align 4
+ .align 4
L(dPs3):
- slw rWORD3, rWORD1, r11
- slw rWORD4, rWORD2, r11
+ slw rWORD3, rWORD1, rWORD6
+ slw rWORD4, rWORD2, rWORD6
cmplw cr1, rWORD3, rWORD4
b L(dP3e)
/* Count is a multiple of 16, remainder is 0 */
- .align 4
+ .align 4
L(dPs4):
- mtctr rTMP /* Power4 wants mtctr 1st in dispatch group */
- slw rWORD1, rWORD1, r11
- slw rWORD2, rWORD2, r11
- cmplw cr0, rWORD1, rWORD2
+ mtctr r0 /* Power4 wants mtctr 1st in dispatch group */
+ slw rWORD1, rWORD1, rWORD6
+ slw rWORD2, rWORD2, rWORD6
+ cmplw cr7, rWORD1, rWORD2
b L(dP4e)
/* At this point we know both strings are word aligned and the
compare length is at least 8 bytes. */
- .align 4
+ .align 4
L(Waligned):
- andi. rBITDIF, rN, 12 /* Get the word remainder */
- srwi rTMP, rN, 4 /* Divide by 16 */
- cmplwi cr1, rBITDIF, 8
+ andi. r12, rN, 12 /* Get the word remainder */
+ srwi r0, rN, 4 /* Divide by 16 */
+ cmplwi cr1, r12, 8
cmplwi cr7, rN, 16
clrlwi rN, rN, 30
beq L(dP4)
@@ -145,177 +166,352 @@ L(Waligned):
beq cr1, L(dP2)
/* Remainder is 4 */
- .align 4
+ .align 4
L(dP1):
- mtctr rTMP /* Power4 wants mtctr 1st in dispatch group */
+ mtctr r0 /* Power4 wants mtctr 1st in dispatch group */
/* Normally we'd use rWORD7/rWORD8 here, but since we might exit early
(8-15 byte compare), we want to use only volatile registers. This
means we can avoid restoring non-volatile registers since we did not
change any on the early exit path. The key here is the non-early
exit path only cares about the condition code (cr5), not about which
register pair was used. */
+#ifdef __LITTLE_ENDIAN__
+ lwbrx rWORD5, 0, rSTR1
+ lwbrx rWORD6, 0, rSTR2
+ addi rSTR1, rSTR1, 4
+ addi rSTR2, rSTR2, 4
+#else
lwz rWORD5, 0(rSTR1)
lwz rWORD6, 0(rSTR2)
+#endif
cmplw cr5, rWORD5, rWORD6
blt cr7, L(dP1x)
+#ifdef __LITTLE_ENDIAN__
+ lwbrx rWORD1, 0, rSTR1
+ lwbrx rWORD2, 0, rSTR2
+ addi rSTR1, rSTR1, 4
+ addi rSTR2, rSTR2, 4
+#else
lwz rWORD1, 4(rSTR1)
lwz rWORD2, 4(rSTR2)
- cmplw cr0, rWORD1, rWORD2
+#endif
+ cmplw cr7, rWORD1, rWORD2
L(dP1e):
+#ifdef __LITTLE_ENDIAN__
+ lwbrx rWORD3, 0, rSTR1
+ lwbrx rWORD4, 0, rSTR2
+ addi rSTR1, rSTR1, 4
+ addi rSTR2, rSTR2, 4
+#else
lwz rWORD3, 8(rSTR1)
lwz rWORD4, 8(rSTR2)
+#endif
cmplw cr1, rWORD3, rWORD4
+#ifdef __LITTLE_ENDIAN__
+ lwbrx rWORD5, 0, rSTR1
+ lwbrx rWORD6, 0, rSTR2
+ addi rSTR1, rSTR1, 4
+ addi rSTR2, rSTR2, 4
+#else
lwz rWORD5, 12(rSTR1)
lwz rWORD6, 12(rSTR2)
+#endif
cmplw cr6, rWORD5, rWORD6
- bne cr5, L(dLcr5)
- bne cr0, L(dLcr0)
+ bne cr5, L(dLcr5x)
+ bne cr7, L(dLcr7x)
+#ifdef __LITTLE_ENDIAN__
+ lwbrx rWORD7, 0, rSTR1
+ lwbrx rWORD8, 0, rSTR2
+ addi rSTR1, rSTR1, 4
+ addi rSTR2, rSTR2, 4
+#else
lwzu rWORD7, 16(rSTR1)
lwzu rWORD8, 16(rSTR2)
+#endif
bne cr1, L(dLcr1)
cmplw cr5, rWORD7, rWORD8
bdnz L(dLoop)
bne cr6, L(dLcr6)
- lwz r30,44(1)
- lwz r31,48(1)
- .align 3
+ lwz rWORD7, 44(r1)
+ lwz rWORD8, 48(r1)
+ .align 3
L(dP1x):
slwi. r12, rN, 3
- bne cr5, L(dLcr5)
+ bne cr5, L(dLcr5x)
subfic rN, r12, 32 /* Shift count is 32 - (rN * 8). */
- lwz 1,0(1)
+ addi 1, 1, 64
+ cfi_adjust_cfa_offset(-64)
bne L(d00)
li rRTN, 0
blr
/* Remainder is 8 */
- .align 4
+ .align 4
+ cfi_adjust_cfa_offset(64)
L(dP2):
- mtctr rTMP /* Power4 wants mtctr 1st in dispatch group */
+ mtctr r0 /* Power4 wants mtctr 1st in dispatch group */
+#ifdef __LITTLE_ENDIAN__
+ lwbrx rWORD5, 0, rSTR1
+ lwbrx rWORD6, 0, rSTR2
+ addi rSTR1, rSTR1, 4
+ addi rSTR2, rSTR2, 4
+#else
lwz rWORD5, 0(rSTR1)
lwz rWORD6, 0(rSTR2)
+#endif
cmplw cr6, rWORD5, rWORD6
blt cr7, L(dP2x)
+#ifdef __LITTLE_ENDIAN__
+ lwbrx rWORD7, 0, rSTR1
+ lwbrx rWORD8, 0, rSTR2
+ addi rSTR1, rSTR1, 4
+ addi rSTR2, rSTR2, 4
+#else
lwz rWORD7, 4(rSTR1)
lwz rWORD8, 4(rSTR2)
+#endif
cmplw cr5, rWORD7, rWORD8
L(dP2e):
+#ifdef __LITTLE_ENDIAN__
+ lwbrx rWORD1, 0, rSTR1
+ lwbrx rWORD2, 0, rSTR2
+ addi rSTR1, rSTR1, 4
+ addi rSTR2, rSTR2, 4
+#else
lwz rWORD1, 8(rSTR1)
lwz rWORD2, 8(rSTR2)
- cmplw cr0, rWORD1, rWORD2
+#endif
+ cmplw cr7, rWORD1, rWORD2
+#ifdef __LITTLE_ENDIAN__
+ lwbrx rWORD3, 0, rSTR1
+ lwbrx rWORD4, 0, rSTR2
+ addi rSTR1, rSTR1, 4
+ addi rSTR2, rSTR2, 4
+#else
lwz rWORD3, 12(rSTR1)
lwz rWORD4, 12(rSTR2)
+#endif
cmplw cr1, rWORD3, rWORD4
+#ifndef __LITTLE_ENDIAN__
addi rSTR1, rSTR1, 4
addi rSTR2, rSTR2, 4
+#endif
bne cr6, L(dLcr6)
bne cr5, L(dLcr5)
b L(dLoop2)
/* Again we are on an early exit path (16-23 byte compare), we want to
only use volatile registers and avoid restoring non-volatile
registers. */
- .align 4
+ .align 4
L(dP2x):
+#ifdef __LITTLE_ENDIAN__
+ lwbrx rWORD3, 0, rSTR1
+ lwbrx rWORD4, 0, rSTR2
+ addi rSTR1, rSTR1, 4
+ addi rSTR2, rSTR2, 4
+#else
lwz rWORD3, 4(rSTR1)
lwz rWORD4, 4(rSTR2)
- cmplw cr5, rWORD3, rWORD4
+#endif
+ cmplw cr1, rWORD3, rWORD4
slwi. r12, rN, 3
- bne cr6, L(dLcr6)
+ bne cr6, L(dLcr6x)
+#ifndef __LITTLE_ENDIAN__
addi rSTR1, rSTR1, 4
addi rSTR2, rSTR2, 4
- bne cr5, L(dLcr5)
+#endif
+ bne cr1, L(dLcr1x)
subfic rN, r12, 32 /* Shift count is 32 - (rN * 8). */
- lwz 1,0(1)
+ addi 1, 1, 64
+ cfi_adjust_cfa_offset(-64)
bne L(d00)
li rRTN, 0
blr
/* Remainder is 12 */
- .align 4
+ .align 4
+ cfi_adjust_cfa_offset(64)
L(dP3):
- mtctr rTMP /* Power4 wants mtctr 1st in dispatch group */
+ mtctr r0 /* Power4 wants mtctr 1st in dispatch group */
+#ifdef __LITTLE_ENDIAN__
+ lwbrx rWORD3, 0, rSTR1
+ lwbrx rWORD4, 0, rSTR2
+ addi rSTR1, rSTR1, 4
+ addi rSTR2, rSTR2, 4
+#else
lwz rWORD3, 0(rSTR1)
lwz rWORD4, 0(rSTR2)
+#endif
cmplw cr1, rWORD3, rWORD4
L(dP3e):
+#ifdef __LITTLE_ENDIAN__
+ lwbrx rWORD5, 0, rSTR1
+ lwbrx rWORD6, 0, rSTR2
+ addi rSTR1, rSTR1, 4
+ addi rSTR2, rSTR2, 4
+#else
lwz rWORD5, 4(rSTR1)
lwz rWORD6, 4(rSTR2)
+#endif
cmplw cr6, rWORD5, rWORD6
blt cr7, L(dP3x)
+#ifdef __LITTLE_ENDIAN__
+ lwbrx rWORD7, 0, rSTR1
+ lwbrx rWORD8, 0, rSTR2
+ addi rSTR1, rSTR1, 4
+ addi rSTR2, rSTR2, 4
+#else
lwz rWORD7, 8(rSTR1)
lwz rWORD8, 8(rSTR2)
+#endif
cmplw cr5, rWORD7, rWORD8
+#ifdef __LITTLE_ENDIAN__
+ lwbrx rWORD1, 0, rSTR1
+ lwbrx rWORD2, 0, rSTR2
+ addi rSTR1, rSTR1, 4
+ addi rSTR2, rSTR2, 4
+#else
lwz rWORD1, 12(rSTR1)
lwz rWORD2, 12(rSTR2)
- cmplw cr0, rWORD1, rWORD2
+#endif
+ cmplw cr7, rWORD1, rWORD2
+#ifndef __LITTLE_ENDIAN__
addi rSTR1, rSTR1, 8
addi rSTR2, rSTR2, 8
+#endif
bne cr1, L(dLcr1)
bne cr6, L(dLcr6)
b L(dLoop1)
/* Again we are on an early exit path (24-31 byte compare), we want to
only use volatile registers and avoid restoring non-volatile
registers. */
- .align 4
+ .align 4
L(dP3x):
+#ifdef __LITTLE_ENDIAN__
+ lwbrx rWORD1, 0, rSTR1
+ lwbrx rWORD2, 0, rSTR2
+ addi rSTR1, rSTR1, 4
+ addi rSTR2, rSTR2, 4
+#else
lwz rWORD1, 8(rSTR1)
lwz rWORD2, 8(rSTR2)
- cmplw cr5, rWORD1, rWORD2
+#endif
+ cmplw cr7, rWORD1, rWORD2
slwi. r12, rN, 3
- bne cr1, L(dLcr1)
+ bne cr1, L(dLcr1x)
+#ifndef __LITTLE_ENDIAN__
addi rSTR1, rSTR1, 8
addi rSTR2, rSTR2, 8
- bne cr6, L(dLcr6)
+#endif
+ bne cr6, L(dLcr6x)
subfic rN, r12, 32 /* Shift count is 32 - (rN * 8). */
- bne cr5, L(dLcr5)
- lwz 1,0(1)
+ bne cr7, L(dLcr7x)
+ addi 1, 1, 64
+ cfi_adjust_cfa_offset(-64)
bne L(d00)
li rRTN, 0
blr
/* Count is a multiple of 16, remainder is 0 */
- .align 4
+ .align 4
+ cfi_adjust_cfa_offset(64)
L(dP4):
- mtctr rTMP /* Power4 wants mtctr 1st in dispatch group */
+ mtctr r0 /* Power4 wants mtctr 1st in dispatch group */
+#ifdef __LITTLE_ENDIAN__
+ lwbrx rWORD1, 0, rSTR1
+ lwbrx rWORD2, 0, rSTR2
+ addi rSTR1, rSTR1, 4
+ addi rSTR2, rSTR2, 4
+#else
lwz rWORD1, 0(rSTR1)
lwz rWORD2, 0(rSTR2)
- cmplw cr0, rWORD1, rWORD2
+#endif
+ cmplw cr7, rWORD1, rWORD2
L(dP4e):
+#ifdef __LITTLE_ENDIAN__
+ lwbrx rWORD3, 0, rSTR1
+ lwbrx rWORD4, 0, rSTR2
+ addi rSTR1, rSTR1, 4
+ addi rSTR2, rSTR2, 4
+#else
lwz rWORD3, 4(rSTR1)
lwz rWORD4, 4(rSTR2)
+#endif
cmplw cr1, rWORD3, rWORD4
+#ifdef __LITTLE_ENDIAN__
+ lwbrx rWORD5, 0, rSTR1
+ lwbrx rWORD6, 0, rSTR2
+ addi rSTR1, rSTR1, 4
+ addi rSTR2, rSTR2, 4
+#else
lwz rWORD5, 8(rSTR1)
lwz rWORD6, 8(rSTR2)
+#endif
cmplw cr6, rWORD5, rWORD6
+#ifdef __LITTLE_ENDIAN__
+ lwbrx rWORD7, 0, rSTR1
+ lwbrx rWORD8, 0, rSTR2
+ addi rSTR1, rSTR1, 4
+ addi rSTR2, rSTR2, 4
+#else
lwzu rWORD7, 12(rSTR1)
lwzu rWORD8, 12(rSTR2)
+#endif
cmplw cr5, rWORD7, rWORD8
- bne cr0, L(dLcr0)
+ bne cr7, L(dLcr7)
bne cr1, L(dLcr1)
bdz- L(d24) /* Adjust CTR as we start with +4 */
/* This is the primary loop */
- .align 4
+ .align 4
L(dLoop):
+#ifdef __LITTLE_ENDIAN__
+ lwbrx rWORD1, 0, rSTR1
+ lwbrx rWORD2, 0, rSTR2
+ addi rSTR1, rSTR1, 4
+ addi rSTR2, rSTR2, 4
+#else
lwz rWORD1, 4(rSTR1)
lwz rWORD2, 4(rSTR2)
+#endif
cmplw cr1, rWORD3, rWORD4
bne cr6, L(dLcr6)
L(dLoop1):
+#ifdef __LITTLE_ENDIAN__
+ lwbrx rWORD3, 0, rSTR1
+ lwbrx rWORD4, 0, rSTR2
+ addi rSTR1, rSTR1, 4
+ addi rSTR2, rSTR2, 4
+#else
lwz rWORD3, 8(rSTR1)
lwz rWORD4, 8(rSTR2)
+#endif
cmplw cr6, rWORD5, rWORD6
bne cr5, L(dLcr5)
L(dLoop2):
+#ifdef __LITTLE_ENDIAN__
+ lwbrx rWORD5, 0, rSTR1
+ lwbrx rWORD6, 0, rSTR2
+ addi rSTR1, rSTR1, 4
+ addi rSTR2, rSTR2, 4
+#else
lwz rWORD5, 12(rSTR1)
lwz rWORD6, 12(rSTR2)
+#endif
cmplw cr5, rWORD7, rWORD8
- bne cr0, L(dLcr0)
+ bne cr7, L(dLcr7)
L(dLoop3):
+#ifdef __LITTLE_ENDIAN__
+ lwbrx rWORD7, 0, rSTR1
+ lwbrx rWORD8, 0, rSTR2
+ addi rSTR1, rSTR1, 4
+ addi rSTR2, rSTR2, 4
+#else
lwzu rWORD7, 16(rSTR1)
lwzu rWORD8, 16(rSTR2)
+#endif
bne- cr1, L(dLcr1)
- cmplw cr0, rWORD1, rWORD2
+ cmplw cr7, rWORD1, rWORD2
bdnz+ L(dLoop)
L(dL4):
@@ -325,7 +521,7 @@ L(dL4):
bne cr5, L(dLcr5)
cmplw cr5, rWORD7, rWORD8
L(d44):
- bne cr0, L(dLcr0)
+ bne cr7, L(dLcr7)
L(d34):
bne cr1, L(dLcr1)
L(d24):
@@ -334,69 +530,82 @@ L(d14):
slwi. r12, rN, 3
bne cr5, L(dLcr5)
L(d04):
- lwz r30,44(1)
- lwz r31,48(1)
- lwz 1,0(1)
+ lwz rWORD7, 44(r1)
+ lwz rWORD8, 48(r1)
+ addi 1, 1, 64
+ cfi_adjust_cfa_offset(-64)
subfic rN, r12, 32 /* Shift count is 32 - (rN * 8). */
beq L(zeroLength)
/* At this point we have a remainder of 1 to 3 bytes to compare. Since
we are aligned it is safe to load the whole word, and use
- shift right to eliminate bits beyond the compare length. */
+ shift right to eliminate bits beyond the compare length. */
L(d00):
+#ifdef __LITTLE_ENDIAN__
+ lwbrx rWORD1, 0, rSTR1
+ lwbrx rWORD2, 0, rSTR2
+ addi rSTR1, rSTR1, 4
+ addi rSTR2, rSTR2, 4
+#else
lwz rWORD1, 4(rSTR1)
lwz rWORD2, 4(rSTR2)
+#endif
srw rWORD1, rWORD1, rN
srw rWORD2, rWORD2, rN
- cmplw rWORD1,rWORD2
- li rRTN,0
- beqlr
- li rRTN,1
- bgtlr
- li rRTN,-1
- blr
-
- .align 4
-L(dLcr0):
- lwz r30,44(1)
- lwz r31,48(1)
+ sub rRTN, rWORD1, rWORD2
+ blr
+
+ .align 4
+ cfi_adjust_cfa_offset(64)
+L(dLcr7):
+ lwz rWORD7, 44(r1)
+ lwz rWORD8, 48(r1)
+L(dLcr7x):
li rRTN, 1
- lwz 1,0(1)
- bgtlr cr0
+ addi 1, 1, 64
+ cfi_adjust_cfa_offset(-64)
+ bgtlr cr7
li rRTN, -1
blr
- .align 4
+ .align 4
+ cfi_adjust_cfa_offset(64)
L(dLcr1):
- lwz r30,44(1)
- lwz r31,48(1)
+ lwz rWORD7, 44(r1)
+ lwz rWORD8, 48(r1)
+L(dLcr1x):
li rRTN, 1
- lwz 1,0(1)
+ addi 1, 1, 64
+ cfi_adjust_cfa_offset(-64)
bgtlr cr1
li rRTN, -1
blr
- .align 4
+ .align 4
+ cfi_adjust_cfa_offset(64)
L(dLcr6):
- lwz r30,44(1)
- lwz r31,48(1)
+ lwz rWORD7, 44(r1)
+ lwz rWORD8, 48(r1)
+L(dLcr6x):
li rRTN, 1
- lwz 1,0(1)
+ addi 1, 1, 64
+ cfi_adjust_cfa_offset(-64)
bgtlr cr6
li rRTN, -1
blr
- .align 4
+ .align 4
+ cfi_adjust_cfa_offset(64)
L(dLcr5):
- lwz r30,44(1)
- lwz r31,48(1)
+ lwz rWORD7, 44(r1)
+ lwz rWORD8, 48(r1)
L(dLcr5x):
li rRTN, 1
- lwz 1,0(1)
+ addi 1, 1, 64
+ cfi_adjust_cfa_offset(-64)
bgtlr cr5
li rRTN, -1
blr
- .align 4
+ .align 4
L(bytealigned):
- cfi_adjust_cfa_offset(-64)
- mtctr rN /* Power4 wants mtctr 1st in dispatch group */
+ mtctr rN /* Power4 wants mtctr 1st in dispatch group */
/* We need to prime this loop. This loop is swing modulo scheduled
to avoid pipe delays. The dependent instruction latencies (load to
@@ -411,7 +620,7 @@ L(bytealigned):
lbz rWORD1, 0(rSTR1)
lbz rWORD2, 0(rSTR2)
bdz- L(b11)
- cmplw cr0, rWORD1, rWORD2
+ cmplw cr7, rWORD1, rWORD2
lbz rWORD3, 1(rSTR1)
lbz rWORD4, 1(rSTR2)
bdz- L(b12)
@@ -419,11 +628,11 @@ L(bytealigned):
lbzu rWORD5, 2(rSTR1)
lbzu rWORD6, 2(rSTR2)
bdz- L(b13)
- .align 4
+ .align 4
L(bLoop):
lbzu rWORD1, 1(rSTR1)
lbzu rWORD2, 1(rSTR2)
- bne- cr0, L(bLcr0)
+ bne- cr7, L(bLcr7)
cmplw cr6, rWORD5, rWORD6
bdz- L(b3i)
@@ -432,7 +641,7 @@ L(bLoop):
lbzu rWORD4, 1(rSTR2)
bne- cr1, L(bLcr1)
- cmplw cr0, rWORD1, rWORD2
+ cmplw cr7, rWORD1, rWORD2
bdz- L(b2i)
lbzu rWORD5, 1(rSTR1)
@@ -449,23 +658,23 @@ L(bLoop):
tested. In this case we must complete the pending operations
before returning. */
L(b1i):
- bne- cr0, L(bLcr0)
+ bne- cr7, L(bLcr7)
bne- cr1, L(bLcr1)
b L(bx56)
- .align 4
+ .align 4
L(b2i):
bne- cr6, L(bLcr6)
- bne- cr0, L(bLcr0)
+ bne- cr7, L(bLcr7)
b L(bx34)
- .align 4
+ .align 4
L(b3i):
bne- cr1, L(bLcr1)
bne- cr6, L(bLcr6)
b L(bx12)
- .align 4
-L(bLcr0):
+ .align 4
+L(bLcr7):
li rRTN, 1
- bgtlr cr0
+ bgtlr cr7
li rRTN, -1
blr
L(bLcr1):
@@ -480,36 +689,31 @@ L(bLcr6):
blr
L(b13):
- bne- cr0, L(bx12)
+ bne- cr7, L(bx12)
bne- cr1, L(bx34)
L(bx56):
sub rRTN, rWORD5, rWORD6
blr
nop
L(b12):
- bne- cr0, L(bx12)
+ bne- cr7, L(bx12)
L(bx34):
sub rRTN, rWORD3, rWORD4
blr
-
L(b11):
L(bx12):
sub rRTN, rWORD1, rWORD2
blr
-
- .align 4
-L(zeroLengthReturn):
-
+ .align 4
L(zeroLength):
li rRTN, 0
blr
- cfi_adjust_cfa_offset(64)
- .align 4
+ .align 4
/* At this point we know the strings have different alignment and the
- compare length is at least 8 bytes. rBITDIF contains the low order
+ compare length is at least 8 bytes. r12 contains the low order
2 bits of rSTR1 and cr5 contains the result of the logical compare
- of rBITDIF to 0. If rBITDIF == 0 then rStr1 is word aligned and can
+ of r12 to 0. If r12 == 0 then rStr1 is word aligned and can
perform the Wunaligned loop.
Otherwise we know that rSTR1 is not already word aligned yet.
@@ -518,79 +722,88 @@ L(zeroLength):
eliminate bits preceding the first byte. Since we want to join the
normal (Wunaligned) compare loop, starting at the second word,
we need to adjust the length (rN) and special case the loop
- versioning for the first W. This insures that the loop count is
+ versioning for the first W. This ensures that the loop count is
correct and the first W (shifted) is in the expected register pair. */
#define rSHL r29 /* Unaligned shift left count. */
#define rSHR r28 /* Unaligned shift right count. */
-#define rB r27 /* Left rotation temp for rWORD2. */
-#define rD r26 /* Left rotation temp for rWORD4. */
-#define rF r25 /* Left rotation temp for rWORD6. */
-#define rH r24 /* Left rotation temp for rWORD8. */
-#define rA r0 /* Right rotation temp for rWORD2. */
-#define rC r12 /* Right rotation temp for rWORD4. */
-#define rE r0 /* Right rotation temp for rWORD6. */
-#define rG r12 /* Right rotation temp for rWORD8. */
+#define rWORD8_SHIFT r27 /* Left rotation temp for rWORD2. */
+#define rWORD2_SHIFT r26 /* Left rotation temp for rWORD4. */
+#define rWORD4_SHIFT r25 /* Left rotation temp for rWORD6. */
+#define rWORD6_SHIFT r24 /* Left rotation temp for rWORD8. */
+ cfi_adjust_cfa_offset(64)
L(unaligned):
- stw r29,40(r1)
- cfi_offset(r29,(40-64))
+ stw rSHL, 40(r1)
+ cfi_offset(rSHL, (40-64))
clrlwi rSHL, rSTR2, 30
- stw r28,36(r1)
- cfi_offset(r28,(36-64))
+ stw rSHR, 36(r1)
+ cfi_offset(rSHR, (36-64))
beq cr5, L(Wunaligned)
- stw r27,32(r1)
- cfi_offset(r27,(32-64))
+ stw rWORD8_SHIFT, 32(r1)
+ cfi_offset(rWORD8_SHIFT, (32-64))
/* Adjust the logical start of rSTR2 to compensate for the extra bits
in the 1st rSTR1 W. */
- sub r27, rSTR2, rBITDIF
+ sub rWORD8_SHIFT, rSTR2, r12
/* But do not attempt to address the W before that W that contains
the actual start of rSTR2. */
clrrwi rSTR2, rSTR2, 2
- stw r26,28(r1)
- cfi_offset(r26,(28-64))
-/* Compute the left/right shift counts for the unalign rSTR2,
+ stw rWORD2_SHIFT, 28(r1)
+ cfi_offset(rWORD2_SHIFT, (28-64))
+/* Compute the left/right shift counts for the unaligned rSTR2,
compensating for the logical (W aligned) start of rSTR1. */
- clrlwi rSHL, r27, 30
+ clrlwi rSHL, rWORD8_SHIFT, 30
clrrwi rSTR1, rSTR1, 2
- stw r25,24(r1)
- cfi_offset(r25,(24-64))
+ stw rWORD4_SHIFT, 24(r1)
+ cfi_offset(rWORD4_SHIFT, (24-64))
slwi rSHL, rSHL, 3
- cmplw cr5, r27, rSTR2
- add rN, rN, rBITDIF
- slwi r11, rBITDIF, 3
- stw r24,20(r1)
- cfi_offset(r24,(20-64))
+ cmplw cr5, rWORD8_SHIFT, rSTR2
+ add rN, rN, r12
+ slwi rWORD6, r12, 3
+ stw rWORD6_SHIFT, 20(r1)
+ cfi_offset(rWORD6_SHIFT, (20-64))
subfic rSHR, rSHL, 32
- srwi rTMP, rN, 4 /* Divide by 16 */
- andi. rBITDIF, rN, 12 /* Get the W remainder */
+ srwi r0, rN, 4 /* Divide by 16 */
+ andi. r12, rN, 12 /* Get the W remainder */
/* We normally need to load 2 Ws to start the unaligned rSTR2, but in
this special case those bits may be discarded anyway. Also we
must avoid loading a W where none of the bits are part of rSTR2 as
this may cross a page boundary and cause a page fault. */
li rWORD8, 0
blt cr5, L(dus0)
+#ifdef __LITTLE_ENDIAN__
+ lwbrx rWORD8, 0, rSTR2
+ addi rSTR2, rSTR2, 4
+#else
lwz rWORD8, 0(rSTR2)
- la rSTR2, 4(rSTR2)
+ addi rSTR2, rSTR2, 4
+#endif
slw rWORD8, rWORD8, rSHL
L(dus0):
+#ifdef __LITTLE_ENDIAN__
+ lwbrx rWORD1, 0, rSTR1
+ lwbrx rWORD2, 0, rSTR2
+ addi rSTR1, rSTR1, 4
+ addi rSTR2, rSTR2, 4
+#else
lwz rWORD1, 0(rSTR1)
lwz rWORD2, 0(rSTR2)
- cmplwi cr1, rBITDIF, 8
+#endif
+ cmplwi cr1, r12, 8
cmplwi cr7, rN, 16
- srw rG, rWORD2, rSHR
+ srw r12, rWORD2, rSHR
clrlwi rN, rN, 30
beq L(duPs4)
- mtctr rTMP /* Power4 wants mtctr 1st in dispatch group */
- or rWORD8, rG, rWORD8
+ mtctr r0 /* Power4 wants mtctr 1st in dispatch group */
+ or rWORD8, r12, rWORD8
bgt cr1, L(duPs3)
beq cr1, L(duPs2)
/* Remainder is 4 */
- .align 4
+ .align 4
L(dusP1):
- slw rB, rWORD2, rSHL
- slw rWORD7, rWORD1, r11
- slw rWORD8, rWORD8, r11
+ slw rWORD8_SHIFT, rWORD2, rSHL
+ slw rWORD7, rWORD1, rWORD6
+ slw rWORD8, rWORD8, rWORD6
bge cr7, L(duP1e)
/* At this point we exit early with the first word compare
complete and remainder of 0 to 3 bytes. See L(du14) for details on
@@ -600,95 +813,133 @@ L(dusP1):
bne cr5, L(duLcr5)
cmplw cr7, rN, rSHR
beq L(duZeroReturn)
- li rA, 0
+ li r0, 0
ble cr7, L(dutrim)
+#ifdef __LITTLE_ENDIAN__
+ lwbrx rWORD2, 0, rSTR2
+ addi rSTR2, rSTR2, 4
+#else
lwz rWORD2, 4(rSTR2)
- srw rA, rWORD2, rSHR
+#endif
+ srw r0, rWORD2, rSHR
b L(dutrim)
/* Remainder is 8 */
- .align 4
+ .align 4
L(duPs2):
- slw rH, rWORD2, rSHL
- slw rWORD5, rWORD1, r11
- slw rWORD6, rWORD8, r11
+ slw rWORD6_SHIFT, rWORD2, rSHL
+ slw rWORD5, rWORD1, rWORD6
+ slw rWORD6, rWORD8, rWORD6
b L(duP2e)
/* Remainder is 12 */
- .align 4
+ .align 4
L(duPs3):
- slw rF, rWORD2, rSHL
- slw rWORD3, rWORD1, r11
- slw rWORD4, rWORD8, r11
+ slw rWORD4_SHIFT, rWORD2, rSHL
+ slw rWORD3, rWORD1, rWORD6
+ slw rWORD4, rWORD8, rWORD6
b L(duP3e)
/* Count is a multiple of 16, remainder is 0 */
- .align 4
+ .align 4
L(duPs4):
- mtctr rTMP /* Power4 wants mtctr 1st in dispatch group */
- or rWORD8, rG, rWORD8
- slw rD, rWORD2, rSHL
- slw rWORD1, rWORD1, r11
- slw rWORD2, rWORD8, r11
+ mtctr r0 /* Power4 wants mtctr 1st in dispatch group */
+ or rWORD8, r12, rWORD8
+ slw rWORD2_SHIFT, rWORD2, rSHL
+ slw rWORD1, rWORD1, rWORD6
+ slw rWORD2, rWORD8, rWORD6
b L(duP4e)
/* At this point we know rSTR1 is word aligned and the
compare length is at least 8 bytes. */
- .align 4
+ .align 4
L(Wunaligned):
- stw r27,32(r1)
- cfi_offset(r27,(32-64))
+ stw rWORD8_SHIFT, 32(r1)
+ cfi_offset(rWORD8_SHIFT, (32-64))
clrrwi rSTR2, rSTR2, 2
- stw r26,28(r1)
- cfi_offset(r26,(28-64))
- srwi rTMP, rN, 4 /* Divide by 16 */
- stw r25,24(r1)
- cfi_offset(r25,(24-64))
- andi. rBITDIF, rN, 12 /* Get the W remainder */
- stw r24,20(r1)
- cfi_offset(r24,(20-64))
+ stw rWORD2_SHIFT, 28(r1)
+ cfi_offset(rWORD2_SHIFT, (28-64))
+ srwi r0, rN, 4 /* Divide by 16 */
+ stw rWORD4_SHIFT, 24(r1)
+ cfi_offset(rWORD4_SHIFT, (24-64))
+ andi. r12, rN, 12 /* Get the W remainder */
+ stw rWORD6_SHIFT, 20(r1)
+ cfi_offset(rWORD6_SHIFT, (20-64))
slwi rSHL, rSHL, 3
+#ifdef __LITTLE_ENDIAN__
+ lwbrx rWORD6, 0, rSTR2
+ addi rSTR2, rSTR2, 4
+ lwbrx rWORD8, 0, rSTR2
+ addi rSTR2, rSTR2, 4
+#else
lwz rWORD6, 0(rSTR2)
lwzu rWORD8, 4(rSTR2)
- cmplwi cr1, rBITDIF, 8
+#endif
+ cmplwi cr1, r12, 8
cmplwi cr7, rN, 16
clrlwi rN, rN, 30
subfic rSHR, rSHL, 32
- slw rH, rWORD6, rSHL
+ slw rWORD6_SHIFT, rWORD6, rSHL
beq L(duP4)
- mtctr rTMP /* Power4 wants mtctr 1st in dispatch group */
+ mtctr r0 /* Power4 wants mtctr 1st in dispatch group */
bgt cr1, L(duP3)
beq cr1, L(duP2)
/* Remainder is 4 */
- .align 4
+ .align 4
L(duP1):
- srw rG, rWORD8, rSHR
+ srw r12, rWORD8, rSHR
+#ifdef __LITTLE_ENDIAN__
+ lwbrx rWORD7, 0, rSTR1
+ addi rSTR1, rSTR1, 4
+#else
lwz rWORD7, 0(rSTR1)
- slw rB, rWORD8, rSHL
- or rWORD8, rG, rH
+#endif
+ slw rWORD8_SHIFT, rWORD8, rSHL
+ or rWORD8, r12, rWORD6_SHIFT
blt cr7, L(duP1x)
L(duP1e):
+#ifdef __LITTLE_ENDIAN__
+ lwbrx rWORD1, 0, rSTR1
+ lwbrx rWORD2, 0, rSTR2
+ addi rSTR1, rSTR1, 4
+ addi rSTR2, rSTR2, 4
+#else
lwz rWORD1, 4(rSTR1)
lwz rWORD2, 4(rSTR2)
+#endif
cmplw cr5, rWORD7, rWORD8
- srw rA, rWORD2, rSHR
- slw rD, rWORD2, rSHL
- or rWORD2, rA, rB
+ srw r0, rWORD2, rSHR
+ slw rWORD2_SHIFT, rWORD2, rSHL
+ or rWORD2, r0, rWORD8_SHIFT
+#ifdef __LITTLE_ENDIAN__
+ lwbrx rWORD3, 0, rSTR1
+ lwbrx rWORD4, 0, rSTR2
+ addi rSTR1, rSTR1, 4
+ addi rSTR2, rSTR2, 4
+#else
lwz rWORD3, 8(rSTR1)
lwz rWORD4, 8(rSTR2)
- cmplw cr0, rWORD1, rWORD2
- srw rC, rWORD4, rSHR
- slw rF, rWORD4, rSHL
+#endif
+ cmplw cr7, rWORD1, rWORD2
+ srw r12, rWORD4, rSHR
+ slw rWORD4_SHIFT, rWORD4, rSHL
bne cr5, L(duLcr5)
- or rWORD4, rC, rD
+ or rWORD4, r12, rWORD2_SHIFT
+#ifdef __LITTLE_ENDIAN__
+ lwbrx rWORD5, 0, rSTR1
+ lwbrx rWORD6, 0, rSTR2
+ addi rSTR1, rSTR1, 4
+ addi rSTR2, rSTR2, 4
+#else
lwz rWORD5, 12(rSTR1)
lwz rWORD6, 12(rSTR2)
+#endif
cmplw cr1, rWORD3, rWORD4
- srw rE, rWORD6, rSHR
- slw rH, rWORD6, rSHL
- bne cr0, L(duLcr0)
- or rWORD6, rE, rF
+ srw r0, rWORD6, rSHR
+ slw rWORD6_SHIFT, rWORD6, rSHL
+ bne cr7, L(duLcr7)
+ or rWORD6, r0, rWORD4_SHIFT
cmplw cr6, rWORD5, rWORD6
b L(duLoop3)
- .align 4
+ .align 4
/* At this point we exit early with the first word compare
complete and remainder of 0 to 3 bytes. See L(du14) for details on
how we handle the remaining bytes. */
@@ -698,186 +949,321 @@ L(duP1x):
bne cr5, L(duLcr5)
cmplw cr7, rN, rSHR
beq L(duZeroReturn)
- li rA, 0
+ li r0, 0
ble cr7, L(dutrim)
- ld rWORD2, 8(rSTR2)
- srw rA, rWORD2, rSHR
+#ifdef __LITTLE_ENDIAN__
+ lwbrx rWORD2, 0, rSTR2
+ addi rSTR2, rSTR2, 4
+#else
+ lwz rWORD2, 8(rSTR2)
+#endif
+ srw r0, rWORD2, rSHR
b L(dutrim)
/* Remainder is 8 */
- .align 4
+ .align 4
L(duP2):
- srw rE, rWORD8, rSHR
+ srw r0, rWORD8, rSHR
+#ifdef __LITTLE_ENDIAN__
+ lwbrx rWORD5, 0, rSTR1
+ addi rSTR1, rSTR1, 4
+#else
lwz rWORD5, 0(rSTR1)
- or rWORD6, rE, rH
- slw rH, rWORD8, rSHL
+#endif
+ or rWORD6, r0, rWORD6_SHIFT
+ slw rWORD6_SHIFT, rWORD8, rSHL
L(duP2e):
+#ifdef __LITTLE_ENDIAN__
+ lwbrx rWORD7, 0, rSTR1
+ lwbrx rWORD8, 0, rSTR2
+ addi rSTR1, rSTR1, 4
+ addi rSTR2, rSTR2, 4
+#else
lwz rWORD7, 4(rSTR1)
lwz rWORD8, 4(rSTR2)
+#endif
cmplw cr6, rWORD5, rWORD6
- srw rG, rWORD8, rSHR
- slw rB, rWORD8, rSHL
- or rWORD8, rG, rH
+ srw r12, rWORD8, rSHR
+ slw rWORD8_SHIFT, rWORD8, rSHL
+ or rWORD8, r12, rWORD6_SHIFT
blt cr7, L(duP2x)
+#ifdef __LITTLE_ENDIAN__
+ lwbrx rWORD1, 0, rSTR1
+ lwbrx rWORD2, 0, rSTR2
+ addi rSTR1, rSTR1, 4
+ addi rSTR2, rSTR2, 4
+#else
lwz rWORD1, 8(rSTR1)
lwz rWORD2, 8(rSTR2)
+#endif
cmplw cr5, rWORD7, rWORD8
bne cr6, L(duLcr6)
- srw rA, rWORD2, rSHR
- slw rD, rWORD2, rSHL
- or rWORD2, rA, rB
+ srw r0, rWORD2, rSHR
+ slw rWORD2_SHIFT, rWORD2, rSHL
+ or rWORD2, r0, rWORD8_SHIFT
+#ifdef __LITTLE_ENDIAN__
+ lwbrx rWORD3, 0, rSTR1
+ lwbrx rWORD4, 0, rSTR2
+ addi rSTR1, rSTR1, 4
+ addi rSTR2, rSTR2, 4
+#else
lwz rWORD3, 12(rSTR1)
lwz rWORD4, 12(rSTR2)
- cmplw cr0, rWORD1, rWORD2
+#endif
+ cmplw cr7, rWORD1, rWORD2
bne cr5, L(duLcr5)
- srw rC, rWORD4, rSHR
- slw rF, rWORD4, rSHL
- or rWORD4, rC, rD
+ srw r12, rWORD4, rSHR
+ slw rWORD4_SHIFT, rWORD4, rSHL
+ or rWORD4, r12, rWORD2_SHIFT
+#ifndef __LITTLE_ENDIAN__
addi rSTR1, rSTR1, 4
addi rSTR2, rSTR2, 4
+#endif
cmplw cr1, rWORD3, rWORD4
b L(duLoop2)
- .align 4
+ .align 4
L(duP2x):
cmplw cr5, rWORD7, rWORD8
+#ifndef __LITTLE_ENDIAN__
addi rSTR1, rSTR1, 4
addi rSTR2, rSTR2, 4
+#endif
bne cr6, L(duLcr6)
slwi. rN, rN, 3
bne cr5, L(duLcr5)
cmplw cr7, rN, rSHR
beq L(duZeroReturn)
- li rA, 0
+ li r0, 0
ble cr7, L(dutrim)
+#ifdef __LITTLE_ENDIAN__
+ lwbrx rWORD2, 0, rSTR2
+ addi rSTR2, rSTR2, 4
+#else
lwz rWORD2, 4(rSTR2)
- srw rA, rWORD2, rSHR
+#endif
+ srw r0, rWORD2, rSHR
b L(dutrim)
/* Remainder is 12 */
- .align 4
+ .align 4
L(duP3):
- srw rC, rWORD8, rSHR
+ srw r12, rWORD8, rSHR
+#ifdef __LITTLE_ENDIAN__
+ lwbrx rWORD3, 0, rSTR1
+ addi rSTR1, rSTR1, 4
+#else
lwz rWORD3, 0(rSTR1)
- slw rF, rWORD8, rSHL
- or rWORD4, rC, rH
+#endif
+ slw rWORD4_SHIFT, rWORD8, rSHL
+ or rWORD4, r12, rWORD6_SHIFT
L(duP3e):
+#ifdef __LITTLE_ENDIAN__
+ lwbrx rWORD5, 0, rSTR1
+ lwbrx rWORD6, 0, rSTR2
+ addi rSTR1, rSTR1, 4
+ addi rSTR2, rSTR2, 4
+#else
lwz rWORD5, 4(rSTR1)
lwz rWORD6, 4(rSTR2)
+#endif
cmplw cr1, rWORD3, rWORD4
- srw rE, rWORD6, rSHR
- slw rH, rWORD6, rSHL
- or rWORD6, rE, rF
+ srw r0, rWORD6, rSHR
+ slw rWORD6_SHIFT, rWORD6, rSHL
+ or rWORD6, r0, rWORD4_SHIFT
+#ifdef __LITTLE_ENDIAN__
+ lwbrx rWORD7, 0, rSTR1
+ lwbrx rWORD8, 0, rSTR2
+ addi rSTR1, rSTR1, 4
+ addi rSTR2, rSTR2, 4
+#else
lwz rWORD7, 8(rSTR1)
lwz rWORD8, 8(rSTR2)
+#endif
cmplw cr6, rWORD5, rWORD6
bne cr1, L(duLcr1)
- srw rG, rWORD8, rSHR
- slw rB, rWORD8, rSHL
- or rWORD8, rG, rH
+ srw r12, rWORD8, rSHR
+ slw rWORD8_SHIFT, rWORD8, rSHL
+ or rWORD8, r12, rWORD6_SHIFT
blt cr7, L(duP3x)
+#ifdef __LITTLE_ENDIAN__
+ lwbrx rWORD1, 0, rSTR1
+ lwbrx rWORD2, 0, rSTR2
+ addi rSTR1, rSTR1, 4
+ addi rSTR2, rSTR2, 4
+#else
lwz rWORD1, 12(rSTR1)
lwz rWORD2, 12(rSTR2)
+#endif
cmplw cr5, rWORD7, rWORD8
bne cr6, L(duLcr6)
- srw rA, rWORD2, rSHR
- slw rD, rWORD2, rSHL
- or rWORD2, rA, rB
+ srw r0, rWORD2, rSHR
+ slw rWORD2_SHIFT, rWORD2, rSHL
+ or rWORD2, r0, rWORD8_SHIFT
+#ifndef __LITTLE_ENDIAN__
addi rSTR1, rSTR1, 8
addi rSTR2, rSTR2, 8
- cmplw cr0, rWORD1, rWORD2
+#endif
+ cmplw cr7, rWORD1, rWORD2
b L(duLoop1)
- .align 4
+ .align 4
L(duP3x):
+#ifndef __LITTLE_ENDIAN__
addi rSTR1, rSTR1, 8
addi rSTR2, rSTR2, 8
+#endif
+#if 0
+/* Huh? We've already branched on cr1! */
bne cr1, L(duLcr1)
+#endif
cmplw cr5, rWORD7, rWORD8
bne cr6, L(duLcr6)
slwi. rN, rN, 3
bne cr5, L(duLcr5)
cmplw cr7, rN, rSHR
beq L(duZeroReturn)
- li rA, 0
+ li r0, 0
ble cr7, L(dutrim)
+#ifdef __LITTLE_ENDIAN__
+ lwbrx rWORD2, 0, rSTR2
+ addi rSTR2, rSTR2, 4
+#else
lwz rWORD2, 4(rSTR2)
- srw rA, rWORD2, rSHR
+#endif
+ srw r0, rWORD2, rSHR
b L(dutrim)
/* Count is a multiple of 16, remainder is 0 */
- .align 4
+ .align 4
L(duP4):
- mtctr rTMP /* Power4 wants mtctr 1st in dispatch group */
- srw rA, rWORD8, rSHR
+ mtctr r0 /* Power4 wants mtctr 1st in dispatch group */
+ srw r0, rWORD8, rSHR
+#ifdef __LITTLE_ENDIAN__
+ lwbrx rWORD1, 0, rSTR1
+ addi rSTR1, rSTR1, 4
+#else
lwz rWORD1, 0(rSTR1)
- slw rD, rWORD8, rSHL
- or rWORD2, rA, rH
+#endif
+ slw rWORD2_SHIFT, rWORD8, rSHL
+ or rWORD2, r0, rWORD6_SHIFT
L(duP4e):
+#ifdef __LITTLE_ENDIAN__
+ lwbrx rWORD3, 0, rSTR1
+ lwbrx rWORD4, 0, rSTR2
+ addi rSTR1, rSTR1, 4
+ addi rSTR2, rSTR2, 4
+#else
lwz rWORD3, 4(rSTR1)
lwz rWORD4, 4(rSTR2)
- cmplw cr0, rWORD1, rWORD2
- srw rC, rWORD4, rSHR
- slw rF, rWORD4, rSHL
- or rWORD4, rC, rD
+#endif
+ cmplw cr7, rWORD1, rWORD2
+ srw r12, rWORD4, rSHR
+ slw rWORD4_SHIFT, rWORD4, rSHL
+ or rWORD4, r12, rWORD2_SHIFT
+#ifdef __LITTLE_ENDIAN__
+ lwbrx rWORD5, 0, rSTR1
+ lwbrx rWORD6, 0, rSTR2
+ addi rSTR1, rSTR1, 4
+ addi rSTR2, rSTR2, 4
+#else
lwz rWORD5, 8(rSTR1)
lwz rWORD6, 8(rSTR2)
+#endif
cmplw cr1, rWORD3, rWORD4
- bne cr0, L(duLcr0)
- srw rE, rWORD6, rSHR
- slw rH, rWORD6, rSHL
- or rWORD6, rE, rF
+ bne cr7, L(duLcr7)
+ srw r0, rWORD6, rSHR
+ slw rWORD6_SHIFT, rWORD6, rSHL
+ or rWORD6, r0, rWORD4_SHIFT
+#ifdef __LITTLE_ENDIAN__
+ lwbrx rWORD7, 0, rSTR1
+ lwbrx rWORD8, 0, rSTR2
+ addi rSTR1, rSTR1, 4
+ addi rSTR2, rSTR2, 4
+#else
lwzu rWORD7, 12(rSTR1)
lwzu rWORD8, 12(rSTR2)
+#endif
cmplw cr6, rWORD5, rWORD6
bne cr1, L(duLcr1)
- srw rG, rWORD8, rSHR
- slw rB, rWORD8, rSHL
- or rWORD8, rG, rH
+ srw r12, rWORD8, rSHR
+ slw rWORD8_SHIFT, rWORD8, rSHL
+ or rWORD8, r12, rWORD6_SHIFT
cmplw cr5, rWORD7, rWORD8
bdz- L(du24) /* Adjust CTR as we start with +4 */
/* This is the primary loop */
- .align 4
+ .align 4
L(duLoop):
+#ifdef __LITTLE_ENDIAN__
+ lwbrx rWORD1, 0, rSTR1
+ lwbrx rWORD2, 0, rSTR2
+ addi rSTR1, rSTR1, 4
+ addi rSTR2, rSTR2, 4
+#else
lwz rWORD1, 4(rSTR1)
lwz rWORD2, 4(rSTR2)
+#endif
cmplw cr1, rWORD3, rWORD4
bne cr6, L(duLcr6)
- srw rA, rWORD2, rSHR
- slw rD, rWORD2, rSHL
- or rWORD2, rA, rB
+ srw r0, rWORD2, rSHR
+ slw rWORD2_SHIFT, rWORD2, rSHL
+ or rWORD2, r0, rWORD8_SHIFT
L(duLoop1):
+#ifdef __LITTLE_ENDIAN__
+ lwbrx rWORD3, 0, rSTR1
+ lwbrx rWORD4, 0, rSTR2
+ addi rSTR1, rSTR1, 4
+ addi rSTR2, rSTR2, 4
+#else
lwz rWORD3, 8(rSTR1)
lwz rWORD4, 8(rSTR2)
+#endif
cmplw cr6, rWORD5, rWORD6
bne cr5, L(duLcr5)
- srw rC, rWORD4, rSHR
- slw rF, rWORD4, rSHL
- or rWORD4, rC, rD
+ srw r12, rWORD4, rSHR
+ slw rWORD4_SHIFT, rWORD4, rSHL
+ or rWORD4, r12, rWORD2_SHIFT
L(duLoop2):
+#ifdef __LITTLE_ENDIAN__
+ lwbrx rWORD5, 0, rSTR1
+ lwbrx rWORD6, 0, rSTR2
+ addi rSTR1, rSTR1, 4
+ addi rSTR2, rSTR2, 4
+#else
lwz rWORD5, 12(rSTR1)
lwz rWORD6, 12(rSTR2)
+#endif
cmplw cr5, rWORD7, rWORD8
- bne cr0, L(duLcr0)
- srw rE, rWORD6, rSHR
- slw rH, rWORD6, rSHL
- or rWORD6, rE, rF
+ bne cr7, L(duLcr7)
+ srw r0, rWORD6, rSHR
+ slw rWORD6_SHIFT, rWORD6, rSHL
+ or rWORD6, r0, rWORD4_SHIFT
L(duLoop3):
+#ifdef __LITTLE_ENDIAN__
+ lwbrx rWORD7, 0, rSTR1
+ lwbrx rWORD8, 0, rSTR2
+ addi rSTR1, rSTR1, 4
+ addi rSTR2, rSTR2, 4
+#else
lwzu rWORD7, 16(rSTR1)
lwzu rWORD8, 16(rSTR2)
- cmplw cr0, rWORD1, rWORD2
+#endif
+ cmplw cr7, rWORD1, rWORD2
bne- cr1, L(duLcr1)
- srw rG, rWORD8, rSHR
- slw rB, rWORD8, rSHL
- or rWORD8, rG, rH
+ srw r12, rWORD8, rSHR
+ slw rWORD8_SHIFT, rWORD8, rSHL
+ or rWORD8, r12, rWORD6_SHIFT
bdnz+ L(duLoop)
L(duL4):
+#if 0
+/* Huh? We've already branched on cr1! */
bne cr1, L(duLcr1)
+#endif
cmplw cr1, rWORD3, rWORD4
bne cr6, L(duLcr6)
cmplw cr6, rWORD5, rWORD6
bne cr5, L(duLcr5)
cmplw cr5, rWORD7, rWORD8
L(du44):
- bne cr0, L(duLcr0)
+ bne cr7, L(duLcr7)
L(du34):
bne cr1, L(duLcr1)
L(du24):
@@ -887,95 +1273,101 @@ L(du14):
bne cr5, L(duLcr5)
/* At this point we have a remainder of 1 to 3 bytes to compare. We use
shift right to eliminate bits beyond the compare length.
+ This allows the use of word subtract to compute the final result.
However it may not be safe to load rWORD2 which may be beyond the
string length. So we compare the bit length of the remainder to
the right shift count (rSHR). If the bit count is less than or equal
we do not need to load rWORD2 (all significant bits are already in
- rB). */
+ rWORD8_SHIFT). */
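The trim step that follows is worth restating in C: both words are shifted right so that only the 1 to 3 bytes still inside the compare length participate, after which a plain subtract is safe because at most 24 significant bits remain. A sketch of that logic (not the installed code):

#include <stdint.h>

/* w1/w2 hold the remaining bytes in their high-order positions
   (big-endian word images); rem is the leftover length, 1..3.  */
static int trim_and_compare (uint32_t w1, uint32_t w2, unsigned rem)
{
  unsigned sh = 32 - 8 * rem;   /* subfic rN, rN, 32 (rN already *8) */
  w1 >>= sh;                    /* srw rWORD1, rWORD1, rN            */
  w2 >>= sh;                    /* srw rWORD2, rWORD2, rN            */
  return (int) (w1 - w2);       /* cannot overflow: <= 24 bits each  */
}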
cmplw cr7, rN, rSHR
beq L(duZeroReturn)
- li rA, 0
+ li r0, 0
ble cr7, L(dutrim)
+#ifdef __LITTLE_ENDIAN__
+ lwbrx rWORD2, 0, rSTR2
+ addi rSTR2, rSTR2, 4
+#else
lwz rWORD2, 4(rSTR2)
- srw rA, rWORD2, rSHR
- .align 4
+#endif
+ srw r0, rWORD2, rSHR
+ .align 4
L(dutrim):
+#ifdef __LITTLE_ENDIAN__
+ lwbrx rWORD1, 0, rSTR1
+#else
lwz rWORD1, 4(rSTR1)
- lwz r31,48(1)
+#endif
+ lwz rWORD8, 48(r1)
subfic rN, rN, 32 /* Shift count is 32 - (rN * 8). */
- or rWORD2, rA, rB
- lwz r30,44(1)
- lwz r29,40(r1)
+ or rWORD2, r0, rWORD8_SHIFT
+ lwz rWORD7, 44(r1)
+ lwz rSHL, 40(r1)
srw rWORD1, rWORD1, rN
srw rWORD2, rWORD2, rN
- lwz r28,36(r1)
- lwz r27,32(r1)
- cmplw rWORD1,rWORD2
- li rRTN,0
- beq L(dureturn26)
- li rRTN,1
- bgt L(dureturn26)
- li rRTN,-1
- b L(dureturn26)
- .align 4
-L(duLcr0):
- lwz r31,48(1)
- lwz r30,44(1)
+ lwz rSHR, 36(r1)
+ lwz rWORD8_SHIFT, 32(r1)
+ sub rRTN, rWORD1, rWORD2
+ b L(dureturn26)
+ .align 4
+L(duLcr7):
+ lwz rWORD8, 48(r1)
+ lwz rWORD7, 44(r1)
li rRTN, 1
- bgt cr0, L(dureturn29)
- lwz r29,40(r1)
- lwz r28,36(r1)
+ bgt cr7, L(dureturn29)
+ lwz rSHL, 40(r1)
+ lwz rSHR, 36(r1)
li rRTN, -1
b L(dureturn27)
- .align 4
+ .align 4
L(duLcr1):
- lwz r31,48(1)
- lwz r30,44(1)
+ lwz rWORD8, 48(r1)
+ lwz rWORD7, 44(r1)
li rRTN, 1
bgt cr1, L(dureturn29)
- lwz r29,40(r1)
- lwz r28,36(r1)
+ lwz rSHL, 40(r1)
+ lwz rSHR, 36(r1)
li rRTN, -1
b L(dureturn27)
- .align 4
+ .align 4
L(duLcr6):
- lwz r31,48(1)
- lwz r30,44(1)
+ lwz rWORD8, 48(r1)
+ lwz rWORD7, 44(r1)
li rRTN, 1
bgt cr6, L(dureturn29)
- lwz r29,40(r1)
- lwz r28,36(r1)
+ lwz rSHL, 40(r1)
+ lwz rSHR, 36(r1)
li rRTN, -1
b L(dureturn27)
- .align 4
+ .align 4
L(duLcr5):
- lwz r31,48(1)
- lwz r30,44(1)
+ lwz rWORD8, 48(r1)
+ lwz rWORD7, 44(r1)
li rRTN, 1
bgt cr5, L(dureturn29)
- lwz r29,40(r1)
- lwz r28,36(r1)
+ lwz rSHL, 40(r1)
+ lwz rSHR, 36(r1)
li rRTN, -1
b L(dureturn27)
.align 3
L(duZeroReturn):
- li rRTN,0
+ li rRTN, 0
.align 4
L(dureturn):
- lwz r31,48(1)
- lwz r30,44(1)
+ lwz rWORD8, 48(r1)
+ lwz rWORD7, 44(r1)
L(dureturn29):
- lwz r29,40(r1)
- lwz r28,36(r1)
+ lwz rSHL, 40(r1)
+ lwz rSHR, 36(r1)
L(dureturn27):
- lwz r27,32(r1)
+ lwz rWORD8_SHIFT, 32(r1)
L(dureturn26):
- lwz r26,28(r1)
+ lwz rWORD2_SHIFT, 28(r1)
L(dureturn25):
- lwz r25,24(r1)
- lwz r24,20(r1)
- lwz 1,0(1)
+ lwz rWORD4_SHIFT, 24(r1)
+ lwz rWORD6_SHIFT, 20(r1)
+ addi 1, 1, 64
+ cfi_adjust_cfa_offset(-64)
blr
END (memcmp)
diff --git a/libc/sysdeps/powerpc/powerpc32/power4/memcpy.S b/libc/sysdeps/powerpc/powerpc32/power4/memcpy.S
index d9146631e..338d3cce3 100644
--- a/libc/sysdeps/powerpc/powerpc32/power4/memcpy.S
+++ b/libc/sysdeps/powerpc/powerpc32/power4/memcpy.S
@@ -203,15 +203,28 @@ EALIGN (memcpy, 5, 0)
blt cr6,5f
srwi 7,6,16
bgt cr6,3f
+#ifdef __LITTLE_ENDIAN__
+ sth 7,0(3)
+#else
sth 6,0(3)
+#endif
b 7f
.align 4
3:
+#ifdef __LITTLE_ENDIAN__
+ rotlwi 6,6,24
+ stb 6,0(3)
+ sth 7,1(3)
+#else
stb 7,0(3)
sth 6,1(3)
+#endif
b 7f
.align 4
5:
+#ifdef __LITTLE_ENDIAN__
+ rotlwi 6,6,8
+#endif
stb 6,0(3)
7:
cmplwi cr1,10,16
@@ -339,13 +352,23 @@ EALIGN (memcpy, 5, 0)
bf 30,1f
/* there are at least two words to copy, so copy them */
+#ifdef __LITTLE_ENDIAN__
+ srw 0,6,10
+ slw 8,7,9
+#else
slw 0,6,10 /* shift 1st src word to left align it in R0 */
srw 8,7,9 /* shift 2nd src word to right align it in R8 */
+#endif
or 0,0,8 /* or them to get word to store */
lwz 6,8(5) /* load the 3rd src word */
stw 0,0(4) /* store the 1st dst word */
+#ifdef __LITTLE_ENDIAN__
+ srw 0,7,10
+ slw 8,6,9
+#else
slw 0,7,10 /* now left align 2nd src word into R0 */
srw 8,6,9 /* shift 3rd src word to right align it in R8 */
+#endif
or 0,0,8 /* or them to get word to store */
lwz 7,12(5)
stw 0,4(4) /* store the 2nd dst word */
@@ -353,8 +376,13 @@ EALIGN (memcpy, 5, 0)
addi 5,5,16
bf 31,4f
/* there is a third word to copy, so copy it */
+#ifdef __LITTLE_ENDIAN__
+ srw 0,6,10
+ slw 8,7,9
+#else
slw 0,6,10 /* shift 3rd src word to left align it in R0 */
srw 8,7,9 /* shift 4th src word to right align it in R8 */
+#endif
or 0,0,8 /* or them to get word to store */
stw 0,0(4) /* store 3rd dst word */
mr 6,7
@@ -364,8 +392,13 @@ EALIGN (memcpy, 5, 0)
b 4f
.align 4
1:
+#ifdef __LITTLE_ENDIAN__
+ srw 0,6,10
+ slw 8,7,9
+#else
slw 0,6,10 /* shift 1st src word to left align it in R0 */
srw 8,7,9 /* shift 2nd src word to right align it in R8 */
+#endif
addi 5,5,8
or 0,0,8 /* or them to get word to store */
bf 31,4f
@@ -378,23 +411,43 @@ EALIGN (memcpy, 5, 0)
.align 4
4:
/* copy 16 bytes at a time */
+#ifdef __LITTLE_ENDIAN__
+ srw 0,6,10
+ slw 8,7,9
+#else
slw 0,6,10
srw 8,7,9
+#endif
or 0,0,8
lwz 6,0(5)
stw 0,0(4)
+#ifdef __LITTLE_ENDIAN__
+ srw 0,7,10
+ slw 8,6,9
+#else
slw 0,7,10
srw 8,6,9
+#endif
or 0,0,8
lwz 7,4(5)
stw 0,4(4)
+#ifdef __LITTLE_ENDIAN__
+ srw 0,6,10
+ slw 8,7,9
+#else
slw 0,6,10
srw 8,7,9
+#endif
or 0,0,8
lwz 6,8(5)
stw 0,8(4)
+#ifdef __LITTLE_ENDIAN__
+ srw 0,7,10
+ slw 8,6,9
+#else
slw 0,7,10
srw 8,6,9
+#endif
or 0,0,8
lwz 7,12(5)
stw 0,12(4)
@@ -403,8 +456,13 @@ EALIGN (memcpy, 5, 0)
bdnz+ 4b
8:
/* calculate and store the final word */
+#ifdef __LITTLE_ENDIAN__
+ srw 0,6,10
+ slw 8,7,9
+#else
slw 0,6,10
srw 8,7,9
+#endif
or 0,0,8
stw 0,0(4)
3:
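The pattern being wrapped in __LITTLE_ENDIAN__ above is the usual two-word merge for a source that is not word aligned; on big-endian the earlier bytes sit in the high half of a word (shift the first word left), on little-endian they sit in the low half, so the two shifts swap direction. A rough C sketch of one such copy loop, assuming the caller guarantees nwords + 1 aligned source words are readable, as the assembly arranges:

#include <stdint.h>
#include <stddef.h>

static void copy_from_unaligned_src (uint32_t *dst, const uint32_t *src_w,
                                     unsigned mis /* 1..3 */, size_t nwords)
{
  unsigned sh1 = 8 * mis, sh2 = 32 - sh1;
  uint32_t a = src_w[0];
  for (size_t i = 0; i < nwords; i++)
    {
      uint32_t b = src_w[i + 1];
#ifdef __LITTLE_ENDIAN__
      dst[i] = (a >> sh1) | (b << sh2);   /* srw / slw variant */
#else
      dst[i] = (a << sh1) | (b >> sh2);   /* slw / srw variant */
#endif
      a = b;
    }
}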
diff --git a/libc/sysdeps/powerpc/powerpc32/power4/memset.S b/libc/sysdeps/powerpc/powerpc32/power4/memset.S
index c2d288b38..4fd9d8cb4 100644
--- a/libc/sysdeps/powerpc/powerpc32/power4/memset.S
+++ b/libc/sysdeps/powerpc/powerpc32/power4/memset.S
@@ -50,7 +50,7 @@ L(_memset):
/* Align to word boundary. */
cmplwi cr5, rLEN, 31
- rlwimi rCHR, rCHR, 8, 16, 23 /* Replicate byte to halfword. */
+ insrdi rCHR, rCHR, 8, 48 /* Replicate byte to halfword. */
beq+ L(aligned)
mtcrf 0x01, rMEMP0
subfic rALIGN, rALIGN, 4
@@ -65,7 +65,7 @@ L(g0):
/* Handle the case of size < 31. */
L(aligned):
mtcrf 0x01, rLEN
- rlwimi rCHR, rCHR, 16, 0, 15 /* Replicate halfword to word. */
+ insrdi rCHR, rCHR, 16, 32 /* Replicate halfword to word. */
ble cr5, L(medium)
/* Align to 32-byte boundary. */
andi. rALIGN, rMEMP, 0x1C
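Whether written with rlwimi or insrdi, the replication sequence above is the standard byte-splat idiom; in C it is two shifted ORs:

#include <stdint.h>

/* Splat the low byte of c across a 32-bit word, as the two
   insert instructions above do.  */
static uint32_t splat_byte (uint32_t c)
{
  c &= 0xff;
  c |= c << 8;    /* byte     -> halfword */
  c |= c << 16;   /* halfword -> word     */
  return c;
}

For example, splat_byte (0x2a) yields 0x2a2a2a2a.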
diff --git a/libc/sysdeps/powerpc/powerpc32/power4/strncmp.S b/libc/sysdeps/powerpc/powerpc32/power4/strncmp.S
index 724d9084a..89b961e78 100644
--- a/libc/sysdeps/powerpc/powerpc32/power4/strncmp.S
+++ b/libc/sysdeps/powerpc/powerpc32/power4/strncmp.S
@@ -24,7 +24,7 @@
EALIGN (strncmp, 4, 0)
-#define rTMP r0
+#define rTMP2 r0
#define rRTN r3
#define rSTR1 r3 /* first string arg */
#define rSTR2 r4 /* second string arg */
@@ -37,6 +37,7 @@ EALIGN (strncmp, 4, 0)
#define r7F7F r9 /* constant 0x7f7f7f7f */
#define rNEG r10 /* ~(word in s1 | 0x7f7f7f7f) */
#define rBITDIF r11 /* bits that differ in s1 & s2 words */
+#define rTMP r12
dcbt 0,rSTR1
or rTMP, rSTR2, rSTR1
@@ -75,12 +76,45 @@ L(g1): add rTMP, rFEFE, rWORD1
we don't compare two strings as different because of gunk beyond
the end of the strings... */
+#ifdef __LITTLE_ENDIAN__
+L(endstring):
+ slwi rTMP, rTMP, 1
+ addi rTMP2, rTMP, -1
+ andc rTMP2, rTMP2, rTMP
+ and rWORD2, rWORD2, rTMP2 /* Mask off gunk. */
+ and rWORD1, rWORD1, rTMP2
+ rlwinm rTMP2, rWORD2, 8, 0xffffffff /* Byte reverse word. */
+ rlwinm rTMP, rWORD1, 8, 0xffffffff
+ rldimi rTMP2, rWORD2, 24, 32
+ rldimi rTMP, rWORD1, 24, 32
+ rlwimi rTMP2, rWORD2, 24, 16, 23
+ rlwimi rTMP, rWORD1, 24, 16, 23
+ xor. rBITDIF, rTMP, rTMP2
+ sub rRTN, rTMP, rTMP2
+ bgelr+
+ ori rRTN, rTMP2, 1
+ blr
+
+L(different):
+ lwz rWORD1, -4(rSTR1)
+ rlwinm rTMP2, rWORD2, 8, 0xffffffff /* Byte reverse word. */
+ rlwinm rTMP, rWORD1, 8, 0xffffffff
+ rldimi rTMP2, rWORD2, 24, 32
+ rldimi rTMP, rWORD1, 24, 32
+ rlwimi rTMP2, rWORD2, 24, 16, 23
+ rlwimi rTMP, rWORD1, 24, 16, 23
+ xor. rBITDIF, rTMP, rTMP2
+ sub rRTN, rTMP, rTMP2
+ bgelr+
+ ori rRTN, rTMP2, 1
+ blr
+
+#else
L(endstring):
and rTMP, r7F7F, rWORD1
beq cr1, L(equal)
add rTMP, rTMP, r7F7F
xor. rBITDIF, rWORD1, rWORD2
-
andc rNEG, rNEG, rTMP
blt- L(highbit)
cntlzw rBITDIF, rBITDIF
@@ -88,28 +122,20 @@ L(endstring):
addi rNEG, rNEG, 7
cmpw cr1, rNEG, rBITDIF
sub rRTN, rWORD1, rWORD2
- blt- cr1, L(equal)
- srawi rRTN, rRTN, 31
- ori rRTN, rRTN, 1
- blr
+ bgelr+ cr1
L(equal):
li rRTN, 0
blr
L(different):
- lwzu rWORD1, -4(rSTR1)
+ lwz rWORD1, -4(rSTR1)
xor. rBITDIF, rWORD1, rWORD2
sub rRTN, rWORD1, rWORD2
- blt- L(highbit)
- srawi rRTN, rRTN, 31
- ori rRTN, rRTN, 1
- blr
+ bgelr+
L(highbit):
- srwi rWORD2, rWORD2, 24
- srwi rWORD1, rWORD1, 24
- sub rRTN, rWORD1, rWORD2
+ ori rRTN, rWORD2, 1
blr
-
+#endif
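The little-endian L(endstring) path above can be read as: build a mask covering the bytes up to and including the first NUL flagged in the detection word, mask both words, byte-reverse them so an ordinary unsigned compare reflects string order, and take the difference. A rough C equivalent is sketched below; it returns only the sign (the assembly returns a different nonzero magnitude) and uses a portable byte-swap helper in place of the rlwinm/rldimi/rlwimi sequence:

#include <stdint.h>

static uint32_t bswap32 (uint32_t x)
{
  return (x >> 24) | ((x >> 8) & 0x0000ff00u)
         | ((x << 8) & 0x00ff0000u) | (x << 24);
}

/* zmask has 0x80 set in (at least) the first zero byte of w1; on
   little-endian that byte is the lowest-order one.  When zmask is 0
   (the count ran out instead), the mask keeps the whole word.  */
static int endstring_le (uint32_t w1, uint32_t w2, uint32_t zmask)
{
  uint32_t t = zmask << 1;               /* slwi rTMP, rTMP, 1          */
  uint32_t keep = (t - 1) & ~t;          /* addi/andc: bytes <= 1st NUL */
  uint32_t a = bswap32 (w1 & keep);
  uint32_t b = bswap32 (w2 & keep);
  return a == b ? 0 : (a > b ? 1 : -1);
}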
/* Oh well. In this case, we just do a byte-by-byte comparison. */
.align 4
diff --git a/libc/sysdeps/powerpc/powerpc32/power5+/fpu/s_llround.S b/libc/sysdeps/powerpc/powerpc32/power5+/fpu/s_llround.S
index ecd37c3cd..49c8a0866 100644
--- a/libc/sysdeps/powerpc/powerpc32/power5+/fpu/s_llround.S
+++ b/libc/sysdeps/powerpc/powerpc32/power5+/fpu/s_llround.S
@@ -39,8 +39,8 @@ ENTRY (__llround)
nop /* Ensure the following load is in a different dispatch */
nop /* group to avoid pipe stall on POWER4&5. */
nop
- lwz r4,12(r1)
- lwz r3,8(r1)
+ lwz r3,8+HIWORD(r1)
+ lwz r4,8+LOWORD(r1)
addi r1,r1,16
blr
END (__llround)
diff --git a/libc/sysdeps/powerpc/powerpc32/power5+/fpu/s_lround.S b/libc/sysdeps/powerpc/powerpc32/power5+/fpu/s_lround.S
index d4da625bb..780dd9ca4 100644
--- a/libc/sysdeps/powerpc/powerpc32/power5+/fpu/s_lround.S
+++ b/libc/sysdeps/powerpc/powerpc32/power5+/fpu/s_lround.S
@@ -38,7 +38,7 @@ ENTRY (__lround)
nop /* Ensure the following load is in a different dispatch */
nop /* group to avoid pipe stall on POWER4&5. */
nop
- lwz r3,12(r1)
+ lwz r3,8+LOWORD(r1)
addi r1,r1,16
blr
END (__lround)
diff --git a/libc/sysdeps/powerpc/powerpc32/power5/fpu/s_isnan.S b/libc/sysdeps/powerpc/powerpc32/power5/fpu/s_isnan.S
index f2417fdf4..5f7ba43a2 100644
--- a/libc/sysdeps/powerpc/powerpc32/power5/fpu/s_isnan.S
+++ b/libc/sysdeps/powerpc/powerpc32/power5/fpu/s_isnan.S
@@ -27,8 +27,8 @@ EALIGN (__isnan, 4, 0)
ori r1,r1,0
stfd fp1,24(r1) /* copy FPR to GPR */
ori r1,r1,0
- lwz r4,24(r1)
- lwz r5,28(r1)
+ lwz r4,24+HIWORD(r1)
+ lwz r5,24+LOWORD(r1)
lis r0,0x7ff0 /* const long r0 0x7ff00000 00000000 */
clrlwi r4,r4,1 /* x = fabs(x) */
cmpw cr7,r4,r0 /* if (fabs(x) <= inf) */
diff --git a/libc/sysdeps/powerpc/powerpc32/power6/fpu/s_isnan.S b/libc/sysdeps/powerpc/powerpc32/power6/fpu/s_isnan.S
index 2c095db1d..3ea18589c 100644
--- a/libc/sysdeps/powerpc/powerpc32/power6/fpu/s_isnan.S
+++ b/libc/sysdeps/powerpc/powerpc32/power6/fpu/s_isnan.S
@@ -27,8 +27,8 @@ EALIGN (__isnan, 4, 0)
ori r1,r1,0
stfd fp1,24(r1) /* copy FPR to GPR */
ori r1,r1,0
- lwz r4,24(r1)
- lwz r5,28(r1)
+ lwz r4,24+HIWORD(r1)
+ lwz r5,24+LOWORD(r1)
lis r0,0x7ff0 /* const long r0 0x7ff00000 00000000 */
clrlwi r4,r4,1 /* x = fabs(x) */
cmpw cr7,r4,r0 /* if (fabs(x) =< inf) */
diff --git a/libc/sysdeps/powerpc/powerpc32/power6/fpu/s_llrint.S b/libc/sysdeps/powerpc/powerpc32/power6/fpu/s_llrint.S
index 3344b312e..c0660cf6e 100644
--- a/libc/sysdeps/powerpc/powerpc32/power6/fpu/s_llrint.S
+++ b/libc/sysdeps/powerpc/powerpc32/power6/fpu/s_llrint.S
@@ -29,8 +29,8 @@ ENTRY (__llrint)
/* Ensure the following load is in a different dispatch group by
inserting "group ending nop". */
ori r1,r1,0
- lwz r3,8(r1)
- lwz r4,12(r1)
+ lwz r3,8+HIWORD(r1)
+ lwz r4,8+LOWORD(r1)
addi r1,r1,16
blr
END (__llrint)
diff --git a/libc/sysdeps/powerpc/powerpc32/power6/fpu/s_llrintf.S b/libc/sysdeps/powerpc/powerpc32/power6/fpu/s_llrintf.S
index 7f64f8d12..ce298905c 100644
--- a/libc/sysdeps/powerpc/powerpc32/power6/fpu/s_llrintf.S
+++ b/libc/sysdeps/powerpc/powerpc32/power6/fpu/s_llrintf.S
@@ -28,8 +28,8 @@ ENTRY (__llrintf)
/* Ensure the following load is in a different dispatch group by
inserting "group ending nop". */
ori r1,r1,0
- lwz r3,8(r1)
- lwz r4,12(r1)
+ lwz r3,8+HIWORD(r1)
+ lwz r4,8+LOWORD(r1)
addi r1,r1,16
blr
END (__llrintf)
diff --git a/libc/sysdeps/powerpc/powerpc32/power6/fpu/s_llround.S b/libc/sysdeps/powerpc/powerpc32/power6/fpu/s_llround.S
index 0ff04cb71..abb0840d1 100644
--- a/libc/sysdeps/powerpc/powerpc32/power6/fpu/s_llround.S
+++ b/libc/sysdeps/powerpc/powerpc32/power6/fpu/s_llround.S
@@ -39,8 +39,8 @@ ENTRY (__llround)
/* Ensure the following load is in a different dispatch group by
inserting "group ending nop". */
ori r1,r1,0
- lwz r4,12(r1)
- lwz r3,8(r1)
+ lwz r3,8+HIWORD(r1)
+ lwz r4,8+LOWORD(r1)
addi r1,r1,16
blr
END (__llround)
diff --git a/libc/sysdeps/powerpc/powerpc32/power6/memcpy.S b/libc/sysdeps/powerpc/powerpc32/power6/memcpy.S
index a76f71e04..f58114a0c 100644
--- a/libc/sysdeps/powerpc/powerpc32/power6/memcpy.S
+++ b/libc/sysdeps/powerpc/powerpc32/power6/memcpy.S
@@ -219,15 +219,28 @@ L(word_unaligned_short):
blt cr6,5f
srwi 7,6,16
bgt cr6,3f
+#ifdef __LITTLE_ENDIAN__
+ sth 7,0(3)
+#else
sth 6,0(3)
+#endif
b 7f
.align 4
3:
+#ifdef __LITTLE_ENDIAN__
+ rotlwi 6,6,24
+ stb 6,0(3)
+ sth 7,1(3)
+#else
stb 7,0(3)
sth 6,1(3)
+#endif
b 7f
.align 4
5:
+#ifdef __LITTLE_ENDIAN__
+ rotlwi 6,6,8
+#endif
stb 6,0(3)
7:
cmplwi cr1,10,16
@@ -577,7 +590,11 @@ L(wdu1_32):
lwz 6,-1(4)
cmplwi cr6,31,4
srwi 8,31,5 /* calculate the 32 byte loop count */
+#ifdef __LITTLE_ENDIAN__
+ srwi 6,6,8
+#else
slwi 6,6,8
+#endif
clrlwi 31,31,27 /* The remaining bytes, < 32. */
blt cr5,L(wdu1_32tail)
mtctr 8
@@ -585,8 +602,12 @@ L(wdu1_32):
lwz 8,3(4)
lwz 7,4(4)
+#ifdef __LITTLE_ENDIAN__
+ rldimi 6,8,24,32
+#else
/* Equivalent to: srwi 8,8,32-8; or 6,6,8 */
rlwimi 6,8,8,(32-8),31
+#endif
b L(wdu1_loop32x)
.align 4
L(wdu1_loop32):
@@ -595,8 +616,12 @@ L(wdu1_loop32):
lwz 7,4(4)
stw 10,-8(3)
stw 11,-4(3)
+#ifdef __LITTLE_ENDIAN__
+ rldimi 6,8,24,32
+#else
/* Equivalent to srwi 8,8,32-8; or 6,6,8 */
rlwimi 6,8,8,(32-8),31
+#endif
L(wdu1_loop32x):
lwz 10,8(4)
lwz 11,12(4)
@@ -613,7 +638,11 @@ L(wdu1_loop32x):
stw 6,16(3)
stw 7,20(3)
addi 3,3,32
+#ifdef __LITTLE_ENDIAN__
+ srwi 6,8,8
+#else
slwi 6,8,8
+#endif
bdnz+ L(wdu1_loop32)
stw 10,-8(3)
stw 11,-4(3)
@@ -624,8 +653,12 @@ L(wdu1_32tail):
blt cr6,L(wdu_4tail)
/* calculate and store the final word */
lwz 8,3(4)
-/* Equivalent to: srwi 8,8,32-9; or 6,6,8 */
+#ifdef __LITTLE_ENDIAN__
+ rldimi 6,8,24,32
+#else
+/* Equivalent to: srwi 8,8,32-8; or 6,6,8 */
rlwimi 6,8,8,(32-8),31
+#endif
b L(wdu_32tailx)
L(wdu2_32):
@@ -633,7 +666,11 @@ L(wdu2_32):
lwz 6,-2(4)
cmplwi cr6,31,4
srwi 8,31,5 /* calculate the 32 byte loop count */
+#ifdef __LITTLE_ENDIAN__
+ srwi 6,6,16
+#else
slwi 6,6,16
+#endif
clrlwi 31,31,27 /* The remaining bytes, < 32. */
blt cr5,L(wdu2_32tail)
mtctr 8
@@ -641,8 +678,11 @@ L(wdu2_32):
lwz 8,2(4)
lwz 7,4(4)
-/* Equivalent to: srwi 8,8,32-8; or 6,6,8 */
+#ifdef __LITTLE_ENDIAN__
+ rldimi 6,8,16,32
+#else
rlwimi 6,8,16,(32-16),31
+#endif
b L(wdu2_loop32x)
.align 4
L(wdu2_loop32):
@@ -651,8 +691,11 @@ L(wdu2_loop32):
lwz 7,4(4)
stw 10,-8(3)
stw 11,-4(3)
-/* Equivalent to srwi 8,8,32-8; or 6,6,8 */
+#ifdef __LITTLE_ENDIAN__
+ rldimi 6,8,16,32
+#else
rlwimi 6,8,16,(32-16),31
+#endif
L(wdu2_loop32x):
lwz 10,8(4)
lwz 11,12(4)
@@ -670,7 +713,11 @@ L(wdu2_loop32x):
stw 6,16(3)
stw 7,20(3)
addi 3,3,32
+#ifdef __LITTLE_ENDIAN__
+ srwi 6,8,16
+#else
slwi 6,8,16
+#endif
bdnz+ L(wdu2_loop32)
stw 10,-8(3)
stw 11,-4(3)
@@ -681,8 +728,11 @@ L(wdu2_32tail):
blt cr6,L(wdu_4tail)
/* calculate and store the final word */
lwz 8,2(4)
-/* Equivalent to: srwi 8,8,32-9; or 6,6,8 */
+#ifdef __LITTLE_ENDIAN__
+ rldimi 6,8,16,32
+#else
rlwimi 6,8,16,(32-16),31
+#endif
b L(wdu_32tailx)
L(wdu3_32):
@@ -690,7 +740,11 @@ L(wdu3_32):
lwz 6,-3(4)
cmplwi cr6,31,4
srwi 8,31,5 /* calculate the 32 byte loop count */
+#ifdef __LITTLE_ENDIAN__
+ srwi 6,6,24
+#else
slwi 6,6,24
+#endif
clrlwi 31,31,27 /* The remaining bytes, < 32. */
blt cr5,L(wdu3_32tail)
mtctr 8
@@ -698,8 +752,11 @@ L(wdu3_32):
lwz 8,1(4)
lwz 7,4(4)
-/* Equivalent to: srwi 8,8,32-8; or 6,6,8 */
+#ifdef __LITTLE_ENDIAN__
+ rldimi 6,8,8,32
+#else
rlwimi 6,8,24,(32-24),31
+#endif
b L(wdu3_loop32x)
.align 4
L(wdu3_loop32):
@@ -708,8 +765,11 @@ L(wdu3_loop32):
lwz 7,4(4)
stw 10,-8(3)
stw 11,-4(3)
-/* Equivalent to srwi 8,8,32-8; or 6,6,8 */
+#ifdef __LITTLE_ENDIAN__
+ rldimi 6,8,8,32
+#else
rlwimi 6,8,24,(32-24),31
+#endif
L(wdu3_loop32x):
lwz 10,8(4)
lwz 11,12(4)
@@ -726,7 +786,11 @@ L(wdu3_loop32x):
stw 6,16(3)
stw 7,20(3)
addi 3,3,32
+#ifdef __LITTLE_ENDIAN__
+ srwi 6,8,24
+#else
slwi 6,8,24
+#endif
bdnz+ L(wdu3_loop32)
stw 10,-8(3)
stw 11,-4(3)
@@ -737,8 +801,11 @@ L(wdu3_32tail):
blt cr6,L(wdu_4tail)
/* calculate and store the final word */
lwz 8,1(4)
-/* Equivalent to: srwi 8,8,32-9; or 6,6,8 */
+#ifdef __LITTLE_ENDIAN__
+ rldimi 6,8,8,32
+#else
rlwimi 6,8,24,(32-24),31
+#endif
b L(wdu_32tailx)
.align 4
L(wdu_32tailx):
diff --git a/libc/sysdeps/powerpc/powerpc32/power6/memset.S b/libc/sysdeps/powerpc/powerpc32/power6/memset.S
index 8c23c8d13..a4b002a96 100644
--- a/libc/sysdeps/powerpc/powerpc32/power6/memset.S
+++ b/libc/sysdeps/powerpc/powerpc32/power6/memset.S
@@ -48,7 +48,7 @@ L(_memset):
ble- cr1, L(small)
/* Align to word boundary. */
cmplwi cr5, rLEN, 31
- rlwimi rCHR, rCHR, 8, 16, 23 /* Replicate byte to halfword. */
+ insrdi rCHR, rCHR, 8, 48 /* Replicate byte to halfword. */
beq+ L(aligned)
mtcrf 0x01, rMEMP0
subfic rALIGN, rALIGN, 4
@@ -64,7 +64,7 @@ L(g0):
/* Handle the case of size < 31. */
L(aligned):
mtcrf 0x01, rLEN
- rlwimi rCHR, rCHR, 16, 0, 15 /* Replicate halfword to word. */
+ insrdi rCHR, rCHR, 16, 32 /* Replicate halfword to word. */
ble cr5, L(medium)
/* Align to 32-byte boundary. */
andi. rALIGN, rMEMP, 0x1C
diff --git a/libc/sysdeps/powerpc/powerpc32/power7/fpu/s_finite.S b/libc/sysdeps/powerpc/powerpc32/power7/fpu/s_finite.S
index b2ab5bfe7..095c15547 100644
--- a/libc/sysdeps/powerpc/powerpc32/power7/fpu/s_finite.S
+++ b/libc/sysdeps/powerpc/powerpc32/power7/fpu/s_finite.S
@@ -54,9 +54,8 @@ ENTRY (__finite)
stfd fp1,8(r1) /* Transfer FP to GPR's. */
ori 2,2,0 /* Force a new dispatch group. */
- lhz r0,8(r1) /* Fetch the upper portion of the high word of
- the FP value (where the exponent and sign bits
- are). */
+ lhz r0,8+HISHORT(r1) /* Fetch the upper 16 bits of the FP value
+ (biased exponent and sign bit). */
clrlwi r0,r0,17 /* r0 = abs(r0). */
addi r1,r1,16 /* Reset the stack pointer. */
cmpwi cr7,r0,0x7ff0 /* r4 == 0x7ff0?. */
diff --git a/libc/sysdeps/powerpc/powerpc32/power7/fpu/s_isinf.S b/libc/sysdeps/powerpc/powerpc32/power7/fpu/s_isinf.S
index 3f8af60a5..0101c8fa1 100644
--- a/libc/sysdeps/powerpc/powerpc32/power7/fpu/s_isinf.S
+++ b/libc/sysdeps/powerpc/powerpc32/power7/fpu/s_isinf.S
@@ -48,14 +48,13 @@ ENTRY (__isinf)
li r3,0
bflr 29 /* If not INF, return. */
- /* Either we have -INF/+INF or a denormal. */
+ /* Either we have +INF or -INF. */
stwu r1,-16(r1) /* Allocate stack space. */
stfd fp1,8(r1) /* Transfer FP to GPR's. */
ori 2,2,0 /* Force a new dispatch group. */
- lhz r4,8(r1) /* Fetch the upper portion of the high word of
- the FP value (where the exponent and sign bits
- are). */
+ lhz r4,8+HISHORT(r1) /* Fetch the upper 16 bits of the FP value
+ (biased exponent and sign bit). */
addi r1,r1,16 /* Reset the stack pointer. */
cmpwi cr7,r4,0x7ff0 /* r4 == 0x7ff0? */
li r3,1
diff --git a/libc/sysdeps/powerpc/powerpc32/power7/fpu/s_isnan.S b/libc/sysdeps/powerpc/powerpc32/power7/fpu/s_isnan.S
index 99ff12696..0ad1dcf1f 100644
--- a/libc/sysdeps/powerpc/powerpc32/power7/fpu/s_isnan.S
+++ b/libc/sysdeps/powerpc/powerpc32/power7/fpu/s_isnan.S
@@ -53,8 +53,8 @@ ENTRY (__isnan)
stwu r1,-16(r1) /* Allocate stack space. */
stfd fp1,8(r1) /* Transfer FP to GPR's. */
ori 2,2,0 /* Force a new dispatch group. */
- lwz r4,8(r1) /* Load the upper half of the FP value. */
- lwz r5,12(r1) /* Load the lower half of the FP value. */
+ lwz r4,8+HIWORD(r1) /* Load the upper half of the FP value. */
+ lwz r5,8+LOWORD(r1) /* Load the lower half of the FP value. */
addi r1,r1,16 /* Reset the stack pointer. */
lis r0,0x7ff0 /* Load the upper portion for an INF/NaN. */
clrlwi r4,r4,1 /* r4 = abs(r4). */
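HIWORD/LOWORD (defined elsewhere in the sysdeps headers) pick out whichever 32-bit half of the spilled double holds the sign and exponent, so the same code works on both byte orders. A C sketch of the classification test these routines perform, using a union instead of the stack store (a sketch, not the installed implementation):

#include <stdint.h>

static int isnan_sketch (double x)
{
  union { double d; uint32_t w[2]; } u = { .d = x };
#ifdef __LITTLE_ENDIAN__
  uint32_t hi = u.w[1], lo = u.w[0];   /* low word comes first in memory */
#else
  uint32_t hi = u.w[0], lo = u.w[1];
#endif
  hi &= 0x7fffffffu;                   /* clrlwi r4,r4,1: drop the sign  */
  /* NaN: exponent field all ones and a nonzero mantissa.  */
  return hi > 0x7ff00000u || (hi == 0x7ff00000u && lo != 0);
}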
diff --git a/libc/sysdeps/powerpc/powerpc32/power7/fpu/s_logbl.c b/libc/sysdeps/powerpc/powerpc32/power7/fpu/s_logbl.c
index e008ed0c3..1c82577f5 100644
--- a/libc/sysdeps/powerpc/powerpc32/power7/fpu/s_logbl.c
+++ b/libc/sysdeps/powerpc/powerpc32/power7/fpu/s_logbl.c
@@ -35,14 +35,14 @@ static const union {
long double
__logbl (long double x)
{
- double xh, xl;
+ double xh;
double ret;
if (__builtin_expect (x == 0.0L, 0))
/* Raise FE_DIVBYZERO and return -HUGE_VAL[LF]. */
return -1.0L / __builtin_fabsl (x);
- ldbl_unpack (x, &xh, &xl);
+ xh = ldbl_high (x);
/* ret = x & 0x7ff0000000000000; */
asm (
"xxland %x0,%x1,%x2\n"
@@ -58,9 +58,9 @@ __logbl (long double x)
{
/* POSIX specifies that a denormal number is treated as
though it were normalized. */
- int64_t lx, hx;
+ int64_t hx;
- GET_LDOUBLE_WORDS64 (hx, lx, x);
+ EXTRACT_WORDS64 (hx, xh);
return (long double) (-1023 - (__builtin_clzll (hx) - 12));
}
/* Test to avoid logb_downward (0.0) == -0.0. */
diff --git a/libc/sysdeps/powerpc/powerpc32/power7/memchr.S b/libc/sysdeps/powerpc/powerpc32/power7/memchr.S
index 369e5e048..85754f3f1 100644
--- a/libc/sysdeps/powerpc/powerpc32/power7/memchr.S
+++ b/libc/sysdeps/powerpc/powerpc32/power7/memchr.S
@@ -25,107 +25,111 @@ ENTRY (__memchr)
CALL_MCOUNT
dcbt 0,r3
clrrwi r8,r3,2
- rlwimi r4,r4,8,16,23
- rlwimi r4,r4,16,0,15
+ insrdi r4,r4,8,48
add r7,r3,r5 /* Calculate the last acceptable address. */
+ insrdi r4,r4,16,32
cmplwi r5,16
+ li r9, -1
+ rlwinm r6,r3,3,27,28 /* Calculate padding. */
+ addi r7,r7,-1
+#ifdef __LITTLE_ENDIAN__
+ slw r9,r9,r6
+#else
+ srw r9,r9,r6
+#endif
ble L(small_range)
- cmplw cr7,r3,r7 /* Compare the starting address (r3) with the
- ending address (r7). If (r3 >= r7), the size
- passed in is zero or negative. */
- ble cr7,L(proceed)
-
- li r7,-1 /* Artificially set our ending address (r7)
- such that we will exit early. */
-L(proceed):
- rlwinm r6,r3,3,27,28 /* Calculate padding. */
- cmpli cr6,r6,0 /* cr6 == Do we have padding? */
lwz r12,0(r8) /* Load word from memory. */
- cmpb r10,r12,r4 /* Check for BYTEs in WORD1. */
- beq cr6,L(proceed_no_padding)
- slw r10,r10,r6
- srw r10,r10,r6
-L(proceed_no_padding):
- cmplwi cr7,r10,0 /* If r10 == 0, no BYTEs have been found. */
+ cmpb r3,r12,r4 /* Check for BYTEs in WORD1. */
+ and r3,r3,r9
+ clrlwi r5,r7,30 /* Byte count - 1 in last word. */
+ clrrwi r7,r7,2 /* Address of last word. */
+ cmplwi cr7,r3,0 /* If r3 == 0, no BYTEs have been found. */
bne cr7,L(done)
- /* Are we done already? */
- addi r9,r8,4
- cmplw cr6,r9,r7
- bge cr6,L(null)
-
mtcrf 0x01,r8
/* Are we now aligned to a doubleword boundary? If so, skip to
the main loop. Otherwise, go through the alignment code. */
-
bt 29,L(loop_setup)
/* Handle WORD2 of pair. */
lwzu r12,4(r8)
- cmpb r10,r12,r4
- cmplwi cr7,r10,0
+ cmpb r3,r12,r4
+ cmplwi cr7,r3,0
bne cr7,L(done)
- /* Are we done already? */
- addi r9,r8,4
- cmplw cr6,r9,r7
- bge cr6,L(null)
-
L(loop_setup):
- sub r5,r7,r9
- srwi r6,r5,3 /* Number of loop iterations. */
+ /* The last word we want to read in the loop below is the one
+ containing the last byte of the string, ie. the word at
+ (s + size - 1) & ~3, or r7. The first word read is at
+ r8 + 4, we read 2 * cnt words, so the last word read will
+ be at r8 + 4 + 8 * cnt - 4. Solving for cnt gives
+ cnt = (r7 - r8) / 8 */
+ sub r6,r7,r8
+ srwi r6,r6,3 /* Number of loop iterations. */
mtctr r6 /* Setup the counter. */
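The same count can be checked with ordinary pointer arithmetic; a small sketch under the comment's assumptions (r8 is the word-aligned read position, r7 the address of the word holding the last byte):

#include <stdint.h>
#include <stddef.h>

/* Two words are read per iteration starting at r8 + 4, so the last
   word read inside the loop is r8 + 8*cnt; choosing cnt = (r7 - r8)/8
   (rounded down) means at most one extra word remains afterwards,
   which is the "one more dword" case handled after the loop.  */
static size_t loop_count (uintptr_t r8, uintptr_t r7)
{
  return (size_t) (r7 - r8) >> 3;   /* sub r6,r7,r8 ; srwi r6,r6,3 */
}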
- b L(loop)
- /* Main loop to look for BYTE backwards in the string. Since
- it's a small loop (< 8 instructions), align it to 32-bytes. */
- .p2align 5
+
+ /* Main loop to look for BYTE in the string. Since
+ it's a small loop (8 instructions), align it to 32-bytes. */
+ .align 5
L(loop):
/* Load two words, compare and merge in a
single register for speed. This is an attempt
to speed up the byte-checking process for bigger strings. */
lwz r12,4(r8)
lwzu r11,8(r8)
- cmpb r10,r12,r4
+ cmpb r3,r12,r4
cmpb r9,r11,r4
- or r5,r9,r10 /* Merge everything in one word. */
- cmplwi cr7,r5,0
+ or r6,r9,r3 /* Merge everything in one word. */
+ cmplwi cr7,r6,0
bne cr7,L(found)
bdnz L(loop)
- /* We're here because the counter reached 0, and that means we
- didn't have any matches for BYTE in the whole range. */
- subi r11,r7,4
- cmplw cr6,r8,r11
- blt cr6,L(loop_small)
- b L(null)
+ /* We may have one more dword to read. */
+ cmplw r8,r7
+ beqlr
+
+ lwzu r12,4(r8)
+ cmpb r3,r12,r4
+ cmplwi cr6,r3,0
+ bne cr6,L(done)
+ blr
+ .align 4
+L(found):
/* OK, one (or both) of the words contains BYTE. Check
the first word and decrement the address in case the first
word really contains BYTE. */
- .align 4
-L(found):
- cmplwi cr6,r10,0
+ cmplwi cr6,r3,0
addi r8,r8,-4
bne cr6,L(done)
/* BYTE must be in the second word. Adjust the address
- again and move the result of cmpb to r10 so we can calculate the
+ again and move the result of cmpb to r3 so we can calculate the
pointer. */
- mr r10,r9
+ mr r3,r9
addi r8,r8,4
- /* r10 has the output of the cmpb instruction, that is, it contains
+ /* r3 has the output of the cmpb instruction, that is, it contains
0xff in the same position as BYTE in the original
word from the string. Use that to calculate the pointer.
We need to make sure BYTE is *before* the end of the range. */
L(done):
- cntlzw r0,r10 /* Count leading zeroes before the match. */
- srwi r0,r0,3 /* Convert leading zeroes to bytes. */
+#ifdef __LITTLE_ENDIAN__
+ addi r0,r3,-1
+ andc r0,r0,r3
+ popcntw r0,r0 /* Count trailing zeros. */
+#else
+ cntlzw r0,r3 /* Count leading zeros before the match. */
+#endif
+ cmplw r8,r7 /* Are we on the last word? */
+ srwi r0,r0,3 /* Convert leading/trailing zeros to bytes. */
add r3,r8,r0
- cmplw r3,r7
- bge L(null)
+ cmplw cr7,r0,r5 /* If on the last dword, check byte offset. */
+ bnelr
+ blelr cr7
+ li r3,0
blr
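The L(done) path converts the cmpb result (0xff in every matching byte lane) into a byte offset: leading zeros on big-endian, trailing zeros on little-endian, where the addi/andc/popcntw triple is a count-trailing-zeros that is also well defined for a zero input. A sketch of both halves, with cmpb modelled by a hypothetical helper:

#include <stdint.h>

/* Model of cmpb: 0xff in every byte of w that equals the byte c.  */
static uint32_t cmpb32 (uint32_t w, unsigned char c)
{
  uint32_t r = 0;
  for (int i = 0; i < 4; i++)
    if (((w >> (8 * i)) & 0xff) == c)
      r |= 0xffu << (8 * i);
  return r;
}

/* Byte offset (in string order) of the first match; 'match' must be
   nonzero, which holds whenever L(done) is reached after a hit.  */
static unsigned first_match_offset (uint32_t match)
{
#ifdef __LITTLE_ENDIAN__
  return (unsigned) __builtin_ctz (match) >> 3;   /* first byte = low lane  */
#else
  return (unsigned) __builtin_clz (match) >> 3;   /* first byte = high lane */
#endif
}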
.align 4
@@ -137,67 +141,42 @@ L(null):
.align 4
L(small_range):
cmplwi r5,0
- rlwinm r6,r3,3,27,28 /* Calculate padding. */
- beq L(null) /* This branch is for the cmplwi r5,0 above */
+ beq L(null)
lwz r12,0(r8) /* Load word from memory. */
- cmplwi cr6,r6,0 /* cr6 == Do we have padding? */
- cmpb r10,r12,r4 /* Check for BYTE in DWORD1. */
- beq cr6,L(small_no_padding)
- slw r10,r10,r6
- srw r10,r10,r6
-L(small_no_padding):
- cmplwi cr7,r10,0
+ cmpb r3,r12,r4 /* Check for BYTE in DWORD1. */
+ and r3,r3,r9
+ cmplwi cr7,r3,0
+ clrlwi r5,r7,30 /* Byte count - 1 in last word. */
+ clrrwi r7,r7,2 /* Address of last word. */
+ cmplw r8,r7 /* Are we done already? */
bne cr7,L(done)
+ beqlr
- /* Are we done already? */
- addi r9,r8,4
- cmplw r9,r7
- bge L(null)
-
-L(loop_small): /* loop_small has been unrolled. */
lwzu r12,4(r8)
- cmpb r10,r12,r4
- addi r9,r8,4
- cmplwi cr6,r10,0
- cmplw r9,r7
+ cmpb r3,r12,r4
+ cmplwi cr6,r3,0
+ cmplw r8,r7
bne cr6,L(done)
- bge L(null)
+ beqlr
lwzu r12,4(r8)
- cmpb r10,r12,r4
- addi r9,r8,4
- cmplwi cr6,r10,0
- cmplw r9,r7
+ cmpb r3,r12,r4
+ cmplwi cr6,r3,0
+ cmplw r8,r7
bne cr6,L(done)
- bge L(null)
+ beqlr
lwzu r12,4(r8)
- cmpb r10,r12,r4
- addi r9,r8,4
- cmplwi cr6,r10,0
- cmplw r9,r7
+ cmpb r3,r12,r4
+ cmplwi cr6,r3,0
+ cmplw r8,r7
bne cr6,L(done)
- bge L(null)
+ beqlr
lwzu r12,4(r8)
- cmpb r10,r12,r4
- addi r9,r8,4
- cmplwi cr6,r10,0
- cmplw r9,r7
+ cmpb r3,r12,r4
+ cmplwi cr6,r3,0
bne cr6,L(done)
- bge L(null)
-
- /* For most cases we will never get here. Under some combinations of
- padding + length there is a leftover word that still needs to be
- checked. */
- lwzu r12,4(r8)
- cmpb r10,r12,r4
- addi r9,r8,4
- cmplwi cr6,r10,0
- bne cr6,L(done)
-
- /* save a branch and exit directly */
- li r3,0
blr
END (__memchr)
diff --git a/libc/sysdeps/powerpc/powerpc32/power7/memcmp.S b/libc/sysdeps/powerpc/powerpc32/power7/memcmp.S
index 075e19f14..f160ddebf 100644
--- a/libc/sysdeps/powerpc/powerpc32/power7/memcmp.S
+++ b/libc/sysdeps/powerpc/powerpc32/power7/memcmp.S
@@ -23,10 +23,9 @@
size_t size [r5]) */
.machine power7
-EALIGN (memcmp,4,0)
+EALIGN (memcmp, 4, 0)
CALL_MCOUNT
-#define rTMP r0
#define rRTN r3
#define rSTR1 r3 /* first string arg */
#define rSTR2 r4 /* second string arg */
@@ -37,35 +36,32 @@ EALIGN (memcmp,4,0)
#define rWORD4 r9 /* next word in s2 */
#define rWORD5 r10 /* next word in s1 */
#define rWORD6 r11 /* next word in s2 */
-#define rBITDIF r12 /* bits that differ in s1 & s2 words */
#define rWORD7 r30 /* next word in s1 */
#define rWORD8 r31 /* next word in s2 */
- xor rTMP,rSTR2,rSTR1
- cmplwi cr6,rN,0
- cmplwi cr1,rN,12
- clrlwi. rTMP,rTMP,30
- clrlwi rBITDIF,rSTR1,30
- cmplwi cr5,rBITDIF,0
- beq- cr6,L(zeroLength)
- dcbt 0,rSTR1
- dcbt 0,rSTR2
-
- /* If less than 8 bytes or not aligned, use the unaligned
- byte loop. */
-
- blt cr1,L(bytealigned)
- stwu 1,-64(1)
+ xor r0, rSTR2, rSTR1
+ cmplwi cr6, rN, 0
+ cmplwi cr1, rN, 12
+ clrlwi. r0, r0, 30
+ clrlwi r12, rSTR1, 30
+ cmplwi cr5, r12, 0
+ beq- cr6, L(zeroLength)
+ dcbt 0, rSTR1
+ dcbt 0, rSTR2
+/* If less than 8 bytes or not aligned, use the unaligned
+ byte loop. */
+ blt cr1, L(bytealigned)
+ stwu 1, -64(r1)
cfi_adjust_cfa_offset(64)
- stw r31,48(1)
- cfi_offset(31,(48-64))
- stw r30,44(1)
- cfi_offset(30,(44-64))
+ stw rWORD8, 48(r1)
+ cfi_offset(rWORD8, (48-64))
+ stw rWORD7, 44(r1)
+ cfi_offset(rWORD7, (44-64))
bne L(unaligned)
/* At this point we know both strings have the same alignment and the
- compare length is at least 8 bytes. rBITDIF contains the low order
+ compare length is at least 8 bytes. r12 contains the low order
2 bits of rSTR1 and cr5 contains the result of the logical compare
- of rBITDIF to 0. If rBITDIF == 0 then we are already word
+ of r12 to 0. If r12 == 0 then we are already word
aligned and can perform the word aligned loop.
Otherwise we know the two strings have the same alignment (but not
@@ -74,332 +70,541 @@ EALIGN (memcmp,4,0)
eliminate bits preceding the first byte. Since we want to join the
normal (word aligned) compare loop, starting at the second word,
we need to adjust the length (rN) and special case the loop
- versioning for the first word. This insures that the loop count is
+ versioning for the first word. This ensures that the loop count is
correct and the first word (shifted) is in the expected register pair. */
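The reason each load below gains an lwbrx twin: memcmp order is the lexicographic order of bytes, which agrees with unsigned integer order only when the byte at the lower address sits in the most significant position. Plain lwz gives that on big-endian; on little-endian the byte-reversing load restores it. A tiny illustration (a sketch, not library code):

#include <stdint.h>

/* Compare one aligned 4-byte block the way the word loop does: build
   both words most-significant-byte-first (lwz on BE, lwbrx on LE) and
   compare them as unsigned integers.  */
static int cmp_word_block (const unsigned char *a, const unsigned char *b)
{
  uint32_t wa = ((uint32_t) a[0] << 24) | ((uint32_t) a[1] << 16)
                | ((uint32_t) a[2] << 8) | a[3];
  uint32_t wb = ((uint32_t) b[0] << 24) | ((uint32_t) b[1] << 16)
                | ((uint32_t) b[2] << 8) | b[3];
  return wa == wb ? 0 : (wa > wb ? 1 : -1);   /* same sign as memcmp */
}

For example, cmp_word_block applied to "abcd" and "abce" is negative, matching memcmp of the same four bytes.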
.align 4
L(samealignment):
- clrrwi rSTR1,rSTR1,2
- clrrwi rSTR2,rSTR2,2
- beq cr5,L(Waligned)
- add rN,rN,rBITDIF
- slwi r11,rBITDIF,3
- srwi rTMP,rN,4 /* Divide by 16 */
- andi. rBITDIF,rN,12 /* Get the word remainder */
- lwz rWORD1,0(rSTR1)
- lwz rWORD2,0(rSTR2)
- cmplwi cr1,rBITDIF,8
- cmplwi cr7,rN,16
- clrlwi rN,rN,30
+ clrrwi rSTR1, rSTR1, 2
+ clrrwi rSTR2, rSTR2, 2
+ beq cr5, L(Waligned)
+ add rN, rN, r12
+ slwi rWORD6, r12, 3
+ srwi r0, rN, 4 /* Divide by 16 */
+ andi. r12, rN, 12 /* Get the word remainder */
+#ifdef __LITTLE_ENDIAN__
+ lwbrx rWORD1, 0, rSTR1
+ lwbrx rWORD2, 0, rSTR2
+ addi rSTR1, rSTR1, 4
+ addi rSTR2, rSTR2, 4
+#else
+ lwz rWORD1, 0(rSTR1)
+ lwz rWORD2, 0(rSTR2)
+#endif
+ cmplwi cr1, r12, 8
+ cmplwi cr7, rN, 16
+ clrlwi rN, rN, 30
beq L(dPs4)
- mtctr rTMP
- bgt cr1,L(dPs3)
- beq cr1,L(dPs2)
+ mtctr r0
+ bgt cr1, L(dPs3)
+ beq cr1, L(dPs2)
/* Remainder is 4 */
.align 3
L(dsP1):
- slw rWORD5,rWORD1,r11
- slw rWORD6,rWORD2,r11
- cmplw cr5,rWORD5,rWORD6
- blt cr7,L(dP1x)
+ slw rWORD5, rWORD1, rWORD6
+ slw rWORD6, rWORD2, rWORD6
+ cmplw cr5, rWORD5, rWORD6
+ blt cr7, L(dP1x)
/* Do something useful in this cycle since we have to branch anyway. */
- lwz rWORD1,4(rSTR1)
- lwz rWORD2,4(rSTR2)
- cmplw cr0,rWORD1,rWORD2
+#ifdef __LITTLE_ENDIAN__
+ lwbrx rWORD1, 0, rSTR1
+ lwbrx rWORD2, 0, rSTR2
+ addi rSTR1, rSTR1, 4
+ addi rSTR2, rSTR2, 4
+#else
+ lwz rWORD1, 4(rSTR1)
+ lwz rWORD2, 4(rSTR2)
+#endif
+ cmplw cr7, rWORD1, rWORD2
b L(dP1e)
/* Remainder is 8 */
.align 4
L(dPs2):
- slw rWORD5,rWORD1,r11
- slw rWORD6,rWORD2,r11
- cmplw cr6,rWORD5,rWORD6
- blt cr7,L(dP2x)
+ slw rWORD5, rWORD1, rWORD6
+ slw rWORD6, rWORD2, rWORD6
+ cmplw cr6, rWORD5, rWORD6
+ blt cr7, L(dP2x)
/* Do something useful in this cycle since we have to branch anyway. */
- lwz rWORD7,4(rSTR1)
- lwz rWORD8,4(rSTR2)
- cmplw cr5,rWORD7,rWORD8
+#ifdef __LITTLE_ENDIAN__
+ lwbrx rWORD7, 0, rSTR1
+ lwbrx rWORD8, 0, rSTR2
+ addi rSTR1, rSTR1, 4
+ addi rSTR2, rSTR2, 4
+#else
+ lwz rWORD7, 4(rSTR1)
+ lwz rWORD8, 4(rSTR2)
+#endif
+ cmplw cr5, rWORD7, rWORD8
b L(dP2e)
/* Remainder is 12 */
.align 4
L(dPs3):
- slw rWORD3,rWORD1,r11
- slw rWORD4,rWORD2,r11
- cmplw cr1,rWORD3,rWORD4
+ slw rWORD3, rWORD1, rWORD6
+ slw rWORD4, rWORD2, rWORD6
+ cmplw cr1, rWORD3, rWORD4
b L(dP3e)
/* Count is a multiple of 16, remainder is 0 */
.align 4
L(dPs4):
- mtctr rTMP
- slw rWORD1,rWORD1,r11
- slw rWORD2,rWORD2,r11
- cmplw cr0,rWORD1,rWORD2
+ mtctr r0
+ slw rWORD1, rWORD1, rWORD6
+ slw rWORD2, rWORD2, rWORD6
+ cmplw cr7, rWORD1, rWORD2
b L(dP4e)
/* At this point we know both strings are word aligned and the
compare length is at least 8 bytes. */
.align 4
L(Waligned):
- andi. rBITDIF,rN,12 /* Get the word remainder */
- srwi rTMP,rN,4 /* Divide by 16 */
- cmplwi cr1,rBITDIF,8
- cmplwi cr7,rN,16
- clrlwi rN,rN,30
+ andi. r12, rN, 12 /* Get the word remainder */
+ srwi r0, rN, 4 /* Divide by 16 */
+ cmplwi cr1, r12, 8
+ cmplwi cr7, rN, 16
+ clrlwi rN, rN, 30
beq L(dP4)
- bgt cr1,L(dP3)
- beq cr1,L(dP2)
+ bgt cr1, L(dP3)
+ beq cr1, L(dP2)
/* Remainder is 4 */
.align 4
L(dP1):
- mtctr rTMP
+ mtctr r0
/* Normally we'd use rWORD7/rWORD8 here, but since we might exit early
(8-15 byte compare), we want to use only volatile registers. This
means we can avoid restoring non-volatile registers since we did not
change any on the early exit path. The key here is that the non-early
exit path only cares about the condition code (cr5), not about which
register pair was used. */
- lwz rWORD5,0(rSTR1)
- lwz rWORD6,0(rSTR2)
- cmplw cr5,rWORD5,rWORD6
- blt cr7,L(dP1x)
- lwz rWORD1,4(rSTR1)
- lwz rWORD2,4(rSTR2)
- cmplw cr0,rWORD1,rWORD2
+#ifdef __LITTLE_ENDIAN__
+ lwbrx rWORD5, 0, rSTR1
+ lwbrx rWORD6, 0, rSTR2
+ addi rSTR1, rSTR1, 4
+ addi rSTR2, rSTR2, 4
+#else
+ lwz rWORD5, 0(rSTR1)
+ lwz rWORD6, 0(rSTR2)
+#endif
+ cmplw cr5, rWORD5, rWORD6
+ blt cr7, L(dP1x)
+#ifdef __LITTLE_ENDIAN__
+ lwbrx rWORD1, 0, rSTR1
+ lwbrx rWORD2, 0, rSTR2
+ addi rSTR1, rSTR1, 4
+ addi rSTR2, rSTR2, 4
+#else
+ lwz rWORD1, 4(rSTR1)
+ lwz rWORD2, 4(rSTR2)
+#endif
+ cmplw cr7, rWORD1, rWORD2
L(dP1e):
- lwz rWORD3,8(rSTR1)
- lwz rWORD4,8(rSTR2)
- cmplw cr1,rWORD3,rWORD4
- lwz rWORD5,12(rSTR1)
- lwz rWORD6,12(rSTR2)
- cmplw cr6,rWORD5,rWORD6
- bne cr5,L(dLcr5)
- bne cr0,L(dLcr0)
-
- lwzu rWORD7,16(rSTR1)
- lwzu rWORD8,16(rSTR2)
- bne cr1,L(dLcr1)
- cmplw cr5,rWORD7,rWORD8
+#ifdef __LITTLE_ENDIAN__
+ lwbrx rWORD3, 0, rSTR1
+ lwbrx rWORD4, 0, rSTR2
+ addi rSTR1, rSTR1, 4
+ addi rSTR2, rSTR2, 4
+#else
+ lwz rWORD3, 8(rSTR1)
+ lwz rWORD4, 8(rSTR2)
+#endif
+ cmplw cr1, rWORD3, rWORD4
+#ifdef __LITTLE_ENDIAN__
+ lwbrx rWORD5, 0, rSTR1
+ lwbrx rWORD6, 0, rSTR2
+ addi rSTR1, rSTR1, 4
+ addi rSTR2, rSTR2, 4
+#else
+ lwz rWORD5, 12(rSTR1)
+ lwz rWORD6, 12(rSTR2)
+#endif
+ cmplw cr6, rWORD5, rWORD6
+ bne cr5, L(dLcr5x)
+ bne cr7, L(dLcr7x)
+
+#ifdef __LITTLE_ENDIAN__
+ lwbrx rWORD7, 0, rSTR1
+ lwbrx rWORD8, 0, rSTR2
+ addi rSTR1, rSTR1, 4
+ addi rSTR2, rSTR2, 4
+#else
+ lwzu rWORD7, 16(rSTR1)
+ lwzu rWORD8, 16(rSTR2)
+#endif
+ bne cr1, L(dLcr1)
+ cmplw cr5, rWORD7, rWORD8
bdnz L(dLoop)
- bne cr6,L(dLcr6)
- lwz r30,44(1)
- lwz r31,48(1)
+ bne cr6, L(dLcr6)
+ lwz rWORD7, 44(r1)
+ lwz rWORD8, 48(r1)
.align 3
L(dP1x):
- slwi. r12,rN,3
- bne cr5,L(dLcr5)
- subfic rN,r12,32 /* Shift count is 32 - (rN * 8). */
- lwz 1,0(1)
+ slwi. r12, rN, 3
+ bne cr5, L(dLcr5x)
+ subfic rN, r12, 32 /* Shift count is 32 - (rN * 8). */
+ addi r1, r1, 64
+ cfi_adjust_cfa_offset(-64)
bne L(d00)
- li rRTN,0
+ li rRTN, 0
blr
/* Remainder is 8 */
.align 4
+ cfi_adjust_cfa_offset(64)
L(dP2):
- mtctr rTMP
- lwz rWORD5,0(rSTR1)
- lwz rWORD6,0(rSTR2)
- cmplw cr6,rWORD5,rWORD6
- blt cr7,L(dP2x)
- lwz rWORD7,4(rSTR1)
- lwz rWORD8,4(rSTR2)
- cmplw cr5,rWORD7,rWORD8
+ mtctr r0
+#ifdef __LITTLE_ENDIAN__
+ lwbrx rWORD5, 0, rSTR1
+ lwbrx rWORD6, 0, rSTR2
+ addi rSTR1, rSTR1, 4
+ addi rSTR2, rSTR2, 4
+#else
+ lwz rWORD5, 0(rSTR1)
+ lwz rWORD6, 0(rSTR2)
+#endif
+ cmplw cr6, rWORD5, rWORD6
+ blt cr7, L(dP2x)
+#ifdef __LITTLE_ENDIAN__
+ lwbrx rWORD7, 0, rSTR1
+ lwbrx rWORD8, 0, rSTR2
+ addi rSTR1, rSTR1, 4
+ addi rSTR2, rSTR2, 4
+#else
+ lwz rWORD7, 4(rSTR1)
+ lwz rWORD8, 4(rSTR2)
+#endif
+ cmplw cr5, rWORD7, rWORD8
L(dP2e):
- lwz rWORD1,8(rSTR1)
- lwz rWORD2,8(rSTR2)
- cmplw cr0,rWORD1,rWORD2
- lwz rWORD3,12(rSTR1)
- lwz rWORD4,12(rSTR2)
- cmplw cr1,rWORD3,rWORD4
- addi rSTR1,rSTR1,4
- addi rSTR2,rSTR2,4
- bne cr6,L(dLcr6)
- bne cr5,L(dLcr5)
+#ifdef __LITTLE_ENDIAN__
+ lwbrx rWORD1, 0, rSTR1
+ lwbrx rWORD2, 0, rSTR2
+ addi rSTR1, rSTR1, 4
+ addi rSTR2, rSTR2, 4
+#else
+ lwz rWORD1, 8(rSTR1)
+ lwz rWORD2, 8(rSTR2)
+#endif
+ cmplw cr7, rWORD1, rWORD2
+#ifdef __LITTLE_ENDIAN__
+ lwbrx rWORD3, 0, rSTR1
+ lwbrx rWORD4, 0, rSTR2
+ addi rSTR1, rSTR1, 4
+ addi rSTR2, rSTR2, 4
+#else
+ lwz rWORD3, 12(rSTR1)
+ lwz rWORD4, 12(rSTR2)
+#endif
+ cmplw cr1, rWORD3, rWORD4
+#ifndef __LITTLE_ENDIAN__
+ addi rSTR1, rSTR1, 4
+ addi rSTR2, rSTR2, 4
+#endif
+ bne cr6, L(dLcr6)
+ bne cr5, L(dLcr5)
b L(dLoop2)
/* Again we are on an early exit path (16-23 byte compare), so we want to
only use volatile registers and avoid restoring non-volatile
registers. */
.align 4
L(dP2x):
- lwz rWORD3,4(rSTR1)
- lwz rWORD4,4(rSTR2)
- cmplw cr5,rWORD3,rWORD4
- slwi. r12,rN,3
- bne cr6,L(dLcr6)
- addi rSTR1,rSTR1,4
- addi rSTR2,rSTR2,4
- bne cr5,L(dLcr5)
- subfic rN,r12,32 /* Shift count is 32 - (rN * 8). */
- lwz 1,0(1)
+#ifdef __LITTLE_ENDIAN__
+ lwbrx rWORD3, 0, rSTR1
+ lwbrx rWORD4, 0, rSTR2
+ addi rSTR1, rSTR1, 4
+ addi rSTR2, rSTR2, 4
+#else
+ lwz rWORD3, 4(rSTR1)
+ lwz rWORD4, 4(rSTR2)
+#endif
+ cmplw cr1, rWORD3, rWORD4
+ slwi. r12, rN, 3
+ bne cr6, L(dLcr6x)
+#ifndef __LITTLE_ENDIAN__
+ addi rSTR1, rSTR1, 4
+ addi rSTR2, rSTR2, 4
+#endif
+ bne cr1, L(dLcr1x)
+ subfic rN, r12, 32 /* Shift count is 32 - (rN * 8). */
+ addi r1, r1, 64
+ cfi_adjust_cfa_offset(-64)
bne L(d00)
- li rRTN,0
+ li rRTN, 0
blr
/* Remainder is 12 */
.align 4
+ cfi_adjust_cfa_offset(64)
L(dP3):
- mtctr rTMP
- lwz rWORD3,0(rSTR1)
- lwz rWORD4,0(rSTR2)
- cmplw cr1,rWORD3,rWORD4
+ mtctr r0
+#ifdef __LITTLE_ENDIAN__
+ lwbrx rWORD3, 0, rSTR1
+ lwbrx rWORD4, 0, rSTR2
+ addi rSTR1, rSTR1, 4
+ addi rSTR2, rSTR2, 4
+#else
+ lwz rWORD3, 0(rSTR1)
+ lwz rWORD4, 0(rSTR2)
+#endif
+ cmplw cr1, rWORD3, rWORD4
L(dP3e):
- lwz rWORD5,4(rSTR1)
- lwz rWORD6,4(rSTR2)
- cmplw cr6,rWORD5,rWORD6
- blt cr7,L(dP3x)
- lwz rWORD7,8(rSTR1)
- lwz rWORD8,8(rSTR2)
- cmplw cr5,rWORD7,rWORD8
- lwz rWORD1,12(rSTR1)
- lwz rWORD2,12(rSTR2)
- cmplw cr0,rWORD1,rWORD2
- addi rSTR1,rSTR1,8
- addi rSTR2,rSTR2,8
- bne cr1,L(dLcr1)
- bne cr6,L(dLcr6)
+#ifdef __LITTLE_ENDIAN__
+ lwbrx rWORD5, 0, rSTR1
+ lwbrx rWORD6, 0, rSTR2
+ addi rSTR1, rSTR1, 4
+ addi rSTR2, rSTR2, 4
+#else
+ lwz rWORD5, 4(rSTR1)
+ lwz rWORD6, 4(rSTR2)
+#endif
+ cmplw cr6, rWORD5, rWORD6
+ blt cr7, L(dP3x)
+#ifdef __LITTLE_ENDIAN__
+ lwbrx rWORD7, 0, rSTR1
+ lwbrx rWORD8, 0, rSTR2
+ addi rSTR1, rSTR1, 4
+ addi rSTR2, rSTR2, 4
+#else
+ lwz rWORD7, 8(rSTR1)
+ lwz rWORD8, 8(rSTR2)
+#endif
+ cmplw cr5, rWORD7, rWORD8
+#ifdef __LITTLE_ENDIAN__
+ lwbrx rWORD1, 0, rSTR1
+ lwbrx rWORD2, 0, rSTR2
+ addi rSTR1, rSTR1, 4
+ addi rSTR2, rSTR2, 4
+#else
+ lwz rWORD1, 12(rSTR1)
+ lwz rWORD2, 12(rSTR2)
+#endif
+ cmplw cr7, rWORD1, rWORD2
+#ifndef __LITTLE_ENDIAN__
+ addi rSTR1, rSTR1, 8
+ addi rSTR2, rSTR2, 8
+#endif
+ bne cr1, L(dLcr1)
+ bne cr6, L(dLcr6)
b L(dLoop1)
/* Again we are on an early exit path (24-31 byte compare), so we want to
only use volatile registers and avoid restoring non-volatile
registers. */
.align 4
L(dP3x):
- lwz rWORD1,8(rSTR1)
- lwz rWORD2,8(rSTR2)
- cmplw cr5,rWORD1,rWORD2
- slwi. r12,rN,3
- bne cr1,L(dLcr1)
- addi rSTR1,rSTR1,8
- addi rSTR2,rSTR2,8
- bne cr6,L(dLcr6)
- subfic rN,r12,32 /* Shift count is 32 - (rN * 8). */
- bne cr5,L(dLcr5)
- lwz 1,0(1)
+#ifdef __LITTLE_ENDIAN__
+ lwbrx rWORD1, 0, rSTR1
+ lwbrx rWORD2, 0, rSTR2
+ addi rSTR1, rSTR1, 4
+ addi rSTR2, rSTR2, 4
+#else
+ lwz rWORD1, 8(rSTR1)
+ lwz rWORD2, 8(rSTR2)
+#endif
+ cmplw cr7, rWORD1, rWORD2
+ slwi. r12, rN, 3
+ bne cr1, L(dLcr1x)
+#ifndef __LITTLE_ENDIAN__
+ addi rSTR1, rSTR1, 8
+ addi rSTR2, rSTR2, 8
+#endif
+ bne cr6, L(dLcr6x)
+ subfic rN, r12, 32 /* Shift count is 32 - (rN * 8). */
+ bne cr7, L(dLcr7x)
+ addi r1, r1, 64
+ cfi_adjust_cfa_offset(-64)
bne L(d00)
- li rRTN,0
+ li rRTN, 0
blr
/* Count is a multiple of 16, remainder is 0 */
.align 4
+ cfi_adjust_cfa_offset(64)
L(dP4):
- mtctr rTMP
- lwz rWORD1,0(rSTR1)
- lwz rWORD2,0(rSTR2)
- cmplw cr0,rWORD1,rWORD2
+ mtctr r0
+#ifdef __LITTLE_ENDIAN__
+ lwbrx rWORD1, 0, rSTR1
+ lwbrx rWORD2, 0, rSTR2
+ addi rSTR1, rSTR1, 4
+ addi rSTR2, rSTR2, 4
+#else
+ lwz rWORD1, 0(rSTR1)
+ lwz rWORD2, 0(rSTR2)
+#endif
+ cmplw cr7, rWORD1, rWORD2
L(dP4e):
- lwz rWORD3,4(rSTR1)
- lwz rWORD4,4(rSTR2)
- cmplw cr1,rWORD3,rWORD4
- lwz rWORD5,8(rSTR1)
- lwz rWORD6,8(rSTR2)
- cmplw cr6,rWORD5,rWORD6
- lwzu rWORD7,12(rSTR1)
- lwzu rWORD8,12(rSTR2)
- cmplw cr5,rWORD7,rWORD8
- bne cr0,L(dLcr0)
- bne cr1,L(dLcr1)
+#ifdef __LITTLE_ENDIAN__
+ lwbrx rWORD3, 0, rSTR1
+ lwbrx rWORD4, 0, rSTR2
+ addi rSTR1, rSTR1, 4
+ addi rSTR2, rSTR2, 4
+#else
+ lwz rWORD3, 4(rSTR1)
+ lwz rWORD4, 4(rSTR2)
+#endif
+ cmplw cr1, rWORD3, rWORD4
+#ifdef __LITTLE_ENDIAN__
+ lwbrx rWORD5, 0, rSTR1
+ lwbrx rWORD6, 0, rSTR2
+ addi rSTR1, rSTR1, 4
+ addi rSTR2, rSTR2, 4
+#else
+ lwz rWORD5, 8(rSTR1)
+ lwz rWORD6, 8(rSTR2)
+#endif
+ cmplw cr6, rWORD5, rWORD6
+#ifdef __LITTLE_ENDIAN__
+ lwbrx rWORD7, 0, rSTR1
+ lwbrx rWORD8, 0, rSTR2
+ addi rSTR1, rSTR1, 4
+ addi rSTR2, rSTR2, 4
+#else
+ lwzu rWORD7, 12(rSTR1)
+ lwzu rWORD8, 12(rSTR2)
+#endif
+ cmplw cr5, rWORD7, rWORD8
+ bne cr7, L(dLcr7)
+ bne cr1, L(dLcr1)
bdz- L(d24) /* Adjust CTR as we start with +4 */
/* This is the primary loop */
.align 4
L(dLoop):
- lwz rWORD1,4(rSTR1)
- lwz rWORD2,4(rSTR2)
- cmplw cr1,rWORD3,rWORD4
- bne cr6,L(dLcr6)
+#ifdef __LITTLE_ENDIAN__
+ lwbrx rWORD1, 0, rSTR1
+ lwbrx rWORD2, 0, rSTR2
+ addi rSTR1, rSTR1, 4
+ addi rSTR2, rSTR2, 4
+#else
+ lwz rWORD1, 4(rSTR1)
+ lwz rWORD2, 4(rSTR2)
+#endif
+ cmplw cr1, rWORD3, rWORD4
+ bne cr6, L(dLcr6)
L(dLoop1):
- lwz rWORD3,8(rSTR1)
- lwz rWORD4,8(rSTR2)
- cmplw cr6,rWORD5,rWORD6
- bne cr5,L(dLcr5)
+#ifdef __LITTLE_ENDIAN__
+ lwbrx rWORD3, 0, rSTR1
+ lwbrx rWORD4, 0, rSTR2
+ addi rSTR1, rSTR1, 4
+ addi rSTR2, rSTR2, 4
+#else
+ lwz rWORD3, 8(rSTR1)
+ lwz rWORD4, 8(rSTR2)
+#endif
+ cmplw cr6, rWORD5, rWORD6
+ bne cr5, L(dLcr5)
L(dLoop2):
- lwz rWORD5,12(rSTR1)
- lwz rWORD6,12(rSTR2)
- cmplw cr5,rWORD7,rWORD8
- bne cr0,L(dLcr0)
+#ifdef __LITTLE_ENDIAN__
+ lwbrx rWORD5, 0, rSTR1
+ lwbrx rWORD6, 0, rSTR2
+ addi rSTR1, rSTR1, 4
+ addi rSTR2, rSTR2, 4
+#else
+ lwz rWORD5, 12(rSTR1)
+ lwz rWORD6, 12(rSTR2)
+#endif
+ cmplw cr5, rWORD7, rWORD8
+ bne cr7, L(dLcr7)
L(dLoop3):
- lwzu rWORD7,16(rSTR1)
- lwzu rWORD8,16(rSTR2)
- bne cr1,L(dLcr1)
- cmplw cr0,rWORD1,rWORD2
+#ifdef __LITTLE_ENDIAN__
+ lwbrx rWORD7, 0, rSTR1
+ lwbrx rWORD8, 0, rSTR2
+ addi rSTR1, rSTR1, 4
+ addi rSTR2, rSTR2, 4
+#else
+ lwzu rWORD7, 16(rSTR1)
+ lwzu rWORD8, 16(rSTR2)
+#endif
+ bne cr1, L(dLcr1)
+ cmplw cr7, rWORD1, rWORD2
bdnz L(dLoop)
L(dL4):
- cmplw cr1,rWORD3,rWORD4
- bne cr6,L(dLcr6)
- cmplw cr6,rWORD5,rWORD6
- bne cr5,L(dLcr5)
- cmplw cr5,rWORD7,rWORD8
+ cmplw cr1, rWORD3, rWORD4
+ bne cr6, L(dLcr6)
+ cmplw cr6, rWORD5, rWORD6
+ bne cr5, L(dLcr5)
+ cmplw cr5, rWORD7, rWORD8
L(d44):
- bne cr0,L(dLcr0)
+ bne cr7, L(dLcr7)
L(d34):
- bne cr1,L(dLcr1)
+ bne cr1, L(dLcr1)
L(d24):
- bne cr6,L(dLcr6)
+ bne cr6, L(dLcr6)
L(d14):
- slwi. r12,rN,3
- bne cr5,L(dLcr5)
+ slwi. r12, rN, 3
+ bne cr5, L(dLcr5)
L(d04):
- lwz r30,44(1)
- lwz r31,48(1)
- lwz 1,0(1)
- subfic rN,r12,32 /* Shift count is 32 - (rN * 8). */
+ lwz rWORD7, 44(r1)
+ lwz rWORD8, 48(r1)
+ addi r1, r1, 64
+ cfi_adjust_cfa_offset(-64)
+ subfic rN, r12, 32 /* Shift count is 32 - (rN * 8). */
beq L(zeroLength)
/* At this point we have a remainder of 1 to 3 bytes to compare. Since
we are aligned it is safe to load the whole word, and use
- shift right to eliminate bits beyond the compare length. */
+ shift right to eliminate bits beyond the compare length. */
L(d00):
- lwz rWORD1,4(rSTR1)
- lwz rWORD2,4(rSTR2)
- srw rWORD1,rWORD1,rN
- srw rWORD2,rWORD2,rN
- cmplw rWORD1,rWORD2
- li rRTN,0
- beqlr
- li rRTN,1
- bgtlr
- li rRTN,-1
+#ifdef __LITTLE_ENDIAN__
+ lwbrx rWORD1, 0, rSTR1
+ lwbrx rWORD2, 0, rSTR2
+ addi rSTR1, rSTR1, 4
+ addi rSTR2, rSTR2, 4
+#else
+ lwz rWORD1, 4(rSTR1)
+ lwz rWORD2, 4(rSTR2)
+#endif
+ srw rWORD1, rWORD1, rN
+ srw rWORD2, rWORD2, rN
+ sub rRTN, rWORD1, rWORD2
blr
.align 4
-L(dLcr0):
- lwz r30,44(1)
- lwz r31,48(1)
- li rRTN,1
- lwz 1,0(1)
- bgtlr cr0
- li rRTN,-1
+ cfi_adjust_cfa_offset(64)
+L(dLcr7):
+ lwz rWORD7, 44(r1)
+ lwz rWORD8, 48(r1)
+L(dLcr7x):
+ li rRTN, 1
+ addi r1, r1, 64
+ cfi_adjust_cfa_offset(-64)
+ bgtlr cr7
+ li rRTN, -1
blr
.align 4
+ cfi_adjust_cfa_offset(64)
L(dLcr1):
- lwz r30,44(1)
- lwz r31,48(1)
- li rRTN,1
- lwz 1,0(1)
+ lwz rWORD7, 44(r1)
+ lwz rWORD8, 48(r1)
+L(dLcr1x):
+ li rRTN, 1
+ addi r1, r1, 64
+ cfi_adjust_cfa_offset(-64)
bgtlr cr1
- li rRTN,-1
+ li rRTN, -1
blr
.align 4
+ cfi_adjust_cfa_offset(64)
L(dLcr6):
- lwz r30,44(1)
- lwz r31,48(1)
- li rRTN,1
- lwz 1,0(1)
+ lwz rWORD7, 44(r1)
+ lwz rWORD8, 48(r1)
+L(dLcr6x):
+ li rRTN, 1
+ addi r1, r1, 64
+ cfi_adjust_cfa_offset(-64)
bgtlr cr6
- li rRTN,-1
+ li rRTN, -1
blr
.align 4
+ cfi_adjust_cfa_offset(64)
L(dLcr5):
- lwz r30,44(1)
- lwz r31,48(1)
+ lwz rWORD7, 44(r1)
+ lwz rWORD8, 48(r1)
L(dLcr5x):
- li rRTN,1
- lwz 1,0(1)
+ li rRTN, 1
+ addi r1, r1, 64
+ cfi_adjust_cfa_offset(-64)
bgtlr cr5
- li rRTN,-1
+ li rRTN, -1
blr
.align 4
L(bytealigned):
- cfi_adjust_cfa_offset(-64)
mtctr rN
/* We need to prime this loop. This loop is swing modulo scheduled
@@ -411,38 +616,39 @@ L(bytealigned):
So we must precondition some registers and condition codes so that
we don't exit the loop early on the first iteration. */
- lbz rWORD1,0(rSTR1)
- lbz rWORD2,0(rSTR2)
+
+ lbz rWORD1, 0(rSTR1)
+ lbz rWORD2, 0(rSTR2)
bdz L(b11)
- cmplw cr0,rWORD1,rWORD2
- lbz rWORD3,1(rSTR1)
- lbz rWORD4,1(rSTR2)
+ cmplw cr7, rWORD1, rWORD2
+ lbz rWORD3, 1(rSTR1)
+ lbz rWORD4, 1(rSTR2)
bdz L(b12)
- cmplw cr1,rWORD3,rWORD4
- lbzu rWORD5,2(rSTR1)
- lbzu rWORD6,2(rSTR2)
+ cmplw cr1, rWORD3, rWORD4
+ lbzu rWORD5, 2(rSTR1)
+ lbzu rWORD6, 2(rSTR2)
bdz L(b13)
.align 4
L(bLoop):
- lbzu rWORD1,1(rSTR1)
- lbzu rWORD2,1(rSTR2)
- bne cr0,L(bLcr0)
+ lbzu rWORD1, 1(rSTR1)
+ lbzu rWORD2, 1(rSTR2)
+ bne cr7, L(bLcr7)
- cmplw cr6,rWORD5,rWORD6
+ cmplw cr6, rWORD5, rWORD6
bdz L(b3i)
- lbzu rWORD3,1(rSTR1)
- lbzu rWORD4,1(rSTR2)
- bne cr1,L(bLcr1)
+ lbzu rWORD3, 1(rSTR1)
+ lbzu rWORD4, 1(rSTR2)
+ bne cr1, L(bLcr1)
- cmplw cr0,rWORD1,rWORD2
+ cmplw cr7, rWORD1, rWORD2
bdz L(b2i)
- lbzu rWORD5,1(rSTR1)
- lbzu rWORD6,1(rSTR2)
- bne cr6,L(bLcr6)
+ lbzu rWORD5, 1(rSTR1)
+ lbzu rWORD6, 1(rSTR2)
+ bne cr6, L(bLcr6)
- cmplw cr1,rWORD3,rWORD4
+ cmplw cr1, rWORD3, rWORD4
bdnz L(bLoop)
/* We speculatively load bytes before we have tested the previous
@@ -452,67 +658,62 @@ L(bLoop):
tested. In this case we must complete the pending operations
before returning. */
L(b1i):
- bne cr0,L(bLcr0)
- bne cr1,L(bLcr1)
+ bne cr7, L(bLcr7)
+ bne cr1, L(bLcr1)
b L(bx56)
.align 4
L(b2i):
- bne cr6,L(bLcr6)
- bne cr0,L(bLcr0)
+ bne cr6, L(bLcr6)
+ bne cr7, L(bLcr7)
b L(bx34)
.align 4
L(b3i):
- bne cr1,L(bLcr1)
- bne cr6,L(bLcr6)
+ bne cr1, L(bLcr1)
+ bne cr6, L(bLcr6)
b L(bx12)
.align 4
-L(bLcr0):
- li rRTN,1
- bgtlr cr0
- li rRTN,-1
+L(bLcr7):
+ li rRTN, 1
+ bgtlr cr7
+ li rRTN, -1
blr
L(bLcr1):
- li rRTN,1
+ li rRTN, 1
bgtlr cr1
- li rRTN,-1
+ li rRTN, -1
blr
L(bLcr6):
- li rRTN,1
+ li rRTN, 1
bgtlr cr6
- li rRTN,-1
+ li rRTN, -1
blr
L(b13):
- bne cr0,L(bx12)
- bne cr1,L(bx34)
+ bne cr7, L(bx12)
+ bne cr1, L(bx34)
L(bx56):
- sub rRTN,rWORD5,rWORD6
+ sub rRTN, rWORD5, rWORD6
blr
nop
L(b12):
- bne cr0,L(bx12)
+ bne cr7, L(bx12)
L(bx34):
- sub rRTN,rWORD3,rWORD4
+ sub rRTN, rWORD3, rWORD4
blr
-
L(b11):
L(bx12):
- sub rRTN,rWORD1,rWORD2
+ sub rRTN, rWORD1, rWORD2
blr
-
.align 4
-L(zeroLengthReturn):
-
L(zeroLength):
- li rRTN,0
+ li rRTN, 0
blr
- cfi_adjust_cfa_offset(64)
.align 4
/* At this point we know the strings have different alignment and the
- compare length is at least 8 bytes. rBITDIF contains the low order
+ compare length is at least 8 bytes. r12 contains the low order
2 bits of rSTR1 and cr5 contains the result of the logical compare
- of rBITDIF to 0. If rBITDIF == 0 then rStr1 is word aligned and can
+ of r12 to 0. If r12 == 0 then rStr1 is word aligned and can
perform the Wunaligned loop.
   Otherwise we know that rSTR1 is not yet word aligned.
@@ -521,465 +722,654 @@ L(zeroLength):
eliminate bits preceding the first byte. Since we want to join the
   normal (Wunaligned) compare loop, starting at the second word,
we need to adjust the length (rN) and special case the loop
- versioning for the first W. This insures that the loop count is
+ versioning for the first W. This ensures that the loop count is
   correct and the first W (shifted) is in the expected register pair.  */
#define rSHL r29 /* Unaligned shift left count. */
#define rSHR r28 /* Unaligned shift right count. */
-#define rB r27 /* Left rotation temp for rWORD2. */
-#define rD r26 /* Left rotation temp for rWORD4. */
-#define rF r25 /* Left rotation temp for rWORD6. */
-#define rH r24 /* Left rotation temp for rWORD8. */
-#define rA r0 /* Right rotation temp for rWORD2. */
-#define rC r12 /* Right rotation temp for rWORD4. */
-#define rE r0 /* Right rotation temp for rWORD6. */
-#define rG r12 /* Right rotation temp for rWORD8. */
+#define rWORD8_SHIFT r27 /* Left rotation temp for rWORD2. */
+#define rWORD2_SHIFT r26 /* Left rotation temp for rWORD4. */
+#define rWORD4_SHIFT r25 /* Left rotation temp for rWORD6. */
+#define rWORD6_SHIFT r24 /* Left rotation temp for rWORD8. */
+ cfi_adjust_cfa_offset(64)
L(unaligned):
- stw r29,40(r1)
- cfi_offset(r29,(40-64))
- clrlwi rSHL,rSTR2,30
- stw r28,36(r1)
- cfi_offset(r28,(36-64))
- beq cr5,L(Wunaligned)
- stw r27,32(r1)
- cfi_offset(r27,(32-64))
+ stw rSHL, 40(r1)
+ cfi_offset(rSHL, (40-64))
+ clrlwi rSHL, rSTR2, 30
+ stw rSHR, 36(r1)
+ cfi_offset(rSHR, (36-64))
+ beq cr5, L(Wunaligned)
+ stw rWORD8_SHIFT, 32(r1)
+ cfi_offset(rWORD8_SHIFT, (32-64))
/* Adjust the logical start of rSTR2 to compensate for the extra bits
in the 1st rSTR1 W. */
- sub r27,rSTR2,rBITDIF
+ sub rWORD8_SHIFT, rSTR2, r12
/* But do not attempt to address the W before that W that contains
the actual start of rSTR2. */
- clrrwi rSTR2,rSTR2,2
- stw r26,28(r1)
- cfi_offset(r26,(28-64))
-/* Compute the left/right shift counts for the unalign rSTR2,
+ clrrwi rSTR2, rSTR2, 2
+ stw rWORD2_SHIFT, 28(r1)
+ cfi_offset(rWORD2_SHIFT, (28-64))
+/* Compute the left/right shift counts for the unaligned rSTR2,
compensating for the logical (W aligned) start of rSTR1. */
- clrlwi rSHL,r27,30
- clrrwi rSTR1,rSTR1,2
- stw r25,24(r1)
- cfi_offset(r25,(24-64))
- slwi rSHL,rSHL,3
- cmplw cr5,r27,rSTR2
- add rN,rN,rBITDIF
- slwi r11,rBITDIF,3
- stw r24,20(r1)
- cfi_offset(r24,(20-64))
- subfic rSHR,rSHL,32
- srwi rTMP,rN,4 /* Divide by 16 */
- andi. rBITDIF,rN,12 /* Get the W remainder */
+ clrlwi rSHL, rWORD8_SHIFT, 30
+ clrrwi rSTR1, rSTR1, 2
+ stw rWORD4_SHIFT, 24(r1)
+ cfi_offset(rWORD4_SHIFT, (24-64))
+ slwi rSHL, rSHL, 3
+ cmplw cr5, rWORD8_SHIFT, rSTR2
+ add rN, rN, r12
+ slwi rWORD6, r12, 3
+ stw rWORD6_SHIFT, 20(r1)
+ cfi_offset(rWORD6_SHIFT, (20-64))
+ subfic rSHR, rSHL, 32
+ srwi r0, rN, 4 /* Divide by 16 */
+ andi. r12, rN, 12 /* Get the W remainder */
/* We normally need to load 2 Ws to start the unaligned rSTR2, but in
this special case those bits may be discarded anyway. Also we
must avoid loading a W where none of the bits are part of rSTR2 as
this may cross a page boundary and cause a page fault. */
- li rWORD8,0
- blt cr5,L(dus0)
- lwz rWORD8,0(rSTR2)
- la rSTR2,4(rSTR2)
- slw rWORD8,rWORD8,rSHL
+ li rWORD8, 0
+ blt cr5, L(dus0)
+#ifdef __LITTLE_ENDIAN__
+ lwbrx rWORD8, 0, rSTR2
+ addi rSTR2, rSTR2, 4
+#else
+ lwz rWORD8, 0(rSTR2)
+ addi rSTR2, rSTR2, 4
+#endif
+ slw rWORD8, rWORD8, rSHL
L(dus0):
- lwz rWORD1,0(rSTR1)
- lwz rWORD2,0(rSTR2)
- cmplwi cr1,rBITDIF,8
- cmplwi cr7,rN,16
- srw rG,rWORD2,rSHR
- clrlwi rN,rN,30
+#ifdef __LITTLE_ENDIAN__
+ lwbrx rWORD1, 0, rSTR1
+ lwbrx rWORD2, 0, rSTR2
+ addi rSTR1, rSTR1, 4
+ addi rSTR2, rSTR2, 4
+#else
+ lwz rWORD1, 0(rSTR1)
+ lwz rWORD2, 0(rSTR2)
+#endif
+ cmplwi cr1, r12, 8
+ cmplwi cr7, rN, 16
+ srw r12, rWORD2, rSHR
+ clrlwi rN, rN, 30
beq L(duPs4)
- mtctr rTMP
- or rWORD8,rG,rWORD8
- bgt cr1,L(duPs3)
- beq cr1,L(duPs2)
+ mtctr r0
+ or rWORD8, r12, rWORD8
+ bgt cr1, L(duPs3)
+ beq cr1, L(duPs2)
/* Remainder is 4 */
.align 4
L(dusP1):
- slw rB,rWORD2,rSHL
- slw rWORD7,rWORD1,r11
- slw rWORD8,rWORD8,r11
- bge cr7,L(duP1e)
+ slw rWORD8_SHIFT, rWORD2, rSHL
+ slw rWORD7, rWORD1, rWORD6
+ slw rWORD8, rWORD8, rWORD6
+ bge cr7, L(duP1e)
/* At this point we exit early with the first word compare
complete and remainder of 0 to 3 bytes. See L(du14) for details on
how we handle the remaining bytes. */
- cmplw cr5,rWORD7,rWORD8
- slwi. rN,rN,3
- bne cr5,L(duLcr5)
- cmplw cr7,rN,rSHR
+ cmplw cr5, rWORD7, rWORD8
+ slwi. rN, rN, 3
+ bne cr5, L(duLcr5)
+ cmplw cr7, rN, rSHR
beq L(duZeroReturn)
- li rA,0
- ble cr7,L(dutrim)
- lwz rWORD2,4(rSTR2)
- srw rA,rWORD2,rSHR
+ li r0, 0
+ ble cr7, L(dutrim)
+#ifdef __LITTLE_ENDIAN__
+ lwbrx rWORD2, 0, rSTR2
+ addi rSTR2, rSTR2, 4
+#else
+ lwz rWORD2, 4(rSTR2)
+#endif
+ srw r0, rWORD2, rSHR
b L(dutrim)
/* Remainder is 8 */
.align 4
L(duPs2):
- slw rH,rWORD2,rSHL
- slw rWORD5,rWORD1,r11
- slw rWORD6,rWORD8,r11
+ slw rWORD6_SHIFT, rWORD2, rSHL
+ slw rWORD5, rWORD1, rWORD6
+ slw rWORD6, rWORD8, rWORD6
b L(duP2e)
/* Remainder is 12 */
.align 4
L(duPs3):
- slw rF,rWORD2,rSHL
- slw rWORD3,rWORD1,r11
- slw rWORD4,rWORD8,r11
+ slw rWORD4_SHIFT, rWORD2, rSHL
+ slw rWORD3, rWORD1, rWORD6
+ slw rWORD4, rWORD8, rWORD6
b L(duP3e)
/* Count is a multiple of 16, remainder is 0 */
.align 4
L(duPs4):
- mtctr rTMP
- or rWORD8,rG,rWORD8
- slw rD,rWORD2,rSHL
- slw rWORD1,rWORD1,r11
- slw rWORD2,rWORD8,r11
+ mtctr r0
+ or rWORD8, r12, rWORD8
+ slw rWORD2_SHIFT, rWORD2, rSHL
+ slw rWORD1, rWORD1, rWORD6
+ slw rWORD2, rWORD8, rWORD6
b L(duP4e)
/* At this point we know rSTR1 is word aligned and the
compare length is at least 8 bytes. */
.align 4
L(Wunaligned):
- stw r27,32(r1)
- cfi_offset(r27,(32-64))
- clrrwi rSTR2,rSTR2,2
- stw r26,28(r1)
- cfi_offset(r26,(28-64))
- srwi rTMP,rN,4 /* Divide by 16 */
- stw r25,24(r1)
- cfi_offset(r25,(24-64))
- andi. rBITDIF,rN,12 /* Get the W remainder */
- stw r24,20(r1)
- cfi_offset(r24,(24-64))
- slwi rSHL,rSHL,3
- lwz rWORD6,0(rSTR2)
- lwzu rWORD8,4(rSTR2)
- cmplwi cr1,rBITDIF,8
- cmplwi cr7,rN,16
- clrlwi rN,rN,30
- subfic rSHR,rSHL,32
- slw rH,rWORD6,rSHL
+ stw rWORD8_SHIFT, 32(r1)
+ cfi_offset(rWORD8_SHIFT, (32-64))
+ clrrwi rSTR2, rSTR2, 2
+ stw rWORD2_SHIFT, 28(r1)
+ cfi_offset(rWORD2_SHIFT, (28-64))
+ srwi r0, rN, 4 /* Divide by 16 */
+ stw rWORD4_SHIFT, 24(r1)
+ cfi_offset(rWORD4_SHIFT, (24-64))
+ andi. r12, rN, 12 /* Get the W remainder */
+ stw rWORD6_SHIFT, 20(r1)
+ cfi_offset(rWORD6_SHIFT, (20-64))
+ slwi rSHL, rSHL, 3
+#ifdef __LITTLE_ENDIAN__
+ lwbrx rWORD6, 0, rSTR2
+ addi rSTR2, rSTR2, 4
+ lwbrx rWORD8, 0, rSTR2
+ addi rSTR2, rSTR2, 4
+#else
+ lwz rWORD6, 0(rSTR2)
+ lwzu rWORD8, 4(rSTR2)
+#endif
+ cmplwi cr1, r12, 8
+ cmplwi cr7, rN, 16
+ clrlwi rN, rN, 30
+ subfic rSHR, rSHL, 32
+ slw rWORD6_SHIFT, rWORD6, rSHL
beq L(duP4)
- mtctr rTMP
- bgt cr1,L(duP3)
- beq cr1,L(duP2)
+ mtctr r0
+ bgt cr1, L(duP3)
+ beq cr1, L(duP2)
/* Remainder is 4 */
.align 4
L(duP1):
- srw rG,rWORD8,rSHR
- lwz rWORD7,0(rSTR1)
- slw rB,rWORD8,rSHL
- or rWORD8,rG,rH
- blt cr7,L(duP1x)
+ srw r12, rWORD8, rSHR
+#ifdef __LITTLE_ENDIAN__
+ lwbrx rWORD7, 0, rSTR1
+ addi rSTR1, rSTR1, 4
+#else
+ lwz rWORD7, 0(rSTR1)
+#endif
+ slw rWORD8_SHIFT, rWORD8, rSHL
+ or rWORD8, r12, rWORD6_SHIFT
+ blt cr7, L(duP1x)
L(duP1e):
- lwz rWORD1,4(rSTR1)
- lwz rWORD2,4(rSTR2)
- cmplw cr5,rWORD7,rWORD8
- srw rA,rWORD2,rSHR
- slw rD,rWORD2,rSHL
- or rWORD2,rA,rB
- lwz rWORD3,8(rSTR1)
- lwz rWORD4,8(rSTR2)
- cmplw cr0,rWORD1,rWORD2
- srw rC,rWORD4,rSHR
- slw rF,rWORD4,rSHL
- bne cr5,L(duLcr5)
- or rWORD4,rC,rD
- lwz rWORD5,12(rSTR1)
- lwz rWORD6,12(rSTR2)
- cmplw cr1,rWORD3,rWORD4
- srw rE,rWORD6,rSHR
- slw rH,rWORD6,rSHL
- bne cr0,L(duLcr0)
- or rWORD6,rE,rF
- cmplw cr6,rWORD5,rWORD6
+#ifdef __LITTLE_ENDIAN__
+ lwbrx rWORD1, 0, rSTR1
+ lwbrx rWORD2, 0, rSTR2
+ addi rSTR1, rSTR1, 4
+ addi rSTR2, rSTR2, 4
+#else
+ lwz rWORD1, 4(rSTR1)
+ lwz rWORD2, 4(rSTR2)
+#endif
+ cmplw cr5, rWORD7, rWORD8
+ srw r0, rWORD2, rSHR
+ slw rWORD2_SHIFT, rWORD2, rSHL
+ or rWORD2, r0, rWORD8_SHIFT
+#ifdef __LITTLE_ENDIAN__
+ lwbrx rWORD3, 0, rSTR1
+ lwbrx rWORD4, 0, rSTR2
+ addi rSTR1, rSTR1, 4
+ addi rSTR2, rSTR2, 4
+#else
+ lwz rWORD3, 8(rSTR1)
+ lwz rWORD4, 8(rSTR2)
+#endif
+ cmplw cr7, rWORD1, rWORD2
+ srw r12, rWORD4, rSHR
+ slw rWORD4_SHIFT, rWORD4, rSHL
+ bne cr5, L(duLcr5)
+ or rWORD4, r12, rWORD2_SHIFT
+#ifdef __LITTLE_ENDIAN__
+ lwbrx rWORD5, 0, rSTR1
+ lwbrx rWORD6, 0, rSTR2
+ addi rSTR1, rSTR1, 4
+ addi rSTR2, rSTR2, 4
+#else
+ lwz rWORD5, 12(rSTR1)
+ lwz rWORD6, 12(rSTR2)
+#endif
+ cmplw cr1, rWORD3, rWORD4
+ srw r0, rWORD6, rSHR
+ slw rWORD6_SHIFT, rWORD6, rSHL
+ bne cr7, L(duLcr7)
+ or rWORD6, r0, rWORD4_SHIFT
+ cmplw cr6, rWORD5, rWORD6
b L(duLoop3)
.align 4
/* At this point we exit early with the first word compare
complete and remainder of 0 to 3 bytes. See L(du14) for details on
how we handle the remaining bytes. */
L(duP1x):
- cmplw cr5,rWORD7,rWORD8
- slwi. rN,rN,3
- bne cr5,L(duLcr5)
- cmplw cr7,rN,rSHR
+ cmplw cr5, rWORD7, rWORD8
+ slwi. rN, rN, 3
+ bne cr5, L(duLcr5)
+ cmplw cr7, rN, rSHR
beq L(duZeroReturn)
- li rA,0
- ble cr7,L(dutrim)
- ld rWORD2,8(rSTR2)
- srw rA,rWORD2,rSHR
+ li r0, 0
+ ble cr7, L(dutrim)
+#ifdef __LITTLE_ENDIAN__
+ lwbrx rWORD2, 0, rSTR2
+ addi rSTR2, rSTR2, 4
+#else
+ lwz rWORD2, 8(rSTR2)
+#endif
+ srw r0, rWORD2, rSHR
b L(dutrim)
/* Remainder is 8 */
.align 4
L(duP2):
- srw rE,rWORD8,rSHR
- lwz rWORD5,0(rSTR1)
- or rWORD6,rE,rH
- slw rH,rWORD8,rSHL
+ srw r0, rWORD8, rSHR
+#ifdef __LITTLE_ENDIAN__
+ lwbrx rWORD5, 0, rSTR1
+ addi rSTR1, rSTR1, 4
+#else
+ lwz rWORD5, 0(rSTR1)
+#endif
+ or rWORD6, r0, rWORD6_SHIFT
+ slw rWORD6_SHIFT, rWORD8, rSHL
L(duP2e):
- lwz rWORD7,4(rSTR1)
- lwz rWORD8,4(rSTR2)
- cmplw cr6,rWORD5,rWORD6
- srw rG,rWORD8,rSHR
- slw rB,rWORD8,rSHL
- or rWORD8,rG,rH
- blt cr7,L(duP2x)
- lwz rWORD1,8(rSTR1)
- lwz rWORD2,8(rSTR2)
- cmplw cr5,rWORD7,rWORD8
- bne cr6,L(duLcr6)
- srw rA,rWORD2,rSHR
- slw rD,rWORD2,rSHL
- or rWORD2,rA,rB
- lwz rWORD3,12(rSTR1)
- lwz rWORD4,12(rSTR2)
- cmplw cr0,rWORD1,rWORD2
- bne cr5,L(duLcr5)
- srw rC,rWORD4,rSHR
- slw rF,rWORD4,rSHL
- or rWORD4,rC,rD
- addi rSTR1,rSTR1,4
- addi rSTR2,rSTR2,4
- cmplw cr1,rWORD3,rWORD4
+#ifdef __LITTLE_ENDIAN__
+ lwbrx rWORD7, 0, rSTR1
+ lwbrx rWORD8, 0, rSTR2
+ addi rSTR1, rSTR1, 4
+ addi rSTR2, rSTR2, 4
+#else
+ lwz rWORD7, 4(rSTR1)
+ lwz rWORD8, 4(rSTR2)
+#endif
+ cmplw cr6, rWORD5, rWORD6
+ srw r12, rWORD8, rSHR
+ slw rWORD8_SHIFT, rWORD8, rSHL
+ or rWORD8, r12, rWORD6_SHIFT
+ blt cr7, L(duP2x)
+#ifdef __LITTLE_ENDIAN__
+ lwbrx rWORD1, 0, rSTR1
+ lwbrx rWORD2, 0, rSTR2
+ addi rSTR1, rSTR1, 4
+ addi rSTR2, rSTR2, 4
+#else
+ lwz rWORD1, 8(rSTR1)
+ lwz rWORD2, 8(rSTR2)
+#endif
+ cmplw cr5, rWORD7, rWORD8
+ bne cr6, L(duLcr6)
+ srw r0, rWORD2, rSHR
+ slw rWORD2_SHIFT, rWORD2, rSHL
+ or rWORD2, r0, rWORD8_SHIFT
+#ifdef __LITTLE_ENDIAN__
+ lwbrx rWORD3, 0, rSTR1
+ lwbrx rWORD4, 0, rSTR2
+ addi rSTR1, rSTR1, 4
+ addi rSTR2, rSTR2, 4
+#else
+ lwz rWORD3, 12(rSTR1)
+ lwz rWORD4, 12(rSTR2)
+#endif
+ cmplw cr7, rWORD1, rWORD2
+ bne cr5, L(duLcr5)
+ srw r12, rWORD4, rSHR
+ slw rWORD4_SHIFT, rWORD4, rSHL
+ or rWORD4, r12, rWORD2_SHIFT
+#ifndef __LITTLE_ENDIAN__
+ addi rSTR1, rSTR1, 4
+ addi rSTR2, rSTR2, 4
+#endif
+ cmplw cr1, rWORD3, rWORD4
b L(duLoop2)
.align 4
L(duP2x):
- cmplw cr5,rWORD7,rWORD8
- addi rSTR1,rSTR1,4
- addi rSTR2,rSTR2,4
- bne cr6,L(duLcr6)
- slwi. rN,rN,3
- bne cr5,L(duLcr5)
- cmplw cr7,rN,rSHR
+ cmplw cr5, rWORD7, rWORD8
+#ifndef __LITTLE_ENDIAN__
+ addi rSTR1, rSTR1, 4
+ addi rSTR2, rSTR2, 4
+#endif
+ bne cr6, L(duLcr6)
+ slwi. rN, rN, 3
+ bne cr5, L(duLcr5)
+ cmplw cr7, rN, rSHR
beq L(duZeroReturn)
- li rA,0
- ble cr7,L(dutrim)
- lwz rWORD2,4(rSTR2)
- srw rA,rWORD2,rSHR
+ li r0, 0
+ ble cr7, L(dutrim)
+#ifdef __LITTLE_ENDIAN__
+ lwbrx rWORD2, 0, rSTR2
+ addi rSTR2, rSTR2, 4
+#else
+ lwz rWORD2, 4(rSTR2)
+#endif
+ srw r0, rWORD2, rSHR
b L(dutrim)
/* Remainder is 12 */
.align 4
L(duP3):
- srw rC,rWORD8,rSHR
- lwz rWORD3,0(rSTR1)
- slw rF,rWORD8,rSHL
- or rWORD4,rC,rH
+ srw r12, rWORD8, rSHR
+#ifdef __LITTLE_ENDIAN__
+ lwbrx rWORD3, 0, rSTR1
+ addi rSTR1, rSTR1, 4
+#else
+ lwz rWORD3, 0(rSTR1)
+#endif
+ slw rWORD4_SHIFT, rWORD8, rSHL
+ or rWORD4, r12, rWORD6_SHIFT
L(duP3e):
- lwz rWORD5,4(rSTR1)
- lwz rWORD6,4(rSTR2)
- cmplw cr1,rWORD3,rWORD4
- srw rE,rWORD6,rSHR
- slw rH,rWORD6,rSHL
- or rWORD6,rE,rF
- lwz rWORD7,8(rSTR1)
- lwz rWORD8,8(rSTR2)
- cmplw cr6,rWORD5,rWORD6
- bne cr1,L(duLcr1)
- srw rG,rWORD8,rSHR
- slw rB,rWORD8,rSHL
- or rWORD8,rG,rH
- blt cr7,L(duP3x)
- lwz rWORD1,12(rSTR1)
- lwz rWORD2,12(rSTR2)
- cmplw cr5,rWORD7,rWORD8
- bne cr6,L(duLcr6)
- srw rA,rWORD2,rSHR
- slw rD,rWORD2,rSHL
- or rWORD2,rA,rB
- addi rSTR1,rSTR1,8
- addi rSTR2,rSTR2,8
- cmplw cr0,rWORD1,rWORD2
+#ifdef __LITTLE_ENDIAN__
+ lwbrx rWORD5, 0, rSTR1
+ lwbrx rWORD6, 0, rSTR2
+ addi rSTR1, rSTR1, 4
+ addi rSTR2, rSTR2, 4
+#else
+ lwz rWORD5, 4(rSTR1)
+ lwz rWORD6, 4(rSTR2)
+#endif
+ cmplw cr1, rWORD3, rWORD4
+ srw r0, rWORD6, rSHR
+ slw rWORD6_SHIFT, rWORD6, rSHL
+ or rWORD6, r0, rWORD4_SHIFT
+#ifdef __LITTLE_ENDIAN__
+ lwbrx rWORD7, 0, rSTR1
+ lwbrx rWORD8, 0, rSTR2
+ addi rSTR1, rSTR1, 4
+ addi rSTR2, rSTR2, 4
+#else
+ lwz rWORD7, 8(rSTR1)
+ lwz rWORD8, 8(rSTR2)
+#endif
+ cmplw cr6, rWORD5, rWORD6
+ bne cr1, L(duLcr1)
+ srw r12, rWORD8, rSHR
+ slw rWORD8_SHIFT, rWORD8, rSHL
+ or rWORD8, r12, rWORD6_SHIFT
+ blt cr7, L(duP3x)
+#ifdef __LITTLE_ENDIAN__
+ lwbrx rWORD1, 0, rSTR1
+ lwbrx rWORD2, 0, rSTR2
+ addi rSTR1, rSTR1, 4
+ addi rSTR2, rSTR2, 4
+#else
+ lwz rWORD1, 12(rSTR1)
+ lwz rWORD2, 12(rSTR2)
+#endif
+ cmplw cr5, rWORD7, rWORD8
+ bne cr6, L(duLcr6)
+ srw r0, rWORD2, rSHR
+ slw rWORD2_SHIFT, rWORD2, rSHL
+ or rWORD2, r0, rWORD8_SHIFT
+#ifndef __LITTLE_ENDIAN__
+ addi rSTR1, rSTR1, 8
+ addi rSTR2, rSTR2, 8
+#endif
+ cmplw cr7, rWORD1, rWORD2
b L(duLoop1)
.align 4
L(duP3x):
- addi rSTR1,rSTR1,8
- addi rSTR2,rSTR2,8
- bne cr1,L(duLcr1)
- cmplw cr5,rWORD7,rWORD8
- bne cr6,L(duLcr6)
- slwi. rN,rN,3
- bne cr5,L(duLcr5)
- cmplw cr7,rN,rSHR
+#ifndef __LITTLE_ENDIAN__
+ addi rSTR1, rSTR1, 8
+ addi rSTR2, rSTR2, 8
+#endif
+#if 0
+/* Huh? We've already branched on cr1! */
+ bne cr1, L(duLcr1)
+#endif
+ cmplw cr5, rWORD7, rWORD8
+ bne cr6, L(duLcr6)
+ slwi. rN, rN, 3
+ bne cr5, L(duLcr5)
+ cmplw cr7, rN, rSHR
beq L(duZeroReturn)
- li rA,0
- ble cr7,L(dutrim)
- lwz rWORD2,4(rSTR2)
- srw rA,rWORD2,rSHR
+ li r0, 0
+ ble cr7, L(dutrim)
+#ifdef __LITTLE_ENDIAN__
+ lwbrx rWORD2, 0, rSTR2
+ addi rSTR2, rSTR2, 4
+#else
+ lwz rWORD2, 4(rSTR2)
+#endif
+ srw r0, rWORD2, rSHR
b L(dutrim)
/* Count is a multiple of 16, remainder is 0 */
.align 4
L(duP4):
- mtctr rTMP
- srw rA,rWORD8,rSHR
- lwz rWORD1,0(rSTR1)
- slw rD,rWORD8,rSHL
- or rWORD2,rA,rH
+ mtctr r0
+ srw r0, rWORD8, rSHR
+#ifdef __LITTLE_ENDIAN__
+ lwbrx rWORD1, 0, rSTR1
+ addi rSTR1, rSTR1, 4
+#else
+ lwz rWORD1, 0(rSTR1)
+#endif
+ slw rWORD2_SHIFT, rWORD8, rSHL
+ or rWORD2, r0, rWORD6_SHIFT
L(duP4e):
- lwz rWORD3,4(rSTR1)
- lwz rWORD4,4(rSTR2)
- cmplw cr0,rWORD1,rWORD2
- srw rC,rWORD4,rSHR
- slw rF,rWORD4,rSHL
- or rWORD4,rC,rD
- lwz rWORD5,8(rSTR1)
- lwz rWORD6,8(rSTR2)
- cmplw cr1,rWORD3,rWORD4
- bne cr0,L(duLcr0)
- srw rE,rWORD6,rSHR
- slw rH,rWORD6,rSHL
- or rWORD6,rE,rF
- lwzu rWORD7,12(rSTR1)
- lwzu rWORD8,12(rSTR2)
- cmplw cr6,rWORD5,rWORD6
- bne cr1,L(duLcr1)
- srw rG,rWORD8,rSHR
- slw rB,rWORD8,rSHL
- or rWORD8,rG,rH
- cmplw cr5,rWORD7,rWORD8
+#ifdef __LITTLE_ENDIAN__
+ lwbrx rWORD3, 0, rSTR1
+ lwbrx rWORD4, 0, rSTR2
+ addi rSTR1, rSTR1, 4
+ addi rSTR2, rSTR2, 4
+#else
+ lwz rWORD3, 4(rSTR1)
+ lwz rWORD4, 4(rSTR2)
+#endif
+ cmplw cr7, rWORD1, rWORD2
+ srw r12, rWORD4, rSHR
+ slw rWORD4_SHIFT, rWORD4, rSHL
+ or rWORD4, r12, rWORD2_SHIFT
+#ifdef __LITTLE_ENDIAN__
+ lwbrx rWORD5, 0, rSTR1
+ lwbrx rWORD6, 0, rSTR2
+ addi rSTR1, rSTR1, 4
+ addi rSTR2, rSTR2, 4
+#else
+ lwz rWORD5, 8(rSTR1)
+ lwz rWORD6, 8(rSTR2)
+#endif
+ cmplw cr1, rWORD3, rWORD4
+ bne cr7, L(duLcr7)
+ srw r0, rWORD6, rSHR
+ slw rWORD6_SHIFT, rWORD6, rSHL
+ or rWORD6, r0, rWORD4_SHIFT
+#ifdef __LITTLE_ENDIAN__
+ lwbrx rWORD7, 0, rSTR1
+ lwbrx rWORD8, 0, rSTR2
+ addi rSTR1, rSTR1, 4
+ addi rSTR2, rSTR2, 4
+#else
+ lwzu rWORD7, 12(rSTR1)
+ lwzu rWORD8, 12(rSTR2)
+#endif
+ cmplw cr6, rWORD5, rWORD6
+ bne cr1, L(duLcr1)
+ srw r12, rWORD8, rSHR
+ slw rWORD8_SHIFT, rWORD8, rSHL
+ or rWORD8, r12, rWORD6_SHIFT
+ cmplw cr5, rWORD7, rWORD8
bdz L(du24) /* Adjust CTR as we start with +4 */
/* This is the primary loop */
.align 4
L(duLoop):
- lwz rWORD1,4(rSTR1)
- lwz rWORD2,4(rSTR2)
- cmplw cr1,rWORD3,rWORD4
- bne cr6,L(duLcr6)
- srw rA,rWORD2,rSHR
- slw rD,rWORD2,rSHL
- or rWORD2,rA,rB
+#ifdef __LITTLE_ENDIAN__
+ lwbrx rWORD1, 0, rSTR1
+ lwbrx rWORD2, 0, rSTR2
+ addi rSTR1, rSTR1, 4
+ addi rSTR2, rSTR2, 4
+#else
+ lwz rWORD1, 4(rSTR1)
+ lwz rWORD2, 4(rSTR2)
+#endif
+ cmplw cr1, rWORD3, rWORD4
+ bne cr6, L(duLcr6)
+ srw r0, rWORD2, rSHR
+ slw rWORD2_SHIFT, rWORD2, rSHL
+ or rWORD2, r0, rWORD8_SHIFT
L(duLoop1):
- lwz rWORD3,8(rSTR1)
- lwz rWORD4,8(rSTR2)
- cmplw cr6,rWORD5,rWORD6
- bne cr5,L(duLcr5)
- srw rC,rWORD4,rSHR
- slw rF,rWORD4,rSHL
- or rWORD4,rC,rD
+#ifdef __LITTLE_ENDIAN__
+ lwbrx rWORD3, 0, rSTR1
+ lwbrx rWORD4, 0, rSTR2
+ addi rSTR1, rSTR1, 4
+ addi rSTR2, rSTR2, 4
+#else
+ lwz rWORD3, 8(rSTR1)
+ lwz rWORD4, 8(rSTR2)
+#endif
+ cmplw cr6, rWORD5, rWORD6
+ bne cr5, L(duLcr5)
+ srw r12, rWORD4, rSHR
+ slw rWORD4_SHIFT, rWORD4, rSHL
+ or rWORD4, r12, rWORD2_SHIFT
L(duLoop2):
- lwz rWORD5,12(rSTR1)
- lwz rWORD6,12(rSTR2)
- cmplw cr5,rWORD7,rWORD8
- bne cr0,L(duLcr0)
- srw rE,rWORD6,rSHR
- slw rH,rWORD6,rSHL
- or rWORD6,rE,rF
+#ifdef __LITTLE_ENDIAN__
+ lwbrx rWORD5, 0, rSTR1
+ lwbrx rWORD6, 0, rSTR2
+ addi rSTR1, rSTR1, 4
+ addi rSTR2, rSTR2, 4
+#else
+ lwz rWORD5, 12(rSTR1)
+ lwz rWORD6, 12(rSTR2)
+#endif
+ cmplw cr5, rWORD7, rWORD8
+ bne cr7, L(duLcr7)
+ srw r0, rWORD6, rSHR
+ slw rWORD6_SHIFT, rWORD6, rSHL
+ or rWORD6, r0, rWORD4_SHIFT
L(duLoop3):
- lwzu rWORD7,16(rSTR1)
- lwzu rWORD8,16(rSTR2)
- cmplw cr0,rWORD1,rWORD2
- bne cr1,L(duLcr1)
- srw rG,rWORD8,rSHR
- slw rB,rWORD8,rSHL
- or rWORD8,rG,rH
+#ifdef __LITTLE_ENDIAN__
+ lwbrx rWORD7, 0, rSTR1
+ lwbrx rWORD8, 0, rSTR2
+ addi rSTR1, rSTR1, 4
+ addi rSTR2, rSTR2, 4
+#else
+ lwzu rWORD7, 16(rSTR1)
+ lwzu rWORD8, 16(rSTR2)
+#endif
+ cmplw cr7, rWORD1, rWORD2
+ bne cr1, L(duLcr1)
+ srw r12, rWORD8, rSHR
+ slw rWORD8_SHIFT, rWORD8, rSHL
+ or rWORD8, r12, rWORD6_SHIFT
bdnz L(duLoop)
L(duL4):
- bne cr1,L(duLcr1)
- cmplw cr1,rWORD3,rWORD4
- bne cr6,L(duLcr6)
- cmplw cr6,rWORD5,rWORD6
- bne cr5,L(duLcr5)
- cmplw cr5,rWORD7,rWORD8
+#if 0
+/* Huh? We've already branched on cr1! */
+ bne cr1, L(duLcr1)
+#endif
+ cmplw cr1, rWORD3, rWORD4
+ bne cr6, L(duLcr6)
+ cmplw cr6, rWORD5, rWORD6
+ bne cr5, L(duLcr5)
+ cmplw cr5, rWORD7, rWORD8
L(du44):
- bne cr0,L(duLcr0)
+ bne cr7, L(duLcr7)
L(du34):
- bne cr1,L(duLcr1)
+ bne cr1, L(duLcr1)
L(du24):
- bne cr6,L(duLcr6)
+ bne cr6, L(duLcr6)
L(du14):
- slwi. rN,rN,3
- bne cr5,L(duLcr5)
+ slwi. rN, rN, 3
+ bne cr5, L(duLcr5)
/* At this point we have a remainder of 1 to 3 bytes to compare. We use
shift right to eliminate bits beyond the compare length.
+ This allows the use of word subtract to compute the final result.
However it may not be safe to load rWORD2 which may be beyond the
string length. So we compare the bit length of the remainder to
the right shift count (rSHR). If the bit count is less than or equal
we do not need to load rWORD2 (all significant bits are already in
- rB). */
- cmplw cr7,rN,rSHR
+ rWORD8_SHIFT). */
+ cmplw cr7, rN, rSHR
beq L(duZeroReturn)
- li rA,0
- ble cr7,L(dutrim)
- lwz rWORD2,4(rSTR2)
- srw rA,rWORD2,rSHR
+ li r0, 0
+ ble cr7, L(dutrim)
+#ifdef __LITTLE_ENDIAN__
+ lwbrx rWORD2, 0, rSTR2
+ addi rSTR2, rSTR2, 4
+#else
+ lwz rWORD2, 4(rSTR2)
+#endif
+ srw r0, rWORD2, rSHR
.align 4
L(dutrim):
- lwz rWORD1,4(rSTR1)
- lwz r31,48(1)
- subfic rN,rN,32 /* Shift count is 32 - (rN * 8). */
- or rWORD2,rA,rB
- lwz r30,44(1)
- lwz r29,40(r1)
- srw rWORD1,rWORD1,rN
- srw rWORD2,rWORD2,rN
- lwz r28,36(r1)
- lwz r27,32(r1)
- cmplw rWORD1,rWORD2
- li rRTN,0
- beq L(dureturn26)
- li rRTN,1
- bgt L(dureturn26)
- li rRTN,-1
+#ifdef __LITTLE_ENDIAN__
+ lwbrx rWORD1, 0, rSTR1
+#else
+ lwz rWORD1, 4(rSTR1)
+#endif
+ lwz rWORD8, 48(r1)
+ subfic rN, rN, 32 /* Shift count is 32 - (rN * 8). */
+ or rWORD2, r0, rWORD8_SHIFT
+ lwz rWORD7, 44(r1)
+ lwz rSHL, 40(r1)
+ srw rWORD1, rWORD1, rN
+ srw rWORD2, rWORD2, rN
+ lwz rSHR, 36(r1)
+ lwz rWORD8_SHIFT, 32(r1)
+ sub rRTN, rWORD1, rWORD2
b L(dureturn26)
.align 4
-L(duLcr0):
- lwz r31,48(1)
- lwz r30,44(1)
- li rRTN,1
- bgt cr0,L(dureturn29)
- lwz r29,40(r1)
- lwz r28,36(r1)
- li rRTN,-1
+L(duLcr7):
+ lwz rWORD8, 48(r1)
+ lwz rWORD7, 44(r1)
+ li rRTN, 1
+ bgt cr7, L(dureturn29)
+ lwz rSHL, 40(r1)
+ lwz rSHR, 36(r1)
+ li rRTN, -1
b L(dureturn27)
.align 4
L(duLcr1):
- lwz r31,48(1)
- lwz r30,44(1)
- li rRTN,1
- bgt cr1,L(dureturn29)
- lwz r29,40(r1)
- lwz r28,36(r1)
- li rRTN,-1
+ lwz rWORD8, 48(r1)
+ lwz rWORD7, 44(r1)
+ li rRTN, 1
+ bgt cr1, L(dureturn29)
+ lwz rSHL, 40(r1)
+ lwz rSHR, 36(r1)
+ li rRTN, -1
b L(dureturn27)
.align 4
L(duLcr6):
- lwz r31,48(1)
- lwz r30,44(1)
- li rRTN,1
- bgt cr6,L(dureturn29)
- lwz r29,40(r1)
- lwz r28,36(r1)
- li rRTN,-1
+ lwz rWORD8, 48(r1)
+ lwz rWORD7, 44(r1)
+ li rRTN, 1
+ bgt cr6, L(dureturn29)
+ lwz rSHL, 40(r1)
+ lwz rSHR, 36(r1)
+ li rRTN, -1
b L(dureturn27)
.align 4
L(duLcr5):
- lwz r31,48(1)
- lwz r30,44(1)
- li rRTN,1
- bgt cr5,L(dureturn29)
- lwz r29,40(r1)
- lwz r28,36(r1)
- li rRTN,-1
+ lwz rWORD8, 48(r1)
+ lwz rWORD7, 44(r1)
+ li rRTN, 1
+ bgt cr5, L(dureturn29)
+ lwz rSHL, 40(r1)
+ lwz rSHR, 36(r1)
+ li rRTN, -1
b L(dureturn27)
.align 3
L(duZeroReturn):
- li rRTN,0
+ li rRTN, 0
.align 4
L(dureturn):
- lwz r31,48(1)
- lwz r30,44(1)
+ lwz rWORD8, 48(r1)
+ lwz rWORD7, 44(r1)
L(dureturn29):
- lwz r29,40(r1)
- lwz r28,36(r1)
+ lwz rSHL, 40(r1)
+ lwz rSHR, 36(r1)
L(dureturn27):
- lwz r27,32(r1)
+ lwz rWORD8_SHIFT, 32(r1)
L(dureturn26):
- lwz r26,28(r1)
+ lwz rWORD2_SHIFT, 28(r1)
L(dureturn25):
- lwz r25,24(r1)
- lwz r24,20(r1)
- lwz 1,0(1)
+ lwz rWORD4_SHIFT, 24(r1)
+ lwz rWORD6_SHIFT, 20(r1)
+ addi r1, r1, 64
+ cfi_adjust_cfa_offset(-64)
blr
END (memcmp)
+
libc_hidden_builtin_def (memcmp)
-weak_alias (memcmp,bcmp)
+weak_alias (memcmp, bcmp)
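The little-endian paths added above replace each lwz/lwzu with lwbrx plus an explicit addi, so every word is loaded with its first memory byte in the most significant position; a single unsigned word compare (or, for the short shifted tails, a plain subtract) then agrees with byte-wise memcmp ordering on either endianness.  A minimal C sketch of that idea, not the glibc code, with an invented helper name and assuming GCC/Clang builtins plus 4-byte-aligned buffers whose length is a multiple of 4:

#include <stddef.h>
#include <stdint.h>
#include <string.h>

/* Hypothetical helper illustrating the byte-reversed-load trick.  */
static int
wordwise_memcmp (const void *a, const void *b, size_t n)
{
  const unsigned char *p = a, *q = b;
  for (; n >= 4; n -= 4, p += 4, q += 4)
    {
      uint32_t w1, w2;
      memcpy (&w1, p, 4);
      memcpy (&w2, q, 4);
#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
      /* What lwbrx achieves: the first byte in memory becomes the most
         significant byte of the register.  */
      w1 = __builtin_bswap32 (w1);
      w2 = __builtin_bswap32 (w2);
#endif
      if (w1 != w2)
        return w1 > w2 ? 1 : -1;
    }
  return 0;
}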
diff --git a/libc/sysdeps/powerpc/powerpc32/power7/memcpy.S b/libc/sysdeps/powerpc/powerpc32/power7/memcpy.S
index 7f0077823..acf3c1019 100644
--- a/libc/sysdeps/powerpc/powerpc32/power7/memcpy.S
+++ b/libc/sysdeps/powerpc/powerpc32/power7/memcpy.S
@@ -383,7 +383,7 @@ L(copy_GE_32_unaligned):
beq L(copy_GE_32_unaligned_cont)
- /* SRC is not quadword aligned, get it aligned. */
+ /* DST is not quadword aligned, get it aligned. */
mtcrf 0x01,0
subf 31,0,5
@@ -435,13 +435,21 @@ L(copy_GE_32_unaligned_cont):
mr 11,12
mtcrf 0x01,9
cmplwi cr6,9,1
+#ifdef __LITTLE_ENDIAN__
+ lvsr 5,0,12
+#else
lvsl 5,0,12
+#endif
lvx 3,0,12
bf 31,L(setup_unaligned_loop)
	/* Copy another 16 bytes to align to 32 bytes due to the loop.  */
lvx 4,12,6
+#ifdef __LITTLE_ENDIAN__
+ vperm 6,4,3,5
+#else
vperm 6,3,4,5
+#endif
addi 11,12,16
addi 10,3,16
stvx 6,0,3
@@ -461,11 +469,17 @@ L(unaligned_loop):
vector instructions though. */
lvx 4,11,6 /* vr4 = r11+16. */
- vperm 6,3,4,5 /* Merge the correctly-aligned portions
- of vr3/vr4 into vr6. */
+#ifdef __LITTLE_ENDIAN__
+ vperm 6,4,3,5
+#else
+ vperm 6,3,4,5
+#endif
lvx 3,11,7 /* vr3 = r11+32. */
- vperm 10,4,3,5 /* Merge the correctly-aligned portions
- of vr3/vr4 into vr10. */
+#ifdef __LITTLE_ENDIAN__
+ vperm 10,3,4,5
+#else
+ vperm 10,4,3,5
+#endif
addi 11,11,32
stvx 6,0,10
stvx 10,10,6
diff --git a/libc/sysdeps/powerpc/powerpc32/power7/mempcpy.S b/libc/sysdeps/powerpc/powerpc32/power7/mempcpy.S
index 5ad4edb58..4610ec5b5 100644
--- a/libc/sysdeps/powerpc/powerpc32/power7/mempcpy.S
+++ b/libc/sysdeps/powerpc/powerpc32/power7/mempcpy.S
@@ -325,7 +325,7 @@ L(copy_GE_32_unaligned):
beq L(copy_GE_32_unaligned_cont)
- /* SRC is not quadword aligned, get it aligned. */
+ /* DST is not quadword aligned, get it aligned. */
mtcrf 0x01,0
subf 31,0,5
@@ -377,13 +377,21 @@ L(copy_GE_32_unaligned_cont):
mr 11,12
mtcrf 0x01,9
cmplwi cr6,9,1
- lvsl 5,0,12
+#ifdef __LITTLE_ENDIAN__
+ lvsr 5,0,12
+#else
+ lvsl 5,0,12
+#endif
lvx 3,0,12
bf 31,L(setup_unaligned_loop)
	/* Copy another 16 bytes to align to 32 bytes due to the loop.  */
lvx 4,12,6
- vperm 6,3,4,5
+#ifdef __LITTLE_ENDIAN__
+ vperm 6,4,3,5
+#else
+ vperm 6,3,4,5
+#endif
addi 11,12,16
addi 10,3,16
stvx 6,0,3
@@ -403,11 +411,17 @@ L(unaligned_loop):
vector instructions though. */
lvx 4,11,6 /* vr4 = r11+16. */
- vperm 6,3,4,5 /* Merge the correctly-aligned portions
- of vr3/vr4 into vr6. */
+#ifdef __LITTLE_ENDIAN__
+ vperm 6,4,3,5
+#else
+ vperm 6,3,4,5
+#endif
lvx 3,11,7 /* vr3 = r11+32. */
- vperm 10,4,3,5 /* Merge the correctly-aligned portions
- of vr3/vr4 into vr10. */
+#ifdef __LITTLE_ENDIAN__
+ vperm 10,3,4,5
+#else
+ vperm 10,4,3,5
+#endif
addi 11,11,32
stvx 6,0,10
stvx 10,10,6
diff --git a/libc/sysdeps/powerpc/powerpc32/power7/memrchr.S b/libc/sysdeps/powerpc/powerpc32/power7/memrchr.S
index defd832b0..9601aa799 100644
--- a/libc/sysdeps/powerpc/powerpc32/power7/memrchr.S
+++ b/libc/sysdeps/powerpc/powerpc32/power7/memrchr.S
@@ -23,117 +23,131 @@
.machine power7
ENTRY (__memrchr)
CALL_MCOUNT
- dcbt 0,r3
- mr r7,r3
- add r3,r7,r5 /* Calculate the last acceptable address. */
- cmplw cr7,r3,r7 /* Is the address equal or less than r3? */
+ add r7,r3,r5 /* Calculate the last acceptable address. */
+ neg r0,r7
+ addi r7,r7,-1
+ mr r10,r3
+ clrrwi r6,r7,7
+ li r9,3<<5
+ dcbt r9,r6,16 /* Stream hint, decreasing addresses. */
/* Replicate BYTE to word. */
- rlwimi r4,r4,8,16,23
- rlwimi r4,r4,16,0,15
- bge cr7,L(proceed)
-
- li r3,-1 /* Make r11 the biggest if r4 <= 0. */
-L(proceed):
+ rldimi r4,r4,8,48
+ rldimi r4,r4,16,32
li r6,-4
- addi r9,r3,-1
- clrrwi r8,r9,2
- addi r8,r8,4
- neg r0,r3
+ li r9,-1
rlwinm r0,r0,3,27,28 /* Calculate padding. */
-
+ clrrwi r8,r7,2
+ srw r9,r9,r0
cmplwi r5,16
+ clrrwi r0,r10,2
ble L(small_range)
- lwbrx r12,r8,r6 /* Load reversed word from memory. */
- cmpb r10,r12,r4 /* Check for BYTE in WORD1. */
- slw r10,r10,r0
- srw r10,r10,r0
- cmplwi cr7,r10,0 /* If r10 == 0, no BYTEs have been found. */
+#ifdef __LITTLE_ENDIAN__
+ lwzx r12,0,r8
+#else
+ lwbrx r12,0,r8 /* Load reversed word from memory. */
+#endif
+ cmpb r3,r12,r4 /* Check for BYTE in WORD1. */
+ and r3,r3,r9
+ cmplwi cr7,r3,0 /* If r3 == 0, no BYTEs have been found. */
bne cr7,L(done)
- /* Are we done already? */
- addi r9,r8,-4
- cmplw cr6,r9,r7
- ble cr6,L(null)
-
mtcrf 0x01,r8
/* Are we now aligned to a doubleword boundary? If so, skip to
the main loop. Otherwise, go through the alignment code. */
- mr r8,r9
- bt 29,L(loop_setup)
+ bf 29,L(loop_setup)
/* Handle WORD2 of pair. */
+#ifdef __LITTLE_ENDIAN__
+ lwzx r12,r8,r6
+#else
lwbrx r12,r8,r6
- cmpb r10,r12,r4
- cmplwi cr7,r10,0
- bne cr7,L(done)
-
- /* Are we done already? */
+#endif
addi r8,r8,-4
- cmplw cr6,r8,r7
- ble cr6,L(null)
+ cmpb r3,r12,r4
+ cmplwi cr7,r3,0
+ bne cr7,L(done)
L(loop_setup):
- li r0,-8
- sub r5,r8,r7
- srwi r9,r5,3 /* Number of loop iterations. */
+ /* The last word we want to read in the loop below is the one
+ containing the first byte of the string, ie. the word at
+ s & ~3, or r0. The first word read is at r8 - 4, we
+ read 2 * cnt words, so the last word read will be at
+ r8 - 4 - 8 * cnt + 4. Solving for cnt gives
+ cnt = (r8 - r0) / 8 */
+ sub r5,r8,r0
+ addi r8,r8,-4
+ srwi r9,r5,3 /* Number of loop iterations. */
mtctr r9 /* Setup the counter. */
- b L(loop)
- /* Main loop to look for BYTE backwards in the string. Since it's a
- small loop (< 8 instructions), align it to 32-bytes. */
- .p2align 5
+
+ /* Main loop to look for BYTE backwards in the string.
+ FIXME: Investigate whether 32 byte align helps with this
+ 9 instruction loop. */
+ .align 5
L(loop):
/* Load two words, compare and merge in a
single register for speed. This is an attempt
to speed up the byte-checking process for bigger strings. */
- lwbrx r12,r8,r6
- lwbrx r11,r8,r0
- addi r8,r8,-4
- cmpb r10,r12,r4
+#ifdef __LITTLE_ENDIAN__
+ lwzx r12,0,r8
+ lwzx r11,r8,r6
+#else
+ lwbrx r12,0,r8
+ lwbrx r11,r8,r6
+#endif
+ cmpb r3,r12,r4
cmpb r9,r11,r4
- or r5,r9,r10 /* Merge everything in one word. */
+ or r5,r9,r3 /* Merge everything in one word. */
cmplwi cr7,r5,0
bne cr7,L(found)
- addi r8,r8,-4
+ addi r8,r8,-8
bdnz L(loop)
- /* We're here because the counter reached 0, and that means we
- didn't have any matches for BYTE in the whole range. Just return
- the original range. */
- addi r8,r8,4
- cmplw cr6,r8,r7
- bgt cr6,L(loop_small)
- b L(null)
- /* OK, one (or both) of the words contains BYTE. Check
- the first word and decrement the address in case the first
- word really contains BYTE. */
+ /* We may have one more word to read. */
+ cmplw r8,r0
+ bnelr
+
+#ifdef __LITTLE_ENDIAN__
+ lwzx r12,0,r8
+#else
+ lwbrx r12,0,r8
+#endif
+ cmpb r3,r12,r4
+ cmplwi cr7,r3,0
+ bne cr7,L(done)
+ blr
+
.align 4
L(found):
- cmplwi cr6,r10,0
- addi r8,r8,4
+ /* OK, one (or both) of the words contains BYTE. Check
+ the first word. */
+ cmplwi cr6,r3,0
bne cr6,L(done)
/* BYTE must be in the second word. Adjust the address
- again and move the result of cmpb to r10 so we can calculate the
+ again and move the result of cmpb to r3 so we can calculate the
pointer. */
- mr r10,r9
+ mr r3,r9
addi r8,r8,-4
- /* r10 has the output of the cmpb instruction, that is, it contains
+ /* r3 has the output of the cmpb instruction, that is, it contains
0xff in the same position as BYTE in the original
word from the string. Use that to calculate the pointer.
We need to make sure BYTE is *before* the end of the
range. */
L(done):
- cntlzw r0,r10 /* Count leading zeroes before the match. */
- srwi r6,r0,3 /* Convert leading zeroes to bytes. */
- addi r0,r6,1
+ cntlzw r9,r3 /* Count leading zeros before the match. */
+ cmplw r8,r0 /* Are we on the last word? */
+ srwi r6,r9,3 /* Convert leading zeros to bytes. */
+ addi r0,r6,-3
sub r3,r8,r0
- cmplw r3,r7
- blt L(null)
+ cmplw cr7,r3,r10
+ bnelr
+ bgelr cr7
+ li r3,0
blr
.align 4
@@ -147,28 +161,35 @@ L(small_range):
cmplwi r5,0
beq L(null)
- lwbrx r12,r8,r6 /* Load reversed word from memory. */
- cmpb r10,r12,r4 /* Check for null bytes in WORD1. */
- slw r10,r10,r0
- srw r10,r10,r0
- cmplwi cr7,r10,0
+#ifdef __LITTLE_ENDIAN__
+ lwzx r12,0,r8
+#else
+ lwbrx r12,0,r8 /* Load reversed word from memory. */
+#endif
+ cmpb r3,r12,r4 /* Check for BYTE in WORD1. */
+ and r3,r3,r9
+ cmplwi cr7,r3,0
bne cr7,L(done)
+ /* Are we done already? */
+ cmplw r8,r0
addi r8,r8,-4
- cmplw r8,r7
- ble L(null)
- b L(loop_small)
+ beqlr
- .p2align 5
+ .align 5
L(loop_small):
- lwbrx r12,r8,r6
- cmpb r10,r12,r4
- cmplwi cr6,r10,0
- bne cr6,L(done)
+#ifdef __LITTLE_ENDIAN__
+ lwzx r12,0,r8
+#else
+ lwbrx r12,0,r8
+#endif
+ cmpb r3,r12,r4
+ cmplw r8,r0
+ cmplwi cr7,r3,0
+ bne cr7,L(done)
addi r8,r8,-4
- cmplw r8,r7
- ble L(null)
- b L(loop_small)
+ bne L(loop_small)
+ blr
END (__memrchr)
weak_alias (__memrchr, memrchr)
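Both endian variants of __memrchr reach L(done) with a cmpb-style mask holding 0xff in every matching byte position and use cntlzw to turn it into an offset; only the loads differ (lwbrx on big-endian, plain lwzx on little-endian) so that the highest-addressed byte always ends up most significant.  A rough, hypothetical C equivalent, assuming GCC builtins (the per-byte loop merely stands in for cmpb):

#include <stdint.h>
#include <string.h>

/* Hypothetical helper: address of the last occurrence of C in the
   4-byte word at P, or NULL if there is none.  */
static const unsigned char *
last_byte_in_word (const unsigned char *p, unsigned char c)
{
  uint32_t w;
  memcpy (&w, p, 4);
#if __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
  w = __builtin_bswap32 (w);    /* what lwbrx does on the big-endian path */
#endif
  /* Emulate cmpb: 0xff in every byte position equal to C.  */
  uint32_t mask = 0;
  for (int i = 0; i < 4; i++)
    if (((w >> (8 * i)) & 0xff) == c)
      mask |= 0xffu << (8 * i);
  if (mask == 0)
    return 0;
  /* Leading zeros, in bytes, measure the distance from the end of the
     word back to the match nearest the end.  */
  return p + 3 - __builtin_clz (mask) / 8;
}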
diff --git a/libc/sysdeps/powerpc/powerpc32/power7/memset.S b/libc/sysdeps/powerpc/powerpc32/power7/memset.S
index 360ea717f..aadda2558 100644
--- a/libc/sysdeps/powerpc/powerpc32/power7/memset.S
+++ b/libc/sysdeps/powerpc/powerpc32/power7/memset.S
@@ -35,8 +35,8 @@ L(_memset):
cfi_offset(31,-8)
/* Replicate byte to word. */
- rlwimi 4,4,8,16,23
- rlwimi 4,4,16,0,15
+ insrdi 4,4,8,48
+ insrdi 4,4,16,32
ble cr6,L(small) /* If length <= 8, use short copy code. */
diff --git a/libc/sysdeps/powerpc/powerpc32/power7/rawmemchr.S b/libc/sysdeps/powerpc/powerpc32/power7/rawmemchr.S
index a80c74a09..c2d8c4b7b 100644
--- a/libc/sysdeps/powerpc/powerpc32/power7/rawmemchr.S
+++ b/libc/sysdeps/powerpc/powerpc32/power7/rawmemchr.S
@@ -27,16 +27,21 @@ ENTRY (__rawmemchr)
clrrwi r8,r3,2 /* Align the address to word boundary. */
/* Replicate byte to word. */
- rlwimi r4,r4,8,16,23
- rlwimi r4,r4,16,0,15
+ rldimi r4,r4,8,48
+ rldimi r4,r4,16,32
/* Now r4 has a word of c bytes. */
rlwinm r6,r3,3,27,28 /* Calculate padding. */
lwz r12,0(r8) /* Load word from memory. */
cmpb r5,r12,r4 /* Compare each byte against c byte. */
+#ifdef __LITTLE_ENDIAN__
+ srw r5,r5,r6
+ slw r5,r5,r6
+#else
slw r5,r5,r6 /* Move left to discard ignored bits. */
srw r5,r5,r6 /* Bring the bits back as zeros. */
+#endif
cmpwi cr7,r5,0 /* If r5 == 0, no c bytes have been found. */
bne cr7,L(done)
@@ -90,8 +95,14 @@ L(loop):
word from the string. Use that fact to find out what is
the position of the byte inside the string. */
L(done):
+#ifdef __LITTLE_ENDIAN__
+ addi r0,r5,-1
+ andc r0,r0,r5
+ popcntw r0,r0
+#else
cntlzw r0,r5 /* Count leading zeros before the match. */
- srwi r0,r0,3 /* Convert leading zeroes to bytes. */
+#endif
+ srwi r0,r0,3 /* Convert leading zeros to bytes. */
add r3,r8,r0 /* Return address of the matching char. */
blr
END (__rawmemchr)
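The endian-dependent pieces of __rawmemchr are small: the padding shifts swap direction because the bytes before the start of the buffer sit at the most significant end of a big-endian load but at the least significant end of a little-endian one, and the final mask-to-index step switches from leading zeros to the popcount idiom shown for strlen below.  The padding half, as a hedged C sketch with the same builtin assumptions and an invented helper name:

#include <stdint.h>

/* Hypothetical helper: clear the match bits belonging to the PAD/8
   bytes that precede the real start of the buffer in the first,
   rounded-down aligned word.  PAD is 8 * (s & 3), so at most 24.  */
static uint32_t
drop_leading_bytes (uint32_t mask, unsigned pad)
{
#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
  return (mask >> pad) << pad;  /* srw then slw in the patch */
#else
  return (mask << pad) >> pad;  /* slw then srw */
#endif
}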
diff --git a/libc/sysdeps/powerpc/powerpc32/power7/strchr.S b/libc/sysdeps/powerpc/powerpc32/power7/strchr.S
index 0ecadb271..b66265967 100644
--- a/libc/sysdeps/powerpc/powerpc32/power7/strchr.S
+++ b/libc/sysdeps/powerpc/powerpc32/power7/strchr.S
@@ -35,8 +35,8 @@ ENTRY (strchr)
beq cr7,L(null_match)
/* Replicate byte to word. */
- rlwimi r4,r4,8,16,23
- rlwimi r4,r4,16,0,15
+ insrdi r4,r4,8,48
+ insrdi r4,r4,16,32
/* Now r4 has a word of c bytes and r0 has
a word of null bytes. */
@@ -46,11 +46,17 @@ ENTRY (strchr)
/* Move the words left and right to discard the bits that are
not part of the string and to bring them back as zeros. */
-
+#ifdef __LITTLE_ENDIAN__
+ srw r10,r10,r6
+ srw r11,r11,r6
+ slw r10,r10,r6
+ slw r11,r11,r6
+#else
slw r10,r10,r6
slw r11,r11,r6
srw r10,r10,r6
srw r11,r11,r6
+#endif
or r5,r10,r11 /* OR the results to speed things up. */
cmpwi cr7,r5,0 /* If r5 == 0, no c or null bytes
have been found. */
@@ -65,7 +71,7 @@ ENTRY (strchr)
/* Handle WORD2 of pair. */
lwzu r12,4(r8)
- cmpb r10,r12,r4
+ cmpb r10,r12,r4
cmpb r11,r12,r0
or r5,r10,r11
cmpwi cr7,r5,0
@@ -100,22 +106,31 @@ L(loop):
bne cr6,L(done)
/* The c/null byte must be in the second word. Adjust the address
- again and move the result of cmpb to r10 so we can calculate the
- pointer. */
+ again and move the result of cmpb to r10/r11 so we can calculate
+ the pointer. */
mr r10,r6
mr r11,r7
addi r8,r8,4
- /* r5 has the output of the cmpb instruction, that is, it contains
+ /* r10/r11 have the output of the cmpb instructions, that is,
0xff in the same position as the c/null byte in the original
word from the string. Use that to calculate the pointer. */
L(done):
- cntlzw r4,r10 /* Count leading zeroes before c matches. */
- cntlzw r0,r11 /* Count leading zeroes before null matches. */
- cmplw cr7,r4,r0
+#ifdef __LITTLE_ENDIAN__
+ addi r3,r10,-1
+ andc r3,r3,r10
+ popcntw r0,r3
+ addi r4,r11,-1
+ andc r4,r4,r11
+ cmplw cr7,r3,r4
+ bgt cr7,L(no_match)
+#else
+ cntlzw r0,r10 /* Count leading zeros before c matches. */
+ cmplw cr7,r11,r10
bgt cr7,L(no_match)
- srwi r0,r4,3 /* Convert leading zeroes to bytes. */
+#endif
+ srwi r0,r0,3 /* Convert leading zeros to bytes. */
add r3,r8,r0 /* Return address of the matching c byte
or null in case c was not found. */
blr
@@ -133,10 +148,14 @@ L(null_match):
cmpb r5,r12,r0 /* Compare each byte against null bytes. */
/* Move the words left and right to discard the bits that are
- not part of the string and to bring them back as zeros. */
-
+ not part of the string and bring them back as zeros. */
+#ifdef __LITTLE_ENDIAN__
+ srw r5,r5,r6
+ slw r5,r5,r6
+#else
slw r5,r5,r6
srw r5,r5,r6
+#endif
	cmpwi	cr7,r5,0	/* If r5 == 0, no c or null bytes
have been found. */
bne cr7,L(done_null)
@@ -191,7 +210,13 @@ L(loop_null):
0xff in the same position as the null byte in the original
word from the string. Use that to calculate the pointer. */
L(done_null):
+#ifdef __LITTLE_ENDIAN__
+ addi r0,r5,-1
+ andc r0,r0,r5
+ popcntw r0,r0
+#else
cntlzw r0,r5 /* Count leading zeros before the match. */
+#endif
srwi r0,r0,3 /* Convert leading zeros to bytes. */
add r3,r8,r0 /* Return address of the matching null byte. */
blr
diff --git a/libc/sysdeps/powerpc/powerpc32/power7/strchrnul.S b/libc/sysdeps/powerpc/powerpc32/power7/strchrnul.S
index d4cacab60..f5d24d434 100644
--- a/libc/sysdeps/powerpc/powerpc32/power7/strchrnul.S
+++ b/libc/sysdeps/powerpc/powerpc32/power7/strchrnul.S
@@ -27,8 +27,8 @@ ENTRY (__strchrnul)
clrrwi r8,r3,2 /* Align the address to word boundary. */
/* Replicate byte to word. */
- rlwimi r4,r4,8,16,23
- rlwimi r4,r4,16,0,15
+ insrdi r4,r4,8,48
+ insrdi r4,r4,16,32
rlwinm r6,r3,3,27,28 /* Calculate padding. */
lwz r12,0(r8) /* Load word from memory. */
@@ -43,10 +43,17 @@ ENTRY (__strchrnul)
/* Move the words left and right to discard the bits that are
not part of the string and bring them back as zeros. */
+#ifdef __LITTLE_ENDIAN__
+ srw r10,r10,r6
+ srw r9,r9,r6
+ slw r10,r10,r6
+ slw r9,r9,r6
+#else
slw r10,r10,r6
slw r9,r9,r6
srw r10,r10,r6
srw r9,r9,r6
+#endif
or r5,r9,r10 /* OR the results to speed things up. */
cmpwi cr7,r5,0 /* If r5 == 0, no c or null bytes
have been found. */
@@ -54,7 +61,7 @@ ENTRY (__strchrnul)
mtcrf 0x01,r8
- /* Are we now aligned to a quadword boundary? If so, skip to
+ /* Are we now aligned to a doubleword boundary? If so, skip to
the main loop. Otherwise, go through the alignment code. */
bt 29,L(loop)
@@ -76,7 +83,7 @@ L(loop):
single register for speed. This is an attempt
to speed up the null-checking process for bigger strings. */
lwz r12,4(r8)
- lwzu r11,8(r8)
+ lwzu r11,8(r8)
cmpb r10,r12,r0
cmpb r9,r12,r4
cmpb r6,r11,r0
@@ -95,9 +102,9 @@ L(loop):
addi r8,r8,-4
bne cr6,L(done)
- /* The c/null byte must be in the second word. Adjust the
- address again and move the result of cmpb to r10 so we can calculate
- the pointer. */
+ /* The c/null byte must be in the second word. Adjust the address
+ again and move the result of cmpb to r5 so we can calculate the
+ pointer. */
mr r5,r10
addi r8,r8,4
@@ -105,7 +112,13 @@ L(loop):
0xff in the same position as the c/null byte in the original
word from the string. Use that to calculate the pointer. */
L(done):
+#ifdef __LITTLE_ENDIAN__
+ addi r0,r5,-1
+ andc r0,r0,r5
+ popcntw r0,r0
+#else
cntlzw r0,r5 /* Count leading zeros before the match. */
+#endif
srwi r0,r0,3 /* Convert leading zeros to bytes. */
add r3,r8,r0 /* Return address of matching c/null byte. */
blr
diff --git a/libc/sysdeps/powerpc/powerpc32/power7/strlen.S b/libc/sysdeps/powerpc/powerpc32/power7/strlen.S
index b71a10f5c..b08d6c028 100644
--- a/libc/sysdeps/powerpc/powerpc32/power7/strlen.S
+++ b/libc/sysdeps/powerpc/powerpc32/power7/strlen.S
@@ -29,7 +29,11 @@ ENTRY (strlen)
li r0,0 /* Word with null chars to use with cmpb. */
li r5,-1 /* MASK = 0xffffffffffffffff. */
lwz r12,0(r4) /* Load word from memory. */
+#ifdef __LITTLE_ENDIAN__
+ slw r5,r5,r6
+#else
srw r5,r5,r6 /* MASK = MASK >> padding. */
+#endif
orc r9,r12,r5 /* Mask bits that are not part of the string. */
cmpb r10,r9,r0 /* Check for null bytes in WORD1. */
cmpwi cr7,r10,0 /* If r10 == 0, no null's have been found. */
@@ -47,9 +51,6 @@ ENTRY (strlen)
cmpb r10,r12,r0
cmpwi cr7,r10,0
bne cr7,L(done)
- b L(loop) /* We branch here (rather than falling through)
- to skip the nops due to heavy alignment
- of the loop below. */
/* Main loop to look for the end of the string. Since it's a
small loop (< 8 instructions), align it to 32-bytes. */
@@ -86,9 +87,15 @@ L(loop):
0xff in the same position as the null byte in the original
word from the string. Use that to calculate the length. */
L(done):
- cntlzw r0,r10 /* Count leading zeroes before the match. */
+#ifdef __LITTLE_ENDIAN__
+ addi r9, r10, -1 /* Form a mask from trailing zeros. */
+ andc r9, r9, r10
+ popcntw r0, r9 /* Count the bits in the mask. */
+#else
+ cntlzw r0,r10 /* Count leading zeros before the match. */
+#endif
subf r5,r3,r4
- srwi r0,r0,3 /* Convert leading zeroes to bytes. */
+ srwi r0,r0,3 /* Convert leading zeros to bytes. */
add r3,r5,r0 /* Compute final length. */
blr
END (strlen)
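The L(done) change here is the standard mask-to-index conversion: in a big-endian word the first byte in memory is the most significant, so counting leading zeros of the null mask gives its offset, while on little-endian it is the least significant byte, so trailing zeros are needed; since these cores lack a count-trailing-zeros instruction, the code forms (m - 1) & ~m and popcounts it.  A small sketch of the two equivalent forms, assuming GCC builtins and a non-zero mask:

#include <stdint.h>

/* Hypothetical helper: byte offset of the first zero byte, given a
   cmpb-style mask with 0xff in each zero-byte position (mask != 0).  */
static unsigned
first_zero_byte (uint32_t mask)
{
#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
  uint32_t t = (mask - 1) & ~mask;      /* ones below the lowest set bit */
  return __builtin_popcount (t) / 8;    /* == trailing zeros / 8 */
#else
  return __builtin_clz (mask) / 8;      /* leading zeros / 8 */
#endif
}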
diff --git a/libc/sysdeps/powerpc/powerpc32/power7/strncmp.S b/libc/sysdeps/powerpc/powerpc32/power7/strncmp.S
index fdae44d26..10c9d251b 100644
--- a/libc/sysdeps/powerpc/powerpc32/power7/strncmp.S
+++ b/libc/sysdeps/powerpc/powerpc32/power7/strncmp.S
@@ -26,7 +26,7 @@
EALIGN (strncmp,5,0)
-#define rTMP r0
+#define rTMP2 r0
#define rRTN r3
#define rSTR1 r3 /* first string arg */
#define rSTR2 r4 /* second string arg */
@@ -39,6 +39,7 @@ EALIGN (strncmp,5,0)
#define r7F7F r9 /* constant 0x7f7f7f7f */
#define rNEG r10 /* ~(word in s1 | 0x7f7f7f7f) */
#define rBITDIF r11 /* bits that differ in s1 & s2 words */
+#define rTMP r12
dcbt 0,rSTR1
nop
@@ -78,13 +79,45 @@ L(g1): add rTMP,rFEFE,rWORD1
/* OK. We've hit the end of the string. We need to be careful that
we don't compare two strings as different because of gunk beyond
the end of the strings... */
+#ifdef __LITTLE_ENDIAN__
+L(endstring):
+ slwi rTMP, rTMP, 1
+ addi rTMP2, rTMP, -1
+ andc rTMP2, rTMP2, rTMP
+ and rWORD2, rWORD2, rTMP2 /* Mask off gunk. */
+ and rWORD1, rWORD1, rTMP2
+ rlwinm rTMP2, rWORD2, 8, 0xffffffff /* Byte reverse word. */
+ rlwinm rTMP, rWORD1, 8, 0xffffffff
+ rldimi rTMP2, rWORD2, 24, 32
+ rldimi rTMP, rWORD1, 24, 32
+ rlwimi rTMP2, rWORD2, 24, 16, 23
+ rlwimi rTMP, rWORD1, 24, 16, 23
+ xor. rBITDIF, rTMP, rTMP2
+ sub rRTN, rTMP, rTMP2
+ bgelr
+ ori rRTN, rTMP2, 1
+ blr
+
+L(different):
+ lwz rWORD1, -4(rSTR1)
+ rlwinm rTMP2, rWORD2, 8, 0xffffffff /* Byte reverse word. */
+ rlwinm rTMP, rWORD1, 8, 0xffffffff
+ rldimi rTMP2, rWORD2, 24, 32
+ rldimi rTMP, rWORD1, 24, 32
+ rlwimi rTMP2, rWORD2, 24, 16, 23
+ rlwimi rTMP, rWORD1, 24, 16, 23
+ xor. rBITDIF, rTMP, rTMP2
+ sub rRTN, rTMP, rTMP2
+ bgelr
+ ori rRTN, rTMP2, 1
+ blr
+#else
L(endstring):
and rTMP,r7F7F,rWORD1
beq cr1,L(equal)
add rTMP,rTMP,r7F7F
xor. rBITDIF,rWORD1,rWORD2
-
andc rNEG,rNEG,rTMP
blt L(highbit)
cntlzw rBITDIF,rBITDIF
@@ -92,28 +125,20 @@ L(endstring):
addi rNEG,rNEG,7
cmpw cr1,rNEG,rBITDIF
sub rRTN,rWORD1,rWORD2
- blt cr1,L(equal)
- srawi rRTN,rRTN,31
- ori rRTN,rRTN,1
- blr
+ bgelr cr1
L(equal):
li rRTN,0
blr
L(different):
- lwzu rWORD1,-4(rSTR1)
+ lwz rWORD1,-4(rSTR1)
xor. rBITDIF,rWORD1,rWORD2
sub rRTN,rWORD1,rWORD2
- blt L(highbit)
- srawi rRTN,rRTN,31
- ori rRTN,rRTN,1
- blr
+ bgelr
L(highbit):
- srwi rWORD2,rWORD2,24
- srwi rWORD1,rWORD1,24
- sub rRTN,rWORD1,rWORD2
+ ori rRTN, rWORD2, 1
blr
-
+#endif
/* Oh well. In this case, we just do a byte-by-byte comparison. */
.align 4
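The new little-endian L(endstring) path keeps the compare word-sized even for the word containing the terminator: the zero-byte indicator is shifted up one bit and turned into a mask keeping only the bytes up to and including the NUL, then both words are byte-reversed so that memory order becomes numeric order before the subtract; L(different) does the same byte-reversal without the masking.  A C rendering of the end-of-string case, offered as a sketch only and assuming GCC builtins, where z is the usual (w1 - 0x01010101) & ~w1 & 0x80808080 indicator for w1:

#include <stdint.h>

/* Hypothetical helper for the little-endian end-of-string compare.  */
static int
end_of_string_cmp (uint32_t w1, uint32_t w2, uint32_t z)
{
  uint32_t t = z << 1;              /* bit just above the first NUL byte */
  uint32_t keep = (t - 1) & ~t;     /* bytes up to and including the NUL
                                       (all ones if the NUL is the last
                                       byte, since t wraps to zero) */
  uint32_t a = __builtin_bswap32 (w1 & keep);
  uint32_t b = __builtin_bswap32 (w2 & keep);
  return a == b ? 0 : (a > b ? 1 : -1);
}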
diff --git a/libc/sysdeps/powerpc/powerpc32/power7/strnlen.S b/libc/sysdeps/powerpc/powerpc32/power7/strnlen.S
index ed088366a..eb52afd1a 100644
--- a/libc/sysdeps/powerpc/powerpc32/power7/strnlen.S
+++ b/libc/sysdeps/powerpc/powerpc32/power7/strnlen.S
@@ -28,51 +28,47 @@ ENTRY (__strnlen)
add r7,r3,r4 /* Calculate the last acceptable address. */
cmplwi r4,16
li r0,0 /* Word with null chars. */
+ addi r7,r7,-1
ble L(small_range)
- cmplw cr7,r3,r7 /* Is the address equal or less than r3? If
- it's equal or less, it means size is either 0
- or a negative number. */
- ble cr7,L(proceed)
-
- li r7,-1 /* Make r11 the biggest if r4 <= 0. */
-L(proceed):
rlwinm r6,r3,3,27,28 /* Calculate padding. */
lwz r12,0(r8) /* Load word from memory. */
	cmpb	r10,r12,r0	/* Check for null bytes in WORD1.  */
+#ifdef __LITTLE_ENDIAN__
+ srw r10,r10,r6
+ slw r10,r10,r6
+#else
slw r10,r10,r6
srw r10,r10,r6
+#endif
cmplwi cr7,r10,0 /* If r10 == 0, no null's have been found. */
bne cr7,L(done)
- /* Are we done already? */
- addi r9,r8,4
- cmplw cr6,r9,r7
- bge cr6,L(end_max)
-
+ clrrwi r7,r7,2 /* Address of last word. */
mtcrf 0x01,r8
/* Are we now aligned to a doubleword boundary? If so, skip to
the main loop. Otherwise, go through the alignment code. */
bt 29,L(loop_setup)
- /* Handle DWORD2 of pair. */
+ /* Handle WORD2 of pair. */
lwzu r12,4(r8)
cmpb r10,r12,r0
cmplwi cr7,r10,0
bne cr7,L(done)
- /* Are we done already? */
- addi r9,r8,4
- cmplw cr6,r9,r7
- bge cr6,L(end_max)
-
L(loop_setup):
- sub r5,r7,r9
+ /* The last word we want to read in the loop below is the one
+ containing the last byte of the string, ie. the word at
+ (s + size - 1) & ~3, or r7. The first word read is at
+ r8 + 4, we read 2 * cnt words, so the last word read will
+ be at r8 + 4 + 8 * cnt - 4. Solving for cnt gives
+ cnt = (r7 - r8) / 8 */
+ sub r5,r7,r8
srwi r6,r5,3 /* Number of loop iterations. */
mtctr r6 /* Setup the counter. */
- b L(loop)
- /* Main loop to look for the null byte backwards in the string. Since
+
+ /* Main loop to look for the null byte in the string. Since
it's a small loop (< 8 instructions), align it to 32-bytes. */
.p2align 5
L(loop):
@@ -88,15 +84,18 @@ L(loop):
cmplwi cr7,r5,0
bne cr7,L(found)
bdnz L(loop)
- /* We're here because the counter reached 0, and that means we
- didn't have any matches for null in the whole range. Just return
- the original size. */
- addi r9,r8,4
- cmplw cr6,r9,r7
- blt cr6,L(loop_small)
+
+ /* We may have one more word to read. */
+ cmplw cr6,r8,r7
+ beq cr6,L(end_max)
+
+ lwzu r12,4(r8)
+ cmpb r10,r12,r0
+ cmplwi cr6,r10,0
+ bne cr6,L(done)
L(end_max):
- sub r3,r7,r3
+ mr r3,r4
blr
/* OK, one (or both) of the words contains a null byte. Check
@@ -121,49 +120,56 @@ L(found):
We need to make sure the null char is *before* the end of the
range. */
L(done):
- cntlzw r0,r10 /* Count leading zeroes before the match. */
- srwi r0,r0,3 /* Convert leading zeroes to bytes. */
- add r9,r8,r0
- sub r6,r9,r3 /* Length until the match. */
- cmplw r9,r7
- bgt L(end_max)
- mr r3,r6
- blr
-
- .align 4
-L(zero):
- li r3,0
+#ifdef __LITTLE_ENDIAN__
+ addi r0,r10,-1
+ andc r0,r0,r10
+ popcntw r0,r0
+#else
+ cntlzw r0,r10 /* Count leading zeros before the match. */
+#endif
+ sub r3,r8,r3
+ srwi r0,r0,3 /* Convert leading/trailing zeros to bytes. */
+ add r3,r3,r0 /* Length until the match. */
+ cmplw r3,r4
+ blelr
+ mr r3,r4
blr
-/* Deals with size <= 32. */
+/* Deals with size <= 16. */
.align 4
L(small_range):
cmplwi r4,0
- beq L(zero)
+ beq L(end_max)
+
+ clrrwi r7,r7,2 /* Address of last word. */
rlwinm r6,r3,3,27,28 /* Calculate padding. */
lwz r12,0(r8) /* Load word from memory. */
cmpb r10,r12,r0 /* Check for null bytes in WORD1. */
+#ifdef __LITTLE_ENDIAN__
+ srw r10,r10,r6
+ slw r10,r10,r6
+#else
slw r10,r10,r6
srw r10,r10,r6
+#endif
cmplwi cr7,r10,0
bne cr7,L(done)
- addi r9,r8,4
- cmplw r9,r7
- bge L(end_max)
- b L(loop_small)
+ cmplw r8,r7
+ beq L(end_max)
.p2align 5
L(loop_small):
lwzu r12,4(r8)
cmpb r10,r12,r0
- addi r9,r8,4
cmplwi cr6,r10,0
bne cr6,L(done)
- cmplw r9,r7
- bge L(end_max)
- b L(loop_small)
+ cmplw r8,r7
+ bne L(loop_small)
+ mr r3,r4
+ blr
+
END (__strnlen)
weak_alias (__strnlen, strnlen)
libc_hidden_builtin_def (strnlen)
diff --git a/libc/sysdeps/powerpc/powerpc32/setjmp-common.S b/libc/sysdeps/powerpc/powerpc32/setjmp-common.S
index 60b0026fa..3fb65b5f7 100644
--- a/libc/sysdeps/powerpc/powerpc32/setjmp-common.S
+++ b/libc/sysdeps/powerpc/powerpc32/setjmp-common.S
@@ -24,6 +24,11 @@
# include <jmpbuf-offsets.h>
#endif
+#if defined __SPE__ || (defined __NO_FPRS__ && !defined _SOFT_FLOAT)
+# define SAVE_GP(N) evstdd r##N,((JB_FPRS+((N)-14)*2)*4)(3)
+#else
+# define SAVE_GP(N) stw r##N,((JB_GPRS+(N)-14)*4)(3)
+#endif
ENTRY (__sigsetjmp)
@@ -35,31 +40,31 @@ ENTRY (__sigsetjmp)
stw r1,(JB_GPR1*4)(3)
#endif
mflr r0
- stw r14,((JB_GPRS+0)*4)(3)
+ SAVE_GP (14)
#ifdef PTR_MANGLE
PTR_MANGLE2 (r0, r10)
li r10,0
#endif
stw r0,(JB_LR*4)(3)
- stw r15,((JB_GPRS+1)*4)(3)
+ SAVE_GP (15)
mfcr r0
- stw r16,((JB_GPRS+2)*4)(3)
+ SAVE_GP (16)
stw r0,(JB_CR*4)(3)
- stw r17,((JB_GPRS+3)*4)(3)
- stw r18,((JB_GPRS+4)*4)(3)
- stw r19,((JB_GPRS+5)*4)(3)
- stw r20,((JB_GPRS+6)*4)(3)
- stw r21,((JB_GPRS+7)*4)(3)
- stw r22,((JB_GPRS+8)*4)(3)
- stw r23,((JB_GPRS+9)*4)(3)
- stw r24,((JB_GPRS+10)*4)(3)
- stw r25,((JB_GPRS+11)*4)(3)
- stw r26,((JB_GPRS+12)*4)(3)
- stw r27,((JB_GPRS+13)*4)(3)
- stw r28,((JB_GPRS+14)*4)(3)
- stw r29,((JB_GPRS+15)*4)(3)
- stw r30,((JB_GPRS+16)*4)(3)
- stw r31,((JB_GPRS+17)*4)(3)
+ SAVE_GP (17)
+ SAVE_GP (18)
+ SAVE_GP (19)
+ SAVE_GP (20)
+ SAVE_GP (21)
+ SAVE_GP (22)
+ SAVE_GP (23)
+ SAVE_GP (24)
+ SAVE_GP (25)
+ SAVE_GP (26)
+ SAVE_GP (27)
+ SAVE_GP (28)
+ SAVE_GP (29)
+ SAVE_GP (30)
+ SAVE_GP (31)
#if defined NOT_IN_libc && defined IS_IN_rtld
li r3,0
blr
diff --git a/libc/sysdeps/powerpc/powerpc32/setjmp.S b/libc/sysdeps/powerpc/powerpc32/setjmp.S
index 8a8cf0d6e..49b64ecf0 100644
--- a/libc/sysdeps/powerpc/powerpc32/setjmp.S
+++ b/libc/sysdeps/powerpc/powerpc32/setjmp.S
@@ -25,7 +25,7 @@
#else /* !NOT_IN_libc */
/* Build a versioned object for libc. */
-default_symbol_version (__vmx__sigsetjmp,__sigsetjmp,GLIBC_2.3.4)
+versioned_symbol (libc, __vmx__sigsetjmp, __sigsetjmp, GLIBC_2_3_4)
# define __sigsetjmp __vmx__sigsetjmp
# define __sigjmp_save __vmx__sigjmp_save
# include "setjmp-common.S"
@@ -35,7 +35,7 @@ default_symbol_version (__vmx__sigsetjmp,__sigsetjmp,GLIBC_2.3.4)
# undef __sigsetjmp
# undef __sigjmp_save
# undef JB_SIZE
-symbol_version (__novmx__sigsetjmp,__sigsetjmp,GLIBC_2.0)
+compat_symbol (libc, __novmx__sigsetjmp, __sigsetjmp, GLIBC_2_0)
# define __sigsetjmp __novmx__sigsetjmp
# define __sigjmp_save __novmx__sigjmp_save
# include "setjmp-common.S"
diff --git a/libc/sysdeps/powerpc/powerpc32/stackguard-macros.h b/libc/sysdeps/powerpc/powerpc32/stackguard-macros.h
index 839f6a4b9..b3d0af830 100644
--- a/libc/sysdeps/powerpc/powerpc32/stackguard-macros.h
+++ b/libc/sysdeps/powerpc/powerpc32/stackguard-macros.h
@@ -2,3 +2,13 @@
#define STACK_CHK_GUARD \
({ uintptr_t x; asm ("lwz %0,-28680(2)" : "=r" (x)); x; })
+
+#define POINTER_CHK_GUARD \
+ ({ \
+ uintptr_t x; \
+ asm ("lwz %0,%1(2)" \
+ : "=r" (x) \
+ : "i" (offsetof (tcbhead_t, pointer_guard) - TLS_TCB_OFFSET - sizeof (tcbhead_t)) \
+ ); \
+ x; \
+ })
diff --git a/libc/sysdeps/powerpc/powerpc32/stpcpy.S b/libc/sysdeps/powerpc/powerpc32/stpcpy.S
index 03c6dddc3..7e106e0e6 100644
--- a/libc/sysdeps/powerpc/powerpc32/stpcpy.S
+++ b/libc/sysdeps/powerpc/powerpc32/stpcpy.S
@@ -62,7 +62,22 @@ L(g2): add rTMP, rFEFE, rWORD
mr rALT, rWORD
/* We've hit the end of the string. Do the rest byte-by-byte. */
-L(g1): rlwinm. rTMP, rALT, 8, 24, 31
+L(g1):
+#ifdef __LITTLE_ENDIAN__
+ rlwinm. rTMP, rALT, 0, 24, 31
+ stbu rALT, 4(rDEST)
+ beqlr-
+ rlwinm. rTMP, rALT, 24, 24, 31
+ stbu rTMP, 1(rDEST)
+ beqlr-
+ rlwinm. rTMP, rALT, 16, 24, 31
+ stbu rTMP, 1(rDEST)
+ beqlr-
+ rlwinm rTMP, rALT, 8, 24, 31
+ stbu rTMP, 1(rDEST)
+ blr
+#else
+ rlwinm. rTMP, rALT, 8, 24, 31
stbu rTMP, 4(rDEST)
beqlr-
rlwinm. rTMP, rALT, 16, 24, 31
@@ -73,6 +88,7 @@ L(g1): rlwinm. rTMP, rALT, 8, 24, 31
beqlr-
stbu rALT, 1(rDEST)
blr
+#endif
/* Oh well. In this case, we just do a byte-by-byte copy. */
.align 4
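The added little-endian branch of L(g1) writes the final word out one byte at a time starting from the low-order byte, because on little-endian that is the byte with the lowest address; the pre-existing sequence keeps starting from the high-order byte for big-endian.  The same tail as a hedged C sketch with an invented helper name (dst points one byte before where the first byte goes, mirroring the stbu pre-increment; the assembly's first stbu uses a 4-byte displacement only because rDEST still trails by a word):

#include <stdint.h>

/* Hypothetical helper: store the bytes of W in ascending address order
   until the NUL, returning a pointer to the stored NUL as stpcpy does.
   W is assumed to contain a zero byte.  */
static char *
store_tail (char *dst, uint32_t w)
{
  for (int i = 0; i < 4; i++)
    {
#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
      unsigned char b = (w >> (8 * i)) & 0xff;          /* low byte first */
#else
      unsigned char b = (w >> (8 * (3 - i))) & 0xff;    /* high byte first */
#endif
      *++dst = b;
      if (b == 0)
        break;
    }
  return dst;
}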
diff --git a/libc/sysdeps/powerpc/powerpc32/strchr.S b/libc/sysdeps/powerpc/powerpc32/strchr.S
index c9952eecc..605056577 100644
--- a/libc/sysdeps/powerpc/powerpc32/strchr.S
+++ b/libc/sysdeps/powerpc/powerpc32/strchr.S
@@ -36,6 +36,8 @@ ENTRY (strchr)
#define rIGN r10 /* number of bits we should ignore in the first word */
#define rMASK r11 /* mask with the bits to ignore set to 0 */
#define rTMP3 r12
+#define rTMP4 rIGN
+#define rTMP5 rMASK
rlwimi rCHR, rCHR, 8, 16, 23
@@ -49,64 +51,93 @@ ENTRY (strchr)
addi r7F7F, r7F7F, 0x7f7f
/* Test the first (partial?) word. */
lwz rWORD, 0(rSTR)
+#ifdef __LITTLE_ENDIAN__
+ slw rMASK, rMASK, rIGN
+#else
srw rMASK, rMASK, rIGN
+#endif
orc rWORD, rWORD, rMASK
add rTMP1, rFEFE, rWORD
nor rTMP2, r7F7F, rWORD
- and. rTMP1, rTMP1, rTMP2
+ and. rTMP4, rTMP1, rTMP2
xor rTMP3, rCHR, rWORD
orc rTMP3, rTMP3, rMASK
b L(loopentry)
/* The loop. */
-L(loop):lwzu rWORD, 4(rSTR)
- and. rTMP1, rTMP1, rTMP2
+L(loop):
+ lwzu rWORD, 4(rSTR)
+ and. rTMP5, rTMP1, rTMP2
/* Test for 0. */
- add rTMP1, rFEFE, rWORD
- nor rTMP2, r7F7F, rWORD
+ add rTMP1, rFEFE, rWORD /* x - 0x01010101. */
+ nor rTMP2, r7F7F, rWORD /* ~(x | 0x7f7f7f7f) == ~x & 0x80808080. */
bne L(foundit)
- and. rTMP1, rTMP1, rTMP2
+ and. rTMP4, rTMP1, rTMP2 /* (x - 0x01010101) & ~x & 0x80808080. */
/* Start test for the bytes we're looking for. */
xor rTMP3, rCHR, rWORD
L(loopentry):
add rTMP1, rFEFE, rTMP3
nor rTMP2, r7F7F, rTMP3
beq L(loop)
+
/* There is a zero byte in the word, but may also be a matching byte (either
before or after the zero byte). In fact, we may be looking for a
- zero byte, in which case we return a match. We guess that this hasn't
- happened, though. */
-L(missed):
- and. rTMP1, rTMP1, rTMP2
+ zero byte, in which case we return a match. */
+ and. rTMP5, rTMP1, rTMP2
li rRTN, 0
beqlr
-/* It did happen. Decide which one was first...
- I'm not sure if this is actually faster than a sequence of
- rotates, compares, and branches (we use it anyway because it's shorter). */
+/* At this point:
+ rTMP5 bytes are 0x80 for each match of c, 0 otherwise.
+ rTMP4 bytes are 0x80 for each match of 0, 0 otherwise.
+   But a false match may also appear in the byte immediately more
+   significant than a true match, due to carries.  This means the
+   matches must be recalculated with a longer method for big-endian.  */
+#ifdef __LITTLE_ENDIAN__
+ addi rTMP1, rTMP5, -1
+ andc rTMP1, rTMP1, rTMP5
+ cntlzw rCLZB, rTMP1
+ addi rTMP2, rTMP4, -1
+ andc rTMP2, rTMP2, rTMP4
+ cmplw rTMP1, rTMP2
+ bgtlr
+ subfic rCLZB, rCLZB, 32-7
+#else
+/* I think we could reduce this by two instructions by keeping the "nor"
+ results from the loop for reuse here. See strlen.S tail. Similarly
+ one instruction could be pruned from L(foundit). */
and rFEFE, r7F7F, rWORD
- or rMASK, r7F7F, rWORD
+ or rTMP5, r7F7F, rWORD
and rTMP1, r7F7F, rTMP3
- or rIGN, r7F7F, rTMP3
+ or rTMP4, r7F7F, rTMP3
add rFEFE, rFEFE, r7F7F
add rTMP1, rTMP1, r7F7F
- nor rWORD, rMASK, rFEFE
- nor rTMP2, rIGN, rTMP1
+ nor rWORD, rTMP5, rFEFE
+ nor rTMP2, rTMP4, rTMP1
+ cntlzw rCLZB, rTMP2
cmplw rWORD, rTMP2
bgtlr
- cntlzw rCLZB, rTMP2
+#endif
srwi rCLZB, rCLZB, 3
add rRTN, rSTR, rCLZB
blr
L(foundit):
+#ifdef __LITTLE_ENDIAN__
+ addi rTMP1, rTMP5, -1
+ andc rTMP1, rTMP1, rTMP5
+ cntlzw rCLZB, rTMP1
+ subfic rCLZB, rCLZB, 32-7-32
+ srawi rCLZB, rCLZB, 3
+#else
and rTMP1, r7F7F, rTMP3
- or rIGN, r7F7F, rTMP3
+ or rTMP4, r7F7F, rTMP3
add rTMP1, rTMP1, r7F7F
- nor rTMP2, rIGN, rTMP1
+ nor rTMP2, rTMP4, rTMP1
cntlzw rCLZB, rTMP2
subi rSTR, rSTR, 4
srwi rCLZB, rCLZB, 3
+#endif
add rRTN, rSTR, rCLZB
blr
END (strchr)
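For reference, a C restatement (not the asm itself) of the word-at-a-time tests the strchr loop performs: rFEFE holds 0xfefefeff (= -0x01010101) and r7F7F holds 0x7f7f7f7f.  The carry-based test can set the 0x80 bit of the byte just above a real match (for example, when that byte is 0x01), which is why the big-endian path recomputes the matches exactly while the little-endian path only trusts the lowest set bit.

#include <stdint.h>
#include <stdio.h>
#include <string.h>

static uint32_t
zero_bytes_approx (uint32_t x)
{
  /* (x + 0xfefefeff) & ~(x | 0x7f7f7f7f): nonzero iff x has a 0x00 byte.  */
  return (x + 0xfefefeffu) & ~(x | 0x7f7f7f7fu);
}

static uint32_t
char_bytes_approx (uint32_t x, unsigned char c)
{
  uint32_t rep = c * 0x01010101u;   /* the replicated-character value built in rCHR */
  return zero_bytes_approx (x ^ rep);
}

int
main (void)
{
  uint32_t w;
  memcpy (&w, "ab\0d", 4);
  printf ("zero mask %#x, 'b' mask %#x, 'z' mask %#x\n",
	  zero_bytes_approx (w), char_bytes_approx (w, 'b'),
	  char_bytes_approx (w, 'z'));
  return 0;
}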
diff --git a/libc/sysdeps/powerpc/powerpc32/strcmp.S b/libc/sysdeps/powerpc/powerpc32/strcmp.S
index 297ca3c1b..91d60c905 100644
--- a/libc/sysdeps/powerpc/powerpc32/strcmp.S
+++ b/libc/sysdeps/powerpc/powerpc32/strcmp.S
@@ -24,7 +24,7 @@
EALIGN (strcmp, 4, 0)
-#define rTMP r0
+#define rTMP2 r0
#define rRTN r3
#define rSTR1 r3 /* first string arg */
#define rSTR2 r4 /* second string arg */
@@ -34,6 +34,7 @@ EALIGN (strcmp, 4, 0)
#define r7F7F r8 /* constant 0x7f7f7f7f */
#define rNEG r9 /* ~(word in s1 | 0x7f7f7f7f) */
#define rBITDIF r10 /* bits that differ in s1 & s2 words */
+#define rTMP r11
or rTMP, rSTR2, rSTR1
@@ -56,10 +57,45 @@ L(g1): add rTMP, rFEFE, rWORD1
and. rTMP, rTMP, rNEG
cmpw cr1, rWORD1, rWORD2
beq+ L(g0)
-L(endstring):
+
/* OK. We've hit the end of the string. We need to be careful that
we don't compare two strings as different because of gunk beyond
the end of the strings... */
+#ifdef __LITTLE_ENDIAN__
+L(endstring):
+ addi rTMP2, rTMP, -1
+ andc rTMP2, rTMP2, rTMP
+ rlwimi rTMP2, rTMP2, 1, 0, 30
+ and rWORD2, rWORD2, rTMP2 /* Mask off gunk. */
+ and rWORD1, rWORD1, rTMP2
+ rlwinm rTMP2, rWORD2, 8, 0xffffffff /* Byte reverse word. */
+ rlwinm rTMP, rWORD1, 8, 0xffffffff
+ rlwimi rTMP2, rWORD2, 24, 0, 7
+ rlwimi rTMP, rWORD1, 24, 0, 7
+ rlwimi rTMP2, rWORD2, 24, 16, 23
+ rlwimi rTMP, rWORD1, 24, 16, 23
+ xor. rBITDIF, rTMP, rTMP2
+ sub rRTN, rTMP, rTMP2
+ bgelr+
+ ori rRTN, rTMP2, 1
+ blr
+
+L(different):
+ lwz rWORD1, -4(rSTR1)
+ rlwinm rTMP2, rWORD2, 8, 0xffffffff /* Byte reverse word. */
+ rlwinm rTMP, rWORD1, 8, 0xffffffff
+ rlwimi rTMP2, rWORD2, 24, 0, 7
+ rlwimi rTMP, rWORD1, 24, 0, 7
+ rlwimi rTMP2, rWORD2, 24, 16, 23
+ rlwimi rTMP, rWORD1, 24, 16, 23
+ xor. rBITDIF, rTMP, rTMP2
+ sub rRTN, rTMP, rTMP2
+ bgelr+
+ ori rRTN, rTMP2, 1
+ blr
+
+#else
+L(endstring):
and rTMP, r7F7F, rWORD1
beq cr1, L(equal)
add rTMP, rTMP, r7F7F
@@ -84,7 +120,7 @@ L(different):
L(highbit):
ori rRTN, rWORD2, 1
blr
-
+#endif
/* Oh well. In this case, we just do a byte-by-byte comparison. */
.align 4
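A C sketch of the little-endian L(endstring) logic above (a restatement, not the asm): z is the carry-based zero-detect mask for the word from s1, both words are clipped to the bytes up to and including the NUL, and they are byte-reversed (the rlwinm/rlwimi sequence; __builtin_bswap32 here) so that an ordinary unsigned word comparison orders the strings byte 0 first.  The asm returns a word difference with a sign fix-up rather than the -1/0/1 used here, and the test in main assumes a little-endian host so the memcpy layout matches the hard-coded z.

#include <stdint.h>
#include <stdio.h>
#include <string.h>

static int
compare_last_words (uint32_t w1, uint32_t w2, uint32_t z)
{
  uint32_t keep = z ^ (z - 1);    /* bytes up to and including the NUL; the asm
				     builds this with addi/andc plus an rlwimi
				     that extends the mask by one bit */
  uint32_t a = __builtin_bswap32 (w1 & keep);
  uint32_t b = __builtin_bswap32 (w2 & keep);
  return (a > b) - (a < b);
}

int
main (void)
{
  uint32_t w1, w2, w3;
  memcpy (&w1, "ab\0x", 4);
  memcpy (&w2, "ab\0y", 4);       /* differs only past the NUL */
  memcpy (&w3, "ac\0x", 4);
  uint32_t z = 0x00800000u;       /* NUL in byte 2 of the little-endian word */
  printf ("%d %d\n", compare_last_words (w1, w2, z),
	  compare_last_words (w1, w3, z));   /* prints 0, then a negative value */
  return 0;
}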
diff --git a/libc/sysdeps/powerpc/powerpc32/strcpy.S b/libc/sysdeps/powerpc/powerpc32/strcpy.S
index 4ae577dbb..e938cc42a 100644
--- a/libc/sysdeps/powerpc/powerpc32/strcpy.S
+++ b/libc/sysdeps/powerpc/powerpc32/strcpy.S
@@ -62,7 +62,22 @@ L(g2): add rTMP, rFEFE, rWORD
mr rALT, rWORD
/* We've hit the end of the string. Do the rest byte-by-byte. */
-L(g1): rlwinm. rTMP, rALT, 8, 24, 31
+L(g1):
+#ifdef __LITTLE_ENDIAN__
+ rlwinm. rTMP, rALT, 0, 24, 31
+ stb rALT, 4(rDEST)
+ beqlr-
+ rlwinm. rTMP, rALT, 24, 24, 31
+ stb rTMP, 5(rDEST)
+ beqlr-
+ rlwinm. rTMP, rALT, 16, 24, 31
+ stb rTMP, 6(rDEST)
+ beqlr-
+ rlwinm rTMP, rALT, 8, 24, 31
+ stb rTMP, 7(rDEST)
+ blr
+#else
+ rlwinm. rTMP, rALT, 8, 24, 31
stb rTMP, 4(rDEST)
beqlr-
rlwinm. rTMP, rALT, 16, 24, 31
@@ -73,6 +88,7 @@ L(g1): rlwinm. rTMP, rALT, 8, 24, 31
beqlr-
stb rALT, 7(rDEST)
blr
+#endif
/* Oh well. In this case, we just do a byte-by-byte copy. */
.align 4
diff --git a/libc/sysdeps/powerpc/powerpc32/strlen.S b/libc/sysdeps/powerpc/powerpc32/strlen.S
index 9a6eafc38..a7153ed7a 100644
--- a/libc/sysdeps/powerpc/powerpc32/strlen.S
+++ b/libc/sysdeps/powerpc/powerpc32/strlen.S
@@ -29,7 +29,12 @@
1 is subtracted you get a value in the range 0x00-0x7f, none of which
have their high bit set. The expression here is
(x + 0xfefefeff) & ~(x | 0x7f7f7f7f), which gives 0x00000000 when
- there were no 0x00 bytes in the word.
+   there were no 0x00 bytes in the word.  You get 0x80 in bytes that
+   match, but possibly a false 0x80 in the byte immediately more
+   significant than a true match, due to carries.  For little-endian
+   this is of no consequence, since the least significant match is
+   the one we're interested in, but big-endian needs method 2 to
+   find which byte matches.
2) Given a word 'x', we can test to see _which_ byte was zero by
calculating ~(((x & 0x7f7f7f7f) + 0x7f7f7f7f) | x | 0x7f7f7f7f).
@@ -72,7 +77,7 @@
ENTRY (strlen)
-#define rTMP1 r0
+#define rTMP4 r0
#define rRTN r3 /* incoming STR arg, outgoing result */
#define rSTR r4 /* current string position */
#define rPADN r5 /* number of padding bits we prepend to the
@@ -82,9 +87,9 @@ ENTRY (strlen)
#define rWORD1 r8 /* current string word */
#define rWORD2 r9 /* next string word */
#define rMASK r9 /* mask for first string word */
-#define rTMP2 r10
-#define rTMP3 r11
-#define rTMP4 r12
+#define rTMP1 r10
+#define rTMP2 r11
+#define rTMP3 r12
clrrwi rSTR, rRTN, 2
@@ -93,15 +98,20 @@ ENTRY (strlen)
lwz rWORD1, 0(rSTR)
li rMASK, -1
addi r7F7F, r7F7F, 0x7f7f
-/* That's the setup done, now do the first pair of words.
- We make an exception and use method (2) on the first two words, to reduce
- overhead. */
+/* We use method (2) on the first two words because rFEFE isn't
+   required, which reduces setup overhead.  It also gives a faster
+   return for small strings on big-endian, which would otherwise
+   need to recalculate with method (2) anyway.  */
+#ifdef __LITTLE_ENDIAN__
+ slw rMASK, rMASK, rPADN
+#else
srw rMASK, rMASK, rPADN
+#endif
and rTMP1, r7F7F, rWORD1
or rTMP2, r7F7F, rWORD1
add rTMP1, rTMP1, r7F7F
- nor rTMP1, rTMP2, rTMP1
- and. rWORD1, rTMP1, rMASK
+ nor rTMP3, rTMP2, rTMP1
+ and. rTMP3, rTMP3, rMASK
mtcrf 0x01, rRTN
bne L(done0)
lis rFEFE, -0x101
@@ -110,11 +120,12 @@ ENTRY (strlen)
bt 29, L(loop)
/* Handle second word of pair. */
+/* Perhaps use method (1) here for little-endian, saving one instruction? */
lwzu rWORD1, 4(rSTR)
and rTMP1, r7F7F, rWORD1
or rTMP2, r7F7F, rWORD1
add rTMP1, rTMP1, r7F7F
- nor. rWORD1, rTMP2, rTMP1
+ nor. rTMP3, rTMP2, rTMP1
bne L(done0)
/* The loop. */
@@ -128,28 +139,52 @@ L(loop):
add rTMP3, rFEFE, rWORD2
nor rTMP4, r7F7F, rWORD2
bne L(done1)
- and. rTMP1, rTMP3, rTMP4
+ and. rTMP3, rTMP3, rTMP4
beq L(loop)
+#ifndef __LITTLE_ENDIAN__
and rTMP1, r7F7F, rWORD2
add rTMP1, rTMP1, r7F7F
- andc rWORD1, rTMP4, rTMP1
+ andc rTMP3, rTMP4, rTMP1
b L(done0)
L(done1):
and rTMP1, r7F7F, rWORD1
subi rSTR, rSTR, 4
add rTMP1, rTMP1, r7F7F
- andc rWORD1, rTMP2, rTMP1
+ andc rTMP3, rTMP2, rTMP1
/* When we get to here, rSTR points to the first word in the string that
- contains a zero byte, and the most significant set bit in rWORD1 is in that
- byte. */
+ contains a zero byte, and rTMP3 has 0x80 for bytes that are zero,
+ and 0x00 otherwise. */
L(done0):
- cntlzw rTMP3, rWORD1
+ cntlzw rTMP3, rTMP3
subf rTMP1, rRTN, rSTR
srwi rTMP3, rTMP3, 3
add rRTN, rTMP1, rTMP3
blr
+#else
+
+L(done0):
+ addi rTMP1, rTMP3, -1 /* Form a mask from trailing zeros. */
+ andc rTMP1, rTMP1, rTMP3
+ cntlzw rTMP1, rTMP1 /* Count bits not in the mask. */
+ subf rTMP3, rRTN, rSTR
+ subfic rTMP1, rTMP1, 32-7
+ srwi rTMP1, rTMP1, 3
+ add rRTN, rTMP1, rTMP3
+ blr
+
+L(done1):
+ addi rTMP3, rTMP1, -1
+ andc rTMP3, rTMP3, rTMP1
+ cntlzw rTMP3, rTMP3
+ subf rTMP1, rRTN, rSTR
+ subfic rTMP3, rTMP3, 32-7-32
+ srawi rTMP3, rTMP3, 3
+ add rRTN, rTMP1, rTMP3
+ blr
+#endif
+
END (strlen)
libc_hidden_builtin_def (strlen)
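A C restatement of the two pieces described in the strlen comments above (not the asm).  Method (2) pinpoints zero bytes exactly, with no carry between bytes, which is why the big-endian paths can feed its result straight to cntlzw.  The little-endian L(done0) tail instead needs the index of the *lowest* set 0x80 bit; PowerPC has no count-trailing-zeros instruction, so it isolates the bits below that 0x80 bit and applies cntlzw to them (the addi/andc/cntlzw/subfic/srwi sequence).

#include <stdint.h>
#include <stdio.h>

static uint32_t
zero_bytes_exact (uint32_t x)
{
  /* ~(((x & 0x7f7f7f7f) + 0x7f7f7f7f) | x | 0x7f7f7f7f):
     0x80 exactly in each zero byte, 0x00 elsewhere.  */
  return ~(((x & 0x7f7f7f7fu) + 0x7f7f7f7fu) | x | 0x7f7f7f7fu);
}

static unsigned int
first_zero_byte_index_le (uint32_t z)
{
  uint32_t below = (z - 1) & ~z;        /* addi + andc: bits below the lowest 0x80 */
  unsigned int clz = below ? __builtin_clz (below) : 32;   /* cntlzw */
  return ((32 - 7) - clz) >> 3;         /* subfic 32-7, then srwi 3 */
}

int
main (void)
{
  /* 'a','b',NUL,'d' viewed as a little-endian word: the NUL is byte 2.  */
  uint32_t z = zero_bytes_exact (0x64006261u);
  printf ("mask %#x, first zero byte %u\n", z, first_zero_byte_index_le (z));
  return 0;
}

Working through the index formula: if the first zero byte is byte i (from the low end), its 0x80 flag sits at bit 8*i+7, the bits below it form 2^(8*i+7)-1, whose cntlzw is 32-(8*i+7); subtracting that from 32-7 gives 8*i, and the shift by 3 yields i.  L(done1) uses 32-7-32 and an arithmetic shift for the same reason, because rSTR there points one word past the word being examined.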
diff --git a/libc/sysdeps/powerpc/powerpc32/strncmp.S b/libc/sysdeps/powerpc/powerpc32/strncmp.S
index fa345d293..e36a160a8 100644
--- a/libc/sysdeps/powerpc/powerpc32/strncmp.S
+++ b/libc/sysdeps/powerpc/powerpc32/strncmp.S
@@ -24,7 +24,7 @@
EALIGN (strncmp, 4, 0)
-#define rTMP r0
+#define rTMP2 r0
#define rRTN r3
#define rSTR1 r3 /* first string arg */
#define rSTR2 r4 /* second string arg */
@@ -35,6 +35,7 @@ EALIGN (strncmp, 4, 0)
#define r7F7F r9 /* constant 0x7f7f7f7f */
#define rNEG r10 /* ~(word in s1 | 0x7f7f7f7f) */
#define rBITDIF r11 /* bits that differ in s1 & s2 words */
+#define rTMP r12
dcbt 0,rSTR1
or rTMP, rSTR2, rSTR1
@@ -73,12 +74,45 @@ L(g1): add rTMP, rFEFE, rWORD1
we don't compare two strings as different because of gunk beyond
the end of the strings... */
+#ifdef __LITTLE_ENDIAN__
+L(endstring):
+ slwi rTMP, rTMP, 1
+ addi rTMP2, rTMP, -1
+ andc rTMP2, rTMP2, rTMP
+ and rWORD2, rWORD2, rTMP2 /* Mask off gunk. */
+ and rWORD1, rWORD1, rTMP2
+ rlwinm rTMP2, rWORD2, 8, 0xffffffff /* Byte reverse word. */
+ rlwinm rTMP, rWORD1, 8, 0xffffffff
+ rlwimi rTMP2, rWORD2, 24, 0, 7
+ rlwimi rTMP, rWORD1, 24, 0, 7
+ rlwimi rTMP2, rWORD2, 24, 16, 23
+ rlwimi rTMP, rWORD1, 24, 16, 23
+ xor. rBITDIF, rTMP, rTMP2
+ sub rRTN, rTMP, rTMP2
+ bgelr+
+ ori rRTN, rTMP2, 1
+ blr
+
+L(different):
+ lwz rWORD1, -4(rSTR1)
+ rlwinm rTMP2, rWORD2, 8, 0xffffffff /* Byte reverse word. */
+ rlwinm rTMP, rWORD1, 8, 0xffffffff
+ rlwimi rTMP2, rWORD2, 24, 0, 7
+ rlwimi rTMP, rWORD1, 24, 0, 7
+ rlwimi rTMP2, rWORD2, 24, 16, 23
+ rlwimi rTMP, rWORD1, 24, 16, 23
+ xor. rBITDIF, rTMP, rTMP2
+ sub rRTN, rTMP, rTMP2
+ bgelr+
+ ori rRTN, rTMP2, 1
+ blr
+
+#else
L(endstring):
and rTMP, r7F7F, rWORD1
beq cr1, L(equal)
add rTMP, rTMP, r7F7F
xor. rBITDIF, rWORD1, rWORD2
-
andc rNEG, rNEG, rTMP
blt- L(highbit)
cntlzw rBITDIF, rBITDIF
@@ -86,28 +120,20 @@ L(endstring):
addi rNEG, rNEG, 7
cmpw cr1, rNEG, rBITDIF
sub rRTN, rWORD1, rWORD2
- blt- cr1, L(equal)
- srawi rRTN, rRTN, 31
- ori rRTN, rRTN, 1
- blr
+ bgelr+ cr1
L(equal):
li rRTN, 0
blr
L(different):
- lwzu rWORD1, -4(rSTR1)
+ lwz rWORD1, -4(rSTR1)
xor. rBITDIF, rWORD1, rWORD2
sub rRTN, rWORD1, rWORD2
- blt- L(highbit)
- srawi rRTN, rRTN, 31
- ori rRTN, rRTN, 1
- blr
+ bgelr+
L(highbit):
- srwi rWORD2, rWORD2, 24
- srwi rWORD1, rWORD1, 24
- sub rRTN, rWORD1, rWORD2
+ ori rRTN, rWORD2, 1
blr
-
+#endif
/* Oh well. In this case, we just do a byte-by-byte comparison. */
.align 4
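A small C check of the "Mask off gunk" step in the little-endian L(endstring) above (illustration only): z is the carry-based zero-detect mask, of which only the lowest set 0x80 bit matters, and the slwi/addi/andc sequence builds a mask that keeps every byte up to and including the first NUL while clearing whatever follows it in the word, so trailing garbage cannot affect the comparison.

#include <assert.h>
#include <stdint.h>

static uint32_t
gunk_mask (uint32_t z)
{
  uint32_t t = z << 1;    /* slwi rTMP, rTMP, 1 */
  return (t - 1) & ~t;    /* addi + andc: bits below the lowest set bit of t */
}

int
main (void)
{
  /* NUL in byte 1: keep bytes 0 and 1, drop bytes 2 and 3.  */
  assert (gunk_mask (0x00008000u) == 0x0000ffffu);
  /* NUL in byte 3 (the last byte of the word): keep everything.  */
  assert (gunk_mask (0x80000000u) == 0xffffffffu);
  return 0;
}

This is the same mask the strcmp.S change builds with addi/andc plus an rlwimi extension; shifting z left by one first simply moves the lowest set bit so that the plain below-the-bit mask already covers the NUL byte.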