author	Noah Goldstein <goldstein.w.n@gmail.com>	2022-07-12 12:29:03 -0700
committer	Noah Goldstein <goldstein.w.n@gmail.com>	2022-07-13 14:55:31 -0700
commit	08af081ffd3baa371435da0c6906453e9c8be5f5 (patch)
tree	870a327fc9a7eff3be5771ae781e078edc31bfcb
parent	6b9006bfb03c5975f31de286311041d3c933f5ac (diff)
x86: Move memrchr SSE2 implementation to multiarch/memrchr-sse2.S
This commit doesn't affect libc.so.6; it's just housekeeping to prepare for adding explicit ISA level support. Tested build on x86_64 and x86_32 with/without multiarch.
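Concretely, after this change the generic sysdeps/x86_64/memrchr.S becomes a thin wrapper: it sets MEMRCHR to the unsuffixed __memrchr name and includes the multiarch file, which falls back to defining MEMRCHR as __memrchr_sse2 only when built inside libc and no name was supplied. The preprocessor skeleton, pulled out of the diff below for readability:

/* sysdeps/x86_64/memrchr.S (new tail) */
#define MEMRCHR __memrchr
#include "multiarch/memrchr-sse2.S"
weak_alias (__memrchr, memrchr)

/* sysdeps/x86_64/multiarch/memrchr-sse2.S (new head) */
#if IS_IN (libc)
# ifndef MEMRCHR
#  define MEMRCHR __memrchr_sse2
# endif
#endif

The assembly body itself is unchanged; only the entry-point name is parameterized through MEMRCHR.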
-rw-r--r--	sysdeps/x86_64/memrchr.S	332
-rw-r--r--	sysdeps/x86_64/multiarch/memrchr-sse2.S	336
2 files changed, 334 insertions(+), 334 deletions(-)
diff --git a/sysdeps/x86_64/memrchr.S b/sysdeps/x86_64/memrchr.S
index b0dffd2ae2..385e2c5668 100644
--- a/sysdeps/x86_64/memrchr.S
+++ b/sysdeps/x86_64/memrchr.S
@@ -17,334 +17,6 @@
License along with the GNU C Library; if not, see
<https://www.gnu.org/licenses/>. */
-#include <sysdep.h>
-#define VEC_SIZE 16
-#define PAGE_SIZE 4096
-
- .text
-ENTRY_P2ALIGN(__memrchr, 6)
-#ifdef __ILP32__
- /* Clear upper bits. */
- mov %RDX_LP, %RDX_LP
-#endif
- movd %esi, %xmm0
-
- /* Get end pointer. */
- leaq (%rdx, %rdi), %rcx
-
- punpcklbw %xmm0, %xmm0
- punpcklwd %xmm0, %xmm0
- pshufd $0, %xmm0, %xmm0
-
- /* Check if we can load 1x VEC without crossing a page. */
- testl $(PAGE_SIZE - VEC_SIZE), %ecx
- jz L(page_cross)
-
- /* NB: This load happens regardless of whether rdx (len) is zero. Since
- it doesn't cross a page and the standard guarantees any pointer has
- at least one valid byte this load must be safe. For the entire
- history of the x86 memrchr implementation this has been possible so
- no code "should" be relying on a zero-length check before this load.
- The zero-length check is moved to the page cross case because it is
- 1) pretty cold and 2) including it pushes the hot case (len <= VEC_SIZE)
- into 2 cache lines. */
- movups -(VEC_SIZE)(%rcx), %xmm1
- pcmpeqb %xmm0, %xmm1
- pmovmskb %xmm1, %eax
-
- subq $VEC_SIZE, %rdx
- ja L(more_1x_vec)
-L(ret_vec_x0_test):
- /* Zero-flag set if eax (src) is zero. Destination unchanged if src is
- zero. */
- bsrl %eax, %eax
- jz L(ret_0)
- /* Check if the CHAR match is in bounds. Need to truly zero `eax` here
- if out of bounds. */
- addl %edx, %eax
- jl L(zero_0)
- /* Since we subtracted VEC_SIZE from rdx earlier we can just add to base
- ptr. */
- addq %rdi, %rax
-L(ret_0):
- ret
-
- .p2align 4,, 5
-L(ret_vec_x0):
- bsrl %eax, %eax
- leaq -(VEC_SIZE)(%rcx, %rax), %rax
- ret
-
- .p2align 4,, 2
-L(zero_0):
- xorl %eax, %eax
- ret
-
-
- .p2align 4,, 8
-L(more_1x_vec):
- testl %eax, %eax
- jnz L(ret_vec_x0)
-
- /* Align rcx (pointer to string). */
- decq %rcx
- andq $-VEC_SIZE, %rcx
-
- movq %rcx, %rdx
- /* NB: We could consistently save 1-byte in this pattern with `movaps
- %xmm0, %xmm1; pcmpeq IMM8(r), %xmm1; ...`. The reason against it is
- it adds more frontend uops (even if the moves can be eliminated) and
- some percentage of the time actual backend uops. */
- movaps -(VEC_SIZE)(%rcx), %xmm1
- pcmpeqb %xmm0, %xmm1
- subq %rdi, %rdx
- pmovmskb %xmm1, %eax
-
- cmpq $(VEC_SIZE * 2), %rdx
- ja L(more_2x_vec)
-L(last_2x_vec):
- subl $VEC_SIZE, %edx
- jbe L(ret_vec_x0_test)
-
- testl %eax, %eax
- jnz L(ret_vec_x0)
-
- movaps -(VEC_SIZE * 2)(%rcx), %xmm1
- pcmpeqb %xmm0, %xmm1
- pmovmskb %xmm1, %eax
-
- subl $VEC_SIZE, %edx
- bsrl %eax, %eax
- jz L(ret_1)
- addl %edx, %eax
- jl L(zero_0)
- addq %rdi, %rax
-L(ret_1):
- ret
-
- /* Don't align. Otherwise losing the 2-byte encoding in the jump to
- L(page_cross) causes the hot path (length <= VEC_SIZE) to span
- multiple cache lines. Naturally aligned % 16 to 8-bytes. */
-L(page_cross):
- /* Zero length check. */
- testq %rdx, %rdx
- jz L(zero_0)
-
- leaq -1(%rcx), %r8
- andq $-(VEC_SIZE), %r8
-
- movaps (%r8), %xmm1
- pcmpeqb %xmm0, %xmm1
- pmovmskb %xmm1, %esi
- /* Shift out negative alignment (because we are starting from endptr and
- working backwards). */
- negl %ecx
- /* 32-bit shift but VEC_SIZE=16 so need to mask the shift count
- explicitly. */
- andl $(VEC_SIZE - 1), %ecx
- shl %cl, %esi
- movzwl %si, %eax
- leaq (%rdi, %rdx), %rcx
- cmpq %rdi, %r8
- ja L(more_1x_vec)
- subl $VEC_SIZE, %edx
- bsrl %eax, %eax
- jz L(ret_2)
- addl %edx, %eax
- jl L(zero_1)
- addq %rdi, %rax
-L(ret_2):
- ret
-
- /* Fits in aligning bytes. */
-L(zero_1):
- xorl %eax, %eax
- ret
-
- .p2align 4,, 5
-L(ret_vec_x1):
- bsrl %eax, %eax
- leaq -(VEC_SIZE * 2)(%rcx, %rax), %rax
- ret
-
- .p2align 4,, 8
-L(more_2x_vec):
- testl %eax, %eax
- jnz L(ret_vec_x0)
-
- movaps -(VEC_SIZE * 2)(%rcx), %xmm1
- pcmpeqb %xmm0, %xmm1
- pmovmskb %xmm1, %eax
- testl %eax, %eax
- jnz L(ret_vec_x1)
-
-
- movaps -(VEC_SIZE * 3)(%rcx), %xmm1
- pcmpeqb %xmm0, %xmm1
- pmovmskb %xmm1, %eax
-
- subq $(VEC_SIZE * 4), %rdx
- ja L(more_4x_vec)
-
- addl $(VEC_SIZE), %edx
- jle L(ret_vec_x2_test)
-
-L(last_vec):
- testl %eax, %eax
- jnz L(ret_vec_x2)
-
- movaps -(VEC_SIZE * 4)(%rcx), %xmm1
- pcmpeqb %xmm0, %xmm1
- pmovmskb %xmm1, %eax
-
- subl $(VEC_SIZE), %edx
- bsrl %eax, %eax
- jz L(ret_3)
- addl %edx, %eax
- jl L(zero_2)
- addq %rdi, %rax
-L(ret_3):
- ret
-
- .p2align 4,, 6
-L(ret_vec_x2_test):
- bsrl %eax, %eax
- jz L(zero_2)
- addl %edx, %eax
- jl L(zero_2)
- addq %rdi, %rax
- ret
-
-L(zero_2):
- xorl %eax, %eax
- ret
-
-
- .p2align 4,, 5
-L(ret_vec_x2):
- bsrl %eax, %eax
- leaq -(VEC_SIZE * 3)(%rcx, %rax), %rax
- ret
-
- .p2align 4,, 5
-L(ret_vec_x3):
- bsrl %eax, %eax
- leaq -(VEC_SIZE * 4)(%rcx, %rax), %rax
- ret
-
- .p2align 4,, 8
-L(more_4x_vec):
- testl %eax, %eax
- jnz L(ret_vec_x2)
-
- movaps -(VEC_SIZE * 4)(%rcx), %xmm1
- pcmpeqb %xmm0, %xmm1
- pmovmskb %xmm1, %eax
-
- testl %eax, %eax
- jnz L(ret_vec_x3)
-
- addq $-(VEC_SIZE * 4), %rcx
- cmpq $(VEC_SIZE * 4), %rdx
- jbe L(last_4x_vec)
-
- /* Offset everything by 4x VEC_SIZE here to save a few bytes at the end
- keeping the code from spilling to the next cache line. */
- addq $(VEC_SIZE * 4 - 1), %rcx
- andq $-(VEC_SIZE * 4), %rcx
- leaq (VEC_SIZE * 4)(%rdi), %rdx
- andq $-(VEC_SIZE * 4), %rdx
-
- .p2align 4,, 11
-L(loop_4x_vec):
- movaps (VEC_SIZE * -1)(%rcx), %xmm1
- movaps (VEC_SIZE * -2)(%rcx), %xmm2
- movaps (VEC_SIZE * -3)(%rcx), %xmm3
- movaps (VEC_SIZE * -4)(%rcx), %xmm4
- pcmpeqb %xmm0, %xmm1
- pcmpeqb %xmm0, %xmm2
- pcmpeqb %xmm0, %xmm3
- pcmpeqb %xmm0, %xmm4
-
- por %xmm1, %xmm2
- por %xmm3, %xmm4
- por %xmm2, %xmm4
-
- pmovmskb %xmm4, %esi
- testl %esi, %esi
- jnz L(loop_end)
-
- addq $-(VEC_SIZE * 4), %rcx
- cmpq %rdx, %rcx
- jne L(loop_4x_vec)
-
- subl %edi, %edx
-
- /* Ends up being 1-byte nop. */
- .p2align 4,, 2
-L(last_4x_vec):
- movaps -(VEC_SIZE)(%rcx), %xmm1
- pcmpeqb %xmm0, %xmm1
- pmovmskb %xmm1, %eax
-
- cmpl $(VEC_SIZE * 2), %edx
- jbe L(last_2x_vec)
-
- testl %eax, %eax
- jnz L(ret_vec_x0)
-
-
- movaps -(VEC_SIZE * 2)(%rcx), %xmm1
- pcmpeqb %xmm0, %xmm1
- pmovmskb %xmm1, %eax
-
- testl %eax, %eax
- jnz L(ret_vec_end)
-
- movaps -(VEC_SIZE * 3)(%rcx), %xmm1
- pcmpeqb %xmm0, %xmm1
- pmovmskb %xmm1, %eax
-
- subl $(VEC_SIZE * 3), %edx
- ja L(last_vec)
- bsrl %eax, %eax
- jz L(ret_4)
- addl %edx, %eax
- jl L(zero_3)
- addq %rdi, %rax
-L(ret_4):
- ret
-
- /* Ends up being 1-byte nop. */
- .p2align 4,, 3
-L(loop_end):
- pmovmskb %xmm1, %eax
- sall $16, %eax
- jnz L(ret_vec_end)
-
- pmovmskb %xmm2, %eax
- testl %eax, %eax
- jnz L(ret_vec_end)
-
- pmovmskb %xmm3, %eax
- /* Combine last 2 VEC matches. If eax (VEC3) is zero (no CHAR in VEC3)
- then it won't affect the result in esi (VEC4). If eax is non-zero
- then CHAR is in VEC3 and bsrl will use that position. */
- sall $16, %eax
- orl %esi, %eax
- bsrl %eax, %eax
- leaq -(VEC_SIZE * 4)(%rcx, %rax), %rax
- ret
-
-L(ret_vec_end):
- bsrl %eax, %eax
- leaq (VEC_SIZE * -2)(%rax, %rcx), %rax
- ret
- /* Used in L(last_4x_vec), in the same cache line. This is just spare
- aligning bytes. */
-L(zero_3):
- xorl %eax, %eax
- ret
- /* 2-bytes from next cache line. */
-END(__memrchr)
+#define MEMRCHR __memrchr
+#include "multiarch/memrchr-sse2.S"
weak_alias (__memrchr, memrchr)
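Before the second half of the move, it may help to see the strategy the assembly implements in plainer terms. Below is a minimal C sketch, assuming SSE2 intrinsics and ignoring the page-cross entry, the zero-length shortcut, and the 4x-vector main loop that the real code uses; the function name memrchr_sse2_sketch is made up for illustration and is not part of this commit.

/* Simplified illustration of the SSE2 memrchr strategy: broadcast the
   target byte, scan backwards 16 bytes at a time from the end, and use
   compare + movemask to locate the highest matching position.  */
#include <emmintrin.h>
#include <stddef.h>

void *
memrchr_sse2_sketch (const void *s, int c, size_t n)
{
  const unsigned char *base = s;
  const __m128i vc = _mm_set1_epi8 ((char) c);

  /* Whole 16-byte chunks, starting from the end of the buffer.  */
  while (n >= 16)
    {
      __m128i chunk = _mm_loadu_si128 ((const __m128i *) (base + n - 16));
      unsigned int mask
	= (unsigned int) _mm_movemask_epi8 (_mm_cmpeq_epi8 (chunk, vc));
      if (mask != 0)
	{
	  /* Highest set bit in the mask == last matching byte here.  */
	  int idx = 31 - __builtin_clz (mask);
	  return (void *) (base + n - 16 + idx);
	}
      n -= 16;
    }

  /* Scalar tail for the remaining < 16 bytes.  */
  while (n--)
    if (base[n] == (unsigned char) c)
      return (void *) (base + n);
  return NULL;
}

The real implementation avoids the scalar tail entirely by using the bound check after bsrl (the addl %edx, %eax / jl L(zero_*) sequences visible in the diff) instead of byte-at-a-time fallback.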
diff --git a/sysdeps/x86_64/multiarch/memrchr-sse2.S b/sysdeps/x86_64/multiarch/memrchr-sse2.S
index b04202e171..d92a4022dc 100644
--- a/sysdeps/x86_64/multiarch/memrchr-sse2.S
+++ b/sysdeps/x86_64/multiarch/memrchr-sse2.S
@@ -17,10 +17,338 @@
<https://www.gnu.org/licenses/>. */
#if IS_IN (libc)
-# define __memrchr __memrchr_sse2
+# ifndef MEMRCHR
+# define MEMRCHR __memrchr_sse2
+# endif
+#endif
+
+#include <sysdep.h>
+#define VEC_SIZE 16
+#define PAGE_SIZE 4096
-# undef weak_alias
-# define weak_alias(__memrchr, memrchr)
+ .text
+ENTRY_P2ALIGN(MEMRCHR, 6)
+#ifdef __ILP32__
+ /* Clear upper bits. */
+ mov %RDX_LP, %RDX_LP
#endif
+ movd %esi, %xmm0
+
+ /* Get end pointer. */
+ leaq (%rdx, %rdi), %rcx
+
+ punpcklbw %xmm0, %xmm0
+ punpcklwd %xmm0, %xmm0
+ pshufd $0, %xmm0, %xmm0
+
+ /* Check if we can load 1x VEC without crossing a page. */
+ testl $(PAGE_SIZE - VEC_SIZE), %ecx
+ jz L(page_cross)
+
+ /* NB: This load happens regardless of whether rdx (len) is zero. Since
+ it doesn't cross a page and the standard guarantees any pointer has
+ at least one valid byte this load must be safe. For the entire
+ history of the x86 memrchr implementation this has been possible so
+ no code "should" be relying on a zero-length check before this load.
+ The zero-length check is moved to the page cross case because it is
+ 1) pretty cold and 2) including it pushes the hot case (len <= VEC_SIZE)
+ into 2 cache lines. */
+ movups -(VEC_SIZE)(%rcx), %xmm1
+ pcmpeqb %xmm0, %xmm1
+ pmovmskb %xmm1, %eax
+
+ subq $VEC_SIZE, %rdx
+ ja L(more_1x_vec)
+L(ret_vec_x0_test):
+ /* Zero-flag set if eax (src) is zero. Destination unchanged if src is
+ zero. */
+ bsrl %eax, %eax
+ jz L(ret_0)
+ /* Check if the CHAR match is in bounds. Need to truly zero `eax` here
+ if out of bounds. */
+ addl %edx, %eax
+ jl L(zero_0)
+ /* Since we subtracted VEC_SIZE from rdx earlier we can just add to base
+ ptr. */
+ addq %rdi, %rax
+L(ret_0):
+ ret
+
+ .p2align 4,, 5
+L(ret_vec_x0):
+ bsrl %eax, %eax
+ leaq -(VEC_SIZE)(%rcx, %rax), %rax
+ ret
+
+ .p2align 4,, 2
+L(zero_0):
+ xorl %eax, %eax
+ ret
+
+
+ .p2align 4,, 8
+L(more_1x_vec):
+ testl %eax, %eax
+ jnz L(ret_vec_x0)
+
+ /* Align rcx (pointer to string). */
+ decq %rcx
+ andq $-VEC_SIZE, %rcx
+
+ movq %rcx, %rdx
+ /* NB: We could consistently save 1-byte in this pattern with `movaps
+ %xmm0, %xmm1; pcmpeq IMM8(r), %xmm1; ...`. The reason against it is
+ it adds more frontend uops (even if the moves can be eliminated) and
+ some percentage of the time actual backend uops. */
+ movaps -(VEC_SIZE)(%rcx), %xmm1
+ pcmpeqb %xmm0, %xmm1
+ subq %rdi, %rdx
+ pmovmskb %xmm1, %eax
+
+ cmpq $(VEC_SIZE * 2), %rdx
+ ja L(more_2x_vec)
+L(last_2x_vec):
+ subl $VEC_SIZE, %edx
+ jbe L(ret_vec_x0_test)
+
+ testl %eax, %eax
+ jnz L(ret_vec_x0)
+
+ movaps -(VEC_SIZE * 2)(%rcx), %xmm1
+ pcmpeqb %xmm0, %xmm1
+ pmovmskb %xmm1, %eax
+
+ subl $VEC_SIZE, %edx
+ bsrl %eax, %eax
+ jz L(ret_1)
+ addl %edx, %eax
+ jl L(zero_0)
+ addq %rdi, %rax
+L(ret_1):
+ ret
+
+ /* Don't align. Otherwise losing the 2-byte encoding in the jump to
+ L(page_cross) causes the hot path (length <= VEC_SIZE) to span
+ multiple cache lines. Naturally aligned % 16 to 8-bytes. */
+L(page_cross):
+ /* Zero length check. */
+ testq %rdx, %rdx
+ jz L(zero_0)
+
+ leaq -1(%rcx), %r8
+ andq $-(VEC_SIZE), %r8
+
+ movaps (%r8), %xmm1
+ pcmpeqb %xmm0, %xmm1
+ pmovmskb %xmm1, %esi
+ /* Shift out negative alignment (because we are starting from endptr and
+ working backwards). */
+ negl %ecx
+ /* 32-bit shift but VEC_SIZE=16 so need to mask the shift count
+ explicitly. */
+ andl $(VEC_SIZE - 1), %ecx
+ shl %cl, %esi
+ movzwl %si, %eax
+ leaq (%rdi, %rdx), %rcx
+ cmpq %rdi, %r8
+ ja L(more_1x_vec)
+ subl $VEC_SIZE, %edx
+ bsrl %eax, %eax
+ jz L(ret_2)
+ addl %edx, %eax
+ jl L(zero_1)
+ addq %rdi, %rax
+L(ret_2):
+ ret
+
+ /* Fits in aligning bytes. */
+L(zero_1):
+ xorl %eax, %eax
+ ret
+
+ .p2align 4,, 5
+L(ret_vec_x1):
+ bsrl %eax, %eax
+ leaq -(VEC_SIZE * 2)(%rcx, %rax), %rax
+ ret
+
+ .p2align 4,, 8
+L(more_2x_vec):
+ testl %eax, %eax
+ jnz L(ret_vec_x0)
+
+ movaps -(VEC_SIZE * 2)(%rcx), %xmm1
+ pcmpeqb %xmm0, %xmm1
+ pmovmskb %xmm1, %eax
+ testl %eax, %eax
+ jnz L(ret_vec_x1)
+
+
+ movaps -(VEC_SIZE * 3)(%rcx), %xmm1
+ pcmpeqb %xmm0, %xmm1
+ pmovmskb %xmm1, %eax
+
+ subq $(VEC_SIZE * 4), %rdx
+ ja L(more_4x_vec)
+
+ addl $(VEC_SIZE), %edx
+ jle L(ret_vec_x2_test)
+
+L(last_vec):
+ testl %eax, %eax
+ jnz L(ret_vec_x2)
+
+ movaps -(VEC_SIZE * 4)(%rcx), %xmm1
+ pcmpeqb %xmm0, %xmm1
+ pmovmskb %xmm1, %eax
+
+ subl $(VEC_SIZE), %edx
+ bsrl %eax, %eax
+ jz L(ret_3)
+ addl %edx, %eax
+ jl L(zero_2)
+ addq %rdi, %rax
+L(ret_3):
+ ret
+
+ .p2align 4,, 6
+L(ret_vec_x2_test):
+ bsrl %eax, %eax
+ jz L(zero_2)
+ addl %edx, %eax
+ jl L(zero_2)
+ addq %rdi, %rax
+ ret
+
+L(zero_2):
+ xorl %eax, %eax
+ ret
+
+
+ .p2align 4,, 5
+L(ret_vec_x2):
+ bsrl %eax, %eax
+ leaq -(VEC_SIZE * 3)(%rcx, %rax), %rax
+ ret
+
+ .p2align 4,, 5
+L(ret_vec_x3):
+ bsrl %eax, %eax
+ leaq -(VEC_SIZE * 4)(%rcx, %rax), %rax
+ ret
+
+ .p2align 4,, 8
+L(more_4x_vec):
+ testl %eax, %eax
+ jnz L(ret_vec_x2)
+
+ movaps -(VEC_SIZE * 4)(%rcx), %xmm1
+ pcmpeqb %xmm0, %xmm1
+ pmovmskb %xmm1, %eax
+
+ testl %eax, %eax
+ jnz L(ret_vec_x3)
+
+ addq $-(VEC_SIZE * 4), %rcx
+ cmpq $(VEC_SIZE * 4), %rdx
+ jbe L(last_4x_vec)
+
+ /* Offset everything by 4x VEC_SIZE here to save a few bytes at the end
+ keeping the code from spilling to the next cache line. */
+ addq $(VEC_SIZE * 4 - 1), %rcx
+ andq $-(VEC_SIZE * 4), %rcx
+ leaq (VEC_SIZE * 4)(%rdi), %rdx
+ andq $-(VEC_SIZE * 4), %rdx
+
+ .p2align 4,, 11
+L(loop_4x_vec):
+ movaps (VEC_SIZE * -1)(%rcx), %xmm1
+ movaps (VEC_SIZE * -2)(%rcx), %xmm2
+ movaps (VEC_SIZE * -3)(%rcx), %xmm3
+ movaps (VEC_SIZE * -4)(%rcx), %xmm4
+ pcmpeqb %xmm0, %xmm1
+ pcmpeqb %xmm0, %xmm2
+ pcmpeqb %xmm0, %xmm3
+ pcmpeqb %xmm0, %xmm4
+
+ por %xmm1, %xmm2
+ por %xmm3, %xmm4
+ por %xmm2, %xmm4
+
+ pmovmskb %xmm4, %esi
+ testl %esi, %esi
+ jnz L(loop_end)
+
+ addq $-(VEC_SIZE * 4), %rcx
+ cmpq %rdx, %rcx
+ jne L(loop_4x_vec)
+
+ subl %edi, %edx
+
+ /* Ends up being 1-byte nop. */
+ .p2align 4,, 2
+L(last_4x_vec):
+ movaps -(VEC_SIZE)(%rcx), %xmm1
+ pcmpeqb %xmm0, %xmm1
+ pmovmskb %xmm1, %eax
+
+ cmpl $(VEC_SIZE * 2), %edx
+ jbe L(last_2x_vec)
+
+ testl %eax, %eax
+ jnz L(ret_vec_x0)
+
+
+ movaps -(VEC_SIZE * 2)(%rcx), %xmm1
+ pcmpeqb %xmm0, %xmm1
+ pmovmskb %xmm1, %eax
+
+ testl %eax, %eax
+ jnz L(ret_vec_end)
+
+ movaps -(VEC_SIZE * 3)(%rcx), %xmm1
+ pcmpeqb %xmm0, %xmm1
+ pmovmskb %xmm1, %eax
+
+ subl $(VEC_SIZE * 3), %edx
+ ja L(last_vec)
+ bsrl %eax, %eax
+ jz L(ret_4)
+ addl %edx, %eax
+ jl L(zero_3)
+ addq %rdi, %rax
+L(ret_4):
+ ret
+
+ /* Ends up being 1-byte nop. */
+ .p2align 4,, 3
+L(loop_end):
+ pmovmskb %xmm1, %eax
+ sall $16, %eax
+ jnz L(ret_vec_end)
+
+ pmovmskb %xmm2, %eax
+ testl %eax, %eax
+ jnz L(ret_vec_end)
+
+ pmovmskb %xmm3, %eax
+ /* Combine last 2 VEC matches. If eax (VEC3) is zero (no CHAR in VEC3)
+ then it won't affect the result in esi (VEC4). If eax is non-zero
+ then CHAR is in VEC3 and bsrl will use that position. */
+ sall $16, %eax
+ orl %esi, %eax
+ bsrl %eax, %eax
+ leaq -(VEC_SIZE * 4)(%rcx, %rax), %rax
+ ret
-#include "../memrchr.S"
+L(ret_vec_end):
+ bsrl %eax, %eax
+ leaq (VEC_SIZE * -2)(%rax, %rcx), %rax
+ ret
+ /* Used in L(last_4x_vec), in the same cache line. This is just spare
+ aligning bytes. */
+L(zero_3):
+ xorl %eax, %eax
+ ret
+ /* 2-bytes from next cache line. */
+END(MEMRCHR)
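The commit message only claims build testing; a quick functional sanity check of the resulting memrchr could look like the following standalone program (hypothetical, not part of this commit or of the glibc test suite), exercising the cases the comments above single out: zero length, a match found while scanning backwards, and no match at all.

/* Hypothetical standalone check, not part of this commit.
   memrchr is a GNU extension, so _GNU_SOURCE is required.
   Build with: gcc -O2 check.c && ./a.out  */
#define _GNU_SOURCE
#include <assert.h>
#include <string.h>

int
main (void)
{
  char buf[64];
  memset (buf, 'a', sizeof buf);
  buf[5] = 'x';
  buf[40] = 'x';

  /* The last occurrence wins.  */
  assert (memrchr (buf, 'x', sizeof buf) == buf + 40);
  /* Searching a shorter prefix finds the earlier match.  */
  assert (memrchr (buf, 'x', 32) == buf + 5);
  /* No match and zero length both return NULL.  */
  assert (memrchr (buf, 'z', sizeof buf) == NULL);
  assert (memrchr (buf, 'x', 0) == NULL);
  return 0;
}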