author     Wilco Dijkstra <wilco.dijkstra@arm.com>   2023-01-11 13:53:05 +0000
committer  Wilco Dijkstra <wilco.dijkstra@arm.com>   2024-04-08 16:46:11 +0100
commit     7cbcc959270dcf25789b2f09c4bf1c60a86b760a (patch)
tree       75811df41b1680b2a817f2ca68f402eac10be47b
parent     daa13ed0acd760f0c853e5521bfbfc050cc23574 (diff)
AArch64: Optimize strnlen
Optimize strnlen using the shrn instruction and improve the main loop.
Small strings are around 10% faster, large strings are 40% faster on
modern CPUs.

Reviewed-by: Szabolcs Nagy <szabolcs.nagy@arm.com>
(cherry picked from commit ad098893ba3c3344a5f2f6ab1627c47204afdb47)
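The core trick is the shrn-based "nibble mask": compare each byte of a
16-byte chunk against zero, then shift-right-and-narrow by 4 so every byte
contributes four bits to a 64-bit syndrome, where counting trailing zeros
locates the first NUL. Below is a minimal C sketch of that detection step
using NEON intrinsics; the file name, function name and test harness are
illustrative, not part of the patch, and the sketch assumes little-endian
(the assembly handles big-endian separately with rbit under __AARCH64EB__).

/* Build on AArch64, e.g. "gcc -O2 shrn_demo.c".  Little-endian only.  */
#include <arm_neon.h>
#include <stdint.h>
#include <stdio.h>

/* Index of the first zero byte in a 16-byte chunk, or 16 if there is none.  */
static int
first_zero_in_chunk (const uint8_t *chunk)
{
  uint8x16_t data = vld1q_u8 (chunk);
  /* 0xff in every lane that held a zero byte.  */
  uint8x16_t cmp = vceqq_u8 (data, vdupq_n_u8 (0));
  /* Shift right and narrow by 4: each byte now contributes 4 bits to a
     64-bit syndrome whose nibble order matches the byte order.  */
  uint8x8_t nib = vshrn_n_u16 (vreinterpretq_u16_u8 (cmp), 4);
  uint64_t synd = vget_lane_u64 (vreinterpret_u64_u8 (nib), 0);
  if (synd == 0)
    return 16;
  return __builtin_ctzll (synd) >> 2;	/* 4 syndrome bits per byte.  */
}

int
main (void)
{
  uint8_t buf[16] = "hello\0world....";
  printf ("first zero at index %d\n", first_zero_in_chunk (buf));  /* 5 */
  return 0;
}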
-rw-r--r--  sysdeps/aarch64/strnlen.S | 39
1 file changed, 18 insertions, 21 deletions
diff --git a/sysdeps/aarch64/strnlen.S b/sysdeps/aarch64/strnlen.S
index 282bddc9aa..a44a49a920 100644
--- a/sysdeps/aarch64/strnlen.S
+++ b/sysdeps/aarch64/strnlen.S
@@ -44,19 +44,16 @@
/*
Core algorithm:
-
- For each 16-byte chunk we calculate a 64-bit nibble mask value with four bits
- per byte. We take 4 bits of every comparison byte with shift right and narrow
- by 4 instruction. Since the bits in the nibble mask reflect the order in
- which things occur in the original string, counting trailing zeros identifies
- exactly which byte matched. */
+ Process the string in 16-byte aligned chunks. Compute a 64-bit mask with
+ four bits per byte using the shrn instruction. A count trailing zeros then
+ identifies the first zero byte. */
ENTRY (__strnlen)
PTR_ARG (0)
SIZE_ARG (1)
bic src, srcin, 15
cbz cntin, L(nomatch)
- ld1 {vdata.16b}, [src], 16
+ ld1 {vdata.16b}, [src]
cmeq vhas_chr.16b, vdata.16b, 0
lsl shift, srcin, 2
shrn vend.8b, vhas_chr.8h, 4 /* 128->64 */
@@ -71,36 +68,40 @@ L(finish):
csel result, cntin, result, ls
ret
+L(nomatch):
+ mov result, cntin
+ ret
+
L(start_loop):
sub tmp, src, srcin
+ add tmp, tmp, 17
subs cntrem, cntin, tmp
- b.ls L(nomatch)
+ b.lo L(nomatch)
/* Make sure that it won't overread by a 16-byte chunk */
- add tmp, cntrem, 15
- tbnz tmp, 4, L(loop32_2)
-
+ tbz cntrem, 4, L(loop32_2)
+ sub src, src, 16
.p2align 5
L(loop32):
- ldr qdata, [src], 16
+ ldr qdata, [src, 32]!
cmeq vhas_chr.16b, vdata.16b, 0
umaxp vend.16b, vhas_chr.16b, vhas_chr.16b /* 128->64 */
fmov synd, dend
cbnz synd, L(end)
L(loop32_2):
- ldr qdata, [src], 16
+ ldr qdata, [src, 16]
subs cntrem, cntrem, 32
cmeq vhas_chr.16b, vdata.16b, 0
- b.ls L(end)
+ b.lo L(end_2)
umaxp vend.16b, vhas_chr.16b, vhas_chr.16b /* 128->64 */
fmov synd, dend
cbz synd, L(loop32)
-
+L(end_2):
+ add src, src, 16
L(end):
shrn vend.8b, vhas_chr.8h, 4 /* 128->64 */
- sub src, src, 16
- mov synd, vend.d[0]
sub result, src, srcin
+ fmov synd, dend
#ifndef __AARCH64EB__
rbit synd, synd
#endif
@@ -110,10 +111,6 @@ L(end):
csel result, cntin, result, ls
ret
-L(nomatch):
- mov result, cntin
- ret
-
END (__strnlen)
libc_hidden_def (__strnlen)
weak_alias (__strnlen, strnlen)
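For reference, here is a hedged C model of the overall structure the patch
implements and that the in-file comment describes: align the pointer down to
16 bytes, discard syndrome nibbles for the bytes before the real start
(cf. "lsl shift, srcin, 2"), then walk aligned 16-byte chunks and clamp the
result to the maximum length (cf. "csel result, cntin, result, ls"). This is
a simplified sketch, not the glibc code: it ignores the 32-byte loop
unrolling, assumes little-endian, and, like the assembly, reads whole
aligned chunks, which is not strictly portable C; strnlen_sketch and
nibble_mask are illustrative names.

#include <arm_neon.h>
#include <stddef.h>
#include <stdint.h>

/* 64-bit syndrome with 4 bits set per zero byte of the 16-byte chunk.  */
static uint64_t
nibble_mask (uint8x16_t data)
{
  uint8x16_t cmp = vceqq_u8 (data, vdupq_n_u8 (0));
  uint8x8_t nib = vshrn_n_u16 (vreinterpretq_u16_u8 (cmp), 4);
  return vget_lane_u64 (vreinterpret_u64_u8 (nib), 0);
}

static size_t
strnlen_sketch (const char *s, size_t maxlen)
{
  if (maxlen == 0)
    return 0;

  const uint8_t *src = (const uint8_t *) ((uintptr_t) s & ~(uintptr_t) 15);
  unsigned off = (uintptr_t) s & 15;

  /* First aligned chunk: drop nibbles for the off bytes before s,
     cf. "lsl shift, srcin, 2" in the assembly.  */
  uint64_t synd = nibble_mask (vld1q_u8 (src)) >> (off * 4);
  size_t len;

  if (synd != 0)
    len = (size_t) (__builtin_ctzll (synd) >> 2);
  else
    {
      len = 16 - off;
      for (;;)
	{
	  if (len >= maxlen)		/* Limit reached without a NUL.  */
	    return maxlen;
	  src += 16;
	  synd = nibble_mask (vld1q_u8 (src));
	  if (synd != 0)
	    {
	      len += (size_t) (__builtin_ctzll (synd) >> 2);
	      break;
	    }
	  len += 16;
	}
    }

  /* Clamp, cf. "csel result, cntin, result, ls".  */
  return len < maxlen ? len : maxlen;
}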