]> Cypherpunks repositories - gostls13.git/commitdiff
internal/bytealg: optimize Index/IndexString/IndexByte/IndexByteString on arm64
authorVasily Leonenko <vasiliy.leonenko@gmail.com>
Wed, 2 Apr 2025 18:31:41 +0000 (21:31 +0300)
committerGopher Robot <gobot@golang.org>
Thu, 3 Apr 2025 18:25:06 +0000 (11:25 -0700)
Introduce ABIInternal support for Index/IndexString/IndexByte/IndexByteString

goos: linux
goarch: arm64
pkg: bytes
                              │   base.txt    │                new.txt                │
                              │      B/s      │      B/s       vs base                │
IndexByte/10                     1.090Gi ± 0%    1.313Gi ± 0%  +20.51% (p=0.000 n=10)
IndexByte/32                     3.714Gi ± 0%    4.289Gi ± 0%  +15.47% (p=0.000 n=10)
IndexByte/4K                     22.92Gi ± 0%    23.01Gi ± 0%   +0.37% (p=0.000 n=10)
IndexByte/4M                     20.23Gi ± 0%    20.35Gi ± 0%   +0.60% (p=0.000 n=10)
IndexByte/64M                    23.82Gi ± 0%    23.81Gi ± 0%   -0.01% (p=0.002 n=10)
IndexBytePortable/10             788.5Mi ± 0%    788.5Mi ± 0%        ~ (p=0.722 n=10)
IndexBytePortable/32            1002.3Mi ± 0%   1002.3Mi ± 0%        ~ (p=0.137 n=10)
IndexBytePortable/4K             1.111Gi ± 0%    1.111Gi ± 0%        ~ (p=0.692 n=10)
IndexBytePortable/4M             1.116Gi ± 0%    1.116Gi ± 0%        ~ (p=0.158 n=10)
IndexBytePortable/64M            1.116Gi ± 0%    1.116Gi ± 0%   -0.01% (p=0.000 n=10)
IndexRune/10                     352.1Mi ± 0%    445.0Mi ± 0%  +26.38% (p=0.000 n=10)
IndexRune/32                     1.101Gi ± 0%    1.391Gi ± 0%  +26.43% (p=0.000 n=10)
IndexRune/4K                     21.07Gi ± 0%    21.25Gi ± 0%   +0.82% (p=0.000 n=10)
IndexRune/4M                     23.81Gi ± 0%    23.81Gi ± 0%        ~ (p=0.218 n=10)
IndexRune/64M                    23.81Gi ± 0%    23.81Gi ± 0%        ~ (p=0.271 n=10)
IndexRuneASCII/10                1.038Gi ± 0%    1.190Gi ± 1%  +14.63% (p=0.000 n=10)
IndexRuneASCII/32                3.643Gi ± 2%    4.203Gi ± 0%  +15.38% (p=0.000 n=10)
IndexRuneASCII/4K                22.90Gi ± 0%    22.98Gi ± 0%   +0.34% (p=0.000 n=10)
IndexRuneASCII/4M                23.81Gi ± 0%    23.81Gi ± 0%        ~ (p=0.108 n=10)
IndexRuneASCII/64M               23.82Gi ± 0%    23.81Gi ± 0%        ~ (p=0.105 n=10)
IndexRuneUnicode/Latin/10        404.4Mi ± 0%    493.7Mi ± 0%  +22.10% (p=0.000 n=10)
IndexRuneUnicode/Latin/32        1.261Gi ± 0%    1.543Gi ± 0%  +22.31% (p=0.000 n=10)
IndexRuneUnicode/Latin/4K        6.966Gi ± 0%    8.115Gi ± 0%  +16.50% (p=0.000 n=10)
IndexRuneUnicode/Latin/4M        6.599Gi ± 0%    7.576Gi ± 0%  +14.80% (p=0.000 n=10)
IndexRuneUnicode/Latin/64M       6.297Gi ± 0%    7.070Gi ± 2%  +12.28% (p=0.000 n=10)
IndexRuneUnicode/Cyrillic/10     385.9Mi ± 0%    440.1Mi ± 0%  +14.03% (p=0.000 n=10)
IndexRuneUnicode/Cyrillic/32     1.206Gi ± 0%    1.375Gi ± 0%  +14.05% (p=0.000 n=10)
IndexRuneUnicode/Cyrillic/4K     2.468Gi ± 0%    2.921Gi ± 0%  +18.37% (p=0.000 n=10)
IndexRuneUnicode/Cyrillic/4M     2.386Gi ± 0%    2.845Gi ± 0%  +19.23% (p=0.000 n=10)
IndexRuneUnicode/Cyrillic/64M    2.280Gi ± 0%    2.717Gi ± 0%  +19.14% (p=0.000 n=10)
IndexRuneUnicode/Han/10          307.1Mi ± 0%    331.5Mi ± 0%   +7.94% (p=0.000 n=10)
IndexRuneUnicode/Han/32          982.2Mi ± 0%   1060.2Mi ± 0%   +7.94% (p=0.000 n=10)
IndexRuneUnicode/Han/4K          4.986Gi ± 0%    5.957Gi ± 0%  +19.48% (p=0.000 n=10)
IndexRuneUnicode/Han/4M          3.822Gi ± 0%    4.198Gi ± 0%   +9.83% (p=0.000 n=10)
IndexRuneUnicode/Han/64M         3.765Gi ± 0%    4.140Gi ± 0%   +9.96% (p=0.000 n=10)
Index/10                         634.6Mi ± 0%    635.2Mi ± 0%   +0.09% (p=0.000 n=10)
Index/32                         375.3Mi ± 0%    385.1Mi ± 0%   +2.63% (p=0.000 n=10)
Index/4K                         754.8Mi ± 0%    755.2Mi ± 0%   +0.04% (p=0.001 n=10)
Index/4M                         746.5Mi ± 0%    746.3Mi ± 0%   -0.03% (p=0.000 n=10)
Index/64M                        746.5Mi ± 0%    746.3Mi ± 0%   -0.03% (p=0.000 n=10)
IndexEasy/10                     714.6Mi ± 0%    714.6Mi ± 0%   +0.00% (p=0.001 n=10)
IndexEasy/32                     1.221Gi ± 0%    1.524Gi ± 0%  +24.81% (p=0.000 n=10)
IndexEasy/4K                     21.06Gi ± 0%    21.47Gi ± 0%   +1.91% (p=0.000 n=10)
IndexEasy/4M                     20.23Gi ± 0%    20.24Gi ± 0%        ~ (p=0.684 n=10)
IndexEasy/64M                    13.07Gi ± 0%    12.58Gi ± 4%   -3.75% (p=0.000 n=10)
IndexHard1                       1.114Gi ± 0%    1.114Gi ± 0%        ~ (p=0.193 n=10)
IndexHard2                       1.111Gi ± 0%    1.112Gi ± 0%   +0.04% (p=0.001 n=10)
IndexHard3                       1.086Gi ± 0%    1.081Gi ± 0%   -0.37% (p=0.000 n=10)
IndexHard4                       607.9Mi ± 0%    607.9Mi ± 0%        ~ (p=0.136 n=10)
geomean                          2.536Gi         2.720Gi        +7.26%

Change-Id: I1fc246783ebb215882d7144d05dbe2433dc66751
Reviewed-on: https://go-review.googlesource.com/c/go/+/662415
Reviewed-by: Keith Randall <khr@google.com>
Reviewed-by: Cherry Mui <cherryyz@google.com>
Reviewed-by: Keith Randall <khr@golang.org>
LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com>
Auto-Submit: Keith Randall <khr@golang.org>

src/internal/bytealg/index_arm64.s
src/internal/bytealg/indexbyte_arm64.s

index 3a551a72da13932257ee95528f6e83690947801c..38e0b14e75781a04c5c075a079c6ed816c89666a 100644 (file)
@@ -5,29 +5,30 @@
 #include "go_asm.h"
 #include "textflag.h"
 
-TEXT ·Index(SB),NOSPLIT,$0-56
-       MOVD    a_base+0(FP), R0
-       MOVD    a_len+8(FP), R1
-       MOVD    b_base+24(FP), R2
-       MOVD    b_len+32(FP), R3
-       MOVD    $ret+48(FP), R9
-       B       indexbody<>(SB)
-
-TEXT ·IndexString(SB),NOSPLIT,$0-40
-       MOVD    a_base+0(FP), R0
-       MOVD    a_len+8(FP), R1
-       MOVD    b_base+16(FP), R2
-       MOVD    b_len+24(FP), R3
-       MOVD    $ret+32(FP), R9
-       B       indexbody<>(SB)
+// func Index(a, b []byte) int
+// input:
+//   R0: a ptr (haystack)
+//   R1: a len (haystack)
+//   R2: a cap (haystack) (unused)
+//   R3: b ptr (needle)
+//   R4: b len (needle) (2 <= len <= 32)
+//   R5: b cap (needle) (unused)
+// return:
+//   R0: result
+TEXT ·Index<ABIInternal>(SB),NOSPLIT,$0-56
+       MOVD    R3, R2
+       MOVD    R4, R3
+       B       ·IndexString<ABIInternal>(SB)
 
+// func IndexString(a, b string) int
 // input:
-//   R0: haystack
-//   R1: length of haystack
-//   R2: needle
-//   R3: length of needle (2 <= len <= 32)
-//   R9: address to put result
-TEXT indexbody<>(SB),NOSPLIT,$0-56
+//   R0: a ptr (haystack)
+//   R1: a len (haystack)
+//   R2: b ptr (needle)
+//   R3: b len (needle) (2 <= len <= 32)
+// return:
+//   R0: result
+TEXT ·IndexString<ABIInternal>(SB),NOSPLIT,$0-40
        // main idea is to load 'sep' into separate register(s)
        // to avoid repeatedly re-load it again and again
        // for sebsequent substring comparisons
@@ -136,11 +137,9 @@ loop_2:
        BNE     loop_2
 found:
        SUB     R8, R0, R0
-       MOVD    R0, (R9)
        RET
 not_found:
        MOVD    $-1, R0
-       MOVD    R0, (R9)
        RET
 greater_8:
        SUB     $9, R3, R11     // len(sep) - 9, offset of R0 for last 8 bytes
index 40843fbc5b443a022aa4ff5bc78be9a97761ffd9..92a61a43029a86a1fea544b168bd465c434c0ec6 100644 (file)
@@ -4,26 +4,26 @@
 
 #include "textflag.h"
 
-TEXT ·IndexByte(SB),NOSPLIT,$0-40
-       MOVD    b_base+0(FP), R0
-       MOVD    b_len+8(FP), R2
-       MOVBU   c+24(FP), R1
-       MOVD    $ret+32(FP), R8
-       B       indexbytebody<>(SB)
-
-TEXT ·IndexByteString(SB),NOSPLIT,$0-32
-       MOVD    s_base+0(FP), R0
-       MOVD    s_len+8(FP), R2
-       MOVBU   c+16(FP), R1
-       MOVD    $ret+24(FP), R8
-       B       indexbytebody<>(SB)
+// func IndexByte(b []byte, c byte) int
+// input:
+//   R0: b ptr
+//   R1: b len
+//   R2: b cap (unused)
+//   R3: c byte to search
+// return
+//   R0: result
+TEXT ·IndexByte<ABIInternal>(SB),NOSPLIT,$0-40
+       MOVD    R3, R2
+       B       ·IndexByteString<ABIInternal>(SB)
 
+// func IndexByteString(s string, c byte) int
 // input:
-//   R0: data
-//   R1: byte to search
-//   R2: data len
-//   R8: address to put result
-TEXT indexbytebody<>(SB),NOSPLIT,$0
+//   R0: s ptr
+//   R1: s len
+//   R2: c byte to search
+// return
+//   R0: result
+TEXT ·IndexByteString<ABIInternal>(SB),NOSPLIT,$0-32
        // Core algorithm:
        // For each 32-byte chunk we calculate a 64-bit syndrome value,
        // with two bits per byte. For each tuple, bit 0 is set if the
@@ -33,19 +33,19 @@ TEXT indexbytebody<>(SB),NOSPLIT,$0
        // in the original string, counting trailing zeros allows to
        // identify exactly which byte has matched.
 
-       CBZ     R2, fail
+       CBZ     R1, fail
        MOVD    R0, R11
        // Magic constant 0x40100401 allows us to identify
        // which lane matches the requested byte.
        // 0x40100401 = ((1<<0) + (4<<8) + (16<<16) + (64<<24))
        // Different bytes have different bit masks (i.e: 1, 4, 16, 64)
        MOVD    $0x40100401, R5
-       VMOV    R1, V0.B16
+       VMOV    R2, V0.B16
        // Work with aligned 32-byte chunks
        BIC     $0x1f, R0, R3
        VMOV    R5, V5.S4
        ANDS    $0x1f, R0, R9
-       AND     $0x1f, R2, R10
+       AND     $0x1f, R1, R10
        BEQ     loop
 
        // Input string is not 32-byte aligned. We calculate the
@@ -53,7 +53,7 @@ TEXT indexbytebody<>(SB),NOSPLIT,$0
        // the first bytes and mask off the irrelevant part.
        VLD1.P  (R3), [V1.B16, V2.B16]
        SUB     $0x20, R9, R4
-       ADDS    R4, R2, R2
+       ADDS    R4, R1, R1
        VCMEQ   V0.B16, V1.B16, V3.B16
        VCMEQ   V0.B16, V2.B16, V4.B16
        VAND    V5.B16, V3.B16, V3.B16
@@ -72,7 +72,7 @@ TEXT indexbytebody<>(SB),NOSPLIT,$0
 
 loop:
        VLD1.P  (R3), [V1.B16, V2.B16]
-       SUBS    $0x20, R2, R2
+       SUBS    $0x20, R1, R1
        VCMEQ   V0.B16, V1.B16, V3.B16
        VCMEQ   V0.B16, V2.B16, V4.B16
        // If we're out of data we finish regardless of the result
@@ -117,10 +117,8 @@ tail:
        ADD     R6>>1, R3, R0
        // Compute the offset result
        SUB     R11, R0, R0
-       MOVD    R0, (R8)
        RET
 
 fail:
        MOVD    $-1, R0
-       MOVD    R0, (R8)
        RET