]> Cypherpunks repositories - gostls13.git/commitdiff
internal/bytealg: optimize Count/CountString on arm64
authorVasily Leonenko <vasiliy.leonenko@gmail.com>
Mon, 31 Mar 2025 20:25:00 +0000 (23:25 +0300)
committerGopher Robot <gobot@golang.org>
Thu, 3 Apr 2025 16:21:11 +0000 (09:21 -0700)
Introduce ABIInternal support for Count/CountString
Move <32 size block from function end to beginning as fastpath

goos: linux
goarch: arm64
pkg: strings
                   │   base.txt   │               new.txt                │
                   │     B/s      │     B/s       vs base                │
CountByte/10         672.5Mi ± 0%   692.9Mi ± 0%   +3.04% (p=0.000 n=10)
CountByte/32         3.592Gi ± 0%   3.970Gi ± 0%  +10.53% (p=0.000 n=10)
CountByte/4096       16.63Gi ± 0%   16.73Gi ± 0%   +0.64% (p=0.000 n=10)
CountByte/4194304    14.97Gi ± 2%   15.02Gi ± 1%        ~ (p=0.190 n=10)
CountByte/67108864   12.50Gi ± 0%   12.50Gi ± 0%        ~ (p=0.853 n=10)
geomean              5.931Gi        6.099Gi        +2.83%

Change-Id: I5af1be2b117d9fb8d570739637499923de62251c
Reviewed-on: https://go-review.googlesource.com/c/go/+/662395
LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com>
Auto-Submit: Keith Randall <khr@golang.org>
Reviewed-by: Keith Randall <khr@google.com>
Commit-Queue: Keith Randall <khr@golang.org>
Reviewed-by: Cherry Mui <cherryyz@google.com>
Reviewed-by: Keith Randall <khr@golang.org>
src/internal/bytealg/count_arm64.s

index e616627b1a210d207dff6161921394b5b53c691b..1e39cd5f3dab2fc1d80486399b78c20b39ea0ed3 100644 (file)
@@ -5,33 +5,45 @@
 #include "go_asm.h"
 #include "textflag.h"
 
-TEXT ·Count(SB),NOSPLIT,$0-40
-       MOVD    b_base+0(FP), R0
-       MOVD    b_len+8(FP), R2
-       MOVBU   c+24(FP), R1
-       MOVD    $ret+32(FP), R8
-       B       countbytebody<>(SB)
-
-TEXT ·CountString(SB),NOSPLIT,$0-32
-       MOVD    s_base+0(FP), R0
-       MOVD    s_len+8(FP), R2
-       MOVBU   c+16(FP), R1
-       MOVD    $ret+24(FP), R8
-       B       countbytebody<>(SB)
+// func Count(b []byte, c byte) int
+// input:
+//   R0: b ptr
+//   R1: b len
+//   R2: b cap
+//   R3: c byte to search
+// return:
+//   R0: result
+TEXT ·Count<ABIInternal>(SB),NOSPLIT,$0-40
+       MOVD    R3, R2
+       B       ·CountString<ABIInternal>(SB)
 
+// func CountString(s string, c byte) int
 // input:
-//   R0: data
-//   R2: data len
-//   R1: byte to find
-//   R8: address to put result
-TEXT countbytebody<>(SB),NOSPLIT,$0
+//   R0: s ptr
+//   R1: s len
+//   R2: c byte to search (due to ABIInternal upper bits can contain junk)
+// return:
+//   R0: result
+TEXT ·CountString<ABIInternal>(SB),NOSPLIT,$0-32
        // R11 = count of byte to search
        MOVD    $0, R11
        // short path to handle 0-byte case
-       CBZ     R2, done
-       CMP     $0x20, R2
-       // jump directly to tail if length < 32
-       BLO     tail
+       CBZ     R1, done
+       CMP     $0x20, R1
+       // jump directly to head if length >= 32
+       BHS     head
+tail:
+       // Work with tail shorter than 32 bytes
+       MOVBU.P 1(R0), R5
+       SUB     $1, R1, R1
+       CMP     R2.UXTB, R5
+       CINC    EQ, R11, R11
+       CBNZ    R1, tail
+done:
+       MOVD    R11, R0
+       RET
+       PCALIGN $16
+head:
        ANDS    $0x1f, R0, R9
        BEQ     chunk
        // Work with not 32-byte aligned head
@@ -40,24 +52,23 @@ TEXT countbytebody<>(SB),NOSPLIT,$0
        PCALIGN $16
 head_loop:
        MOVBU.P 1(R0), R5
-       CMP     R5, R1
+       CMP     R2.UXTB, R5
        CINC    EQ, R11, R11
-       SUB     $1, R2, R2
+       SUB     $1, R1, R1
        CMP     R0, R3
        BNE     head_loop
-       // Work with 32-byte aligned chunks
 chunk:
-       BIC     $0x1f, R2, R9
+       BIC     $0x1f, R1, R9
        // The first chunk can also be the last
        CBZ     R9, tail
        // R3 = end of 32-byte chunks
        ADD     R0, R9, R3
        MOVD    $1, R5
        VMOV    R5, V5.B16
-       // R2 = length of tail
-       SUB     R9, R2, R2
-       // Duplicate R1 (byte to search) to 16 1-byte elements of V0
-       VMOV    R1, V0.B16
+       // R1 = length of tail
+       SUB     R9, R1, R1
+       // Duplicate R2 (byte to search) to 16 1-byte elements of V0
+       VMOV    R2, V0.B16
        // Clear the low 64-bit element of V7 and V8
        VEOR    V7.B8, V7.B8, V7.B8
        VEOR    V8.B8, V8.B8, V8.B8
@@ -79,14 +90,5 @@ chunk_loop:
        BNE     chunk_loop
        VMOV    V8.D[0], R6
        ADD     R6, R11, R11
-       CBZ     R2, done
-tail:
-       // Work with tail shorter than 32 bytes
-       MOVBU.P 1(R0), R5
-       SUB     $1, R2, R2
-       CMP     R5, R1
-       CINC    EQ, R11, R11
-       CBNZ    R2, tail
-done:
-       MOVD    R11, (R8)
-       RET
+       CBZ     R1, done
+       B       tail