Cypherpunks repositories - gostls13.git/commitdiff
internal/bytealg: vector implementation of indexbyte for riscv64
author Joel Sing <joel@sing.id.au>
Fri, 7 Feb 2025 14:03:23 +0000 (01:03 +1100)
committer Joel Sing <joel@sing.id.au>
Wed, 6 Aug 2025 13:23:02 +0000 (06:23 -0700)
Provide a vector implementation of indexbyte for riscv64, which is used
when compiled with the rva23u64 profile, or when vector is detected
to be available. Inputs that are smaller than 24 bytes will continue
to use the non-vector path.

On a Banana Pi F3, with GORISCV64=rva23u64:

                │  indexbyte.1  │             indexbyte.2              │
                │    sec/op     │    sec/op     vs base                │
IndexByte/10-8     52.68n ±  0%   47.26n ±  0%  -10.30% (p=0.000 n=10)
IndexByte/32-8     68.62n ±  0%   47.02n ±  0%  -31.49% (p=0.000 n=10)
IndexByte/4K-8    2217.0n ±  0%   420.4n ±  0%  -81.04% (p=0.000 n=10)
IndexByte/4M-8    2624.4µ ±  0%   767.5µ ±  0%  -70.75% (p=0.000 n=10)
IndexByte/64M-8    68.08m ± 10%   47.84m ± 45%  -29.73% (p=0.004 n=10)
geomean            17.03µ         8.073µ        -52.59%

                │ indexbyte.1  │               indexbyte.2               │
                │     B/s      │      B/s        vs base                 │
IndexByte/10-8    181.0Mi ± 0%    201.8Mi ±  0%   +11.48% (p=0.000 n=10)
IndexByte/32-8    444.7Mi ± 0%    649.1Mi ±  0%   +45.97% (p=0.000 n=10)
IndexByte/4K-8    1.721Gi ± 0%    9.076Gi ±  0%  +427.51% (p=0.000 n=10)
IndexByte/4M-8    1.488Gi ± 0%    5.089Gi ±  0%  +241.93% (p=0.000 n=10)
IndexByte/64M-8   940.3Mi ± 9%   1337.8Mi ± 31%   +42.27% (p=0.004 n=10)
geomean           727.1Mi         1.498Gi        +110.94%

Change-Id: If7b0dbef38d76fa7a2021e4ecaed668a1d4b9783
Reviewed-on: https://go-review.googlesource.com/c/go/+/648856
LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com>
Reviewed-by: Meng Zhuo <mengzhuo1203@gmail.com>
Reviewed-by: Mark Freeman <markfreeman@google.com>
Reviewed-by: Mark Ryan <markdryan@rivosinc.com>
Reviewed-by: Dmitri Shuralyov <dmitshur@google.com>
src/internal/bytealg/indexbyte_riscv64.s

index fde00da0eac7d9f986b3354167a31cfcded97bd7..527ae6d35ed55b1a60f33e43268f858ecca804b9 100644 (file)
@@ -2,6 +2,7 @@
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 
+#include "asm_riscv64.h"
 #include "go_asm.h"
 #include "textflag.h"
 
@@ -11,12 +12,14 @@ TEXT ·IndexByte<ABIInternal>(SB),NOSPLIT,$0-40
        // X12 = b_cap (unused)
        // X13 = byte to find
        AND     $0xff, X13, X12         // x12 byte to look for
-       MOV     X10, X13                // store base for later
 
        SLTI    $24, X11, X14
-       ADD     X10, X11                // end
-       BEQZ    X14, bigBody
+       BNEZ    X14, small
+       JMP     indexByteBig<>(SB)
 
+small:
+       MOV     X10, X13                // store base for later
+       ADD     X10, X11                // end
        SUB     $1, X10
 loop:
        ADD     $1, X10
@@ -31,21 +34,19 @@ notfound:
        MOV     $-1, X10
        RET
 
-bigBody:
-       JMP     indexByteBig<>(SB)
-
 TEXT ·IndexByteString<ABIInternal>(SB),NOSPLIT,$0-32
        // X10 = b_base
        // X11 = b_len
        // X12 = byte to find
-
        AND     $0xff, X12              // x12 byte to look for
-       MOV     X10, X13                // store base for later
 
        SLTI    $24, X11, X14
-       ADD     X10, X11                // end
-       BEQZ    X14, bigBody
+       BNEZ    X14, small
+       JMP     indexByteBig<>(SB)
 
+small:
+       MOV     X10, X13                // store base for later
+       ADD     X10, X11                // end
        SUB     $1, X10
 loop:
        ADD     $1, X10
@@ -60,20 +61,41 @@ notfound:
        MOV     $-1, X10
        RET
 
-bigBody:
-       JMP     indexByteBig<>(SB)
-
 TEXT indexByteBig<>(SB),NOSPLIT|NOFRAME,$0
-       // On entry
+       // On entry:
        // X10 = b_base
-       // X11 = end
+       // X11 = b_len (at least 16 bytes)
        // X12 = byte to find
-       // X13 = b_base
-       // X11 is at least 16 bytes > X10
-
-       // On exit
+       // On exit:
        // X10 = index of first instance of sought byte, if found, or -1 otherwise
 
+       MOV     X10, X13                // store base for later
+
+#ifndef hasV
+       MOVB    internal∕cpu·RISCV64+const_offsetRISCV64HasV(SB), X5
+       BEQZ    X5, indexbyte_scalar
+#endif
+
+       PCALIGN $16
+vector_loop:
+       VSETVLI X11, E8, M8, TA, MA, X5
+       VLE8V   (X10), V8
+       VMSEQVX X12, V8, V0
+       VFIRSTM V0, X6
+       BGEZ    X6, vector_found
+       ADD     X5, X10
+       SUB     X5, X11
+       BNEZ    X11, vector_loop
+       JMP     notfound
+
+vector_found:
+       SUB     X13, X10
+       ADD     X6, X10
+       RET
+
+indexbyte_scalar:
+       ADD     X10, X11                // end
+
        // Process the first few bytes until we get to an 8 byte boundary
        // No need to check for end here as we have at least 16 bytes in
        // the buffer.