Cypherpunks repositories - gostls13.git/commitdiff
internal/bytealg: optimize the function indexbyte using SIMD on loong64
author limeidan <limeidan@loongson.cn>
Wed, 23 Apr 2025 03:04:15 +0000 (11:04 +0800)
committer abner chenc <chenguoqi@loongson.cn>
Fri, 9 May 2025 02:13:30 +0000 (19:13 -0700)
goos: linux
goarch: loong64
pkg: bytes
cpu: Loongson-3C5000 @ 2200.00MHz
              │     old      │                 new                 │
              │    sec/op    │   sec/op     vs base                │
IndexByte/10     19.32n ± 0%   11.84n ± 0%  -38.72% (p=0.000 n=10)
IndexByte/32     49.34n ± 0%   14.11n ± 0%  -71.40% (p=0.000 n=10)
IndexByte/4K    5608.0n ± 0%   138.8n ± 0%  -97.52% (p=0.000 n=10)
IndexByte/4M    3822.8µ ± 0%   119.4µ ± 0%  -96.88% (p=0.000 n=10)
IndexByte/64M   61.826m ± 1%   3.812m ± 0%  -93.83% (p=0.000 n=10)
geomean          16.61µ        1.602µ       -90.35%

goos: linux
goarch: loong64
pkg: bytes
cpu: Loongson-3A6000-HV @ 2500.00MHz
              │      old      │                 new                  │
              │    sec/op     │    sec/op     vs base                │
IndexByte/10      6.809n ± 0%   5.804n ±  0%  -14.75% (p=0.000 n=10)
IndexByte/32     16.015n ± 0%   6.404n ±  0%  -60.01% (p=0.000 n=10)
IndexByte/4K    1651.00n ± 0%   52.83n ±  0%  -96.80% (p=0.000 n=10)
IndexByte/4M    1680.76µ ± 0%   91.10µ ±  0%  -94.58% (p=0.000 n=10)
IndexByte/64M    26.878m ± 0%   2.010m ± 27%  -92.52% (p=0.000 n=10)
geomean           6.054µ        815.0n        -86.54%
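The tables above are benchstat output comparing the bytes package IndexByte benchmarks before and after this change. A minimal sketch of an equivalent benchmark (the buffer sizes, sub-benchmark names, and placement of the target byte are assumptions for illustration; the real benchmarks live in the bytes package tests):

package bench

import (
	"bytes"
	"fmt"
	"testing"
)

func BenchmarkIndexByte(b *testing.B) {
	for _, n := range []int{10, 32, 4 << 10, 4 << 20, 64 << 20} {
		buf := make([]byte, n)
		buf[n-1] = 'x' // place the byte to find at the very end
		b.Run(fmt.Sprint(n), func(b *testing.B) {
			b.SetBytes(int64(n))
			for i := 0; i < b.N; i++ {
				if bytes.IndexByte(buf, 'x') != n-1 {
					b.Fatal("wrong index")
				}
			}
		})
	}
}

Running such a benchmark with -count=10 on the old and new toolchains and feeding both logs to benchstat produces the old/new/vs base columns shown above.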

Change-Id: Ib75b997249708f921c6717eba43543c6650bf376
Reviewed-on: https://go-review.googlesource.com/c/go/+/668055
Reviewed-by: Keith Randall <khr@google.com>
Reviewed-by: abner chenc <chenguoqi@loongson.cn>
Reviewed-by: Cherry Mui <cherryyz@google.com>
LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com>
Reviewed-by: sophie zhao <zhaoxiaolin@loongson.cn>
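The diff below replaces the previous byte-at-a-time scan in IndexByte and IndexByteString with vectorized LASX/LSX paths sharing a common body. For reference, a portable Go equivalent of the scalar loop being replaced (in the spirit of internal/bytealg's generic fallback) looks like this:

// IndexByte returns the index of the first instance of c in b, or -1.
func IndexByte(b []byte, c byte) int {
	for i, x := range b {
		if x == c {
			return i
		}
	}
	return -1
}

// IndexByteString is the same operation over a string.
func IndexByteString(s string, c byte) int {
	for i := 0; i < len(s); i++ {
		if s[i] == c {
			return i
		}
	}
	return -1
}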
src/internal/bytealg/indexbyte_loong64.s

index c9591b3cdaa569006726398088ffcde0e4fa8c99..3618501063b3123cd257f3b73e2b1b4c14c6900b 100644 (file)
 #include "go_asm.h"
 #include "textflag.h"
 
+// input:
+//   R4 = b_base
+//   R5 = b_len
+//   R6 = b_cap (unused)
+//   R7 = byte to find
 TEXT ·IndexByte<ABIInternal>(SB),NOSPLIT,$0-40
-       // R4 = b_base
-       // R5 = b_len
-       // R6 = b_cap (unused)
-       // R7 = byte to find
        AND     $0xff, R7
+       JMP     indexbytebody<>(SB)
+
+// input:
+//   R4 = s_base
+//   R5 = s_len
+//   R6 = byte to find
+TEXT ·IndexByteString<ABIInternal>(SB),NOSPLIT,$0-32
+       AND     $0xff, R6, R7   // byte to find
+       JMP     indexbytebody<>(SB)
+
+// input:
+//   R4: b_base
+//   R5: len
+//   R7: byte to find
+TEXT indexbytebody<>(SB),NOSPLIT,$0
+       BEQ     R5, notfound    // len == 0
+
        MOVV    R4, R6          // store base for later
-       ADDV    R4, R5          // end
-       ADDV    $-1, R4
+       ADDV    R4, R5, R8      // end
+
+       MOVV    $32, R9
+       BGE     R5, R9, lasx
+tail:
+       MOVV    $8, R9
+       BLT     R5, R9, lt_8
+generic8_loop:
+       MOVV    (R4), R10
+
+       AND     $0xff, R10, R11
+       BEQ     R7, R11, found
+
+       BSTRPICKV       $15, R10, $8, R11
+       BEQ     R7, R11, byte_1th
+
+       BSTRPICKV       $23, R10, $16, R11
+       BEQ     R7, R11, byte_2th
+
+       BSTRPICKV       $31, R10, $24, R11
+       BEQ     R7, R11, byte_3th
 
-       PCALIGN $16
-loop:
+       BSTRPICKV       $39, R10, $32, R11
+       BEQ     R7, R11, byte_4th
+
+       BSTRPICKV       $47, R10, $40, R11
+       BEQ     R7, R11, byte_5th
+
+       BSTRPICKV       $55, R10, $48, R11
+       BEQ     R7, R11, byte_6th
+
+       BSTRPICKV       $63, R10, $56, R11
+       BEQ     R7, R11, byte_7th
+
+       ADDV    $8, R4
+       ADDV    $-8, R5
+       BGE     R5, R9, generic8_loop
+
+lt_8:
+       BEQ     R4, R8, notfound
+       MOVBU   (R4), R10
+       BEQ     R7, R10, found
        ADDV    $1, R4
-       BEQ     R4, R5, notfound
-       MOVBU   (R4), R8
-       BNE     R7, R8, loop
+       JMP     lt_8
 
-       SUBV    R6, R4          // remove base
+byte_1th:
+       ADDV    $1, R4
+       SUBV    R6, R4
        RET
 
-notfound:
-       MOVV    $-1, R4
+byte_2th:
+       ADDV    $2, R4
+       SUBV    R6, R4
        RET
 
-TEXT ·IndexByteString<ABIInternal>(SB),NOSPLIT,$0-32
-       // R4 = s_base
-       // R5 = s_len
-       // R6 = byte to find
-       MOVV    R4, R7          // store base for later
-       ADDV    R4, R5          // end
-       ADDV    $-1, R4
-
-       PCALIGN $16
-loop:
-       ADDV    $1, R4
-       BEQ     R4, R5, notfound
-       MOVBU   (R4), R8
-       BNE     R6, R8, loop
+byte_3th:
+       ADDV    $3, R4
+       SUBV    R6, R4
+       RET
+
+byte_4th:
+       ADDV    $4, R4
+       SUBV    R6, R4
+       RET
+
+byte_5th:
+       ADDV    $5, R4
+       SUBV    R6, R4
+       RET
+
+byte_6th:
+       ADDV    $6, R4
+       SUBV    R6, R4
+       RET
+
+byte_7th:
+       ADDV    $7, R4
 
-       SUBV    R7, R4          // remove base
+found:
+       SUBV    R6, R4
        RET
 
 notfound:
        MOVV    $-1, R4
        RET
+
+lasx:
+       MOVBU   internal∕cpu·Loong64+const_offsetLOONG64HasLASX(SB), R9
+       BEQ     R9, lsx
+       XVMOVQ  R7, X0.B32
+
+       MOVV    $128, R9
+       BLT     R5, R9, lasx32_loop
+lasx128_loop:
+       XVMOVQ  0(R4), X1
+       XVMOVQ  32(R4), X2
+       XVMOVQ  64(R4), X3
+       XVMOVQ  96(R4), X4
+
+       XVSEQB  X1, X0, X1
+       XVSETNEV        X1, FCC0
+       BFPT    lasx_found_add_0
+
+       XVSEQB  X2, X0, X1
+       XVSETNEV        X1, FCC0
+       BFPT    lasx_found_add_32
+
+       XVSEQB  X3, X0, X1
+       XVSETNEV        X1, FCC0
+       BFPT    lasx_found_add_64
+
+       XVSEQB  X4, X0, X1
+       XVSETNEV        X1, FCC0
+       BFPT    lasx_found_add_96
+
+       ADDV    $128, R4
+       ADDV    $-128, R5
+       BGE     R5, R9, lasx128_loop
+
+       BEQ     R5, notfound
+
+       MOVV    $32, R9
+       BLT     R5, R9, tail
+lasx32_loop:
+       XVMOVQ  0(R4), X1
+
+       XVSEQB  X1, X0, X1
+       XVSETNEV        X1, FCC0
+       BFPT    lasx_found_add_0
+
+       ADDV    $32, R4
+       ADDV    $-32, R5
+       BGE     R5, R9, lasx32_loop
+
+       BEQ     R5, notfound
+
+       JMP     tail
+
+lasx_found_add_0:
+       MOVV    R0, R11
+       JMP     lasx_index_cal
+
+lasx_found_add_32:
+       MOVV    $32, R11
+       JMP     lasx_index_cal
+
+lasx_found_add_64:
+       MOVV    $64, R11
+       JMP     lasx_index_cal
+
+lasx_found_add_96:
+       MOVV    $96, R11
+       JMP     lasx_index_cal
+
+lasx_index_cal:
+       MOVV    $64, R9
+       XVMOVQ  X1.V[0], R10
+       CTZV    R10, R10
+       BNE     R10, R9, index_cal
+       ADDV    $8, R11
+
+       XVMOVQ  X1.V[1], R10
+       CTZV    R10, R10
+       BNE     R10, R9, index_cal
+       ADDV    $8, R11
+
+       XVMOVQ  X1.V[2], R10
+       CTZV    R10, R10
+       BNE     R10, R9, index_cal
+       ADDV    $8, R11
+
+       XVMOVQ  X1.V[3], R10
+       CTZV    R10, R10
+       JMP     index_cal
+
+lsx:
+       MOVBU   internal∕cpu·Loong64+const_offsetLOONG64HasLSX(SB), R9
+       BEQ     R9, tail
+       VMOVQ   R7, V0.B16
+
+       MOVV    $64, R9
+       BLT     R5, R9, lsx16_loop
+lsx64_loop:
+       VMOVQ   0(R4), V1
+       VMOVQ   16(R4), V2
+       VMOVQ   32(R4), V3
+       VMOVQ   48(R4), V4
+
+       VSEQB   V1, V0, V1
+       VSETNEV V1, FCC0
+       BFPT    lsx_found_add_0
+
+       VSEQB   V2, V0, V1
+       VSETNEV V1, FCC0
+       BFPT    lsx_found_add_16
+
+       VSEQB   V3, V0, V1
+       VSETNEV V1, FCC0
+       BFPT    lsx_found_add_32
+
+       VSEQB   V4, V0, V1
+       VSETNEV V1, FCC0
+       BFPT    lsx_found_add_48
+
+       ADDV    $64, R4
+       ADDV    $-64, R5
+       BGE     R5, R9, lsx64_loop
+
+       BEQ     R5, notfound
+
+       MOVV    $16, R9
+       BLT     R5, R9, tail
+lsx16_loop:
+       VMOVQ   0(R4), V1
+
+       VSEQB   V1, V0, V1
+       VSETNEV V1, FCC0
+       BFPT    lsx_found_add_0
+
+       ADDV    $16, R4
+       ADDV    $-16, R5
+       BGE     R5, R9, lsx16_loop
+
+       BEQ     R5, notfound
+
+       JMP     tail
+
+lsx_found_add_0:
+       MOVV    R0, R11
+       JMP     lsx_index_cal
+
+lsx_found_add_16:
+       MOVV    $16, R11
+       JMP     lsx_index_cal
+
+lsx_found_add_32:
+       MOVV    $32, R11
+       JMP     lsx_index_cal
+
+lsx_found_add_48:
+       MOVV    $48, R11
+       JMP     lsx_index_cal
+
+lsx_index_cal:
+       MOVV    $64, R9
+
+       VMOVQ   V1.V[0], R10
+       CTZV    R10, R10
+       BNE     R10, R9, index_cal
+       ADDV    $8, R11
+
+       VMOVQ   V1.V[1], R10
+       CTZV    R10, R10
+       JMP     index_cal
+
+index_cal:
+       SRLV    $3, R10
+       ADDV    R11, R10
+       ADDV    R10, R4
+       JMP     found
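Both vector paths follow the same pattern: broadcast the target byte across a vector register (XVMOVQ R7, X0.B32 / VMOVQ R7, V0.B16), compare a whole 32- or 16-byte block at once (XVSEQB/VSEQB sets a result byte to 0xFF where the input matches), test the compare mask for any nonzero byte (XVSETNEV/VSETNEV plus BFPT), and on a hit convert the mask into an index: count the trailing zeros of the first nonzero 64-bit lane, divide by 8 to get the byte position within that lane, then add the lane and block offsets (lasx_index_cal, lsx_index_cal, index_cal). An illustrative Go sketch of that index calculation, using 8-byte chunks in place of real vector registers (the helper name and chunk width are assumptions for illustration only):

package main

import (
	"encoding/binary"
	"fmt"
	"math/bits"
)

// indexByteChunked scans b in 8-byte chunks, building a mask whose bytes are
// 0xFF where the chunk byte equals c, then turns the first nonzero mask byte
// into a byte index via TrailingZeros/8, as index_cal does with CTZV and SRLV.
func indexByteChunked(b []byte, c byte) int {
	base := 0
	for ; base+8 <= len(b); base += 8 {
		chunk := binary.LittleEndian.Uint64(b[base:])
		var mask uint64
		for i := 0; i < 8; i++ {
			if byte(chunk>>(8*i)) == c {
				mask |= 0xFF << (8 * i)
			}
		}
		if mask != 0 {
			// CTZ(mask)>>3 gives the matching byte within the chunk;
			// add the chunk offset, as index_cal adds R11 and R4-R6.
			return base + bits.TrailingZeros64(mask)/8
		}
	}
	for i := base; i < len(b); i++ { // remaining tail bytes, like lt_8
		if b[i] == c {
			return i
		}
	}
	return -1
}

func main() {
	fmt.Println(indexByteChunked([]byte("hello, loong64"), '6')) // prints 12
}

In the real code the same arithmetic runs across the two (LSX) or four (LASX) 64-bit lanes of the matching vector register, adding 8 to the running offset for each all-zero lane before falling through to index_cal.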