]> Cypherpunks repositories - gostls13.git/commitdiff
internal/bytealg: optimize the function compare using SIMD on loong64
authorlimeidan <limeidan@loongson.cn>
Wed, 7 May 2025 09:04:54 +0000 (17:04 +0800)
committerabner chenc <chenguoqi@loongson.cn>
Wed, 14 May 2025 00:41:37 +0000 (17:41 -0700)
goos: linux
goarch: loong64
pkg: bytes
cpu: Loongson-3A6000-HV @ 2500.00MHz
                  │      old      │                 new                  │
                  │    sec/op     │    sec/op     vs base                │
BytesCompare/1       7.238n ± 25%    5.204n ± 0%  -28.10% (p=0.001 n=10)
BytesCompare/2       7.242n ±  6%    5.204n ± 0%  -28.14% (p=0.000 n=10)
BytesCompare/4       7.229n ±  5%    4.403n ± 0%  -39.10% (p=0.000 n=10)
BytesCompare/8       7.077n ± 36%    4.403n ± 0%  -37.78% (p=0.000 n=10)
BytesCompare/16      8.373n ±  6%    6.004n ± 0%  -28.30% (p=0.000 n=10)
BytesCompare/32      8.040n ±  3%    4.803n ± 0%  -40.26% (p=0.000 n=10)
BytesCompare/64      8.434n ± 24%   10.410n ± 0%  +23.42% (p=0.014 n=10)
BytesCompare/128    11.530n ± 23%    5.604n ± 0%  -51.40% (p=0.000 n=10)
BytesCompare/256    14.180n ±  0%    7.606n ± 0%  -46.36% (p=0.000 n=10)
BytesCompare/512     26.83n ±  0%    10.81n ± 0%  -59.71% (p=0.000 n=10)
BytesCompare/1024    52.60n ±  0%    17.21n ± 0%  -67.28% (p=0.000 n=10)
BytesCompare/2048   103.70n ±  0%    30.02n ± 0%  -71.05% (p=0.000 n=10)
geomean              13.49n          7.607n       -43.63%

goos: linux
goarch: loong64
pkg: bytes
cpu: Loongson-3A6000-HV @ 2500.00MHz
                                      │     old     │                 new                  │
                                      │   sec/op    │    sec/op     vs base                │
CompareBytesEqual                       5.603n ± 0%   5.604n ±  0%        ~ (p=0.191 n=10)
CompareBytesToNil                       3.202n ± 0%   3.202n ±  0%        ~ (p=1.000 n=10)
CompareBytesEmpty                       2.802n ± 0%   2.802n ±  0%        ~ (p=1.000 n=10)
CompareBytesIdentical                   3.202n ± 0%   2.538n ±  1%  -20.72% (p=0.000 n=10)
CompareBytesSameLength                  8.805n ± 0%   4.803n ±  0%  -45.45% (p=0.000 n=10)
CompareBytesDifferentLength             9.206n ± 0%   4.403n ±  0%  -52.17% (p=0.000 n=10)
CompareBytesBigUnaligned/offset=1       82.04µ ± 0%   45.91µ ±  0%  -44.04% (p=0.000 n=10)
CompareBytesBigUnaligned/offset=2       82.04µ ± 0%   45.91µ ±  0%  -44.04% (p=0.000 n=10)
CompareBytesBigUnaligned/offset=3       82.04µ ± 0%   45.91µ ±  0%  -44.04% (p=0.000 n=10)
CompareBytesBigUnaligned/offset=4       82.04µ ± 0%   45.91µ ±  0%  -44.04% (p=0.000 n=10)
CompareBytesBigUnaligned/offset=5       82.04µ ± 0%   45.91µ ±  0%  -44.04% (p=0.000 n=10)
CompareBytesBigUnaligned/offset=6       82.03µ ± 0%   45.93µ ±  0%  -44.01% (p=0.000 n=10)
CompareBytesBigUnaligned/offset=7       82.04µ ± 0%   45.93µ ±  0%  -44.01% (p=0.000 n=10)
CompareBytesBigBothUnaligned/offset=0   78.76µ ± 0%   45.69µ ±  0%  -41.98% (p=0.000 n=10)
CompareBytesBigBothUnaligned/offset=1   85.32µ ± 0%   46.04µ ±  0%  -46.03% (p=0.000 n=10)
CompareBytesBigBothUnaligned/offset=2   85.31µ ± 0%   46.04µ ±  0%  -46.03% (p=0.000 n=10)
CompareBytesBigBothUnaligned/offset=3   85.32µ ± 0%   46.04µ ±  0%  -46.03% (p=0.000 n=10)
CompareBytesBigBothUnaligned/offset=4   85.32µ ± 0%   46.04µ ±  0%  -46.03% (p=0.000 n=10)
CompareBytesBigBothUnaligned/offset=5   85.32µ ± 0%   46.04µ ±  0%  -46.03% (p=0.000 n=10)
CompareBytesBigBothUnaligned/offset=6   85.31µ ± 0%   46.06µ ±  0%  -46.02% (p=0.000 n=10)
CompareBytesBigBothUnaligned/offset=7   85.32µ ± 0%   52.32µ ±  7%  -38.68% (p=0.000 n=10)
CompareBytesBig                         78.76µ ± 0%   50.20µ ±  6%  -36.26% (p=0.000 n=10)
CompareBytesBigIdentical                3.202n ± 0%   3.442n ± 24%        ~ (p=0.462 n=10)
geomean                                 4.197µ        2.630µ        -37.34%

Change-Id: I621145aef3e6a2c68e7127152f26ed047c6b2ece
Reviewed-on: https://go-review.googlesource.com/c/go/+/671315
LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com>
Reviewed-by: Michael Knyszek <mknyszek@google.com>
Reviewed-by: abner chenc <chenguoqi@loongson.cn>
Reviewed-by: Cherry Mui <cherryyz@google.com>
src/internal/bytealg/compare_loong64.s

index 99c8cda77521d01fd8030fce45782109fe42518c..933053196449d541f57f4e0b3301281d86d5330e 100644 (file)
@@ -23,139 +23,140 @@ TEXT runtime·cmpstring<ABIInternal>(SB),NOSPLIT,$0-40
        // R7 = b_len
        JMP     cmpbody<>(SB)
 
-// On entry:
-// R5 length of a
-// R7 length of b
-// R4 points to the start of a
-// R6 points to the start of b
+// input:
+//    R4: points to the start of a
+//    R5: length of a
+//    R6: points to the start of b
+//    R7: length of b
 // for regabi the return value (-1/0/1) in R4
 TEXT cmpbody<>(SB),NOSPLIT|NOFRAME,$0
-       BEQ     R4, R6, cmp_len         // same start of a and b, then compare lengths
+       BEQ     R4, R6, cmp_len // same start of a and b, then compare lengths
 
        SGTU    R5, R7, R9
        BNE     R9, b_lt_a
        MOVV    R5, R14
        JMP     entry
+
 b_lt_a:
-       MOVV    R7, R14                 // R14 is min(R5, R7)
+       MOVV    R7, R14
+
 entry:
-       ADDV    R4, R14, R12            // R4 start of a, R12 end of a
-       BEQ     R4, R12, cmp_len        // minlength is 0
+       BEQ     R14, cmp_len    // minlength is 0
 
+       MOVV    $32, R15
+       BGE     R14, R15, lasx
 tail:
-       MOVV    $2, R15
-       BLT     R14, R15, cmp1          // min < 2
-       SLLV    $1, R15
-       BLT     R14, R15, cmp2          // min < 4
-       SLLV    $1, R15
-       BLT     R14, R15, cmp4          // min < 8
-       SLLV    $1, R15
-       BLT     R14, R15, cmp8          // min < 16
-       SLLV    $1, R15
-       BLT     R14, R15, cmp16         // min < 32
-
-// When min >= 32 bytes, enter the cmp32_loop loop processing:
-// take out 4 8-bytes from a and b in turn for comparison.
-cmp32_loop:
-       MOVV    (R4), R8
-       MOVV    (R6), R9
-       MOVV    8(R4), R10
-       MOVV    8(R6), R11
-       BNE     R8, R9, cmp8a
-       BNE     R10, R11, cmp8b
-       MOVV    16(R4), R8
-       MOVV    16(R6), R9
-       MOVV    24(R4), R10
-       MOVV    24(R6), R11
-       BNE     R8, R9, cmp8a
-       BNE     R10, R11, cmp8b
-       ADDV    $32, R4
-       ADDV    $32, R6
-       SUBV    $32, R14
-       BGE     R14, R15, cmp32_loop
-       BEQ     R14, cmp_len
-
-check16:
-       MOVV    $16, R15
-       BLT     R14, R15, check8
-cmp16:
-       MOVV    (R4), R8
-       MOVV    (R6), R9
-       MOVV    8(R4), R10
-       MOVV    8(R6), R11
-       BNE     R8, R9, cmp8a
-       BNE     R10, R11, cmp8b
-       ADDV    $16, R4
-       ADDV    $16, R6
-       SUBV    $16, R14
-       BEQ     R14, cmp_len
-
-check8:
        MOVV    $8, R15
-       BLT     R14, R15, check4
+       BLT     R14, R15, lt_8
+generic8_loop:
+       MOVV    (R4), R10
+       MOVV    (R6), R11
+       BEQ     R10, R11, generic8_equal
+
 cmp8:
-       MOVV    (R4), R8
-       MOVV    (R6), R9
-       BNE     R8, R9, cmp8a
+       AND     $0xff, R10, R16
+       AND     $0xff, R11, R17
+       BNE     R16, R17, cmp_byte
+
+       BSTRPICKV       $15, R10, $8, R16
+       BSTRPICKV       $15, R11, $8, R17
+       BNE     R16, R17, cmp_byte
+
+       BSTRPICKV       $23, R10, $16, R16
+       BSTRPICKV       $23, R11, $16, R17
+       BNE     R16, R17, cmp_byte
+
+       BSTRPICKV       $31, R10, $24, R16
+       BSTRPICKV       $31, R11, $24, R17
+       BNE     R16, R17, cmp_byte
+
+       BSTRPICKV       $39, R10, $32, R16
+       BSTRPICKV       $39, R11, $32, R17
+       BNE     R16, R17, cmp_byte
+
+       BSTRPICKV       $47, R10, $40, R16
+       BSTRPICKV       $47, R11, $40, R17
+       BNE     R16, R17, cmp_byte
+
+       BSTRPICKV       $55, R10, $48, R16
+       BSTRPICKV       $55, R11, $48, R17
+       BNE     R16, R17, cmp_byte
+
+       BSTRPICKV       $63, R10, $56, R16
+       BSTRPICKV       $63, R11, $56, R17
+       BNE     R16, R17, cmp_byte
+
+generic8_equal:
+       ADDV    $-8, R14
+       BEQ     R14, cmp_len
        ADDV    $8, R4
        ADDV    $8, R6
-       SUBV    $8, R14
-       BEQ     R14, cmp_len
+       BGE     R14, R15, generic8_loop
 
-check4:
+lt_8:
        MOVV    $4, R15
-       BLT     R14, R15, check2
-cmp4:
-       MOVW    (R4), R8
-       MOVW    (R6), R9
-       BNE     R8, R9, cmp8a
+       BLT     R14, R15, lt_4
+
+       MOVWU   (R4), R10
+       MOVWU   (R6), R11
+       BEQ     R10, R11, lt_8_equal
+
+       AND     $0xff, R10, R16
+       AND     $0xff, R11, R17
+       BNE     R16, R17, cmp_byte
+
+       BSTRPICKV       $15, R10, $8, R16
+       BSTRPICKV       $15, R11, $8, R17
+       BNE     R16, R17, cmp_byte
+
+       BSTRPICKV       $23, R10, $16, R16
+       BSTRPICKV       $23, R11, $16, R17
+       BNE     R16, R17, cmp_byte
+
+       BSTRPICKV       $31, R10, $24, R16
+       BSTRPICKV       $31, R11, $24, R17
+       BNE     R16, R17, cmp_byte
+
+lt_8_equal:
+       ADDV    $-4, R14
+       BEQ     R14, cmp_len
        ADDV    $4, R4
        ADDV    $4, R6
-       SUBV    $4, R14
-       BEQ     R14, cmp_len
 
-check2:
+lt_4:
        MOVV    $2, R15
-       BLT     R14, R15, cmp1
-cmp2:
-       MOVH    (R4), R8
-       MOVH    (R6), R9
-       BNE     R8, R9, cmp8a
+       BLT     R14, R15, lt_2
+
+       MOVHU   (R4), R10
+       MOVHU   (R6), R11
+       BEQ     R10, R11, lt_4_equal
+
+       AND     $0xff, R10, R16
+       AND     $0xff, R11, R17
+       BNE     R16, R17, cmp_byte
+
+       BSTRPICKV       $15, R10, $8, R16
+       BSTRPICKV       $15, R11, $8, R17
+       BNE     R16, R17, cmp_byte
+
+lt_4_equal:
+       ADDV    $-2, R14
+       BEQ     R14, cmp_len
        ADDV    $2, R4
        ADDV    $2, R6
-       SUBV    $2, R14
-       BEQ     R14, cmp_len
 
-cmp1:
-       BEQ     R14, cmp_len
-       MOVBU   (R4), R8
-       MOVBU   (R6), R9
-       BNE     R8, R9, byte_cmp
+lt_2:
+       MOVBU   (R4), R16
+       MOVBU   (R6), R17
+       BNE     R16, R17, cmp_byte
        JMP     cmp_len
 
-       // Compare 8/4/2 bytes taken from R8/R9 that are known to differ.
-cmp8a:
-       MOVV    R8, R10
-       MOVV    R9, R11
-
-       // Compare 8/4/2 bytes taken from R10/R11 that are known to differ.
-cmp8b:
-       MOVV    $0xff, R15
-
-       // Take single bytes from R10/R11 in turn for cyclic comparison.
-cmp8_loop:
-       AND     R10, R15, R8
-       AND     R11, R15, R9
-       BNE     R8, R9, byte_cmp
-       SLLV    $8, R15
-       JMP     cmp8_loop
-
-       // Compare 1 bytes taken from R8/R9 that are known to differ.
-byte_cmp:
-       SGTU    R8, R9, R4              // R4 = 1 if (R8 > R9)
+       // Compare 1 byte taken from R16/R17 that are known to differ.
+cmp_byte:
+       SGTU    R16, R17, R4    // R4 = 1 if (R16 > R17)
        BNE     R0, R4, ret
        MOVV    $-1, R4
-       JMP     ret
+       RET
 
 cmp_len:
        SGTU    R5, R7, R8
@@ -164,3 +165,199 @@ cmp_len:
 
 ret:
        RET
+
+lasx:
+       MOVV    $64, R20
+       MOVBU   internal∕cpu·Loong64+const_offsetLOONG64HasLASX(SB), R9
+       BEQ     R9, lsx
+
+       MOVV    $128, R15
+       BLT     R14, R15, lasx32_loop
+lasx128_loop:
+       XVMOVQ  (R4), X0
+       XVMOVQ  (R6), X1
+       XVSEQB  X0, X1, X0
+       XVSETANYEQB     X0, FCC0
+       BFPT    lasx_found_0
+
+       XVMOVQ  32(R4), X0
+       XVMOVQ  32(R6), X1
+       XVSEQB  X0, X1, X0
+       XVSETANYEQB     X0, FCC0
+       BFPT    lasx_found_32
+
+       XVMOVQ  64(R4), X0
+       XVMOVQ  64(R6), X1
+       XVSEQB  X0, X1, X0
+       XVSETANYEQB     X0, FCC0
+       BFPT    lasx_found_64
+
+       XVMOVQ  96(R4), X0
+       XVMOVQ  96(R6), X1
+       XVSEQB  X0, X1, X0
+       XVSETANYEQB     X0, FCC0
+       BFPT    lasx_found_96
+
+       ADDV    $-128, R14
+       BEQ     R14, cmp_len
+       ADDV    $128, R4
+       ADDV    $128, R6
+       BGE     R14, R15, lasx128_loop
+
+       MOVV    $32, R15
+       BLT     R14, R15, tail
+lasx32_loop:
+       XVMOVQ  (R4), X0
+       XVMOVQ  (R6), X1
+       XVSEQB  X0, X1, X0
+       XVSETANYEQB     X0, FCC0
+       BFPT    lasx_found_0
+
+       ADDV    $-32, R14
+       BEQ     R14, cmp_len
+       ADDV    $32, R4
+       ADDV    $32, R6
+       BGE     R14, R15, lasx32_loop
+       JMP     tail
+
+lasx_found_0:
+       MOVV    R0, R11
+       JMP     lasx_find_byte
+
+lasx_found_32:
+       MOVV    $32, R11
+       JMP     lasx_find_byte
+
+lasx_found_64:
+       MOVV    $64, R11
+       JMP     lasx_find_byte
+
+lasx_found_96:
+       MOVV    $96, R11
+
+lasx_find_byte:
+       XVMOVQ  X0.V[0], R10
+       CTOV    R10, R10
+       BNE     R10, R20, find_byte
+       ADDV    $8, R11
+
+       XVMOVQ  X0.V[1], R10
+       CTOV    R10, R10
+       BNE     R10, R20, find_byte
+       ADDV    $8, R11
+
+       XVMOVQ  X0.V[2], R10
+       CTOV    R10, R10
+       BNE     R10, R20, find_byte
+       ADDV    $8, R11
+
+       XVMOVQ  X0.V[3], R10
+       CTOV    R10, R10
+       JMP     find_byte
+
+lsx:
+       MOVBU   internal∕cpu·Loong64+const_offsetLOONG64HasLSX(SB), R9
+       BEQ     R9, generic32_loop
+
+       MOVV    $64, R15
+       BLT     R14, R15, lsx16_loop
+lsx64_loop:
+       VMOVQ   (R4), V0
+       VMOVQ   (R6), V1
+       VSEQB   V0, V1, V0
+       VSETANYEQB      V0, FCC0
+       BFPT    lsx_found_0
+
+       VMOVQ   16(R4), V0
+       VMOVQ   16(R6), V1
+       VSEQB   V0, V1, V0
+       VSETANYEQB      V0, FCC0
+       BFPT    lsx_found_16
+
+       VMOVQ   32(R4), V0
+       VMOVQ   32(R6), V1
+       VSEQB   V0, V1, V0
+       VSETANYEQB      V0, FCC0
+       BFPT    lsx_found_32
+
+       VMOVQ   48(R4), V0
+       VMOVQ   48(R6), V1
+       VSEQB   V0, V1, V0
+       VSETANYEQB      V0, FCC0
+       BFPT    lsx_found_48
+
+       ADDV    $-64, R14
+       BEQ     R14, cmp_len
+       ADDV    $64, R4
+       ADDV    $64, R6
+       BGE     R14, R15, lsx64_loop
+
+       MOVV    $16, R15
+       BLT     R14, R15, tail
+lsx16_loop:
+       VMOVQ   (R4), V0
+       VMOVQ   (R6), V1
+       VSEQB   V0, V1, V0
+       VSETANYEQB      V0, FCC0
+       BFPT    lsx_found_0
+
+       ADDV    $-16, R14
+       BEQ     R14, cmp_len
+       ADDV    $16, R4
+       ADDV    $16, R6
+       BGE     R14, R15, lsx16_loop
+       JMP     tail
+
+lsx_found_0:
+       MOVV    R0, R11
+       JMP     lsx_find_byte
+
+lsx_found_16:
+       MOVV    $16, R11
+       JMP     lsx_find_byte
+
+lsx_found_32:
+       MOVV    $32, R11
+       JMP     lsx_find_byte
+
+lsx_found_48:
+       MOVV    $48, R11
+
+lsx_find_byte:
+       VMOVQ   V0.V[0], R10
+       CTOV    R10, R10
+       BNE     R10, R20, find_byte
+       ADDV    $8, R11
+
+       VMOVQ   V0.V[1], R10
+       CTOV    R10, R10
+
+find_byte:
+       SRLV    $3, R10
+       ADDV    R10, R11
+       ADDV    R11, R4
+       ADDV    R11, R6
+       MOVB    (R4), R16
+       MOVB    (R6), R17
+       JMP     cmp_byte
+
+generic32_loop:
+       MOVV    (R4), R10
+       MOVV    (R6), R11
+       BNE     R10, R11, cmp8
+       MOVV    8(R4), R10
+       MOVV    8(R6), R11
+       BNE     R10, R11, cmp8
+       MOVV    16(R4), R10
+       MOVV    16(R6), R11
+       BNE     R10, R11, cmp8
+       MOVV    24(R4), R10
+       MOVV    24(R6), R11
+       BNE     R10, R11, cmp8
+       ADDV    $-32, R14
+       BEQ     R14, cmp_len
+       ADDV    $32, R4
+       ADDV    $32, R6
+       MOVV    $32, R15
+       BGE     R14, R15, generic32_loop
+       JMP     tail