]> Cypherpunks repositories - gostls13.git/commitdiff
src/internal/bytealg: optimize the function Compare on loong64
authorXiaolin Zhao <zhaoxiaolin@loongson.cn>
Mon, 3 Jun 2024 03:45:04 +0000 (11:45 +0800)
committerGopher Robot <gobot@golang.org>
Wed, 11 Sep 2024 22:34:54 +0000 (22:34 +0000)
The relevant performance improved by 66.73%.

benchmark:
goos: linux
goarch: loong64
pkg: bytes
cpu: Loongson-3A6000 @ 2500.00MHz
                                      │     old      │                 new                 │
                                      │    sec/op    │   sec/op     vs base                │
BytesCompare/1                           5.603n ± 0%   4.002n ± 0%  -28.57% (p=0.000 n=20)
BytesCompare/2                           6.405n ± 0%   4.002n ± 0%  -37.52% (p=0.000 n=20)
BytesCompare/4                           8.007n ± 0%   4.002n ± 0%  -50.02% (p=0.000 n=20)
BytesCompare/8                          11.210n ± 0%   4.002n ± 0%  -64.30% (p=0.000 n=20)
BytesCompare/16                          6.005n ± 0%   4.802n ± 0%  -20.03% (p=0.000 n=20)
BytesCompare/32                          6.806n ± 0%   4.402n ± 0%  -35.32% (p=0.000 n=20)
BytesCompare/64                          8.407n ± 0%   6.003n ± 0%  -28.60% (p=0.000 n=20)
BytesCompare/128                        11.610n ± 0%   8.404n ± 0%  -27.61% (p=0.000 n=20)
BytesCompare/256                         18.02n ± 0%   14.01n ± 0%  -22.25% (p=0.000 n=20)
BytesCompare/512                         31.23n ± 0%   26.98n ± 0%  -13.61% (p=0.000 n=20)
BytesCompare/1024                        56.85n ± 0%   52.43n ± 0%   -7.77% (p=0.000 n=20)
BytesCompare/2048                        108.1n ± 0%   103.8n ± 0%   -3.98% (p=0.000 n=20)
CompareBytesEqual                       15.610n ± 0%   5.203n ± 0%  -66.67% (p=0.000 n=20)
CompareBytesToNil                        3.203n ± 0%   3.202n ± 0%   -0.03% (p=0.000 n=20)
CompareBytesEmpty                        3.203n ± 0%   2.423n ± 0%  -24.35% (p=0.000 n=20)
CompareBytesIdentical                    3.203n ± 0%   2.424n ± 0%  -24.32% (p=0.000 n=20)
CompareBytesSameLength                   8.407n ± 0%   8.004n ± 0%   -4.79% (p=0.000 n=20)
CompareBytesDifferentLength              8.808n ± 0%   7.604n ± 0%  -13.67% (p=0.000 n=20)
CompareBytesBigUnaligned/offset=1       839.85µ ± 0%   82.04µ ± 0%  -90.23% (p=0.000 n=20)
CompareBytesBigUnaligned/offset=2       839.86µ ± 0%   82.03µ ± 0%  -90.23% (p=0.000 n=20)
CompareBytesBigUnaligned/offset=3       839.86µ ± 0%   82.03µ ± 0%  -90.23% (p=0.000 n=20)
CompareBytesBigUnaligned/offset=4       839.86µ ± 0%   82.03µ ± 0%  -90.23% (p=0.000 n=20)
CompareBytesBigUnaligned/offset=5       839.85µ ± 0%   82.04µ ± 0%  -90.23% (p=0.000 n=20)
CompareBytesBigUnaligned/offset=6       839.85µ ± 0%   82.03µ ± 0%  -90.23% (p=0.000 n=20)
CompareBytesBigUnaligned/offset=7       839.85µ ± 0%   82.03µ ± 0%  -90.23% (p=0.000 n=20)
CompareBytesBigBothUnaligned/offset=0    78.77µ ± 0%   78.75µ ± 0%   -0.03% (p=0.000 n=20)
CompareBytesBigBothUnaligned/offset=1   839.84µ ± 0%   85.31µ ± 0%  -89.84% (p=0.000 n=20)
CompareBytesBigBothUnaligned/offset=2   839.84µ ± 0%   85.31µ ± 0%  -89.84% (p=0.000 n=20)
CompareBytesBigBothUnaligned/offset=3   839.85µ ± 0%   85.31µ ± 0%  -89.84% (p=0.000 n=20)
CompareBytesBigBothUnaligned/offset=4   839.83µ ± 0%   85.31µ ± 0%  -89.84% (p=0.000 n=20)
CompareBytesBigBothUnaligned/offset=5   839.85µ ± 0%   85.31µ ± 0%  -89.84% (p=0.000 n=20)
CompareBytesBigBothUnaligned/offset=6   839.85µ ± 0%   85.31µ ± 0%  -89.84% (p=0.000 n=20)
CompareBytesBigBothUnaligned/offset=7   839.84µ ± 0%   85.31µ ± 0%  -89.84% (p=0.000 n=20)
CompareBytesBig                          78.77µ ± 0%   78.75µ ± 0%   -0.03% (p=0.001 n=20)
CompareBytesBigIdentical                 2.802n ± 0%   2.801n ± 0%   -0.04% (p=0.001 n=20)
geomean                                  1.524µ        507.2n       -66.73%

Change-Id: Ice9f4ef0ce0fbb5a6424823c5f8e0c0c369fd159
Reviewed-on: https://go-review.googlesource.com/c/go/+/589538
LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com>
Reviewed-by: Tim King <taking@google.com>
Reviewed-by: abner chenc <chenguoqi@loongson.cn>
Auto-Submit: Tim King <taking@google.com>
Reviewed-by: Dmitri Shuralyov <dmitshur@google.com>
src/internal/bytealg/compare_loong64.s

index df72a1122bc714434b11bfaf90a15caf33b10631..99c8cda77521d01fd8030fce45782109fe42518c 100644 (file)
@@ -28,58 +28,136 @@ TEXT runtime·cmpstring<ABIInternal>(SB),NOSPLIT,$0-40
 // R7 length of b
 // R4 points to the start of a
 // R6 points to the start of b
-// R13 points to the return value (-1/0/1)
+// for regabi the return value (-1/0/1) in R4
 TEXT cmpbody<>(SB),NOSPLIT|NOFRAME,$0
-       BEQ     R4, R6, samebytes // same start of a and b
+       BEQ     R4, R6, cmp_len         // same start of a and b, then compare lengths
 
        SGTU    R5, R7, R9
-       BNE     R0, R9, r2_lt_r1
+       BNE     R9, b_lt_a
        MOVV    R5, R14
        JMP     entry
-r2_lt_r1:
-       MOVV    R7, R14 // R14 is min(R4, R5)
+b_lt_a:
+       MOVV    R7, R14                 // R14 is min(R5, R7)
 entry:
-       ADDV    R4, R14, R12    // R6 start of a, R14 end of a
-       BEQ     R4, R12, samebytes // length is 0
+       ADDV    R4, R14, R12            // R4 start of a, R12 end of a
+       BEQ     R4, R12, cmp_len        // minlength is 0
 
-       SRLV    $4, R14         // R14 is number of chunks
-       BEQ     R0, R14, byte_loop
+tail:
+       MOVV    $2, R15
+       BLT     R14, R15, cmp1          // min < 2
+       SLLV    $1, R15
+       BLT     R14, R15, cmp2          // min < 4
+       SLLV    $1, R15
+       BLT     R14, R15, cmp4          // min < 8
+       SLLV    $1, R15
+       BLT     R14, R15, cmp8          // min < 16
+       SLLV    $1, R15
+       BLT     R14, R15, cmp16         // min < 32
 
-       // make sure both a and b are aligned.
-       OR      R4, R6, R15
-       AND     $7, R15
-       BNE     R0, R15, byte_loop
+// When min >= 32 bytes, enter the cmp32_loop loop processing:
+// take out 4 8-bytes from a and b in turn for comparison.
+cmp32_loop:
+       MOVV    (R4), R8
+       MOVV    (R6), R9
+       MOVV    8(R4), R10
+       MOVV    8(R6), R11
+       BNE     R8, R9, cmp8a
+       BNE     R10, R11, cmp8b
+       MOVV    16(R4), R8
+       MOVV    16(R6), R9
+       MOVV    24(R4), R10
+       MOVV    24(R6), R11
+       BNE     R8, R9, cmp8a
+       BNE     R10, R11, cmp8b
+       ADDV    $32, R4
+       ADDV    $32, R6
+       SUBV    $32, R14
+       BGE     R14, R15, cmp32_loop
+       BEQ     R14, cmp_len
 
-       PCALIGN $16
-chunk16_loop:
-       BEQ     R0, R14, byte_loop
+check16:
+       MOVV    $16, R15
+       BLT     R14, R15, check8
+cmp16:
        MOVV    (R4), R8
        MOVV    (R6), R9
-       BNE     R8, R9, byte_loop
-       MOVV    8(R4), R16
-       MOVV    8(R6), R17
+       MOVV    8(R4), R10
+       MOVV    8(R6), R11
+       BNE     R8, R9, cmp8a
+       BNE     R10, R11, cmp8b
        ADDV    $16, R4
        ADDV    $16, R6
-       SUBVU   $1, R14
-       BEQ     R16, R17, chunk16_loop
-       SUBV    $8, R4
-       SUBV    $8, R6
+       SUBV    $16, R14
+       BEQ     R14, cmp_len
+
+check8:
+       MOVV    $8, R15
+       BLT     R14, R15, check4
+cmp8:
+       MOVV    (R4), R8
+       MOVV    (R6), R9
+       BNE     R8, R9, cmp8a
+       ADDV    $8, R4
+       ADDV    $8, R6
+       SUBV    $8, R14
+       BEQ     R14, cmp_len
 
-byte_loop:
-       BEQ     R4, R12, samebytes
+check4:
+       MOVV    $4, R15
+       BLT     R14, R15, check2
+cmp4:
+       MOVW    (R4), R8
+       MOVW    (R6), R9
+       BNE     R8, R9, cmp8a
+       ADDV    $4, R4
+       ADDV    $4, R6
+       SUBV    $4, R14
+       BEQ     R14, cmp_len
+
+check2:
+       MOVV    $2, R15
+       BLT     R14, R15, cmp1
+cmp2:
+       MOVH    (R4), R8
+       MOVH    (R6), R9
+       BNE     R8, R9, cmp8a
+       ADDV    $2, R4
+       ADDV    $2, R6
+       SUBV    $2, R14
+       BEQ     R14, cmp_len
+
+cmp1:
+       BEQ     R14, cmp_len
        MOVBU   (R4), R8
-       ADDVU   $1, R4
        MOVBU   (R6), R9
-       ADDVU   $1, R6
-       BEQ     R8, R9, byte_loop
+       BNE     R8, R9, byte_cmp
+       JMP     cmp_len
+
+       // Compare 8/4/2 bytes taken from R8/R9 that are known to differ.
+cmp8a:
+       MOVV    R8, R10
+       MOVV    R9, R11
+
+       // Compare 8/4/2 bytes taken from R10/R11 that are known to differ.
+cmp8b:
+       MOVV    $0xff, R15
+
+       // Take single bytes from R10/R11 in turn for cyclic comparison.
+cmp8_loop:
+       AND     R10, R15, R8
+       AND     R11, R15, R9
+       BNE     R8, R9, byte_cmp
+       SLLV    $8, R15
+       JMP     cmp8_loop
 
+       // Compare 1 bytes taken from R8/R9 that are known to differ.
 byte_cmp:
-       SGTU    R8, R9, R4 // R12 = 1 if (R8 > R9)
+       SGTU    R8, R9, R4              // R4 = 1 if (R8 > R9)
        BNE     R0, R4, ret
        MOVV    $-1, R4
        JMP     ret
 
-samebytes:
+cmp_len:
        SGTU    R5, R7, R8
        SGTU    R7, R5, R9
        SUBV    R9, R8, R4