Cypherpunks repositories - gostls13.git/commitdiff
runtime: optimize the function memequal using SIMD on loong64
author: limeidan <limeidan@loongson.cn>
Tue, 22 Apr 2025 02:24:27 +0000 (10:24 +0800)
committer: abner chenc <chenguoqi@loongson.cn>
Wed, 23 Apr 2025 08:29:52 +0000 (01:29 -0700)
goos: linux
goarch: loong64
pkg: bytes
cpu: Loongson-3A6000-HV @ 2500.00MHz
                              │      old      │                 new                  │
                              │    sec/op     │    sec/op     vs base                │
Equal/0                          0.4012n ± 0%   0.4003n ± 0%   -0.21% (p=0.000 n=10)
Equal/same/1                      2.555n ± 1%    2.419n ± 0%   -5.32% (p=0.000 n=10)
Equal/same/6                      2.574n ± 1%    2.425n ± 1%   -5.79% (p=0.000 n=10)
Equal/same/9                      2.578n ± 0%    2.419n ± 1%   -6.19% (p=0.000 n=10)
Equal/same/15                     2.565n ± 1%    2.417n ± 0%   -5.73% (p=0.000 n=10)
Equal/same/16                     2.576n ± 1%    2.414n ± 0%   -6.31% (p=0.000 n=10)
Equal/same/20                     2.573n ± 1%    2.416n ± 0%   -6.10% (p=0.000 n=10)
Equal/same/32                     2.559n ± 0%    2.411n ± 0%   -5.80% (p=0.000 n=10)
Equal/same/4K                     2.579n ± 1%    2.410n ± 0%   -6.53% (p=0.000 n=10)
Equal/same/4M                     2.571n ± 0%    2.411n ± 0%   -6.22% (p=0.000 n=10)
Equal/same/64M                    2.568n ± 1%    2.413n ± 0%   -6.05% (p=0.000 n=10)
Equal/1                           5.215n ± 0%    6.404n ± 0%  +22.80% (p=0.000 n=10)
Equal/6                          11.630n ± 0%    6.404n ± 0%  -44.94% (p=0.000 n=10)
Equal/9                          15.240n ± 0%    6.404n ± 0%  -57.98% (p=0.000 n=10)
Equal/15                         22.925n ± 0%    6.404n ± 0%  -72.07% (p=0.000 n=10)
Equal/16                         24.070n ± 0%    5.203n ± 0%  -78.38% (p=0.000 n=10)
Equal/20                         28.880n ± 0%    6.404n ± 0%  -77.83% (p=0.000 n=10)
Equal/32                         43.320n ± 0%    6.404n ± 0%  -85.22% (p=0.000 n=10)
Equal/4K                        4938.50n ± 0%    55.43n ± 0%  -98.88% (p=0.000 n=10)
Equal/4M                         5048.8µ ± 0%    202.0µ ± 0%  -96.00% (p=0.000 n=10)
Equal/64M                        80.819m ± 0%    4.539m ± 0%  -94.38% (p=0.000 n=10)
EqualBothUnaligned/64_0          79.830n ± 0%    4.803n ± 0%  -93.98% (p=0.000 n=10)
EqualBothUnaligned/64_1          79.830n ± 0%    4.803n ± 0%  -93.98% (p=0.000 n=10)
EqualBothUnaligned/64_4          79.830n ± 0%    4.803n ± 0%  -93.98% (p=0.000 n=10)
EqualBothUnaligned/64_7          79.830n ± 0%    4.803n ± 0%  -93.98% (p=0.000 n=10)
EqualBothUnaligned/4096_0       4937.00n ± 0%    65.64n ± 0%  -98.67% (p=0.000 n=10)
EqualBothUnaligned/4096_1       4937.00n ± 0%    78.85n ± 0%  -98.40% (p=0.000 n=10)
EqualBothUnaligned/4096_4       4937.00n ± 0%    78.87n ± 0%  -98.40% (p=0.000 n=10)
EqualBothUnaligned/4096_7       4937.00n ± 0%    78.87n ± 0%  -98.40% (p=0.000 n=10)
EqualBothUnaligned/4194304_0     5049.2µ ± 0%    204.2µ ± 0%  -95.96% (p=0.000 n=10)
EqualBothUnaligned/4194304_1     5049.2µ ± 0%    205.1µ ± 0%  -95.94% (p=0.000 n=10)
EqualBothUnaligned/4194304_4     5049.4µ ± 0%    205.1µ ± 0%  -95.94% (p=0.000 n=10)
EqualBothUnaligned/4194304_7     5049.2µ ± 0%    205.1µ ± 0%  -95.94% (p=0.000 n=10)
EqualBothUnaligned/67108864_0    80.796m ± 0%    3.863m ± 0%  -95.22% (p=0.000 n=10)
EqualBothUnaligned/67108864_1    80.801m ± 0%    3.706m ± 0%  -95.41% (p=0.000 n=10)
EqualBothUnaligned/67108864_4    80.799m ± 0%    3.706m ± 0%  -95.41% (p=0.000 n=10)
EqualBothUnaligned/67108864_7    80.781m ± 0%    3.706m ± 0%  -95.41% (p=0.000 n=10)
geomean                           1.040µ         149.6n       -85.63%

Change-Id: Id4c2bc0ca758337dd9759df83750c761814be488
Reviewed-on: https://go-review.googlesource.com/c/go/+/667255
Reviewed-by: abner chenc <chenguoqi@loongson.cn>
Reviewed-by: Michael Pratt <mpratt@google.com>
Reviewed-by: sophie zhao <zhaoxiaolin@loongson.cn>
LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com>
Reviewed-by: Junyang Shao <shaojunyang@google.com>
src/internal/bytealg/equal_loong64.s

index 830b09bd2cf3d59c22af0f415158ccc0fa5d5710..8f570e8eaedb8d9535f8380f3f7456caa15f6366 100644 (file)
 #define        REGCTXT R29
 
 // memequal(a, b unsafe.Pointer, size uintptr) bool
-TEXT runtime·memequal<ABIInternal>(SB),NOSPLIT|NOFRAME,$0-25
-       BEQ     R4, R5, eq
-       ADDV    R4, R6, R7
-       PCALIGN $16
-loop:
-       BNE     R4, R7, test
-       MOVV    $1, R4
-       RET
-test:
-       MOVBU   (R4), R9
-       ADDV    $1, R4
-       MOVBU   (R5), R10
-       ADDV    $1, R5
-       BEQ     R9, R10, loop
-
-       MOVB    R0, R4
-       RET
-eq:
-       MOVV    $1, R4
-       RET
+TEXT runtime·memequal<ABIInternal>(SB),NOSPLIT|NOFRAME,$0
+       // R4 = a_base (pointer to the first buffer)
+       // R5 = b_base (pointer to the second buffer)
+       // R6 = size (number of bytes to compare)
+       JMP     equalbody<>(SB)    // tail jump: equalbody leaves 1 (equal) or 0 (not equal) in R4 and its RET returns to our caller
 
 // memequal_varlen(a, b unsafe.Pointer) bool
-TEXT runtime·memequal_varlen<ABIInternal>(SB),NOSPLIT,$40-17
-       BEQ     R4, R5, eq
+TEXT runtime·memequal_varlen<ABIInternal>(SB),NOSPLIT,$0
+       // R4 = a_base (pointer to the first buffer)
+       // R5 = b_base (pointer to the second buffer); the size is not an argument, it is read from the closure below
        MOVV    8(REGCTXT), R6    // compiler stores size at offset 8 in the closure
-       MOVV    R4, 8(R3)
-       MOVV    R5, 16(R3)
-       MOVV    R6, 24(R3)
-       JAL     runtime·memequal(SB)
-       MOVBU   32(R3), R4
-       RET
-eq:
+       JMP     equalbody<>(SB)    // tail jump with R4/R5/R6 set up; equalbody also handles the a_base == b_base shortcut
+
+// equalbody compares size bytes at a_base and b_base; it returns 1 in R4 if they are equal and 0 otherwise. input:
+//   R4 = a_base (advanced as chunks are compared, then overwritten with the result)
+//   R5 = b_base (advanced in step with R4)
+//   R6 = size (used as the count of bytes still to compare)
+TEXT equalbody<>(SB),NOSPLIT|NOFRAME,$0
+       // a_base == b_base: same memory, so equal without reading it
+       BEQ     R4, R5, equal
+       // 0 bytes: nothing to compare
+       BEQ     R6, equal
+
+       MOVV    $64, R7
+       BGE     R6, R7, lasx    // 64 bytes or more: try the wide (SIMD or unrolled) paths
+
+       // size < 64 bytes (also the landing point for the remainder left by the wide loops)
+tail:
+       MOVV    $16, R7
+       BLT     R6, R7, lt_16
+generic16_loop:
+       ADDV    $-16, R6        // account for the 16 bytes compared in this iteration
+       MOVV    0(R4), R8
+       MOVV    8(R4), R9
+       MOVV    0(R5), R10
+       MOVV    8(R5), R11
+       BNE     R8, R10, not_equal
+       BNE     R9, R11, not_equal
+       BEQ     R6, equal       // nothing left to compare
+       ADDV    $16, R4
+       ADDV    $16, R5
+       BGE     R6, R7, generic16_loop    // R7 is still 16 here
+
+       // size < 16 bytes: compare at most one chunk of each of 8, 4, 2 and 1 bytes
+lt_16:
+       MOVV    $8, R7
+       BLT     R6, R7, lt_8
+       ADDV    $-8, R6
+       MOVV    0(R4), R8
+       MOVV    0(R5), R9
+       BNE     R8, R9, not_equal
+       BEQ     R6, equal
+       ADDV    $8, R4
+       ADDV    $8, R5
+
+       // size < 8 bytes
+lt_8:
+       MOVV    $4, R7
+       BLT     R6, R7, lt_4
+       ADDV    $-4, R6
+       MOVW    0(R4), R8       // 4-byte loads; both sides use the same load width, so only the loaded bytes decide the result
+       MOVW    0(R5), R9
+       BNE     R8, R9, not_equal
+       BEQ     R6, equal
+       ADDV    $4, R4
+       ADDV    $4, R5
+
+       // size < 4 bytes
+lt_4:
+       MOVV    $2, R7
+       BLT     R6, R7, lt_2
+       ADDV    $-2, R6
+       MOVH    0(R4), R8       // 2-byte loads
+       MOVH    0(R5), R9
+       BNE     R8, R9, not_equal
+       BEQ     R6, equal
+       ADDV    $2, R4
+       ADDV    $2, R5
+
+       // size < 2 bytes: one byte is left, because every earlier step exits to equal as soon as R6 reaches 0
+lt_2:
+       MOVB    0(R4), R8
+       MOVB    0(R5), R9
+       BNE     R8, R9, not_equal    // otherwise fall through into equal
+
+equal:
        MOVV    $1, R4
        RET
+
+not_equal:
+       MOVV    R0, R4          // R0 is the zero register: result = 0 (false)
+       RET
+
+       // Implemented using 256-bit SIMD instructions (reached only with size >= 64)
+lasx:
+       MOVBU   internal∕cpu·Loong64+const_offsetLOONG64HasLASX(SB), R7    // CPU feature flag read at run time
+       BEQ     R7, lsx         // LASX not available: try the 128-bit LSX path
+
+lasx256:
+       MOVV    $256, R7
+       BLT     R6, R7, lasx64
+lasx256_loop:
+       ADDV    $-256, R6       // account for the 256 bytes compared in this iteration
+       XVMOVQ  0(R4), X0       // X0..X7 = 8 x 32 bytes from a
+       XVMOVQ  32(R4), X1
+       XVMOVQ  64(R4), X2
+       XVMOVQ  96(R4), X3
+       XVMOVQ  128(R4), X4
+       XVMOVQ  160(R4), X5
+       XVMOVQ  192(R4), X6
+       XVMOVQ  224(R4), X7
+       XVMOVQ  0(R5), X8       // X8..X15 = the matching 8 x 32 bytes from b
+       XVMOVQ  32(R5), X9
+       XVMOVQ  64(R5), X10
+       XVMOVQ  96(R5), X11
+       XVMOVQ  128(R5), X12
+       XVMOVQ  160(R5), X13
+       XVMOVQ  192(R5), X14
+       XVMOVQ  224(R5), X15
+       XVSEQV  X0, X8, X0      // element-wise equality: each 64-bit element becomes a non-zero mask if equal, zero if not
+       XVSEQV  X1, X9, X1
+       XVSEQV  X2, X10, X2
+       XVSEQV  X3, X11, X3
+       XVSEQV  X4, X12, X4
+       XVSEQV  X5, X13, X5
+       XVSEQV  X6, X14, X6
+       XVSEQV  X7, X15, X7
+       XVANDV  X0, X1, X0      // AND the eight masks together as a tree, ending in X0
+       XVANDV  X2, X3, X2
+       XVANDV  X4, X5, X4
+       XVANDV  X6, X7, X6
+       XVANDV  X0, X2, X0
+       XVANDV  X4, X6, X4
+       XVANDV  X0, X4, X0
+       XVSETALLNEV     X0, FCC0    // FCC0 = true only if every element of the combined mask is non-zero
+       BFPF    not_equal       // FCC0 false: at least one element differed
+       BEQ     R6, equal
+       ADDV    $256, R4
+       ADDV    $256, R5
+       BGE     R6, R7, lasx256_loop    // R7 is still 256 here
+
+lasx64:
+       MOVV    $64, R7
+       BLT     R6, R7, tail    // fewer than 64 bytes left: finish with the scalar tail
+lasx64_loop:
+       ADDV    $-64, R6
+       XVMOVQ  0(R4), X0
+       XVMOVQ  32(R4), X1
+       XVMOVQ  0(R5), X2
+       XVMOVQ  32(R5), X3
+       XVSEQV  X0, X2, X0
+       XVSEQV  X1, X3, X1
+       XVANDV  X0, X1, X0
+       XVSETALLNEV     X0, FCC0
+       BFPF    not_equal
+       BEQ     R6, equal
+       ADDV    $64, R4
+       ADDV    $64, R5
+       BGE     R6, R7, lasx64_loop
+       JMP     tail
+
+       // Implemented using 128-bit SIMD instructions (same scheme as the LASX path, with 16-byte vectors)
+lsx:
+       MOVBU   internal∕cpu·Loong64+const_offsetLOONG64HasLSX(SB), R7    // CPU feature flag read at run time
+       BEQ     R7, generic64_loop    // no LSX either: use general-purpose registers
+
+lsx128:
+       MOVV    $128, R7
+       BLT     R6, R7, lsx32
+lsx128_loop:
+       ADDV    $-128, R6       // account for the 128 bytes compared in this iteration
+       VMOVQ   0(R4), V0       // V0..V7 = 8 x 16 bytes from a
+       VMOVQ   16(R4), V1
+       VMOVQ   32(R4), V2
+       VMOVQ   48(R4), V3
+       VMOVQ   64(R4), V4
+       VMOVQ   80(R4), V5
+       VMOVQ   96(R4), V6
+       VMOVQ   112(R4), V7
+       VMOVQ   0(R5), V8       // V8..V15 = the matching 8 x 16 bytes from b
+       VMOVQ   16(R5), V9
+       VMOVQ   32(R5), V10
+       VMOVQ   48(R5), V11
+       VMOVQ   64(R5), V12
+       VMOVQ   80(R5), V13
+       VMOVQ   96(R5), V14
+       VMOVQ   112(R5), V15
+       VSEQV   V0, V8, V0      // element-wise equality masks
+       VSEQV   V1, V9, V1
+       VSEQV   V2, V10, V2
+       VSEQV   V3, V11, V3
+       VSEQV   V4, V12, V4
+       VSEQV   V5, V13, V5
+       VSEQV   V6, V14, V6
+       VSEQV   V7, V15, V7
+       VANDV   V0, V1, V0      // AND the eight masks together, ending in V0
+       VANDV   V2, V3, V2
+       VANDV   V4, V5, V4
+       VANDV   V6, V7, V6
+       VANDV   V0, V2, V0
+       VANDV   V4, V6, V4
+       VANDV   V0, V4, V0
+       VSETALLNEV      V0, FCC0    // FCC0 = true only if every element of the combined mask is non-zero
+       BFPF    not_equal
+       BEQ     R6, equal
+
+       ADDV    $128, R4
+       ADDV    $128, R5
+       BGE     R6, R7, lsx128_loop    // R7 is still 128 here
+
+lsx32:
+       MOVV    $32, R7
+       BLT     R6, R7, tail    // fewer than 32 bytes left: finish with the scalar tail
+lsx32_loop:
+       ADDV    $-32, R6
+       VMOVQ   0(R4), V0
+       VMOVQ   16(R4), V1
+       VMOVQ   0(R5), V2
+       VMOVQ   16(R5), V3
+       VSEQV   V0, V2, V0
+       VSEQV   V1, V3, V1
+       VANDV   V0, V1, V0
+       VSETALLNEV      V0, FCC0
+       BFPF    not_equal
+       BEQ     R6, equal
+       ADDV    $32, R4
+       ADDV    $32, R5
+       BGE     R6, R7, lsx32_loop
+       JMP tail
+
+       // Implemented using general instructions: 64 bytes per iteration, checked as two 32-byte halves
+generic64_loop:
+       ADDV    $-64, R6
+       MOVV    0(R4), R7       // R7 is used as a data register here, so the 64 threshold is reloaded before the loop test
+       MOVV    8(R4), R8
+       MOVV    16(R4), R9
+       MOVV    24(R4), R10
+       MOVV    0(R5), R15
+       MOVV    8(R5), R16
+       MOVV    16(R5), R17
+       MOVV    24(R5), R18
+       BNE     R7, R15, not_equal
+       BNE     R8, R16, not_equal
+       BNE     R9, R17, not_equal
+       BNE     R10, R18, not_equal
+       MOVV    32(R4), R11     // second 32-byte half, loaded only if the first half matched
+       MOVV    40(R4), R12
+       MOVV    48(R4), R13
+       MOVV    56(R4), R14
+       MOVV    32(R5), R19
+       MOVV    40(R5), R20
+       MOVV    48(R5), R21
+       MOVV    56(R5), R23
+       BNE     R11, R19, not_equal
+       BNE     R12, R20, not_equal
+       BNE     R13, R21, not_equal
+       BNE     R14, R23, not_equal
+       BEQ     R6, equal
+       ADDV    $64, R4
+       ADDV    $64, R5
+       MOVV    $64, R7         // restore the loop threshold clobbered by the data load above
+       BGE     R6, R7, generic64_loop
+       JMP tail