From: limeidan
Date: Wed, 7 May 2025 09:04:54 +0000 (+0800)
Subject: internal/bytealg: optimize the function compare using SIMD on loong64
X-Git-Tag: go1.25rc1~280
X-Git-Url: http://www.git.cypherpunks.su/?a=commitdiff_plain;h=a1c3e2f008267b976e69866b599b113399ad4724;p=gostls13.git

internal/bytealg: optimize the function compare using SIMD on loong64

goos: linux
goarch: loong64
pkg: bytes
cpu: Loongson-3A6000-HV @ 2500.00MHz
                   │     old      │                 new                 │
                   │    sec/op    │    sec/op     vs base               │
BytesCompare/1        7.238n ± 25%    5.204n ± 0%  -28.10% (p=0.001 n=10)
BytesCompare/2        7.242n ±  6%    5.204n ± 0%  -28.14% (p=0.000 n=10)
BytesCompare/4        7.229n ±  5%    4.403n ± 0%  -39.10% (p=0.000 n=10)
BytesCompare/8        7.077n ± 36%    4.403n ± 0%  -37.78% (p=0.000 n=10)
BytesCompare/16       8.373n ±  6%    6.004n ± 0%  -28.30% (p=0.000 n=10)
BytesCompare/32       8.040n ±  3%    4.803n ± 0%  -40.26% (p=0.000 n=10)
BytesCompare/64       8.434n ± 24%   10.410n ± 0%  +23.42% (p=0.014 n=10)
BytesCompare/128     11.530n ± 23%    5.604n ± 0%  -51.40% (p=0.000 n=10)
BytesCompare/256     14.180n ±  0%    7.606n ± 0%  -46.36% (p=0.000 n=10)
BytesCompare/512      26.83n ±  0%    10.81n ± 0%  -59.71% (p=0.000 n=10)
BytesCompare/1024     52.60n ±  0%    17.21n ± 0%  -67.28% (p=0.000 n=10)
BytesCompare/2048    103.70n ±  0%    30.02n ± 0%  -71.05% (p=0.000 n=10)
geomean               13.49n          7.607n       -43.63%

goos: linux
goarch: loong64
pkg: bytes
cpu: Loongson-3A6000-HV @ 2500.00MHz
                                        │     old      │                 new                  │
                                        │    sec/op    │    sec/op     vs base                │
CompareBytesEqual                         5.603n ±  0%   5.604n ±  0%        ~ (p=0.191 n=10)
CompareBytesToNil                         3.202n ±  0%   3.202n ±  0%        ~ (p=1.000 n=10)
CompareBytesEmpty                         2.802n ±  0%   2.802n ±  0%        ~ (p=1.000 n=10)
CompareBytesIdentical                     3.202n ±  0%   2.538n ±  1%  -20.72% (p=0.000 n=10)
CompareBytesSameLength                    8.805n ±  0%   4.803n ±  0%  -45.45% (p=0.000 n=10)
CompareBytesDifferentLength               9.206n ±  0%   4.403n ±  0%  -52.17% (p=0.000 n=10)
CompareBytesBigUnaligned/offset=1         82.04µ ±  0%   45.91µ ±  0%  -44.04% (p=0.000 n=10)
CompareBytesBigUnaligned/offset=2         82.04µ ±  0%   45.91µ ±  0%  -44.04% (p=0.000 n=10)
CompareBytesBigUnaligned/offset=3         82.04µ ±  0%   45.91µ ±  0%  -44.04% (p=0.000 n=10)
CompareBytesBigUnaligned/offset=4         82.04µ ±  0%   45.91µ ±  0%  -44.04% (p=0.000 n=10)
CompareBytesBigUnaligned/offset=5         82.04µ ±  0%   45.91µ ±  0%  -44.04% (p=0.000 n=10)
CompareBytesBigUnaligned/offset=6         82.03µ ±  0%   45.93µ ±  0%  -44.01% (p=0.000 n=10)
CompareBytesBigUnaligned/offset=7         82.04µ ±  0%   45.93µ ±  0%  -44.01% (p=0.000 n=10)
CompareBytesBigBothUnaligned/offset=0     78.76µ ±  0%   45.69µ ±  0%  -41.98% (p=0.000 n=10)
CompareBytesBigBothUnaligned/offset=1     85.32µ ±  0%   46.04µ ±  0%  -46.03% (p=0.000 n=10)
CompareBytesBigBothUnaligned/offset=2     85.31µ ±  0%   46.04µ ±  0%  -46.03% (p=0.000 n=10)
CompareBytesBigBothUnaligned/offset=3     85.32µ ±  0%   46.04µ ±  0%  -46.03% (p=0.000 n=10)
CompareBytesBigBothUnaligned/offset=4     85.32µ ±  0%   46.04µ ±  0%  -46.03% (p=0.000 n=10)
CompareBytesBigBothUnaligned/offset=5     85.32µ ±  0%   46.04µ ±  0%  -46.03% (p=0.000 n=10)
CompareBytesBigBothUnaligned/offset=6     85.31µ ±  0%   46.06µ ±  0%  -46.02% (p=0.000 n=10)
CompareBytesBigBothUnaligned/offset=7     85.32µ ±  0%   52.32µ ±  7%  -38.68% (p=0.000 n=10)
CompareBytesBig                           78.76µ ±  0%   50.20µ ±  6%  -36.26% (p=0.000 n=10)
CompareBytesBigIdentical                  3.202n ±  0%   3.442n ± 24%        ~ (p=0.462 n=10)
geomean                                   4.197µ         2.630µ        -37.34%

Change-Id: I621145aef3e6a2c68e7127152f26ed047c6b2ece
Reviewed-on: https://go-review.googlesource.com/c/go/+/671315
LUCI-TryBot-Result: Go LUCI
Reviewed-by: Michael Knyszek
Reviewed-by: abner chenc
Reviewed-by: Cherry Mui
---
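Note for readers (an illustration, not part of the committed change): cmpbody<> keeps the
usual Compare contract — the first min(len(a), len(b)) bytes are compared as unsigned
values, and only when that shared prefix is equal does control fall through to cmp_len,
which orders the operands by length. A minimal portable Go sketch of that contract
(compareRef is an illustrative name, not a function in the Go tree):

package main

import "fmt"

// compareRef mirrors the -1/0/1 contract of bytes.Compare that the assembly
// below implements: compare the shared prefix byte by byte as unsigned values,
// then fall back to comparing lengths.
func compareRef(a, b []byte) int {
	n := min(len(a), len(b))
	for i := 0; i < n; i++ {
		if a[i] != b[i] {
			if a[i] < b[i] {
				return -1 // the MOVV $-1, R4 path after cmp_byte
			}
			return 1 // the SGTU result in cmp_byte
		}
	}
	// Shared prefix is equal: the shorter operand sorts first (cmp_len).
	switch {
	case len(a) < len(b):
		return -1
	case len(a) > len(b):
		return 1
	}
	return 0
}

func main() {
	fmt.Println(compareRef([]byte("abc"), []byte("abd")))  // -1
	fmt.Println(compareRef([]byte("abcd"), []byte("abc"))) // 1
}

The diff below only changes how fast that shared prefix is scanned: 32-byte LASX or
16-byte LSX vector blocks when internal/cpu reports hardware support, otherwise
8/4/2/1-byte integer loads.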
diff --git a/src/internal/bytealg/compare_loong64.s b/src/internal/bytealg/compare_loong64.s
index 99c8cda775..9330531964 100644
--- a/src/internal/bytealg/compare_loong64.s
+++ b/src/internal/bytealg/compare_loong64.s
@@ -23,139 +23,140 @@ TEXT runtime·cmpstring(SB),NOSPLIT,$0-40
 	// R7 = b_len
 	JMP	cmpbody<>(SB)
 
-// On entry:
-// R5 length of a
-// R7 length of b
-// R4 points to the start of a
-// R6 points to the start of b
+// input:
+// R4: points to the start of a
+// R5: length of a
+// R6: points to the start of b
+// R7: length of b
 // for regabi the return value (-1/0/1) in R4
 TEXT cmpbody<>(SB),NOSPLIT|NOFRAME,$0
-	BEQ	R4, R6, cmp_len	// same start of a and b, then compare lengths
+	BEQ	R4, R6, cmp_len		// same start of a and b, then compare lengths
 	SGTU	R5, R7, R9
 	BNE	R9, b_lt_a
 	MOVV	R5, R14
 	JMP	entry
+
 b_lt_a:
-	MOVV	R7, R14	// R14 is min(R5, R7)
+	MOVV	R7, R14
+
 entry:
-	ADDV	R4, R14, R12	// R4 start of a, R12 end of a
-	BEQ	R4, R12, cmp_len	// minlength is 0
+	BEQ	R14, cmp_len	// minlength is 0
+	MOVV	$32, R15
+	BGE	R14, R15, lasx
 tail:
-	MOVV	$2, R15
-	BLT	R14, R15, cmp1	// min < 2
-	SLLV	$1, R15
-	BLT	R14, R15, cmp2	// min < 4
-	SLLV	$1, R15
-	BLT	R14, R15, cmp4	// min < 8
-	SLLV	$1, R15
-	BLT	R14, R15, cmp8	// min < 16
-	SLLV	$1, R15
-	BLT	R14, R15, cmp16	// min < 32
-
-// When min >= 32 bytes, enter the cmp32_loop loop processing:
-// take out 4 8-bytes from a and b in turn for comparison.
-cmp32_loop:
-	MOVV	(R4), R8
-	MOVV	(R6), R9
-	MOVV	8(R4), R10
-	MOVV	8(R6), R11
-	BNE	R8, R9, cmp8a
-	BNE	R10, R11, cmp8b
-	MOVV	16(R4), R8
-	MOVV	16(R6), R9
-	MOVV	24(R4), R10
-	MOVV	24(R6), R11
-	BNE	R8, R9, cmp8a
-	BNE	R10, R11, cmp8b
-	ADDV	$32, R4
-	ADDV	$32, R6
-	SUBV	$32, R14
-	BGE	R14, R15, cmp32_loop
-	BEQ	R14, cmp_len
-
-check16:
-	MOVV	$16, R15
-	BLT	R14, R15, check8
-cmp16:
-	MOVV	(R4), R8
-	MOVV	(R6), R9
-	MOVV	8(R4), R10
-	MOVV	8(R6), R11
-	BNE	R8, R9, cmp8a
-	BNE	R10, R11, cmp8b
-	ADDV	$16, R4
-	ADDV	$16, R6
-	SUBV	$16, R14
-	BEQ	R14, cmp_len
-
-check8:
 	MOVV	$8, R15
-	BLT	R14, R15, check4
+	BLT	R14, R15, lt_8
+generic8_loop:
+	MOVV	(R4), R10
+	MOVV	(R6), R11
+	BEQ	R10, R11, generic8_equal
+
 cmp8:
-	MOVV	(R4), R8
-	MOVV	(R6), R9
-	BNE	R8, R9, cmp8a
+	AND	$0xff, R10, R16
+	AND	$0xff, R11, R17
+	BNE	R16, R17, cmp_byte
+
+	BSTRPICKV	$15, R10, $8, R16
+	BSTRPICKV	$15, R11, $8, R17
+	BNE	R16, R17, cmp_byte
+
+	BSTRPICKV	$23, R10, $16, R16
+	BSTRPICKV	$23, R11, $16, R17
+	BNE	R16, R17, cmp_byte
+
+	BSTRPICKV	$31, R10, $24, R16
+	BSTRPICKV	$31, R11, $24, R17
+	BNE	R16, R17, cmp_byte
+
+	BSTRPICKV	$39, R10, $32, R16
+	BSTRPICKV	$39, R11, $32, R17
+	BNE	R16, R17, cmp_byte
+
+	BSTRPICKV	$47, R10, $40, R16
+	BSTRPICKV	$47, R11, $40, R17
+	BNE	R16, R17, cmp_byte
+
+	BSTRPICKV	$55, R10, $48, R16
+	BSTRPICKV	$55, R11, $48, R17
+	BNE	R16, R17, cmp_byte
+
+	BSTRPICKV	$63, R10, $56, R16
+	BSTRPICKV	$63, R11, $56, R17
+	BNE	R16, R17, cmp_byte
+
+generic8_equal:
+	ADDV	$-8, R14
+	BEQ	R14, cmp_len
 	ADDV	$8, R4
 	ADDV	$8, R6
-	SUBV	$8, R14
-	BEQ	R14, cmp_len
+	BGE	R14, R15, generic8_loop
 
-check4:
+lt_8:
 	MOVV	$4, R15
-	BLT	R14, R15, check2
-cmp4:
-	MOVW	(R4), R8
-	MOVW	(R6), R9
-	BNE	R8, R9, cmp8a
+	BLT	R14, R15, lt_4
+
+	MOVWU	(R4), R10
+	MOVWU	(R6), R11
+	BEQ	R10, R11, lt_8_equal
+
+	AND	$0xff, R10, R16
+	AND	$0xff, R11, R17
+	BNE	R16, R17, cmp_byte
+
+	BSTRPICKV	$15, R10, $8, R16
+	BSTRPICKV	$15, R11, $8, R17
+	BNE	R16, R17, cmp_byte
+
+	BSTRPICKV	$23, R10, $16, R16
+	BSTRPICKV	$23, R11, $16, R17
+	BNE	R16, R17, cmp_byte
+
+	BSTRPICKV	$31, R10, $24, R16
+	BSTRPICKV	$31, R11, $24, R17
+	BNE	R16, R17, cmp_byte
+
+lt_8_equal:
+	ADDV	$-4, R14
+	BEQ	R14, cmp_len
 	ADDV	$4, R4
 	ADDV	$4, R6
-	SUBV	$4, R14
-	BEQ	R14, cmp_len
-check2:
+lt_4:
 	MOVV	$2, R15
-	BLT	R14, R15, cmp1
-cmp2:
-	MOVH	(R4), R8
-	MOVH	(R6), R9
-	BNE	R8, R9, cmp8a
+	BLT	R14, R15, lt_2
+
+	MOVHU	(R4), R10
+	MOVHU	(R6), R11
+	BEQ	R10, R11, lt_4_equal
+
+	AND	$0xff, R10, R16
+	AND	$0xff, R11, R17
+	BNE	R16, R17, cmp_byte
+
+	BSTRPICKV	$15, R10, $8, R16
+	BSTRPICKV	$15, R11, $8, R17
+	BNE	R16, R17, cmp_byte
+
+lt_4_equal:
+	ADDV	$-2, R14
+	BEQ	R14, cmp_len
 	ADDV	$2, R4
 	ADDV	$2, R6
-	SUBV	$2, R14
-	BEQ	R14, cmp_len
-cmp1:
-	BEQ	R14, cmp_len
-	MOVBU	(R4), R8
-	MOVBU	(R6), R9
-	BNE	R8, R9, byte_cmp
+lt_2:
+	MOVBU	(R4), R16
+	MOVBU	(R6), R17
+	BNE	R16, R17, cmp_byte
 	JMP	cmp_len
 
-	// Compare 8/4/2 bytes taken from R8/R9 that are known to differ.
-cmp8a:
-	MOVV	R8, R10
-	MOVV	R9, R11
-
-	// Compare 8/4/2 bytes taken from R10/R11 that are known to differ.
-cmp8b:
-	MOVV	$0xff, R15
-
-	// Take single bytes from R10/R11 in turn for cyclic comparison.
-cmp8_loop:
-	AND	R10, R15, R8
-	AND	R11, R15, R9
-	BNE	R8, R9, byte_cmp
-	SLLV	$8, R15
-	JMP	cmp8_loop
-
-	// Compare 1 bytes taken from R8/R9 that are known to differ.
-byte_cmp:
-	SGTU	R8, R9, R4	// R4 = 1 if (R8 > R9)
+	// Compare 1 byte taken from R16/R17 that are known to differ.
+cmp_byte:
+	SGTU	R16, R17, R4	// R4 = 1 if (R16 > R17)
 	BNE	R0, R4, ret
 	MOVV	$-1, R4
-	JMP	ret
+	RET
 
 cmp_len:
 	SGTU	R5, R7, R8
@@ -164,3 +165,199 @@ cmp_len:
 
 ret:
 	RET
+
+lasx:
+	MOVV	$64, R20
+	MOVBU	internal∕cpu·Loong64+const_offsetLOONG64HasLASX(SB), R9
+	BEQ	R9, lsx
+
+	MOVV	$128, R15
+	BLT	R14, R15, lasx32_loop
+lasx128_loop:
+	XVMOVQ	(R4), X0
+	XVMOVQ	(R6), X1
+	XVSEQB	X0, X1, X0
+	XVSETANYEQB	X0, FCC0
+	BFPT	lasx_found_0
+
+	XVMOVQ	32(R4), X0
+	XVMOVQ	32(R6), X1
+	XVSEQB	X0, X1, X0
+	XVSETANYEQB	X0, FCC0
+	BFPT	lasx_found_32
+
+	XVMOVQ	64(R4), X0
+	XVMOVQ	64(R6), X1
+	XVSEQB	X0, X1, X0
+	XVSETANYEQB	X0, FCC0
+	BFPT	lasx_found_64
+
+	XVMOVQ	96(R4), X0
+	XVMOVQ	96(R6), X1
+	XVSEQB	X0, X1, X0
+	XVSETANYEQB	X0, FCC0
+	BFPT	lasx_found_96
+
+	ADDV	$-128, R14
+	BEQ	R14, cmp_len
+	ADDV	$128, R4
+	ADDV	$128, R6
+	BGE	R14, R15, lasx128_loop
+
+	MOVV	$32, R15
+	BLT	R14, R15, tail
+lasx32_loop:
+	XVMOVQ	(R4), X0
+	XVMOVQ	(R6), X1
+	XVSEQB	X0, X1, X0
+	XVSETANYEQB	X0, FCC0
+	BFPT	lasx_found_0
+
+	ADDV	$-32, R14
+	BEQ	R14, cmp_len
+	ADDV	$32, R4
+	ADDV	$32, R6
+	BGE	R14, R15, lasx32_loop
+	JMP	tail
+
+lasx_found_0:
+	MOVV	R0, R11
+	JMP	lasx_find_byte
+
+lasx_found_32:
+	MOVV	$32, R11
+	JMP	lasx_find_byte
+
+lasx_found_64:
+	MOVV	$64, R11
+	JMP	lasx_find_byte
+
+lasx_found_96:
+	MOVV	$96, R11
+
+lasx_find_byte:
+	XVMOVQ	X0.V[0], R10
+	CTOV	R10, R10
+	BNE	R10, R20, find_byte
+	ADDV	$8, R11
+
+	XVMOVQ	X0.V[1], R10
+	CTOV	R10, R10
+	BNE	R10, R20, find_byte
+	ADDV	$8, R11
+
+	XVMOVQ	X0.V[2], R10
+	CTOV	R10, R10
+	BNE	R10, R20, find_byte
+	ADDV	$8, R11
+
+	XVMOVQ	X0.V[3], R10
+	CTOV	R10, R10
+	JMP	find_byte
+
+lsx:
+	MOVBU	internal∕cpu·Loong64+const_offsetLOONG64HasLSX(SB), R9
+	BEQ	R9, generic32_loop
+
+	MOVV	$64, R15
+	BLT	R14, R15, lsx16_loop
+lsx64_loop:
+	VMOVQ	(R4), V0
+	VMOVQ	(R6), V1
+	VSEQB	V0, V1, V0
+	VSETANYEQB	V0, FCC0
+	BFPT	lsx_found_0
+
+	VMOVQ	16(R4), V0
+	VMOVQ	16(R6), V1
+	VSEQB	V0, V1, V0
+	VSETANYEQB	V0, FCC0
+	BFPT	lsx_found_16
+
+	VMOVQ	32(R4), V0
+	VMOVQ	32(R6), V1
+	VSEQB	V0, V1, V0
+	VSETANYEQB	V0, FCC0
+	BFPT	lsx_found_32
+
+	VMOVQ	48(R4), V0
+	VMOVQ	48(R6), V1
+	VSEQB	V0, V1, V0
+	VSETANYEQB	V0, FCC0
+	BFPT	lsx_found_48
+
+	ADDV	$-64, R14
+	BEQ	R14, cmp_len
+	ADDV	$64, R4
+	ADDV	$64, R6
+	BGE	R14, R15, lsx64_loop
+
+	MOVV	$16, R15
+	BLT	R14, R15, tail
+lsx16_loop:
+	VMOVQ	(R4), V0
+	VMOVQ	(R6), V1
+	VSEQB	V0, V1, V0
+	VSETANYEQB	V0, FCC0
+	BFPT	lsx_found_0
+
+	ADDV	$-16, R14
+	BEQ	R14, cmp_len
+	ADDV	$16, R4
+	ADDV	$16, R6
+	BGE	R14, R15, lsx16_loop
+	JMP	tail
+
+lsx_found_0:
+	MOVV	R0, R11
+	JMP	lsx_find_byte
+
+lsx_found_16:
+	MOVV	$16, R11
+	JMP	lsx_find_byte
+
+lsx_found_32:
+	MOVV	$32, R11
+	JMP	lsx_find_byte
+
+lsx_found_48:
+	MOVV	$48, R11
+
+lsx_find_byte:
+	VMOVQ	V0.V[0], R10
+	CTOV	R10, R10
+	BNE	R10, R20, find_byte
+	ADDV	$8, R11
+
+	VMOVQ	V0.V[1], R10
+	CTOV	R10, R10
+
+find_byte:
+	SRLV	$3, R10
+	ADDV	R10, R11
+	ADDV	R11, R4
+	ADDV	R11, R6
+	MOVB	(R4), R16
+	MOVB	(R6), R17
+	JMP	cmp_byte
+
+generic32_loop:
+	MOVV	(R4), R10
+	MOVV	(R6), R11
+	BNE	R10, R11, cmp8
+	MOVV	8(R4), R10
+	MOVV	8(R6), R11
+	BNE	R10, R11, cmp8
+	MOVV	16(R4), R10
+	MOVV	16(R6), R11
+	BNE	R10, R11, cmp8
+	MOVV	24(R4), R10
+	MOVV	24(R6), R11
+	BNE	R10, R11, cmp8
+	ADDV	$-32, R14
+	BEQ	R14, cmp_len
+	ADDV	$32, R4
+	ADDV	$32, R6
+	MOVV	$32, R15
+	BGE	R14, R15, generic32_loop
+	JMP	tail
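A note on the mismatch-narrowing step in the vector paths above (illustrative, not part
of the change): VSEQB/XVSEQB turn each equal byte into 0xff and each differing byte into
0x00, CTOV counts the trailing one bits of each 64-bit lane of that mask, and find_byte
shifts the count right by 3 to get the offset of the first differing byte, which
cmp_byte then compares as a single unsigned byte (generic8_loop instead extracts bytes
with BSTRPICKV). A portable Go analogue of the same narrowing idea on one 8-byte chunk,
assuming little-endian loads (firstDiff is a hypothetical helper, not a function in the
Go tree):

package main

import (
	"encoding/binary"
	"fmt"
	"math/bits"
)

// firstDiff reports the index of the first differing byte between two 8-byte
// chunks. With little-endian loads, the trailing zero count of a XOR b divided
// by 8 plays the same role as CTOV on the VSEQB equality mask followed by the
// SRLV $3 in find_byte.
func firstDiff(a, b []byte) (int, bool) {
	x := binary.LittleEndian.Uint64(a)
	y := binary.LittleEndian.Uint64(b)
	if x == y {
		return 0, false // whole chunk equal, keep scanning
	}
	return bits.TrailingZeros64(x^y) / 8, true
}

func main() {
	a := []byte("01234567")
	b := []byte("01230567")
	i, ok := firstDiff(a, b)
	fmt.Println(i, ok)       // 4 true: bytes 0-3 match, byte 4 differs
	fmt.Println(a[i] > b[i]) // true, so Compare(a, b) would return 1
}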