From: Xiaolin Zhao Date: Mon, 3 Jun 2024 03:45:04 +0000 (+0800) Subject: src/internal/bytealg: optimize the function Compare on loong64 X-Git-Tag: go1.24rc1~926 X-Git-Url: http://www.git.cypherpunks.su/?a=commitdiff_plain;h=50e536daa18a57d29ece6209320fff5a837a749c;p=gostls13.git src/internal/bytealg: optimize the function Compare on loong64 The relevant performance improved by 66.73%. benchmark: goos: linux goarch: loong64 pkg: bytes cpu: Loongson-3A6000 @ 2500.00MHz │ old │ new │ │ sec/op │ sec/op vs base │ BytesCompare/1 5.603n ± 0% 4.002n ± 0% -28.57% (p=0.000 n=20) BytesCompare/2 6.405n ± 0% 4.002n ± 0% -37.52% (p=0.000 n=20) BytesCompare/4 8.007n ± 0% 4.002n ± 0% -50.02% (p=0.000 n=20) BytesCompare/8 11.210n ± 0% 4.002n ± 0% -64.30% (p=0.000 n=20) BytesCompare/16 6.005n ± 0% 4.802n ± 0% -20.03% (p=0.000 n=20) BytesCompare/32 6.806n ± 0% 4.402n ± 0% -35.32% (p=0.000 n=20) BytesCompare/64 8.407n ± 0% 6.003n ± 0% -28.60% (p=0.000 n=20) BytesCompare/128 11.610n ± 0% 8.404n ± 0% -27.61% (p=0.000 n=20) BytesCompare/256 18.02n ± 0% 14.01n ± 0% -22.25% (p=0.000 n=20) BytesCompare/512 31.23n ± 0% 26.98n ± 0% -13.61% (p=0.000 n=20) BytesCompare/1024 56.85n ± 0% 52.43n ± 0% -7.77% (p=0.000 n=20) BytesCompare/2048 108.1n ± 0% 103.8n ± 0% -3.98% (p=0.000 n=20) CompareBytesEqual 15.610n ± 0% 5.203n ± 0% -66.67% (p=0.000 n=20) CompareBytesToNil 3.203n ± 0% 3.202n ± 0% -0.03% (p=0.000 n=20) CompareBytesEmpty 3.203n ± 0% 2.423n ± 0% -24.35% (p=0.000 n=20) CompareBytesIdentical 3.203n ± 0% 2.424n ± 0% -24.32% (p=0.000 n=20) CompareBytesSameLength 8.407n ± 0% 8.004n ± 0% -4.79% (p=0.000 n=20) CompareBytesDifferentLength 8.808n ± 0% 7.604n ± 0% -13.67% (p=0.000 n=20) CompareBytesBigUnaligned/offset=1 839.85µ ± 0% 82.04µ ± 0% -90.23% (p=0.000 n=20) CompareBytesBigUnaligned/offset=2 839.86µ ± 0% 82.03µ ± 0% -90.23% (p=0.000 n=20) CompareBytesBigUnaligned/offset=3 839.86µ ± 0% 82.03µ ± 0% -90.23% (p=0.000 n=20) CompareBytesBigUnaligned/offset=4 839.86µ ± 0% 82.03µ ± 0% -90.23% (p=0.000 n=20) CompareBytesBigUnaligned/offset=5 839.85µ ± 0% 82.04µ ± 0% -90.23% (p=0.000 n=20) CompareBytesBigUnaligned/offset=6 839.85µ ± 0% 82.03µ ± 0% -90.23% (p=0.000 n=20) CompareBytesBigUnaligned/offset=7 839.85µ ± 0% 82.03µ ± 0% -90.23% (p=0.000 n=20) CompareBytesBigBothUnaligned/offset=0 78.77µ ± 0% 78.75µ ± 0% -0.03% (p=0.000 n=20) CompareBytesBigBothUnaligned/offset=1 839.84µ ± 0% 85.31µ ± 0% -89.84% (p=0.000 n=20) CompareBytesBigBothUnaligned/offset=2 839.84µ ± 0% 85.31µ ± 0% -89.84% (p=0.000 n=20) CompareBytesBigBothUnaligned/offset=3 839.85µ ± 0% 85.31µ ± 0% -89.84% (p=0.000 n=20) CompareBytesBigBothUnaligned/offset=4 839.83µ ± 0% 85.31µ ± 0% -89.84% (p=0.000 n=20) CompareBytesBigBothUnaligned/offset=5 839.85µ ± 0% 85.31µ ± 0% -89.84% (p=0.000 n=20) CompareBytesBigBothUnaligned/offset=6 839.85µ ± 0% 85.31µ ± 0% -89.84% (p=0.000 n=20) CompareBytesBigBothUnaligned/offset=7 839.84µ ± 0% 85.31µ ± 0% -89.84% (p=0.000 n=20) CompareBytesBig 78.77µ ± 0% 78.75µ ± 0% -0.03% (p=0.001 n=20) CompareBytesBigIdentical 2.802n ± 0% 2.801n ± 0% -0.04% (p=0.001 n=20) geomean 1.524µ 507.2n -66.73% Change-Id: Ice9f4ef0ce0fbb5a6424823c5f8e0c0c369fd159 Reviewed-on: https://go-review.googlesource.com/c/go/+/589538 LUCI-TryBot-Result: Go LUCI Reviewed-by: Tim King Reviewed-by: abner chenc Auto-Submit: Tim King Reviewed-by: Dmitri Shuralyov --- diff --git a/src/internal/bytealg/compare_loong64.s b/src/internal/bytealg/compare_loong64.s index df72a1122b..99c8cda775 100644 --- a/src/internal/bytealg/compare_loong64.s +++ b/src/internal/bytealg/compare_loong64.s @@ -28,58 +28,136 @@ TEXT runtime·cmpstring(SB),NOSPLIT,$0-40 // R7 length of b // R4 points to the start of a // R6 points to the start of b -// R13 points to the return value (-1/0/1) +// for regabi the return value (-1/0/1) in R4 TEXT cmpbody<>(SB),NOSPLIT|NOFRAME,$0 - BEQ R4, R6, samebytes // same start of a and b + BEQ R4, R6, cmp_len // same start of a and b, then compare lengths SGTU R5, R7, R9 - BNE R0, R9, r2_lt_r1 + BNE R9, b_lt_a MOVV R5, R14 JMP entry -r2_lt_r1: - MOVV R7, R14 // R14 is min(R4, R5) +b_lt_a: + MOVV R7, R14 // R14 is min(R5, R7) entry: - ADDV R4, R14, R12 // R6 start of a, R14 end of a - BEQ R4, R12, samebytes // length is 0 + ADDV R4, R14, R12 // R4 start of a, R12 end of a + BEQ R4, R12, cmp_len // minlength is 0 - SRLV $4, R14 // R14 is number of chunks - BEQ R0, R14, byte_loop +tail: + MOVV $2, R15 + BLT R14, R15, cmp1 // min < 2 + SLLV $1, R15 + BLT R14, R15, cmp2 // min < 4 + SLLV $1, R15 + BLT R14, R15, cmp4 // min < 8 + SLLV $1, R15 + BLT R14, R15, cmp8 // min < 16 + SLLV $1, R15 + BLT R14, R15, cmp16 // min < 32 - // make sure both a and b are aligned. - OR R4, R6, R15 - AND $7, R15 - BNE R0, R15, byte_loop +// When min >= 32 bytes, enter the cmp32_loop loop processing: +// take out 4 8-bytes from a and b in turn for comparison. +cmp32_loop: + MOVV (R4), R8 + MOVV (R6), R9 + MOVV 8(R4), R10 + MOVV 8(R6), R11 + BNE R8, R9, cmp8a + BNE R10, R11, cmp8b + MOVV 16(R4), R8 + MOVV 16(R6), R9 + MOVV 24(R4), R10 + MOVV 24(R6), R11 + BNE R8, R9, cmp8a + BNE R10, R11, cmp8b + ADDV $32, R4 + ADDV $32, R6 + SUBV $32, R14 + BGE R14, R15, cmp32_loop + BEQ R14, cmp_len - PCALIGN $16 -chunk16_loop: - BEQ R0, R14, byte_loop +check16: + MOVV $16, R15 + BLT R14, R15, check8 +cmp16: MOVV (R4), R8 MOVV (R6), R9 - BNE R8, R9, byte_loop - MOVV 8(R4), R16 - MOVV 8(R6), R17 + MOVV 8(R4), R10 + MOVV 8(R6), R11 + BNE R8, R9, cmp8a + BNE R10, R11, cmp8b ADDV $16, R4 ADDV $16, R6 - SUBVU $1, R14 - BEQ R16, R17, chunk16_loop - SUBV $8, R4 - SUBV $8, R6 + SUBV $16, R14 + BEQ R14, cmp_len + +check8: + MOVV $8, R15 + BLT R14, R15, check4 +cmp8: + MOVV (R4), R8 + MOVV (R6), R9 + BNE R8, R9, cmp8a + ADDV $8, R4 + ADDV $8, R6 + SUBV $8, R14 + BEQ R14, cmp_len -byte_loop: - BEQ R4, R12, samebytes +check4: + MOVV $4, R15 + BLT R14, R15, check2 +cmp4: + MOVW (R4), R8 + MOVW (R6), R9 + BNE R8, R9, cmp8a + ADDV $4, R4 + ADDV $4, R6 + SUBV $4, R14 + BEQ R14, cmp_len + +check2: + MOVV $2, R15 + BLT R14, R15, cmp1 +cmp2: + MOVH (R4), R8 + MOVH (R6), R9 + BNE R8, R9, cmp8a + ADDV $2, R4 + ADDV $2, R6 + SUBV $2, R14 + BEQ R14, cmp_len + +cmp1: + BEQ R14, cmp_len MOVBU (R4), R8 - ADDVU $1, R4 MOVBU (R6), R9 - ADDVU $1, R6 - BEQ R8, R9, byte_loop + BNE R8, R9, byte_cmp + JMP cmp_len + + // Compare 8/4/2 bytes taken from R8/R9 that are known to differ. +cmp8a: + MOVV R8, R10 + MOVV R9, R11 + + // Compare 8/4/2 bytes taken from R10/R11 that are known to differ. +cmp8b: + MOVV $0xff, R15 + + // Take single bytes from R10/R11 in turn for cyclic comparison. +cmp8_loop: + AND R10, R15, R8 + AND R11, R15, R9 + BNE R8, R9, byte_cmp + SLLV $8, R15 + JMP cmp8_loop + // Compare 1 bytes taken from R8/R9 that are known to differ. byte_cmp: - SGTU R8, R9, R4 // R12 = 1 if (R8 > R9) + SGTU R8, R9, R4 // R4 = 1 if (R8 > R9) BNE R0, R4, ret MOVV $-1, R4 JMP ret -samebytes: +cmp_len: SGTU R5, R7, R8 SGTU R7, R5, R9 SUBV R9, R8, R4