From: limeidan
Date: Wed, 23 Apr 2025 03:04:15 +0000 (+0800)
Subject: internal/bytealg: optimize the function indexbyte using SIMD on loong64
X-Git-Tag: go1.25rc1~319
X-Git-Url: http://www.git.cypherpunks.su/?a=commitdiff_plain;h=7bc714d60173af7e783fc69e1bc532ab4aab11ba;p=gostls13.git

internal/bytealg: optimize the function indexbyte using SIMD on loong64

goos: linux
goarch: loong64
pkg: bytes
cpu: Loongson-3C5000 @ 2200.00MHz
              │     old      │                 new                 │
              │    sec/op    │    sec/op     vs base               │
IndexByte/10    19.32n ± 0%    11.84n ± 0%  -38.72% (p=0.000 n=10)
IndexByte/32    49.34n ± 0%    14.11n ± 0%  -71.40% (p=0.000 n=10)
IndexByte/4K   5608.0n ± 0%    138.8n ± 0%  -97.52% (p=0.000 n=10)
IndexByte/4M   3822.8µ ± 0%    119.4µ ± 0%  -96.88% (p=0.000 n=10)
IndexByte/64M  61.826m ± 1%    3.812m ± 0%  -93.83% (p=0.000 n=10)
geomean         16.61µ         1.602µ       -90.35%

goos: linux
goarch: loong64
pkg: bytes
cpu: Loongson-3A6000-HV @ 2500.00MHz
              │     old       │                  new                 │
              │    sec/op     │    sec/op      vs base               │
IndexByte/10     6.809n ± 0%     5.804n ±  0%  -14.75% (p=0.000 n=10)
IndexByte/32    16.015n ± 0%     6.404n ±  0%  -60.01% (p=0.000 n=10)
IndexByte/4K   1651.00n ± 0%     52.83n ±  0%  -96.80% (p=0.000 n=10)
IndexByte/4M   1680.76µ ± 0%     91.10µ ±  0%  -94.58% (p=0.000 n=10)
IndexByte/64M   26.878m ± 0%     2.010m ± 27%  -92.52% (p=0.000 n=10)
geomean          6.054µ          815.0n        -86.54%

Change-Id: Ib75b997249708f921c6717eba43543c6650bf376
Reviewed-on: https://go-review.googlesource.com/c/go/+/668055
Reviewed-by: Keith Randall
Reviewed-by: abner chenc
Reviewed-by: Cherry Mui
LUCI-TryBot-Result: Go LUCI
Reviewed-by: sophie zhao
---

diff --git a/src/internal/bytealg/indexbyte_loong64.s b/src/internal/bytealg/indexbyte_loong64.s
index c9591b3cda..3618501063 100644
--- a/src/internal/bytealg/indexbyte_loong64.s
+++ b/src/internal/bytealg/indexbyte_loong64.s
@@ -5,48 +5,288 @@
 #include "go_asm.h"
 #include "textflag.h"
 
+// input:
+// R4 = b_base
+// R5 = b_len
+// R6 = b_cap (unused)
+// R7 = byte to find
 TEXT ·IndexByte(SB),NOSPLIT,$0-40
-	// R4 = b_base
-	// R5 = b_len
-	// R6 = b_cap (unused)
-	// R7 = byte to find
 	AND	$0xff, R7
+	JMP	indexbytebody<>(SB)
+
+// input:
+// R4 = s_base
+// R5 = s_len
+// R6 = byte to find
+TEXT ·IndexByteString(SB),NOSPLIT,$0-32
+	AND	$0xff, R6, R7	// byte to find
+	JMP	indexbytebody<>(SB)
+
+// input:
+// R4: b_base
+// R5: len
+// R7: byte to find
+TEXT indexbytebody<>(SB),NOSPLIT,$0
+	BEQ	R5, notfound	// len == 0
+
 	MOVV	R4, R6		// store base for later
-	ADDV	R4, R5		// end
-	ADDV	$-1, R4
+	ADDV	R4, R5, R8	// end
+
+	MOVV	$32, R9
+	BGE	R5, R9, lasx
+tail:
+	MOVV	$8, R9
+	BLT	R5, R9, lt_8
+generic8_loop:
+	MOVV	(R4), R10
+
+	AND	$0xff, R10, R11
+	BEQ	R7, R11, found
+
+	BSTRPICKV	$15, R10, $8, R11
+	BEQ	R7, R11, byte_1th
+
+	BSTRPICKV	$23, R10, $16, R11
+	BEQ	R7, R11, byte_2th
+
+	BSTRPICKV	$31, R10, $24, R11
+	BEQ	R7, R11, byte_3th
 
-	PCALIGN	$16
-loop:
+	BSTRPICKV	$39, R10, $32, R11
+	BEQ	R7, R11, byte_4th
+
+	BSTRPICKV	$47, R10, $40, R11
+	BEQ	R7, R11, byte_5th
+
+	BSTRPICKV	$55, R10, $48, R11
+	BEQ	R7, R11, byte_6th
+
+	BSTRPICKV	$63, R10, $56, R11
+	BEQ	R7, R11, byte_7th
+
+	ADDV	$8, R4
+	ADDV	$-8, R5
+	BGE	R5, R9, generic8_loop
+
+lt_8:
+	BEQ	R4, R8, notfound
+	MOVBU	(R4), R10
+	BEQ	R7, R10, found
 	ADDV	$1, R4
-	BEQ	R4, R5, notfound
-	MOVBU	(R4), R8
-	BNE	R7, R8, loop
+	JMP	lt_8
 
-	SUBV	R6, R4		// remove base
+byte_1th:
+	ADDV	$1, R4
+	SUBV	R6, R4
 	RET
 
-notfound:
-	MOVV	$-1, R4
+byte_2th:
+	ADDV	$2, R4
+	SUBV	R6, R4
 	RET
 
-TEXT ·IndexByteString(SB),NOSPLIT,$0-32
-	// R4 = s_base
-	// R5 = s_len
-	// R6 = byte to find
-	MOVV	R4, R7		// store base for later
-	ADDV	R4, R5		// end
-	ADDV	$-1, R4
-
-	PCALIGN	$16
-loop:
-	ADDV	$1, R4
-	BEQ	R4, R5, notfound
-	MOVBU	(R4), R8
-	BNE	R6, R8, loop
+byte_3th:
+	ADDV	$3, R4
+	SUBV	R6, R4
+	RET
+
+byte_4th:
+	ADDV	$4, R4
+	SUBV	R6, R4
+	RET
+
+byte_5th:
+	ADDV	$5, R4
+	SUBV	R6, R4
+	RET
+
+byte_6th:
+	ADDV	$6, R4
+	SUBV	R6, R4
+	RET
+
+byte_7th:
+	ADDV	$7, R4
 
-	SUBV	R7, R4		// remove base
+found:
+	SUBV	R6, R4
 	RET
 
 notfound:
 	MOVV	$-1, R4
 	RET
+
+lasx:
+	MOVBU	internal∕cpu·Loong64+const_offsetLOONG64HasLASX(SB), R9
+	BEQ	R9, lsx
+	XVMOVQ	R7, X0.B32
+
+	MOVV	$128, R9
+	BLT	R5, R9, lasx32_loop
+lasx128_loop:
+	XVMOVQ	0(R4), X1
+	XVMOVQ	32(R4), X2
+	XVMOVQ	64(R4), X3
+	XVMOVQ	96(R4), X4
+
+	XVSEQB	X1, X0, X1
+	XVSETNEV	X1, FCC0
+	BFPT	lasx_found_add_0
+
+	XVSEQB	X2, X0, X1
+	XVSETNEV	X1, FCC0
+	BFPT	lasx_found_add_32
+
+	XVSEQB	X3, X0, X1
+	XVSETNEV	X1, FCC0
+	BFPT	lasx_found_add_64
+
+	XVSEQB	X4, X0, X1
+	XVSETNEV	X1, FCC0
+	BFPT	lasx_found_add_96
+
+	ADDV	$128, R4
+	ADDV	$-128, R5
+	BGE	R5, R9, lasx128_loop
+
+	BEQ	R5, notfound
+
+	MOVV	$32, R9
+	BLT	R5, R9, tail
+lasx32_loop:
+	XVMOVQ	0(R4), X1
+
+	XVSEQB	X1, X0, X1
+	XVSETNEV	X1, FCC0
+	BFPT	lasx_found_add_0
+
+	ADDV	$32, R4
+	ADDV	$-32, R5
+	BGE	R5, R9, lasx32_loop
+
+	BEQ	R5, notfound
+
+	JMP	tail
+
+lasx_found_add_0:
+	MOVV	R0, R11
+	JMP	lasx_index_cal
+
+lasx_found_add_32:
+	MOVV	$32, R11
+	JMP	lasx_index_cal
+
+lasx_found_add_64:
+	MOVV	$64, R11
+	JMP	lasx_index_cal
+
+lasx_found_add_96:
+	MOVV	$96, R11
+	JMP	lasx_index_cal
+
+lasx_index_cal:
+	MOVV	$64, R9
+	XVMOVQ	X1.V[0], R10
+	CTZV	R10, R10
+	BNE	R10, R9, index_cal
+	ADDV	$8, R11
+
+	XVMOVQ	X1.V[1], R10
+	CTZV	R10, R10
+	BNE	R10, R9, index_cal
+	ADDV	$8, R11
+
+	XVMOVQ	X1.V[2], R10
+	CTZV	R10, R10
+	BNE	R10, R9, index_cal
+	ADDV	$8, R11
+
+	XVMOVQ	X1.V[3], R10
+	CTZV	R10, R10
+	JMP	index_cal
+
+lsx:
+	MOVBU	internal∕cpu·Loong64+const_offsetLOONG64HasLSX(SB), R9
+	BEQ	R9, tail
+	VMOVQ	R7, V0.B16
+
+	MOVV	$64, R9
+	BLT	R5, R9, lsx16_loop
+lsx64_loop:
+	VMOVQ	0(R4), V1
+	VMOVQ	16(R4), V2
+	VMOVQ	32(R4), V3
+	VMOVQ	48(R4), V4
+
+	VSEQB	V1, V0, V1
+	VSETNEV	V1, FCC0
+	BFPT	lsx_found_add_0
+
+	VSEQB	V2, V0, V1
+	VSETNEV	V1, FCC0
+	BFPT	lsx_found_add_16
+
+	VSEQB	V3, V0, V1
+	VSETNEV	V1, FCC0
+	BFPT	lsx_found_add_32
+
+	VSEQB	V4, V0, V1
+	VSETNEV	V1, FCC0
+	BFPT	lsx_found_add_48
+
+	ADDV	$64, R4
+	ADDV	$-64, R5
+	BGE	R5, R9, lsx64_loop
+
+	BEQ	R5, notfound
+
+	MOVV	$16, R9
+	BLT	R5, R9, tail
+lsx16_loop:
+	VMOVQ	0(R4), V1
+
+	VSEQB	V1, V0, V1
+	VSETNEV	V1, FCC0
+	BFPT	lsx_found_add_0
+
+	ADDV	$16, R4
+	ADDV	$-16, R5
+	BGE	R5, R9, lsx16_loop
+
+	BEQ	R5, notfound
+
+	JMP	tail
+
+lsx_found_add_0:
+	MOVV	R0, R11
+	JMP	lsx_index_cal
+
+lsx_found_add_16:
+	MOVV	$16, R11
+	JMP	lsx_index_cal
+
+lsx_found_add_32:
+	MOVV	$32, R11
+	JMP	lsx_index_cal
+
+lsx_found_add_48:
+	MOVV	$48, R11
+	JMP	lsx_index_cal
+
+lsx_index_cal:
+	MOVV	$64, R9
+
+	VMOVQ	V1.V[0], R10
+	CTZV	R10, R10
+	BNE	R10, R9, index_cal
+	ADDV	$8, R11
+
+	VMOVQ	V1.V[1], R10
+	CTZV	R10, R10
+	JMP	index_cal
+
+index_cal:
+	SRLV	$3, R10
+	ADDV	R11, R10
+	ADDV	R10, R4
+	JMP	found
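
Note on the approach (illustration only, not part of the patch): the new
indexbytebody<> routine broadcasts the target byte into a vector register
(XVMOVQ/VMOVQ), compares a whole 32-byte LASX or 16-byte LSX block per step
(XVSEQB/VSEQB), checks whether any lane matched (XVSETNEV/VSETNEV plus BFPT),
and converts the first matching lane into a byte index with CTZV in the
index_cal step; inputs shorter than a vector fall back to the 8-byte and
1-byte tails. The pure-Go sketch below is a rough scalar model of that
strategy; the names indexByteModel and blockSize are invented for the sketch
and do not appear in the Go tree.

package main

import "fmt"

// blockSize stands in for the vector width (32 bytes for LASX, 16 for LSX).
const blockSize = 32

// indexByteModel is an illustrative scalar model of the vectorized search:
// build a per-byte match mask for each block (XVSEQB/VSEQB in the assembly),
// test whether any entry is set (XVSETNEV/BFPT), and convert the first set
// entry back into an index (the CTZV-based index_cal step).
func indexByteModel(b []byte, c byte) int {
	base := 0
	for len(b)-base >= blockSize {
		var mask [blockSize]bool
		hit := false
		for i := 0; i < blockSize; i++ { // models the per-byte vector compare
			if b[base+i] == c {
				mask[i] = true
				hit = true
			}
		}
		if hit { // models "any lane non-zero?"
			for i, m := range mask { // models finding the first matching lane
				if m {
					return base + i
				}
			}
		}
		base += blockSize
	}
	// Scalar tail for the remaining bytes, as in the tail/lt_8 paths.
	for ; base < len(b); base++ {
		if b[base] == c {
			return base
		}
	}
	return -1
}

func main() {
	fmt.Println(indexByteModel([]byte("hello, loong64"), 'l')) // prints 2
}

The point of the block structure is that the compare and the any-match test
cost a fixed handful of instructions per 16- or 32-byte block instead of one
load and branch per byte, which is roughly where the large improvements on
the 4K-64M benchmark sizes above come from.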