Cypherpunks repositories - gostls13.git/commitdiff
internal/bytealg: optimize Index/IndexString on loong64
author: limeidan <limeidan@loongson.cn>
Thu, 7 Aug 2025 03:34:28 +0000 (11:34 +0800)
committer: abner chenc <chenguoqi@loongson.cn>
Fri, 8 Aug 2025 03:32:55 +0000 (20:32 -0700)
goos: linux
goarch: loong64
pkg: bytes
cpu: Loongson-3A6000 @ 2500.00MHz
                              | 3a6000.old.txt |           3a6000.new.txt            |
                              |     sec/op     |   sec/op     vs base                |
IndexRune/10                      23.56n ±  1%   20.42n ± 0%  -13.33% (p=0.000 n=10)
IndexRune/32                      29.91n ±  1%   22.46n ± 0%  -24.90% (p=0.000 n=10)
IndexRune/4K                     102.45n ±  2%   72.66n ± 0%  -29.08% (p=0.000 n=10)
IndexRune/4M                     111.96µ ±  1%   52.50µ ± 1%  -53.11% (p=0.000 n=10)
IndexRune/64M                     3.653m ± 30%   3.633m ± 0%        ~ (p=0.143 n=10)
IndexRuneASCII/10                 8.736n ±  2%   7.206n ± 0%  -17.51% (p=0.000 n=10)
IndexRuneASCII/32                10.195n ±  2%   8.008n ± 0%  -21.45% (p=0.000 n=10)
IndexRuneASCII/4K                 70.27n ±  2%   52.84n ± 0%  -24.80% (p=0.000 n=10)
IndexRuneASCII/4M                 98.15µ ±  1%   87.87µ ± 1%  -10.47% (p=0.000 n=10)
IndexRuneASCII/64M                2.028m ±  0%   1.918m ± 2%   -5.41% (p=0.000 n=10)
IndexRuneUnicode/Latin/10         18.80n ±  1%   13.61n ± 0%  -27.59% (p=0.000 n=10)
IndexRuneUnicode/Latin/32         28.09n ±  2%   20.82n ± 0%  -25.88% (p=0.000 n=10)
IndexRuneUnicode/Latin/4K         373.8n ±  1%   357.1n ± 0%   -4.47% (p=0.000 n=10)
IndexRuneUnicode/Latin/4M         395.8µ ±  0%   381.0µ ± 0%   -3.74% (p=0.000 n=10)
IndexRuneUnicode/Latin/64M        8.056m ±  0%   7.614m ± 0%   -5.49% (p=0.000 n=10)
IndexRuneUnicode/Cyrillic/10      23.72n ±  1%   20.42n ± 0%  -13.91% (p=0.000 n=10)
IndexRuneUnicode/Cyrillic/32      30.20n ±  1%   22.42n ± 0%  -25.77% (p=0.000 n=10)
IndexRuneUnicode/Cyrillic/4K      1.134µ ±  1%   1.122µ ± 0%   -1.06% (p=0.000 n=10)
IndexRuneUnicode/Cyrillic/4M      1.160m ±  1%   1.152m ± 0%   -0.72% (p=0.005 n=10)
IndexRuneUnicode/Cyrillic/64M     20.26m ±  1%   19.61m ± 0%   -3.24% (p=0.000 n=10)
IndexRuneUnicode/Han/10           30.11n ±  2%   24.82n ± 0%  -17.57% (p=0.000 n=10)
IndexRuneUnicode/Han/32           36.16n ±  2%   27.20n ± 0%  -24.78% (p=0.000 n=10)
IndexRuneUnicode/Han/4K           548.1n ±  0%   524.8n ± 0%   -4.25% (p=0.000 n=10)
IndexRuneUnicode/Han/4M           706.7µ ±  1%   624.0µ ± 0%  -11.70% (p=0.000 n=10)
IndexRuneUnicode/Han/64M          12.50m ±  1%   10.84m ± 1%  -13.24% (p=0.000 n=10)
Index/10                          42.03n ±  2%   10.01n ± 0%  -76.18% (p=0.000 n=10)
Index/32                         133.15n ±  1%   40.03n ± 0%  -69.94% (p=0.000 n=10)
Index/4K                         11.647µ ±  1%   2.493µ ± 0%  -78.60% (p=0.000 n=10)
Index/4M                         11.536m ±  0%   2.519m ± 0%  -78.16% (p=0.000 n=10)
Index/64M                        184.60m ±  1%   40.42m ± 0%  -78.10% (p=0.000 n=10)
IndexEasy/10                     17.290n ±  2%   9.608n ± 0%  -44.43% (p=0.000 n=10)
IndexEasy/32                      23.71n ±  2%   16.61n ± 0%  -29.95% (p=0.000 n=10)
IndexEasy/4K                      95.64n ±  2%   68.25n ± 0%  -28.64% (p=0.000 n=10)
IndexEasy/4M                     105.04µ ±  1%   91.94µ ± 0%  -12.47% (p=0.000 n=10)
IndexEasy/64M                     4.280m ±  0%   4.264m ± 0%   -0.38% (p=0.002 n=10)
Count/10                          53.09n ±  1%   16.81n ± 0%  -68.33% (p=0.000 n=10)
Count/32                         142.20n ±  2%   46.44n ± 0%  -67.34% (p=0.000 n=10)
Count/4K                         11.428µ ±  1%   2.500µ ± 1%  -78.12% (p=0.000 n=10)
Count/4M                         11.536m ±  1%   2.520m ± 0%  -78.16% (p=0.000 n=10)
Count/64M                        183.80m ±  1%   40.42m ± 0%  -78.01% (p=0.000 n=10)
IndexHard1                       2906.4µ ±  1%   420.4µ ± 0%  -85.54% (p=0.000 n=10)
IndexHard2                       2918.0µ ±  1%   421.1µ ± 1%  -85.57% (p=0.000 n=10)
IndexHard3                       2912.8µ ±  1%   440.2µ ± 0%  -84.89% (p=0.000 n=10)
IndexHard4                       2909.6µ ±  1%   840.4µ ± 0%  -71.12% (p=0.000 n=10)
LastIndexHard1                    2.939m ±  1%   2.621m ± 0%  -10.83% (p=0.000 n=10)
LastIndexHard2                    2.924m ±  1%   2.624m ± 0%  -10.26% (p=0.000 n=10)
LastIndexHard3                    2.936m ±  1%   2.580m ± 1%  -12.12% (p=0.000 n=10)
CountHard1                       2900.4µ ±  1%   420.0µ ± 0%  -85.52% (p=0.000 n=10)
CountHard2                       2915.6µ ±  1%   420.0µ ± 0%  -85.59% (p=0.000 n=10)
CountHard3                       2905.0µ ±  0%   440.0µ ± 0%  -84.85% (p=0.000 n=10)
IndexPeriodic/IndexPeriodic2     181.95µ ±  1%   26.28µ ± 0%  -85.56% (p=0.000 n=10)
IndexPeriodic/IndexPeriodic4     182.59µ ±  1%   26.29µ ± 0%  -85.60% (p=0.000 n=10)
IndexPeriodic/IndexPeriodic8      183.9µ ±  1%   108.2µ ± 0%  -41.14% (p=0.000 n=10)
IndexPeriodic/IndexPeriodic16     58.24µ ±  0%   56.58µ ± 0%   -2.86% (p=0.000 n=10)
IndexPeriodic/IndexPeriodic32     30.82µ ±  0%   29.62µ ± 0%   -3.92% (p=0.000 n=10)
IndexPeriodic/IndexPeriodic64     16.59µ ±  0%   15.00µ ± 0%   -9.62% (p=0.000 n=10)
geomean                           22.69µ         11.59µ       -48.92%

Change-Id: Iacc9e686027f99bb0413b566cfc8ee6cd873d2d9
Reviewed-on: https://go-review.googlesource.com/c/go/+/693878
Reviewed-by: abner chenc <chenguoqi@loongson.cn>
Reviewed-by: Dmitri Shuralyov <dmitshur@google.com>
Reviewed-by: Mark Freeman <markfreeman@google.com>
LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com>

src/internal/bytealg/index_generic.go
src/internal/bytealg/index_loong64.go [new file with mode: 0644]
src/internal/bytealg/index_loong64.s [new file with mode: 0644]
src/internal/bytealg/index_native.go

index a59e32938e76ec2dc1a7b5e6e4b9f958e284a9df..643bb59ab1edbbaaa1d3707168cd5c3a5c5f6c7c 100644 (file)
@@ -2,7 +2,7 @@
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 
-//go:build !amd64 && !arm64 && !s390x && !ppc64le && !ppc64
+//go:build !amd64 && !arm64 && !loong64 && !s390x && !ppc64le && !ppc64
 
 package bytealg
 
diff --git a/src/internal/bytealg/index_loong64.go b/src/internal/bytealg/index_loong64.go
new file mode 100644 (file)
index 0000000..ad574d6
--- /dev/null
@@ -0,0 +1,30 @@
+// Copyright 2025 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package bytealg
+
+import "internal/cpu"
+
+// MaxBruteForce is the length threshold for brute-force use of Index:
+// empirical data shows that Index performs better when len(s) <= 16.
+const MaxBruteForce = 16
+
+func init() {
+       // If SIMD is supported, optimize the cases where the substring length is less than 64 bytes,
+       // otherwise, cases the length less than 32 bytes is optimized.
+       if cpu.Loong64.HasLASX || cpu.Loong64.HasLSX {
+               MaxLen = 64
+       } else {
+               MaxLen = 32
+       }
+}
+
+// Cutover returns how many IndexByte failures are tolerated
+// before the caller should switch over to Index.
+// n is the number of bytes processed so far.
+// See the bytes.Index implementation for details.
+func Cutover(n int) int {
+       const slop, bytesPerFailure = 16, 8 // one failure per 8 bytes, with 16 bytes of slop up front
+       return (n + slop) / bytesPerFailure
+}
diff --git a/src/internal/bytealg/index_loong64.s b/src/internal/bytealg/index_loong64.s
new file mode 100644 (file)
index 0000000..1016db7
--- /dev/null
@@ -0,0 +1,303 @@
+// Copyright 2025 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "go_asm.h"
+#include "textflag.h"
+
+TEXT ·Index<ABIInternal>(SB),NOSPLIT,$0-56     // entry for Index: repack the separator into indexbody's input registers
+       MOVV    R7, R6          // R6 = separator pointer (arrives in R7)
+       MOVV    R8, R7          // R7 = separator length (arrives in R8)
+       JMP     indexbody<>(SB) // R4/R5 are passed through untouched; indexbody leaves the result in R4
+
+TEXT ·IndexString<ABIInternal>(SB),NOSPLIT,$0-40       // entry for IndexString: no register shuffling is done here
+       JMP     indexbody<>(SB) // presumably the two strings already arrive as R4..R7 (pointer/length pairs) — TODO confirm
+
+// indexbody leaves in R4 the index of the first occurrence of the separator, or -1. Input:
+//   R4 = string pointer
+//   R5 = string length
+//   R6 = separator pointer
+//   R7 = separator length (2 <= len <= 64)
+TEXT indexbody<>(SB),NOSPLIT,$0
+       // The main idea is to load 'sep' into registers once, up front,
+       // so that it does not have to be reloaded from memory
+       // for each subsequent substring comparison.
+       SUBV    R7, R5, R8      // R8 = len(string) - len(sep)
+       ADDV    R4, R8          // R8 = address of the last candidate start (string + len(string) - len(sep))
+       ADDV    $1, R4, R9      // R9 = base + 1: every loop advances R4 before comparing, so found computes R4 - R9
+
+       MOVV    $8, R5  // R5 is no longer needed as the string length; reuse it as scratch
+       BGE     R7, R5, len_gt_or_eq_8
+len_2_7:
+       AND     $0x4, R7, R5    // bit 2 of the length set => len is 4..7
+       BNE     R5, len_4_7
+
+len_2_3:
+       AND     $0x1, R7, R5    // odd => len is 3
+       BNE     R5, len_3
+
+len_2:
+       MOVHU   (R6), R10       // R10 = sep[0:2]
+loop_2:
+       BLT     R8, R4, not_found       // ran past the last candidate start
+       MOVHU   (R4), R11
+       ADDV    $1, R4  // advance first; on a match R4 is one past the match start
+       BNE     R10, R11, loop_2
+       JMP     found
+
+len_3:
+       MOVHU   (R6), R10       // R10 = sep[0:2]
+       MOVBU   2(R6), R11      // R11 = sep[2]
+loop_3:
+       BLT     R8, R4, not_found
+       MOVHU   (R4), R12
+       ADDV    $1, R4
+       BNE     R10, R12, loop_3
+       MOVBU   1(R4), R13      // candidate byte 2 (R4 was already advanced by 1)
+       BNE     R11, R13, loop_3
+       JMP     found
+
+len_4_7:
+       AND     $0x2, R7, R5    // bit 1 set => len is 6 or 7
+       BNE     R5, len_6_7
+       AND     $0x1, R7, R5    // odd => len is 5
+       BNE     R5, len_5
+len_4:
+       MOVWU   (R6), R10       // R10 = sep[0:4]
+loop_4:
+       BLT     R8, R4, not_found
+       MOVWU   (R4), R11
+       ADDV    $1, R4
+       BNE     R10, R11, loop_4
+       JMP     found
+
+len_5:
+       MOVWU   (R6), R10       // R10 = sep[0:4]
+       MOVBU   4(R6), R11      // R11 = sep[4]
+loop_5:
+       BLT     R8, R4, not_found
+       MOVWU   (R4), R12
+       ADDV    $1, R4
+       BNE     R10, R12, loop_5
+       MOVBU   3(R4), R13      // candidate byte 4 (R4 was already advanced by 1)
+       BNE     R11, R13, loop_5
+       JMP     found
+
+len_6_7:
+       AND     $0x1, R7, R5    // odd => len is 7
+       BNE     R5, len_7
+len_6:
+       MOVWU   (R6), R10       // R10 = sep[0:4]
+       MOVHU   4(R6), R11      // R11 = sep[4:6]
+loop_6:
+       BLT     R8, R4, not_found
+       MOVWU   (R4), R12
+       ADDV    $1, R4
+       BNE     R10, R12, loop_6
+       MOVHU   3(R4), R13      // candidate bytes 4..5 (R4 was already advanced by 1)
+       BNE     R11, R13, loop_6
+       JMP     found
+
+len_7:
+       MOVWU   (R6), R10       // R10 = sep[0:4]
+       MOVWU   3(R6), R11      // R11 = sep[3:7]; overlaps the first word by one byte
+loop_7:
+       BLT     R8, R4, not_found
+       MOVWU   (R4), R12
+       ADDV    $1, R4
+       BNE     R10, R12, loop_7
+       MOVWU   2(R4), R13      // candidate bytes 3..6 (R4 was already advanced by 1)
+       BNE     R11, R13, loop_7
+       JMP     found
+
+len_gt_or_eq_8:
+       BEQ     R5, R7, len_8   // R5 still holds 8 here
+       MOVV    $17, R5
+       BGE     R7, R5, len_gt_or_eq_17
+       JMP     len_9_16
+len_8:
+       MOVV    (R6), R10       // R10 = sep[0:8]
+loop_8:
+       BLT     R8, R4, not_found
+       MOVV    (R4), R11
+       ADDV    $1, R4
+       BNE     R10, R11, loop_8
+       JMP     found
+
+len_9_16:
+       MOVV    (R6), R10       // R10 = first 8 bytes of sep
+       SUBV    $8, R7  // R7 = len - 8
+       MOVV    (R6)(R7), R11   // R11 = last 8 bytes of sep (overlaps the first 8 when len < 16)
+       SUBV    $1, R7  // R7 = len - 9, compensating for R4 being advanced before the tail load
+loop_9_16:
+       BLT     R8, R4, not_found
+       MOVV    (R4), R12
+       ADDV    $1, R4
+       BNE     R10, R12, loop_9_16
+       MOVV    (R4)(R7), R13   // last 8 bytes of the candidate
+       BNE     R11, R13, loop_9_16
+       JMP     found
+
+len_gt_or_eq_17:
+       MOVV    $25, R5
+       BGE     R7, R5, len_gt_or_eq_25
+len_17_24:
+       MOVV    0(R6), R10      // sep[0:8]
+       MOVV    8(R6), R11      // sep[8:16]
+       SUBV    $8, R7  // R7 = len - 8
+       MOVV    (R6)(R7), R12   // last 8 bytes of sep (may overlap sep[8:16])
+       SUBV    $1, R7  // R7 = len - 9, compensating for the early R4 advance
+loop_17_24:
+       BLT     R8, R4, not_found
+       MOVV    (R4), R13
+       ADDV    $1, R4
+       BNE     R10, R13, loop_17_24
+       MOVV    7(R4), R14      // candidate bytes 8..15
+       BNE     R11, R14, loop_17_24
+       MOVV    (R4)(R7), R15   // last 8 bytes of the candidate
+       BNE     R12, R15, loop_17_24
+       JMP     found
+
+len_gt_or_eq_25:
+       MOVV    $33, R5
+       BGE     R7, R5, len_gt_or_eq_33
+       MOVBU   internal∕cpu·Loong64+const_offsetLOONG64HasLSX(SB), R10        // R10 = cpu.Loong64.HasLSX
+       BNE     R10, lsx_len_25_32      // use the vector path when LSX is available
+len_25_32:
+       MOVV    0(R6), R10      // sep[0:8]
+       MOVV    8(R6), R11      // sep[8:16]
+       MOVV    16(R6), R12     // sep[16:24]
+       SUBV    $8, R7  // R7 = len - 8
+       MOVV    (R6)(R7), R13   // last 8 bytes of sep (may overlap sep[16:24])
+       SUBV    $1, R7  // R7 = len - 9, compensating for the early R4 advance
+loop_25_32:
+       BLT     R8, R4, not_found
+       MOVV    (R4), R14
+       ADDV    $1, R4
+       BNE     R10, R14, loop_25_32
+       MOVV    7(R4), R15      // candidate bytes 8..15
+       BNE     R11, R15, loop_25_32
+       MOVV    15(R4), R16     // candidate bytes 16..23
+       BNE     R12, R16, loop_25_32
+       MOVV    (R4)(R7), R17   // last 8 bytes of the candidate
+       BNE     R13, R17, loop_25_32
+       JMP     found
+
+       // On loong64, LSX is included if LASX is supported.
+lasx_len_25_32:        // no branch in this file targets this label; it aliases the LSX path
+lsx_len_25_32:
+       VMOVQ   0(R6), V0       // V0 = first 16 bytes of sep
+       SUBV    $16, R7 // R7 = len - 16
+       VMOVQ   (R6)(R7), V1    // V1 = last 16 bytes of sep (overlaps V0 when len < 32)
+       SUBV    $1, R7  // R7 = len - 17, compensating for the early R4 advance
+lsx_loop_25_32:
+       BLT     R8, R4, not_found
+       VMOVQ   (R4), V2
+       ADDV    $1, R4
+       VSEQV   V0, V2, V2      // per 64-bit lane: all ones where equal, zero where different
+       VSETANYEQV      V2, FCC0        // FCC0 is set if any lane is zero, i.e. some lane mismatched
+       BFPT    FCC0, lsx_loop_25_32    // mismatch in the first 16 bytes: try the next position
+
+       VMOVQ   (R4)(R7), V3    // last 16 bytes of the candidate
+       VSEQV   V1, V3, V3
+       VSETANYEQV      V3, FCC1
+       BFPT    FCC1, lsx_loop_25_32
+       JMP     found
+
+len_gt_or_eq_33:
+       MOVBU   internal∕cpu·Loong64+const_offsetLOONG64HasLASX(SB), R10       // R10 = cpu.Loong64.HasLASX
+       MOVV    $49, R5
+       BGE     R7, R5, len_gt_or_eq_49
+len_33_48:
+       BNE     R10, lasx_len_33_48
+       JMP     lsx_len_33_48   // no LASX: falls back to LSX without re-checking HasLSX — presumably guaranteed by the MaxLen limit
+
+len_gt_or_eq_49:
+len_49_64:
+       BNE     R10, lasx_len_49_64
+       JMP     lsx_len_49_64   // no LASX: falls back to LSX without re-checking HasLSX — presumably guaranteed by the MaxLen limit
+
+lsx_len_33_48:
+       VMOVQ   0(R6), V0       // sep[0:16]
+       VMOVQ   16(R6), V1      // sep[16:32]
+       SUBV    $16, R7 // R7 = len - 16
+       VMOVQ   (R6)(R7), V2    // last 16 bytes of sep (may overlap sep[16:32])
+       SUBV    $1, R7  // R7 = len - 17, compensating for the early R4 advance
+lsx_loop_33_48:
+       BLT     R8, R4, not_found
+       VMOVQ   0(R4), V3
+       ADDV    $1, R4
+       VSEQV   V0, V3, V3
+       VSETANYEQV      V3, FCC0
+       BFPT    FCC0, lsx_loop_33_48
+
+       VMOVQ   15(R4), V4      // candidate bytes 16..31
+       VSEQV   V1, V4, V4
+       VSETANYEQV      V4, FCC1
+       BFPT    FCC1, lsx_loop_33_48
+
+       VMOVQ   (R4)(R7), V5    // last 16 bytes of the candidate
+       VSEQV   V2, V5, V5
+       VSETANYEQV      V5, FCC2
+       BFPT    FCC2, lsx_loop_33_48
+       JMP     found
+
+lsx_len_49_64:
+       VMOVQ   0(R6), V0       // sep[0:16]
+       VMOVQ   16(R6), V1      // sep[16:32]
+       VMOVQ   32(R6), V2      // sep[32:48]
+       SUBV    $16, R7 // R7 = len - 16
+       VMOVQ   (R6)(R7), V3    // last 16 bytes of sep (may overlap sep[32:48])
+       SUBV    $1, R7  // R7 = len - 17, compensating for the early R4 advance
+lsx_loop_49_64:
+       BLT     R8, R4, not_found
+       VMOVQ   0(R4), V4
+       ADDV    $1, R4
+       VSEQV   V0, V4, V4
+       VSETANYEQV      V4, FCC0
+       BFPT    FCC0, lsx_loop_49_64
+
+       VMOVQ   15(R4), V5      // candidate bytes 16..31
+       VSEQV   V1, V5, V5
+       VSETANYEQV      V5, FCC1
+       BFPT    FCC1, lsx_loop_49_64
+
+       VMOVQ   31(R4), V6      // candidate bytes 32..47
+       VSEQV   V2, V6, V6
+       VSETANYEQV      V6, FCC2
+       BFPT    FCC2, lsx_loop_49_64
+
+       VMOVQ   (R4)(R7), V7    // last 16 bytes of the candidate
+       VSEQV   V3, V7, V7
+       VSETANYEQV      V7, FCC3
+       BFPT    FCC3, lsx_loop_49_64
+       JMP     found
+
+lasx_len_33_48:
+lasx_len_49_64:
+lasx_len_33_64:        // one LASX path serves every length from 33 to 64
+       XVMOVQ  (R6), X0        // X0 = first 32 bytes of sep
+       SUBV    $32, R7 // R7 = len - 32
+       XVMOVQ  (R6)(R7), X1    // X1 = last 32 bytes of sep (overlaps X0 when len < 64)
+       SUBV    $1, R7  // R7 = len - 33, compensating for the early R4 advance
+lasx_loop_33_64:
+       BLT     R8, R4, not_found
+       XVMOVQ  (R4), X2
+       ADDV    $1, R4
+       XVSEQV  X0, X2, X3      // per 64-bit lane: all ones where equal, zero where different
+       XVSETANYEQV     X3, FCC0        // FCC0 is set if any lane is zero, i.e. some lane mismatched
+       BFPT    FCC0, lasx_loop_33_64
+
+       XVMOVQ  (R4)(R7), X4    // last 32 bytes of the candidate
+       XVSEQV  X1, X4, X5
+       XVSETANYEQV     X5, FCC1
+       BFPT    FCC1, lasx_loop_33_64
+       JMP     found
+
+found:
+       SUBV    R9, R4  // R4 = R4 - (base + 1) = byte index of the match start
+       RET
+
+not_found:
+       MOVV    $-1, R4 // no match: result is -1
+       RET
index 59c93f9d126b904d398fa27acc16961d64a948e5..f917c7a92adbf1e1f595821c26d07762e294dadc 100644 (file)
@@ -2,7 +2,7 @@
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 
-//go:build amd64 || arm64 || s390x || ppc64le || ppc64
+//go:build amd64 || arm64 || loong64 || s390x || ppc64le || ppc64
 
 package bytealg