--- /dev/null
+// Copyright 2025 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package bytealg
+
+import "internal/cpu"
+
+// Empirical data shows that using Index can get better
+// performance when len(s) <= 16.
+const MaxBruteForce = 16
+
+func init() {
+ // If SIMD is supported, optimize the cases where the substring length is less than 64 bytes,
+ // otherwise, cases the length less than 32 bytes is optimized.
+ if cpu.Loong64.HasLASX || cpu.Loong64.HasLSX {
+ MaxLen = 64
+ } else {
+ MaxLen = 32
+ }
+}
+
+// Cutover reports the number of failures of IndexByte we should tolerate
+// before switching over to Index.
+// n is the number of bytes processed so far.
+// See the bytes.Index implementation for details.
+func Cutover(n int) int {
+ // 1 error per 8 characters, plus a few slop to start.
+ return (n + 16) / 8
+}
--- /dev/null
+// Copyright 2025 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "go_asm.h"
+#include "textflag.h"
+
+TEXT ·Index<ABIInternal>(SB),NOSPLIT,$0-56
+ MOVV R7, R6 // R6 = separator pointer
+ MOVV R8, R7 // R7 = separator length
+ JMP indexbody<>(SB)
+
+TEXT ·IndexString<ABIInternal>(SB),NOSPLIT,$0-40
+ JMP indexbody<>(SB)
+
+// input:
+// R4 = string
+// R5 = length
+// R6 = separator pointer
+// R7 = separator length (2 <= len <= 64)
+TEXT indexbody<>(SB),NOSPLIT,$0
+ // main idea is to load 'sep' into separate register(s)
+ // to avoid repeatedly re-load it again and again
+ // for sebsequent substring comparisons
+ SUBV R7, R5, R8
+ ADDV R4, R8 // R8 contains the start of last substring for comparison
+ ADDV $1, R4, R9 // store base for later
+
+ MOVV $8, R5
+ BGE R7, R5, len_gt_or_eq_8
+len_2_7:
+ AND $0x4, R7, R5
+ BNE R5, len_4_7
+
+len_2_3:
+ AND $0x1, R7, R5
+ BNE R5, len_3
+
+len_2:
+ MOVHU (R6), R10
+loop_2:
+ BLT R8, R4, not_found
+ MOVHU (R4), R11
+ ADDV $1, R4
+ BNE R10, R11, loop_2
+ JMP found
+
+len_3:
+ MOVHU (R6), R10
+ MOVBU 2(R6), R11
+loop_3:
+ BLT R8, R4, not_found
+ MOVHU (R4), R12
+ ADDV $1, R4
+ BNE R10, R12, loop_3
+ MOVBU 1(R4), R13
+ BNE R11, R13, loop_3
+ JMP found
+
+len_4_7:
+ AND $0x2, R7, R5
+ BNE R5, len_6_7
+ AND $0x1, R7, R5
+ BNE R5, len_5
+len_4:
+ MOVWU (R6), R10
+loop_4:
+ BLT R8, R4, not_found
+ MOVWU (R4), R11
+ ADDV $1, R4
+ BNE R10, R11, loop_4
+ JMP found
+
+len_5:
+ MOVWU (R6), R10
+ MOVBU 4(R6), R11
+loop_5:
+ BLT R8, R4, not_found
+ MOVWU (R4), R12
+ ADDV $1, R4
+ BNE R10, R12, loop_5
+ MOVBU 3(R4), R13
+ BNE R11, R13, loop_5
+ JMP found
+
+len_6_7:
+ AND $0x1, R7, R5
+ BNE R5, len_7
+len_6:
+ MOVWU (R6), R10
+ MOVHU 4(R6), R11
+loop_6:
+ BLT R8, R4, not_found
+ MOVWU (R4), R12
+ ADDV $1, R4
+ BNE R10, R12, loop_6
+ MOVHU 3(R4), R13
+ BNE R11, R13, loop_6
+ JMP found
+
+len_7:
+ MOVWU (R6), R10
+ MOVWU 3(R6), R11
+loop_7:
+ BLT R8, R4, not_found
+ MOVWU (R4), R12
+ ADDV $1, R4
+ BNE R10, R12, loop_7
+ MOVWU 2(R4), R13
+ BNE R11, R13, loop_7
+ JMP found
+
+len_gt_or_eq_8:
+ BEQ R5, R7, len_8
+ MOVV $17, R5
+ BGE R7, R5, len_gt_or_eq_17
+ JMP len_9_16
+len_8:
+ MOVV (R6), R10
+loop_8:
+ BLT R8, R4, not_found
+ MOVV (R4), R11
+ ADDV $1, R4
+ BNE R10, R11, loop_8
+ JMP found
+
+len_9_16:
+ MOVV (R6), R10
+ SUBV $8, R7
+ MOVV (R6)(R7), R11
+ SUBV $1, R7
+loop_9_16:
+ BLT R8, R4, not_found
+ MOVV (R4), R12
+ ADDV $1, R4
+ BNE R10, R12, loop_9_16
+ MOVV (R4)(R7), R13
+ BNE R11, R13, loop_9_16
+ JMP found
+
+len_gt_or_eq_17:
+ MOVV $25, R5
+ BGE R7, R5, len_gt_or_eq_25
+len_17_24:
+ MOVV 0(R6), R10
+ MOVV 8(R6), R11
+ SUBV $8, R7
+ MOVV (R6)(R7), R12
+ SUBV $1, R7
+loop_17_24:
+ BLT R8, R4, not_found
+ MOVV (R4), R13
+ ADDV $1, R4
+ BNE R10, R13, loop_17_24
+ MOVV 7(R4), R14
+ BNE R11, R14, loop_17_24
+ MOVV (R4)(R7), R15
+ BNE R12, R15, loop_17_24
+ JMP found
+
+len_gt_or_eq_25:
+ MOVV $33, R5
+ BGE R7, R5, len_gt_or_eq_33
+ MOVBU internal∕cpu·Loong64+const_offsetLOONG64HasLSX(SB), R10
+ BNE R10, lsx_len_25_32
+len_25_32:
+ MOVV 0(R6), R10
+ MOVV 8(R6), R11
+ MOVV 16(R6), R12
+ SUBV $8, R7
+ MOVV (R6)(R7), R13
+ SUBV $1, R7
+loop_25_32:
+ BLT R8, R4, not_found
+ MOVV (R4), R14
+ ADDV $1, R4
+ BNE R10, R14, loop_25_32
+ MOVV 7(R4), R15
+ BNE R11, R15, loop_25_32
+ MOVV 15(R4), R16
+ BNE R12, R16, loop_25_32
+ MOVV (R4)(R7), R17
+ BNE R13, R17, loop_25_32
+ JMP found
+
+ // On loong64, LSX is included if LASX is supported.
+lasx_len_25_32:
+lsx_len_25_32:
+ VMOVQ 0(R6), V0
+ SUBV $16, R7
+ VMOVQ (R6)(R7), V1
+ SUBV $1, R7
+lsx_loop_25_32:
+ BLT R8, R4, not_found
+ VMOVQ (R4), V2
+ ADDV $1, R4
+ VSEQV V0, V2, V2
+ VSETANYEQV V2, FCC0
+ BFPT FCC0, lsx_loop_25_32
+
+ VMOVQ (R4)(R7), V3
+ VSEQV V1, V3, V3
+ VSETANYEQV V3, FCC1
+ BFPT FCC1, lsx_loop_25_32
+ JMP found
+
+len_gt_or_eq_33:
+ MOVBU internal∕cpu·Loong64+const_offsetLOONG64HasLASX(SB), R10
+ MOVV $49, R5
+ BGE R7, R5, len_gt_or_eq_49
+len_33_48:
+ BNE R10, lasx_len_33_48
+ JMP lsx_len_33_48
+
+len_gt_or_eq_49:
+len_49_64:
+ BNE R10, lasx_len_49_64
+ JMP lsx_len_49_64
+
+lsx_len_33_48:
+ VMOVQ 0(R6), V0
+ VMOVQ 16(R6), V1
+ SUBV $16, R7
+ VMOVQ (R6)(R7), V2
+ SUBV $1, R7
+lsx_loop_33_48:
+ BLT R8, R4, not_found
+ VMOVQ 0(R4), V3
+ ADDV $1, R4
+ VSEQV V0, V3, V3
+ VSETANYEQV V3, FCC0
+ BFPT FCC0, lsx_loop_33_48
+
+ VMOVQ 15(R4), V4
+ VSEQV V1, V4, V4
+ VSETANYEQV V4, FCC1
+ BFPT FCC1, lsx_loop_33_48
+
+ VMOVQ (R4)(R7), V5
+ VSEQV V2, V5, V5
+ VSETANYEQV V5, FCC2
+ BFPT FCC2, lsx_loop_33_48
+ JMP found
+
+lsx_len_49_64:
+ VMOVQ 0(R6), V0
+ VMOVQ 16(R6), V1
+ VMOVQ 32(R6), V2
+ SUBV $16, R7
+ VMOVQ (R6)(R7), V3
+ SUBV $1, R7
+lsx_loop_49_64:
+ BLT R8, R4, not_found
+ VMOVQ 0(R4), V4
+ ADDV $1, R4
+ VSEQV V0, V4, V4
+ VSETANYEQV V4, FCC0
+ BFPT FCC0, lsx_loop_49_64
+
+ VMOVQ 15(R4), V5
+ VSEQV V1, V5, V5
+ VSETANYEQV V5, FCC1
+ BFPT FCC1, lsx_loop_49_64
+
+ VMOVQ 31(R4), V6
+ VSEQV V2, V6, V6
+ VSETANYEQV V6, FCC2
+ BFPT FCC2, lsx_loop_49_64
+
+ VMOVQ (R4)(R7), V7
+ VSEQV V3, V7, V7
+ VSETANYEQV V7, FCC3
+ BFPT FCC3, lsx_loop_49_64
+ JMP found
+
+lasx_len_33_48:
+lasx_len_49_64:
+lasx_len_33_64:
+ XVMOVQ (R6), X0
+ SUBV $32, R7
+ XVMOVQ (R6)(R7), X1
+ SUBV $1, R7
+lasx_loop_33_64:
+ BLT R8, R4, not_found
+ XVMOVQ (R4), X2
+ ADDV $1, R4
+ XVSEQV X0, X2, X3
+ XVSETANYEQV X3, FCC0
+ BFPT FCC0, lasx_loop_33_64
+
+ XVMOVQ (R4)(R7), X4
+ XVSEQV X1, X4, X5
+ XVSETANYEQV X5, FCC1
+ BFPT FCC1, lasx_loop_33_64
+ JMP found
+
+found:
+ SUBV R9, R4
+ RET
+
+not_found:
+ MOVV $-1, R4
+ RET