Also move the arm64 CountByte implementation while we're here.
Fixes #19792
Change-Id: I1e0fdf1e03e3135af84150a2703b58dad1b0d57e
Reviewed-on: https://go-review.googlesource.com/98518
Run-TryBot: Keith Randall <khr@golang.org>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Brad Fitzpatrick <bradfitz@golang.org>
return len(s) == len(t)
}
+// Index returns the index of the first instance of sep in s, or -1 if sep is not present in s.
+func Index(s, sep []byte) int {
+ n := len(sep)
+ switch {
+ case n == 0:
+ return 0
+ case n == 1:
+ return IndexByte(s, sep[0])
+ case n == len(s):
+ if Equal(sep, s) {
+ return 0
+ }
+ return -1
+ case n > len(s):
+ return -1
+ case n <= bytealg.MaxLen:
+ // Use brute force when s and sep both are small
+ if len(s) <= bytealg.MaxBruteForce {
+ return bytealg.Index(s, sep)
+ }
+ c := sep[0]
+ i := 0
+ t := s[:len(s)-n+1]
+ fails := 0
+ for i < len(t) {
+ if t[i] != c {
+ // IndexByte is faster than bytealg.Index, so use it as long as
+ // we're not getting lots of false positives.
+ o := IndexByte(t[i:], c)
+ if o < 0 {
+ return -1
+ }
+ i += o
+ }
+ if Equal(s[i:i+n], sep) {
+ return i
+ }
+ fails++
+ i++
+ // Switch to bytealg.Index when IndexByte produces too many false positives.
+ if fails > bytealg.Cutover(i) {
+ r := bytealg.Index(s[i:], sep)
+ if r >= 0 {
+ return r + i
+ }
+ return -1
+ }
+ }
+ return -1
+ }
+ c := sep[0]
+ i := 0
+ fails := 0
+ t := s[:len(s)-n+1]
+ for i < len(t) {
+ if t[i] != c {
+ o := IndexByte(t[i:], c)
+ if o < 0 {
+ break
+ }
+ i += o
+ }
+ if Equal(s[i:i+n], sep) {
+ return i
+ }
+ i++
+ fails++
+ if fails >= 4+i>>4 && i < len(t) {
+ // Give up on IndexByte, it isn't skipping ahead
+ // far enough to be better than Rabin-Karp.
+ // Experiments (using IndexPeriodic) suggest
+ // the cutover is about 16 byte skips.
+ // TODO: if large prefixes of sep are matching
+ // we should cutover at even larger average skips,
+ // because Equal becomes that much more expensive.
+ // This code does not take that effect into account.
+ j := indexRabinKarp(s[i:], sep)
+ if j < 0 {
+ return -1
+ }
+ return i + j
+ }
+ }
+ return -1
+}
+
func indexRabinKarp(s, sep []byte) int {
// Rabin-Karp search
hashsep, pow := hashStr(sep)
+++ /dev/null
-// Copyright 2016 The Go Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-package bytes
-
-import "internal/cpu"
-
-//go:noescape
-
-// indexShortStr returns the index of the first instance of c in s, or -1 if c is not present in s.
-// indexShortStr requires 2 <= len(c) <= shortStringLen
-func indexShortStr(s, c []byte) int // ../runtime/asm_amd64.s
-func countByte(s []byte, c byte) int // ../runtime/asm_amd64.s
-
-var shortStringLen int
-
-func init() {
- if cpu.X86.HasAVX2 {
- shortStringLen = 63
- } else {
- shortStringLen = 31
- }
-}
-
-// Index returns the index of the first instance of sep in s, or -1 if sep is not present in s.
-func Index(s, sep []byte) int {
- n := len(sep)
- switch {
- case n == 0:
- return 0
- case n == 1:
- return IndexByte(s, sep[0])
- case n == len(s):
- if Equal(sep, s) {
- return 0
- }
- return -1
- case n > len(s):
- return -1
- case n <= shortStringLen:
- // Use brute force when s and sep both are small
- if len(s) <= 64 {
- return indexShortStr(s, sep)
- }
- c := sep[0]
- i := 0
- t := s[:len(s)-n+1]
- fails := 0
- for i < len(t) {
- if t[i] != c {
- // IndexByte skips 16/32 bytes per iteration,
- // so it's faster than indexShortStr.
- o := IndexByte(t[i:], c)
- if o < 0 {
- return -1
- }
- i += o
- }
- if Equal(s[i:i+n], sep) {
- return i
- }
- fails++
- i++
- // Switch to indexShortStr when IndexByte produces too many false positives.
- // Too many means more that 1 error per 8 characters.
- // Allow some errors in the beginning.
- if fails > (i+16)/8 {
- r := indexShortStr(s[i:], sep)
- if r >= 0 {
- return r + i
- }
- return -1
- }
- }
- return -1
- }
- return indexRabinKarp(s, sep)
-}
+++ /dev/null
-// Copyright 2017 The Go Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-package bytes
-
-func countByte(s []byte, c byte) int // bytes_arm64.s
-
-// 8 bytes can be completely loaded into 1 register.
-const shortStringLen = 8
-
-//go:noescape
-func indexShortStr(s, sep []byte) int
-
-// Index returns the index of the first instance of sep in s, or -1 if sep is not present in s.
-func Index(s, sep []byte) int {
- n := len(sep)
- switch {
- case n == 0:
- return 0
- case n == 1:
- return IndexByte(s, sep[0])
- case n == len(s):
- if Equal(sep, s) {
- return 0
- }
- return -1
- case n > len(s):
- return -1
- case n <= shortStringLen:
- // Use brute force when both s and sep are small.
- // Empirical data shows that it can get better
- // performance when len(s) <= 16.
- if len(s) <= 16 {
- return indexShortStr(s, sep)
- }
- }
- c := sep[0]
- i := 0
- fails := 0
- t := s[:len(s)-n+1]
- for i < len(t) {
- if t[i] != c {
- o := IndexByte(t[i:], c)
- if o < 0 {
- break
- }
- i += o
- }
- if Equal(s[i:i+n], sep) {
- return i
- }
- i++
- fails++
- if fails >= 4+i>>4 && i < len(t) {
- // Give up on IndexByte, it isn't skipping ahead
- // far enough to be better than Rabin-Karp.
- // Experiments (using IndexPeriodic) suggest
- // the cutover is about 16 byte skips.
- // TODO: if large prefixes of sep are matching
- // we should cutover at even larger average skips,
- // because Equal becomes that much more expensive.
- // This code does not take that effect into account.
- j := indexRabinKarp(s[i:], sep)
- if j < 0 {
- return -1
- }
- return i + j
- }
- }
- return -1
-}
+++ /dev/null
-// Copyright 2015 The Go Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-// +build !amd64,!s390x,!arm64
-
-package bytes
-
-// Index returns the index of the first instance of sep in s, or -1 if sep is not present in s.
-func Index(s, sep []byte) int {
- n := len(sep)
- switch {
- case n == 0:
- return 0
- case n == 1:
- return IndexByte(s, sep[0])
- case n == len(s):
- if Equal(sep, s) {
- return 0
- }
- return -1
- case n > len(s):
- return -1
- }
- c := sep[0]
- i := 0
- fails := 0
- t := s[:len(s)-n+1]
- for i < len(t) {
- if t[i] != c {
- o := IndexByte(t[i:], c)
- if o < 0 {
- break
- }
- i += o
- }
- if Equal(s[i:i+n], sep) {
- return i
- }
- i++
- fails++
- if fails >= 4+i>>4 && i < len(t) {
- // Give up on IndexByte, it isn't skipping ahead
- // far enough to be better than Rabin-Karp.
- // Experiments (using IndexPeriodic) suggest
- // the cutover is about 16 byte skips.
- // TODO: if large prefixes of sep are matching
- // we should cutover at even larger average skips,
- // because Equal becomes that much more expensive.
- // This code does not take that effect into account.
- j := indexRabinKarp(s[i:], sep)
- if j < 0 {
- return -1
- }
- return i + j
- }
- }
- return -1
-}
+++ /dev/null
-// Copyright 2016 The Go Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-package bytes
-
-//go:noescape
-
-// indexShortStr returns the index of the first instance of sep in s,
-// or -1 if sep is not present in s.
-// indexShortStr requires 2 <= len(sep) <= shortStringLen
-func indexShortStr(s, c []byte) int // ../runtime/asm_s390x.s
-
-// supportsVX reports whether the vector facility is available.
-// indexShortStr must not be called if the vector facility is not
-// available.
-func supportsVX() bool // ../runtime/asm_s390x.s
-
-var shortStringLen = -1
-
-func init() {
- if supportsVX() {
- shortStringLen = 64
- }
-}
-
-// Index returns the index of the first instance of sep in s, or -1 if sep is not present in s.
-func Index(s, sep []byte) int {
- n := len(sep)
- switch {
- case n == 0:
- return 0
- case n == 1:
- return IndexByte(s, sep[0])
- case n == len(s):
- if Equal(sep, s) {
- return 0
- }
- return -1
- case n > len(s):
- return -1
- case n <= shortStringLen:
- // Use brute force when s and sep both are small
- if len(s) <= 64 {
- return indexShortStr(s, sep)
- }
- c := sep[0]
- i := 0
- t := s[:len(s)-n+1]
- fails := 0
- for i < len(t) {
- if t[i] != c {
- // IndexByte skips 16/32 bytes per iteration,
- // so it's faster than indexShortStr.
- o := IndexByte(t[i:], c)
- if o < 0 {
- return -1
- }
- i += o
- }
- if Equal(s[i:i+n], sep) {
- return i
- }
- fails++
- i++
- // Switch to indexShortStr when IndexByte produces too many false positives.
- // Too many means more that 1 error per 8 characters.
- // Allow some errors in the beginning.
- if fails > (i+16)/8 {
- r := indexShortStr(s[i:], sep)
- if r >= 0 {
- return r + i
- }
- return -1
- }
- }
- return -1
- }
- return indexRabinKarp(s, sep)
-}
// amd64-specific vet whitelist. See readme.txt for details.
-internal/bytealg/compare_amd64.s: [amd64] cannot check cross-package assembly function: Compare is in package bytes
-internal/bytealg/compare_amd64.s: [amd64] cannot check cross-package assembly function: cmpstring is in package runtime
-
// False positives.
+// Nothing much to do about cross-package assembly. Unfortunate.
+internal/bytealg/compare_amd64.s: [amd64] cannot check cross-package assembly function: Compare is in package bytes
+internal/bytealg/compare_amd64.s: [amd64] cannot check cross-package assembly function: cmpstring is in package runtime
// reflect trampolines intentionally omit arg size. Same for morestack.
runtime/asm_amd64.s: [amd64] morestack: use of 8(SP) points beyond argument frame
runtime/asm_amd64.s: [amd64] morestack: use of 16(SP) points beyond argument frame
runtime/asm_amd64.s: [amd64] morestack: use of 8(SP) points beyond argument frame
-// Nothing much to do about cross-package assembly. Unfortunate.
-runtime/asm_amd64.s: [amd64] cannot check cross-package assembly function: indexShortStr is in package strings
-runtime/asm_amd64.s: [amd64] cannot check cross-package assembly function: indexShortStr is in package bytes
-
// Intentionally missing declarations. These are special assembly routines.
// Some are jumped into from other routines, with values in specific registers.
// duff* have direct calls from the compiler.
runtime/duff_amd64.s: [amd64] duffzero: function duffzero missing Go declaration
runtime/duff_amd64.s: [amd64] duffcopy: function duffcopy missing Go declaration
runtime/asm_amd64.s: [amd64] stackcheck: function stackcheck missing Go declaration
-runtime/asm_amd64.s: [amd64] indexShortStr: function indexShortStr missing Go declaration
runtime/asm_s390x.s: [s390x] abort: function abort missing Go declaration
internal/bytealg/compare_s390x.s: [s390x] cannot check cross-package assembly function: Compare is in package bytes
internal/bytealg/compare_s390x.s: [s390x] cannot check cross-package assembly function: cmpstring is in package runtime
-runtime/asm_s390x.s: [s390x] cannot check cross-package assembly function: supportsVX is in package strings
-runtime/asm_s390x.s: [s390x] cannot check cross-package assembly function: supportsVX is in package bytes
-runtime/asm_s390x.s: [s390x] cannot check cross-package assembly function: indexShortStr is in package strings
-runtime/asm_s390x.s: [s390x] cannot check cross-package assembly function: indexShortStr is in package bytes
-runtime/asm_s390x.s: [s390x] indexShortStr: function indexShortStr missing Go declaration
runtime/asm_s390x.s: [s390x] addmoduledata: function addmoduledata missing Go declaration
runtime/memclr_s390x.s: [s390x] memclr_s390x_exrl_xc: function memclr_s390x_exrl_xc missing Go declaration
runtime/memmove_s390x.s: [s390x] memmove_s390x_exrl_mvc: function memmove_s390x_exrl_mvc missing Go declaration
--- /dev/null
+// Copyright 2018 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package bytealg
+
+import (
+ "internal/cpu"
+ "unsafe"
+)
+
+// Offsets into internal/cpu records for use in assembly.
+const (
+ x86_HasSSE2 = unsafe.Offsetof(cpu.X86.HasSSE2)
+ x86_HasSSE42 = unsafe.Offsetof(cpu.X86.HasSSE42)
+ x86_HasAVX2 = unsafe.Offsetof(cpu.X86.HasAVX2)
+ x86_HasPOPCNT = unsafe.Offsetof(cpu.X86.HasPOPCNT)
+ s390x_HasVX = unsafe.Offsetof(cpu.S390X.HasVX)
+)
+
+// MaxLen is the maximum length of the string to be searched for (argument b) in Index.
+var MaxLen int
--- /dev/null
+// Copyright 2018 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "go_asm.h"
+#include "textflag.h"
+
+TEXT ·Count(SB),NOSPLIT,$0-40
+ MOVD b_base+0(FP), R0
+ MOVD b_len+8(FP), R2
+ MOVBU c+24(FP), R1
+ MOVD $ret+32(FP), R8
+ B countbytebody<>(SB)
+
+TEXT ·CountString(SB),NOSPLIT,$0-32
+ MOVD s_base+0(FP), R0
+ MOVD s_len+8(FP), R2
+ MOVBU c+16(FP), R1
+ MOVD $ret+24(FP), R8
+ B countbytebody<>(SB)
+
+// input:
+// R0: data
+// R2: data len
+// R1: byte to find
+// R8: address to put result
+TEXT countbytebody<>(SB),NOSPLIT,$0
+ // R11 = count of byte to search
+ MOVD $0, R11
+ // short path to handle 0-byte case
+ CBZ R2, done
+ CMP $0x20, R2
+ // jump directly to tail if length < 32
+ BLO tail
+ ANDS $0x1f, R0, R9
+ BEQ chunk
+ // Work with not 32-byte aligned head
+ BIC $0x1f, R0, R3
+ ADD $0x20, R3
+head_loop:
+ MOVBU.P 1(R0), R5
+ CMP R5, R1
+ CINC EQ, R11, R11
+ SUB $1, R2, R2
+ CMP R0, R3
+ BNE head_loop
+ // Work with 32-byte aligned chunks
+chunk:
+ BIC $0x1f, R2, R9
+ // The first chunk can also be the last
+ CBZ R9, tail
+ // R3 = end of 32-byte chunks
+ ADD R0, R9, R3
+ MOVD $1, R5
+ VMOV R5, V5.B16
+ // R2 = length of tail
+ SUB R9, R2, R2
+ // Duplicate R1 (byte to search) to 16 1-byte elements of V0
+ VMOV R1, V0.B16
+ // Clear the low 64-bit element of V7 and V8
+ VEOR V7.B8, V7.B8, V7.B8
+ VEOR V8.B8, V8.B8, V8.B8
+ // Count the target byte in 32-byte chunk
+chunk_loop:
+ VLD1.P (R0), [V1.B16, V2.B16]
+ CMP R0, R3
+ VCMEQ V0.B16, V1.B16, V3.B16
+ VCMEQ V0.B16, V2.B16, V4.B16
+ // Clear the higher 7 bits
+ VAND V5.B16, V3.B16, V3.B16
+ VAND V5.B16, V4.B16, V4.B16
+ // Count lanes match the requested byte
+ VADDP V4.B16, V3.B16, V6.B16 // 32B->16B
+ VUADDLV V6.B16, V7
+ // Accumulate the count in low 64-bit element of V8 when inside the loop
+ VADD V7, V8
+ BNE chunk_loop
+ VMOV V8.D[0], R6
+ ADD R6, R11, R11
+ CBZ R2, done
+tail:
+ // Work with tail shorter than 32 bytes
+ MOVBU.P 1(R0), R5
+ SUB $1, R2, R2
+ CMP R5, R1
+ CINC EQ, R11, R11
+ CBNZ R2, tail
+done:
+ MOVD R11, (R8)
+ RET
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
-// +build !amd64
+// +build !amd64,!arm64
package bytealg
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
-// +build amd64
+// +build amd64 arm64
package bytealg
package bytealg
-import (
- "internal/cpu"
- "unsafe"
-)
-
// Note: there's no equal_generic.go because every platform must implement at least memequal_varlen in assembly.
-// Because equal_native.go is unconditional, it's a good place to compute asm constants.
-// TODO: find a better way to do this?
-
-// Offsets into internal/cpu records for use in assembly.
-const (
- x86_HasSSE2 = unsafe.Offsetof(cpu.X86.HasSSE2)
- x86_HasAVX2 = unsafe.Offsetof(cpu.X86.HasAVX2)
- x86_HasPOPCNT = unsafe.Offsetof(cpu.X86.HasPOPCNT)
- s390x_HasVX = unsafe.Offsetof(cpu.S390X.HasVX)
-)
-
//go:noescape
func Equal(a, b []byte) bool
--- /dev/null
+// Copyright 2018 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package bytealg
+
+import "internal/cpu"
+
+const MaxBruteForce = 64
+
+func init() {
+ if cpu.X86.HasAVX2 {
+ MaxLen = 63
+ } else {
+ MaxLen = 31
+ }
+}
+
+// Cutover reports the number of failures of IndexByte we should tolerate
+// before switching over to Index.
+// n is the number of bytes processed so far.
+// See the bytes.Index implementation for details.
+func Cutover(n int) int {
+ // 1 error per 8 characters, plus a few slop to start.
+ return (n + 16) / 8
+}
--- /dev/null
+// Copyright 2018 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "go_asm.h"
+#include "textflag.h"
+
+TEXT ·Index(SB),NOSPLIT,$0-56
+ MOVQ a_base+0(FP), DI
+ MOVQ a_len+8(FP), DX
+ MOVQ b_base+24(FP), BP
+ MOVQ b_len+32(FP), AX
+ MOVQ DI, R10
+ LEAQ ret+48(FP), R11
+ JMP indexbody<>(SB)
+
+TEXT ·IndexString(SB),NOSPLIT,$0-40
+ MOVQ a_base+0(FP), DI
+ MOVQ a_len+8(FP), DX
+ MOVQ b_base+16(FP), BP
+ MOVQ b_len+24(FP), AX
+ MOVQ DI, R10
+ LEAQ ret+32(FP), R11
+ JMP indexbody<>(SB)
+
+// AX: length of string, that we are searching for
+// DX: length of string, in which we are searching
+// DI: pointer to string, in which we are searching
+// BP: pointer to string, that we are searching for
+// R11: address, where to put return value
+// Note: We want len in DX and AX, because PCMPESTRI implicitly consumes them
+TEXT indexbody<>(SB),NOSPLIT,$0
+ CMPQ AX, DX
+ JA fail
+ CMPQ DX, $16
+ JAE sse42
+no_sse42:
+ CMPQ AX, $2
+ JA _3_or_more
+ MOVW (BP), BP
+ LEAQ -1(DI)(DX*1), DX
+loop2:
+ MOVW (DI), SI
+ CMPW SI,BP
+ JZ success
+ ADDQ $1,DI
+ CMPQ DI,DX
+ JB loop2
+ JMP fail
+_3_or_more:
+ CMPQ AX, $3
+ JA _4_or_more
+ MOVW 1(BP), BX
+ MOVW (BP), BP
+ LEAQ -2(DI)(DX*1), DX
+loop3:
+ MOVW (DI), SI
+ CMPW SI,BP
+ JZ partial_success3
+ ADDQ $1,DI
+ CMPQ DI,DX
+ JB loop3
+ JMP fail
+partial_success3:
+ MOVW 1(DI), SI
+ CMPW SI,BX
+ JZ success
+ ADDQ $1,DI
+ CMPQ DI,DX
+ JB loop3
+ JMP fail
+_4_or_more:
+ CMPQ AX, $4
+ JA _5_or_more
+ MOVL (BP), BP
+ LEAQ -3(DI)(DX*1), DX
+loop4:
+ MOVL (DI), SI
+ CMPL SI,BP
+ JZ success
+ ADDQ $1,DI
+ CMPQ DI,DX
+ JB loop4
+ JMP fail
+_5_or_more:
+ CMPQ AX, $7
+ JA _8_or_more
+ LEAQ 1(DI)(DX*1), DX
+ SUBQ AX, DX
+ MOVL -4(BP)(AX*1), BX
+ MOVL (BP), BP
+loop5to7:
+ MOVL (DI), SI
+ CMPL SI,BP
+ JZ partial_success5to7
+ ADDQ $1,DI
+ CMPQ DI,DX
+ JB loop5to7
+ JMP fail
+partial_success5to7:
+ MOVL -4(AX)(DI*1), SI
+ CMPL SI,BX
+ JZ success
+ ADDQ $1,DI
+ CMPQ DI,DX
+ JB loop5to7
+ JMP fail
+_8_or_more:
+ CMPQ AX, $8
+ JA _9_or_more
+ MOVQ (BP), BP
+ LEAQ -7(DI)(DX*1), DX
+loop8:
+ MOVQ (DI), SI
+ CMPQ SI,BP
+ JZ success
+ ADDQ $1,DI
+ CMPQ DI,DX
+ JB loop8
+ JMP fail
+_9_or_more:
+ CMPQ AX, $15
+ JA _16_or_more
+ LEAQ 1(DI)(DX*1), DX
+ SUBQ AX, DX
+ MOVQ -8(BP)(AX*1), BX
+ MOVQ (BP), BP
+loop9to15:
+ MOVQ (DI), SI
+ CMPQ SI,BP
+ JZ partial_success9to15
+ ADDQ $1,DI
+ CMPQ DI,DX
+ JB loop9to15
+ JMP fail
+partial_success9to15:
+ MOVQ -8(AX)(DI*1), SI
+ CMPQ SI,BX
+ JZ success
+ ADDQ $1,DI
+ CMPQ DI,DX
+ JB loop9to15
+ JMP fail
+_16_or_more:
+ CMPQ AX, $16
+ JA _17_or_more
+ MOVOU (BP), X1
+ LEAQ -15(DI)(DX*1), DX
+loop16:
+ MOVOU (DI), X2
+ PCMPEQB X1, X2
+ PMOVMSKB X2, SI
+ CMPQ SI, $0xffff
+ JE success
+ ADDQ $1,DI
+ CMPQ DI,DX
+ JB loop16
+ JMP fail
+_17_or_more:
+ CMPQ AX, $31
+ JA _32_or_more
+ LEAQ 1(DI)(DX*1), DX
+ SUBQ AX, DX
+ MOVOU -16(BP)(AX*1), X0
+ MOVOU (BP), X1
+loop17to31:
+ MOVOU (DI), X2
+ PCMPEQB X1,X2
+ PMOVMSKB X2, SI
+ CMPQ SI, $0xffff
+ JE partial_success17to31
+ ADDQ $1,DI
+ CMPQ DI,DX
+ JB loop17to31
+ JMP fail
+partial_success17to31:
+ MOVOU -16(AX)(DI*1), X3
+ PCMPEQB X0, X3
+ PMOVMSKB X3, SI
+ CMPQ SI, $0xffff
+ JE success
+ ADDQ $1,DI
+ CMPQ DI,DX
+ JB loop17to31
+ JMP fail
+// We can get here only when AVX2 is enabled and MaxLen is set to 63
+// So no need to check cpuid
+_32_or_more:
+ CMPQ AX, $32
+ JA _33_to_63
+ VMOVDQU (BP), Y1
+ LEAQ -31(DI)(DX*1), DX
+loop32:
+ VMOVDQU (DI), Y2
+ VPCMPEQB Y1, Y2, Y3
+ VPMOVMSKB Y3, SI
+ CMPL SI, $0xffffffff
+ JE success_avx2
+ ADDQ $1,DI
+ CMPQ DI,DX
+ JB loop32
+ JMP fail_avx2
+_33_to_63:
+ LEAQ 1(DI)(DX*1), DX
+ SUBQ AX, DX
+ VMOVDQU -32(BP)(AX*1), Y0
+ VMOVDQU (BP), Y1
+loop33to63:
+ VMOVDQU (DI), Y2
+ VPCMPEQB Y1, Y2, Y3
+ VPMOVMSKB Y3, SI
+ CMPL SI, $0xffffffff
+ JE partial_success33to63
+ ADDQ $1,DI
+ CMPQ DI,DX
+ JB loop33to63
+ JMP fail_avx2
+partial_success33to63:
+ VMOVDQU -32(AX)(DI*1), Y3
+ VPCMPEQB Y0, Y3, Y4
+ VPMOVMSKB Y4, SI
+ CMPL SI, $0xffffffff
+ JE success_avx2
+ ADDQ $1,DI
+ CMPQ DI,DX
+ JB loop33to63
+fail_avx2:
+ VZEROUPPER
+fail:
+ MOVQ $-1, (R11)
+ RET
+success_avx2:
+ VZEROUPPER
+ JMP success
+sse42:
+ CMPB internal∕cpu·X86+const_x86_HasSSE42(SB), $1
+ JNE no_sse42
+ CMPQ AX, $12
+ // PCMPESTRI is slower than normal compare,
+ // so using it makes sense only if we advance 4+ bytes per compare
+ // This value was determined experimentally and is the ~same
+ // on Nehalem (first with SSE42) and Haswell.
+ JAE _9_or_more
+ LEAQ 16(BP), SI
+ TESTW $0xff0, SI
+ JEQ no_sse42
+ MOVOU (BP), X1
+ LEAQ -15(DI)(DX*1), SI
+ MOVQ $16, R9
+ SUBQ AX, R9 // We advance by 16-len(sep) each iteration, so precalculate it into R9
+loop_sse42:
+ // 0x0c means: unsigned byte compare (bits 0,1 are 00)
+ // for equality (bits 2,3 are 11)
+ // result is not masked or inverted (bits 4,5 are 00)
+ // and corresponds to first matching byte (bit 6 is 0)
+ PCMPESTRI $0x0c, (DI), X1
+ // CX == 16 means no match,
+ // CX > R9 means partial match at the end of the string,
+ // otherwise sep is at offset CX from X1 start
+ CMPQ CX, R9
+ JBE sse42_success
+ ADDQ R9, DI
+ CMPQ DI, SI
+ JB loop_sse42
+ PCMPESTRI $0x0c, -1(SI), X1
+ CMPQ CX, R9
+ JA fail
+ LEAQ -1(SI), DI
+sse42_success:
+ ADDQ CX, DI
+success:
+ SUBQ R10, DI
+ MOVQ DI, (R11)
+ RET
--- /dev/null
+// Copyright 2018 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package bytealg
+
+// Empirical data shows that using IndexShortStr can get better
+// performance when len(s) <= 16.
+const MaxBruteForce = 16
+
+func init() {
+ // 8 bytes can be completely loaded into 1 register.
+ MaxLen = 8
+}
+
+// Cutover reports the number of failures of IndexByte we should tolerate
+// before switching over to IndexShortStr.
+// n is the number of bytes processed so far.
+// See the bytes.Index implementation for details.
+func Cutover(n int) int {
+ // 1 error per 16 characters, plus a few slop to start.
+ return 4 + n>>4
+}
-// Copyright 2017 The Go Authors. All rights reserved.
+// Copyright 2018 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
+#include "go_asm.h"
#include "textflag.h"
-// countByte(s []byte, c byte) int
-TEXT bytes·countByte(SB),NOSPLIT,$0-40
- MOVD s_base+0(FP), R0
- MOVD s_len+8(FP), R2
- MOVBU c+24(FP), R1
- // R11 = count of byte to search
- MOVD $0, R11
- // short path to handle 0-byte case
- CBZ R2, done
- CMP $0x20, R2
- // jump directly to tail if length < 32
- BLO tail
- ANDS $0x1f, R0, R9
- BEQ chunk
- // Work with not 32-byte aligned head
- BIC $0x1f, R0, R3
- ADD $0x20, R3
-head_loop:
- MOVBU.P 1(R0), R5
- CMP R5, R1
- CINC EQ, R11, R11
- SUB $1, R2, R2
- CMP R0, R3
- BNE head_loop
- // Work with 32-byte aligned chunks
-chunk:
- BIC $0x1f, R2, R9
- // The first chunk can also be the last
- CBZ R9, tail
- // R3 = end of 32-byte chunks
- ADD R0, R9, R3
- MOVD $1, R5
- VMOV R5, V5.B16
- // R2 = length of tail
- SUB R9, R2, R2
- // Duplicate R1 (byte to search) to 16 1-byte elements of V0
- VMOV R1, V0.B16
- // Clear the low 64-bit element of V7 and V8
- VEOR V7.B8, V7.B8, V7.B8
- VEOR V8.B8, V8.B8, V8.B8
- // Count the target byte in 32-byte chunk
-chunk_loop:
- VLD1.P (R0), [V1.B16, V2.B16]
- CMP R0, R3
- VCMEQ V0.B16, V1.B16, V3.B16
- VCMEQ V0.B16, V2.B16, V4.B16
- // Clear the higher 7 bits
- VAND V5.B16, V3.B16, V3.B16
- VAND V5.B16, V4.B16, V4.B16
- // Count lanes match the requested byte
- VADDP V4.B16, V3.B16, V6.B16 // 32B->16B
- VUADDLV V6.B16, V7
- // Accumulate the count in low 64-bit element of V8 when inside the loop
- VADD V7, V8
- BNE chunk_loop
- VMOV V8.D[0], R6
- ADD R6, R11, R11
- CBZ R2, done
-tail:
- // Work with tail shorter than 32 bytes
- MOVBU.P 1(R0), R5
- SUB $1, R2, R2
- CMP R5, R1
- CINC EQ, R11, R11
- CBNZ R2, tail
-done:
- MOVD R11, ret+32(FP)
- RET
+TEXT ·Index(SB),NOSPLIT,$0-56
+ MOVD a_base+0(FP), R0
+ MOVD a_len+8(FP), R1
+ MOVD b_base+24(FP), R2
+ MOVD b_len+32(FP), R3
+ MOVD $ret+48(FP), R9
+ B indexbody<>(SB)
+
+TEXT ·IndexString(SB),NOSPLIT,$0-40
+ MOVD a_base+0(FP), R0
+ MOVD a_len+8(FP), R1
+ MOVD b_base+16(FP), R2
+ MOVD b_len+24(FP), R3
+ MOVD $ret+32(FP), R9
+ B indexbody<>(SB)
-// indexShortStr(s, sep []byte) int
-// precondition: 2 <= len(sep) <= 8
-TEXT bytes·indexShortStr(SB),NOSPLIT,$0-56
+// input:
+// R0: haystack
+// R1: length of haystack
+// R2: needle
+// R3: length of needle (2 <= len <= 8)
+// R9: address to put result
+TEXT indexbody<>(SB),NOSPLIT,$0-56
// main idea is to load 'sep' into separate register(s)
// to avoid repeatedly re-load it again and again
	// for subsequent substring comparisons
- MOVD s+0(FP), R0
- MOVD s_len+8(FP), R1
- MOVD sep+24(FP), R2
- MOVD sep_len+32(FP), R3
+ MOVD a_base+0(FP), R0
+ MOVD a_len+8(FP), R1
+ MOVD b_base+24(FP), R2
+ MOVD b_len+32(FP), R3
SUB R3, R1, R4
	// R4 contains the start of last substring for comparison
ADD R0, R4, R4
BLS loop_2
not_found:
MOVD $-1, R0
- MOVD R0, ret+48(FP)
+ MOVD R0, (R9)
RET
found:
SUB R8, R0, R0
- MOVD R0, ret+48(FP)
+ MOVD R0, (R9)
RET
--- /dev/null
+// Copyright 2018 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// +build !amd64,!arm64,!s390x
+
+package bytealg
+
+const MaxBruteForce = 0
+
+// Index returns the index of the first instance of b in a, or -1 if b is not present in a.
+// Requires 2 <= len(b) <= MaxLen.
+func Index(a, b []byte) int {
+ panic("unimplemented")
+}
+
+// IndexString returns the index of the first instance of b in a, or -1 if b is not present in a.
+// Requires 2 <= len(b) <= MaxLen.
+func IndexString(a, b string) int {
+ panic("unimplemented")
+}
+
+// Cutover reports the number of failures of IndexByte we should tolerate
+// before switching over to Index.
+// n is the number of bytes processed so far.
+// See the bytes.Index implementation for details.
+func Cutover(n int) int {
+ panic("unimplemented")
+}
--- /dev/null
+// Copyright 2018 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// +build amd64 arm64 s390x
+
+package bytealg
+
+//go:noescape
+
+// Index returns the index of the first instance of b in a, or -1 if b is not present in a.
+// Requires 2 <= len(b) <= MaxLen.
+func Index(a, b []byte) int
+
+//go:noescape
+
+// IndexString returns the index of the first instance of b in a, or -1 if b is not present in a.
+// Requires 2 <= len(b) <= MaxLen.
+func IndexString(a, b string) int
--- /dev/null
+// Copyright 2018 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package bytealg
+
+import "internal/cpu"
+
+const MaxBruteForce = 64
+
+func init() {
+ // Note: we're kind of lucky that this flag is available at this point.
+ // The runtime sets HasVX when processing auxv records, and that happens
+ // to happen *before* running the init functions of packages that
+ // the runtime depends on.
+ // TODO: it would really be nicer for internal/cpu to figure out this
+ // flag by itself. Then we wouldn't need to depend on quirks of
+ // early startup initialization order.
+ if cpu.S390X.HasVX {
+ MaxLen = 64
+ }
+}
+
+// Cutover reports the number of failures of IndexByte we should tolerate
+// before switching over to Index.
+// n is the number of bytes processed so far.
+// See the bytes.Index implementation for details.
+func Cutover(n int) int {
+ // 1 error per 8 characters, plus a few slop to start.
+ return (n + 16) / 8
+}
--- /dev/null
+// Copyright 2018 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "go_asm.h"
+#include "textflag.h"
+
+// Caller must confirm availability of vx facility before calling.
+TEXT ·Index(SB),NOSPLIT|NOFRAME,$0-56
+ LMG a_base+0(FP), R1, R2 // R1=&s[0], R2=len(s)
+ LMG b_base+24(FP), R3, R4 // R3=&sep[0], R4=len(sep)
+ MOVD $ret+48(FP), R5
+ BR indexbody<>(SB)
+
+// Caller must confirm availability of vx facility before calling.
+TEXT ·IndexString(SB),NOSPLIT|NOFRAME,$0-40
+ LMG a_base+0(FP), R1, R2 // R1=&s[0], R2=len(s)
+ LMG b_base+16(FP), R3, R4 // R3=&sep[0], R4=len(sep)
+ MOVD $ret+32(FP), R5
+ BR indexbody<>(SB)
+
+// s: string we are searching
+// sep: string to search for
+// R1=&s[0], R2=len(s)
+// R3=&sep[0], R4=len(sep)
+// R5=&ret (int)
+// Caller must confirm availability of vx facility before calling.
+TEXT indexbody<>(SB),NOSPLIT|NOFRAME,$0
+ CMPBGT R4, R2, notfound
+ ADD R1, R2
+ SUB R4, R2 // R2=&s[len(s)-len(sep)] (last valid index)
+ CMPBEQ R4, $0, notfound
+ SUB $1, R4 // R4=len(sep)-1 for use as VLL index
+ VLL R4, (R3), V0 // contains first 16 bytes of sep
+ MOVD R1, R7
+index2plus:
+ CMPBNE R4, $1, index3plus
+ MOVD $15(R7), R9
+ CMPBGE R9, R2, index2to16
+ VGBM $0xaaaa, V31 // 0xff00ff00ff00ff00...
+ VONE V16
+ VREPH $0, V0, V1
+ CMPBGE R9, R2, index2to16
+index2loop:
+ VL 0(R7), V2 // 16 bytes, even indices
+ VL 1(R7), V4 // 16 bytes, odd indices
+ VCEQH V1, V2, V5 // compare even indices
+ VCEQH V1, V4, V6 // compare odd indices
+ VSEL V5, V6, V31, V7 // merge even and odd indices
+ VFEEBS V16, V7, V17 // find leftmost index, set condition to 1 if found
+ BLT foundV17
+ MOVD $16(R7), R7 // R7+=16
+ ADD $15, R7, R9
+ CMPBLE R9, R2, index2loop // continue if (R7+15) <= R2 (last index to search)
+ CMPBLE R7, R2, index2to16
+ BR notfound
+
+index3plus:
+ CMPBNE R4, $2, index4plus
+ ADD $15, R7, R9
+ CMPBGE R9, R2, index2to16
+ MOVD $1, R0
+ VGBM $0xaaaa, V31 // 0xff00ff00ff00ff00...
+ VONE V16
+ VREPH $0, V0, V1
+ VREPB $2, V0, V8
+index3loop:
+ VL (R7), V2 // load 16-bytes into V2
+ VLL R0, 16(R7), V3 // load 2-bytes into V3
+ VSLDB $1, V2, V3, V4 // V4=(V2:V3)<<1
+ VSLDB $2, V2, V3, V9 // V9=(V2:V3)<<2
+ VCEQH V1, V2, V5 // compare 2-byte even indices
+ VCEQH V1, V4, V6 // compare 2-byte odd indices
+ VCEQB V8, V9, V10 // compare last bytes
+ VSEL V5, V6, V31, V7 // merge even and odd indices
+ VN V7, V10, V7 // AND indices with last byte
+ VFEEBS V16, V7, V17 // find leftmost index, set condition to 1 if found
+ BLT foundV17
+ MOVD $16(R7), R7 // R7+=16
+ ADD $15, R7, R9
+ CMPBLE R9, R2, index3loop // continue if (R7+15) <= R2 (last index to search)
+ CMPBLE R7, R2, index2to16
+ BR notfound
+
+index4plus:
+ CMPBNE R4, $3, index5plus
+ ADD $15, R7, R9
+ CMPBGE R9, R2, index2to16
+ MOVD $2, R0
+ VGBM $0x8888, V29 // 0xff000000ff000000...
+ VGBM $0x2222, V30 // 0x0000ff000000ff00...
+ VGBM $0xcccc, V31 // 0xffff0000ffff0000...
+ VONE V16
+ VREPF $0, V0, V1
+index4loop:
+ VL (R7), V2 // load 16-bytes into V2
+ VLL R0, 16(R7), V3 // load 3-bytes into V3
+ VSLDB $1, V2, V3, V4 // V4=(V2:V3)<<1
+	VSLDB	$2, V2, V3, V9    // V9=(V2:V3)<<2
+	VSLDB	$3, V2, V3, V10   // V10=(V2:V3)<<3
+ VCEQF V1, V2, V5 // compare index 0, 4, ...
+ VCEQF V1, V4, V6 // compare index 1, 5, ...
+ VCEQF V1, V9, V11 // compare index 2, 6, ...
+ VCEQF V1, V10, V12 // compare index 3, 7, ...
+ VSEL V5, V6, V29, V13 // merge index 0, 1, 4, 5, ...
+ VSEL V11, V12, V30, V14 // merge index 2, 3, 6, 7, ...
+ VSEL V13, V14, V31, V7 // final merge
+ VFEEBS V16, V7, V17 // find leftmost index, set condition to 1 if found
+ BLT foundV17
+ MOVD $16(R7), R7 // R7+=16
+ ADD $15, R7, R9
+ CMPBLE R9, R2, index4loop // continue if (R7+15) <= R2 (last index to search)
+ CMPBLE R7, R2, index2to16
+ BR notfound
+
+index5plus:
+ CMPBGT R4, $15, index17plus
+index2to16:
+ CMPBGT R7, R2, notfound
+ MOVD $1(R7), R8
+ CMPBGT R8, R2, index2to16tail
+index2to16loop:
+ // unrolled 2x
+ VLL R4, (R7), V1
+ VLL R4, 1(R7), V2
+ VCEQGS V0, V1, V3
+ BEQ found
+ MOVD $1(R7), R7
+ VCEQGS V0, V2, V4
+ BEQ found
+ MOVD $1(R7), R7
+ CMPBLT R7, R2, index2to16loop
+ CMPBGT R7, R2, notfound
+index2to16tail:
+ VLL R4, (R7), V1
+ VCEQGS V0, V1, V2
+ BEQ found
+ BR notfound
+
+index17plus:
+ CMPBGT R4, $31, index33plus
+ SUB $16, R4, R0
+ VLL R0, 16(R3), V1
+ VONE V7
+index17to32loop:
+ VL (R7), V2
+ VLL R0, 16(R7), V3
+ VCEQG V0, V2, V4
+ VCEQG V1, V3, V5
+ VN V4, V5, V6
+ VCEQGS V6, V7, V8
+ BEQ found
+ MOVD $1(R7), R7
+ CMPBLE R7, R2, index17to32loop
+ BR notfound
+
+index33plus:
+ CMPBGT R4, $47, index49plus
+ SUB $32, R4, R0
+ VL 16(R3), V1
+ VLL R0, 32(R3), V2
+ VONE V11
+index33to48loop:
+ VL (R7), V3
+ VL 16(R7), V4
+ VLL R0, 32(R7), V5
+ VCEQG V0, V3, V6
+ VCEQG V1, V4, V7
+ VCEQG V2, V5, V8
+ VN V6, V7, V9
+ VN V8, V9, V10
+ VCEQGS V10, V11, V12
+ BEQ found
+ MOVD $1(R7), R7
+ CMPBLE R7, R2, index33to48loop
+ BR notfound
+
+index49plus:
+ CMPBGT R4, $63, index65plus
+ SUB $48, R4, R0
+ VL 16(R3), V1
+ VL 32(R3), V2
+ VLL R0, 48(R3), V3
+ VONE V15
+index49to64loop:
+ VL (R7), V4
+ VL 16(R7), V5
+ VL 32(R7), V6
+ VLL R0, 48(R7), V7
+ VCEQG V0, V4, V8
+ VCEQG V1, V5, V9
+ VCEQG V2, V6, V10
+ VCEQG V3, V7, V11
+ VN V8, V9, V12
+ VN V10, V11, V13
+ VN V12, V13, V14
+ VCEQGS V14, V15, V16
+ BEQ found
+ MOVD $1(R7), R7
+ CMPBLE R7, R2, index49to64loop
+notfound:
+ MOVD $-1, (R5)
+ RET
+
+index65plus:
+ // not implemented
+ MOVD $0, (R0)
+ RET
+
+foundV17: // index is in doubleword V17[0]
+ VLGVG $0, V17, R8
+ ADD R8, R7
+found:
+ SUB R1, R7
+ MOVD R7, (R5)
+ RET
DATA shifts<>+0xf8(SB)/8, $0xff0f0e0d0c0b0a09
GLOBL shifts<>(SB),RODATA,$256
-TEXT strings·indexShortStr(SB),NOSPLIT,$0-40
- MOVQ s+0(FP), DI
- // We want len in DX and AX, because PCMPESTRI implicitly consumes them
- MOVQ s_len+8(FP), DX
- MOVQ c+16(FP), BP
- MOVQ c_len+24(FP), AX
- MOVQ DI, R10
- LEAQ ret+32(FP), R11
- JMP runtime·indexShortStr(SB)
-
-TEXT bytes·indexShortStr(SB),NOSPLIT,$0-56
- MOVQ s+0(FP), DI
- MOVQ s_len+8(FP), DX
- MOVQ c+24(FP), BP
- MOVQ c_len+32(FP), AX
- MOVQ DI, R10
- LEAQ ret+48(FP), R11
- JMP runtime·indexShortStr(SB)
-
-// AX: length of string, that we are searching for
-// DX: length of string, in which we are searching
-// DI: pointer to string, in which we are searching
-// BP: pointer to string, that we are searching for
-// R11: address, where to put return value
-TEXT runtime·indexShortStr(SB),NOSPLIT,$0
- CMPQ AX, DX
- JA fail
- CMPQ DX, $16
- JAE sse42
-no_sse42:
- CMPQ AX, $2
- JA _3_or_more
- MOVW (BP), BP
- LEAQ -1(DI)(DX*1), DX
-loop2:
- MOVW (DI), SI
- CMPW SI,BP
- JZ success
- ADDQ $1,DI
- CMPQ DI,DX
- JB loop2
- JMP fail
-_3_or_more:
- CMPQ AX, $3
- JA _4_or_more
- MOVW 1(BP), BX
- MOVW (BP), BP
- LEAQ -2(DI)(DX*1), DX
-loop3:
- MOVW (DI), SI
- CMPW SI,BP
- JZ partial_success3
- ADDQ $1,DI
- CMPQ DI,DX
- JB loop3
- JMP fail
-partial_success3:
- MOVW 1(DI), SI
- CMPW SI,BX
- JZ success
- ADDQ $1,DI
- CMPQ DI,DX
- JB loop3
- JMP fail
-_4_or_more:
- CMPQ AX, $4
- JA _5_or_more
- MOVL (BP), BP
- LEAQ -3(DI)(DX*1), DX
-loop4:
- MOVL (DI), SI
- CMPL SI,BP
- JZ success
- ADDQ $1,DI
- CMPQ DI,DX
- JB loop4
- JMP fail
-_5_or_more:
- CMPQ AX, $7
- JA _8_or_more
- LEAQ 1(DI)(DX*1), DX
- SUBQ AX, DX
- MOVL -4(BP)(AX*1), BX
- MOVL (BP), BP
-loop5to7:
- MOVL (DI), SI
- CMPL SI,BP
- JZ partial_success5to7
- ADDQ $1,DI
- CMPQ DI,DX
- JB loop5to7
- JMP fail
-partial_success5to7:
- MOVL -4(AX)(DI*1), SI
- CMPL SI,BX
- JZ success
- ADDQ $1,DI
- CMPQ DI,DX
- JB loop5to7
- JMP fail
-_8_or_more:
- CMPQ AX, $8
- JA _9_or_more
- MOVQ (BP), BP
- LEAQ -7(DI)(DX*1), DX
-loop8:
- MOVQ (DI), SI
- CMPQ SI,BP
- JZ success
- ADDQ $1,DI
- CMPQ DI,DX
- JB loop8
- JMP fail
-_9_or_more:
- CMPQ AX, $15
- JA _16_or_more
- LEAQ 1(DI)(DX*1), DX
- SUBQ AX, DX
- MOVQ -8(BP)(AX*1), BX
- MOVQ (BP), BP
-loop9to15:
- MOVQ (DI), SI
- CMPQ SI,BP
- JZ partial_success9to15
- ADDQ $1,DI
- CMPQ DI,DX
- JB loop9to15
- JMP fail
-partial_success9to15:
- MOVQ -8(AX)(DI*1), SI
- CMPQ SI,BX
- JZ success
- ADDQ $1,DI
- CMPQ DI,DX
- JB loop9to15
- JMP fail
-_16_or_more:
- CMPQ AX, $16
- JA _17_or_more
- MOVOU (BP), X1
- LEAQ -15(DI)(DX*1), DX
-loop16:
- MOVOU (DI), X2
- PCMPEQB X1, X2
- PMOVMSKB X2, SI
- CMPQ SI, $0xffff
- JE success
- ADDQ $1,DI
- CMPQ DI,DX
- JB loop16
- JMP fail
-_17_or_more:
- CMPQ AX, $31
- JA _32_or_more
- LEAQ 1(DI)(DX*1), DX
- SUBQ AX, DX
- MOVOU -16(BP)(AX*1), X0
- MOVOU (BP), X1
-loop17to31:
- MOVOU (DI), X2
- PCMPEQB X1,X2
- PMOVMSKB X2, SI
- CMPQ SI, $0xffff
- JE partial_success17to31
- ADDQ $1,DI
- CMPQ DI,DX
- JB loop17to31
- JMP fail
-partial_success17to31:
- MOVOU -16(AX)(DI*1), X3
- PCMPEQB X0, X3
- PMOVMSKB X3, SI
- CMPQ SI, $0xffff
- JE success
- ADDQ $1,DI
- CMPQ DI,DX
- JB loop17to31
- JMP fail
-// We can get here only when AVX2 is enabled and cutoff for indexShortStr is set to 63
-// So no need to check cpuid
-_32_or_more:
- CMPQ AX, $32
- JA _33_to_63
- VMOVDQU (BP), Y1
- LEAQ -31(DI)(DX*1), DX
-loop32:
- VMOVDQU (DI), Y2
- VPCMPEQB Y1, Y2, Y3
- VPMOVMSKB Y3, SI
- CMPL SI, $0xffffffff
- JE success_avx2
- ADDQ $1,DI
- CMPQ DI,DX
- JB loop32
- JMP fail_avx2
-_33_to_63:
- LEAQ 1(DI)(DX*1), DX
- SUBQ AX, DX
- VMOVDQU -32(BP)(AX*1), Y0
- VMOVDQU (BP), Y1
-loop33to63:
- VMOVDQU (DI), Y2
- VPCMPEQB Y1, Y2, Y3
- VPMOVMSKB Y3, SI
- CMPL SI, $0xffffffff
- JE partial_success33to63
- ADDQ $1,DI
- CMPQ DI,DX
- JB loop33to63
- JMP fail_avx2
-partial_success33to63:
- VMOVDQU -32(AX)(DI*1), Y3
- VPCMPEQB Y0, Y3, Y4
- VPMOVMSKB Y4, SI
- CMPL SI, $0xffffffff
- JE success_avx2
- ADDQ $1,DI
- CMPQ DI,DX
- JB loop33to63
-fail_avx2:
- VZEROUPPER
-fail:
- MOVQ $-1, (R11)
- RET
-success_avx2:
- VZEROUPPER
- JMP success
-sse42:
- CMPB runtime·support_sse42(SB), $1
- JNE no_sse42
- CMPQ AX, $12
- // PCMPESTRI is slower than normal compare,
- // so using it makes sense only if we advance 4+ bytes per compare
- // This value was determined experimentally and is the ~same
- // on Nehalem (first with SSE42) and Haswell.
- JAE _9_or_more
- LEAQ 16(BP), SI
- TESTW $0xff0, SI
- JEQ no_sse42
- MOVOU (BP), X1
- LEAQ -15(DI)(DX*1), SI
- MOVQ $16, R9
- SUBQ AX, R9 // We advance by 16-len(sep) each iteration, so precalculate it into R9
-loop_sse42:
- // 0x0c means: unsigned byte compare (bits 0,1 are 00)
- // for equality (bits 2,3 are 11)
- // result is not masked or inverted (bits 4,5 are 00)
- // and corresponds to first matching byte (bit 6 is 0)
- PCMPESTRI $0x0c, (DI), X1
- // CX == 16 means no match,
- // CX > R9 means partial match at the end of the string,
- // otherwise sep is at offset CX from X1 start
- CMPQ CX, R9
- JBE sse42_success
- ADDQ R9, DI
- CMPQ DI, SI
- JB loop_sse42
- PCMPESTRI $0x0c, -1(SI), X1
- CMPQ CX, R9
- JA fail
- LEAQ -1(SI), DI
-sse42_success:
- ADDQ CX, DI
-success:
- SUBQ R10, DI
- MOVQ DI, (R11)
- RET
-
TEXT runtime·return0(SB), NOSPLIT, $0
MOVL $0, AX
RET
// compile barrier.
RET
-// func supportsVX() bool
-TEXT strings·supportsVX(SB),NOSPLIT,$0-1
- MOVBZ runtime·cpu+facilities_hasVX(SB), R0
- MOVB R0, ret+0(FP)
- RET
-
-// func supportsVX() bool
-TEXT bytes·supportsVX(SB),NOSPLIT,$0-1
- MOVBZ runtime·cpu+facilities_hasVX(SB), R0
- MOVB R0, ret+0(FP)
- RET
-
-// func indexShortStr(s, sep string) int
-// Caller must confirm availability of vx facility before calling.
-TEXT strings·indexShortStr(SB),NOSPLIT|NOFRAME,$0-40
- LMG s+0(FP), R1, R2 // R1=&s[0], R2=len(s)
- LMG sep+16(FP), R3, R4 // R3=&sep[0], R4=len(sep)
- MOVD $ret+32(FP), R5
- BR runtime·indexShortStr(SB)
-
-// func indexShortStr(s, sep []byte) int
-// Caller must confirm availability of vx facility before calling.
-TEXT bytes·indexShortStr(SB),NOSPLIT|NOFRAME,$0-56
- LMG s+0(FP), R1, R2 // R1=&s[0], R2=len(s)
- LMG sep+24(FP), R3, R4 // R3=&sep[0], R4=len(sep)
- MOVD $ret+48(FP), R5
- BR runtime·indexShortStr(SB)
-
-// s: string we are searching
-// sep: string to search for
-// R1=&s[0], R2=len(s)
-// R3=&sep[0], R4=len(sep)
-// R5=&ret (int)
-// Caller must confirm availability of vx facility before calling.
-TEXT runtime·indexShortStr(SB),NOSPLIT|NOFRAME,$0
- CMPBGT R4, R2, notfound
- ADD R1, R2
- SUB R4, R2 // R2=&s[len(s)-len(sep)] (last valid index)
- CMPBEQ R4, $0, notfound
- SUB $1, R4 // R4=len(sep)-1 for use as VLL index
- VLL R4, (R3), V0 // contains first 16 bytes of sep
- MOVD R1, R7
-index2plus:
- CMPBNE R4, $1, index3plus
- MOVD $15(R7), R9
- CMPBGE R9, R2, index2to16
- VGBM $0xaaaa, V31 // 0xff00ff00ff00ff00...
- VONE V16
- VREPH $0, V0, V1
- CMPBGE R9, R2, index2to16
-index2loop:
- VL 0(R7), V2 // 16 bytes, even indices
- VL 1(R7), V4 // 16 bytes, odd indices
- VCEQH V1, V2, V5 // compare even indices
- VCEQH V1, V4, V6 // compare odd indices
- VSEL V5, V6, V31, V7 // merge even and odd indices
- VFEEBS V16, V7, V17 // find leftmost index, set condition to 1 if found
- BLT foundV17
- MOVD $16(R7), R7 // R7+=16
- ADD $15, R7, R9
- CMPBLE R9, R2, index2loop // continue if (R7+15) <= R2 (last index to search)
- CMPBLE R7, R2, index2to16
- BR notfound
-
-index3plus:
- CMPBNE R4, $2, index4plus
- ADD $15, R7, R9
- CMPBGE R9, R2, index2to16
- MOVD $1, R0
- VGBM $0xaaaa, V31 // 0xff00ff00ff00ff00...
- VONE V16
- VREPH $0, V0, V1
- VREPB $2, V0, V8
-index3loop:
- VL (R7), V2 // load 16-bytes into V2
- VLL R0, 16(R7), V3 // load 2-bytes into V3
- VSLDB $1, V2, V3, V4 // V4=(V2:V3)<<1
- VSLDB $2, V2, V3, V9 // V9=(V2:V3)<<2
- VCEQH V1, V2, V5 // compare 2-byte even indices
- VCEQH V1, V4, V6 // compare 2-byte odd indices
- VCEQB V8, V9, V10 // compare last bytes
- VSEL V5, V6, V31, V7 // merge even and odd indices
- VN V7, V10, V7 // AND indices with last byte
- VFEEBS V16, V7, V17 // find leftmost index, set condition to 1 if found
- BLT foundV17
- MOVD $16(R7), R7 // R7+=16
- ADD $15, R7, R9
- CMPBLE R9, R2, index3loop // continue if (R7+15) <= R2 (last index to search)
- CMPBLE R7, R2, index2to16
- BR notfound
-
-index4plus:
- CMPBNE R4, $3, index5plus
- ADD $15, R7, R9
- CMPBGE R9, R2, index2to16
- MOVD $2, R0
- VGBM $0x8888, V29 // 0xff000000ff000000...
- VGBM $0x2222, V30 // 0x0000ff000000ff00...
- VGBM $0xcccc, V31 // 0xffff0000ffff0000...
- VONE V16
- VREPF $0, V0, V1
-index4loop:
- VL (R7), V2 // load 16-bytes into V2
- VLL R0, 16(R7), V3 // load 3-bytes into V3
- VSLDB $1, V2, V3, V4 // V4=(V2:V3)<<1
- VSLDB $2, V2, V3, V9 // V9=(V2:V3)<<1
- VSLDB $3, V2, V3, V10 // V10=(V2:V3)<<1
- VCEQF V1, V2, V5 // compare index 0, 4, ...
- VCEQF V1, V4, V6 // compare index 1, 5, ...
- VCEQF V1, V9, V11 // compare index 2, 6, ...
- VCEQF V1, V10, V12 // compare index 3, 7, ...
- VSEL V5, V6, V29, V13 // merge index 0, 1, 4, 5, ...
- VSEL V11, V12, V30, V14 // merge index 2, 3, 6, 7, ...
- VSEL V13, V14, V31, V7 // final merge
- VFEEBS V16, V7, V17 // find leftmost index, set condition to 1 if found
- BLT foundV17
- MOVD $16(R7), R7 // R7+=16
- ADD $15, R7, R9
- CMPBLE R9, R2, index4loop // continue if (R7+15) <= R2 (last index to search)
- CMPBLE R7, R2, index2to16
- BR notfound
-
-index5plus:
- CMPBGT R4, $15, index17plus
-index2to16:
- CMPBGT R7, R2, notfound
- MOVD $1(R7), R8
- CMPBGT R8, R2, index2to16tail
-index2to16loop:
- // unrolled 2x
- VLL R4, (R7), V1
- VLL R4, 1(R7), V2
- VCEQGS V0, V1, V3
- BEQ found
- MOVD $1(R7), R7
- VCEQGS V0, V2, V4
- BEQ found
- MOVD $1(R7), R7
- CMPBLT R7, R2, index2to16loop
- CMPBGT R7, R2, notfound
-index2to16tail:
- VLL R4, (R7), V1
- VCEQGS V0, V1, V2
- BEQ found
- BR notfound
-
-index17plus:
- CMPBGT R4, $31, index33plus
- SUB $16, R4, R0
- VLL R0, 16(R3), V1
- VONE V7
-index17to32loop:
- VL (R7), V2
- VLL R0, 16(R7), V3
- VCEQG V0, V2, V4
- VCEQG V1, V3, V5
- VN V4, V5, V6
- VCEQGS V6, V7, V8
- BEQ found
- MOVD $1(R7), R7
- CMPBLE R7, R2, index17to32loop
- BR notfound
-
-index33plus:
- CMPBGT R4, $47, index49plus
- SUB $32, R4, R0
- VL 16(R3), V1
- VLL R0, 32(R3), V2
- VONE V11
-index33to48loop:
- VL (R7), V3
- VL 16(R7), V4
- VLL R0, 32(R7), V5
- VCEQG V0, V3, V6
- VCEQG V1, V4, V7
- VCEQG V2, V5, V8
- VN V6, V7, V9
- VN V8, V9, V10
- VCEQGS V10, V11, V12
- BEQ found
- MOVD $1(R7), R7
- CMPBLE R7, R2, index33to48loop
- BR notfound
-
-index49plus:
- CMPBGT R4, $63, index65plus
- SUB $48, R4, R0
- VL 16(R3), V1
- VL 32(R3), V2
- VLL R0, 48(R3), V3
- VONE V15
-index49to64loop:
- VL (R7), V4
- VL 16(R7), V5
- VL 32(R7), V6
- VLL R0, 48(R7), V7
- VCEQG V0, V4, V8
- VCEQG V1, V5, V9
- VCEQG V2, V6, V10
- VCEQG V3, V7, V11
- VN V8, V9, V12
- VN V10, V11, V13
- VN V12, V13, V14
- VCEQGS V14, V15, V16
- BEQ found
- MOVD $1(R7), R7
- CMPBLE R7, R2, index49to64loop
-notfound:
- MOVD $-1, (R5)
- RET
-
-index65plus:
- // not implemented
- MOVD $0, (R0)
- RET
-
-foundV17: // index is in doubleword V17[0]
- VLGVG $0, V17, R8
- ADD R8, R7
-found:
- SUB R1, R7
- MOVD R7, (R5)
- RET
-
// This is called from .init_array and follows the platform, not Go, ABI.
// We are overly conservative. We could only save the registers we use.
// However, since this function is only called once per loaded module
package runtime
-import (
- internalcpu "internal/cpu"
- "runtime/internal/sys"
-)
+import "internal/cpu"
const (
// bit masks taken from bits/hwcap.h
_HWCAP_S390_VX = 2048 // vector facility
)
-// facilities is padded to avoid false sharing.
-type facilities struct {
- _ [sys.CacheLineSize]byte
- hasVX bool // vector facility
- _ [sys.CacheLineSize]byte
-}
-
-// cpu indicates the availability of s390x facilities that can be used in
-// Go assembly but are optional on models supported by Go.
-// TODO: remove this once we're only using internal/cpu.
-var cpu facilities
-
func archauxv(tag, val uintptr) {
switch tag {
case _AT_HWCAP: // CPU capability bit flags
- internalcpu.S390X.HasVX = val&_HWCAP_S390_VX != 0
- cpu.hasVX = val&_HWCAP_S390_VX != 0
+ cpu.S390X.HasVX = val&_HWCAP_S390_VX != 0
}
}
return s == t
}
+// Index returns the index of the first instance of substr in s, or -1 if substr is not present in s.
+func Index(s, substr string) int {
+ n := len(substr)
+ switch {
+ case n == 0:
+ return 0
+ case n == 1:
+ return IndexByte(s, substr[0])
+ case n == len(s):
+ if substr == s {
+ return 0
+ }
+ return -1
+ case n > len(s):
+ return -1
+ case n <= bytealg.MaxLen:
+ // Use brute force when s and substr both are small
+ if len(s) <= bytealg.MaxBruteForce {
+ return bytealg.IndexString(s, substr)
+ }
+ c := substr[0]
+ i := 0
+ t := s[:len(s)-n+1]
+ fails := 0
+ for i < len(t) {
+ if t[i] != c {
+ // IndexByte is faster than bytealg.IndexString, so use it as long as
+ // we're not getting lots of false positives.
+ o := IndexByte(t[i:], c)
+ if o < 0 {
+ return -1
+ }
+ i += o
+ }
+ if s[i:i+n] == substr {
+ return i
+ }
+ fails++
+ i++
+ // Switch to bytealg.IndexString when IndexByte produces too many false positives.
+ if fails > bytealg.Cutover(i) {
+ r := bytealg.IndexString(s[i:], substr)
+ if r >= 0 {
+ return r + i
+ }
+ return -1
+ }
+ }
+ return -1
+ }
+ c := substr[0]
+ i := 0
+ t := s[:len(s)-n+1]
+ fails := 0
+ for i < len(t) {
+ if t[i] != c {
+ o := IndexByte(t[i:], c)
+ if o < 0 {
+ return -1
+ }
+ i += o
+ }
+ if s[i:i+n] == substr {
+ return i
+ }
+ i++
+ fails++
+ if fails >= 4+i>>4 && i < len(t) {
+ // See comment in ../bytes/bytes_generic.go.
+ j := indexRabinKarp(s[i:], substr)
+ if j < 0 {
+ return -1
+ }
+ return i + j
+ }
+ }
+ return -1
+}
+
func indexRabinKarp(s, substr string) int {
// Rabin-Karp search
hashss, pow := hashStr(substr)
+++ /dev/null
-// Copyright 2015 The Go Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-package strings
-
-import "internal/cpu"
-
-//go:noescape
-
-// indexShortStr returns the index of the first instance of c in s, or -1 if c is not present in s.
-// indexShortStr requires 2 <= len(c) <= shortStringLen
-func indexShortStr(s, c string) int // ../runtime/asm_amd64.s
-func countByte(s string, c byte) int // ../runtime/asm_amd64.s
-
-var shortStringLen int
-
-func init() {
- if cpu.X86.HasAVX2 {
- shortStringLen = 63
- } else {
- shortStringLen = 31
- }
-}
-
-// Index returns the index of the first instance of substr in s, or -1 if substr is not present in s.
-func Index(s, substr string) int {
- n := len(substr)
- switch {
- case n == 0:
- return 0
- case n == 1:
- return IndexByte(s, substr[0])
- case n == len(s):
- if substr == s {
- return 0
- }
- return -1
- case n > len(s):
- return -1
- case n <= shortStringLen:
- // Use brute force when s and substr both are small
- if len(s) <= 64 {
- return indexShortStr(s, substr)
- }
- c := substr[0]
- i := 0
- t := s[:len(s)-n+1]
- fails := 0
- for i < len(t) {
- if t[i] != c {
- // IndexByte skips 16/32 bytes per iteration,
- // so it's faster than indexShortStr.
- o := IndexByte(t[i:], c)
- if o < 0 {
- return -1
- }
- i += o
- }
- if s[i:i+n] == substr {
- return i
- }
- fails++
- i++
- // Switch to indexShortStr when IndexByte produces too many false positives.
- // Too many means more that 1 error per 8 characters.
- // Allow some errors in the beginning.
- if fails > (i+16)/8 {
- r := indexShortStr(s[i:], substr)
- if r >= 0 {
- return r + i
- }
- return -1
- }
- }
- return -1
- }
- return indexRabinKarp(s, substr)
-}
+++ /dev/null
-// Copyright 2015 The Go Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-// +build !amd64,!s390x
-
-package strings
-
-// TODO: implements short string optimization on non amd64 platforms
-// and get rid of strings_amd64.go
-
-// Index returns the index of the first instance of substr in s, or -1 if substr is not present in s.
-func Index(s, substr string) int {
- n := len(substr)
- switch {
- case n == 0:
- return 0
- case n == 1:
- return IndexByte(s, substr[0])
- case n == len(s):
- if substr == s {
- return 0
- }
- return -1
- case n > len(s):
- return -1
- }
- c := substr[0]
- i := 0
- t := s[:len(s)-n+1]
- fails := 0
- for i < len(t) {
- if t[i] != c {
- o := IndexByte(t[i:], c)
- if o < 0 {
- return -1
- }
- i += o
- }
- if s[i:i+n] == substr {
- return i
- }
- i++
- fails++
- if fails >= 4+i>>4 && i < len(t) {
- // See comment in ../bytes/bytes_generic.go.
- j := indexRabinKarp(s[i:], substr)
- if j < 0 {
- return -1
- }
- return i + j
- }
- }
- return -1
-}
+++ /dev/null
-// Copyright 2016 The Go Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-package strings
-
-//go:noescape
-
-// indexShortStr returns the index of the first instance of sep in s,
-// or -1 if sep is not present in s.
-// indexShortStr requires 2 <= len(sep) <= shortStringLen
-func indexShortStr(s, sep string) int // ../runtime/asm_$GOARCH.s
-
-// supportsVX reports whether the vector facility is available.
-// indexShortStr must not be called if the vector facility is not
-// available.
-func supportsVX() bool // ../runtime/asm_s390x.s
-
-var shortStringLen = -1
-
-func init() {
- if supportsVX() {
- shortStringLen = 64
- }
-}
-
-// Index returns the index of the first instance of substr in s, or -1 if substr is not present in s.
-func Index(s, substr string) int {
- n := len(substr)
- switch {
- case n == 0:
- return 0
- case n == 1:
- return IndexByte(s, substr[0])
- case n == len(s):
- if substr == s {
- return 0
- }
- return -1
- case n > len(s):
- return -1
- case n <= shortStringLen:
- // Use brute force when s and substr both are small
- if len(s) <= 64 {
- return indexShortStr(s, substr)
- }
- c := substr[0]
- i := 0
- t := s[:len(s)-n+1]
- fails := 0
- for i < len(t) {
- if t[i] != c {
- // IndexByte skips 16/32 bytes per iteration,
- // so it's faster than indexShortStr.
- o := IndexByte(t[i:], c)
- if o < 0 {
- return -1
- }
- i += o
- }
- if s[i:i+n] == substr {
- return i
- }
- fails++
- i++
- // Switch to indexShortStr when IndexByte produces too many false positives.
- // Too many means more that 1 error per 8 characters.
- // Allow some errors in the beginning.
- if fails > (i+16)/8 {
- r := indexShortStr(s[i:], substr)
- if r >= 0 {
- return r + i
- }
- return -1
- }
- }
- return -1
- }
- return indexRabinKarp(s, substr)
-}