Also move the arm64 CountByte implementation while we're here.
Fixes #19792
Change-Id: I1e0fdf1e03e3135af84150a2703b58dad1b0d57e
Reviewed-on: https://go-review.googlesource.com/98518
Run-TryBot: Keith Randall <khr@golang.org>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Brad Fitzpatrick <bradfitz@golang.org>
return len(s) == len(t)
}
+// Index returns the index of the first instance of sep in s, or -1 if sep is not present in s.
+func Index(s, sep []byte) int {
+ n := len(sep)
+ switch {
+ case n == 0:
+ return 0
+ case n == 1:
+ return IndexByte(s, sep[0])
+ case n == len(s):
+ if Equal(sep, s) {
+ return 0
+ }
+ return -1
+ case n > len(s):
+ return -1
+ case n <= bytealg.MaxLen:
+ // Use brute force when s and sep both are small
+ if len(s) <= bytealg.MaxBruteForce {
+ return bytealg.Index(s, sep)
+ }
+ c := sep[0]
+ i := 0
+ t := s[:len(s)-n+1]
+ fails := 0
+ for i < len(t) {
+ if t[i] != c {
+ // IndexByte is faster than bytealg.Index, so use it as long as
+ // we're not getting lots of false positives.
+ o := IndexByte(t[i:], c)
+ if o < 0 {
+ return -1
+ }
+ i += o
+ }
+ if Equal(s[i:i+n], sep) {
+ return i
+ }
+ fails++
+ i++
+ // Switch to bytealg.Index when IndexByte produces too many false positives.
+ if fails > bytealg.Cutover(i) {
+ r := bytealg.Index(s[i:], sep)
+ if r >= 0 {
+ return r + i
+ }
+ return -1
+ }
+ }
+ return -1
+ }
+ c := sep[0]
+ i := 0
+ fails := 0
+ t := s[:len(s)-n+1]
+ for i < len(t) {
+ if t[i] != c {
+ o := IndexByte(t[i:], c)
+ if o < 0 {
+ break
+ }
+ i += o
+ }
+ if Equal(s[i:i+n], sep) {
+ return i
+ }
+ i++
+ fails++
+ if fails >= 4+i>>4 && i < len(t) {
+ // Give up on IndexByte, it isn't skipping ahead
+ // far enough to be better than Rabin-Karp.
+ // Experiments (using IndexPeriodic) suggest
+ // the cutover is about 16 byte skips.
+ // TODO: if large prefixes of sep are matching
+ // we should cutover at even larger average skips,
+ // because Equal becomes that much more expensive.
+ // This code does not take that effect into account.
+ j := indexRabinKarp(s[i:], sep)
+ if j < 0 {
+ return -1
+ }
+ return i + j
+ }
+ }
+ return -1
+}
+
func indexRabinKarp(s, sep []byte) int {
// Rabin-Karp search
hashsep, pow := hashStr(sep)
+++ /dev/null
-// Copyright 2016 The Go Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-package bytes
-
-import "internal/cpu"
-
-//go:noescape
-
-// indexShortStr returns the index of the first instance of c in s, or -1 if c is not present in s.
-// indexShortStr requires 2 <= len(c) <= shortStringLen
-func indexShortStr(s, c []byte) int // ../runtime/asm_amd64.s
-func countByte(s []byte, c byte) int // ../runtime/asm_amd64.s
-
-var shortStringLen int
-
-func init() {
- if cpu.X86.HasAVX2 {
- shortStringLen = 63
- } else {
- shortStringLen = 31
- }
-}
-
-// Index returns the index of the first instance of sep in s, or -1 if sep is not present in s.
-func Index(s, sep []byte) int {
- n := len(sep)
- switch {
- case n == 0:
- return 0
- case n == 1:
- return IndexByte(s, sep[0])
- case n == len(s):
- if Equal(sep, s) {
- return 0
- }
- return -1
- case n > len(s):
- return -1
- case n <= shortStringLen:
- // Use brute force when s and sep both are small
- if len(s) <= 64 {
- return indexShortStr(s, sep)
- }
- c := sep[0]
- i := 0
- t := s[:len(s)-n+1]
- fails := 0
- for i < len(t) {
- if t[i] != c {
- // IndexByte skips 16/32 bytes per iteration,
- // so it's faster than indexShortStr.
- o := IndexByte(t[i:], c)
- if o < 0 {
- return -1
- }
- i += o
- }
- if Equal(s[i:i+n], sep) {
- return i
- }
- fails++
- i++
- // Switch to indexShortStr when IndexByte produces too many false positives.
- // Too many means more that 1 error per 8 characters.
- // Allow some errors in the beginning.
- if fails > (i+16)/8 {
- r := indexShortStr(s[i:], sep)
- if r >= 0 {
- return r + i
- }
- return -1
- }
- }
- return -1
- }
- return indexRabinKarp(s, sep)
-}
+++ /dev/null
-// Copyright 2017 The Go Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-package bytes
-
-func countByte(s []byte, c byte) int // bytes_arm64.s
-
-// 8 bytes can be completely loaded into 1 register.
-const shortStringLen = 8
-
-//go:noescape
-func indexShortStr(s, sep []byte) int
-
-// Index returns the index of the first instance of sep in s, or -1 if sep is not present in s.
-func Index(s, sep []byte) int {
- n := len(sep)
- switch {
- case n == 0:
- return 0
- case n == 1:
- return IndexByte(s, sep[0])
- case n == len(s):
- if Equal(sep, s) {
- return 0
- }
- return -1
- case n > len(s):
- return -1
- case n <= shortStringLen:
- // Use brute force when both s and sep are small.
- // Empirical data shows that it can get better
- // performance when len(s) <= 16.
- if len(s) <= 16 {
- return indexShortStr(s, sep)
- }
- }
- c := sep[0]
- i := 0
- fails := 0
- t := s[:len(s)-n+1]
- for i < len(t) {
- if t[i] != c {
- o := IndexByte(t[i:], c)
- if o < 0 {
- break
- }
- i += o
- }
- if Equal(s[i:i+n], sep) {
- return i
- }
- i++
- fails++
- if fails >= 4+i>>4 && i < len(t) {
- // Give up on IndexByte, it isn't skipping ahead
- // far enough to be better than Rabin-Karp.
- // Experiments (using IndexPeriodic) suggest
- // the cutover is about 16 byte skips.
- // TODO: if large prefixes of sep are matching
- // we should cutover at even larger average skips,
- // because Equal becomes that much more expensive.
- // This code does not take that effect into account.
- j := indexRabinKarp(s[i:], sep)
- if j < 0 {
- return -1
- }
- return i + j
- }
- }
- return -1
-}
+++ /dev/null
-// Copyright 2015 The Go Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-// +build !amd64,!s390x,!arm64
-
-package bytes
-
-// Index returns the index of the first instance of sep in s, or -1 if sep is not present in s.
-func Index(s, sep []byte) int {
- n := len(sep)
- switch {
- case n == 0:
- return 0
- case n == 1:
- return IndexByte(s, sep[0])
- case n == len(s):
- if Equal(sep, s) {
- return 0
- }
- return -1
- case n > len(s):
- return -1
- }
- c := sep[0]
- i := 0
- fails := 0
- t := s[:len(s)-n+1]
- for i < len(t) {
- if t[i] != c {
- o := IndexByte(t[i:], c)
- if o < 0 {
- break
- }
- i += o
- }
- if Equal(s[i:i+n], sep) {
- return i
- }
- i++
- fails++
- if fails >= 4+i>>4 && i < len(t) {
- // Give up on IndexByte, it isn't skipping ahead
- // far enough to be better than Rabin-Karp.
- // Experiments (using IndexPeriodic) suggest
- // the cutover is about 16 byte skips.
- // TODO: if large prefixes of sep are matching
- // we should cutover at even larger average skips,
- // because Equal becomes that much more expensive.
- // This code does not take that effect into account.
- j := indexRabinKarp(s[i:], sep)
- if j < 0 {
- return -1
- }
- return i + j
- }
- }
- return -1
-}
+++ /dev/null
-// Copyright 2016 The Go Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-package bytes
-
-//go:noescape
-
-// indexShortStr returns the index of the first instance of sep in s,
-// or -1 if sep is not present in s.
-// indexShortStr requires 2 <= len(sep) <= shortStringLen
-func indexShortStr(s, c []byte) int // ../runtime/asm_s390x.s
-
-// supportsVX reports whether the vector facility is available.
-// indexShortStr must not be called if the vector facility is not
-// available.
-func supportsVX() bool // ../runtime/asm_s390x.s
-
-var shortStringLen = -1
-
-func init() {
- if supportsVX() {
- shortStringLen = 64
- }
-}
-
-// Index returns the index of the first instance of sep in s, or -1 if sep is not present in s.
-func Index(s, sep []byte) int {
- n := len(sep)
- switch {
- case n == 0:
- return 0
- case n == 1:
- return IndexByte(s, sep[0])
- case n == len(s):
- if Equal(sep, s) {
- return 0
- }
- return -1
- case n > len(s):
- return -1
- case n <= shortStringLen:
- // Use brute force when s and sep both are small
- if len(s) <= 64 {
- return indexShortStr(s, sep)
- }
- c := sep[0]
- i := 0
- t := s[:len(s)-n+1]
- fails := 0
- for i < len(t) {
- if t[i] != c {
- // IndexByte skips 16/32 bytes per iteration,
- // so it's faster than indexShortStr.
- o := IndexByte(t[i:], c)
- if o < 0 {
- return -1
- }
- i += o
- }
- if Equal(s[i:i+n], sep) {
- return i
- }
- fails++
- i++
- // Switch to indexShortStr when IndexByte produces too many false positives.
- // Too many means more that 1 error per 8 characters.
- // Allow some errors in the beginning.
- if fails > (i+16)/8 {
- r := indexShortStr(s[i:], sep)
- if r >= 0 {
- return r + i
- }
- return -1
- }
- }
- return -1
- }
- return indexRabinKarp(s, sep)
-}
// amd64-specific vet whitelist. See readme.txt for details.
-internal/bytealg/compare_amd64.s: [amd64] cannot check cross-package assembly function: Compare is in package bytes
-internal/bytealg/compare_amd64.s: [amd64] cannot check cross-package assembly function: cmpstring is in package runtime
-
// False positives.
+// Nothing much to do about cross-package assembly. Unfortunate.
+internal/bytealg/compare_amd64.s: [amd64] cannot check cross-package assembly function: Compare is in package bytes
+internal/bytealg/compare_amd64.s: [amd64] cannot check cross-package assembly function: cmpstring is in package runtime
// reflect trampolines intentionally omit arg size. Same for morestack.
runtime/asm_amd64.s: [amd64] morestack: use of 8(SP) points beyond argument frame
runtime/asm_amd64.s: [amd64] morestack: use of 16(SP) points beyond argument frame
runtime/asm_amd64.s: [amd64] morestack: use of 8(SP) points beyond argument frame
-// Nothing much to do about cross-package assembly. Unfortunate.
-runtime/asm_amd64.s: [amd64] cannot check cross-package assembly function: indexShortStr is in package strings
-runtime/asm_amd64.s: [amd64] cannot check cross-package assembly function: indexShortStr is in package bytes
-
// Intentionally missing declarations. These are special assembly routines.
// Some are jumped into from other routines, with values in specific registers.
// duff* have direct calls from the compiler.
runtime/duff_amd64.s: [amd64] duffzero: function duffzero missing Go declaration
runtime/duff_amd64.s: [amd64] duffcopy: function duffcopy missing Go declaration
runtime/asm_amd64.s: [amd64] stackcheck: function stackcheck missing Go declaration
-runtime/asm_amd64.s: [amd64] indexShortStr: function indexShortStr missing Go declaration
runtime/asm_s390x.s: [s390x] abort: function abort missing Go declaration
internal/bytealg/compare_s390x.s: [s390x] cannot check cross-package assembly function: Compare is in package bytes
internal/bytealg/compare_s390x.s: [s390x] cannot check cross-package assembly function: cmpstring is in package runtime
-runtime/asm_s390x.s: [s390x] cannot check cross-package assembly function: supportsVX is in package strings
-runtime/asm_s390x.s: [s390x] cannot check cross-package assembly function: supportsVX is in package bytes
-runtime/asm_s390x.s: [s390x] cannot check cross-package assembly function: indexShortStr is in package strings
-runtime/asm_s390x.s: [s390x] cannot check cross-package assembly function: indexShortStr is in package bytes
-runtime/asm_s390x.s: [s390x] indexShortStr: function indexShortStr missing Go declaration
runtime/asm_s390x.s: [s390x] addmoduledata: function addmoduledata missing Go declaration
runtime/memclr_s390x.s: [s390x] memclr_s390x_exrl_xc: function memclr_s390x_exrl_xc missing Go declaration
runtime/memmove_s390x.s: [s390x] memmove_s390x_exrl_mvc: function memmove_s390x_exrl_mvc missing Go declaration
--- /dev/null
+// Copyright 2018 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package bytealg
+
+import (
+ "internal/cpu"
+ "unsafe"
+)
+
+// Offsets into internal/cpu records for use in assembly.
+const (
+ x86_HasSSE2 = unsafe.Offsetof(cpu.X86.HasSSE2)
+ x86_HasSSE42 = unsafe.Offsetof(cpu.X86.HasSSE42)
+ x86_HasAVX2 = unsafe.Offsetof(cpu.X86.HasAVX2)
+ x86_HasPOPCNT = unsafe.Offsetof(cpu.X86.HasPOPCNT)
+ s390x_HasVX = unsafe.Offsetof(cpu.S390X.HasVX)
+)
+
+// MaxLen is the maximum length of the string to be searched for (argument b) in Index.
+var MaxLen int
--- /dev/null
+// Copyright 2018 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "go_asm.h"
+#include "textflag.h"
+
+TEXT ·Count(SB),NOSPLIT,$0-40
+ MOVD b_base+0(FP), R0
+ MOVD b_len+8(FP), R2
+ MOVBU c+24(FP), R1
+ MOVD $ret+32(FP), R8
+ B countbytebody<>(SB)
+
+TEXT ·CountString(SB),NOSPLIT,$0-32
+ MOVD s_base+0(FP), R0
+ MOVD s_len+8(FP), R2
+ MOVBU c+16(FP), R1
+ MOVD $ret+24(FP), R8
+ B countbytebody<>(SB)
+
+// input:
+// R0: data
+// R2: data len
+// R1: byte to find
+// R8: address to put result
+TEXT countbytebody<>(SB),NOSPLIT,$0
+ // R11 = count of byte to search
+ MOVD $0, R11
+ // short path to handle 0-byte case
+ CBZ R2, done
+ CMP $0x20, R2
+ // jump directly to tail if length < 32
+ BLO tail
+ ANDS $0x1f, R0, R9
+ BEQ chunk
+ // Work with not 32-byte aligned head
+ BIC $0x1f, R0, R3
+ ADD $0x20, R3
+head_loop:
+ MOVBU.P 1(R0), R5
+ CMP R5, R1
+ CINC EQ, R11, R11
+ SUB $1, R2, R2
+ CMP R0, R3
+ BNE head_loop
+ // Work with 32-byte aligned chunks
+chunk:
+ BIC $0x1f, R2, R9
+ // The first chunk can also be the last
+ CBZ R9, tail
+ // R3 = end of 32-byte chunks
+ ADD R0, R9, R3
+ MOVD $1, R5
+ VMOV R5, V5.B16
+ // R2 = length of tail
+ SUB R9, R2, R2
+ // Duplicate R1 (byte to search) to 16 1-byte elements of V0
+ VMOV R1, V0.B16
+ // Clear the low 64-bit element of V7 and V8
+ VEOR V7.B8, V7.B8, V7.B8
+ VEOR V8.B8, V8.B8, V8.B8
+ // Count the target byte in 32-byte chunk
+chunk_loop:
+ VLD1.P (R0), [V1.B16, V2.B16]
+ CMP R0, R3
+ VCMEQ V0.B16, V1.B16, V3.B16
+ VCMEQ V0.B16, V2.B16, V4.B16
+ // Clear the higher 7 bits
+ VAND V5.B16, V3.B16, V3.B16
+ VAND V5.B16, V4.B16, V4.B16
+ // Count lanes match the requested byte
+ VADDP V4.B16, V3.B16, V6.B16 // 32B->16B
+ VUADDLV V6.B16, V7
+ // Accumulate the count in low 64-bit element of V8 when inside the loop
+ VADD V7, V8
+ BNE chunk_loop
+ VMOV V8.D[0], R6
+ ADD R6, R11, R11
+ CBZ R2, done
+tail:
+ // Work with tail shorter than 32 bytes
+ MOVBU.P 1(R0), R5
+ SUB $1, R2, R2
+ CMP R5, R1
+ CINC EQ, R11, R11
+ CBNZ R2, tail
+done:
+ MOVD R11, (R8)
+ RET
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
-// +build !amd64
+// +build !amd64,!arm64
package bytealg
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
-// +build amd64
+// +build amd64 arm64
package bytealg
package bytealg
-import (
- "internal/cpu"
- "unsafe"
-)
-
// Note: there's no equal_generic.go because every platform must implement at least memequal_varlen in assembly.
-// Because equal_native.go is unconditional, it's a good place to compute asm constants.
-// TODO: find a better way to do this?
-
-// Offsets into internal/cpu records for use in assembly.
-const (
- x86_HasSSE2 = unsafe.Offsetof(cpu.X86.HasSSE2)
- x86_HasAVX2 = unsafe.Offsetof(cpu.X86.HasAVX2)
- x86_HasPOPCNT = unsafe.Offsetof(cpu.X86.HasPOPCNT)
- s390x_HasVX = unsafe.Offsetof(cpu.S390X.HasVX)
-)
-
//go:noescape
func Equal(a, b []byte) bool
--- /dev/null
+// Copyright 2018 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package bytealg
+
+import "internal/cpu"
+
+const MaxBruteForce = 64
+
+func init() {
+ if cpu.X86.HasAVX2 {
+ MaxLen = 63
+ } else {
+ MaxLen = 31
+ }
+}
+
+// Cutover reports the number of failures of IndexByte we should tolerate
+// before switching over to Index.
+// n is the number of bytes processed so far.
+// See the bytes.Index implementation for details.
+func Cutover(n int) int {
+ // 1 error per 8 characters, plus a few slop to start.
+ return (n + 16) / 8
+}
--- /dev/null
+// Copyright 2018 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "go_asm.h"
+#include "textflag.h"
+
+TEXT ·Index(SB),NOSPLIT,$0-56
+ MOVQ a_base+0(FP), DI
+ MOVQ a_len+8(FP), DX
+ MOVQ b_base+24(FP), BP
+ MOVQ b_len+32(FP), AX
+ MOVQ DI, R10
+ LEAQ ret+48(FP), R11
+ JMP indexbody<>(SB)
+
+TEXT ·IndexString(SB),NOSPLIT,$0-40
+ MOVQ a_base+0(FP), DI
+ MOVQ a_len+8(FP), DX
+ MOVQ b_base+16(FP), BP
+ MOVQ b_len+24(FP), AX
+ MOVQ DI, R10
+ LEAQ ret+32(FP), R11
+ JMP indexbody<>(SB)
+
+// AX: length of string, that we are searching for
+// DX: length of string, in which we are searching
+// DI: pointer to string, in which we are searching
+// BP: pointer to string, that we are searching for
+// R11: address, where to put return value
+// Note: We want len in DX and AX, because PCMPESTRI implicitly consumes them
+TEXT indexbody<>(SB),NOSPLIT,$0
+ CMPQ AX, DX
+ JA fail
+ CMPQ DX, $16
+ JAE sse42
+no_sse42:
+ CMPQ AX, $2
+ JA _3_or_more
+ MOVW (BP), BP
+ LEAQ -1(DI)(DX*1), DX
+loop2:
+ MOVW (DI), SI
+ CMPW SI,BP
+ JZ success
+ ADDQ $1,DI
+ CMPQ DI,DX
+ JB loop2
+ JMP fail
+_3_or_more:
+ CMPQ AX, $3
+ JA _4_or_more
+ MOVW 1(BP), BX
+ MOVW (BP), BP
+ LEAQ -2(DI)(DX*1), DX
+loop3:
+ MOVW (DI), SI
+ CMPW SI,BP
+ JZ partial_success3
+ ADDQ $1,DI
+ CMPQ DI,DX
+ JB loop3
+ JMP fail
+partial_success3:
+ MOVW 1(DI), SI
+ CMPW SI,BX
+ JZ success
+ ADDQ $1,DI
+ CMPQ DI,DX
+ JB loop3
+ JMP fail
+_4_or_more:
+ CMPQ AX, $4
+ JA _5_or_more
+ MOVL (BP), BP
+ LEAQ -3(DI)(DX*1), DX
+loop4:
+ MOVL (DI), SI
+ CMPL SI,BP
+ JZ success
+ ADDQ $1,DI
+ CMPQ DI,DX
+ JB loop4
+ JMP fail
+_5_or_more:
+ CMPQ AX, $7
+ JA _8_or_more
+ LEAQ 1(DI)(DX*1), DX
+ SUBQ AX, DX
+ MOVL -4(BP)(AX*1), BX
+ MOVL (BP), BP
+loop5to7:
+ MOVL (DI), SI
+ CMPL SI,BP
+ JZ partial_success5to7
+ ADDQ $1,DI
+ CMPQ DI,DX
+ JB loop5to7
+ JMP fail
+partial_success5to7:
+ MOVL -4(AX)(DI*1), SI
+ CMPL SI,BX
+ JZ success
+ ADDQ $1,DI
+ CMPQ DI,DX
+ JB loop5to7
+ JMP fail
+_8_or_more:
+ CMPQ AX, $8
+ JA _9_or_more
+ MOVQ (BP), BP
+ LEAQ -7(DI)(DX*1), DX
+loop8:
+ MOVQ (DI), SI
+ CMPQ SI,BP
+ JZ success
+ ADDQ $1,DI
+ CMPQ DI,DX
+ JB loop8
+ JMP fail
+_9_or_more:
+ CMPQ AX, $15
+ JA _16_or_more
+ LEAQ 1(DI)(DX*1), DX
+ SUBQ AX, DX
+ MOVQ -8(BP)(AX*1), BX
+ MOVQ (BP), BP
+loop9to15:
+ MOVQ (DI), SI
+ CMPQ SI,BP
+ JZ partial_success9to15
+ ADDQ $1,DI
+ CMPQ DI,DX
+ JB loop9to15
+ JMP fail
+partial_success9to15:
+ MOVQ -8(AX)(DI*1), SI
+ CMPQ SI,BX
+ JZ success
+ ADDQ $1,DI
+ CMPQ DI,DX
+ JB loop9to15
+ JMP fail
+_16_or_more:
+ CMPQ AX, $16
+ JA _17_or_more
+ MOVOU (BP), X1
+ LEAQ -15(DI)(DX*1), DX
+loop16:
+ MOVOU (DI), X2
+ PCMPEQB X1, X2
+ PMOVMSKB X2, SI
+ CMPQ SI, $0xffff
+ JE success
+ ADDQ $1,DI
+ CMPQ DI,DX
+ JB loop16
+ JMP fail
+_17_or_more:
+ CMPQ AX, $31
+ JA _32_or_more
+ LEAQ 1(DI)(DX*1), DX
+ SUBQ AX, DX
+ MOVOU -16(BP)(AX*1), X0
+ MOVOU (BP), X1
+loop17to31:
+ MOVOU (DI), X2
+ PCMPEQB X1,X2
+ PMOVMSKB X2, SI
+ CMPQ SI, $0xffff
+ JE partial_success17to31
+ ADDQ $1,DI
+ CMPQ DI,DX
+ JB loop17to31
+ JMP fail
+partial_success17to31:
+ MOVOU -16(AX)(DI*1), X3
+ PCMPEQB X0, X3
+ PMOVMSKB X3, SI
+ CMPQ SI, $0xffff
+ JE success
+ ADDQ $1,DI
+ CMPQ DI,DX
+ JB loop17to31
+ JMP fail
+// We can get here only when AVX2 is enabled and MaxLen is set to 63
+// So no need to check cpuid
+_32_or_more:
+ CMPQ AX, $32
+ JA _33_to_63
+ VMOVDQU (BP), Y1
+ LEAQ -31(DI)(DX*1), DX
+loop32:
+ VMOVDQU (DI), Y2
+ VPCMPEQB Y1, Y2, Y3
+ VPMOVMSKB Y3, SI
+ CMPL SI, $0xffffffff
+ JE success_avx2
+ ADDQ $1,DI
+ CMPQ DI,DX
+ JB loop32
+ JMP fail_avx2
+_33_to_63:
+ LEAQ 1(DI)(DX*1), DX
+ SUBQ AX, DX
+ VMOVDQU -32(BP)(AX*1), Y0
+ VMOVDQU (BP), Y1
+loop33to63:
+ VMOVDQU (DI), Y2
+ VPCMPEQB Y1, Y2, Y3
+ VPMOVMSKB Y3, SI
+ CMPL SI, $0xffffffff
+ JE partial_success33to63
+ ADDQ $1,DI
+ CMPQ DI,DX
+ JB loop33to63
+ JMP fail_avx2
+partial_success33to63:
+ VMOVDQU -32(AX)(DI*1), Y3
+ VPCMPEQB Y0, Y3, Y4
+ VPMOVMSKB Y4, SI
+ CMPL SI, $0xffffffff
+ JE success_avx2
+ ADDQ $1,DI
+ CMPQ DI,DX
+ JB loop33to63
+fail_avx2:
+ VZEROUPPER
+fail:
+ MOVQ $-1, (R11)
+ RET
+success_avx2:
+ VZEROUPPER
+ JMP success
+sse42:
+ CMPB internal∕cpu·X86+const_x86_HasSSE42(SB), $1
+ JNE no_sse42
+ CMPQ AX, $12
+ // PCMPESTRI is slower than normal compare,
+ // so using it makes sense only if we advance 4+ bytes per compare
+ // This value was determined experimentally and is the ~same
+ // on Nehalem (first with SSE42) and Haswell.
+ JAE _9_or_more
+ LEAQ 16(BP), SI
+ TESTW $0xff0, SI
+ JEQ no_sse42
+ MOVOU (BP), X1
+ LEAQ -15(DI)(DX*1), SI
+ MOVQ $16, R9
+ SUBQ AX, R9 // We advance by 16-len(sep) each iteration, so precalculate it into R9
+loop_sse42:
+ // 0x0c means: unsigned byte compare (bits 0,1 are 00)
+ // for equality (bits 2,3 are 11)
+ // result is not masked or inverted (bits 4,5 are 00)
+ // and corresponds to first matching byte (bit 6 is 0)
+ PCMPESTRI $0x0c, (DI), X1
+ // CX == 16 means no match,
+ // CX > R9 means partial match at the end of the string,
+ // otherwise sep is at offset CX from X1 start
+ CMPQ CX, R9
+ JBE sse42_success
+ ADDQ R9, DI
+ CMPQ DI, SI
+ JB loop_sse42
+ PCMPESTRI $0x0c, -1(SI), X1
+ CMPQ CX, R9
+ JA fail
+ LEAQ -1(SI), DI
+sse42_success:
+ ADDQ CX, DI
+success:
+ SUBQ R10, DI
+ MOVQ DI, (R11)
+ RET
--- /dev/null
+// Copyright 2018 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package bytealg
+
+// Empirical data shows that using IndexShortStr can get better
+// performance when len(s) <= 16.
+const MaxBruteForce = 16
+
+func init() {
+ // 8 bytes can be completely loaded into 1 register.
+ MaxLen = 8
+}
+
+// Cutover reports the number of failures of IndexByte we should tolerate
+// before switching over to IndexShortStr.
+// n is the number of bytes processed so far.
+// See the bytes.Index implementation for details.
+func Cutover(n int) int {
+ // 1 error per 16 characters, plus a few slop to start.
+ return 4 + n>>4
+}
-// Copyright 2017 The Go Authors. All rights reserved.
+// Copyright 2018 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
+#include "go_asm.h"
#include "textflag.h"
-// countByte(s []byte, c byte) int
-TEXT bytes·countByte(SB),NOSPLIT,$0-40
- MOVD s_base+0(FP), R0
- MOVD s_len+8(FP), R2
- MOVBU c+24(FP), R1
- // R11 = count of byte to search
- MOVD $0, R11
- // short path to handle 0-byte case
- CBZ R2, done
- CMP $0x20, R2
- // jump directly to tail if length < 32
- BLO tail
- ANDS $0x1f, R0, R9
- BEQ chunk
- // Work with not 32-byte aligned head
- BIC $0x1f, R0, R3
- ADD $0x20, R3
-head_loop:
- MOVBU.P 1(R0), R5
- CMP R5, R1
- CINC EQ, R11, R11
- SUB $1, R2, R2
- CMP R0, R3
- BNE head_loop
- // Work with 32-byte aligned chunks
-chunk:
- BIC $0x1f, R2, R9
- // The first chunk can also be the last
- CBZ R9, tail
- // R3 = end of 32-byte chunks
- ADD R0, R9, R3
- MOVD $1, R5
- VMOV R5, V5.B16
- // R2 = length of tail
- SUB R9, R2, R2
- // Duplicate R1 (byte to search) to 16 1-byte elements of V0
- VMOV R1, V0.B16
- // Clear the low 64-bit element of V7 and V8
- VEOR V7.B8, V7.B8, V7.B8
- VEOR V8.B8, V8.B8, V8.B8
- // Count the target byte in 32-byte chunk
-chunk_loop:
- VLD1.P (R0), [V1.B16, V2.B16]
- CMP R0, R3
- VCMEQ V0.B16, V1.B16, V3.B16
- VCMEQ V0.B16, V2.B16, V4.B16
- // Clear the higher 7 bits
- VAND V5.B16, V3.B16, V3.B16
- VAND V5.B16, V4.B16, V4.B16
- // Count lanes match the requested byte
- VADDP V4.B16, V3.B16, V6.B16 // 32B->16B
- VUADDLV V6.B16, V7
- // Accumulate the count in low 64-bit element of V8 when inside the loop
- VADD V7, V8
- BNE chunk_loop
- VMOV V8.D[0], R6
- ADD R6, R11, R11
- CBZ R2, done
-tail:
- // Work with tail shorter than 32 bytes
- MOVBU.P 1(R0), R5
- SUB $1, R2, R2
- CMP R5, R1
- CINC EQ, R11, R11
- CBNZ R2, tail
-done:
- MOVD R11, ret+32(FP)
- RET
+TEXT ·Index(SB),NOSPLIT,$0-56
+ MOVD a_base+0(FP), R0
+ MOVD a_len+8(FP), R1
+ MOVD b_base+24(FP), R2
+ MOVD b_len+32(FP), R3
+ MOVD $ret+48(FP), R9
+ B indexbody<>(SB)
+
+TEXT ·IndexString(SB),NOSPLIT,$0-40
+ MOVD a_base+0(FP), R0
+ MOVD a_len+8(FP), R1
+ MOVD b_base+16(FP), R2
+ MOVD b_len+24(FP), R3
+ MOVD $ret+32(FP), R9
+ B indexbody<>(SB)
-// indexShortStr(s, sep []byte) int
-// precondition: 2 <= len(sep) <= 8
-TEXT bytes·indexShortStr(SB),NOSPLIT,$0-56
+// input:
+// R0: haystack
+// R1: length of haystack
+// R2: needle
+// R3: length of needle (2 <= len <= 8)
+// R9: address to put result
+TEXT indexbody<>(SB),NOSPLIT,$0-56
// main idea is to load 'sep' into separate register(s)
// to avoid repeatedly re-load it again and again
	// for subsequent substring comparisons
- MOVD s+0(FP), R0
- MOVD s_len+8(FP), R1
- MOVD sep+24(FP), R2
- MOVD sep_len+32(FP), R3
+ MOVD a_base+0(FP), R0
+ MOVD a_len+8(FP), R1
+ MOVD b_base+24(FP), R2
+ MOVD b_len+32(FP), R3
SUB R3, R1, R4
	// R4 contains the start of last substring for comparison
ADD R0, R4, R4
BLS loop_2
not_found:
MOVD $-1, R0
- MOVD R0, ret+48(FP)
+ MOVD R0, (R9)
RET
found:
SUB R8, R0, R0
- MOVD R0, ret+48(FP)
+ MOVD R0, (R9)
RET
--- /dev/null
+// Copyright 2018 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// +build !amd64,!arm64,!s390x
+
+package bytealg
+
+const MaxBruteForce = 0
+
+// Index returns the index of the first instance of b in a, or -1 if b is not present in a.
+// Requires 2 <= len(b) <= MaxLen.
+func Index(a, b []byte) int {
+ panic("unimplemented")
+}
+
+// IndexString returns the index of the first instance of b in a, or -1 if b is not present in a.
+// Requires 2 <= len(b) <= MaxLen.
+func IndexString(a, b string) int {
+ panic("unimplemented")
+}
+
+// Cutover reports the number of failures of IndexByte we should tolerate
+// before switching over to Index.
+// n is the number of bytes processed so far.
+// See the bytes.Index implementation for details.
+func Cutover(n int) int {
+ panic("unimplemented")
+}
--- /dev/null
+// Copyright 2018 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// +build amd64 arm64 s390x
+
+package bytealg
+
+//go:noescape
+
+// Index returns the index of the first instance of b in a, or -1 if b is not present in a.
+// Requires 2 <= len(b) <= MaxLen.
+func Index(a, b []byte) int
+
+//go:noescape
+
+// IndexString returns the index of the first instance of b in a, or -1 if b is not present in a.
+// Requires 2 <= len(b) <= MaxLen.
+func IndexString(a, b string) int
--- /dev/null
+// Copyright 2018 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package bytealg
+
+import "internal/cpu"
+
+const MaxBruteForce = 64
+
+func init() {
+ // Note: we're kind of lucky that this flag is available at this point.
+ // The runtime sets HasVX when processing auxv records, and that happens
+ // to happen *before* running the init functions of packages that
+ // the runtime depends on.
+ // TODO: it would really be nicer for internal/cpu to figure out this
+ // flag by itself. Then we wouldn't need to depend on quirks of
+ // early startup initialization order.
+ if cpu.S390X.HasVX {
+ MaxLen = 64
+ }
+}
+
+// Cutover reports the number of failures of IndexByte we should tolerate
+// before switching over to Index.
+// n is the number of bytes processed so far.
+// See the bytes.Index implementation for details.
+func Cutover(n int) int {
+ // 1 error per 8 characters, plus a few slop to start.
+ return (n + 16) / 8
+}
--- /dev/null
+// Copyright 2018 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+#include "go_asm.h"
+#include "textflag.h"
+
+// Caller must confirm availability of vx facility before calling.
+TEXT ·Index(SB),NOSPLIT|NOFRAME,$0-56
+ LMG a_base+0(FP), R1, R2 // R1=&s[0], R2=len(s)
+ LMG b_base+24(FP), R3, R4 // R3=&sep[0], R4=len(sep)
+ MOVD $ret+48(FP), R5
+ BR indexbody<>(SB)
+
+// Caller must confirm availability of vx facility before calling.
+TEXT ·IndexString(SB),NOSPLIT|NOFRAME,$0-40
+ LMG a_base+0(FP), R1, R2 // R1=&s[0], R2=len(s)
+ LMG b_base+16(FP), R3, R4 // R3=&sep[0], R4=len(sep)
+ MOVD $ret+32(FP), R5
+ BR indexbody<>(SB)
+
+// s: string we are searching
+// sep: string to search for
+// R1=&s[0], R2=len(s)
+// R3=&sep[0], R4=len(sep)
+// R5=&ret (int)
+// Caller must confirm availability of vx facility before calling.
+TEXT indexbody<>(SB),NOSPLIT|NOFRAME,$0
+ CMPBGT R4, R2, notfound
+ ADD R1, R2
+ SUB R4, R2 // R2=&s[len(s)-len(sep)] (last valid index)
+ CMPBEQ R4, $0, notfound
+ SUB $1, R4 // R4=len(sep)-1 for use as VLL index
+ VLL R4, (R3), V0 // contains first 16 bytes of sep
+ MOVD R1, R7
+index2plus:
+ CMPBNE R4, $1, index3plus
+ MOVD $15(R7), R9
+ CMPBGE R9, R2, index2to16
+ VGBM $0xaaaa, V31 // 0xff00ff00ff00ff00...
+ VONE V16
+ VREPH $0, V0, V1
+ CMPBGE R9, R2, index2to16
+index2loop:
+ VL 0(R7), V2 // 16 bytes, even indices
+ VL 1(R7), V4 // 16 bytes, odd indices
+ VCEQH V1, V2, V5 // compare even indices
+ VCEQH V1, V4, V6 // compare odd indices
+ VSEL V5, V6, V31, V7 // merge even and odd indices
+ VFEEBS V16, V7, V17 // find leftmost index, set condition to 1 if found
+ BLT foundV17
+ MOVD $16(R7), R7 // R7+=16
+ ADD $15, R7, R9
+ CMPBLE R9, R2, index2loop // continue if (R7+15) <= R2 (last index to search)
+ CMPBLE R7, R2, index2to16
+ BR notfound
+
+index3plus:
+ CMPBNE R4, $2, index4plus
+ ADD $15, R7, R9
+ CMPBGE R9, R2, index2to16
+ MOVD $1, R0
+ VGBM $0xaaaa, V31 // 0xff00ff00ff00ff00...
+ VONE V16
+ VREPH $0, V0, V1
+ VREPB $2, V0, V8
+index3loop:
+ VL (R7), V2 // load 16-bytes into V2
+ VLL R0, 16(R7), V3 // load 2-bytes into V3
+ VSLDB $1, V2, V3, V4 // V4=(V2:V3)<<1
+ VSLDB $2, V2, V3, V9 // V9=(V2:V3)<<2
+ VCEQH V1, V2, V5 // compare 2-byte even indices
+ VCEQH V1, V4, V6 // compare 2-byte odd indices
+ VCEQB V8, V9, V10 // compare last bytes
+ VSEL V5, V6, V31, V7 // merge even and odd indices
+ VN V7, V10, V7 // AND indices with last byte
+ VFEEBS V16, V7, V17 // find leftmost index, set condition to 1 if found
+ BLT foundV17
+ MOVD $16(R7), R7 // R7+=16
+ ADD $15, R7, R9
+ CMPBLE R9, R2, index3loop // continue if (R7+15) <= R2 (last index to search)
+ CMPBLE R7, R2, index2to16
+ BR notfound
+
+index4plus:
+ CMPBNE R4, $3, index5plus
+ ADD $15, R7, R9
+ CMPBGE R9, R2, index2to16
+ MOVD $2, R0
+ VGBM $0x8888, V29 // 0xff000000ff000000...
+ VGBM $0x2222, V30 // 0x0000ff000000ff00...
+ VGBM $0xcccc, V31 // 0xffff0000ffff0000...
+ VONE V16
+ VREPF $0, V0, V1
+index4loop:
+ VL (R7), V2 // load 16-bytes into V2
+ VLL R0, 16(R7), V3 // load 3-bytes into V3
+ VSLDB $1, V2, V3, V4 // V4=(V2:V3)<<1
+	VSLDB	$2, V2, V3, V9    // V9=(V2:V3)<<2
+	VSLDB	$3, V2, V3, V10   // V10=(V2:V3)<<3
+ VCEQF V1, V2, V5 // compare index 0, 4, ...
+ VCEQF V1, V4, V6 // compare index 1, 5, ...
+ VCEQF V1, V9, V11 // compare index 2, 6, ...
+ VCEQF V1, V10, V12 // compare index 3, 7, ...
+ VSEL V5, V6, V29, V13 // merge index 0, 1, 4, 5, ...
+ VSEL V11, V12, V30, V14 // merge index 2, 3, 6, 7, ...
+ VSEL V13, V14, V31, V7 // final merge
+ VFEEBS V16, V7, V17 // find leftmost index, set condition to 1 if found
+ BLT foundV17
+ MOVD $16(R7), R7 // R7+=16
+ ADD $15, R7, R9
+ CMPBLE R9, R2, index4loop // continue if (R7+15) <= R2 (last index to search)
+ CMPBLE R7, R2, index2to16
+ BR notfound
+
+index5plus:
+ CMPBGT R4, $15, index17plus
+index2to16:
+ CMPBGT R7, R2, notfound
+ MOVD $1(R7), R8
+ CMPBGT R8, R2, index2to16tail
+index2to16loop:
+ // unrolled 2x
+ VLL R4, (R7), V1
+ VLL R4, 1(R7), V2
+ VCEQGS V0, V1, V3
+ BEQ found
+ MOVD $1(R7), R7
+ VCEQGS V0, V2, V4
+ BEQ found
+ MOVD $1(R7), R7
+ CMPBLT R7, R2, index2to16loop
+ CMPBGT R7, R2, notfound
+index2to16tail:
+ VLL R4, (R7), V1
+ VCEQGS V0, V1, V2
+ BEQ found
+ BR notfound
+
+index17plus:
+ CMPBGT R4, $31, index33plus
+ SUB $16, R4, R0
+ VLL R0, 16(R3), V1
+ VONE V7
+index17to32loop:
+ VL (R7), V2
+ VLL R0, 16(R7), V3
+ VCEQG V0, V2, V4
+ VCEQG V1, V3, V5
+ VN V4, V5, V6
+ VCEQGS V6, V7, V8
+ BEQ found
+ MOVD $1(R7), R7
+ CMPBLE R7, R2, index17to32loop
+ BR notfound
+
+index33plus:
+ CMPBGT R4, $47, index49plus
+ SUB $32, R4, R0
+ VL 16(R3), V1
+ VLL R0, 32(R3), V2
+ VONE V11
+index33to48loop:
+ VL (R7), V3
+ VL 16(R7), V4
+ VLL R0, 32(R7), V5
+ VCEQG V0, V3, V6
+ VCEQG V1, V4, V7
+ VCEQG V2, V5, V8
+ VN V6, V7, V9
+ VN V8, V9, V10
+ VCEQGS V10, V11, V12
+ BEQ found
+ MOVD $1(R7), R7
+ CMPBLE R7, R2, index33to48loop
+ BR notfound
+
+index49plus:
+ CMPBGT R4, $63, index65plus
+ SUB $48, R4, R0
+ VL 16(R3), V1
+ VL 32(R3), V2
+ VLL R0, 48(R3), V3
+ VONE V15
+index49to64loop:
+ VL (R7), V4
+ VL 16(R7), V5
+ VL 32(R7), V6
+ VLL R0, 48(R7), V7
+ VCEQG V0, V4, V8
+ VCEQG V1, V5, V9
+ VCEQG V2, V6, V10
+ VCEQG V3, V7, V11
+ VN V8, V9, V12
+ VN V10, V11, V13
+ VN V12, V13, V14
+ VCEQGS V14, V15, V16
+ BEQ found
+ MOVD $1(R7), R7
+ CMPBLE R7, R2, index49to64loop
+notfound:
+ MOVD $-1, (R5)
+ RET
+
+index65plus:
+ // not implemented
+ MOVD $0, (R0)
+ RET
+
+foundV17: // index is in doubleword V17[0]
+ VLGVG $0, V17, R8
+ ADD R8, R7
+found:
+ SUB R1, R7
+ MOVD R7, (R5)
+ RET
DATA shifts<>+0xf8(SB)/8, $0xff0f0e0d0c0b0a09
GLOBL shifts<>(SB),RODATA,$256
-TEXT strings·indexShortStr(SB),NOSPLIT,$0-40
- MOVQ s+0(FP), DI
- // We want len in DX and AX, because PCMPESTRI implicitly consumes them
- MOVQ s_len+8(FP), DX
- MOVQ c+16(FP), BP
- MOVQ c_len+24(FP), AX
- MOVQ DI, R10
- LEAQ ret+32(FP), R11
- JMP runtime·indexShortStr(SB)
-
-TEXT bytes·indexShortStr(SB),NOSPLIT,$0-56
- MOVQ s+0(FP), DI
- MOVQ s_len+8(FP), DX
- MOVQ c+24(FP), BP
- MOVQ c_len+32(FP), AX
- MOVQ DI, R10
- LEAQ ret+48(FP), R11
- JMP runtime·indexShortStr(SB)
-
-// AX: length of string, that we are searching for
-// DX: length of string, in which we are searching
-// DI: pointer to string, in which we are searching
-// BP: pointer to string, that we are searching for
-// R11: address, where to put return value
-TEXT runtime·indexShortStr(SB),NOSPLIT,$0
- CMPQ AX, DX
- JA fail
- CMPQ DX, $16
- JAE sse42
-no_sse42:
- CMPQ AX, $2
- JA _3_or_more
- MOVW (BP), BP
- LEAQ -1(DI)(DX*1), DX
-loop2:
- MOVW (DI), SI
- CMPW SI,BP
- JZ success
- ADDQ $1,DI
- CMPQ DI,DX
- JB loop2
- JMP fail
-_3_or_more:
- CMPQ AX, $3
- JA _4_or_more
- MOVW 1(BP), BX
- MOVW (BP), BP
- LEAQ -2(DI)(DX*1), DX
-loop3:
- MOVW (DI), SI
- CMPW SI,BP
- JZ partial_success3
- ADDQ $1,DI
- CMPQ DI,DX
- JB loop3
- JMP fail
-partial_success3:
- MOVW 1(DI), SI
- CMPW SI,BX
- JZ success
- ADDQ $1,DI
- CMPQ DI,DX
- JB loop3
- JMP fail
-_4_or_more:
- CMPQ AX, $4
- JA _5_or_more
- MOVL (BP), BP
- LEAQ -3(DI)(DX*1), DX
-loop4:
- MOVL (DI), SI
- CMPL SI,BP
- JZ success
- ADDQ $1,DI
- CMPQ DI,DX
- JB loop4
- JMP fail
-_5_or_more:
- CMPQ AX, $7
- JA _8_or_more
- LEAQ 1(DI)(DX*1), DX
- SUBQ AX, DX
- MOVL -4(BP)(AX*1), BX
- MOVL (BP), BP
-loop5to7:
- MOVL (DI), SI
- CMPL SI,BP
- JZ partial_success5to7
- ADDQ $1,DI
- CMPQ DI,DX
- JB loop5to7
- JMP fail
-partial_success5to7:
- MOVL -4(AX)(DI*1), SI
- CMPL SI,BX
- JZ success
- ADDQ $1,DI
- CMPQ DI,DX
- JB loop5to7
- JMP fail
-_8_or_more:
- CMPQ AX, $8
- JA _9_or_more
- MOVQ (BP), BP
- LEAQ -7(DI)(DX*1), DX
-loop8:
- MOVQ (DI), SI
- CMPQ SI,BP
- JZ success
- ADDQ $1,DI
- CMPQ DI,DX
- JB loop8
- JMP fail
-_9_or_more:
- CMPQ AX, $15
- JA _16_or_more
- LEAQ 1(DI)(DX*1), DX
- SUBQ AX, DX
- MOVQ -8(BP)(AX*1), BX
- MOVQ (BP), BP
-loop9to15:
- MOVQ (DI), SI
- CMPQ SI,BP
- JZ partial_success9to15
- ADDQ $1,DI
- CMPQ DI,DX
- JB loop9to15
- JMP fail
-partial_success9to15:
- MOVQ -8(AX)(DI*1), SI
- CMPQ SI,BX
- JZ success
- ADDQ $1,DI
- CMPQ DI,DX
- JB loop9to15
- JMP fail
-_16_or_more:
- CMPQ AX, $16
- JA _17_or_more
- MOVOU (BP), X1
- LEAQ -15(DI)(DX*1), DX
-loop16:
- MOVOU (DI), X2
- PCMPEQB X1, X2
- PMOVMSKB X2, SI
- CMPQ SI, $0xffff
- JE success
- ADDQ $1,DI
- CMPQ DI,DX
- JB loop16
- JMP fail
-_17_or_more:
- CMPQ AX, $31
- JA _32_or_more
- LEAQ 1(DI)(DX*1), DX
- SUBQ AX, DX
- MOVOU -16(BP)(AX*1), X0
- MOVOU (BP), X1
-loop17to31:
- MOVOU (DI), X2
- PCMPEQB X1,X2
- PMOVMSKB X2, SI
- CMPQ SI, $0xffff
- JE partial_success17to31
- ADDQ $1,DI
- CMPQ DI,DX
- JB loop17to31
- JMP fail
-partial_success17to31:
- MOVOU -16(AX)(DI*1), X3
- PCMPEQB X0, X3
- PMOVMSKB X3, SI
- CMPQ SI, $0xffff
- JE success
- ADDQ $1,DI
- CMPQ DI,DX
- JB loop17to31
- JMP fail
-// We can get here only when AVX2 is enabled and cutoff for indexShortStr is set to 63
-// So no need to check cpuid
-_32_or_more:
- CMPQ AX, $32
- JA _33_to_63
- VMOVDQU (BP), Y1
- LEAQ -31(DI)(DX*1), DX
-loop32:
- VMOVDQU (DI), Y2
- VPCMPEQB Y1, Y2, Y3
- VPMOVMSKB Y3, SI
- CMPL SI, $0xffffffff
- JE success_avx2
- ADDQ $1,DI
- CMPQ DI,DX
- JB loop32
- JMP fail_avx2
-_33_to_63:
- LEAQ 1(DI)(DX*1), DX
- SUBQ AX, DX
- VMOVDQU -32(BP)(AX*1), Y0
- VMOVDQU (BP), Y1
-loop33to63:
- VMOVDQU (DI), Y2
- VPCMPEQB Y1, Y2, Y3
- VPMOVMSKB Y3, SI
- CMPL SI, $0xffffffff
- JE partial_success33to63
- ADDQ $1,DI
- CMPQ DI,DX
- JB loop33to63
- JMP fail_avx2
-partial_success33to63:
- VMOVDQU -32(AX)(DI*1), Y3
- VPCMPEQB Y0, Y3, Y4
- VPMOVMSKB Y4, SI
- CMPL SI, $0xffffffff
- JE success_avx2
- ADDQ $1,DI
- CMPQ DI,DX
- JB loop33to63
-fail_avx2:
- VZEROUPPER
-fail:
- MOVQ $-1, (R11)
- RET
-success_avx2:
- VZEROUPPER
- JMP success
-sse42:
- CMPB runtime·support_sse42(SB), $1
- JNE no_sse42
- CMPQ AX, $12
- // PCMPESTRI is slower than normal compare,
- // so using it makes sense only if we advance 4+ bytes per compare
- // This value was determined experimentally and is the ~same
- // on Nehalem (first with SSE42) and Haswell.
- JAE _9_or_more
- LEAQ 16(BP), SI
- TESTW $0xff0, SI
- JEQ no_sse42
- MOVOU (BP), X1
- LEAQ -15(DI)(DX*1), SI
- MOVQ $16, R9
- SUBQ AX, R9 // We advance by 16-len(sep) each iteration, so precalculate it into R9
-loop_sse42:
- // 0x0c means: unsigned byte compare (bits 0,1 are 00)
- // for equality (bits 2,3 are 11)
- // result is not masked or inverted (bits 4,5 are 00)
- // and corresponds to first matching byte (bit 6 is 0)
- PCMPESTRI $0x0c, (DI), X1
- // CX == 16 means no match,
- // CX > R9 means partial match at the end of the string,
- // otherwise sep is at offset CX from X1 start
- CMPQ CX, R9
- JBE sse42_success
- ADDQ R9, DI
- CMPQ DI, SI
- JB loop_sse42
- PCMPESTRI $0x0c, -1(SI), X1
- CMPQ CX, R9
- JA fail
- LEAQ -1(SI), DI
-sse42_success:
- ADDQ CX, DI
-success:
- SUBQ R10, DI
- MOVQ DI, (R11)
- RET
-
TEXT runtime·return0(SB), NOSPLIT, $0
MOVL $0, AX
RET
// compile barrier.
RET
-// func supportsVX() bool
-TEXT strings·supportsVX(SB),NOSPLIT,$0-1
- MOVBZ runtime·cpu+facilities_hasVX(SB), R0
- MOVB R0, ret+0(FP)
- RET
-
-// func supportsVX() bool
-TEXT bytes·supportsVX(SB),NOSPLIT,$0-1
- MOVBZ runtime·cpu+facilities_hasVX(SB), R0
- MOVB R0, ret+0(FP)
- RET
-
-// func indexShortStr(s, sep string) int
-// Caller must confirm availability of vx facility before calling.
-TEXT strings·indexShortStr(SB),NOSPLIT|NOFRAME,$0-40
- LMG s+0(FP), R1, R2 // R1=&s[0], R2=len(s)
- LMG sep+16(FP), R3, R4 // R3=&sep[0], R4=len(sep)
- MOVD $ret+32(FP), R5
- BR runtime·indexShortStr(SB)
-
-// func indexShortStr(s, sep []byte) int
-// Caller must confirm availability of vx facility before calling.
-TEXT bytes·indexShortStr(SB),NOSPLIT|NOFRAME,$0-56
- LMG s+0(FP), R1, R2 // R1=&s[0], R2=len(s)
- LMG sep+24(FP), R3, R4 // R3=&sep[0], R4=len(sep)
- MOVD $ret+48(FP), R5
- BR runtime·indexShortStr(SB)
-
-// s: string we are searching
-// sep: string to search for
-// R1=&s[0], R2=len(s)
-// R3=&sep[0], R4=len(sep)
-// R5=&ret (int)
-// Caller must confirm availability of vx facility before calling.
-TEXT runtime·indexShortStr(SB),NOSPLIT|NOFRAME,$0
- CMPBGT R4, R2, notfound
- ADD R1, R2
- SUB R4, R2 // R2=&s[len(s)-len(sep)] (last valid index)
- CMPBEQ R4, $0, notfound
- SUB $1, R4 // R4=len(sep)-1 for use as VLL index
- VLL R4, (R3), V0 // contains first 16 bytes of sep
- MOVD R1, R7
-index2plus:
- CMPBNE R4, $1, index3plus
- MOVD $15(R7), R9
- CMPBGE R9, R2, index2to16
- VGBM $0xaaaa, V31 // 0xff00ff00ff00ff00...
- VONE V16
- VREPH $0, V0, V1
- CMPBGE R9, R2, index2to16
-index2loop:
- VL 0(R7), V2 // 16 bytes, even indices
- VL 1(R7), V4 // 16 bytes, odd indices
- VCEQH V1, V2, V5 // compare even indices
- VCEQH V1, V4, V6 // compare odd indices
- VSEL V5, V6, V31, V7 // merge even and odd indices
- VFEEBS V16, V7, V17 // find leftmost index, set condition to 1 if found
- BLT foundV17
- MOVD $16(R7), R7 // R7+=16
- ADD $15, R7, R9
- CMPBLE R9, R2, index2loop // continue if (R7+15) <= R2 (last index to search)
- CMPBLE R7, R2, index2to16
- BR notfound
-
-index3plus:
- CMPBNE R4, $2, index4plus
- ADD $15, R7, R9
- CMPBGE R9, R2, index2to16
- MOVD $1, R0
- VGBM $0xaaaa, V31 // 0xff00ff00ff00ff00...
- VONE V16
- VREPH $0, V0, V1
- VREPB $2, V0, V8
-index3loop:
- VL (R7), V2 // load 16-bytes into V2
- VLL R0, 16(R7), V3 // load 2-bytes into V3
- VSLDB $1, V2, V3, V4 // V4=(V2:V3)<<1
- VSLDB $2, V2, V3, V9 // V9=(V2:V3)<<2
- VCEQH V1, V2, V5 // compare 2-byte even indices
- VCEQH V1, V4, V6 // compare 2-byte odd indices
- VCEQB V8, V9, V10 // compare last bytes
- VSEL V5, V6, V31, V7 // merge even and odd indices
- VN V7, V10, V7 // AND indices with last byte
- VFEEBS V16, V7, V17 // find leftmost index, set condition to 1 if found
- BLT foundV17
- MOVD $16(R7), R7 // R7+=16
- ADD $15, R7, R9
- CMPBLE R9, R2, index3loop // continue if (R7+15) <= R2 (last index to search)
- CMPBLE R7, R2, index2to16
- BR notfound
-
-index4plus:
- CMPBNE R4, $3, index5plus
- ADD $15, R7, R9
- CMPBGE R9, R2, index2to16
- MOVD $2, R0
- VGBM $0x8888, V29 // 0xff000000ff000000...
- VGBM $0x2222, V30 // 0x0000ff000000ff00...
- VGBM $0xcccc, V31 // 0xffff0000ffff0000...
- VONE V16
- VREPF $0, V0, V1
-index4loop:
- VL (R7), V2 // load 16-bytes into V2
- VLL R0, 16(R7), V3 // load 3-bytes into V3
- VSLDB $1, V2, V3, V4 // V4=(V2:V3)<<1
- VSLDB $2, V2, V3, V9 // V9=(V2:V3)<<1
- VSLDB $3, V2, V3, V10 // V10=(V2:V3)<<1
- VCEQF V1, V2, V5 // compare index 0, 4, ...
- VCEQF V1, V4, V6 // compare index 1, 5, ...
- VCEQF V1, V9, V11 // compare index 2, 6, ...
- VCEQF V1, V10, V12 // compare index 3, 7, ...
- VSEL V5, V6, V29, V13 // merge index 0, 1, 4, 5, ...
- VSEL V11, V12, V30, V14 // merge index 2, 3, 6, 7, ...
- VSEL V13, V14, V31, V7 // final merge
- VFEEBS V16, V7, V17 // find leftmost index, set condition to 1 if found
- BLT foundV17
- MOVD $16(R7), R7 // R7+=16
- ADD $15, R7, R9
- CMPBLE R9, R2, index4loop // continue if (R7+15) <= R2 (last index to search)
- CMPBLE R7, R2, index2to16
- BR notfound
-
-index5plus:
- CMPBGT R4, $15, index17plus
-index2to16:
- CMPBGT R7, R2, notfound
- MOVD $1(R7), R8
- CMPBGT R8, R2, index2to16tail
-index2to16loop:
- // unrolled 2x
- VLL R4, (R7), V1
- VLL R4, 1(R7), V2
- VCEQGS V0, V1, V3
- BEQ found
- MOVD $1(R7), R7
- VCEQGS V0, V2, V4
- BEQ found
- MOVD $1(R7), R7
- CMPBLT R7, R2, index2to16loop
- CMPBGT R7, R2, notfound
-index2to16tail:
- VLL R4, (R7), V1
- VCEQGS V0, V1, V2
- BEQ found
- BR notfound
-
-index17plus:
- CMPBGT R4, $31, index33plus
- SUB $16, R4, R0
- VLL R0, 16(R3), V1
- VONE V7
-index17to32loop:
- VL (R7), V2
- VLL R0, 16(R7), V3
- VCEQG V0, V2, V4
- VCEQG V1, V3, V5
- VN V4, V5, V6
- VCEQGS V6, V7, V8
- BEQ found
- MOVD $1(R7), R7
- CMPBLE R7, R2, index17to32loop
- BR notfound
-
-index33plus:
- CMPBGT R4, $47, index49plus
- SUB $32, R4, R0
- VL 16(R3), V1
- VLL R0, 32(R3), V2
- VONE V11
-index33to48loop:
- VL (R7), V3
- VL 16(R7), V4
- VLL R0, 32(R7), V5
- VCEQG V0, V3, V6
- VCEQG V1, V4, V7
- VCEQG V2, V5, V8
- VN V6, V7, V9
- VN V8, V9, V10
- VCEQGS V10, V11, V12
- BEQ found
- MOVD $1(R7), R7
- CMPBLE R7, R2, index33to48loop
- BR notfound
-
-index49plus:
- CMPBGT R4, $63, index65plus
- SUB $48, R4, R0
- VL 16(R3), V1
- VL 32(R3), V2
- VLL R0, 48(R3), V3
- VONE V15
-index49to64loop:
- VL (R7), V4
- VL 16(R7), V5
- VL 32(R7), V6
- VLL R0, 48(R7), V7
- VCEQG V0, V4, V8
- VCEQG V1, V5, V9
- VCEQG V2, V6, V10
- VCEQG V3, V7, V11
- VN V8, V9, V12
- VN V10, V11, V13
- VN V12, V13, V14
- VCEQGS V14, V15, V16
- BEQ found
- MOVD $1(R7), R7
- CMPBLE R7, R2, index49to64loop
-notfound:
- MOVD $-1, (R5)
- RET
-
-index65plus:
- // not implemented
- MOVD $0, (R0)
- RET
-
-foundV17: // index is in doubleword V17[0]
- VLGVG $0, V17, R8
- ADD R8, R7
-found:
- SUB R1, R7
- MOVD R7, (R5)
- RET
-
// This is called from .init_array and follows the platform, not Go, ABI.
// We are overly conservative. We could only save the registers we use.
// However, since this function is only called once per loaded module
package runtime
-import (
- internalcpu "internal/cpu"
- "runtime/internal/sys"
-)
+import "internal/cpu"
const (
// bit masks taken from bits/hwcap.h
_HWCAP_S390_VX = 2048 // vector facility
)
-// facilities is padded to avoid false sharing.
-type facilities struct {
- _ [sys.CacheLineSize]byte
- hasVX bool // vector facility
- _ [sys.CacheLineSize]byte
-}
-
-// cpu indicates the availability of s390x facilities that can be used in
-// Go assembly but are optional on models supported by Go.
-// TODO: remove this once we're only using internal/cpu.
-var cpu facilities
-
func archauxv(tag, val uintptr) {
switch tag {
case _AT_HWCAP: // CPU capability bit flags
- internalcpu.S390X.HasVX = val&_HWCAP_S390_VX != 0
- cpu.hasVX = val&_HWCAP_S390_VX != 0
+ cpu.S390X.HasVX = val&_HWCAP_S390_VX != 0
}
}
return s == t
}
+// Index returns the index of the first instance of substr in s, or -1 if substr is not present in s.
+func Index(s, substr string) int {
+ n := len(substr)
+ switch {
+ case n == 0:
+ return 0
+ case n == 1:
+ return IndexByte(s, substr[0])
+ case n == len(s):
+ if substr == s {
+ return 0
+ }
+ return -1
+ case n > len(s):
+ return -1
+ case n <= bytealg.MaxLen:
+ // Use brute force when s and substr both are small
+ if len(s) <= bytealg.MaxBruteForce {
+ return bytealg.IndexString(s, substr)
+ }
+ c := substr[0]
+ i := 0
+ t := s[:len(s)-n+1]
+ fails := 0
+ for i < len(t) {
+ if t[i] != c {
+ // IndexByte is faster than bytealg.IndexString, so use it as long as
+ // we're not getting lots of false positives.
+ o := IndexByte(t[i:], c)
+ if o < 0 {
+ return -1
+ }
+ i += o
+ }
+ if s[i:i+n] == substr {
+ return i
+ }
+ fails++
+ i++
+ // Switch to bytealg.IndexString when IndexByte produces too many false positives.
+ if fails > bytealg.Cutover(i) {
+ r := bytealg.IndexString(s[i:], substr)
+ if r >= 0 {
+ return r + i
+ }
+ return -1
+ }
+ }
+ return -1
+ }
+ c := substr[0]
+ i := 0
+ t := s[:len(s)-n+1]
+ fails := 0
+ for i < len(t) {
+ if t[i] != c {
+ o := IndexByte(t[i:], c)
+ if o < 0 {
+ return -1
+ }
+ i += o
+ }
+ if s[i:i+n] == substr {
+ return i
+ }
+ i++
+ fails++
+ if fails >= 4+i>>4 && i < len(t) {
+ // See comment in ../bytes/bytes_generic.go.
+ j := indexRabinKarp(s[i:], substr)
+ if j < 0 {
+ return -1
+ }
+ return i + j
+ }
+ }
+ return -1
+}
+
func indexRabinKarp(s, substr string) int {
// Rabin-Karp search
hashss, pow := hashStr(substr)
+++ /dev/null
-// Copyright 2015 The Go Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-package strings
-
-import "internal/cpu"
-
-//go:noescape
-
-// indexShortStr returns the index of the first instance of c in s, or -1 if c is not present in s.
-// indexShortStr requires 2 <= len(c) <= shortStringLen
-func indexShortStr(s, c string) int // ../runtime/asm_amd64.s
-func countByte(s string, c byte) int // ../runtime/asm_amd64.s
-
-var shortStringLen int
-
-func init() {
- if cpu.X86.HasAVX2 {
- shortStringLen = 63
- } else {
- shortStringLen = 31
- }
-}
-
-// Index returns the index of the first instance of substr in s, or -1 if substr is not present in s.
-func Index(s, substr string) int {
- n := len(substr)
- switch {
- case n == 0:
- return 0
- case n == 1:
- return IndexByte(s, substr[0])
- case n == len(s):
- if substr == s {
- return 0
- }
- return -1
- case n > len(s):
- return -1
- case n <= shortStringLen:
- // Use brute force when s and substr both are small
- if len(s) <= 64 {
- return indexShortStr(s, substr)
- }
- c := substr[0]
- i := 0
- t := s[:len(s)-n+1]
- fails := 0
- for i < len(t) {
- if t[i] != c {
- // IndexByte skips 16/32 bytes per iteration,
- // so it's faster than indexShortStr.
- o := IndexByte(t[i:], c)
- if o < 0 {
- return -1
- }
- i += o
- }
- if s[i:i+n] == substr {
- return i
- }
- fails++
- i++
- // Switch to indexShortStr when IndexByte produces too many false positives.
- // Too many means more that 1 error per 8 characters.
- // Allow some errors in the beginning.
- if fails > (i+16)/8 {
- r := indexShortStr(s[i:], substr)
- if r >= 0 {
- return r + i
- }
- return -1
- }
- }
- return -1
- }
- return indexRabinKarp(s, substr)
-}
+++ /dev/null
-// Copyright 2015 The Go Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-// +build !amd64,!s390x
-
-package strings
-
-// TODO: implements short string optimization on non amd64 platforms
-// and get rid of strings_amd64.go
-
-// Index returns the index of the first instance of substr in s, or -1 if substr is not present in s.
-func Index(s, substr string) int {
- n := len(substr)
- switch {
- case n == 0:
- return 0
- case n == 1:
- return IndexByte(s, substr[0])
- case n == len(s):
- if substr == s {
- return 0
- }
- return -1
- case n > len(s):
- return -1
- }
- c := substr[0]
- i := 0
- t := s[:len(s)-n+1]
- fails := 0
- for i < len(t) {
- if t[i] != c {
- o := IndexByte(t[i:], c)
- if o < 0 {
- return -1
- }
- i += o
- }
- if s[i:i+n] == substr {
- return i
- }
- i++
- fails++
- if fails >= 4+i>>4 && i < len(t) {
- // See comment in ../bytes/bytes_generic.go.
- j := indexRabinKarp(s[i:], substr)
- if j < 0 {
- return -1
- }
- return i + j
- }
- }
- return -1
-}
+++ /dev/null
-// Copyright 2016 The Go Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-package strings
-
-//go:noescape
-
-// indexShortStr returns the index of the first instance of sep in s,
-// or -1 if sep is not present in s.
-// indexShortStr requires 2 <= len(sep) <= shortStringLen
-func indexShortStr(s, sep string) int // ../runtime/asm_$GOARCH.s
-
-// supportsVX reports whether the vector facility is available.
-// indexShortStr must not be called if the vector facility is not
-// available.
-func supportsVX() bool // ../runtime/asm_s390x.s
-
-var shortStringLen = -1
-
-func init() {
- if supportsVX() {
- shortStringLen = 64
- }
-}
-
-// Index returns the index of the first instance of substr in s, or -1 if substr is not present in s.
-func Index(s, substr string) int {
- n := len(substr)
- switch {
- case n == 0:
- return 0
- case n == 1:
- return IndexByte(s, substr[0])
- case n == len(s):
- if substr == s {
- return 0
- }
- return -1
- case n > len(s):
- return -1
- case n <= shortStringLen:
- // Use brute force when s and substr both are small
- if len(s) <= 64 {
- return indexShortStr(s, substr)
- }
- c := substr[0]
- i := 0
- t := s[:len(s)-n+1]
- fails := 0
- for i < len(t) {
- if t[i] != c {
- // IndexByte skips 16/32 bytes per iteration,
- // so it's faster than indexShortStr.
- o := IndexByte(t[i:], c)
- if o < 0 {
- return -1
- }
- i += o
- }
- if s[i:i+n] == substr {
- return i
- }
- fails++
- i++
- // Switch to indexShortStr when IndexByte produces too many false positives.
- // Too many means more that 1 error per 8 characters.
- // Allow some errors in the beginning.
- if fails > (i+16)/8 {
- r := indexShortStr(s[i:], substr)
- if r >= 0 {
- return r + i
- }
- return -1
- }
- }
- return -1
- }
- return indexRabinKarp(s, substr)
-}