bytes,strings: in generic Index, use mix of IndexByte and Rabin-Karp

author Keith Randall <khr@golang.org>
Sat, 4 Nov 2017 17:19:53 +0000 (10:19 -0700)
committer Keith Randall <khr@golang.org>
Wed, 15 Nov 2017 17:35:09 +0000 (17:35 +0000)
diff --git a/src/bytes/bytes.go b/src/bytes/bytes.go

index 68ed8e1b437fe898816c2e153450bbd1cf4c997f..260f32500a0fc2f29c15b2928daefa1cf3acb84b 100644 (file)
--- a/src/bytes/bytes.go
+++ b/src/bytes/bytes.go
@@ -815,3 +815,46 @@ func EqualFold(s, t []byte) bool {
         // One string is empty. Are both?
         return len(s) == len(t)
  }
+
+func indexRabinKarp(s, sep []byte) int {
+       // Rabin-Karp search: returns the index of the first instance of sep in s, or -1 if there is none. The caller must ensure len(s) >= len(sep), because the first loop indexes s[0..n-1] without a length check.
+       hashsep, pow := hashStr(sep) // hash of sep, and the factor used below to cancel the byte that leaves the window
+       n := len(sep)
+       var h uint32
+       for i := 0; i < n; i++ {
+               h = h*primeRK + uint32(s[i]) // builds the hash of the first window, s[:n]
+       }
+       if h == hashsep && Equal(s[:n], sep) { // equal hashes may be a collision, so confirm with Equal
+               return 0
+       }
+       for i := n; i < len(s); {
+               h *= primeRK // roll the window one byte forward: scale, add s[i], then cancel s[i-n]
+               h += uint32(s[i])
+               h -= pow * uint32(s[i-n])
+               i++ // the window just hashed is now s[i-n:i]
+               if h == hashsep && Equal(s[i-n:i], sep) {
+                       return i - n
+               }
+       }
+       return -1
+}
+
+// primeRK is the prime base used in the Rabin-Karp algorithm: hashStr and indexRabinKarp multiply the running hash by it once per byte.
+const primeRK = 16777619
+
+// hashStr returns the Rabin-Karp hash of sep and the multiplicative factor
+// primeRK**len(sep), both in wrapping uint32 arithmetic; indexRabinKarp uses the factor to cancel the byte leaving its window.
+func hashStr(sep []byte) (uint32, uint32) {
+       hash := uint32(0)
+       for i := 0; i < len(sep); i++ {
+               hash = hash*primeRK + uint32(sep[i])
+       }
+       var pow, sq uint32 = 1, primeRK // exponentiation by squaring: sq takes the values primeRK**1, **2, **4, ...
+       for i := len(sep); i > 0; i >>= 1 { // one iteration per bit of len(sep), lowest bit first
+               if i&1 != 0 {
+                       pow *= sq // this bit of the exponent is set, so fold the current square into the result
+               }
+               sq *= sq
+       }
+       return hash, pow
+}
diff --git a/src/bytes/bytes_amd64.go b/src/bytes/bytes_amd64.go

index 88b0564db4c8cd1498e820d54b3bb2999078773f..0c9d613ef9d2aa06b911c381d1b29610698f4762 100644 (file)
--- a/src/bytes/bytes_amd64.go
+++ b/src/bytes/bytes_amd64.go
@@ -75,25 +75,7 @@ func Index(s, sep []byte) int {
                 }
                 return -1
         }
-       // Rabin-Karp search
-       hashsep, pow := hashStr(sep)
-       var h uint32
-       for i := 0; i < n; i++ {
-               h = h*primeRK + uint32(s[i])
-       }
-       if h == hashsep && Equal(s[:n], sep) {
-               return 0
-       }
-       for i := n; i < len(s); {
-               h *= primeRK
-               h += uint32(s[i])
-               h -= pow * uint32(s[i-n])
-               i++
-               if h == hashsep && Equal(s[i-n:i], sep) {
-                       return i - n
-               }
-       }
-       return -1
+       return indexRabinKarp(s, sep)
  }
  
  // Count counts the number of non-overlapping instances of sep in s.
@@ -104,23 +86,3 @@ func Count(s, sep []byte) int {
         }
         return countGeneric(s, sep)
  }
-
-// primeRK is the prime base used in Rabin-Karp algorithm.
-const primeRK = 16777619
-
-// hashStr returns the hash and the appropriate multiplicative
-// factor for use in Rabin-Karp algorithm.
-func hashStr(sep []byte) (uint32, uint32) {
-       hash := uint32(0)
-       for i := 0; i < len(sep); i++ {
-               hash = hash*primeRK + uint32(sep[i])
-       }
-       var pow, sq uint32 = 1, primeRK
-       for i := len(sep); i > 0; i >>= 1 {
-               if i&1 != 0 {
-                       pow *= sq
-               }
-               sq *= sq
-       }
-       return hash, pow
-}
diff --git a/src/bytes/bytes_generic.go b/src/bytes/bytes_generic.go

index 32abd3b33ff7d19173e61a60063c0fef0bbaa7ba..b30e53bf2e73c77ac5349516a2cd8643e285c779 100644 (file)
--- a/src/bytes/bytes_generic.go
+++ b/src/bytes/bytes_generic.go
@@ -6,23 +6,25 @@
  
  package bytes
  
-// TODO: implements short string optimization on non amd64 platforms
-// and get rid of bytes_amd64.go
-
  // Index returns the index of the first instance of sep in s, or -1 if sep is not present in s.
  func Index(s, sep []byte) int {
         n := len(sep)
-       if n == 0 {
+       switch {
+       case n == 0:
                 return 0
-       }
-       if n > len(s) {
+       case n == 1:
+               return IndexByte(s, sep[0])
+       case n == len(s):
+               if Equal(sep, s) {
+                       return 0
+               }
+               return -1
+       case n > len(s):
                 return -1
         }
         c := sep[0]
-       if n == 1 {
-               return IndexByte(s, c)
-       }
         i := 0
+       fails := 0
         t := s[:len(s)-n+1]
         for i < len(t) {
                 if t[i] != c {
@@ -36,6 +38,22 @@ func Index(s, sep []byte) int {
                         return i
                 }
                 i++
+               fails++
+               if fails >= 4+i>>4 && i < len(t) {
+                       // Give up on IndexByte, it isn't skipping ahead
+                       // far enough to be better than Rabin-Karp.
+                       // Experiments (using IndexPeriodic) suggest
+                       // the cutover is about 16 byte skips.
+                       // TODO: if large prefixes of sep are matching
+                       // we should cutover at even larger average skips,
+                       // because Equal becomes that much more expensive.
+                       // This code does not take that effect into account.
+                       j := indexRabinKarp(s[i:], sep)
+                       if j < 0 {
+                               return -1
+                       }
+                       return i + j
+               }
         }
         return -1
  }
diff --git a/src/bytes/bytes_s390x.go b/src/bytes/bytes_s390x.go

index e25ca4b84e8f31fb4ddb8121aaf6a8a304624f9f..c59b891292f66e2d8bb80fdf002d5fe3fe6ff23c 100644 (file)
--- a/src/bytes/bytes_s390x.go
+++ b/src/bytes/bytes_s390x.go
@@ -76,25 +76,7 @@ func Index(s, sep []byte) int {
                 }
                 return -1
         }
-       // Rabin-Karp search
-       hashsep, pow := hashStr(sep)
-       var h uint32
-       for i := 0; i < n; i++ {
-               h = h*primeRK + uint32(s[i])
-       }
-       if h == hashsep && Equal(s[:n], sep) {
-               return 0
-       }
-       for i := n; i < len(s); {
-               h *= primeRK
-               h += uint32(s[i])
-               h -= pow * uint32(s[i-n])
-               i++
-               if h == hashsep && Equal(s[i-n:i], sep) {
-                       return i - n
-               }
-       }
-       return -1
+       return indexRabinKarp(s, sep)
  }
  
  // Count counts the number of non-overlapping instances of sep in s.
@@ -102,23 +84,3 @@ func Index(s, sep []byte) int {
  func Count(s, sep []byte) int {
         return countGeneric(s, sep)
  }
-
-// primeRK is the prime base used in Rabin-Karp algorithm.
-const primeRK = 16777619
-
-// hashStr returns the hash and the appropriate multiplicative
-// factor for use in Rabin-Karp algorithm.
-func hashStr(sep []byte) (uint32, uint32) {
-       hash := uint32(0)
-       for i := 0; i < len(sep); i++ {
-               hash = hash*primeRK + uint32(sep[i])
-       }
-       var pow, sq uint32 = 1, primeRK
-       for i := len(sep); i > 0; i >>= 1 {
-               if i&1 != 0 {
-                       pow *= sq
-               }
-               sq *= sq
-       }
-       return hash, pow
-}
diff --git a/src/bytes/bytes_test.go b/src/bytes/bytes_test.go

index 78eca2064a83d7dbd942cbabefd87a3ebb76b6cf..1e56571c738ec6f346cac540596b92c46d26785e 100644 (file)
--- a/src/bytes/bytes_test.go
+++ b/src/bytes/bytes_test.go
@@ -139,6 +139,9 @@ var indexTests = []BinOpTest{
         {"barfoobarfooyyyzzzyyyzzzyyyzzzyyyxxxzzzyyy", "x", 33},
         {"foofyfoobarfoobar", "y", 4},
         {"oooooooooooooooooooooo", "r", -1},
+       // test fallback to Rabin-Karp.
+       {"oxoxoxoxoxoxoxoxoxoxoxoy", "oy", 22},
+       {"oxoxoxoxoxoxoxoxoxoxoxox", "oy", -1},
  }
  
  var lastIndexTests = []BinOpTest{
@@ -1730,3 +1733,18 @@ func BenchmarkTrimASCII(b *testing.B) {
                 }
         }
  }
+
+func BenchmarkIndexPeriodic(b *testing.B) { // benchmarks Index on buffers where key[0] recurs every skip bytes but key itself never occurs
+       key := []byte{1, 1}
+       for _, skip := range [...]int{2, 4, 8, 16, 32, 64} {
+               b.Run(fmt.Sprintf("IndexPeriodic%d", skip), func(b *testing.B) {
+                       buf := make([]byte, 1<<16) // all zero, so only the bytes set below can match key[0]
+                       for i := 0; i < len(buf); i += skip {
+                               buf[i] = 1 // skip >= 2, so no two 1 bytes are adjacent and key is never found
+                       }
+                       for i := 0; i < b.N; i++ {
+                               Index(buf, key)
+                       }
+               })
+       }
+}
diff --git a/src/strings/strings.go b/src/strings/strings.go

index 8520f8a732d91ceabcb437129a79c4bd6804fabc..c66c248c023f2732bf0e8d960cce1bee26f0566f 100644 (file)
--- a/src/strings/strings.go
+++ b/src/strings/strings.go
@@ -918,3 +918,27 @@ func EqualFold(s, t string) bool {
         // One string is empty. Are both?
         return s == t
  }
+
+func indexRabinKarp(s, substr string) int {
+       // Rabin-Karp search: returns the index of the first instance of substr in s, or -1 if there is none. The caller must ensure len(s) >= len(substr), because the first loop indexes s[0..n-1] without a length check.
+       hashss, pow := hashStr(substr) // hashStr is defined outside this hunk; presumably the hash of substr and the factor that cancels the outgoing byte
+       n := len(substr)
+       var h uint32
+       for i := 0; i < n; i++ {
+               h = h*primeRK + uint32(s[i]) // builds the hash of the first window, s[:n]
+       }
+       if h == hashss && s[:n] == substr { // equal hashes may be a collision, so confirm by comparing the strings
+               return 0
+       }
+       for i := n; i < len(s); {
+               h *= primeRK // roll the window one byte forward: scale, add s[i], then cancel s[i-n]
+               h += uint32(s[i])
+               h -= pow * uint32(s[i-n])
+               i++ // the window just hashed is now s[i-n:i]
+               if h == hashss && s[i-n:i] == substr {
+                       return i - n
+               }
+       }
+       return -1
+
+}
diff --git a/src/strings/strings_amd64.go b/src/strings/strings_amd64.go

index a9c01bbf7f9004109c9e7ec3707a496b3646a8a5..68a1d0125c1efc3e1b6a6d8b7e59dfe25a521570 100644 (file)
--- a/src/strings/strings_amd64.go
+++ b/src/strings/strings_amd64.go
@@ -75,25 +75,7 @@ func Index(s, substr string) int {
                 }
                 return -1
         }
-       // Rabin-Karp search
-       hashss, pow := hashStr(substr)
-       var h uint32
-       for i := 0; i < n; i++ {
-               h = h*primeRK + uint32(s[i])
-       }
-       if h == hashss && s[:n] == substr {
-               return 0
-       }
-       for i := n; i < len(s); {
-               h *= primeRK
-               h += uint32(s[i])
-               h -= pow * uint32(s[i-n])
-               i++
-               if h == hashss && s[i-n:i] == substr {
-                       return i - n
-               }
-       }
-       return -1
+       return indexRabinKarp(s, substr)
  }
  
  // Count counts the number of non-overlapping instances of substr in s.
diff --git a/src/strings/strings_generic.go b/src/strings/strings_generic.go

index 5429a74a22fad4101b6752cd64707cae4742156a..b2af48bec85ed1971d4538053198e229c7989c62 100644 (file)
--- a/src/strings/strings_generic.go
+++ b/src/strings/strings_generic.go
@@ -25,22 +25,30 @@ func Index(s, substr string) int {
         case n > len(s):
                 return -1
         }
-       // Rabin-Karp search
-       hashss, pow := hashStr(substr)
-       var h uint32
-       for i := 0; i < n; i++ {
-               h = h*primeRK + uint32(s[i])
-       }
-       if h == hashss && s[:n] == substr {
-               return 0
-       }
-       for i := n; i < len(s); {
-               h *= primeRK
-               h += uint32(s[i])
-               h -= pow * uint32(s[i-n])
+       c := substr[0]
+       i := 0
+       t := s[:len(s)-n+1]
+       fails := 0
+       for i < len(t) {
+               if t[i] != c {
+                       o := IndexByte(t[i:], c)
+                       if o < 0 {
+                               return -1
+                       }
+                       i += o
+               }
+               if s[i:i+n] == substr {
+                       return i
+               }
                 i++
-               if h == hashss && s[i-n:i] == substr {
-                       return i - n
+               fails++
+               if fails >= 4+i>>4 && i < len(t) {
+                       // See comment in ../bytes/bytes_generic.go.
+                       j := indexRabinKarp(s[i:], substr)
+                       if j < 0 {
+                               return -1
+                       }
+                       return i + j
                 }
         }
         return -1
diff --git a/src/strings/strings_s390x.go b/src/strings/strings_s390x.go

index ccf2da632d7a27a379ac1b05aa915048553af4a5..67c8e1700d180385dbb5f40b9b02ce8b977318eb 100644 (file)
--- a/src/strings/strings_s390x.go
+++ b/src/strings/strings_s390x.go
@@ -76,25 +76,7 @@ func Index(s, substr string) int {
                 }
                 return -1
         }
-       // Rabin-Karp search
-       hashss, pow := hashStr(substr)
-       var h uint32
-       for i := 0; i < n; i++ {
-               h = h*primeRK + uint32(s[i])
-       }
-       if h == hashss && s[:n] == substr {
-               return 0
-       }
-       for i := n; i < len(s); {
-               h *= primeRK
-               h += uint32(s[i])
-               h -= pow * uint32(s[i-n])
-               i++
-               if h == hashss && s[i-n:i] == substr {
-                       return i - n
-               }
-       }
-       return -1
+       return indexRabinKarp(s, substr)
  }
  
  // Count counts the number of non-overlapping instances of substr in s.
diff --git a/src/strings/strings_test.go b/src/strings/strings_test.go

index 289dd92d51ec19928f424260ff5e9f5a5856ede3..d8fcb62a87709defa563f36ebdf5b7b0292a973f 100644 (file)
--- a/src/strings/strings_test.go
+++ b/src/strings/strings_test.go
@@ -125,6 +125,9 @@ var indexTests = []IndexTest{
         {"xx012345678901234567890123456789012345678901234567890123456789012"[:41], "0123456789012345678901234567890123456789", -1},
         {"xx012345678901234567890123456789012345678901234567890123456789012", "0123456789012345678901234567890123456xxx", -1},
         {"xx0123456789012345678901234567890123456789012345678901234567890120123456789012345678901234567890123456xxx", "0123456789012345678901234567890123456xxx", 65},
+       // test fallback to Rabin-Karp.
+       {"oxoxoxoxoxoxoxoxoxoxoxoy", "oy", 22},
+       {"oxoxoxoxoxoxoxoxoxoxoxox", "oy", -1},
  }
  
  var lastIndexTests = []IndexTest{
@@ -1641,3 +1644,15 @@ func BenchmarkTrimASCII(b *testing.B) {
                 }
         }
  }
+
+func BenchmarkIndexPeriodic(b *testing.B) { // benchmarks Index on strings where key[0] recurs every skip bytes but key itself never occurs
+       key := "aa"
+       for _, skip := range [...]int{2, 4, 8, 16, 32, 64} {
+               b.Run(fmt.Sprintf("IndexPeriodic%d", skip), func(b *testing.B) {
+                       s := Repeat("a"+Repeat(" ", skip-1), 1<<16/skip) // assuming Repeat is strings.Repeat: an 'a' then skip-1 spaces, repeated, so "aa" never occurs
+                       for i := 0; i < b.N; i++ {
+                               Index(s, key)
+                       }
+               })
+       }
+}
author	Keith Randall <khr@golang.org>
	Sat, 4 Nov 2017 17:19:53 +0000 (10:19 -0700)
committer	Keith Randall <khr@golang.org>
	Wed, 15 Nov 2017 17:35:09 +0000 (17:35 +0000)
src/bytes/bytes.go		patch \| blob \| history
src/bytes/bytes_amd64.go		patch \| blob \| history
src/bytes/bytes_generic.go		patch \| blob \| history
src/bytes/bytes_s390x.go		patch \| blob \| history
src/bytes/bytes_test.go		patch \| blob \| history
src/strings/strings.go		patch \| blob \| history
src/strings/strings_amd64.go		patch \| blob \| history
src/strings/strings_generic.go		patch \| blob \| history
src/strings/strings_s390x.go		patch \| blob \| history
src/strings/strings_test.go		patch \| blob \| history