strings: better mean complexity for Count and Index.

author Rémy Oudompheng <oudomphe@phare.normalesup.org>

Sun, 17 Feb 2013 12:07:17 +0000 (13:07 +0100)

committer Rémy Oudompheng <oudomphe@phare.normalesup.org>

Sun, 17 Feb 2013 12:07:17 +0000 (13:07 +0100)
author Rémy Oudompheng <oudomphe@phare.normalesup.org>
Sun, 17 Feb 2013 12:07:17 +0000 (13:07 +0100)
committer Rémy Oudompheng <oudomphe@phare.normalesup.org>
Sun, 17 Feb 2013 12:07:17 +0000 (13:07 +0100)
diff --git a/src/pkg/strings/strings.go b/src/pkg/strings/strings.go

index d4b3f034736a94726884f2fa713bd21c61ebf01d..72b8d223af1ad493a078f95dc378f52a1c99ebe4 100644 (file)
--- a/src/pkg/strings/strings.go
+++ b/src/pkg/strings/strings.go
@@ -36,15 +36,35 @@ func explode(s string, n int) []string {
         return a
  }
  
+// primeRK is the prime base used in Rabin-Karp algorithm.
+const primeRK = 16777619
+
+// hashstr returns the hash and the appropriate multiplicative
+// factor for use in Rabin-Karp algorithm.
+func hashstr(sep string) (uint32, uint32) {
+       hash := uint32(0)
+       for i := 0; i < len(sep); i++ {
+               hash = hash*primeRK + uint32(sep[i])
+
+       }
+       var pow, sq uint32 = 1, primeRK
+       for i := len(sep); i > 0; i >>= 1 {
+               if i&1 != 0 {
+                       pow *= sq
+               }
+               sq *= sq
+       }
+       return hash, pow
+}
+
  // Count counts the number of non-overlapping instances of sep in s.
  func Count(s, sep string) int {
         if sep == "" {
                 return utf8.RuneCountInString(s) + 1
         }
         c := sep[0]
-       l := len(sep)
         n := 0
-       if l == 1 {
+       if len(sep) == 1 {
                 // special case worth making fast
                 for i := 0; i < len(s); i++ {
                         if s[i] == c {
@@ -53,11 +73,26 @@ func Count(s, sep string) int {
                 }
                 return n
         }
-       for i := 0; i+l <= len(s); i++ {
-               if s[i] == c && s[i:i+l] == sep {
+       if len(sep) > len(s) {
+               return 0
+       }
+       hashsep, pow := hashstr(sep)
+       h := uint32(0)
+       for i := 0; i < len(sep); i++ {
+               h = h*primeRK + uint32(s[i])
+       }
+       lastmatch := 0
+       for i := len(sep); ; i++ {
+               // Invariant: h = hash(s[i-l : i])
+               if h == hashsep && lastmatch <= i-len(sep) && s[i-len(sep):i] == sep {
                         n++
-                       i += l - 1
+                       lastmatch = i
+               }
+               if i >= len(s) {
+                       break
                 }
+               h = h*primeRK + uint32(s[i])
+               h -= pow * uint32(s[i-len(sep)])
         }
         return n
  }
@@ -94,10 +129,25 @@ func Index(s, sep string) int {
                 return -1
         }
         // n > 1
-       for i := 0; i+n <= len(s); i++ {
-               if s[i] == c && s[i:i+n] == sep {
-                       return i
+       if n > len(s) {
+               return -1
+       }
+       // Hash sep.
+       hashsep, pow := hashstr(sep)
+       var h uint32
+       for i := 0; i < n; i++ {
+               h = h*primeRK + uint32(s[i])
+       }
+       for i := n; ; i++ {
+               // Invariant: h = hash(s[i-n : i])
+               if h == hashsep && s[i-n:i] == sep {
+                       return i - n
+               }
+               if i >= len(s) {
+                       break
                 }
+               h = h*primeRK + uint32(s[i])
+               h -= pow * uint32(s[i-n])
         }
         return -1
  }
diff --git a/src/pkg/strings/strings_test.go b/src/pkg/strings/strings_test.go

index e222af14a7c30ffacfdde3f517c3e2a87e681d2b..b5bdf35d15b5ea5be2a708192df06f25f22787bc 100644 (file)
--- a/src/pkg/strings/strings_test.go
+++ b/src/pkg/strings/strings_test.go
@@ -1001,6 +1001,57 @@ func TestEqualFold(t *testing.T) {
         }
  }
  
+func makeBenchInputHard() string {
+       tokens := [...]string{
+               "<a>", "<p>", "<b>", "<strong>",
+               "</a>", "</p>", "</b>", "</strong>",
+               "hello", "world",
+       }
+       x := make([]byte, 0, 1<<20)
+       for len(x) < 1<<20 {
+               i := rand.Intn(len(tokens))
+               x = append(x, tokens[i]...)
+       }
+       return string(x)
+}
+
+var benchInputHard = makeBenchInputHard()
+
+func benchmarkIndexHard(b *testing.B, sep string) {
+       for i := 0; i < b.N; i++ {
+               Index(benchInputHard, sep)
+       }
+}
+
+func benchmarkCountHard(b *testing.B, sep string) {
+       for i := 0; i < b.N; i++ {
+               Count(benchInputHard, sep)
+       }
+}
+
+func BenchmarkIndexHard1(b *testing.B) { benchmarkIndexHard(b, "<>") }
+func BenchmarkIndexHard2(b *testing.B) { benchmarkIndexHard(b, "</pre>") }
+func BenchmarkIndexHard3(b *testing.B) { benchmarkIndexHard(b, "<b>hello world</b>") }
+
+func BenchmarkCountHard1(b *testing.B) { benchmarkCountHard(b, "<>") }
+func BenchmarkCountHard2(b *testing.B) { benchmarkCountHard(b, "</pre>") }
+func BenchmarkCountHard3(b *testing.B) { benchmarkCountHard(b, "<b>hello world</b>") }
+
+var benchInputTorture = Repeat("ABC", 1<<10) + "123" + Repeat("ABC", 1<<10)
+var benchNeedleTorture = Repeat("ABC", 1<<10+1)
+
+func BenchmarkIndexTorture(b *testing.B) {
+       for i := 0; i < b.N; i++ {
+               Index(benchInputTorture, benchNeedleTorture)
+       }
+}
+
+func BenchmarkCountTorture(b *testing.B) {
+       for i := 0; i < b.N; i++ {
+               Count(benchInputTorture, benchNeedleTorture)
+       }
+}
+
  var makeFieldsInput = func() string {
         x := make([]byte, 1<<20)
         // Input is ~10% space, ~10% 2-byte UTF-8, rest ASCII non-space.
author	Rémy Oudompheng <oudomphe@phare.normalesup.org>
	Sun, 17 Feb 2013 12:07:17 +0000 (13:07 +0100)
committer	Rémy Oudompheng <oudomphe@phare.normalesup.org>
	Sun, 17 Feb 2013 12:07:17 +0000 (13:07 +0100)
src/pkg/strings/strings.go		patch \| blob \| history
src/pkg/strings/strings_test.go		patch \| blob \| history