From 23093f86eebbefb0cf11298c45513da360d2b48d Mon Sep 17 00:00:00 2001
From: =?utf8?q?R=C3=A9my=20Oudompheng?=
Date: Sun, 17 Feb 2013 13:07:17 +0100
Subject: [PATCH] strings: better mean complexity for Count and Index.
The O(n+m) complexity is obtained probabilistically
by using Rabin-Karp algorithm, which provides the needed complexity
unless exceptional collisions occur, without memory allocation.
benchmark old ns/op new ns/op delta
BenchmarkIndexHard1 6532331 4045886 -38.06%
BenchmarkIndexHard2 8178173 4038975 -50.61%
BenchmarkIndexHard3 6973687 4042591 -42.03%
BenchmarkCountHard1 6270864 4071090 -35.08%
BenchmarkCountHard2 7838039 4072853 -48.04%
BenchmarkCountHard3 6697828 4071964 -39.20%
BenchmarkIndexTorture 2730546 28934 -98.94%
BenchmarkCountTorture 2729622 29064 -98.94%
Fixes #4600.
R=rsc, donovanhide, remyoudompheng
CC=golang-dev
https://golang.org/cl/7314095
---
src/pkg/strings/strings.go | 66 +++++++++++++++++++++++++++++----
src/pkg/strings/strings_test.go | 51 +++++++++++++++++++++++++
2 files changed, 109 insertions(+), 8 deletions(-)
diff --git a/src/pkg/strings/strings.go b/src/pkg/strings/strings.go
index d4b3f03473..72b8d223af 100644
--- a/src/pkg/strings/strings.go
+++ b/src/pkg/strings/strings.go
@@ -36,15 +36,35 @@ func explode(s string, n int) []string {
return a
}
+// primeRK is the prime base used in Rabin-Karp algorithm.
+const primeRK = 16777619
+
+// hashstr returns the hash and the appropriate multiplicative
+// factor for use in Rabin-Karp algorithm.
+func hashstr(sep string) (uint32, uint32) {
+ hash := uint32(0)
+ for i := 0; i < len(sep); i++ {
+ hash = hash*primeRK + uint32(sep[i])
+
+ }
+ var pow, sq uint32 = 1, primeRK
+ for i := len(sep); i > 0; i >>= 1 {
+ if i&1 != 0 {
+ pow *= sq
+ }
+ sq *= sq
+ }
+ return hash, pow
+}
+
// Count counts the number of non-overlapping instances of sep in s.
func Count(s, sep string) int {
if sep == "" {
return utf8.RuneCountInString(s) + 1
}
c := sep[0]
- l := len(sep)
n := 0
- if l == 1 {
+ if len(sep) == 1 {
// special case worth making fast
for i := 0; i < len(s); i++ {
if s[i] == c {
@@ -53,11 +73,26 @@ func Count(s, sep string) int {
}
return n
}
- for i := 0; i+l <= len(s); i++ {
- if s[i] == c && s[i:i+l] == sep {
+ if len(sep) > len(s) {
+ return 0
+ }
+ hashsep, pow := hashstr(sep)
+ h := uint32(0)
+ for i := 0; i < len(sep); i++ {
+ h = h*primeRK + uint32(s[i])
+ }
+ lastmatch := 0
+ for i := len(sep); ; i++ {
+ // Invariant: h = hash(s[i-l : i])
+ if h == hashsep && lastmatch <= i-len(sep) && s[i-len(sep):i] == sep {
n++
- i += l - 1
+ lastmatch = i
+ }
+ if i >= len(s) {
+ break
}
+ h = h*primeRK + uint32(s[i])
+ h -= pow * uint32(s[i-len(sep)])
}
return n
}
@@ -94,10 +129,25 @@ func Index(s, sep string) int {
return -1
}
// n > 1
- for i := 0; i+n <= len(s); i++ {
- if s[i] == c && s[i:i+n] == sep {
- return i
+ if n > len(s) {
+ return -1
+ }
+ // Hash sep.
+ hashsep, pow := hashstr(sep)
+ var h uint32
+ for i := 0; i < n; i++ {
+ h = h*primeRK + uint32(s[i])
+ }
+ for i := n; ; i++ {
+ // Invariant: h = hash(s[i-n : i])
+ if h == hashsep && s[i-n:i] == sep {
+ return i - n
+ }
+ if i >= len(s) {
+ break
}
+ h = h*primeRK + uint32(s[i])
+ h -= pow * uint32(s[i-n])
}
return -1
}
diff --git a/src/pkg/strings/strings_test.go b/src/pkg/strings/strings_test.go
index e222af14a7..b5bdf35d15 100644
--- a/src/pkg/strings/strings_test.go
+++ b/src/pkg/strings/strings_test.go
@@ -1001,6 +1001,57 @@ func TestEqualFold(t *testing.T) {
}
}
+func makeBenchInputHard() string {
+ tokens := [...]string{
+ "", "", "", "",
+ "
", "
", "", "",
+ "hello", "world",
+ }
+ x := make([]byte, 0, 1<<20)
+ for len(x) < 1<<20 {
+ i := rand.Intn(len(tokens))
+ x = append(x, tokens[i]...)
+ }
+ return string(x)
+}
+
+var benchInputHard = makeBenchInputHard()
+
+func benchmarkIndexHard(b *testing.B, sep string) {
+ for i := 0; i < b.N; i++ {
+ Index(benchInputHard, sep)
+ }
+}
+
+func benchmarkCountHard(b *testing.B, sep string) {
+ for i := 0; i < b.N; i++ {
+ Count(benchInputHard, sep)
+ }
+}
+
+func BenchmarkIndexHard1(b *testing.B) { benchmarkIndexHard(b, "<>") }
+func BenchmarkIndexHard2(b *testing.B) { benchmarkIndexHard(b, "") }
+func BenchmarkIndexHard3(b *testing.B) { benchmarkIndexHard(b, "hello world") }
+
+func BenchmarkCountHard1(b *testing.B) { benchmarkCountHard(b, "<>") }
+func BenchmarkCountHard2(b *testing.B) { benchmarkCountHard(b, "") }
+func BenchmarkCountHard3(b *testing.B) { benchmarkCountHard(b, "hello world") }
+
+var benchInputTorture = Repeat("ABC", 1<<10) + "123" + Repeat("ABC", 1<<10)
+var benchNeedleTorture = Repeat("ABC", 1<<10+1)
+
+func BenchmarkIndexTorture(b *testing.B) {
+ for i := 0; i < b.N; i++ {
+ Index(benchInputTorture, benchNeedleTorture)
+ }
+}
+
+func BenchmarkCountTorture(b *testing.B) {
+ for i := 0; i < b.N; i++ {
+ Count(benchInputTorture, benchNeedleTorture)
+ }
+}
+
var makeFieldsInput = func() string {
x := make([]byte, 1<<20)
// Input is ~10% space, ~10% 2-byte UTF-8, rest ASCII non-space.
--
2.48.1