From 23093f86eebbefb0cf11298c45513da360d2b48d Mon Sep 17 00:00:00 2001 From: =?utf8?q?R=C3=A9my=20Oudompheng?= Date: Sun, 17 Feb 2013 13:07:17 +0100 Subject: [PATCH] strings: better mean complexity for Count and Index. The O(n+m) complexity is obtained probabilistically by using Rabin-Karp algorithm, which provides the needed complexity unless exceptional collisions occur, without memory allocation. benchmark old ns/op new ns/op delta BenchmarkIndexHard1 6532331 4045886 -38.06% BenchmarkIndexHard2 8178173 4038975 -50.61% BenchmarkIndexHard3 6973687 4042591 -42.03% BenchmarkCountHard1 6270864 4071090 -35.08% BenchmarkCountHard2 7838039 4072853 -48.04% BenchmarkCountHard3 6697828 4071964 -39.20% BenchmarkIndexTorture 2730546 28934 -98.94% BenchmarkCountTorture 2729622 29064 -98.94% Fixes #4600. R=rsc, donovanhide, remyoudompheng CC=golang-dev https://golang.org/cl/7314095 --- src/pkg/strings/strings.go | 66 +++++++++++++++++++++++++++++---- src/pkg/strings/strings_test.go | 51 +++++++++++++++++++++++++ 2 files changed, 109 insertions(+), 8 deletions(-) diff --git a/src/pkg/strings/strings.go b/src/pkg/strings/strings.go index d4b3f03473..72b8d223af 100644 --- a/src/pkg/strings/strings.go +++ b/src/pkg/strings/strings.go @@ -36,15 +36,35 @@ func explode(s string, n int) []string { return a } +// primeRK is the prime base used in Rabin-Karp algorithm. +const primeRK = 16777619 + +// hashstr returns the hash and the appropriate multiplicative +// factor for use in Rabin-Karp algorithm. +func hashstr(sep string) (uint32, uint32) { + hash := uint32(0) + for i := 0; i < len(sep); i++ { + hash = hash*primeRK + uint32(sep[i]) + + } + var pow, sq uint32 = 1, primeRK + for i := len(sep); i > 0; i >>= 1 { + if i&1 != 0 { + pow *= sq + } + sq *= sq + } + return hash, pow +} + // Count counts the number of non-overlapping instances of sep in s. func Count(s, sep string) int { if sep == "" { return utf8.RuneCountInString(s) + 1 } c := sep[0] - l := len(sep) n := 0 - if l == 1 { + if len(sep) == 1 { // special case worth making fast for i := 0; i < len(s); i++ { if s[i] == c { @@ -53,11 +73,26 @@ func Count(s, sep string) int { } return n } - for i := 0; i+l <= len(s); i++ { - if s[i] == c && s[i:i+l] == sep { + if len(sep) > len(s) { + return 0 + } + hashsep, pow := hashstr(sep) + h := uint32(0) + for i := 0; i < len(sep); i++ { + h = h*primeRK + uint32(s[i]) + } + lastmatch := 0 + for i := len(sep); ; i++ { + // Invariant: h = hash(s[i-l : i]) + if h == hashsep && lastmatch <= i-len(sep) && s[i-len(sep):i] == sep { n++ - i += l - 1 + lastmatch = i + } + if i >= len(s) { + break } + h = h*primeRK + uint32(s[i]) + h -= pow * uint32(s[i-len(sep)]) } return n } @@ -94,10 +129,25 @@ func Index(s, sep string) int { return -1 } // n > 1 - for i := 0; i+n <= len(s); i++ { - if s[i] == c && s[i:i+n] == sep { - return i + if n > len(s) { + return -1 + } + // Hash sep. + hashsep, pow := hashstr(sep) + var h uint32 + for i := 0; i < n; i++ { + h = h*primeRK + uint32(s[i]) + } + for i := n; ; i++ { + // Invariant: h = hash(s[i-n : i]) + if h == hashsep && s[i-n:i] == sep { + return i - n + } + if i >= len(s) { + break } + h = h*primeRK + uint32(s[i]) + h -= pow * uint32(s[i-n]) } return -1 } diff --git a/src/pkg/strings/strings_test.go b/src/pkg/strings/strings_test.go index e222af14a7..b5bdf35d15 100644 --- a/src/pkg/strings/strings_test.go +++ b/src/pkg/strings/strings_test.go @@ -1001,6 +1001,57 @@ func TestEqualFold(t *testing.T) { } } +func makeBenchInputHard() string { + tokens := [...]string{ + "", "

", "", "", + "", "

", "", "", + "hello", "world", + } + x := make([]byte, 0, 1<<20) + for len(x) < 1<<20 { + i := rand.Intn(len(tokens)) + x = append(x, tokens[i]...) + } + return string(x) +} + +var benchInputHard = makeBenchInputHard() + +func benchmarkIndexHard(b *testing.B, sep string) { + for i := 0; i < b.N; i++ { + Index(benchInputHard, sep) + } +} + +func benchmarkCountHard(b *testing.B, sep string) { + for i := 0; i < b.N; i++ { + Count(benchInputHard, sep) + } +} + +func BenchmarkIndexHard1(b *testing.B) { benchmarkIndexHard(b, "<>") } +func BenchmarkIndexHard2(b *testing.B) { benchmarkIndexHard(b, "") } +func BenchmarkIndexHard3(b *testing.B) { benchmarkIndexHard(b, "hello world") } + +func BenchmarkCountHard1(b *testing.B) { benchmarkCountHard(b, "<>") } +func BenchmarkCountHard2(b *testing.B) { benchmarkCountHard(b, "") } +func BenchmarkCountHard3(b *testing.B) { benchmarkCountHard(b, "hello world") } + +var benchInputTorture = Repeat("ABC", 1<<10) + "123" + Repeat("ABC", 1<<10) +var benchNeedleTorture = Repeat("ABC", 1<<10+1) + +func BenchmarkIndexTorture(b *testing.B) { + for i := 0; i < b.N; i++ { + Index(benchInputTorture, benchNeedleTorture) + } +} + +func BenchmarkCountTorture(b *testing.B) { + for i := 0; i < b.N; i++ { + Count(benchInputTorture, benchNeedleTorture) + } +} + var makeFieldsInput = func() string { x := make([]byte, 1<<20) // Input is ~10% space, ~10% 2-byte UTF-8, rest ASCII non-space. -- 2.48.1