From 2f2cc24cd8e930b26c220f75b96606abf2bebcbc Mon Sep 17 00:00:00 2001 From: Russ Cox Date: Wed, 7 Dec 2011 15:03:05 -0500 Subject: [PATCH] regexp: avoid allocation of input interface Matters most for small inputs, because there is no real work to amortize the allocation effort against. benchmark old ns/op new ns/op delta BenchmarkLiteral 613 473 -22.84% BenchmarkNotLiteral 4981 4931 -1.00% BenchmarkMatchClass 7289 7122 -2.29% BenchmarkMatchClass_InRange 6618 6663 +0.68% BenchmarkReplaceAll 7843 7233 -7.78% BenchmarkAnchoredLiteralShortNonMatch 329 228 -30.70% BenchmarkAnchoredLiteralLongNonMatch 322 228 -29.19% BenchmarkAnchoredShortMatch 838 715 -14.68% BenchmarkAnchoredLongMatch 824 715 -13.23% benchmark old MB/s new MB/s speedup BenchmarkMatchEasy0_32 119.73 196.61 1.64x BenchmarkMatchEasy0_1K 540.58 538.33 1.00x BenchmarkMatchEasy0_32K 732.57 714.00 0.97x BenchmarkMatchEasy0_1M 726.44 708.36 0.98x BenchmarkMatchEasy0_32M 707.77 691.45 0.98x BenchmarkMatchEasy1_32 102.12 136.11 1.33x BenchmarkMatchEasy1_1K 298.31 307.04 1.03x BenchmarkMatchEasy1_32K 273.56 274.43 1.00x BenchmarkMatchEasy1_1M 268.42 269.23 1.00x BenchmarkMatchEasy1_32M 266.15 267.34 1.00x BenchmarkMatchMedium_32 2.53 3.38 1.34x BenchmarkMatchMedium_1K 9.37 9.57 1.02x BenchmarkMatchMedium_32K 9.29 9.67 1.04x BenchmarkMatchMedium_1M 9.42 9.66 1.03x BenchmarkMatchMedium_32M 9.41 9.62 1.02x BenchmarkMatchHard_32 6.66 6.75 1.01x BenchmarkMatchHard_1K 6.81 6.85 1.01x BenchmarkMatchHard_32K 6.79 6.85 1.01x BenchmarkMatchHard_1M 6.82 6.83 1.00x BenchmarkMatchHard_32M 6.80 6.80 1.00x R=golang-dev, r CC=golang-dev https://golang.org/cl/5453076 --- src/pkg/regexp/exec.go | 40 +++++++++++++++++++++-- src/pkg/regexp/exec_test.go | 65 ++++++++++++------------------------- src/pkg/regexp/regexp.go | 50 +++++++++------------------- 3 files changed, 74 insertions(+), 81 deletions(-) diff --git a/src/pkg/regexp/exec.go b/src/pkg/regexp/exec.go index d7057a191b..e16a1b5b9e 100644 --- a/src/pkg/regexp/exec.go +++ b/src/pkg/regexp/exec.go @@ -1,6 +1,9 @@ package regexp -import "regexp/syntax" +import ( + "io" + "regexp/syntax" +) // A queue is a 'sparse array' holding pending threads of execution. // See http://research.swtch.com/2008/03/using-uninitialized-memory-for-fun-and.html @@ -34,6 +37,28 @@ type machine struct { pool []*thread // pool of available threads matched bool // whether a match was found matchcap []int // capture information for the match + + // cached inputs, to avoid allocation + inputBytes inputBytes + inputString inputString + inputReader inputReader +} + +func (m *machine) newInputBytes(b []byte) input { + m.inputBytes.str = b + return &m.inputBytes +} + +func (m *machine) newInputString(s string) input { + m.inputString.str = s + return &m.inputString +} + +func (m *machine) newInputReader(r io.RuneReader) input { + m.inputReader.r = r + m.inputReader.atEOT = false + m.inputReader.pos = 0 + return &m.inputReader } // progMachine returns a new machine running the prog p. @@ -74,6 +99,9 @@ func (m *machine) alloc(i *syntax.Inst) *thread { // free returns t to the free pool. func (m *machine) free(t *thread) { + m.inputBytes.str = nil + m.inputString.str = "" + m.inputReader.r = nil m.pool = append(m.pool, t) } @@ -287,8 +315,16 @@ var empty = make([]int, 0) // doExecute finds the leftmost match in the input and returns // the position of its subexpressions. -func (re *Regexp) doExecute(i input, pos int, ncap int) []int { +func (re *Regexp) doExecute(r io.RuneReader, b []byte, s string, pos int, ncap int) []int { m := re.get() + var i input + if r != nil { + i = m.newInputReader(r) + } else if b != nil { + i = m.newInputBytes(b) + } else { + i = m.newInputString(s) + } m.init(ncap) if !m.match(i, pos) { re.put(m) diff --git a/src/pkg/regexp/exec_test.go b/src/pkg/regexp/exec_test.go index d981f5495e..312bf0275f 100644 --- a/src/pkg/regexp/exec_test.go +++ b/src/pkg/regexp/exec_test.go @@ -10,7 +10,6 @@ import ( "fmt" "io" "math/rand" - old "old/regexp" "os" "path/filepath" "regexp/syntax" @@ -679,18 +678,6 @@ func benchmark(b *testing.B, re string, n int) { } } -func benchold(b *testing.B, re string, n int) { - r := old.MustCompile(re) - t := makeText(n) - b.ResetTimer() - b.SetBytes(int64(n)) - for i := 0; i < b.N; i++ { - if r.Match(t) { - panic("match!") - } - } -} - const ( easy0 = "ABCDEFGHIJKLMNOPQRSTUVWXYZ$" easy1 = "A[AB]B[BC]C[CD]D[DE]E[EF]F[FG]G[GH]H[HI]I[IJ]J$" @@ -700,35 +687,23 @@ const ( "(N)(O)(P)(Q)(R)(S)(T)(U)(V)(W)(X)(Y)(Z)$" ) -func BenchmarkMatchEasy0_1K(b *testing.B) { benchmark(b, easy0, 1<<10) } -func BenchmarkMatchEasy0_1K_Old(b *testing.B) { benchold(b, easy0, 1<<10) } -func BenchmarkMatchEasy0_1M(b *testing.B) { benchmark(b, easy0, 1<<20) } -func BenchmarkMatchEasy0_1M_Old(b *testing.B) { benchold(b, easy0, 1<<20) } -func BenchmarkMatchEasy0_32K(b *testing.B) { benchmark(b, easy0, 32<<10) } -func BenchmarkMatchEasy0_32K_Old(b *testing.B) { benchold(b, easy0, 32<<10) } -func BenchmarkMatchEasy0_32M(b *testing.B) { benchmark(b, easy0, 32<<20) } -func BenchmarkMatchEasy0_32M_Old(b *testing.B) { benchold(b, easy0, 32<<20) } -func BenchmarkMatchEasy1_1K(b *testing.B) { benchmark(b, easy1, 1<<10) } -func BenchmarkMatchEasy1_1K_Old(b *testing.B) { benchold(b, easy1, 1<<10) } -func BenchmarkMatchEasy1_1M(b *testing.B) { benchmark(b, easy1, 1<<20) } -func BenchmarkMatchEasy1_1M_Old(b *testing.B) { benchold(b, easy1, 1<<20) } -func BenchmarkMatchEasy1_32K(b *testing.B) { benchmark(b, easy1, 32<<10) } -func BenchmarkMatchEasy1_32K_Old(b *testing.B) { benchold(b, easy1, 32<<10) } -func BenchmarkMatchEasy1_32M(b *testing.B) { benchmark(b, easy1, 32<<20) } -func BenchmarkMatchEasy1_32M_Old(b *testing.B) { benchold(b, easy1, 32<<20) } -func BenchmarkMatchMedium_1K(b *testing.B) { benchmark(b, medium, 1<<10) } -func BenchmarkMatchMedium_1K_Old(b *testing.B) { benchold(b, medium, 1<<10) } -func BenchmarkMatchMedium_1M(b *testing.B) { benchmark(b, medium, 1<<20) } -func BenchmarkMatchMedium_1M_Old(b *testing.B) { benchold(b, medium, 1<<20) } -func BenchmarkMatchMedium_32K(b *testing.B) { benchmark(b, medium, 32<<10) } -func BenchmarkMatchMedium_32K_Old(b *testing.B) { benchold(b, medium, 32<<10) } -func BenchmarkMatchMedium_32M(b *testing.B) { benchmark(b, medium, 32<<20) } -func BenchmarkMatchMedium_32M_Old(b *testing.B) { benchold(b, medium, 32<<20) } -func BenchmarkMatchHard_1K(b *testing.B) { benchmark(b, hard, 1<<10) } -func BenchmarkMatchHard_1K_Old(b *testing.B) { benchold(b, hard, 1<<10) } -func BenchmarkMatchHard_1M(b *testing.B) { benchmark(b, hard, 1<<20) } -func BenchmarkMatchHard_1M_Old(b *testing.B) { benchold(b, hard, 1<<20) } -func BenchmarkMatchHard_32K(b *testing.B) { benchmark(b, hard, 32<<10) } -func BenchmarkMatchHard_32K_Old(b *testing.B) { benchold(b, hard, 32<<10) } -func BenchmarkMatchHard_32M(b *testing.B) { benchmark(b, hard, 32<<20) } -func BenchmarkMatchHard_32M_Old(b *testing.B) { benchold(b, hard, 32<<20) } +func BenchmarkMatchEasy0_32(b *testing.B) { benchmark(b, easy0, 32<<0) } +func BenchmarkMatchEasy0_1K(b *testing.B) { benchmark(b, easy0, 1<<10) } +func BenchmarkMatchEasy0_32K(b *testing.B) { benchmark(b, easy0, 32<<10) } +func BenchmarkMatchEasy0_1M(b *testing.B) { benchmark(b, easy0, 1<<20) } +func BenchmarkMatchEasy0_32M(b *testing.B) { benchmark(b, easy0, 32<<20) } +func BenchmarkMatchEasy1_32(b *testing.B) { benchmark(b, easy1, 32<<0) } +func BenchmarkMatchEasy1_1K(b *testing.B) { benchmark(b, easy1, 1<<10) } +func BenchmarkMatchEasy1_32K(b *testing.B) { benchmark(b, easy1, 32<<10) } +func BenchmarkMatchEasy1_1M(b *testing.B) { benchmark(b, easy1, 1<<20) } +func BenchmarkMatchEasy1_32M(b *testing.B) { benchmark(b, easy1, 32<<20) } +func BenchmarkMatchMedium_32(b *testing.B) { benchmark(b, medium, 1<<0) } +func BenchmarkMatchMedium_1K(b *testing.B) { benchmark(b, medium, 1<<10) } +func BenchmarkMatchMedium_32K(b *testing.B) { benchmark(b, medium, 32<<10) } +func BenchmarkMatchMedium_1M(b *testing.B) { benchmark(b, medium, 1<<20) } +func BenchmarkMatchMedium_32M(b *testing.B) { benchmark(b, medium, 32<<20) } +func BenchmarkMatchHard_32(b *testing.B) { benchmark(b, hard, 32<<0) } +func BenchmarkMatchHard_1K(b *testing.B) { benchmark(b, hard, 1<<10) } +func BenchmarkMatchHard_32K(b *testing.B) { benchmark(b, hard, 32<<10) } +func BenchmarkMatchHard_1M(b *testing.B) { benchmark(b, hard, 1<<20) } +func BenchmarkMatchHard_32M(b *testing.B) { benchmark(b, hard, 32<<20) } diff --git a/src/pkg/regexp/regexp.go b/src/pkg/regexp/regexp.go index 59f3be39d2..b0c6a0b1a1 100644 --- a/src/pkg/regexp/regexp.go +++ b/src/pkg/regexp/regexp.go @@ -240,10 +240,6 @@ type inputString struct { str string } -func newInputString(str string) *inputString { - return &inputString{str: str} -} - func (i *inputString) step(pos int) (rune, int) { if pos < len(i.str) { c := i.str[pos] @@ -283,10 +279,6 @@ type inputBytes struct { str []byte } -func newInputBytes(str []byte) *inputBytes { - return &inputBytes{str: str} -} - func (i *inputBytes) step(pos int) (rune, int) { if pos < len(i.str) { c := i.str[pos] @@ -328,10 +320,6 @@ type inputReader struct { pos int } -func newInputReader(r io.RuneReader) *inputReader { - return &inputReader{r: r} -} - func (i *inputReader) step(pos int) (rune, int) { if !i.atEOT && pos != i.pos { return endOfText, 0 @@ -373,19 +361,19 @@ func (re *Regexp) LiteralPrefix() (prefix string, complete bool) { // RuneReader. The return value is a boolean: true for match, false for no // match. func (re *Regexp) MatchReader(r io.RuneReader) bool { - return re.doExecute(newInputReader(r), 0, 0) != nil + return re.doExecute(r, nil, "", 0, 0) != nil } // MatchString returns whether the Regexp matches the string s. // The return value is a boolean: true for match, false for no match. func (re *Regexp) MatchString(s string) bool { - return re.doExecute(newInputString(s), 0, 0) != nil + return re.doExecute(nil, nil, s, 0, 0) != nil } // Match returns whether the Regexp matches the byte slice b. // The return value is a boolean: true for match, false for no match. func (re *Regexp) Match(b []byte) bool { - return re.doExecute(newInputBytes(b), 0, 0) != nil + return re.doExecute(nil, b, "", 0, 0) != nil } // MatchReader checks whether a textual regular expression matches the text @@ -437,7 +425,7 @@ func (re *Regexp) ReplaceAllStringFunc(src string, repl func(string) string) str searchPos := 0 // position where we next look for a match buf := new(bytes.Buffer) for searchPos <= len(src) { - a := re.doExecute(newInputString(src), searchPos, 2) + a := re.doExecute(nil, nil, src, searchPos, 2) if len(a) == 0 { break // no more matches } @@ -489,7 +477,7 @@ func (re *Regexp) ReplaceAllFunc(src []byte, repl func([]byte) []byte) []byte { searchPos := 0 // position where we next look for a match buf := new(bytes.Buffer) for searchPos <= len(src) { - a := re.doExecute(newInputBytes(src), searchPos, 2) + a := re.doExecute(nil, src, "", searchPos, 2) if len(a) == 0 { break // no more matches } @@ -577,13 +565,7 @@ func (re *Regexp) allMatches(s string, b []byte, n int, deliver func([]int)) { } for pos, i, prevMatchEnd := 0, 0, -1; i < n && pos <= end; { - var in input - if b == nil { - in = newInputString(s) - } else { - in = newInputBytes(b) - } - matches := re.doExecute(in, pos, re.prog.NumCap) + matches := re.doExecute(nil, b, s, pos, re.prog.NumCap) if len(matches) == 0 { break } @@ -623,7 +605,7 @@ func (re *Regexp) allMatches(s string, b []byte, n int, deliver func([]int)) { // Find returns a slice holding the text of the leftmost match in b of the regular expression. // A return value of nil indicates no match. func (re *Regexp) Find(b []byte) []byte { - a := re.doExecute(newInputBytes(b), 0, 2) + a := re.doExecute(nil, b, "", 0, 2) if a == nil { return nil } @@ -635,7 +617,7 @@ func (re *Regexp) Find(b []byte) []byte { // b[loc[0]:loc[1]]. // A return value of nil indicates no match. func (re *Regexp) FindIndex(b []byte) (loc []int) { - a := re.doExecute(newInputBytes(b), 0, 2) + a := re.doExecute(nil, b, "", 0, 2) if a == nil { return nil } @@ -648,7 +630,7 @@ func (re *Regexp) FindIndex(b []byte) (loc []int) { // an empty string. Use FindStringIndex or FindStringSubmatch if it is // necessary to distinguish these cases. func (re *Regexp) FindString(s string) string { - a := re.doExecute(newInputString(s), 0, 2) + a := re.doExecute(nil, nil, s, 0, 2) if a == nil { return "" } @@ -660,7 +642,7 @@ func (re *Regexp) FindString(s string) string { // itself is at s[loc[0]:loc[1]]. // A return value of nil indicates no match. func (re *Regexp) FindStringIndex(s string) []int { - a := re.doExecute(newInputString(s), 0, 2) + a := re.doExecute(nil, nil, s, 0, 2) if a == nil { return nil } @@ -672,7 +654,7 @@ func (re *Regexp) FindStringIndex(s string) []int { // the RuneReader. The match itself is at s[loc[0]:loc[1]]. A return // value of nil indicates no match. func (re *Regexp) FindReaderIndex(r io.RuneReader) []int { - a := re.doExecute(newInputReader(r), 0, 2) + a := re.doExecute(r, nil, "", 0, 2) if a == nil { return nil } @@ -685,7 +667,7 @@ func (re *Regexp) FindReaderIndex(r io.RuneReader) []int { // comment. // A return value of nil indicates no match. func (re *Regexp) FindSubmatch(b []byte) [][]byte { - a := re.doExecute(newInputBytes(b), 0, re.prog.NumCap) + a := re.doExecute(nil, b, "", 0, re.prog.NumCap) if a == nil { return nil } @@ -704,7 +686,7 @@ func (re *Regexp) FindSubmatch(b []byte) [][]byte { // in the package comment. // A return value of nil indicates no match. func (re *Regexp) FindSubmatchIndex(b []byte) []int { - return re.pad(re.doExecute(newInputBytes(b), 0, re.prog.NumCap)) + return re.pad(re.doExecute(nil, b, "", 0, re.prog.NumCap)) } // FindStringSubmatch returns a slice of strings holding the text of the @@ -713,7 +695,7 @@ func (re *Regexp) FindSubmatchIndex(b []byte) []int { // package comment. // A return value of nil indicates no match. func (re *Regexp) FindStringSubmatch(s string) []string { - a := re.doExecute(newInputString(s), 0, re.prog.NumCap) + a := re.doExecute(nil, nil, s, 0, re.prog.NumCap) if a == nil { return nil } @@ -732,7 +714,7 @@ func (re *Regexp) FindStringSubmatch(s string) []string { // 'Index' descriptions in the package comment. // A return value of nil indicates no match. func (re *Regexp) FindStringSubmatchIndex(s string) []int { - return re.pad(re.doExecute(newInputString(s), 0, re.prog.NumCap)) + return re.pad(re.doExecute(nil, nil, s, 0, re.prog.NumCap)) } // FindReaderSubmatchIndex returns a slice holding the index pairs @@ -741,7 +723,7 @@ func (re *Regexp) FindStringSubmatchIndex(s string) []int { // by the 'Submatch' and 'Index' descriptions in the package comment. A // return value of nil indicates no match. func (re *Regexp) FindReaderSubmatchIndex(r io.RuneReader) []int { - return re.pad(re.doExecute(newInputReader(r), 0, re.prog.NumCap)) + return re.pad(re.doExecute(r, nil, "", 0, re.prog.NumCap)) } const startSize = 10 // The size at which to start a slice in the 'All' routines. -- 2.50.0