From a1e7cd97d509bae39d9e6a964b8920b1863a4e22 Mon Sep 17 00:00:00 2001 From: Russ Cox Date: Sun, 24 Jul 2011 17:00:28 -0400 Subject: [PATCH] exp/regexp: implement regexp API using exp/regexp/syntax Still need to write tests for new syntax and fix bugs that the tests find, but this is a good check point. All tests pass. Compared against existing regexp: benchmark old ns/op new ns/op delta regexp.BenchmarkLiteral 1869 620 -66.83% regexp.BenchmarkNotLiteral 9489 7823 -17.56% regexp.BenchmarkMatchClass 10372 8386 -19.15% regexp.BenchmarkMatchClass_InRange 10800 7750 -28.24% regexp.BenchmarkReplaceAll 13492 8519 -36.86% regexp.BenchmarkAnchoredLiteralShortNonMatch 747 339 -54.62% regexp.BenchmarkAnchoredLiteralLongNonMatch 599 335 -44.07% regexp.BenchmarkAnchoredShortMatch 2137 917 -57.09% regexp.BenchmarkAnchoredLongMatch 2029 917 -54.81% R=r, r CC=golang-dev, sam.thorogood https://golang.org/cl/4820046 --- src/pkg/exp/regexp/Makefile | 12 + src/pkg/exp/regexp/all_test.go | 429 +++++++++++++++ src/pkg/exp/regexp/exec.go | 295 ++++++++++ src/pkg/exp/regexp/find_test.go | 472 ++++++++++++++++ src/pkg/exp/regexp/regexp.go | 795 +++++++++++++++++++++++++++ src/pkg/exp/regexp/syntax/compile.go | 1 + src/pkg/exp/regexp/syntax/prog.go | 55 ++ 7 files changed, 2059 insertions(+) create mode 100644 src/pkg/exp/regexp/Makefile create mode 100644 src/pkg/exp/regexp/all_test.go create mode 100644 src/pkg/exp/regexp/exec.go create mode 100644 src/pkg/exp/regexp/find_test.go create mode 100644 src/pkg/exp/regexp/regexp.go diff --git a/src/pkg/exp/regexp/Makefile b/src/pkg/exp/regexp/Makefile new file mode 100644 index 0000000000..fc90df320f --- /dev/null +++ b/src/pkg/exp/regexp/Makefile @@ -0,0 +1,12 @@ +# Copyright 2011 The Go Authors. All rights reserved. +# Use of this source code is governed by a BSD-style +# license that can be found in the LICENSE file. + +include ../../../Make.inc + +TARG=exp/regexp +GOFILES=\ + exec.go\ + regexp.go\ + +include ../../../Make.pkg diff --git a/src/pkg/exp/regexp/all_test.go b/src/pkg/exp/regexp/all_test.go new file mode 100644 index 0000000000..77f32ca1a5 --- /dev/null +++ b/src/pkg/exp/regexp/all_test.go @@ -0,0 +1,429 @@ +// Copyright 2009 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package regexp + +import ( + "os" + "strings" + "testing" +) + +var good_re = []string{ + ``, + `.`, + `^.$`, + `a`, + `a*`, + `a+`, + `a?`, + `a|b`, + `a*|b*`, + `(a*|b)(c*|d)`, + `[a-z]`, + `[a-abc-c\-\]\[]`, + `[a-z]+`, + `[abc]`, + `[^1234]`, + `[^\n]`, + `\!\\`, +} + +/* +type stringError struct { + re string + err os.Error +} + +var bad_re = []stringError{ + {`*`, ErrBareClosure}, + {`+`, ErrBareClosure}, + {`?`, ErrBareClosure}, + {`(abc`, ErrUnmatchedLpar}, + {`abc)`, ErrUnmatchedRpar}, + {`x[a-z`, ErrUnmatchedLbkt}, + {`abc]`, ErrUnmatchedRbkt}, + {`[z-a]`, ErrBadRange}, + {`abc\`, ErrExtraneousBackslash}, + {`a**`, ErrBadClosure}, + {`a*+`, ErrBadClosure}, + {`a??`, ErrBadClosure}, + {`\x`, ErrBadBackslash}, +} +*/ + +func compileTest(t *testing.T, expr string, error os.Error) *Regexp { + re, err := Compile(expr) + if err != error { + t.Error("compiling `", expr, "`; unexpected error: ", err.String()) + } + return re +} + +func TestGoodCompile(t *testing.T) { + for i := 0; i < len(good_re); i++ { + compileTest(t, good_re[i], nil) + } +} + +/* +func TestBadCompile(t *testing.T) { + for i := 0; i < len(bad_re); i++ { + compileTest(t, bad_re[i].re, bad_re[i].err) + } +} +*/ + +func matchTest(t *testing.T, test *FindTest) { + re := compileTest(t, test.pat, nil) + if re == nil { + return + } + m := re.MatchString(test.text) + if m != (len(test.matches) > 0) { + t.Errorf("MatchString failure on %s: %t should be %t", test, m, len(test.matches) > 0) + } + // now try bytes + m = re.Match([]byte(test.text)) + if m != (len(test.matches) > 0) { + t.Errorf("Match failure on %s: %t should be %t", test, m, len(test.matches) > 0) + } +} + +func TestMatch(t *testing.T) { + for _, test := range findTests { + matchTest(t, &test) + } +} + +func matchFunctionTest(t *testing.T, test *FindTest) { + m, err := MatchString(test.pat, test.text) + if err == nil { + return + } + if m != (len(test.matches) > 0) { + t.Errorf("Match failure on %s: %t should be %t", test, m, len(test.matches) > 0) + } +} + +func TestMatchFunction(t *testing.T) { + for _, test := range findTests { + matchFunctionTest(t, &test) + } +} + +type ReplaceTest struct { + pattern, replacement, input, output string +} + +var replaceTests = []ReplaceTest{ + // Test empty input and/or replacement, with pattern that matches the empty string. + {"", "", "", ""}, + {"", "x", "", "x"}, + {"", "", "abc", "abc"}, + {"", "x", "abc", "xaxbxcx"}, + + // Test empty input and/or replacement, with pattern that does not match the empty string. + {"b", "", "", ""}, + {"b", "x", "", ""}, + {"b", "", "abc", "ac"}, + {"b", "x", "abc", "axc"}, + {"y", "", "", ""}, + {"y", "x", "", ""}, + {"y", "", "abc", "abc"}, + {"y", "x", "abc", "abc"}, + + // Multibyte characters -- verify that we don't try to match in the middle + // of a character. + {"[a-c]*", "x", "\u65e5", "x\u65e5x"}, + {"[^\u65e5]", "x", "abc\u65e5def", "xxx\u65e5xxx"}, + + // Start and end of a string. + {"^[a-c]*", "x", "abcdabc", "xdabc"}, + {"[a-c]*$", "x", "abcdabc", "abcdx"}, + {"^[a-c]*$", "x", "abcdabc", "abcdabc"}, + {"^[a-c]*", "x", "abc", "x"}, + {"[a-c]*$", "x", "abc", "x"}, + {"^[a-c]*$", "x", "abc", "x"}, + {"^[a-c]*", "x", "dabce", "xdabce"}, + {"[a-c]*$", "x", "dabce", "dabcex"}, + {"^[a-c]*$", "x", "dabce", "dabce"}, + {"^[a-c]*", "x", "", "x"}, + {"[a-c]*$", "x", "", "x"}, + {"^[a-c]*$", "x", "", "x"}, + + {"^[a-c]+", "x", "abcdabc", "xdabc"}, + {"[a-c]+$", "x", "abcdabc", "abcdx"}, + {"^[a-c]+$", "x", "abcdabc", "abcdabc"}, + {"^[a-c]+", "x", "abc", "x"}, + {"[a-c]+$", "x", "abc", "x"}, + {"^[a-c]+$", "x", "abc", "x"}, + {"^[a-c]+", "x", "dabce", "dabce"}, + {"[a-c]+$", "x", "dabce", "dabce"}, + {"^[a-c]+$", "x", "dabce", "dabce"}, + {"^[a-c]+", "x", "", ""}, + {"[a-c]+$", "x", "", ""}, + {"^[a-c]+$", "x", "", ""}, + + // Other cases. + {"abc", "def", "abcdefg", "defdefg"}, + {"bc", "BC", "abcbcdcdedef", "aBCBCdcdedef"}, + {"abc", "", "abcdabc", "d"}, + {"x", "xXx", "xxxXxxx", "xXxxXxxXxXxXxxXxxXx"}, + {"abc", "d", "", ""}, + {"abc", "d", "abc", "d"}, + {".+", "x", "abc", "x"}, + {"[a-c]*", "x", "def", "xdxexfx"}, + {"[a-c]+", "x", "abcbcdcdedef", "xdxdedef"}, + {"[a-c]*", "x", "abcbcdcdedef", "xdxdxexdxexfx"}, +} + +type ReplaceFuncTest struct { + pattern string + replacement func(string) string + input, output string +} + +var replaceFuncTests = []ReplaceFuncTest{ + {"[a-c]", func(s string) string { return "x" + s + "y" }, "defabcdef", "defxayxbyxcydef"}, + {"[a-c]+", func(s string) string { return "x" + s + "y" }, "defabcdef", "defxabcydef"}, + {"[a-c]*", func(s string) string { return "x" + s + "y" }, "defabcdef", "xydxyexyfxabcydxyexyfxy"}, +} + +func TestReplaceAll(t *testing.T) { + for _, tc := range replaceTests { + re, err := Compile(tc.pattern) + if err != nil { + t.Errorf("Unexpected error compiling %q: %v", tc.pattern, err) + continue + } + actual := re.ReplaceAllString(tc.input, tc.replacement) + if actual != tc.output { + t.Errorf("%q.Replace(%q,%q) = %q; want %q", + tc.pattern, tc.input, tc.replacement, actual, tc.output) + } + // now try bytes + actual = string(re.ReplaceAll([]byte(tc.input), []byte(tc.replacement))) + if actual != tc.output { + t.Errorf("%q.Replace(%q,%q) = %q; want %q", + tc.pattern, tc.input, tc.replacement, actual, tc.output) + } + } +} + +func TestReplaceAllFunc(t *testing.T) { + for _, tc := range replaceFuncTests { + re, err := Compile(tc.pattern) + if err != nil { + t.Errorf("Unexpected error compiling %q: %v", tc.pattern, err) + continue + } + actual := re.ReplaceAllStringFunc(tc.input, tc.replacement) + if actual != tc.output { + t.Errorf("%q.ReplaceFunc(%q,%q) = %q; want %q", + tc.pattern, tc.input, tc.replacement, actual, tc.output) + } + // now try bytes + actual = string(re.ReplaceAllFunc([]byte(tc.input), func(s []byte) []byte { return []byte(tc.replacement(string(s))) })) + if actual != tc.output { + t.Errorf("%q.ReplaceFunc(%q,%q) = %q; want %q", + tc.pattern, tc.input, tc.replacement, actual, tc.output) + } + } +} + +type MetaTest struct { + pattern, output, literal string + isLiteral bool +} + +var metaTests = []MetaTest{ + {``, ``, ``, true}, + {`foo`, `foo`, `foo`, true}, + {`foo\.\$`, `foo\\\.\\\$`, `foo.$`, true}, // has meta but no operator + {`foo.\$`, `foo\.\\\$`, `foo`, false}, // has escaped operators and real operators + {`!@#$%^&*()_+-=[{]}\|,<.>/?~`, `!@#\$%\^&\*\(\)_\+-=\[\{\]\}\\\|,<\.>/\?~`, `!@#`, false}, +} + +func TestQuoteMeta(t *testing.T) { + for _, tc := range metaTests { + // Verify that QuoteMeta returns the expected string. + quoted := QuoteMeta(tc.pattern) + if quoted != tc.output { + t.Errorf("QuoteMeta(`%s`) = `%s`; want `%s`", + tc.pattern, quoted, tc.output) + continue + } + + // Verify that the quoted string is in fact treated as expected + // by Compile -- i.e. that it matches the original, unquoted string. + if tc.pattern != "" { + re, err := Compile(quoted) + if err != nil { + t.Errorf("Unexpected error compiling QuoteMeta(`%s`): %v", tc.pattern, err) + continue + } + src := "abc" + tc.pattern + "def" + repl := "xyz" + replaced := re.ReplaceAllString(src, repl) + expected := "abcxyzdef" + if replaced != expected { + t.Errorf("QuoteMeta(`%s`).Replace(`%s`,`%s`) = `%s`; want `%s`", + tc.pattern, src, repl, replaced, expected) + } + } + } +} + +func TestLiteralPrefix(t *testing.T) { + for _, tc := range metaTests { + // Literal method needs to scan the pattern. + re := MustCompile(tc.pattern) + str, complete := re.LiteralPrefix() + if complete != tc.isLiteral { + t.Errorf("LiteralPrefix(`%s`) = %t; want %t", tc.pattern, complete, tc.isLiteral) + } + if str != tc.literal { + t.Errorf("LiteralPrefix(`%s`) = `%s`; want `%s`", tc.pattern, str, tc.literal) + } + } +} + +type numSubexpCase struct { + input string + expected int +} + +var numSubexpCases = []numSubexpCase{ + {``, 0}, + {`.*`, 0}, + {`abba`, 0}, + {`ab(b)a`, 1}, + {`ab(.*)a`, 1}, + {`(.*)ab(.*)a`, 2}, + {`(.*)(ab)(.*)a`, 3}, + {`(.*)((a)b)(.*)a`, 4}, + {`(.*)(\(ab)(.*)a`, 3}, + {`(.*)(\(a\)b)(.*)a`, 3}, +} + +func TestNumSubexp(t *testing.T) { + for _, c := range numSubexpCases { + re := MustCompile(c.input) + n := re.NumSubexp() + if n != c.expected { + t.Errorf("NumSubexp for %q returned %d, expected %d", c.input, n, c.expected) + } + } +} + +func BenchmarkLiteral(b *testing.B) { + x := strings.Repeat("x", 50) + "y" + b.StopTimer() + re := MustCompile("y") + b.StartTimer() + for i := 0; i < b.N; i++ { + if !re.MatchString(x) { + println("no match!") + break + } + } +} + +func BenchmarkNotLiteral(b *testing.B) { + x := strings.Repeat("x", 50) + "y" + b.StopTimer() + re := MustCompile(".y") + b.StartTimer() + for i := 0; i < b.N; i++ { + if !re.MatchString(x) { + println("no match!") + break + } + } +} + +func BenchmarkMatchClass(b *testing.B) { + b.StopTimer() + x := strings.Repeat("xxxx", 20) + "w" + re := MustCompile("[abcdw]") + b.StartTimer() + for i := 0; i < b.N; i++ { + if !re.MatchString(x) { + println("no match!") + break + } + } +} + +func BenchmarkMatchClass_InRange(b *testing.B) { + b.StopTimer() + // 'b' is between 'a' and 'c', so the charclass + // range checking is no help here. + x := strings.Repeat("bbbb", 20) + "c" + re := MustCompile("[ac]") + b.StartTimer() + for i := 0; i < b.N; i++ { + if !re.MatchString(x) { + println("no match!") + break + } + } +} + +func BenchmarkReplaceAll(b *testing.B) { + x := "abcdefghijklmnopqrstuvwxyz" + b.StopTimer() + re := MustCompile("[cjrw]") + b.StartTimer() + for i := 0; i < b.N; i++ { + re.ReplaceAllString(x, "") + } +} + +func BenchmarkAnchoredLiteralShortNonMatch(b *testing.B) { + b.StopTimer() + x := []byte("abcdefghijklmnopqrstuvwxyz") + re := MustCompile("^zbc(d|e)") + b.StartTimer() + for i := 0; i < b.N; i++ { + re.Match(x) + } +} + +func BenchmarkAnchoredLiteralLongNonMatch(b *testing.B) { + b.StopTimer() + x := []byte("abcdefghijklmnopqrstuvwxyz") + for i := 0; i < 15; i++ { + x = append(x, x...) + } + re := MustCompile("^zbc(d|e)") + b.StartTimer() + for i := 0; i < b.N; i++ { + re.Match(x) + } +} + +func BenchmarkAnchoredShortMatch(b *testing.B) { + b.StopTimer() + x := []byte("abcdefghijklmnopqrstuvwxyz") + re := MustCompile("^.bc(d|e)") + b.StartTimer() + for i := 0; i < b.N; i++ { + re.Match(x) + } +} + +func BenchmarkAnchoredLongMatch(b *testing.B) { + b.StopTimer() + x := []byte("abcdefghijklmnopqrstuvwxyz") + for i := 0; i < 15; i++ { + x = append(x, x...) + } + re := MustCompile("^.bc(d|e)") + b.StartTimer() + for i := 0; i < b.N; i++ { + re.Match(x) + } +} diff --git a/src/pkg/exp/regexp/exec.go b/src/pkg/exp/regexp/exec.go new file mode 100644 index 0000000000..0670bb9b1b --- /dev/null +++ b/src/pkg/exp/regexp/exec.go @@ -0,0 +1,295 @@ +package regexp + +import "exp/regexp/syntax" + +// A queue is a 'sparse array' holding pending threads of execution. +// See http://research.swtch.com/2008/03/using-uninitialized-memory-for-fun-and.html +type queue struct { + sparse []uint32 + dense []entry +} + +// A entry is an entry on a queue. +// It holds both the instruction pc and the actual thread. +// Some queue entries are just place holders so that the machine +// knows it has considered that pc. Such entries have t == nil. +type entry struct { + pc uint32 + t *thread +} + +// A thread is the state of a single path through the machine: +// an instruction and a corresponding capture array. +// See http://swtch.com/~rsc/regexp/regexp2.html +type thread struct { + inst *syntax.Inst + cap []int +} + +// A machine holds all the state during an NFA simulation for p. +type machine struct { + re *Regexp // corresponding Regexp + p *syntax.Prog // compiled program + q0, q1 queue // two queues for runq, nextq + pool []*thread // pool of available threads + matched bool // whether a match was found + matchcap []int // capture information for the match +} + +// progMachine returns a new machine running the prog p. +func progMachine(p *syntax.Prog) *machine { + m := &machine{p: p} + n := len(m.p.Inst) + m.q0 = queue{make([]uint32, n), make([]entry, 0, n)} + m.q1 = queue{make([]uint32, n), make([]entry, 0, n)} + ncap := p.NumCap + if ncap < 2 { + ncap = 2 + } + m.matchcap = make([]int, ncap) + return m +} + +// alloc allocates a new thread with the given instruction. +// It uses the free pool if possible. +func (m *machine) alloc(i *syntax.Inst) *thread { + var t *thread + if n := len(m.pool); n > 0 { + t = m.pool[n-1] + m.pool = m.pool[:n-1] + } else { + t = new(thread) + t.cap = make([]int, cap(m.matchcap)) + } + t.cap = t.cap[:len(m.matchcap)] + t.inst = i + return t +} + +// free returns t to the free pool. +func (m *machine) free(t *thread) { + m.pool = append(m.pool, t) +} + +// match runs the machine over the input starting at pos. +// It reports whether a match was found. +// If so, m.matchcap holds the submatch information. +func (m *machine) match(i input, pos int) bool { + startCond := m.re.cond + if startCond == ^syntax.EmptyOp(0) { // impossible + return false + } + m.matched = false + for i := range m.matchcap { + m.matchcap[i] = -1 + } + runq, nextq := &m.q0, &m.q1 + rune, rune1 := endOfText, endOfText + width, width1 := 0, 0 + rune, width = i.step(pos) + if rune != endOfText { + rune1, width1 = i.step(pos + width) + } + // TODO: Let caller specify the initial flag setting. + // For now assume pos == 0 is beginning of text and + // pos != 0 is not even beginning of line. + // TODO: Word boundary. + var flag syntax.EmptyOp + if pos == 0 { + flag = syntax.EmptyBeginText | syntax.EmptyBeginLine + } + + // Update flag using lookahead rune. + if rune1 == '\n' { + flag |= syntax.EmptyEndLine + } + if rune1 == endOfText { + flag |= syntax.EmptyEndText + } + + for { + if len(runq.dense) == 0 { + if startCond&syntax.EmptyBeginText != 0 && pos != 0 { + // Anchored match, past beginning of text. + break + } + if m.matched { + // Have match; finished exploring alternatives. + break + } + if len(m.re.prefix) > 0 && rune1 != m.re.prefixRune && i.canCheckPrefix() { + // Match requires literal prefix; fast search for it. + advance := i.index(m.re, pos) + if advance < 0 { + break + } + pos += advance + rune, width = i.step(pos) + rune1, width1 = i.step(pos + width) + } + } + if !m.matched { + if len(m.matchcap) > 0 { + m.matchcap[0] = pos + } + m.add(runq, uint32(m.p.Start), pos, m.matchcap, flag) + } + // TODO: word boundary + flag = 0 + if rune == '\n' { + flag |= syntax.EmptyBeginLine + } + if rune1 == '\n' { + flag |= syntax.EmptyEndLine + } + if rune1 == endOfText { + flag |= syntax.EmptyEndText + } + m.step(runq, nextq, pos, pos+width, rune, flag) + if width == 0 { + break + } + pos += width + rune, width = rune1, width1 + if rune != endOfText { + rune1, width1 = i.step(pos + width) + } + runq, nextq = nextq, runq + } + m.clear(nextq) + return m.matched +} + +// clear frees all threads on the thread queue. +func (m *machine) clear(q *queue) { + for _, d := range q.dense { + if d.t != nil { + m.free(d.t) + } + } + q.dense = q.dense[:0] +} + +// step executes one step of the machine, running each of the threads +// on runq and appending new threads to nextq. +// The step processes the rune c (which may be endOfText), +// which starts at position pos and ends at nextPos. +// nextCond gives the setting for the empty-width flags after c. +func (m *machine) step(runq, nextq *queue, pos, nextPos, c int, nextCond syntax.EmptyOp) { + for j := 0; j < len(runq.dense); j++ { + d := &runq.dense[j] + t := d.t + if t == nil { + continue + } + /* + * If we support leftmost-longest matching: + if longest && matched && match[0] < t.cap[0] { + m.free(t) + continue + } + */ + + i := t.inst + switch i.Op { + default: + panic("bad inst") + + case syntax.InstMatch: + if len(t.cap) > 0 { + t.cap[1] = pos + copy(m.matchcap, t.cap) + } + m.matched = true + for _, d := range runq.dense[j+1:] { + if d.t != nil { + m.free(d.t) + } + } + runq.dense = runq.dense[:0] + + case syntax.InstRune: + if i.MatchRune(c) { + m.add(nextq, i.Out, nextPos, t.cap, nextCond) + } + } + m.free(t) + } + runq.dense = runq.dense[:0] +} + +// add adds an entry to q for pc, unless the q already has such an entry. +// It also recursively adds an entry for all instructions reachable from pc by following +// empty-width conditions satisfied by cond. pos gives the current position +// in the input. +func (m *machine) add(q *queue, pc uint32, pos int, cap []int, cond syntax.EmptyOp) { + if pc == 0 { + return + } + if j := q.sparse[pc]; j < uint32(len(q.dense)) && q.dense[j].pc == pc { + return + } + + j := len(q.dense) + q.dense = q.dense[:j+1] + d := &q.dense[j] + d.t = nil + d.pc = pc + q.sparse[pc] = uint32(j) + + i := &m.p.Inst[pc] + switch i.Op { + default: + panic("unhandled") + case syntax.InstFail: + // nothing + case syntax.InstAlt, syntax.InstAltMatch: + m.add(q, i.Out, pos, cap, cond) + m.add(q, i.Arg, pos, cap, cond) + case syntax.InstEmptyWidth: + if syntax.EmptyOp(i.Arg)&^cond == 0 { + m.add(q, i.Out, pos, cap, cond) + } + case syntax.InstNop: + m.add(q, i.Out, pos, cap, cond) + case syntax.InstCapture: + if int(i.Arg) < len(cap) { + opos := cap[i.Arg] + cap[i.Arg] = pos + m.add(q, i.Out, pos, cap, cond) + cap[i.Arg] = opos + } else { + m.add(q, i.Out, pos, cap, cond) + } + case syntax.InstMatch, syntax.InstRune: + t := m.alloc(i) + if len(t.cap) > 0 { + copy(t.cap, cap) + } + d.t = t + } +} + +// empty is a non-nil 0-element slice, +// so doExecute can avoid an allocation +// when 0 captures are requested from a successful match. +var empty = make([]int, 0) + +// doExecute finds the leftmost match in the input and returns +// the position of its subexpressions. +func (re *Regexp) doExecute(i input, pos int, ncap int) []int { + m := re.get() + m.matchcap = m.matchcap[:ncap] + if !m.match(i, pos) { + re.put(m) + return nil + } + if ncap == 0 { + re.put(m) + return empty // empty but not nil + } + cap := make([]int, ncap) + copy(cap, m.matchcap) + re.put(m) + return cap +} diff --git a/src/pkg/exp/regexp/find_test.go b/src/pkg/exp/regexp/find_test.go new file mode 100644 index 0000000000..dddc3484c9 --- /dev/null +++ b/src/pkg/exp/regexp/find_test.go @@ -0,0 +1,472 @@ +// Copyright 2010 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package regexp + +import ( + "fmt" + "strings" + "testing" +) + +// For each pattern/text pair, what is the expected output of each function? +// We can derive the textual results from the indexed results, the non-submatch +// results from the submatched results, the single results from the 'all' results, +// and the byte results from the string results. Therefore the table includes +// only the FindAllStringSubmatchIndex result. +type FindTest struct { + pat string + text string + matches [][]int +} + +func (t FindTest) String() string { + return fmt.Sprintf("pat: %#q text: %#q", t.pat, t.text) +} + +var findTests = []FindTest{ + {``, ``, build(1, 0, 0)}, + {`^abcdefg`, "abcdefg", build(1, 0, 7)}, + {`a+`, "baaab", build(1, 1, 4)}, + {"abcd..", "abcdef", build(1, 0, 6)}, + {`a`, "a", build(1, 0, 1)}, + {`x`, "y", nil}, + {`b`, "abc", build(1, 1, 2)}, + {`.`, "a", build(1, 0, 1)}, + {`.*`, "abcdef", build(1, 0, 6)}, + {`^`, "abcde", build(1, 0, 0)}, + {`$`, "abcde", build(1, 5, 5)}, + {`^abcd$`, "abcd", build(1, 0, 4)}, + {`^bcd'`, "abcdef", nil}, + {`^abcd$`, "abcde", nil}, + {`a+`, "baaab", build(1, 1, 4)}, + {`a*`, "baaab", build(3, 0, 0, 1, 4, 5, 5)}, + {`[a-z]+`, "abcd", build(1, 0, 4)}, + {`[^a-z]+`, "ab1234cd", build(1, 2, 6)}, + {`[a\-\]z]+`, "az]-bcz", build(2, 0, 4, 6, 7)}, + {`[^\n]+`, "abcd\n", build(1, 0, 4)}, + {`[日本語]+`, "日本語日本語", build(1, 0, 18)}, + {`日本語+`, "日本語", build(1, 0, 9)}, + {`日本語+`, "日本語語語語", build(1, 0, 18)}, + {`()`, "", build(1, 0, 0, 0, 0)}, + {`(a)`, "a", build(1, 0, 1, 0, 1)}, + {`(.)(.)`, "日a", build(1, 0, 4, 0, 3, 3, 4)}, + {`(.*)`, "", build(1, 0, 0, 0, 0)}, + {`(.*)`, "abcd", build(1, 0, 4, 0, 4)}, + {`(..)(..)`, "abcd", build(1, 0, 4, 0, 2, 2, 4)}, + {`(([^xyz]*)(d))`, "abcd", build(1, 0, 4, 0, 4, 0, 3, 3, 4)}, + {`((a|b|c)*(d))`, "abcd", build(1, 0, 4, 0, 4, 2, 3, 3, 4)}, + {`(((a|b|c)*)(d))`, "abcd", build(1, 0, 4, 0, 4, 0, 3, 2, 3, 3, 4)}, + {`\a\f\n\r\t\v`, "\a\f\n\r\t\v", build(1, 0, 6)}, + {`[\a\f\n\r\t\v]+`, "\a\f\n\r\t\v", build(1, 0, 6)}, + + {`a*(|(b))c*`, "aacc", build(1, 0, 4, 2, 2, -1, -1)}, + {`(.*).*`, "ab", build(1, 0, 2, 0, 2)}, + {`[.]`, ".", build(1, 0, 1)}, + {`/$`, "/abc/", build(1, 4, 5)}, + {`/$`, "/abc", nil}, + + // multiple matches + {`.`, "abc", build(3, 0, 1, 1, 2, 2, 3)}, + {`(.)`, "abc", build(3, 0, 1, 0, 1, 1, 2, 1, 2, 2, 3, 2, 3)}, + {`.(.)`, "abcd", build(2, 0, 2, 1, 2, 2, 4, 3, 4)}, + {`ab*`, "abbaab", build(3, 0, 3, 3, 4, 4, 6)}, + {`a(b*)`, "abbaab", build(3, 0, 3, 1, 3, 3, 4, 4, 4, 4, 6, 5, 6)}, + + // fixed bugs + {`ab$`, "cab", build(1, 1, 3)}, + {`axxb$`, "axxcb", nil}, + {`data`, "daXY data", build(1, 5, 9)}, + {`da(.)a$`, "daXY data", build(1, 5, 9, 7, 8)}, + {`zx+`, "zzx", build(1, 1, 3)}, + + // can backslash-escape any punctuation + {`\!\"\#\$\%\&\'\(\)\*\+\,\-\.\/\:\;\<\=\>\?\@\[\\\]\^\_\{\|\}\~`, + `!"#$%&'()*+,-./:;<=>?@[\]^_{|}~`, build(1, 0, 31)}, + {`[\!\"\#\$\%\&\'\(\)\*\+\,\-\.\/\:\;\<\=\>\?\@\[\\\]\^\_\{\|\}\~]+`, + `!"#$%&'()*+,-./:;<=>?@[\]^_{|}~`, build(1, 0, 31)}, + {"\\`", "`", build(1, 0, 1)}, + {"[\\`]+", "`", build(1, 0, 1)}, + + // long set of matches (longer than startSize) + { + ".", + "qwertyuiopasdfghjklzxcvbnm1234567890", + build(36, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, + 10, 11, 11, 12, 12, 13, 13, 14, 14, 15, 15, 16, 16, 17, 17, 18, 18, 19, 19, 20, + 20, 21, 21, 22, 22, 23, 23, 24, 24, 25, 25, 26, 26, 27, 27, 28, 28, 29, 29, 30, + 30, 31, 31, 32, 32, 33, 33, 34, 34, 35, 35, 36), + }, +} + +// build is a helper to construct a [][]int by extracting n sequences from x. +// This represents n matches with len(x)/n submatches each. +func build(n int, x ...int) [][]int { + ret := make([][]int, n) + runLength := len(x) / n + j := 0 + for i := range ret { + ret[i] = make([]int, runLength) + copy(ret[i], x[j:]) + j += runLength + if j > len(x) { + panic("invalid build entry") + } + } + return ret +} + +// First the simple cases. + +func TestFind(t *testing.T) { + for _, test := range findTests { + re := MustCompile(test.pat) + if re.String() != test.pat { + t.Errorf("String() = `%s`; should be `%s`", re.String(), test.pat) + } + result := re.Find([]byte(test.text)) + switch { + case len(test.matches) == 0 && len(result) == 0: + // ok + case test.matches == nil && result != nil: + t.Errorf("expected no match; got one: %s", test) + case test.matches != nil && result == nil: + t.Errorf("expected match; got none: %s", test) + case test.matches != nil && result != nil: + expect := test.text[test.matches[0][0]:test.matches[0][1]] + if expect != string(result) { + t.Errorf("expected %q got %q: %s", expect, result, test) + } + } + } +} + +func TestFindString(t *testing.T) { + for _, test := range findTests { + result := MustCompile(test.pat).FindString(test.text) + switch { + case len(test.matches) == 0 && len(result) == 0: + // ok + case test.matches == nil && result != "": + t.Errorf("expected no match; got one: %s", test) + case test.matches != nil && result == "": + // Tricky because an empty result has two meanings: no match or empty match. + if test.matches[0][0] != test.matches[0][1] { + t.Errorf("expected match; got none: %s", test) + } + case test.matches != nil && result != "": + expect := test.text[test.matches[0][0]:test.matches[0][1]] + if expect != result { + t.Errorf("expected %q got %q: %s", expect, result, test) + } + } + } +} + +func testFindIndex(test *FindTest, result []int, t *testing.T) { + switch { + case len(test.matches) == 0 && len(result) == 0: + // ok + case test.matches == nil && result != nil: + t.Errorf("expected no match; got one: %s", test) + case test.matches != nil && result == nil: + t.Errorf("expected match; got none: %s", test) + case test.matches != nil && result != nil: + expect := test.matches[0] + if expect[0] != result[0] || expect[1] != result[1] { + t.Errorf("expected %v got %v: %s", expect, result, test) + } + } +} + +func TestFindIndex(t *testing.T) { + for _, test := range findTests { + testFindIndex(&test, MustCompile(test.pat).FindIndex([]byte(test.text)), t) + } +} + +func TestFindStringIndex(t *testing.T) { + for _, test := range findTests { + testFindIndex(&test, MustCompile(test.pat).FindStringIndex(test.text), t) + } +} + +func TestFindReaderIndex(t *testing.T) { + for _, test := range findTests { + testFindIndex(&test, MustCompile(test.pat).FindReaderIndex(strings.NewReader(test.text)), t) + } +} + +// Now come the simple All cases. + +func TestFindAll(t *testing.T) { + for _, test := range findTests { + result := MustCompile(test.pat).FindAll([]byte(test.text), -1) + switch { + case test.matches == nil && result == nil: + // ok + case test.matches == nil && result != nil: + t.Errorf("expected no match; got one: %s", test) + case test.matches != nil && result == nil: + t.Fatalf("expected match; got none: %s", test) + case test.matches != nil && result != nil: + if len(test.matches) != len(result) { + t.Errorf("expected %d matches; got %d: %s", len(test.matches), len(result), test) + continue + } + for k, e := range test.matches { + expect := test.text[e[0]:e[1]] + if expect != string(result[k]) { + t.Errorf("match %d: expected %q got %q: %s", k, expect, result[k], test) + } + } + } + } +} + +func TestFindAllString(t *testing.T) { + for _, test := range findTests { + result := MustCompile(test.pat).FindAllString(test.text, -1) + switch { + case test.matches == nil && result == nil: + // ok + case test.matches == nil && result != nil: + t.Errorf("expected no match; got one: %s", test) + case test.matches != nil && result == nil: + t.Errorf("expected match; got none: %s", test) + case test.matches != nil && result != nil: + if len(test.matches) != len(result) { + t.Errorf("expected %d matches; got %d: %s", len(test.matches), len(result), test) + continue + } + for k, e := range test.matches { + expect := test.text[e[0]:e[1]] + if expect != result[k] { + t.Errorf("expected %q got %q: %s", expect, result, test) + } + } + } + } +} + +func testFindAllIndex(test *FindTest, result [][]int, t *testing.T) { + switch { + case test.matches == nil && result == nil: + // ok + case test.matches == nil && result != nil: + t.Errorf("expected no match; got one: %s", test) + case test.matches != nil && result == nil: + t.Errorf("expected match; got none: %s", test) + case test.matches != nil && result != nil: + if len(test.matches) != len(result) { + t.Errorf("expected %d matches; got %d: %s", len(test.matches), len(result), test) + return + } + for k, e := range test.matches { + if e[0] != result[k][0] || e[1] != result[k][1] { + t.Errorf("match %d: expected %v got %v: %s", k, e, result[k], test) + } + } + } +} + +func TestFindAllIndex(t *testing.T) { + for _, test := range findTests { + testFindAllIndex(&test, MustCompile(test.pat).FindAllIndex([]byte(test.text), -1), t) + } +} + +func TestFindAllStringIndex(t *testing.T) { + for _, test := range findTests { + testFindAllIndex(&test, MustCompile(test.pat).FindAllStringIndex(test.text, -1), t) + } +} + +// Now come the Submatch cases. + +func testSubmatchBytes(test *FindTest, n int, submatches []int, result [][]byte, t *testing.T) { + if len(submatches) != len(result)*2 { + t.Errorf("match %d: expected %d submatches; got %d: %s", n, len(submatches)/2, len(result), test) + return + } + for k := 0; k < len(submatches); k += 2 { + if submatches[k] == -1 { + if result[k/2] != nil { + t.Errorf("match %d: expected nil got %q: %s", n, result, test) + } + continue + } + expect := test.text[submatches[k]:submatches[k+1]] + if expect != string(result[k/2]) { + t.Errorf("match %d: expected %q got %q: %s", n, expect, result, test) + return + } + } +} + +func TestFindSubmatch(t *testing.T) { + for _, test := range findTests { + result := MustCompile(test.pat).FindSubmatch([]byte(test.text)) + switch { + case test.matches == nil && result == nil: + // ok + case test.matches == nil && result != nil: + t.Errorf("expected no match; got one: %s", test) + case test.matches != nil && result == nil: + t.Errorf("expected match; got none: %s", test) + case test.matches != nil && result != nil: + testSubmatchBytes(&test, 0, test.matches[0], result, t) + } + } +} + +func testSubmatchString(test *FindTest, n int, submatches []int, result []string, t *testing.T) { + if len(submatches) != len(result)*2 { + t.Errorf("match %d: expected %d submatches; got %d: %s", n, len(submatches)/2, len(result), test) + return + } + for k := 0; k < len(submatches); k += 2 { + if submatches[k] == -1 { + if result[k/2] != "" { + t.Errorf("match %d: expected nil got %q: %s", n, result, test) + } + continue + } + expect := test.text[submatches[k]:submatches[k+1]] + if expect != result[k/2] { + t.Errorf("match %d: expected %q got %q: %s", n, expect, result, test) + return + } + } +} + +func TestFindStringSubmatch(t *testing.T) { + for _, test := range findTests { + result := MustCompile(test.pat).FindStringSubmatch(test.text) + switch { + case test.matches == nil && result == nil: + // ok + case test.matches == nil && result != nil: + t.Errorf("expected no match; got one: %s", test) + case test.matches != nil && result == nil: + t.Errorf("expected match; got none: %s", test) + case test.matches != nil && result != nil: + testSubmatchString(&test, 0, test.matches[0], result, t) + } + } +} + +func testSubmatchIndices(test *FindTest, n int, expect, result []int, t *testing.T) { + if len(expect) != len(result) { + t.Errorf("match %d: expected %d matches; got %d: %s", n, len(expect)/2, len(result)/2, test) + return + } + for k, e := range expect { + if e != result[k] { + t.Errorf("match %d: submatch error: expected %v got %v: %s", n, expect, result, test) + } + } +} + +func testFindSubmatchIndex(test *FindTest, result []int, t *testing.T) { + switch { + case test.matches == nil && result == nil: + // ok + case test.matches == nil && result != nil: + t.Errorf("expected no match; got one: %s", test) + case test.matches != nil && result == nil: + t.Errorf("expected match; got none: %s", test) + case test.matches != nil && result != nil: + testSubmatchIndices(test, 0, test.matches[0], result, t) + } +} + +func TestFindSubmatchIndex(t *testing.T) { + for _, test := range findTests { + testFindSubmatchIndex(&test, MustCompile(test.pat).FindSubmatchIndex([]byte(test.text)), t) + } +} + +func TestFindStringSubmatchIndex(t *testing.T) { + for _, test := range findTests { + testFindSubmatchIndex(&test, MustCompile(test.pat).FindStringSubmatchIndex(test.text), t) + } +} + +func TestFindReaderSubmatchIndex(t *testing.T) { + for _, test := range findTests { + testFindSubmatchIndex(&test, MustCompile(test.pat).FindReaderSubmatchIndex(strings.NewReader(test.text)), t) + } +} + +// Now come the monster AllSubmatch cases. + +func TestFindAllSubmatch(t *testing.T) { + for _, test := range findTests { + result := MustCompile(test.pat).FindAllSubmatch([]byte(test.text), -1) + switch { + case test.matches == nil && result == nil: + // ok + case test.matches == nil && result != nil: + t.Errorf("expected no match; got one: %s", test) + case test.matches != nil && result == nil: + t.Errorf("expected match; got none: %s", test) + case len(test.matches) != len(result): + t.Errorf("expected %d matches; got %d: %s", len(test.matches), len(result), test) + case test.matches != nil && result != nil: + for k, match := range test.matches { + testSubmatchBytes(&test, k, match, result[k], t) + } + } + } +} + +func TestFindAllStringSubmatch(t *testing.T) { + for _, test := range findTests { + result := MustCompile(test.pat).FindAllStringSubmatch(test.text, -1) + switch { + case test.matches == nil && result == nil: + // ok + case test.matches == nil && result != nil: + t.Errorf("expected no match; got one: %s", test) + case test.matches != nil && result == nil: + t.Errorf("expected match; got none: %s", test) + case len(test.matches) != len(result): + t.Errorf("expected %d matches; got %d: %s", len(test.matches), len(result), test) + case test.matches != nil && result != nil: + for k, match := range test.matches { + testSubmatchString(&test, k, match, result[k], t) + } + } + } +} + +func testFindAllSubmatchIndex(test *FindTest, result [][]int, t *testing.T) { + switch { + case test.matches == nil && result == nil: + // ok + case test.matches == nil && result != nil: + t.Errorf("expected no match; got one: %s", test) + case test.matches != nil && result == nil: + t.Errorf("expected match; got none: %s", test) + case len(test.matches) != len(result): + t.Errorf("expected %d matches; got %d: %s", len(test.matches), len(result), test) + case test.matches != nil && result != nil: + for k, match := range test.matches { + testSubmatchIndices(test, k, match, result[k], t) + } + } +} + +func TestFindAllSubmatchIndex(t *testing.T) { + for _, test := range findTests { + testFindAllSubmatchIndex(&test, MustCompile(test.pat).FindAllSubmatchIndex([]byte(test.text), -1), t) + } +} + +func TestFindAllStringSubmatchIndex(t *testing.T) { + for _, test := range findTests { + testFindAllSubmatchIndex(&test, MustCompile(test.pat).FindAllStringSubmatchIndex(test.text, -1), t) + } +} diff --git a/src/pkg/exp/regexp/regexp.go b/src/pkg/exp/regexp/regexp.go new file mode 100644 index 0000000000..1b75900f81 --- /dev/null +++ b/src/pkg/exp/regexp/regexp.go @@ -0,0 +1,795 @@ +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// Package regexp implements a simple regular expression library. +// +// The syntax of the regular expressions accepted is the same +// general syntax used by Perl, Python, and other languages. +// More precisely, it is the syntax accepted by RE2 and described at +// http://code.google.com/p/re2/wiki/Syntax, except for \C. +// +// All characters are UTF-8-encoded code points. +// +// There are 16 methods of Regexp that match a regular expression and identify +// the matched text. Their names are matched by this regular expression: +// +// Find(All)?(String)?(Submatch)?(Index)? +// +// If 'All' is present, the routine matches successive non-overlapping +// matches of the entire expression. Empty matches abutting a preceding +// match are ignored. The return value is a slice containing the successive +// return values of the corresponding non-'All' routine. These routines take +// an extra integer argument, n; if n >= 0, the function returns at most n +// matches/submatches. +// +// If 'String' is present, the argument is a string; otherwise it is a slice +// of bytes; return values are adjusted as appropriate. +// +// If 'Submatch' is present, the return value is a slice identifying the +// successive submatches of the expression. Submatches are matches of +// parenthesized subexpressions within the regular expression, numbered from +// left to right in order of opening parenthesis. Submatch 0 is the match of +// the entire expression, submatch 1 the match of the first parenthesized +// subexpression, and so on. +// +// If 'Index' is present, matches and submatches are identified by byte index +// pairs within the input string: result[2*n:2*n+1] identifies the indexes of +// the nth submatch. The pair for n==0 identifies the match of the entire +// expression. If 'Index' is not present, the match is identified by the +// text of the match/submatch. If an index is negative, it means that +// subexpression did not match any string in the input. +// +// There is also a subset of the methods that can be applied to text read +// from a RuneReader: +// +// MatchReader, FindReaderIndex, FindReaderSubmatchIndex +// +// This set may grow. Note that regular expression matches may need to +// examine text beyond the text returned by a match, so the methods that +// match text from a RuneReader may read arbitrarily far into the input +// before returning. +// +// (There are a few other methods that do not match this pattern.) +// +package regexp + +import ( + "bytes" + "exp/regexp/syntax" + "io" + "os" + "strings" + "sync" + "utf8" +) + +var debug = false + +// Error is the local type for a parsing error. +type Error string + +func (e Error) String() string { + return string(e) +} + +// Regexp is the representation of a compiled regular expression. +// The public interface is entirely through methods. +// A Regexp is safe for concurrent use by multiple goroutines. +type Regexp struct { + // read-only after Compile + expr string // as passed to Compile + prog *syntax.Prog // compiled program + prefix string // required prefix in unanchored matches + prefixBytes []byte // prefix, as a []byte + prefixComplete bool // prefix is the entire regexp + prefixRune int // first rune in prefix + cond syntax.EmptyOp // empty-width conditions required at start of match + + // cache of machines for running regexp + mu sync.Mutex + machine []*machine +} + +// String returns the source text used to compile the regular expression. +func (re *Regexp) String() string { + return re.expr +} + +// Compile parses a regular expression and returns, if successful, a Regexp +// object that can be used to match against text. +func Compile(expr string) (*Regexp, os.Error) { + re, err := syntax.Parse(expr, syntax.Perl) + if err != nil { + return nil, err + } + prog, err := syntax.Compile(re) + if err != nil { + return nil, err + } + regexp := &Regexp{ + expr: expr, + prog: prog, + } + regexp.prefix, regexp.prefixComplete = prog.Prefix() + if regexp.prefix != "" { + // TODO(rsc): Remove this allocation by adding + // IndexString to package bytes. + regexp.prefixBytes = []byte(regexp.prefix) + regexp.prefixRune, _ = utf8.DecodeRuneInString(regexp.prefix) + } + regexp.cond = prog.StartCond() + return regexp, nil +} + +// get returns a machine to use for matching re. +// It uses the re's machine cache if possible, to avoid +// unnecessary allocation. +func (re *Regexp) get() *machine { + re.mu.Lock() + if n := len(re.machine); n > 0 { + z := re.machine[n-1] + re.machine = re.machine[:n-1] + re.mu.Unlock() + return z + } + re.mu.Unlock() + z := progMachine(re.prog) + z.re = re + return z +} + +// put returns a machine to the re's machine cache. +// There is no attempt to limit the size of the cache, so it will +// grow to the maximum number of simultaneous matches +// run using re. (The cache empties when re gets garbage collected.) +func (re *Regexp) put(z *machine) { + re.mu.Lock() + re.machine = append(re.machine, z) + re.mu.Unlock() +} + +// MustCompile is like Compile but panics if the expression cannot be parsed. +// It simplifies safe initialization of global variables holding compiled regular +// expressions. +func MustCompile(str string) *Regexp { + regexp, error := Compile(str) + if error != nil { + panic(`regexp: compiling "` + str + `": ` + error.String()) + } + return regexp +} + +// NumSubexp returns the number of parenthesized subexpressions in this Regexp. +func (re *Regexp) NumSubexp() int { + // NumCap/2 because captures count ( and ) separately. + // -1 because NumCap counts $0 but NumSubexp does not. + return re.prog.NumCap/2 - 1 +} + +const endOfText = -1 + +// input abstracts different representations of the input text. It provides +// one-character lookahead. +type input interface { + step(pos int) (rune int, width int) // advance one rune + canCheckPrefix() bool // can we look ahead without losing info? + hasPrefix(re *Regexp) bool + index(re *Regexp, pos int) int +} + +// inputString scans a string. +type inputString struct { + str string +} + +func newInputString(str string) *inputString { + return &inputString{str: str} +} + +func (i *inputString) step(pos int) (int, int) { + if pos < len(i.str) { + return utf8.DecodeRuneInString(i.str[pos:len(i.str)]) + } + return endOfText, 0 +} + +func (i *inputString) canCheckPrefix() bool { + return true +} + +func (i *inputString) hasPrefix(re *Regexp) bool { + return strings.HasPrefix(i.str, re.prefix) +} + +func (i *inputString) index(re *Regexp, pos int) int { + return strings.Index(i.str[pos:], re.prefix) +} + +// inputBytes scans a byte slice. +type inputBytes struct { + str []byte +} + +func newInputBytes(str []byte) *inputBytes { + return &inputBytes{str: str} +} + +func (i *inputBytes) step(pos int) (int, int) { + if pos < len(i.str) { + return utf8.DecodeRune(i.str[pos:len(i.str)]) + } + return endOfText, 0 +} + +func (i *inputBytes) canCheckPrefix() bool { + return true +} + +func (i *inputBytes) hasPrefix(re *Regexp) bool { + return bytes.HasPrefix(i.str, re.prefixBytes) +} + +func (i *inputBytes) index(re *Regexp, pos int) int { + return bytes.Index(i.str[pos:], re.prefixBytes) +} + +// inputReader scans a RuneReader. +type inputReader struct { + r io.RuneReader + atEOT bool + pos int +} + +func newInputReader(r io.RuneReader) *inputReader { + return &inputReader{r: r} +} + +func (i *inputReader) step(pos int) (int, int) { + if !i.atEOT && pos != i.pos { + return endOfText, 0 + + } + r, w, err := i.r.ReadRune() + if err != nil { + i.atEOT = true + return endOfText, 0 + } + i.pos += w + return r, w +} + +func (i *inputReader) canCheckPrefix() bool { + return false +} + +func (i *inputReader) hasPrefix(re *Regexp) bool { + return false +} + +func (i *inputReader) index(re *Regexp, pos int) int { + return -1 +} + +// LiteralPrefix returns a literal string that must begin any match +// of the regular expression re. It returns the boolean true if the +// literal string comprises the entire regular expression. +func (re *Regexp) LiteralPrefix() (prefix string, complete bool) { + return re.prefix, re.prefixComplete +} + +// MatchReader returns whether the Regexp matches the text read by the +// RuneReader. The return value is a boolean: true for match, false for no +// match. +func (re *Regexp) MatchReader(r io.RuneReader) bool { + return re.doExecute(newInputReader(r), 0, 0) != nil +} + +// MatchString returns whether the Regexp matches the string s. +// The return value is a boolean: true for match, false for no match. +func (re *Regexp) MatchString(s string) bool { + return re.doExecute(newInputString(s), 0, 0) != nil +} + +// Match returns whether the Regexp matches the byte slice b. +// The return value is a boolean: true for match, false for no match. +func (re *Regexp) Match(b []byte) bool { + return re.doExecute(newInputBytes(b), 0, 0) != nil +} + +// MatchReader checks whether a textual regular expression matches the text +// read by the RuneReader. More complicated queries need to use Compile and +// the full Regexp interface. +func MatchReader(pattern string, r io.RuneReader) (matched bool, error os.Error) { + re, err := Compile(pattern) + if err != nil { + return false, err + } + return re.MatchReader(r), nil +} + +// MatchString checks whether a textual regular expression +// matches a string. More complicated queries need +// to use Compile and the full Regexp interface. +func MatchString(pattern string, s string) (matched bool, error os.Error) { + re, err := Compile(pattern) + if err != nil { + return false, err + } + return re.MatchString(s), nil +} + +// Match checks whether a textual regular expression +// matches a byte slice. More complicated queries need +// to use Compile and the full Regexp interface. +func Match(pattern string, b []byte) (matched bool, error os.Error) { + re, err := Compile(pattern) + if err != nil { + return false, err + } + return re.Match(b), nil +} + +// ReplaceAllString returns a copy of src in which all matches for the Regexp +// have been replaced by repl. No support is provided for expressions +// (e.g. \1 or $1) in the replacement string. +func (re *Regexp) ReplaceAllString(src, repl string) string { + return re.ReplaceAllStringFunc(src, func(string) string { return repl }) +} + +// ReplaceAllStringFunc returns a copy of src in which all matches for the +// Regexp have been replaced by the return value of of function repl (whose +// first argument is the matched string). No support is provided for +// expressions (e.g. \1 or $1) in the replacement string. +func (re *Regexp) ReplaceAllStringFunc(src string, repl func(string) string) string { + lastMatchEnd := 0 // end position of the most recent match + searchPos := 0 // position where we next look for a match + buf := new(bytes.Buffer) + for searchPos <= len(src) { + a := re.doExecute(newInputString(src), searchPos, 2) + if len(a) == 0 { + break // no more matches + } + + // Copy the unmatched characters before this match. + io.WriteString(buf, src[lastMatchEnd:a[0]]) + + // Now insert a copy of the replacement string, but not for a + // match of the empty string immediately after another match. + // (Otherwise, we get double replacement for patterns that + // match both empty and nonempty strings.) + if a[1] > lastMatchEnd || a[0] == 0 { + io.WriteString(buf, repl(src[a[0]:a[1]])) + } + lastMatchEnd = a[1] + + // Advance past this match; always advance at least one character. + _, width := utf8.DecodeRuneInString(src[searchPos:]) + if searchPos+width > a[1] { + searchPos += width + } else if searchPos+1 > a[1] { + // This clause is only needed at the end of the input + // string. In that case, DecodeRuneInString returns width=0. + searchPos++ + } else { + searchPos = a[1] + } + } + + // Copy the unmatched characters after the last match. + io.WriteString(buf, src[lastMatchEnd:]) + + return buf.String() +} + +// ReplaceAll returns a copy of src in which all matches for the Regexp +// have been replaced by repl. No support is provided for expressions +// (e.g. \1 or $1) in the replacement text. +func (re *Regexp) ReplaceAll(src, repl []byte) []byte { + return re.ReplaceAllFunc(src, func([]byte) []byte { return repl }) +} + +// ReplaceAllFunc returns a copy of src in which all matches for the +// Regexp have been replaced by the return value of of function repl (whose +// first argument is the matched []byte). No support is provided for +// expressions (e.g. \1 or $1) in the replacement string. +func (re *Regexp) ReplaceAllFunc(src []byte, repl func([]byte) []byte) []byte { + lastMatchEnd := 0 // end position of the most recent match + searchPos := 0 // position where we next look for a match + buf := new(bytes.Buffer) + for searchPos <= len(src) { + a := re.doExecute(newInputBytes(src), searchPos, 2) + if len(a) == 0 { + break // no more matches + } + + // Copy the unmatched characters before this match. + buf.Write(src[lastMatchEnd:a[0]]) + + // Now insert a copy of the replacement string, but not for a + // match of the empty string immediately after another match. + // (Otherwise, we get double replacement for patterns that + // match both empty and nonempty strings.) + if a[1] > lastMatchEnd || a[0] == 0 { + buf.Write(repl(src[a[0]:a[1]])) + } + lastMatchEnd = a[1] + + // Advance past this match; always advance at least one character. + _, width := utf8.DecodeRune(src[searchPos:]) + if searchPos+width > a[1] { + searchPos += width + } else if searchPos+1 > a[1] { + // This clause is only needed at the end of the input + // string. In that case, DecodeRuneInString returns width=0. + searchPos++ + } else { + searchPos = a[1] + } + } + + // Copy the unmatched characters after the last match. + buf.Write(src[lastMatchEnd:]) + + return buf.Bytes() +} + +var specialBytes = []byte(`\.+*?()|[]{}^$`) + +func special(b byte) bool { + return bytes.IndexByte(specialBytes, b) >= 0 +} + +// QuoteMeta returns a string that quotes all regular expression metacharacters +// inside the argument text; the returned string is a regular expression matching +// the literal text. For example, QuoteMeta(`[foo]`) returns `\[foo\]`. +func QuoteMeta(s string) string { + b := make([]byte, 2*len(s)) + + // A byte loop is correct because all metacharacters are ASCII. + j := 0 + for i := 0; i < len(s); i++ { + if special(s[i]) { + b[j] = '\\' + j++ + } + b[j] = s[i] + j++ + } + return string(b[0:j]) +} + +// Find matches in slice b if b is non-nil, otherwise find matches in string s. +func (re *Regexp) allMatches(s string, b []byte, n int, deliver func([]int)) { + var end int + if b == nil { + end = len(s) + } else { + end = len(b) + } + + for pos, i, prevMatchEnd := 0, 0, -1; i < n && pos <= end; { + var in input + if b == nil { + in = newInputString(s) + } else { + in = newInputBytes(b) + } + matches := re.doExecute(in, pos, re.prog.NumCap) + if len(matches) == 0 { + break + } + + accept := true + if matches[1] == pos { + // We've found an empty match. + if matches[0] == prevMatchEnd { + // We don't allow an empty match right + // after a previous match, so ignore it. + accept = false + } + var width int + // TODO: use step() + if b == nil { + _, width = utf8.DecodeRuneInString(s[pos:end]) + } else { + _, width = utf8.DecodeRune(b[pos:end]) + } + if width > 0 { + pos += width + } else { + pos = end + 1 + } + } else { + pos = matches[1] + } + prevMatchEnd = matches[1] + + if accept { + deliver(matches) + i++ + } + } +} + +// Find returns a slice holding the text of the leftmost match in b of the regular expression. +// A return value of nil indicates no match. +func (re *Regexp) Find(b []byte) []byte { + a := re.doExecute(newInputBytes(b), 0, 2) + if a == nil { + return nil + } + return b[a[0]:a[1]] +} + +// FindIndex returns a two-element slice of integers defining the location of +// the leftmost match in b of the regular expression. The match itself is at +// b[loc[0]:loc[1]]. +// A return value of nil indicates no match. +func (re *Regexp) FindIndex(b []byte) (loc []int) { + a := re.doExecute(newInputBytes(b), 0, 2) + if a == nil { + return nil + } + return a[0:2] +} + +// FindString returns a string holding the text of the leftmost match in s of the regular +// expression. If there is no match, the return value is an empty string, +// but it will also be empty if the regular expression successfully matches +// an empty string. Use FindStringIndex or FindStringSubmatch if it is +// necessary to distinguish these cases. +func (re *Regexp) FindString(s string) string { + a := re.doExecute(newInputString(s), 0, 2) + if a == nil { + return "" + } + return s[a[0]:a[1]] +} + +// FindStringIndex returns a two-element slice of integers defining the +// location of the leftmost match in s of the regular expression. The match +// itself is at s[loc[0]:loc[1]]. +// A return value of nil indicates no match. +func (re *Regexp) FindStringIndex(s string) []int { + a := re.doExecute(newInputString(s), 0, 2) + if a == nil { + return nil + } + return a[0:2] +} + +// FindReaderIndex returns a two-element slice of integers defining the +// location of the leftmost match of the regular expression in text read from +// the RuneReader. The match itself is at s[loc[0]:loc[1]]. A return +// value of nil indicates no match. +func (re *Regexp) FindReaderIndex(r io.RuneReader) []int { + a := re.doExecute(newInputReader(r), 0, 2) + if a == nil { + return nil + } + return a[0:2] +} + +// FindSubmatch returns a slice of slices holding the text of the leftmost +// match of the regular expression in b and the matches, if any, of its +// subexpressions, as defined by the 'Submatch' descriptions in the package +// comment. +// A return value of nil indicates no match. +func (re *Regexp) FindSubmatch(b []byte) [][]byte { + a := re.doExecute(newInputBytes(b), 0, re.prog.NumCap) + if a == nil { + return nil + } + ret := make([][]byte, len(a)/2) + for i := range ret { + if a[2*i] >= 0 { + ret[i] = b[a[2*i]:a[2*i+1]] + } + } + return ret +} + +// FindSubmatchIndex returns a slice holding the index pairs identifying the +// leftmost match of the regular expression in b and the matches, if any, of +// its subexpressions, as defined by the 'Submatch' and 'Index' descriptions +// in the package comment. +// A return value of nil indicates no match. +func (re *Regexp) FindSubmatchIndex(b []byte) []int { + return re.doExecute(newInputBytes(b), 0, re.prog.NumCap) +} + +// FindStringSubmatch returns a slice of strings holding the text of the +// leftmost match of the regular expression in s and the matches, if any, of +// its subexpressions, as defined by the 'Submatch' description in the +// package comment. +// A return value of nil indicates no match. +func (re *Regexp) FindStringSubmatch(s string) []string { + a := re.doExecute(newInputString(s), 0, re.prog.NumCap) + if a == nil { + return nil + } + ret := make([]string, len(a)/2) + for i := range ret { + if a[2*i] >= 0 { + ret[i] = s[a[2*i]:a[2*i+1]] + } + } + return ret +} + +// FindStringSubmatchIndex returns a slice holding the index pairs +// identifying the leftmost match of the regular expression in s and the +// matches, if any, of its subexpressions, as defined by the 'Submatch' and +// 'Index' descriptions in the package comment. +// A return value of nil indicates no match. +func (re *Regexp) FindStringSubmatchIndex(s string) []int { + return re.doExecute(newInputString(s), 0, re.prog.NumCap) +} + +// FindReaderSubmatchIndex returns a slice holding the index pairs +// identifying the leftmost match of the regular expression of text read by +// the RuneReader, and the matches, if any, of its subexpressions, as defined +// by the 'Submatch' and 'Index' descriptions in the package comment. A +// return value of nil indicates no match. +func (re *Regexp) FindReaderSubmatchIndex(r io.RuneReader) []int { + return re.doExecute(newInputReader(r), 0, re.prog.NumCap) +} + +const startSize = 10 // The size at which to start a slice in the 'All' routines. + +// FindAll is the 'All' version of Find; it returns a slice of all successive +// matches of the expression, as defined by the 'All' description in the +// package comment. +// A return value of nil indicates no match. +func (re *Regexp) FindAll(b []byte, n int) [][]byte { + if n < 0 { + n = len(b) + 1 + } + result := make([][]byte, 0, startSize) + re.allMatches("", b, n, func(match []int) { + result = append(result, b[match[0]:match[1]]) + }) + if len(result) == 0 { + return nil + } + return result +} + +// FindAllIndex is the 'All' version of FindIndex; it returns a slice of all +// successive matches of the expression, as defined by the 'All' description +// in the package comment. +// A return value of nil indicates no match. +func (re *Regexp) FindAllIndex(b []byte, n int) [][]int { + if n < 0 { + n = len(b) + 1 + } + result := make([][]int, 0, startSize) + re.allMatches("", b, n, func(match []int) { + result = append(result, match[0:2]) + }) + if len(result) == 0 { + return nil + } + return result +} + +// FindAllString is the 'All' version of FindString; it returns a slice of all +// successive matches of the expression, as defined by the 'All' description +// in the package comment. +// A return value of nil indicates no match. +func (re *Regexp) FindAllString(s string, n int) []string { + if n < 0 { + n = len(s) + 1 + } + result := make([]string, 0, startSize) + re.allMatches(s, nil, n, func(match []int) { + result = append(result, s[match[0]:match[1]]) + }) + if len(result) == 0 { + return nil + } + return result +} + +// FindAllStringIndex is the 'All' version of FindStringIndex; it returns a +// slice of all successive matches of the expression, as defined by the 'All' +// description in the package comment. +// A return value of nil indicates no match. +func (re *Regexp) FindAllStringIndex(s string, n int) [][]int { + if n < 0 { + n = len(s) + 1 + } + result := make([][]int, 0, startSize) + re.allMatches(s, nil, n, func(match []int) { + result = append(result, match[0:2]) + }) + if len(result) == 0 { + return nil + } + return result +} + +// FindAllSubmatch is the 'All' version of FindSubmatch; it returns a slice +// of all successive matches of the expression, as defined by the 'All' +// description in the package comment. +// A return value of nil indicates no match. +func (re *Regexp) FindAllSubmatch(b []byte, n int) [][][]byte { + if n < 0 { + n = len(b) + 1 + } + result := make([][][]byte, 0, startSize) + re.allMatches("", b, n, func(match []int) { + slice := make([][]byte, len(match)/2) + for j := range slice { + if match[2*j] >= 0 { + slice[j] = b[match[2*j]:match[2*j+1]] + } + } + result = append(result, slice) + }) + if len(result) == 0 { + return nil + } + return result +} + +// FindAllSubmatchIndex is the 'All' version of FindSubmatchIndex; it returns +// a slice of all successive matches of the expression, as defined by the +// 'All' description in the package comment. +// A return value of nil indicates no match. +func (re *Regexp) FindAllSubmatchIndex(b []byte, n int) [][]int { + if n < 0 { + n = len(b) + 1 + } + result := make([][]int, 0, startSize) + re.allMatches("", b, n, func(match []int) { + result = append(result, match) + }) + if len(result) == 0 { + return nil + } + return result +} + +// FindAllStringSubmatch is the 'All' version of FindStringSubmatch; it +// returns a slice of all successive matches of the expression, as defined by +// the 'All' description in the package comment. +// A return value of nil indicates no match. +func (re *Regexp) FindAllStringSubmatch(s string, n int) [][]string { + if n < 0 { + n = len(s) + 1 + } + result := make([][]string, 0, startSize) + re.allMatches(s, nil, n, func(match []int) { + slice := make([]string, len(match)/2) + for j := range slice { + if match[2*j] >= 0 { + slice[j] = s[match[2*j]:match[2*j+1]] + } + } + result = append(result, slice) + }) + if len(result) == 0 { + return nil + } + return result +} + +// FindAllStringSubmatchIndex is the 'All' version of +// FindStringSubmatchIndex; it returns a slice of all successive matches of +// the expression, as defined by the 'All' description in the package +// comment. +// A return value of nil indicates no match. +func (re *Regexp) FindAllStringSubmatchIndex(s string, n int) [][]int { + if n < 0 { + n = len(s) + 1 + } + result := make([][]int, 0, startSize) + re.allMatches(s, nil, n, func(match []int) { + result = append(result, match) + }) + if len(result) == 0 { + return nil + } + return result +} diff --git a/src/pkg/exp/regexp/syntax/compile.go b/src/pkg/exp/regexp/syntax/compile.go index 97f94de338..5ea2425c3a 100644 --- a/src/pkg/exp/regexp/syntax/compile.go +++ b/src/pkg/exp/regexp/syntax/compile.go @@ -86,6 +86,7 @@ func Compile(re *Regexp) (*Prog, os.Error) { func (c *compiler) init() { c.p = new(Prog) + c.p.NumCap = 2 // implicit ( and ) for whole match $0 c.inst(InstFail) } diff --git a/src/pkg/exp/regexp/syntax/prog.go b/src/pkg/exp/regexp/syntax/prog.go index 9b0e3e7aae..bf85b720d0 100644 --- a/src/pkg/exp/regexp/syntax/prog.go +++ b/src/pkg/exp/regexp/syntax/prog.go @@ -55,6 +55,61 @@ func (p *Prog) String() string { return b.String() } +// skipNop follows any no-op or capturing instructions +// and returns the resulting pc. +func (p *Prog) skipNop(pc uint32) *Inst { + i := &p.Inst[pc] + for i.Op == InstNop || i.Op == InstCapture { + pc = i.Out + i = &p.Inst[pc] + } + return i +} + +// Prefix returns a literal string that all matches for the +// regexp must start with. Complete is true if the prefix +// is the entire match. +func (p *Prog) Prefix() (prefix string, complete bool) { + i := p.skipNop(uint32(p.Start)) + + // Avoid allocation of buffer if prefix is empty. + if i.Op != InstRune || len(i.Rune) != 1 { + return "", i.Op == InstMatch + } + + // Have prefix; gather characters. + var buf bytes.Buffer + for i.Op == InstRune && len(i.Rune) == 1 { + buf.WriteRune(i.Rune[0]) + i = p.skipNop(i.Out) + } + return buf.String(), i.Op == InstMatch +} + +// StartCond returns the leading empty-width conditions that must +// be true in any match. It returns ^EmptyOp(0) if no matches are possible. +func (p *Prog) StartCond() EmptyOp { + var flag EmptyOp + pc := uint32(p.Start) + i := &p.Inst[pc] +Loop: + for { + switch i.Op { + case InstEmptyWidth: + flag |= EmptyOp(i.Arg) + case InstFail: + return ^EmptyOp(0) + case InstCapture, InstNop: + // skip + default: + break Loop + } + pc = i.Out + i = &p.Inst[pc] + } + return flag +} + // MatchRune returns true if the instruction matches (and consumes) r. // It should only be called when i.Op == InstRune. func (i *Inst) MatchRune(r int) bool { -- 2.50.0