From: Russ Cox Date: Wed, 7 Sep 2011 19:48:06 +0000 (-0400) Subject: exp/regexp: bug fixes and RE2 tests X-Git-Tag: weekly.2011-09-07~6 X-Git-Url: http://www.git.cypherpunks.su/?a=commitdiff_plain;h=08ae1a5a23e68056dad09ea48e53c4fae36e37b1;p=gostls13.git exp/regexp: bug fixes and RE2 tests Also add exp/regexp to build (forgot before). At this point I am very confident in exp/regexp's behavior. It should be usable as a drop-in replacement for regexp now. Later CLs could introduce a CompilePOSIX to get at traditional POSIX ``extended regular expressions'' as in egrep and also an re.MatchLongest method to change the matching mode to leftmost longest instead of leftmost first. On the other hand, I expect very few people to use either. R=r, r, gustavo CC=golang-dev https://golang.org/cl/4990041 --- diff --git a/src/pkg/Makefile b/src/pkg/Makefile index 0b67bdacd4..c7e65c029e 100644 --- a/src/pkg/Makefile +++ b/src/pkg/Makefile @@ -81,6 +81,7 @@ DIRS=\ exp/gui\ exp/gui/x11\ exp/norm\ + exp/regexp\ exp/regexp/syntax\ exp/template/html\ expvar\ diff --git a/src/pkg/exp/regexp/exec.go b/src/pkg/exp/regexp/exec.go index 0670bb9b1b..88b16032ee 100644 --- a/src/pkg/exp/regexp/exec.go +++ b/src/pkg/exp/regexp/exec.go @@ -90,23 +90,12 @@ func (m *machine) match(i input, pos int) bool { if rune != endOfText { rune1, width1 = i.step(pos + width) } - // TODO: Let caller specify the initial flag setting. - // For now assume pos == 0 is beginning of text and - // pos != 0 is not even beginning of line. - // TODO: Word boundary. var flag syntax.EmptyOp if pos == 0 { - flag = syntax.EmptyBeginText | syntax.EmptyBeginLine - } - - // Update flag using lookahead rune. 
- if rune1 == '\n' { - flag |= syntax.EmptyEndLine - } - if rune1 == endOfText { - flag |= syntax.EmptyEndText + flag = syntax.EmptyOpContext(-1, rune) + } else { + flag = i.context(pos) } - for { if len(runq.dense) == 0 { if startCond&syntax.EmptyBeginText != 0 && pos != 0 { @@ -134,17 +123,7 @@ func (m *machine) match(i input, pos int) bool { } m.add(runq, uint32(m.p.Start), pos, m.matchcap, flag) } - // TODO: word boundary - flag = 0 - if rune == '\n' { - flag |= syntax.EmptyBeginLine - } - if rune1 == '\n' { - flag |= syntax.EmptyEndLine - } - if rune1 == endOfText { - flag |= syntax.EmptyEndText - } + flag = syntax.EmptyOpContext(rune, rune1) m.step(runq, nextq, pos, pos+width, rune, flag) if width == 0 { break diff --git a/src/pkg/exp/regexp/exec_test.go b/src/pkg/exp/regexp/exec_test.go new file mode 100644 index 0000000000..15c4c532a4 --- /dev/null +++ b/src/pkg/exp/regexp/exec_test.go @@ -0,0 +1,271 @@ +// Copyright 2010 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package regexp + +import ( + "bufio" + "compress/gzip" + "fmt" + "os" + "strconv" + "strings" + "testing" + "utf8" +) + +// TestRE2 tests this package's regexp API against test cases +// considered during RE2's exhaustive tests, which run all possible +// regexps over a given set of atoms and operators, up to a given +// complexity, over all possible strings over a given alphabet, +// up to a given size. Rather than try to link with RE2, we read a +// log file containing the test cases and the expected matches. +// The log file, re2.txt, is generated by running 'make exhaustive-log' +// in the open source RE2 distribution. 
http://code.google.com/p/re2/ +// +// The test file format is a sequence of stanzas like: +// +// strings +// "abc" +// "123x" +// regexps +// "[a-z]+" +// 0-3;0-3 +// -;- +// "([0-9])([0-9])([0-9])" +// -;- +// -;0-3 0-1 1-2 2-3 +// +// The stanza begins by defining a set of strings, quoted +// using Go double-quote syntax, one per line. Then the +// regexps section gives a sequence of regexps to run on +// the strings. In the block that follows a regexp, each line +// gives the semicolon-separated match results of running +// the regexp on the corresponding string. +// Each match result is either a single -, meaning no match, or a +// space-separated sequence of pairs giving the match and +// submatch indices. An unmatched subexpression formats +// its pair as a single - (not illustrated above). For now +// each regexp run produces two match results, one for a +// ``full match'' that restricts the regexp to matching the entire +// string or nothing, and one for a ``partial match'' that gives +// the leftmost first match found in the string. +// +// Lines beginning with # are comments. Lines beginning with +// a capital letter are test names printed during RE2's test suite +// and are echoed into t but otherwise ignored. +// +// At time of writing, re2.txt is 32 MB but compresses to 760 kB, +// so we store re2.txt.gz in the repository and decompress it on the fly. 
+// +func TestRE2(t *testing.T) { + if testing.Short() { + t.Log("skipping TestRE2 during short test") + return + } + + f, err := os.Open("re2.txt.gz") + if err != nil { + t.Fatal(err) + } + defer f.Close() + gz, err := gzip.NewReader(f) + if err != nil { + t.Fatalf("decompress re2.txt.gz: %v", err) + } + defer gz.Close() + lineno := 0 + r := bufio.NewReader(gz) + var ( + str []string + input []string + inStrings bool + re *Regexp + refull *Regexp + nfail int + ncase int + ) + for { + line, err := r.ReadString('\n') + if err != nil { + if err == os.EOF { + break + } + t.Fatalf("re2.txt:%d: %v", lineno, err) + } + line = line[:len(line)-1] // chop \n + lineno++ + switch { + case line == "": + t.Fatalf("re2.txt:%d: unexpected blank line", lineno) + case line[0] == '#': + continue + case 'A' <= line[0] && line[0] <= 'Z': + // Test name. + t.Logf("%s\n", line) + continue + case line == "strings": + str = str[:0] + inStrings = true + case line == "regexps": + inStrings = false + case line[0] == '"': + q, err := strconv.Unquote(line) + if err != nil { + // Fatal because we'll get out of sync. + t.Fatalf("re2.txt:%d: unquote %s: %v", lineno, line, err) + } + if inStrings { + str = append(str, q) + continue + } + // Is a regexp. + if len(input) != 0 { + t.Fatalf("re2.txt:%d: out of sync: have %d strings left before %#q", lineno, len(input), q) + } + re, err = tryCompile(q) + if err != nil { + if err.String() == "error parsing regexp: invalid escape sequence: `\\C`" { + // We don't and likely never will support \C; keep going. + continue + } + t.Errorf("re2.txt:%d: compile %#q: %v", lineno, q, err) + if nfail++; nfail >= 100 { + t.Fatalf("stopping after %d errors", nfail) + } + continue + } + full := `\A(?:` + q + `)\z` + refull, err = tryCompile(full) + if err != nil { + // Fatal because q worked, so this should always work. 
+ t.Fatalf("re2.txt:%d: compile full %#q: %v", lineno, full, err) + } + input = str + case line[0] == '-' || '0' <= line[0] && line[0] <= '9': + // A sequence of match results. + ncase++ + if re == nil { + // Failed to compile: skip results. + continue + } + if len(input) == 0 { + t.Fatalf("re2.txt:%d: out of sync: no input remaining", lineno) + } + var text string + text, input = input[0], input[1:] + if !isSingleBytes(text) && strings.Contains(re.String(), `\B`) { + // RE2's \B considers every byte position, + // so it sees 'not word boundary' in the + // middle of UTF-8 sequences. This package + // only considers the positions between runes, + // so it disagrees. Skip those cases. + continue + } + res := strings.Split(line, ";") + if len(res) != 2 { + t.Fatalf("re2.txt:%d: have %d test results, want 2", lineno, len(res)) + } + // res[0] is full match + // res[1] is partial match + // Run partial match first; don't bother with full if partial fails. + have := re.FindStringSubmatchIndex(text) + want := parseResult(t, lineno, res[1]) + if !same(have, want) { + t.Errorf("re2.txt:%d: %#q.FindSubmatchIndex(%#q) = %v, want %v", lineno, re, text, have, want) + if nfail++; nfail >= 100 { + t.Fatalf("stopping after %d errors", nfail) + } + continue + } + have = refull.FindStringSubmatchIndex(text) + want = parseResult(t, lineno, res[0]) + if !same(have, want) { + t.Errorf("re2.txt:%d: %#q.FindSubmatchIndex(%#q) = %v, want %v", lineno, refull, text, have, want) + if nfail++; nfail >= 100 { + t.Fatalf("stopping after %d errors", nfail) + } + } + default: + t.Fatalf("re2.txt:%d: out of sync: %s\n", lineno, line) + } + } + if len(input) != 0 { + t.Fatalf("re2.txt:%d: out of sync: have %d strings left at EOF", lineno, len(input)) + } + t.Logf("%d cases tested", ncase) +} + +func isSingleBytes(s string) bool { + for _, c := range s { + if c >= utf8.RuneSelf { + return false + } + } + return true +} + +func tryCompile(s string) (re *Regexp, err os.Error) { + // Protect against 
panic during Compile. + defer func() { + if r := recover(); r != nil { + err = fmt.Errorf("panic: %v", r) + } + }() + return Compile(s) +} + +func parseResult(t *testing.T, lineno int, res string) []int { + // A single - indicates no match. + if res == "-" { + return nil + } + // Otherwise, a space-separated list of pairs. + n := 1 + for j := 0; j < len(res); j++ { + if res[j] == ' ' { + n++ + } + } + out := make([]int, 2*n) + i := 0 + n = 0 + for j := 0; j <= len(res); j++ { + if j == len(res) || res[j] == ' ' { + // Process a single pair. - means no submatch. + pair := res[i:j] + if pair == "-" { + out[n] = -1 + out[n+1] = -1 + } else { + k := strings.Index(pair, "-") + if k < 0 { + t.Fatalf("re2.txt:%d: invalid pair %s", lineno, pair) + } + lo, err1 := strconv.Atoi(pair[:k]) + hi, err2 := strconv.Atoi(pair[k+1:]) + if err1 != nil || err2 != nil || lo > hi { + t.Fatalf("re2.txt:%d: invalid pair %s", lineno, pair) + } + out[n] = lo + out[n+1] = hi + } + n += 2 + i = j + 1 + } + } + return out +} + +func same(x, y []int) bool { + if len(x) != len(y) { + return false + } + for i, xi := range x { + if xi != y[i] { + return false + } + } + return true +} diff --git a/src/pkg/exp/regexp/find_test.go b/src/pkg/exp/regexp/find_test.go index dddc3484c9..6406bb6e65 100644 --- a/src/pkg/exp/regexp/find_test.go +++ b/src/pkg/exp/regexp/find_test.go @@ -80,6 +80,23 @@ var findTests = []FindTest{ {`data`, "daXY data", build(1, 5, 9)}, {`da(.)a$`, "daXY data", build(1, 5, 9, 7, 8)}, {`zx+`, "zzx", build(1, 1, 3)}, + {`ab$`, "abcab", build(1, 3, 5)}, + {`(aa)*$`, "a", build(1, 1, 1, -1, -1)}, + {`(?:.|(?:.a))`, "", nil}, + {`(?:A(?:A|a))`, "Aa", build(1, 0, 2)}, + {`(?:A|(?:A|a))`, "a", build(1, 0, 1)}, + {`(a){0}`, "", build(1, 0, 0, -1, -1)}, + {`(?-s)(?:(?:^).)`, "\n", nil}, + {`(?s)(?:(?:^).)`, "\n", build(1, 0, 1)}, + {`(?:(?:^).)`, "\n", nil}, + {`\b`, "x", build(2, 0, 0, 1, 1)}, + {`\b`, "xx", build(2, 0, 0, 2, 2)}, + {`\b`, "x y", build(4, 0, 0, 1, 1, 2, 2, 3, 3)}, + 
{`\b`, "xx yy", build(4, 0, 0, 2, 2, 3, 3, 5, 5)}, + {`\B`, "x", nil}, + {`\B`, "xx", build(1, 1, 1)}, + {`\B`, "x y", nil}, + {`\B`, "xx yy", build(2, 1, 1, 4, 4)}, // can backslash-escape any punctuation {`\!\"\#\$\%\&\'\(\)\*\+\,\-\.\/\:\;\<\=\>\?\@\[\\\]\^\_\{\|\}\~`, diff --git a/src/pkg/exp/regexp/re2.txt.gz b/src/pkg/exp/regexp/re2.txt.gz new file mode 100644 index 0000000000..2b8c832e52 Binary files /dev/null and b/src/pkg/exp/regexp/re2.txt.gz differ diff --git a/src/pkg/exp/regexp/regexp.go b/src/pkg/exp/regexp/regexp.go index 1b75900f81..11feecd55e 100644 --- a/src/pkg/exp/regexp/regexp.go +++ b/src/pkg/exp/regexp/regexp.go @@ -84,6 +84,7 @@ type Regexp struct { prefixComplete bool // prefix is the entire regexp prefixRune int // first rune in prefix cond syntax.EmptyOp // empty-width conditions required at start of match + numSubexp int // cache of machines for running regexp mu sync.Mutex @@ -102,13 +103,16 @@ func Compile(expr string) (*Regexp, os.Error) { if err != nil { return nil, err } + maxCap := re.MaxCap() + re = re.Simplify() prog, err := syntax.Compile(re) if err != nil { return nil, err } regexp := &Regexp{ - expr: expr, - prog: prog, + expr: expr, + prog: prog, + numSubexp: maxCap, } regexp.prefix, regexp.prefixComplete = prog.Prefix() if regexp.prefix != "" { @@ -161,9 +165,7 @@ func MustCompile(str string) *Regexp { // NumSubexp returns the number of parenthesized subexpressions in this Regexp. func (re *Regexp) NumSubexp() int { - // NumCap/2 because captures count ( and ) separately. - // -1 because NumCap counts $0 but NumSubexp does not. - return re.prog.NumCap/2 - 1 + return re.numSubexp } const endOfText = -1 @@ -175,6 +177,7 @@ type input interface { canCheckPrefix() bool // can we look ahead without losing info? hasPrefix(re *Regexp) bool index(re *Regexp, pos int) int + context(pos int) syntax.EmptyOp } // inputString scans a string. 
@@ -205,6 +208,17 @@ func (i *inputString) index(re *Regexp, pos int) int { return strings.Index(i.str[pos:], re.prefix) } +func (i *inputString) context(pos int) syntax.EmptyOp { + r1, r2 := -1, -1 + if pos > 0 && pos <= len(i.str) { + r1, _ = utf8.DecodeLastRuneInString(i.str[:pos]) + } + if pos < len(i.str) { + r2, _ = utf8.DecodeRuneInString(i.str[pos:]) + } + return syntax.EmptyOpContext(r1, r2) +} + // inputBytes scans a byte slice. type inputBytes struct { str []byte @@ -233,6 +247,17 @@ func (i *inputBytes) index(re *Regexp, pos int) int { return bytes.Index(i.str[pos:], re.prefixBytes) } +func (i *inputBytes) context(pos int) syntax.EmptyOp { + r1, r2 := -1, -1 + if pos > 0 && pos <= len(i.str) { + r1, _ = utf8.DecodeLastRune(i.str[:pos]) + } + if pos < len(i.str) { + r2, _ = utf8.DecodeRune(i.str[pos:]) + } + return syntax.EmptyOpContext(r1, r2) +} + // inputReader scans a RuneReader. type inputReader struct { r io.RuneReader @@ -270,6 +295,10 @@ func (i *inputReader) index(re *Regexp, pos int) int { return -1 } +func (i *inputReader) context(pos int) syntax.EmptyOp { + return 0 +} + // LiteralPrefix returns a literal string that must begin any match // of the regular expression re. It returns the boolean true if the // literal string comprises the entire regular expression. @@ -458,6 +487,23 @@ func QuoteMeta(s string) string { return string(b[0:j]) } +// The number of capture values in the program may correspond +// to fewer capturing expressions than are in the regexp. +// For example, "(a){0}" turns into an empty program, so the +// maximum capture in the program is 0 but we need to return +// an expression for \1. Pad appends -1s to the slice a as needed. +func (re *Regexp) pad(a []int) []int { + if a == nil { + // No match. + return nil + } + n := (1 + re.numSubexp) * 2 + for len(a) < n { + a = append(a, -1) + } + return a +} + // Find matches in slice b if b is non-nil, otherwise find matches in string s. 
func (re *Regexp) allMatches(s string, b []byte, n int, deliver func([]int)) { var end int @@ -505,7 +551,7 @@ func (re *Regexp) allMatches(s string, b []byte, n int, deliver func([]int)) { prevMatchEnd = matches[1] if accept { - deliver(matches) + deliver(re.pad(matches)) i++ } } @@ -580,9 +626,9 @@ func (re *Regexp) FindSubmatch(b []byte) [][]byte { if a == nil { return nil } - ret := make([][]byte, len(a)/2) + ret := make([][]byte, 1+re.numSubexp) for i := range ret { - if a[2*i] >= 0 { + if 2*i < len(a) && a[2*i] >= 0 { ret[i] = b[a[2*i]:a[2*i+1]] } } @@ -595,7 +641,7 @@ func (re *Regexp) FindSubmatch(b []byte) [][]byte { // in the package comment. // A return value of nil indicates no match. func (re *Regexp) FindSubmatchIndex(b []byte) []int { - return re.doExecute(newInputBytes(b), 0, re.prog.NumCap) + return re.pad(re.doExecute(newInputBytes(b), 0, re.prog.NumCap)) } // FindStringSubmatch returns a slice of strings holding the text of the @@ -608,9 +654,9 @@ func (re *Regexp) FindStringSubmatch(s string) []string { if a == nil { return nil } - ret := make([]string, len(a)/2) + ret := make([]string, 1+re.numSubexp) for i := range ret { - if a[2*i] >= 0 { + if 2*i < len(a) && a[2*i] >= 0 { ret[i] = s[a[2*i]:a[2*i+1]] } } @@ -623,7 +669,7 @@ func (re *Regexp) FindStringSubmatch(s string) []string { // 'Index' descriptions in the package comment. // A return value of nil indicates no match. func (re *Regexp) FindStringSubmatchIndex(s string) []int { - return re.doExecute(newInputString(s), 0, re.prog.NumCap) + return re.pad(re.doExecute(newInputString(s), 0, re.prog.NumCap)) } // FindReaderSubmatchIndex returns a slice holding the index pairs @@ -632,7 +678,7 @@ func (re *Regexp) FindStringSubmatchIndex(s string) []int { // by the 'Submatch' and 'Index' descriptions in the package comment. A // return value of nil indicates no match. 
func (re *Regexp) FindReaderSubmatchIndex(r io.RuneReader) []int { - return re.doExecute(newInputReader(r), 0, re.prog.NumCap) + return re.pad(re.doExecute(newInputReader(r), 0, re.prog.NumCap)) } const startSize = 10 // The size at which to start a slice in the 'All' routines. diff --git a/src/pkg/exp/regexp/syntax/compile.go b/src/pkg/exp/regexp/syntax/compile.go index 5ea2425c3a..6b6d062374 100644 --- a/src/pkg/exp/regexp/syntax/compile.go +++ b/src/pkg/exp/regexp/syntax/compile.go @@ -75,6 +75,7 @@ type compiler struct { } // Compile compiles the regexp into a program to be executed. +// The regexp should have been simplified already (returned from re.Simplify). func Compile(re *Regexp) (*Prog, os.Error) { var c compiler c.init() @@ -90,7 +91,7 @@ func (c *compiler) init() { c.inst(InstFail) } -var anyRuneNotNL = []int{0, '\n' - 1, '\n' - 1, unicode.MaxRune} +var anyRuneNotNL = []int{0, '\n' - 1, '\n' + 1, unicode.MaxRune} var anyRune = []int{0, unicode.MaxRune} func (c *compiler) compile(re *Regexp) frag { @@ -105,7 +106,7 @@ func (c *compiler) compile(re *Regexp) frag { } var f frag for j := range re.Rune { - f1 := c.rune(re.Rune[j : j+1]) + f1 := c.rune(re.Rune[j:j+1], re.Flags) if j == 0 { f = f1 } else { @@ -114,11 +115,11 @@ func (c *compiler) compile(re *Regexp) frag { } return f case OpCharClass: - return c.rune(re.Rune) + return c.rune(re.Rune, re.Flags) case OpAnyCharNotNL: - return c.rune(anyRuneNotNL) + return c.rune(anyRuneNotNL, 0) case OpAnyChar: - return c.rune(anyRune) + return c.rune(anyRune, 0) case OpBeginLine: return c.empty(EmptyBeginLine) case OpEndLine: @@ -261,9 +262,16 @@ func (c *compiler) empty(op EmptyOp) frag { return f } -func (c *compiler) rune(rune []int) frag { +func (c *compiler) rune(rune []int, flags Flags) frag { f := c.inst(InstRune) - c.p.Inst[f.i].Rune = rune + i := &c.p.Inst[f.i] + i.Rune = rune + flags &= FoldCase // only relevant flag is FoldCase + if len(rune) != 1 || unicode.SimpleFold(rune[0]) == rune[0] { + // and 
sometimes not even that + flags &^= FoldCase + } + i.Arg = uint32(flags) f.out = patchList(f.i << 1) return f } diff --git a/src/pkg/exp/regexp/syntax/parse.go b/src/pkg/exp/regexp/syntax/parse.go index 4eed182687..954a0ad8ae 100644 --- a/src/pkg/exp/regexp/syntax/parse.go +++ b/src/pkg/exp/regexp/syntax/parse.go @@ -419,8 +419,7 @@ func (p *parser) factor(sub []*Regexp, flags Flags) []*Regexp { // used or marked for reuse, and the slice space has been reused // for out (len(out) <= start). // - // Invariant: sub[start:i] consists of regexps that all begin - // with str as modified by strflags. + // Invariant: sub[start:i] consists of regexps that all begin with ifirst. var ifirst *Regexp if i < len(sub) { ifirst = p.leadingRegexp(sub[i]) @@ -441,7 +440,6 @@ func (p *parser) factor(sub []*Regexp, flags Flags) []*Regexp { } else { // Construct factored form: prefix(suffix1|suffix2|...) prefix := first - for j := start; j < i; j++ { reuse := j != start // prefix came from sub[start] sub[j] = p.removeLeadingRegexp(sub[j], reuse) @@ -605,8 +603,10 @@ func (p *parser) removeLeadingRegexp(re *Regexp, reuse bool) *Regexp { } return re } - re.Op = OpEmptyMatch - return re + if reuse { + p.reuse(re) + } + return p.newRegexp(OpEmptyMatch) } func literalRegexp(s string, flags Flags) *Regexp { @@ -1053,18 +1053,18 @@ func mergeCharClass(dst, src *Regexp) { case OpCharClass: // src is simpler, so either literal or char class if src.Op == OpLiteral { - dst.Rune = appendRange(dst.Rune, src.Rune[0], src.Rune[0]) + dst.Rune = appendLiteral(dst.Rune, src.Rune[0], src.Flags) } else { dst.Rune = appendClass(dst.Rune, src.Rune) } case OpLiteral: // both literal - if src.Rune[0] == dst.Rune[0] { + if src.Rune[0] == dst.Rune[0] && src.Flags == dst.Flags { break } dst.Op = OpCharClass - dst.Rune = append(dst.Rune, dst.Rune[0]) - dst.Rune = appendRange(dst.Rune, src.Rune[0], src.Rune[0]) + dst.Rune = appendLiteral(dst.Rune[:0], dst.Rune[0], dst.Flags) + dst.Rune = appendLiteral(dst.Rune, 
src.Rune[0], src.Flags) } } @@ -1544,6 +1544,14 @@ func cleanClass(rp *[]int) []int { return r[:w] } +// appendLiteral returns the result of appending the literal x to the class r. +func appendLiteral(r []int, x int, flags Flags) []int { + if flags&FoldCase != 0 { + return appendFoldedRange(r, x, x) + } + return appendRange(r, x, x) +} + // appendRange returns the result of appending the range lo-hi to the class r. func appendRange(r []int, lo, hi int) []int { // Expand last range or next to last range if it overlaps or abuts. diff --git a/src/pkg/exp/regexp/syntax/parse_test.go b/src/pkg/exp/regexp/syntax/parse_test.go index 779b9afdea..a146c89c3f 100644 --- a/src/pkg/exp/regexp/syntax/parse_test.go +++ b/src/pkg/exp/regexp/syntax/parse_test.go @@ -162,6 +162,18 @@ var parseTests = []struct { // Factoring. {`abc|abd|aef|bcx|bcy`, `alt{cat{lit{a}alt{cat{lit{b}cc{0x63-0x64}}str{ef}}}cat{str{bc}cc{0x78-0x79}}}`}, {`ax+y|ax+z|ay+w`, `cat{lit{a}alt{cat{plus{lit{x}}cc{0x79-0x7a}}cat{plus{lit{y}}lit{w}}}}`}, + + // Bug fixes. + {`(?:.)`, `dot{}`}, + {`(?:x|(?:xa))`, `cat{lit{x}alt{emp{}lit{a}}}`}, + {`(?:.|(?:.a))`, `cat{dot{}alt{emp{}lit{a}}}`}, + {`(?:A(?:A|a))`, `cat{lit{A}litfold{A}}`}, + {`(?:A|a)`, `litfold{A}`}, + {`A|(?:A|a)`, `litfold{A}`}, + {`(?s).`, `dot{}`}, + {`(?-s).`, `dnl{}`}, + {`(?:(?:^).)`, `cat{bol{}dot{}}`}, + {`(?-s)(?:(?:^).)`, `cat{bol{}dnl{}}`}, } const testFlags = MatchNL | PerlX | UnicodeGroups diff --git a/src/pkg/exp/regexp/syntax/prog.go b/src/pkg/exp/regexp/syntax/prog.go index bf85b720d0..d214d70b52 100644 --- a/src/pkg/exp/regexp/syntax/prog.go +++ b/src/pkg/exp/regexp/syntax/prog.go @@ -3,6 +3,7 @@ package syntax import ( "bytes" "strconv" + "unicode" ) // Compiled program. @@ -41,6 +42,41 @@ const ( EmptyNoWordBoundary ) +// EmptyOpContext returns the zero-width assertions +// satisfied at the position between the runes r1 and r2. +// Passing r1 == -1 indicates that the position is +// at the beginning of the text. 
+// Passing r2 == -1 indicates that the position is +// at the end of the text. +func EmptyOpContext(r1, r2 int) EmptyOp { + var op EmptyOp + if r1 < 0 { + op |= EmptyBeginText | EmptyBeginLine + } + if r1 == '\n' { + op |= EmptyBeginLine + } + if r2 < 0 { + op |= EmptyEndText + } + if r2 == '\n' { + op |= EmptyEndLine + } + if IsWordChar(r1) != IsWordChar(r2) { + op |= EmptyWordBoundary + } else { + op |= EmptyNoWordBoundary + } + return op +} + +// IsWordChar reports whether r is considered a ``word character'' +// during the evaluation of the \b and \B zero-width assertions. +// These assertions are ASCII-only: the word characters are [A-Za-z0-9_]. +func IsWordChar(r int) bool { + return 'A' <= r && r <= 'Z' || 'a' <= r && r <= 'z' || '0' <= r && r <= '9' || r == '_' +} + // An Inst is a single instruction in a regular expression program. type Inst struct { Op InstOp @@ -79,7 +115,7 @@ func (p *Prog) Prefix() (prefix string, complete bool) { // Have prefix; gather characters. var buf bytes.Buffer - for i.Op == InstRune && len(i.Rune) == 1 { + for i.Op == InstRune && len(i.Rune) == 1 && Flags(i.Arg)&FoldCase == 0 { buf.WriteRune(i.Rune[0]) i = p.skipNop(i.Out) } @@ -116,9 +152,19 @@ func (i *Inst) MatchRune(r int) bool { rune := i.Rune // Special case: single-rune slice is from literal string, not char class. - // TODO: Case folding. if len(rune) == 1 { - return r == rune[0] + r0 := rune[0] + if r == r0 { + return true + } + if Flags(i.Arg)&FoldCase != 0 { + for r1 := unicode.SimpleFold(r0); r1 != r0; r1 = unicode.SimpleFold(r1) { + if r == r1 { + return true + } + } + } + return false } // Peek at the first few pairs. 
@@ -232,6 +278,10 @@ func dumpInst(b *bytes.Buffer, i *Inst) { // shouldn't happen bw(b, "rune ") } - bw(b, "rune ", strconv.QuoteToASCII(string(i.Rune)), " -> ", u32(i.Out)) + bw(b, "rune ", strconv.QuoteToASCII(string(i.Rune))) + if Flags(i.Arg)&FoldCase != 0 { + bw(b, "/i") + } + bw(b, " -> ", u32(i.Out)) } } diff --git a/src/pkg/exp/regexp/syntax/prog_test.go b/src/pkg/exp/regexp/syntax/prog_test.go index 7be4281c27..3fe0c5870a 100644 --- a/src/pkg/exp/regexp/syntax/prog_test.go +++ b/src/pkg/exp/regexp/syntax/prog_test.go @@ -76,6 +76,16 @@ var compileTests = []struct { 4 alt -> 3, 6 5* alt -> 1, 3 6 match +`}, + {"A[Aa]", ` 0 fail + 1* rune "A" -> 2 + 2 rune "A"/i -> 3 + 3 match +`}, + {"(?:(?:^).)", ` 0 fail + 1* empty 4 -> 2 + 2 rune "\x00\t\v\U0010ffff" -> 3 + 3 match `}, } diff --git a/src/pkg/exp/regexp/syntax/regexp.go b/src/pkg/exp/regexp/syntax/regexp.go index 00a4addefc..d8f51b903b 100644 --- a/src/pkg/exp/regexp/syntax/regexp.go +++ b/src/pkg/exp/regexp/syntax/regexp.go @@ -282,3 +282,17 @@ func escape(b *bytes.Buffer, r int, force bool) { b.WriteString(`}`) } } + +// MaxCap walks the regexp to find the maximum capture index. +func (re *Regexp) MaxCap() int { + m := 0 + if re.Op == OpCapture { + m = re.Cap + } + for _, sub := range re.Sub { + if n := sub.MaxCap(); m < n { + m = n + } + } + return m +}