Cypherpunks repositories - gostls13.git/commitdiff
regexp/syntax: fix factoring of common prefixes in alternations
author Paul Wankadia <junyer@google.com>
Thu, 7 Jan 2016 07:55:18 +0000 (18:55 +1100)
committer Russ Cox <rsc@golang.org>
Fri, 8 Jan 2016 16:41:46 +0000 (16:41 +0000)
In the past, `a.*?c|a.*?b` was factored to `a.*?[bc]`. Thus, given
"abc" as its input string, the automaton would consume "ab" and
then stop (when unanchored), whereas it should consume all of "abc"
as per leftmost-first semantics: the first alternative, `a.*?c`,
takes priority and matches the whole string.

Fixes #13812.

Change-Id: I67ac0a353d7793b3d0c9c4aaf22d157621dfe784
Reviewed-on: https://go-review.googlesource.com/18357
Reviewed-by: Russ Cox <rsc@golang.org>
src/regexp/syntax/parse.go
src/regexp/syntax/parse_test.go
src/regexp/testdata/re2-search.txt

index c2b92c1d44ab43dccb9cd4091a002216e028515e..f38bbf66e3cf47c8420151413f00b31c720a1b15 100644 (file)
@@ -470,9 +470,14 @@ func (p *parser) factor(sub []*Regexp, flags Flags) []*Regexp {
        }
        sub = out
 
-       // Round 2: Factor out common complex prefixes,
-       // just the first piece of each concatenation,
-       // whatever it is.  This is good enough a lot of the time.
+       // Round 2: Factor out common simple prefixes,
+       // just the first piece of each concatenation.
+       // This will be good enough a lot of the time.
+       //
+       // Complex subexpressions (e.g. involving quantifiers)
+       // are not safe to factor because that collapses their
+       // distinct paths through the automaton, which affects
+       // correctness in some cases.
        start = 0
        out = sub[:0]
        var first *Regexp
@@ -485,7 +490,9 @@ func (p *parser) factor(sub []*Regexp, flags Flags) []*Regexp {
                var ifirst *Regexp
                if i < len(sub) {
                        ifirst = p.leadingRegexp(sub[i])
-                       if first != nil && first.Equal(ifirst) {
+                       if first != nil && first.Equal(ifirst) &&
+                               // first must be a character class OR a fixed repeat of a character class.
+                               (isCharClass(first) || (first.Op == OpRepeat && first.Min == first.Max && isCharClass(first.Sub[0]))) {
                                continue
                        }
                }
index 626ceeadf6d72b68c03c89f2df9fe7cf16328e0a..5ca54bbe1ebdf5467909fea225d4878d819a8090 100644 (file)
@@ -172,7 +172,7 @@ var parseTests = []parseTest{
 
        // Factoring.
        {`abc|abd|aef|bcx|bcy`, `alt{cat{lit{a}alt{cat{lit{b}cc{0x63-0x64}}str{ef}}}cat{str{bc}cc{0x78-0x79}}}`},
-       {`ax+y|ax+z|ay+w`, `cat{lit{a}alt{cat{plus{lit{x}}cc{0x79-0x7a}}cat{plus{lit{y}}lit{w}}}}`},
+       {`ax+y|ax+z|ay+w`, `cat{lit{a}alt{cat{plus{lit{x}}lit{y}}cat{plus{lit{x}}lit{z}}cat{plus{lit{y}}lit{w}}}}`},
 
        // Bug fixes.
        {`(?:.)`, `dot{}`},
@@ -195,12 +195,13 @@ var parseTests = []parseTest{
        {`abc|x|abd`, `alt{str{abc}lit{x}str{abd}}`},
        {`(?i)abc|ABD`, `cat{strfold{AB}cc{0x43-0x44 0x63-0x64}}`},
        {`[ab]c|[ab]d`, `cat{cc{0x61-0x62}cc{0x63-0x64}}`},
-       {`(?:xx|yy)c|(?:xx|yy)d`,
-               `cat{alt{str{xx}str{yy}}cc{0x63-0x64}}`},
+       {`.c|.d`, `cat{dot{}cc{0x63-0x64}}`},
        {`x{2}|x{2}[0-9]`,
                `cat{rep{2,2 lit{x}}alt{emp{}cc{0x30-0x39}}}`},
        {`x{2}y|x{2}[0-9]y`,
                `cat{rep{2,2 lit{x}}alt{lit{y}cat{cc{0x30-0x39}lit{y}}}}`},
+       {`a.*?c|a.*?b`,
+               `cat{lit{a}alt{cat{nstar{dot{}}lit{c}}cat{nstar{dot{}}lit{b}}}}`},
 
        // Valid repetitions.
        {`((((((((((x{2}){2}){2}){2}){2}){2}){2}){2}){2}))`, ``},
index f648e5527f7633d0074159aa1866899ead11bacb..4d02e9cebd687a01767295533905bd5c8ae56b8f 100644 (file)
@@ -3665,3 +3665,8 @@ regexps
 "(?:a\\C*|ba\\C)$"
 -;-;-;-
 -;1-4;-;1-4
+strings
+"abc"
+regexps
+"a.*?c|a.*?b"
+0-3;0-3;0-3;0-3