From 5ccaf0255b75063a9c685009e77cee24e26a509e Mon Sep 17 00:00:00 2001 From: Paul Wankadia Date: Thu, 7 Jan 2016 18:55:18 +1100 Subject: [PATCH] regexp/syntax: fix factoring of common prefixes in alternations In the past, `a.*?c|a.*?b` was factored to `a.*?[bc]`. Thus, given "abc" as its input string, the automaton would consume "ab" and then stop (when unanchored) whereas it should consume all of "abc" as per leftmost semantics. Fixes #13812. Change-Id: I67ac0a353d7793b3d0c9c4aaf22d157621dfe784 Reviewed-on: https://go-review.googlesource.com/18357 Reviewed-by: Russ Cox --- src/regexp/syntax/parse.go | 15 +++++++++++---- src/regexp/syntax/parse_test.go | 7 ++++--- src/regexp/testdata/re2-search.txt | 5 +++++ 3 files changed, 20 insertions(+), 7 deletions(-) diff --git a/src/regexp/syntax/parse.go b/src/regexp/syntax/parse.go index c2b92c1d44..f38bbf66e3 100644 --- a/src/regexp/syntax/parse.go +++ b/src/regexp/syntax/parse.go @@ -470,9 +470,14 @@ func (p *parser) factor(sub []*Regexp, flags Flags) []*Regexp { } sub = out - // Round 2: Factor out common complex prefixes, - // just the first piece of each concatenation, - // whatever it is. This is good enough a lot of the time. + // Round 2: Factor out common simple prefixes, + // just the first piece of each concatenation. + // This will be good enough a lot of the time. + // + // Complex subexpressions (e.g. involving quantifiers) + // are not safe to factor because that collapses their + // distinct paths through the automaton, which affects + // correctness in some cases. start = 0 out = sub[:0] var first *Regexp @@ -485,7 +490,9 @@ func (p *parser) factor(sub []*Regexp, flags Flags) []*Regexp { var ifirst *Regexp if i < len(sub) { ifirst = p.leadingRegexp(sub[i]) - if first != nil && first.Equal(ifirst) { + if first != nil && first.Equal(ifirst) && + // first must be a character class OR a fixed repeat of a character class. + (isCharClass(first) || (first.Op == OpRepeat && first.Min == first.Max && isCharClass(first.Sub[0]))) { continue } } diff --git a/src/regexp/syntax/parse_test.go b/src/regexp/syntax/parse_test.go index 626ceeadf6..5ca54bbe1e 100644 --- a/src/regexp/syntax/parse_test.go +++ b/src/regexp/syntax/parse_test.go @@ -172,7 +172,7 @@ var parseTests = []parseTest{ // Factoring. {`abc|abd|aef|bcx|bcy`, `alt{cat{lit{a}alt{cat{lit{b}cc{0x63-0x64}}str{ef}}}cat{str{bc}cc{0x78-0x79}}}`}, - {`ax+y|ax+z|ay+w`, `cat{lit{a}alt{cat{plus{lit{x}}cc{0x79-0x7a}}cat{plus{lit{y}}lit{w}}}}`}, + {`ax+y|ax+z|ay+w`, `cat{lit{a}alt{cat{plus{lit{x}}lit{y}}cat{plus{lit{x}}lit{z}}cat{plus{lit{y}}lit{w}}}}`}, // Bug fixes. {`(?:.)`, `dot{}`}, @@ -195,12 +195,13 @@ var parseTests = []parseTest{ {`abc|x|abd`, `alt{str{abc}lit{x}str{abd}}`}, {`(?i)abc|ABD`, `cat{strfold{AB}cc{0x43-0x44 0x63-0x64}}`}, {`[ab]c|[ab]d`, `cat{cc{0x61-0x62}cc{0x63-0x64}}`}, - {`(?:xx|yy)c|(?:xx|yy)d`, - `cat{alt{str{xx}str{yy}}cc{0x63-0x64}}`}, + {`.c|.d`, `cat{dot{}cc{0x63-0x64}}`}, {`x{2}|x{2}[0-9]`, `cat{rep{2,2 lit{x}}alt{emp{}cc{0x30-0x39}}}`}, {`x{2}y|x{2}[0-9]y`, `cat{rep{2,2 lit{x}}alt{lit{y}cat{cc{0x30-0x39}lit{y}}}}`}, + {`a.*?c|a.*?b`, + `cat{lit{a}alt{cat{nstar{dot{}}lit{c}}cat{nstar{dot{}}lit{b}}}}`}, // Valid repetitions. {`((((((((((x{2}){2}){2}){2}){2}){2}){2}){2}){2}))`, ``}, diff --git a/src/regexp/testdata/re2-search.txt b/src/regexp/testdata/re2-search.txt index f648e5527f..4d02e9cebd 100644 --- a/src/regexp/testdata/re2-search.txt +++ b/src/regexp/testdata/re2-search.txt @@ -3665,3 +3665,8 @@ regexps "(?:a\\C*|ba\\C)$" -;-;-;- -;1-4;-;1-4 +strings +"abc" +regexps +"a.*?c|a.*?b" +0-3;0-3;0-3;0-3 -- 2.50.0