// newLiteral returns a new OpLiteral regexp for the single rune r,
// tagged with flags. When flags includes FoldCase, r is first replaced
// by minFoldRune(r).
func (p *parser) newLiteral(r int, flags Flags) *Regexp {
re := p.newRegexp(OpLiteral)
re.Flags = flags
+ if flags&FoldCase != 0 {
+ r = minFoldRune(r)
+ }
// Store the rune in Rune0 and expose it through Rune as a one-element slice.
re.Rune0[0] = r
re.Rune = re.Rune0[:1]
return re
}
+// minFoldRune returns the smallest rune that is fold-equivalent to r,
+// i.e. the minimum of r and every rune reached from it by repeatedly
+// applying unicode.SimpleFold. Runes outside [minFold, maxFold] are
+// returned unchanged.
+func minFoldRune(r int) int {
+	if r < minFold || r > maxFold {
+		return r
+	}
+	// Walk the fold orbit exactly once: stop when it comes back to r.
+	lowest := r
+	for c := unicode.SimpleFold(r); c != r; c = unicode.SimpleFold(c) {
+		if c < lowest {
+			lowest = c
+		}
+	}
+	return lowest
+}
+
// literal pushes a literal regexp for the rune r on the stack
// and returns that regexp.
func (p *parser) literal(r int) {
// repeat replaces the top stack element with itself repeated
// according to op.
//
// At the visible call sites, whole is the input starting at the repetition
// operator, opstr is the operator text itself and t is the input that
// follows it; lastRepeat is non-empty when the previous element was
// itself a repetition. min and max are stored on the new node.
//
// It returns the remaining input, the operator text (extended by one
// byte when a non-greedy '?' suffix was consumed under PerlX) and an error:
// ErrInvalidRepeatOp for stacked repetition operators under PerlX, or
// ErrMissingRepeatArgument when the stack is empty or its top element is
// a pseudo-op (Op >= opPseudo).
-func (p *parser) repeat(op Op, min, max int, opstr, t, lastRepeat string) (string, os.Error) {
+func (p *parser) repeat(op Op, min, max int, whole, opstr, t, lastRepeat string) (string, string, os.Error) {
flags := p.flags
if p.flags&PerlX != 0 {
if len(t) > 0 && t[0] == '?' {
t = t[1:]
// opstr grows by one byte of whole; at the visible call sites that
// byte is the '?' just consumed.
+ opstr = whole[:len(opstr)+1]
// NOTE(review): flags is toggled here but no line visible in this
// hunk stores it on re — confirm it is applied elsewhere.
flags ^= NonGreedy
}
if lastRepeat != "" {
// In Perl it is not allowed to stack repetition operators:
// a** is a syntax error, not a doubled star, and a++ means
// something else entirely, which we don't support!
- return "", &Error{ErrInvalidRepeatOp, lastRepeat[:len(lastRepeat)-len(t)]}
+ return "", "", &Error{ErrInvalidRepeatOp, lastRepeat[:len(lastRepeat)-len(t)]}
}
}
n := len(p.stack)
if n == 0 {
- return "", &Error{ErrMissingRepeatArgument, opstr}
+ return "", "", &Error{ErrMissingRepeatArgument, opstr}
}
sub := p.stack[n-1]
// A pseudo-op on top of the stack is not a real operand to repeat.
+ if sub.Op >= opPseudo {
+ return "", "", &Error{ErrMissingRepeatArgument, opstr}
+ }
// Wrap sub in the new repetition node and put that node in sub's stack slot.
re := p.newRegexp(op)
re.Min = min
re.Max = max
re.Sub = re.Sub0[:1]
re.Sub[0] = sub
p.stack[n-1] = re
- return t, nil
+ return t, opstr, nil
}
// concat replaces the top of the stack (above the topmost '|' or '(') with its concatenation.
case '?':
op = OpQuest
}
- if t, err = p.repeat(op, min, max, t[:1], t[1:], lastRepeat); err != nil {
+ if t, repeat, err = p.repeat(op, min, max, t, t[:1], t[1:], lastRepeat); err != nil {
return nil, err
}
case '{':
t = t[1:]
break
}
- if t, err = p.repeat(op, min, max, t[:len(t)-len(tt)], tt, lastRepeat); err != nil {
+ opstr := t[:len(t)-len(tt)]
+ if min < 0 || min > 1000 || max > 1000 || max >= 0 && min > max {
+ // Numbers were too big, or max is present and min > max.
+ return nil, &Error{ErrInvalidRepeatSize, opstr}
+ }
+ if t, repeat, err = p.repeat(op, min, max, t, opstr, tt, lastRepeat); err != nil {
return nil, err
}
case '\\':
// parseRepeat parses {min} (max=min) or {min,} (max=-1) or {min,max}.
// If s is not of that form, it returns ok == false.
+// If s has the right form but the values are too big, it returns min == -1, ok == true.
func (p *parser) parseRepeat(s string) (min, max int, rest string, ok bool) {
if s == "" || s[0] != '{' {
return
}
s = s[1:]
- if min, s, ok = p.parseInt(s); !ok {
+ var ok1 bool
+ if min, s, ok1 = p.parseInt(s); !ok1 {
return
}
if s == "" {
}
if s[0] == '}' {
max = -1
- } else if max, s, ok = p.parseInt(s); !ok {
+ } else if max, s, ok1 = p.parseInt(s); !ok1 {
return
+ } else if max < 0 {
+ // parseInt found too big a number
+ min = -1
}
}
if s == "" || s[0] != '}' {
if len(s) >= 2 && s[0] == '0' && '0' <= s[1] && s[1] <= '9' {
return
}
+ t := s
for s != "" && '0' <= s[0] && s[0] <= '9' {
- // Avoid overflow.
- if n >= 1e8 {
- return
- }
- n = n*10 + int(s[0]) - '0'
s = s[1:]
}
rest = s
ok = true
+ // Have digits, compute value.
+ t = t[:len(t)-len(s)]
+ for i := 0; i < len(t); i++ {
+ // Avoid overflow.
+ if n >= 1e8 {
+ n = -1
+ break
+ }
+ n = n*10 + int(t[i]) - '0'
+ }
return
}
if re2.Op != opLeftParen {
return &Error{ErrMissingParen, p.wholeRegexp}
}
+ // Restore flags at time of paren.
+ p.flags = re2.Flags
if re2.Cap == 0 {
// Just for grouping.
p.push(re1)
return r
}
+// anyTable is the range table covering every code point from 0 through
+// unicode.MaxRune. unicodeTable returns it for the class name "Any".
+//
+// The literals use keyed fields so they keep compiling if the unicode
+// package adds fields to RangeTable, Range16 or Range32.
+var anyTable = &unicode.RangeTable{
+	R16: []unicode.Range16{{Lo: 0, Hi: 1<<16 - 1, Stride: 1}},
+	R32: []unicode.Range32{{Lo: 1 << 16, Hi: unicode.MaxRune, Stride: 1}},
+}
+
// unicodeTable returns the unicode.RangeTable identified by name
// and the table of additional fold-equivalent code points.
func unicodeTable(name string) (*unicode.RangeTable, *unicode.RangeTable) {
+ // Special case: "Any" means any.
+ if name == "Any" {
+ return anyTable, anyTable
+ }
if t := unicode.Categories[name]; t != nil {
return t, unicode.FoldCategory[name]
}
"unicode"
)
// parseTest is a single parser test case: Regexp is the pattern handed to
// Parse and Dump is the expected dump of the resulting parse tree.
-var parseTests = []struct {
+type parseTest struct {
Regexp string
Dump string
-}{
+}
+
// parseTests are the cases parsed with testFlags (see TestParseSimple and
// TestToStringEquivalentParse).
+var parseTests = []parseTest{
// Base cases
{`a`, `lit{a}`},
{`a.`, `cat{lit{a}dot{}}`},
{`a{2}?`, `nrep{2,2 lit{a}}`},
{`a{2,3}?`, `nrep{2,3 lit{a}}`},
{`a{2,}?`, `nrep{2,-1 lit{a}}`},
+ // Malformed {m,n} repetitions (here: missing the closing brace) are treated as literals.
+ {`x{1001`, `str{x{1001}`},
+ {`x{9876543210`, `str{x{9876543210}`},
+ {`x{9876543210,`, `str{x{9876543210,}`},
+ {`x{2,1`, `str{x{2,1}`},
+ {`x{1,9876543210`, `str{x{1,9876543210}`},
{``, `emp{}`},
{`|`, `emp{}`}, // alt{emp{}emp{}} but got factored
{`|x|`, `alt{emp{}lit{x}emp{}}`},
{`\p{Lu}`, mkCharClass(unicode.IsUpper)},
{`[\p{Lu}]`, mkCharClass(unicode.IsUpper)},
{`(?i)[\p{Lu}]`, mkCharClass(isUpperFold)},
+ {`\p{Any}`, `dot{}`},
+ {`\p{^Any}`, `cc{}`},
// Hex, octal.
{`[\012-\234]\141`, `cat{cc{0xa-0x9c}lit{a}}`},
{`(?-s).`, `dnl{}`},
{`(?:(?:^).)`, `cat{bol{}dot{}}`},
{`(?-s)(?:(?:^).)`, `cat{bol{}dnl{}}`},
+
+ // RE2 prefix_tests
+ {`abc|abd`, `cat{str{ab}cc{0x63-0x64}}`},
+ {`a(?:b)c|abd`, `cat{str{ab}cc{0x63-0x64}}`},
+ {`abc|abd|aef|bcx|bcy`,
+ `alt{cat{lit{a}alt{cat{lit{b}cc{0x63-0x64}}str{ef}}}` +
+ `cat{str{bc}cc{0x78-0x79}}}`},
+ {`abc|x|abd`, `alt{str{abc}lit{x}str{abd}}`},
+ {`(?i)abc|ABD`, `cat{strfold{AB}cc{0x43-0x44 0x63-0x64}}`},
+ {`[ab]c|[ab]d`, `cat{cc{0x61-0x62}cc{0x63-0x64}}`},
+ {`(?:xx|yy)c|(?:xx|yy)d`,
+ `cat{alt{str{xx}str{yy}}cc{0x63-0x64}}`},
+ {`x{2}|x{2}[0-9]`,
+ `cat{rep{2,2 lit{x}}alt{emp{}cc{0x30-0x39}}}`},
+ {`x{2}y|x{2}[0-9]y`,
+ `cat{rep{2,2 lit{x}}alt{lit{y}cat{cc{0x30-0x39}lit{y}}}}`},
}
// testFlags is the flag set used when parsing the parseTests cases.
const testFlags = MatchNL | PerlX | UnicodeGroups
// TestParseSimple runs the parseTests table through testParseDump using testFlags.
+func TestParseSimple(t *testing.T) {
+ testParseDump(t, parseTests, testFlags)
+}
+
// foldcaseTests are parse cases run with only the FoldCase flag set
// (see TestParseFoldCase).
+var foldcaseTests = []parseTest{
+ {`AbCdE`, `strfold{ABCDE}`},
+ {`[Aa]`, `litfold{A}`},
+ {`a`, `litfold{A}`},
+
+ // 0x17F is an old English long s (looks like an f) and folds to s.
+ // 0x212A is the Kelvin symbol and folds to k.
+ {`A[F-g]`, `cat{litfold{A}cc{0x41-0x7a 0x17f 0x212a}}`}, // [Aa][A-z...]
+ {`[[:upper:]]`, `cc{0x41-0x5a 0x61-0x7a 0x17f 0x212a}`},
+ {`[[:lower:]]`, `cc{0x41-0x5a 0x61-0x7a 0x17f 0x212a}`},
+}
+
// TestParseFoldCase runs foldcaseTests through testParseDump with the FoldCase flag.
+func TestParseFoldCase(t *testing.T) {
+ testParseDump(t, foldcaseTests, FoldCase)
+}
+
// literalTests are parse cases run with only the Literal flag set
// (see TestParseLiteral); the expected dump is a single str node holding
// the pattern text unchanged, metacharacters included.
+var literalTests = []parseTest{
+ {"(|)^$.[*+?]{5,10},\\", "str{(|)^$.[*+?]{5,10},\\}"},
+}
+
// TestParseLiteral runs literalTests through testParseDump with the Literal flag.
+func TestParseLiteral(t *testing.T) {
+ testParseDump(t, literalTests, Literal)
+}
+
// matchnlTests are parse cases run with only the MatchNL flag set
// (see TestParseMatchNL): `.` is expected to dump as dot{} and the negated
// class [^a] is expected to include 0xa (newline).
+var matchnlTests = []parseTest{
+ {`.`, `dot{}`},
+ {"\n", "lit{\n}"},
+ {`[^a]`, `cc{0x0-0x60 0x62-0x10ffff}`},
+ {`[a\n]`, `cc{0xa 0x61}`},
+}
+
// TestParseMatchNL runs matchnlTests through testParseDump with the MatchNL flag.
+func TestParseMatchNL(t *testing.T) {
+ testParseDump(t, matchnlTests, MatchNL)
+}
+
// nomatchnlTests are parse cases run with no flags set
// (see TestParseNoMatchNL): `.` is expected to dump as dnl{} and the negated
// class [^a] is expected to exclude 0xa (newline), while an explicit \n in
// a literal or a class is still kept.
+var nomatchnlTests = []parseTest{
+ {`.`, `dnl{}`},
+ {"\n", "lit{\n}"},
+ {`[^a]`, `cc{0x0-0x9 0xb-0x60 0x62-0x10ffff}`},
+ {`[a\n]`, `cc{0xa 0x61}`},
+}
+
// TestParseNoMatchNL runs nomatchnlTests through testParseDump with no flags set.
+func TestParseNoMatchNL(t *testing.T) {
+ testParseDump(t, nomatchnlTests, 0)
+}
+
// Test Parse -> Dump.
-func TestParseDump(t *testing.T) {
- for _, tt := range parseTests {
- re, err := Parse(tt.Regexp, testFlags)
+func testParseDump(t *testing.T, tests []parseTest, flags Flags) {
+ for _, tt := range tests {
+ re, err := Parse(tt.Regexp, flags)
if err != nil {
t.Errorf("Parse(%#q): %v", tt.Regexp, err)
continue
t.Errorf("appendRange interlaced A-Z a-z = %s, want AZaz", string(r))
}
}
+
// invalidRegexps lists patterns that TestParseInvalidRegexps expects Parse
// to reject under both the Perl and the POSIX flag sets.
+var invalidRegexps = []string{
+ `(`,
+ `)`,
+ `(a`,
+ `(a|b|`,
+ `(a|b`,
+ `[a-z`,
+ `([a-z)`,
+ `x{1001}`,
+ `x{9876543210}`,
+ `x{2,1}`,
+ `x{1,9876543210}`,
+ "\xff", // Invalid UTF-8
+ "[\xff]",
+ "[\\\xff]",
+ "\\\xff",
+ `(?P<name>a`,
+ `(?P<name>`,
+ `(?P<name`,
+ `(?P<x y>a)`,
+ `(?P<>a)`,
+ `[a-Z]`,
+ `(?i)[a-Z]`,
+ `a{100000}`,
+ `a{100000,}`,
+}
+
// onlyPerl lists patterns that TestParseInvalidRegexps expects Parse to
// accept with the Perl flags and reject with the POSIX flags.
+var onlyPerl = []string{
+ `[a-b-c]`,
+ `\Qabc\E`,
+ `\Q*+?{[\E`,
+ `\Q\\E`,
+ `\Q\\\E`,
+ `\Q\\\\E`,
+ `\Q\\\\\E`,
+ `(?:a)`,
+ `(?P<name>a)`,
+}
+
// onlyPOSIX lists patterns (stacked repetition operators) that
// TestParseInvalidRegexps expects Parse to accept with the POSIX flags and
// reject with the Perl flags.
+var onlyPOSIX = []string{
+ "a++",
+ "a**",
+ "a?*",
+ "a+*",
+ "a{1}*",
+}
+
+// TestParseInvalidRegexps checks which patterns Parse rejects:
+// invalidRegexps must fail under both Perl and POSIX flags, onlyPerl must
+// parse under Perl but fail under POSIX, and onlyPOSIX the reverse.
+func TestParseInvalidRegexps(t *testing.T) {
+	// Rejected by both syntaxes.
+	for _, src := range invalidRegexps {
+		re, err := Parse(src, Perl)
+		if err == nil {
+			t.Errorf("Parse(%#q, Perl) = %s, should have failed", src, dump(re))
+		}
+		re, err = Parse(src, POSIX)
+		if err == nil {
+			t.Errorf("Parse(%#q, POSIX) = %s, should have failed", src, dump(re))
+		}
+	}
+
+	// Accepted by Perl, rejected by POSIX.
+	for _, src := range onlyPerl {
+		_, err := Parse(src, Perl)
+		if err != nil {
+			t.Errorf("Parse(%#q, Perl): %v", src, err)
+		}
+		re, err := Parse(src, POSIX)
+		if err == nil {
+			t.Errorf("Parse(%#q, POSIX) = %s, should have failed", src, dump(re))
+		}
+	}
+
+	// Rejected by Perl, accepted by POSIX.
+	for _, src := range onlyPOSIX {
+		re, err := Parse(src, Perl)
+		if err == nil {
+			t.Errorf("Parse(%#q, Perl) = %s, should have failed", src, dump(re))
+		}
+		_, err = Parse(src, POSIX)
+		if err != nil {
+			t.Errorf("Parse(%#q, POSIX): %v", src, err)
+		}
+	}
+}
+
+// TestToStringEquivalentParse checks that Regexp.String round-trips.
+// For every parseTests case parsed with testFlags, the string form must
+// either equal the original pattern or, when re-parsed, produce a tree
+// with the same dump and stringify to itself again.
+func TestToStringEquivalentParse(t *testing.T) {
+	for _, tt := range parseTests {
+		re, err := Parse(tt.Regexp, testFlags)
+		if err != nil {
+			t.Errorf("Parse(%#q): %v", tt.Regexp, err)
+			continue
+		}
+		d := dump(re)
+		if d != tt.Dump {
+			t.Errorf("Parse(%#q).Dump() = %#q want %#q", tt.Regexp, d, tt.Dump)
+			continue
+		}
+
+		s := re.String()
+		if s != tt.Regexp {
+			// If ToString didn't return the original regexp,
+			// it must have found one with fewer parens.
+			// Unfortunately we can't check the length here, because
+			// ToString produces "\\{" for a literal brace,
+			// but "{" is a shorter equivalent in some contexts.
+			nre, err := Parse(s, testFlags)
+			if err != nil {
+				// Report the re-stringified regexp s; the original passed
+				// the *testing.T here, which printed the wrong value.
+				t.Errorf("Parse(%#q.String() = %#q): %v", tt.Regexp, s, err)
+				continue
+			}
+			// The re-parsed tree must be structurally identical.
+			nd := dump(nre)
+			if d != nd {
+				t.Errorf("Parse(%#q) -> %#q; %#q vs %#q", tt.Regexp, s, d, nd)
+			}
+
+			// String must be a fixed point after one round trip.
+			ns := nre.String()
+			if s != ns {
+				t.Errorf("Parse(%#q) -> %#q -> %#q", tt.Regexp, s, ns)
+			}
+		}
+	}
+}