stack []*Regexp // stack of parsed expressions
numCap int // number of capturing groups seen
wholeRegexp string
+ tmpClass []int // temporary char class work space
}
// Parse stack manipulation.
if r != nil {
re.Rune = r
t = rest
- // TODO: Handle FoldCase flag.
p.push(re)
break BigSwitch
}
if r > unicode.MaxRune {
break Switch
}
+ nhex++
}
if nhex == 0 {
break Switch
if g.sign == 0 {
return
}
- if g.sign < 0 {
- r = appendNegatedClass(r, g.class)
- } else {
- r = appendClass(r, g.class)
- }
- return r, s[2:]
+ return p.appendGroup(r, g), s[2:]
}
// parseNamedClass parses a leading POSIX named character class like [:alnum:]
if g.sign == 0 {
return nil, "", &Error{ErrInvalidCharRange, name}
}
- if g.sign < 0 {
- r = appendNegatedClass(r, g.class)
+ return p.appendGroup(r, g), s, nil
+}
+
+func (p *parser) appendGroup(r []int, g charGroup) []int {
+ if p.flags&FoldCase == 0 {
+ if g.sign < 0 {
+ r = appendNegatedClass(r, g.class)
+ } else {
+ r = appendClass(r, g.class)
+ }
} else {
- r = appendClass(r, g.class)
+ tmp := p.tmpClass[:0]
+ tmp = appendFoldedClass(tmp, g.class)
+ p.tmpClass = tmp
+ tmp = cleanClass(&p.tmpClass)
+ if g.sign < 0 {
+ r = appendNegatedClass(r, tmp)
+ } else {
+ r = appendClass(r, tmp)
+ }
}
- return r, s, nil
+ return r
}
-// unicodeTable returns the unicode.RangeTable identified by name.
-func unicodeTable(name string) *unicode.RangeTable {
+// unicodeTable returns the unicode.RangeTable identified by name
+// and the table of additional fold-equivalent code points.
+func unicodeTable(name string) (*unicode.RangeTable, *unicode.RangeTable) {
if t := unicode.Categories[name]; t != nil {
- return t
+ return t, unicode.FoldCategory[name]
}
if t := unicode.Scripts[name]; t != nil {
- return t
+ return t, unicode.FoldScript[name]
}
- return nil
+ return nil, nil
}
// parseUnicodeClass parses a leading Unicode character class like \p{Han}
name = name[1:]
}
- tab := unicodeTable(name)
+ tab, fold := unicodeTable(name)
if tab == nil {
return nil, "", &Error{ErrInvalidCharRange, seq}
}
- if sign > 0 {
- r = appendTable(r, tab)
+
+ if p.flags&FoldCase == 0 || fold == nil {
+ if sign > 0 {
+ r = appendTable(r, tab)
+ } else {
+ r = appendNegatedTable(r, tab)
+ }
} else {
- r = appendNegatedTable(r, tab)
+ // Merge and clean tab and fold in a temporary buffer.
+ // This is necessary for the negative case and just tidy
+ // for the positive case.
+ tmp := p.tmpClass[:0]
+ tmp = appendTable(tmp, tab)
+ tmp = appendTable(tmp, fold)
+ p.tmpClass = tmp
+ tmp = cleanClass(&p.tmpClass)
+ if sign > 0 {
+ r = appendClass(r, tmp)
+ } else {
+ r = appendNegatedClass(r, tmp)
+ }
}
return r, t, nil
}
return "", &Error{Code: ErrInvalidCharRange, Expr: rng}
}
}
- class = appendRange(class, lo, hi)
+ if p.flags&FoldCase == 0 {
+ class = appendRange(class, lo, hi)
+ } else {
+ class = appendFoldedRange(class, lo, hi)
+ }
}
t = t[1:] // chop ]
// cleanClass sorts the ranges (pairs of elements of r),
// merges them, and eliminates duplicates.
func cleanClass(rp *[]int) []int {
+
// Sort by lo increasing, hi decreasing to break ties.
sort.Sort(ranges{rp})
r := *rp
+ if len(r) < 2 {
+ return r
+ }
+
// Merge abutting, overlapping.
w := 2 // write index
for i := 2; i < len(r); i += 2 {
// appendRange returns the result of appending the range lo-hi to the class r.
func appendRange(r []int, lo, hi int) []int {
- // Expand last range if overlaps or abuts.
- if n := len(r); n > 0 {
- rlo, rhi := r[n-2], r[n-1]
- if lo <= rhi+1 && rlo <= hi+1 {
- if lo < rlo {
- r[n-2] = lo
- }
- if hi > rhi {
- r[n-1] = hi
+ // Expand last range or next to last range if it overlaps or abuts.
+ // Checking two ranges helps when appending case-folded
+ // alphabets, so that one range can be expanding A-Z and the
+ // other expanding a-z.
+ n := len(r)
+ for i := 2; i <= 4; i += 2 { // twice, using i=2, i=4
+ if n >= i {
+ rlo, rhi := r[n-i], r[n-i+1]
+ if lo <= rhi+1 && rlo <= hi+1 {
+ if lo < rlo {
+ r[n-i] = lo
+ }
+ if hi > rhi {
+ r[n-i+1] = hi
+ }
+ return r
}
- return r
}
}
return append(r, lo, hi)
}
+const (
+ // minimum and maximum runes involved in folding.
+ // checked during test.
+ minFold = 0x0041
+ maxFold = 0x1044f
+)
+
+// appendFoldedRange returns the result of appending the range lo-hi
+// and its case folding-equivalent runes to the class r.
+func appendFoldedRange(r []int, lo, hi int) []int {
+ // Optimizations.
+ if lo <= minFold && hi >= maxFold {
+ // Range is full: folding can't add more.
+ return appendRange(r, lo, hi)
+ }
+ if hi < minFold || lo > maxFold {
+ // Range is outside folding possibilities.
+ return appendRange(r, lo, hi)
+ }
+ if lo < minFold {
+ // [lo, minFold-1] needs no folding.
+ r = appendRange(r, lo, minFold-1)
+ lo = minFold
+ }
+ if hi > maxFold {
+ // [maxFold+1, hi] needs no folding.
+ r = appendRange(r, maxFold+1, hi)
+ hi = maxFold
+ }
+
+ // Brute force. Depend on appendRange to coalesce ranges on the fly.
+ for c := lo; c <= hi; c++ {
+ r = appendRange(r, c, c)
+ f := unicode.SimpleFold(c)
+ for f != c {
+ r = appendRange(r, f, f)
+ f = unicode.SimpleFold(f)
+ }
+ }
+ return r
+}
+
// appendClass returns the result of appending the class x to the class r.
// It assume x is clean.
func appendClass(r []int, x []int) []int {
return r
}
+// appendFolded returns the result of appending the case folding of the class x to the class r.
+func appendFoldedClass(r []int, x []int) []int {
+ for i := 0; i < len(x); i += 2 {
+ r = appendFoldedRange(r, x[i], x[i+1])
+ }
+ return r
+}
+
// appendNegatedClass returns the result of appending the negation of the class x to the class r.
// It assumes x is clean.
func appendNegatedClass(r []int, x []int) []int {
{"[a-z]", "cc{0x61-0x7a}"},
{"[^[:lower:]]", "cc{0x0-0x60 0x7b-0x10ffff}"},
{"[[:^lower:]]", "cc{0x0-0x60 0x7b-0x10ffff}"},
- // { "(?i)[[:lower:]]", "cc{0x41-0x5a 0x61-0x7a 0x17f 0x212a}" },
- // { "(?i)[a-z]", "cc{0x41-0x5a 0x61-0x7a 0x17f 0x212a}" },
- // { "(?i)[^[:lower:]]", "cc{0x0-0x40 0x5b-0x60 0x7b-0x17e 0x180-0x2129 0x212b-0x10ffff}" },
- // { "(?i)[[:^lower:]]", "cc{0x0-0x40 0x5b-0x60 0x7b-0x17e 0x180-0x2129 0x212b-0x10ffff}" },
+ {"(?i)[[:lower:]]", "cc{0x41-0x5a 0x61-0x7a 0x17f 0x212a}"},
+ {"(?i)[a-z]", "cc{0x41-0x5a 0x61-0x7a 0x17f 0x212a}"},
+ {"(?i)[^[:lower:]]", "cc{0x0-0x40 0x5b-0x60 0x7b-0x17e 0x180-0x2129 0x212b-0x10ffff}"},
+ {"(?i)[[:^lower:]]", "cc{0x0-0x40 0x5b-0x60 0x7b-0x17e 0x180-0x2129 0x212b-0x10ffff}"},
{"\\d", "cc{0x30-0x39}"},
{"\\D", "cc{0x0-0x2f 0x3a-0x10ffff}"},
{"\\s", "cc{0x9-0xa 0xc-0xd 0x20}"},
{"\\S", "cc{0x0-0x8 0xb 0xe-0x1f 0x21-0x10ffff}"},
{"\\w", "cc{0x30-0x39 0x41-0x5a 0x5f 0x61-0x7a}"},
{"\\W", "cc{0x0-0x2f 0x3a-0x40 0x5b-0x5e 0x60 0x7b-0x10ffff}"},
- // { "(?i)\\w", "cc{0x30-0x39 0x41-0x5a 0x5f 0x61-0x7a 0x17f 0x212a}" },
- // { "(?i)\\W", "cc{0x0-0x2f 0x3a-0x40 0x5b-0x5e 0x60 0x7b-0x17e 0x180-0x2129 0x212b-0x10ffff}" },
+ {"(?i)\\w", "cc{0x30-0x39 0x41-0x5a 0x5f 0x61-0x7a 0x17f 0x212a}"},
+ {"(?i)\\W", "cc{0x0-0x2f 0x3a-0x40 0x5b-0x5e 0x60 0x7b-0x17e 0x180-0x2129 0x212b-0x10ffff}"},
{"[^\\\\]", "cc{0x0-0x5b 0x5d-0x10ffff}"},
// { "\\C", "byte{}" },
{"[\\p{^Braille}]", "cc{0x0-0x27ff 0x2900-0x10ffff}"},
{"[\\P{^Braille}]", "cc{0x2800-0x28ff}"},
{"[\\pZ]", "cc{0x20 0xa0 0x1680 0x180e 0x2000-0x200a 0x2028-0x2029 0x202f 0x205f 0x3000}"},
+ {"\\p{Lu}", mkCharClass(unicode.IsUpper)},
+ {"[\\p{Lu}]", mkCharClass(unicode.IsUpper)},
+ {"(?i)[\\p{Lu}]", mkCharClass(isUpperFold)},
+
+ // Hex, octal.
+ {"[\\012-\\234]\\141", "cat{cc{0xa-0x9c}lit{a}}"},
+ {"[\\x{41}-\\x7a]\\x61", "cat{cc{0x41-0x7a}lit{a}}"},
// More interesting regular expressions.
// { "a{,2}", "str{a{,2}}" },
}
b.WriteByte('}')
}
+
+func mkCharClass(f func(int) bool) string {
+ re := &Regexp{Op: OpCharClass}
+ lo := -1
+ for i := 0; i <= unicode.MaxRune; i++ {
+ if f(i) {
+ if lo < 0 {
+ lo = i
+ }
+ } else {
+ if lo >= 0 {
+ re.Rune = append(re.Rune, lo, i-1)
+ lo = -1
+ }
+ }
+ }
+ if lo >= 0 {
+ re.Rune = append(re.Rune, lo, unicode.MaxRune)
+ }
+ return dump(re)
+}
+
+func isUpperFold(rune int) bool {
+ if unicode.IsUpper(rune) {
+ return true
+ }
+ c := unicode.SimpleFold(rune)
+ for c != rune {
+ if unicode.IsUpper(c) {
+ return true
+ }
+ c = unicode.SimpleFold(c)
+ }
+ return false
+}
+
+func TestFoldConstants(t *testing.T) {
+ last := -1
+ for i := 0; i <= unicode.MaxRune; i++ {
+ if unicode.SimpleFold(i) == i {
+ continue
+ }
+ if last == -1 && minFold != i {
+ t.Errorf("minFold=%#U should be %#U", minFold, i)
+ }
+ last = i
+ }
+ if maxFold != last {
+ t.Errorf("maxFold=%#U should be %#U", maxFold, last)
+ }
+}
+
+func TestAppendRangeCollapse(t *testing.T) {
+ // AppendRange should collapse each of the new ranges
+ // into the earlier ones (it looks back two ranges), so that
+ // the slice never grows very large.
+ // Note that we are not calling cleanClass.
+ var r []int
+ for i := 'A'; i <= 'Z'; i++ {
+ r = appendRange(r, i, i)
+ r = appendRange(r, i+'a'-'A', i+'a'-'A')
+ }
+ if string(r) != "AZaz" {
+ t.Errorf("appendRange interlaced A-Z a-z = %s, want AZaz", string(r))
+ }
+}