exp/regexp/syntax: case-folding in character classes

author Russ Cox <rsc@golang.org>

Tue, 28 Jun 2011 03:23:51 +0000 (23:23 -0400)

committer Russ Cox <rsc@golang.org>

Tue, 28 Jun 2011 03:23:51 +0000 (23:23 -0400)
author Russ Cox <rsc@golang.org>
Tue, 28 Jun 2011 03:23:51 +0000 (23:23 -0400)
committer Russ Cox <rsc@golang.org>
Tue, 28 Jun 2011 03:23:51 +0000 (23:23 -0400)
diff --git a/src/pkg/exp/regexp/syntax/parse.go b/src/pkg/exp/regexp/syntax/parse.go

index d04f25097eb58d1a7ce941a44ba6952afb5a5165..cbde6c6041845d352749e987857eab76c0c0b802 100644 (file)
--- a/src/pkg/exp/regexp/syntax/parse.go
+++ b/src/pkg/exp/regexp/syntax/parse.go
@@ -81,6 +81,7 @@ type parser struct {
         stack       []*Regexp // stack of parsed expressions
         numCap      int       // number of capturing groups seen
         wholeRegexp string
+       tmpClass    []int // temporary char class work space
  }
  
  // Parse stack manipulation.
@@ -371,7 +372,6 @@ func Parse(s string, flags Flags) (*Regexp, os.Error) {
                                 if r != nil {
                                         re.Rune = r
                                         t = rest
-                                       // TODO: Handle FoldCase flag.
                                         p.push(re)
                                         break BigSwitch
                                 }
@@ -729,6 +729,7 @@ Switch:
                                 if r > unicode.MaxRune {
                                         break Switch
                                 }
+                               nhex++
                         }
                         if nhex == 0 {
                                 break Switch
@@ -801,12 +802,7 @@ func (p *parser) parsePerlClassEscape(s string, r []int) (out []int, rest string
         if g.sign == 0 {
                 return
         }
-       if g.sign < 0 {
-               r = appendNegatedClass(r, g.class)
-       } else {
-               r = appendClass(r, g.class)
-       }
-       return r, s[2:]
+       return p.appendGroup(r, g), s[2:]
  }
  
  // parseNamedClass parses a leading POSIX named character class like [:alnum:]
@@ -827,23 +823,40 @@ func (p *parser) parseNamedClass(s string, r []int) (out []int, rest string, err
         if g.sign == 0 {
                 return nil, "", &Error{ErrInvalidCharRange, name}
         }
-       if g.sign < 0 {
-               r = appendNegatedClass(r, g.class)
+       return p.appendGroup(r, g), s, nil
+}
+
+func (p *parser) appendGroup(r []int, g charGroup) []int {
+       if p.flags&FoldCase == 0 {
+               if g.sign < 0 {
+                       r = appendNegatedClass(r, g.class)
+               } else {
+                       r = appendClass(r, g.class)
+               }
         } else {
-               r = appendClass(r, g.class)
+               tmp := p.tmpClass[:0]
+               tmp = appendFoldedClass(tmp, g.class)
+               p.tmpClass = tmp
+               tmp = cleanClass(&p.tmpClass)
+               if g.sign < 0 {
+                       r = appendNegatedClass(r, tmp)
+               } else {
+                       r = appendClass(r, tmp)
+               }
         }
-       return r, s, nil
+       return r
  }
  
-// unicodeTable returns the unicode.RangeTable identified by name.
-func unicodeTable(name string) *unicode.RangeTable {
+// unicodeTable returns the unicode.RangeTable identified by name
+// and the table of additional fold-equivalent code points.
+func unicodeTable(name string) (*unicode.RangeTable, *unicode.RangeTable) {
         if t := unicode.Categories[name]; t != nil {
-               return t
+               return t, unicode.FoldCategory[name]
         }
         if t := unicode.Scripts[name]; t != nil {
-               return t
+               return t, unicode.FoldScript[name]
         }
-       return nil
+       return nil, nil
  }
  
  // parseUnicodeClass parses a leading Unicode character class like \p{Han}
@@ -891,14 +904,31 @@ func (p *parser) parseUnicodeClass(s string, r []int) (out []int, rest string, e
                 name = name[1:]
         }
  
-       tab := unicodeTable(name)
+       tab, fold := unicodeTable(name)
         if tab == nil {
                 return nil, "", &Error{ErrInvalidCharRange, seq}
         }
-       if sign > 0 {
-               r = appendTable(r, tab)
+
+       if p.flags&FoldCase == 0 || fold == nil {
+               if sign > 0 {
+                       r = appendTable(r, tab)
+               } else {
+                       r = appendNegatedTable(r, tab)
+               }
         } else {
-               r = appendNegatedTable(r, tab)
+               // Merge and clean tab and fold in a temporary buffer.
+               // This is necessary for the negative case and just tidy
+               // for the positive case.
+               tmp := p.tmpClass[:0]
+               tmp = appendTable(tmp, tab)
+               tmp = appendTable(tmp, fold)
+               p.tmpClass = tmp
+               tmp = cleanClass(&p.tmpClass)
+               if sign > 0 {
+                       r = appendClass(r, tmp)
+               } else {
+                       r = appendNegatedClass(r, tmp)
+               }
         }
         return r, t, nil
  }
@@ -979,7 +1009,11 @@ func (p *parser) parseClass(s string) (rest string, err os.Error) {
                                 return "", &Error{Code: ErrInvalidCharRange, Expr: rng}
                         }
                 }
-               class = appendRange(class, lo, hi)
+               if p.flags&FoldCase == 0 {
+                       class = appendRange(class, lo, hi)
+               } else {
+                       class = appendFoldedRange(class, lo, hi)
+               }
         }
         t = t[1:] // chop ]
  
@@ -999,10 +1033,15 @@ func (p *parser) parseClass(s string) (rest string, err os.Error) {
  // cleanClass sorts the ranges (pairs of elements of r),
  // merges them, and eliminates duplicates.
  func cleanClass(rp *[]int) []int {
+
         // Sort by lo increasing, hi decreasing to break ties.
         sort.Sort(ranges{rp})
  
         r := *rp
+       if len(r) < 2 {
+               return r
+       }
+
         // Merge abutting, overlapping.
         w := 2 // write index
         for i := 2; i < len(r); i += 2 {
@@ -1025,23 +1064,71 @@ func cleanClass(rp *[]int) []int {
  
  // appendRange returns the result of appending the range lo-hi to the class r.
  func appendRange(r []int, lo, hi int) []int {
-       // Expand last range if overlaps or abuts.
-       if n := len(r); n > 0 {
-               rlo, rhi := r[n-2], r[n-1]
-               if lo <= rhi+1 && rlo <= hi+1 {
-                       if lo < rlo {
-                               r[n-2] = lo
-                       }
-                       if hi > rhi {
-                               r[n-1] = hi
+       // Expand last range or next to last range if it overlaps or abuts.
+       // Checking two ranges helps when appending case-folded
+       // alphabets, so that one range can be expanding A-Z and the
+       // other expanding a-z.
+       n := len(r)
+       for i := 2; i <= 4; i += 2 { // twice, using i=2, i=4
+               if n >= i {
+                       rlo, rhi := r[n-i], r[n-i+1]
+                       if lo <= rhi+1 && rlo <= hi+1 {
+                               if lo < rlo {
+                                       r[n-i] = lo
+                               }
+                               if hi > rhi {
+                                       r[n-i+1] = hi
+                               }
+                               return r
                         }
-                       return r
                 }
         }
  
         return append(r, lo, hi)
  }
  
+const (
+       // minimum and maximum runes involved in folding.
+       // checked during test.
+       minFold = 0x0041
+       maxFold = 0x1044f
+)
+
+// appendFoldedRange returns the result of appending the range lo-hi
+// and its case folding-equivalent runes to the class r.
+func appendFoldedRange(r []int, lo, hi int) []int {
+       // Optimizations.
+       if lo <= minFold && hi >= maxFold {
+               // Range is full: folding can't add more.
+               return appendRange(r, lo, hi)
+       }
+       if hi < minFold || lo > maxFold {
+               // Range is outside folding possibilities.
+               return appendRange(r, lo, hi)
+       }
+       if lo < minFold {
+               // [lo, minFold-1] needs no folding.
+               r = appendRange(r, lo, minFold-1)
+               lo = minFold
+       }
+       if hi > maxFold {
+               // [maxFold+1, hi] needs no folding.
+               r = appendRange(r, maxFold+1, hi)
+               hi = maxFold
+       }
+
+       // Brute force.  Depend on appendRange to coalesce ranges on the fly.
+       for c := lo; c <= hi; c++ {
+               r = appendRange(r, c, c)
+               f := unicode.SimpleFold(c)
+               for f != c {
+                       r = appendRange(r, f, f)
+                       f = unicode.SimpleFold(f)
+               }
+       }
+       return r
+}
+
  // appendClass returns the result of appending the class x to the class r.
  // It assume x is clean.
  func appendClass(r []int, x []int) []int {
@@ -1051,6 +1138,14 @@ func appendClass(r []int, x []int) []int {
         return r
  }
  
+// appendFolded returns the result of appending the case folding of the class x to the class r.
+func appendFoldedClass(r []int, x []int) []int {
+       for i := 0; i < len(x); i += 2 {
+               r = appendFoldedRange(r, x[i], x[i+1])
+       }
+       return r
+}
+
  // appendNegatedClass returns the result of appending the negation of the class x to the class r.
  // It assumes x is clean.
  func appendNegatedClass(r []int, x []int) []int {
diff --git a/src/pkg/exp/regexp/syntax/parse_test.go b/src/pkg/exp/regexp/syntax/parse_test.go

index b52cab1a1a2882e3ac16614652fd3628231ae863..493806979452e72b8ad1c82cb5ae697fce82a06a 100644 (file)
--- a/src/pkg/exp/regexp/syntax/parse_test.go
+++ b/src/pkg/exp/regexp/syntax/parse_test.go
@@ -74,18 +74,18 @@ var parseTests = []struct {
         {"[a-z]", "cc{0x61-0x7a}"},
         {"[^[:lower:]]", "cc{0x0-0x60 0x7b-0x10ffff}"},
         {"[[:^lower:]]", "cc{0x0-0x60 0x7b-0x10ffff}"},
-       //      { "(?i)[[:lower:]]", "cc{0x41-0x5a 0x61-0x7a 0x17f 0x212a}" },
-       //      { "(?i)[a-z]", "cc{0x41-0x5a 0x61-0x7a 0x17f 0x212a}" },
-       //      { "(?i)[^[:lower:]]", "cc{0x0-0x40 0x5b-0x60 0x7b-0x17e 0x180-0x2129 0x212b-0x10ffff}" },
-       //      { "(?i)[[:^lower:]]", "cc{0x0-0x40 0x5b-0x60 0x7b-0x17e 0x180-0x2129 0x212b-0x10ffff}" },
+       {"(?i)[[:lower:]]", "cc{0x41-0x5a 0x61-0x7a 0x17f 0x212a}"},
+       {"(?i)[a-z]", "cc{0x41-0x5a 0x61-0x7a 0x17f 0x212a}"},
+       {"(?i)[^[:lower:]]", "cc{0x0-0x40 0x5b-0x60 0x7b-0x17e 0x180-0x2129 0x212b-0x10ffff}"},
+       {"(?i)[[:^lower:]]", "cc{0x0-0x40 0x5b-0x60 0x7b-0x17e 0x180-0x2129 0x212b-0x10ffff}"},
         {"\\d", "cc{0x30-0x39}"},
         {"\\D", "cc{0x0-0x2f 0x3a-0x10ffff}"},
         {"\\s", "cc{0x9-0xa 0xc-0xd 0x20}"},
         {"\\S", "cc{0x0-0x8 0xb 0xe-0x1f 0x21-0x10ffff}"},
         {"\\w", "cc{0x30-0x39 0x41-0x5a 0x5f 0x61-0x7a}"},
         {"\\W", "cc{0x0-0x2f 0x3a-0x40 0x5b-0x5e 0x60 0x7b-0x10ffff}"},
-       //      { "(?i)\\w", "cc{0x30-0x39 0x41-0x5a 0x5f 0x61-0x7a 0x17f 0x212a}" },
-       //      { "(?i)\\W", "cc{0x0-0x2f 0x3a-0x40 0x5b-0x5e 0x60 0x7b-0x17e 0x180-0x2129 0x212b-0x10ffff}" },
+       {"(?i)\\w", "cc{0x30-0x39 0x41-0x5a 0x5f 0x61-0x7a 0x17f 0x212a}"},
+       {"(?i)\\W", "cc{0x0-0x2f 0x3a-0x40 0x5b-0x5e 0x60 0x7b-0x17e 0x180-0x2129 0x212b-0x10ffff}"},
         {"[^\\\\]", "cc{0x0-0x5b 0x5d-0x10ffff}"},
         //      { "\\C", "byte{}" },
  
@@ -100,6 +100,13 @@ var parseTests = []struct {
         {"[\\p{^Braille}]", "cc{0x0-0x27ff 0x2900-0x10ffff}"},
         {"[\\P{^Braille}]", "cc{0x2800-0x28ff}"},
         {"[\\pZ]", "cc{0x20 0xa0 0x1680 0x180e 0x2000-0x200a 0x2028-0x2029 0x202f 0x205f 0x3000}"},
+       {"\\p{Lu}", mkCharClass(unicode.IsUpper)},
+       {"[\\p{Lu}]", mkCharClass(unicode.IsUpper)},
+       {"(?i)[\\p{Lu}]", mkCharClass(isUpperFold)},
+
+       // Hex, octal.
+       {"[\\012-\\234]\\141", "cat{cc{0xa-0x9c}lit{a}}"},
+       {"[\\x{41}-\\x7a]\\x61", "cat{cc{0x41-0x7a}lit{a}}"},
  
         // More interesting regular expressions.
         //      { "a{,2}", "str{a{,2}}" },
@@ -270,3 +277,69 @@ func dumpRegexp(b *bytes.Buffer, re *Regexp) {
         }
         b.WriteByte('}')
  }
+
+func mkCharClass(f func(int) bool) string {
+       re := &Regexp{Op: OpCharClass}
+       lo := -1
+       for i := 0; i <= unicode.MaxRune; i++ {
+               if f(i) {
+                       if lo < 0 {
+                               lo = i
+                       }
+               } else {
+                       if lo >= 0 {
+                               re.Rune = append(re.Rune, lo, i-1)
+                               lo = -1
+                       }
+               }
+       }
+       if lo >= 0 {
+               re.Rune = append(re.Rune, lo, unicode.MaxRune)
+       }
+       return dump(re)
+}
+
+func isUpperFold(rune int) bool {
+       if unicode.IsUpper(rune) {
+               return true
+       }
+       c := unicode.SimpleFold(rune)
+       for c != rune {
+               if unicode.IsUpper(c) {
+                       return true
+               }
+               c = unicode.SimpleFold(c)
+       }
+       return false
+}
+
+func TestFoldConstants(t *testing.T) {
+       last := -1
+       for i := 0; i <= unicode.MaxRune; i++ {
+               if unicode.SimpleFold(i) == i {
+                       continue
+               }
+               if last == -1 && minFold != i {
+                       t.Errorf("minFold=%#U should be %#U", minFold, i)
+               }
+               last = i
+       }
+       if maxFold != last {
+               t.Errorf("maxFold=%#U should be %#U", maxFold, last)
+       }
+}
+
+func TestAppendRangeCollapse(t *testing.T) {
+       // AppendRange should collapse each of the new ranges
+       // into the earlier ones (it looks back two ranges), so that
+       // the slice never grows very large.
+       // Note that we are not calling cleanClass.
+       var r []int
+       for i := 'A'; i <= 'Z'; i++ {
+               r = appendRange(r, i, i)
+               r = appendRange(r, i+'a'-'A', i+'a'-'A')
+       }
+       if string(r) != "AZaz" {
+               t.Errorf("appendRange interlaced A-Z a-z = %s, want AZaz", string(r))
+       }
+}
author	Russ Cox <rsc@golang.org>
	Tue, 28 Jun 2011 03:23:51 +0000 (23:23 -0400)
committer	Russ Cox <rsc@golang.org>
	Tue, 28 Jun 2011 03:23:51 +0000 (23:23 -0400)
src/pkg/exp/regexp/syntax/parse.go		patch \| blob \| history
src/pkg/exp/regexp/syntax/parse_test.go		patch \| blob \| history