exp/regexp/syntax: finish Regexp manipulation

author Russ Cox <rsc@golang.org>

Thu, 30 Jun 2011 14:26:22 +0000 (10:26 -0400)

committer Russ Cox <rsc@golang.org>

Thu, 30 Jun 2011 14:26:22 +0000 (10:26 -0400)
author Russ Cox <rsc@golang.org>
Thu, 30 Jun 2011 14:26:22 +0000 (10:26 -0400)
committer Russ Cox <rsc@golang.org>
Thu, 30 Jun 2011 14:26:22 +0000 (10:26 -0400)
diff --git a/src/pkg/exp/regexp/syntax/parse.go b/src/pkg/exp/regexp/syntax/parse.go

index ae40d5fc94b2ca68a1c5588d41b706d499246617..b6c91f7e1d41bb77fb5fa1ebfa2566e576220dc8 100644 (file)
--- a/src/pkg/exp/regexp/syntax/parse.go
+++ b/src/pkg/exp/regexp/syntax/parse.go
@@ -106,8 +106,6 @@ func (p *parser) reuse(re *Regexp) {
  
  // push pushes the regexp re onto the parse stack and returns the regexp.
  func (p *parser) push(re *Regexp) *Regexp {
-       // TODO: compute simple
-
         if re.Op == OpCharClass && len(re.Rune) == 2 && re.Rune[0] == re.Rune[1] {
                 // Single rune.
                 if p.maybeConcat(re.Rune[0], p.flags&^FoldCase) {
@@ -250,7 +248,7 @@ func (p *parser) concat() *Regexp {
                 return p.push(p.newRegexp(OpEmptyMatch))
         }
  
-       return p.collapse(subs, OpConcat)
+       return p.push(p.collapse(subs, OpConcat))
  }
  
  // alternate replaces the top of the stack (above the topmost '(') with its alternation.
@@ -276,7 +274,7 @@ func (p *parser) alternate() *Regexp {
                 return p.push(p.newRegexp(OpNoMatch))
         }
  
-       return p.collapse(subs, OpAlternate)
+       return p.push(p.collapse(subs, OpAlternate))
  }
  
  // cleanAlt cleans re for eventual inclusion in an alternation.
@@ -302,13 +300,13 @@ func cleanAlt(re *Regexp) {
         }
  }
  
-// collapse pushes the result of applying op to sub
-// onto the stack.  If sub contains op nodes, they all
-// get flattened into a single node.
-// sub points into p.stack so it cannot be kept.
+// collapse returns the result of applying op to sub.
+// If sub contains op nodes, they all get hoisted up
+// so that there is never a concat of a concat or an
+// alternate of an alternate.
  func (p *parser) collapse(subs []*Regexp, op Op) *Regexp {
         if len(subs) == 1 {
-               return p.push(subs[0])
+               return subs[0]
         }
         re := p.newRegexp(op)
         re.Sub = re.Sub0[:0]
@@ -320,7 +318,295 @@ func (p *parser) collapse(subs []*Regexp, op Op) *Regexp {
                         re.Sub = append(re.Sub, sub)
                 }
         }
-       return p.push(re)
+       if op == OpAlternate {
+               re.Sub = p.factor(re.Sub, re.Flags)
+               if len(re.Sub) == 1 {
+                       old := re
+                       re = re.Sub[0]
+                       p.reuse(old)
+               }
+       }
+       return re
+}
+
+// factor factors common prefixes from the alternation list sub.
+// It returns a replacement list that reuses the same storage and
+// frees (passes to p.reuse) any removed *Regexps.
+//
+// For example,
+//     ABC|ABD|AEF|BCX|BCY
+// simplifies by literal prefix extraction to
+//     A(B(C|D)|EF)|BC(X|Y)
+// which simplifies by character class introduction to
+//     A(B[CD]|EF)|BC[XY]
+// NOTE(review): the flags parameter is not referenced in this body.
+func (p *parser) factor(sub []*Regexp, flags Flags) []*Regexp {
+       if len(sub) < 2 {
+               return sub
+       }
+
+       // Round 1: Factor out common literal prefixes.
+       var str []int
+       var strflags Flags
+       start := 0
+       out := sub[:0]
+       for i := 0; i <= len(sub); i++ {
+               // Invariant: the Regexps that were in sub[0:start] have been
+               // used or marked for reuse, and the slice space has been reused
+               // for out (len(out) <= start).
+               //
+               // Invariant: sub[start:i] consists of regexps that all begin
+               // with str as modified by strflags.
+               var istr []int
+               var iflags Flags
+               if i < len(sub) {
+                       istr, iflags = p.leadingString(sub[i])
+                       if iflags == strflags {
+                               same := 0
+                               for same < len(str) && same < len(istr) && str[same] == istr[same] {
+                                       same++
+                               }
+                               if same > 0 {
+                                       // Matches at least one rune in current range.
+                                       // Keep going around, narrowing str to the shared part.
+                                       str = str[:same]
+                                       continue
+                               }
+                       }
+               }
+
+               // Found end of a run with common leading literal string:
+               // sub[start:i] all begin with str[0:len(str)], but sub[i]
+               // does not even begin with str[0] (or i == len(sub)).
+               //
+               // Factor out common string and append factored expression to out.
+               if i == start {
+                       // Nothing to do - run of length 0.
+               } else if i == start+1 {
+                       // Just one: don't bother factoring.
+                       out = append(out, sub[start])
+               } else {
+                       // Construct factored form: prefix(suffix1|suffix2|...)
+                       prefix := p.newRegexp(OpLiteral)
+                       prefix.Flags = strflags
+                       prefix.Rune = append(prefix.Rune[:0], str...)
+
+                       for j := start; j < i; j++ {
+                               sub[j] = p.removeLeadingString(sub[j], len(str))
+                       }
+                       suffix := p.collapse(sub[start:i], OpAlternate) // recurse
+
+                       re := p.newRegexp(OpConcat)
+                       re.Sub = append(re.Sub[:0], prefix, suffix)
+                       out = append(out, re)
+               }
+
+               // Prepare for next iteration.
+               start = i
+               str = istr
+               strflags = iflags
+       }
+       sub = out
+
+       // Round 2: Factor out common complex prefixes,
+       // just the first piece of each concatenation,
+       // whatever it is.  This is good enough a lot of the time.
+       start = 0
+       out = sub[:0]
+       var first *Regexp
+       for i := 0; i <= len(sub); i++ {
+               // Invariant: the Regexps that were in sub[0:start] have been
+               // used or marked for reuse, and the slice space has been reused
+               // for out (len(out) <= start).
+               //
+               // Invariant: sub[start:i] consists of regexps that all begin
+               // with a leading regexp equal to first.
+               var ifirst *Regexp
+               if i < len(sub) {
+                       ifirst = p.leadingRegexp(sub[i])
+                       if first != nil && first.Equal(ifirst) {
+                               continue
+                       }
+               }
+
+               // Found end of a run with common leading regexp:
+               // sub[start:i] all begin with first but sub[i] does not.
+               //
+               // Factor out common regexp and append factored expression to out.
+               if i == start {
+                       // Nothing to do - run of length 0.
+               } else if i == start+1 {
+                       // Just one: don't bother factoring.
+                       out = append(out, sub[start])
+               } else {
+                       // Construct factored form: prefix(suffix1|suffix2|...)
+                       prefix := first
+
+                       for j := start; j < i; j++ {
+                               reuse := j != start // prefix came from sub[start], so keep that one alive
+                               sub[j] = p.removeLeadingRegexp(sub[j], reuse)
+                       }
+                       suffix := p.collapse(sub[start:i], OpAlternate) // recurse
+
+                       re := p.newRegexp(OpConcat)
+                       re.Sub = append(re.Sub[:0], prefix, suffix)
+                       out = append(out, re)
+               }
+
+               // Prepare for next iteration.
+               start = i
+               first = ifirst
+       }
+       sub = out
+
+       // Round 3: Collapse runs of single literals into character classes.
+       start = 0
+       out = sub[:0]
+       for i := 0; i <= len(sub); i++ {
+               // Invariant: the Regexps that were in sub[0:start] have been
+               // used or marked for reuse, and the slice space has been reused
+               // for out (len(out) <= start).
+               //
+               // Invariant: sub[start:i] consists of regexps that are either
+               // literal runes or character classes.
+               if i < len(sub) && isCharClass(sub[i]) {
+                       continue
+               }
+
+               // sub[i] is not a char or char class;
+               // emit char class for sub[start:i]...
+               if i == start {
+                       // Nothing to do - run of length 0.
+               } else if i == start+1 {
+                       out = append(out, sub[start])
+               } else {
+                       // Make new char class.
+                       // Move the most complex regexp (highest Op, then most runes) into sub[start].
+                       max := start
+                       for j := start + 1; j < i; j++ {
+                               if sub[max].Op < sub[j].Op || sub[max].Op == sub[j].Op && len(sub[max].Rune) < len(sub[j].Rune) {
+                                       max = j
+                               }
+                       }
+                       sub[start], sub[max] = sub[max], sub[start]
+
+                       for j := start + 1; j < i; j++ {
+                               mergeCharClass(sub[start], sub[j])
+                               p.reuse(sub[j])
+                       }
+                       cleanAlt(sub[start])
+                       out = append(out, sub[start])
+               }
+
+               // ... and then emit sub[i].
+               if i < len(sub) {
+                       out = append(out, sub[i])
+               }
+               start = i + 1
+       }
+       sub = out
+
+       // Round 4: Collapse runs of empty matches into a single empty match.
+       start = 0 // NOTE(review): start is not read by the loop below
+       out = sub[:0]
+       for i := range sub {
+               if i+1 < len(sub) && sub[i].Op == OpEmptyMatch && sub[i+1].Op == OpEmptyMatch {
+                       continue
+               }
+               out = append(out, sub[i])
+       }
+       sub = out
+
+       return sub
+}
+
+// leadingString returns the leading literal string that re begins with.
+// The string refers to storage in re or its children.
+func (p *parser) leadingString(re *Regexp) ([]int, Flags) {
+       if re.Op == OpConcat && len(re.Sub) > 0 {
+               re = re.Sub[0]
+       }
+       if re.Op != OpLiteral {
+               return nil, 0
+       }
+       return re.Rune, re.Flags & FoldCase
+}
+
+// removeLeadingString removes the first n leading runes from the literal
+// that re begins with, modifying re in place.  It returns the replacement for re.
+func (p *parser) removeLeadingString(re *Regexp, n int) *Regexp {
+       if re.Op == OpConcat && len(re.Sub) > 0 {
+               // Strip the runes from the first element; if that leaves it
+               // empty, the concatenation itself can be simplified.
+               sub := re.Sub[0]
+               sub = p.removeLeadingString(sub, n)
+               re.Sub[0] = sub
+               if sub.Op == OpEmptyMatch {
+                       p.reuse(sub)
+                       switch len(re.Sub) {
+                       case 0, 1:
+                               // Impossible but handle: nothing remains after the prefix.
+                               re.Op = OpEmptyMatch
+                               re.Sub = nil
+                       case 2: // one element remains; return it in place of the concat
+                               old := re
+                               re = re.Sub[1]
+                               p.reuse(old)
+                       default: // drop the emptied first element, keep the concat
+                               copy(re.Sub, re.Sub[1:])
+                               re.Sub = re.Sub[:len(re.Sub)-1]
+                       }
+               }
+               return re
+       }
+
+       if re.Op == OpLiteral {
+               re.Rune = re.Rune[:copy(re.Rune, re.Rune[n:])] // shift remaining runes down; requires n <= len(re.Rune)
+               if len(re.Rune) == 0 {
+                       re.Op = OpEmptyMatch
+               }
+       }
+       return re
+}
+
+// leadingRegexp returns the leading regexp that re begins with.
+// The regexp refers to storage in re or its children.
+func (p *parser) leadingRegexp(re *Regexp) *Regexp {
+       if re.Op == OpEmptyMatch {
+               return nil
+       }
+       if re.Op == OpConcat && len(re.Sub) > 0 {
+               sub := re.Sub[0]
+               if sub.Op == OpEmptyMatch {
+                       return nil
+               }
+               return sub
+       }
+       return re
+}
+
+// removeLeadingRegexp removes the leading regexp in re.
+// It returns the replacement for re.
+// If reuse is true, it passes the removed regexp (if no longer needed) to p.reuse.
+func (p *parser) removeLeadingRegexp(re *Regexp, reuse bool) *Regexp {
+       if re.Op == OpConcat && len(re.Sub) > 0 {
+               if reuse {
+                       p.reuse(re.Sub[0])
+               }
+               re.Sub = re.Sub[:copy(re.Sub, re.Sub[1:])]
+               switch len(re.Sub) {
+               case 0:
+                       re.Op = OpEmptyMatch
+                       re.Sub = nil
+               case 1:
+                       old := re
+                       re = re.Sub[0]
+                       p.reuse(old)
+               }
+               return re
+       }
+       re.Op = OpEmptyMatch
+       return re
  }
  
  func literalRegexp(s string, flags Flags) *Regexp {
@@ -752,6 +1038,36 @@ func (p *parser) parseVerticalBar() os.Error {
         return nil
  }
  
+// mergeCharClass makes dst = dst|src.
+// The caller must ensure that dst.Op >= src.Op,
+// to reduce the amount of copying.
+func mergeCharClass(dst, src *Regexp) {
+       switch dst.Op {
+       case OpAnyChar:
+               // src doesn't add anything.
+       case OpAnyCharNotNL:
+               // src might add \n
+               if matchRune(src, '\n') {
+                       dst.Op = OpAnyChar
+               }
+       case OpCharClass:
+               // src is simpler, so either literal or char class
+               if src.Op == OpLiteral {
+                       dst.Rune = appendRange(dst.Rune, src.Rune[0], src.Rune[0])
+               } else {
+                       dst.Rune = appendClass(dst.Rune, src.Rune)
+               }
+       case OpLiteral:
+               // both literal
+               if src.Rune[0] == dst.Rune[0] {
+                       break
+               }
+               dst.Op = OpCharClass
+               dst.Rune = append(dst.Rune, dst.Rune[0])
+               dst.Rune = appendRange(dst.Rune, src.Rune[0], src.Rune[0])
+       }
+}
+
  // If the top of the stack is an element followed by an opVerticalBar
  // swapVerticalBar swaps the two and returns true.
  // Otherwise it returns false.
@@ -767,30 +1083,7 @@ func (p *parser) swapVerticalBar() bool {
                         re1, re3 = re3, re1
                         p.stack[n-3] = re3
                 }
-               switch re3.Op {
-               case OpAnyChar:
-                       // re1 doesn't add anything.
-               case OpAnyCharNotNL:
-                       // re1 might add \n
-                       if matchRune(re1, '\n') {
-                               re3.Op = OpAnyChar
-                       }
-               case OpCharClass:
-                       // re1 is simpler, so either literal or char class
-                       if re1.Op == OpLiteral {
-                               re3.Rune = appendRange(re3.Rune, re1.Rune[0], re1.Rune[0])
-                       } else {
-                               re3.Rune = appendClass(re3.Rune, re1.Rune)
-                       }
-               case OpLiteral:
-                       // both literal
-                       if re1.Rune[0] == re3.Rune[0] {
-                               break
-                       }
-                       re3.Op = OpCharClass
-                       re3.Rune = append(re3.Rune, re3.Rune[0])
-                       re3.Rune = appendRange(re3.Rune, re1.Rune[0], re1.Rune[0])
-               }
+               mergeCharClass(re3, re1)
                 p.reuse(re1)
                 p.stack = p.stack[:n-1]
                 return true
@@ -1432,10 +1725,11 @@ func negateClass(r []int) []int {
                 }
                 nextLo = hi + 1
         }
+       r = r[:w]
         if nextLo <= unicode.MaxRune {
                 // It's possible for the negation to have one more
                 // range - this one - than the original class, so use append.
-               r = append(r[:w], nextLo, unicode.MaxRune)
+               r = append(r, nextLo, unicode.MaxRune)
         }
         return r
  }
diff --git a/src/pkg/exp/regexp/syntax/parse_test.go b/src/pkg/exp/regexp/syntax/parse_test.go

index 51856b613ebd6b290a7e7814df973026206d1a5b..779b9afdeae06c80244a2ad767a4cc019a75dd15 100644 (file)
--- a/src/pkg/exp/regexp/syntax/parse_test.go
+++ b/src/pkg/exp/regexp/syntax/parse_test.go
@@ -39,8 +39,7 @@ var parseTests = []struct {
         {`a{2,3}?`, `nrep{2,3 lit{a}}`},
         {`a{2,}?`, `nrep{2,-1 lit{a}}`},
         {``, `emp{}`},
-       //      { `|`, `emp{}` },  // alt{emp{}emp{}} but got factored
-       {`|`, `alt{emp{}emp{}}`},
+       {`|`, `emp{}`}, // alt{emp{}emp{}} but got factored
         {`|x|`, `alt{emp{}lit{x}emp{}}`},
         {`.`, `dot{}`},
         {`^`, `bol{}`},
@@ -64,6 +63,9 @@ var parseTests = []struct {
         {`\-`, `lit{-}`},
         {`-`, `lit{-}`},
         {`\_`, `lit{_}`},
+       {`abc`, `str{abc}`},
+       {`abc|def`, `alt{str{abc}str{def}}`},
+       {`abc|def|ghi`, `alt{str{abc}str{def}str{ghi}}`},
  
         // Posix and Perl extensions
         {`[[:lower:]]`, `cc{0x61-0x7a}`},
@@ -156,6 +158,10 @@ var parseTests = []struct {
         // Strings
         {`abcde`, `str{abcde}`},
         {`[Aa][Bb]cd`, `cat{strfold{AB}str{cd}}`},
+
+       // Factoring.
+       {`abc|abd|aef|bcx|bcy`, `alt{cat{lit{a}alt{cat{lit{b}cc{0x63-0x64}}str{ef}}}cat{str{bc}cc{0x78-0x79}}}`},
+       {`ax+y|ax+z|ay+w`, `cat{lit{a}alt{cat{plus{lit{x}}cc{0x79-0x7a}}cat{plus{lit{y}}lit{w}}}}`},
  }
  
  const testFlags = MatchNL | PerlX | UnicodeGroups
diff --git a/src/pkg/exp/regexp/syntax/regexp.go b/src/pkg/exp/regexp/syntax/regexp.go

index 248ace503c877c02118ac4db8f067bfea83d46b1..00a4addefc4ba23d390f8cbf8c0efed0fcdc4bb5 100644 (file)
--- a/src/pkg/exp/regexp/syntax/regexp.go
+++ b/src/pkg/exp/regexp/syntax/regexp.go
@@ -60,6 +60,59 @@ const (
  
  const opPseudo Op = 128 // where pseudo-ops start
  
+// Equal returns true if x and y have identical structure.
+func (x *Regexp) Equal(y *Regexp) bool {
+       if x == nil || y == nil {
+               return x == y
+       }
+       if x.Op != y.Op {
+               return false
+       }
+       switch x.Op {
+       case OpEndText:
+               // The parse flags remember whether this is \z or \Z.
+               if x.Flags&WasDollar != y.Flags&WasDollar {
+                       return false
+               }
+
+       case OpLiteral, OpCharClass:
+               if len(x.Rune) != len(y.Rune) {
+                       return false
+               }
+               for i, r := range x.Rune {
+                       if r != y.Rune[i] {
+                               return false
+                       }
+               }
+
+       case OpAlternate, OpConcat:
+               if len(x.Sub) != len(y.Sub) {
+                       return false
+               }
+               for i, sub := range x.Sub {
+                       if !sub.Equal(y.Sub[i]) {
+                               return false
+                       }
+               }
+
+       case OpStar, OpPlus, OpQuest:
+               if x.Flags&NonGreedy != y.Flags&NonGreedy || !x.Sub[0].Equal(y.Sub[0]) {
+                       return false
+               }
+
+       case OpRepeat:
+               if x.Flags&NonGreedy != y.Flags&NonGreedy || x.Min != y.Min || x.Max != y.Max || !x.Sub[0].Equal(y.Sub[0]) {
+                       return false
+               }
+
+       case OpCapture:
+               if x.Cap != y.Cap || x.Name != y.Name || !x.Sub[0].Equal(y.Sub[0]) {
+                       return false
+               }
+       }
+       return true
+}
+
  // writeRegexp writes the Perl syntax for the regular expression re to b.
  func writeRegexp(b *bytes.Buffer, re *Regexp) {
         switch re.Op {
@@ -70,16 +123,24 @@ func writeRegexp(b *bytes.Buffer, re *Regexp) {
         case OpEmptyMatch:
                 b.WriteString(`(?:)`)
         case OpLiteral:
+               if re.Flags&FoldCase != 0 {
+                       b.WriteString(`(?i:`)
+               }
                 for _, r := range re.Rune {
                         escape(b, r, false)
                 }
+               if re.Flags&FoldCase != 0 {
+                       b.WriteString(`)`)
+               }
         case OpCharClass:
                 if len(re.Rune)%2 != 0 {
                         b.WriteString(`[invalid char class]`)
                         break
                 }
                 b.WriteRune('[')
-               if len(re.Rune) > 0 && re.Rune[0] == 0 && re.Rune[len(re.Rune)-1] == unicode.MaxRune {
+               if len(re.Rune) == 0 {
+                       b.WriteString(`^\x00-\x{10FFFF}`)
+               } else if re.Rune[0] == 0 && re.Rune[len(re.Rune)-1] == unicode.MaxRune {
                         // Contains 0 and MaxRune.  Probably a negated class.
                         // Print the gaps.
                         b.WriteRune('^')
@@ -126,7 +187,9 @@ func writeRegexp(b *bytes.Buffer, re *Regexp) {
                 } else {
                         b.WriteRune('(')
                 }
-               writeRegexp(b, re.Sub[0])
+               if re.Sub[0].Op != OpEmptyMatch {
+                       writeRegexp(b, re.Sub[0])
+               }
                 b.WriteRune(')')
         case OpStar, OpPlus, OpQuest, OpRepeat:
                 if sub := re.Sub[0]; sub.Op > OpCapture {
@@ -205,6 +268,15 @@ func escape(b *bytes.Buffer, r int, force bool) {
         case '\v':
                 b.WriteString(`\v`)
         default:
+               if r < 0x100 {
+                       b.WriteString(`\x`)
+                       s := strconv.Itob(r, 16)
+                       if len(s) == 1 {
+                               b.WriteRune('0')
+                       }
+                       b.WriteString(s)
+                       break
+               }
                 b.WriteString(`\x{`)
                 b.WriteString(strconv.Itob(r, 16))
                 b.WriteString(`}`)
diff --git a/src/pkg/exp/regexp/syntax/simplify.go b/src/pkg/exp/regexp/syntax/simplify.go

new file mode 100644 (file)

index 0000000..7239041
--- /dev/null
+++ b/src/pkg/exp/regexp/syntax/simplify.go
@@ -0,0 +1,151 @@
+// Copyright 2011 The Go Authors.  All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package syntax
+
+// Simplify returns a regexp equivalent to re but without counted repetitions
+// and with various other simplifications, such as rewriting /(?:a+)+/ to /a+/.
+// The resulting regexp will execute correctly but its string representation
+// will not produce the same parse tree, because capturing parentheses
+// may have been duplicated or removed.  For example, the simplified form
+// for /(x){1,2}/ is /(x)(x)?/ but both parentheses capture as $1.
+// The returned regexp may share structure with or be the original.
+func (re *Regexp) Simplify() *Regexp {
+       if re == nil {
+               return nil
+       }
+       switch re.Op {
+       case OpCapture, OpConcat, OpAlternate:
+               // Simplify children, building new Regexp if children change.
+               nre := re
+               for i, sub := range re.Sub {
+                       nsub := sub.Simplify()
+                       if nre == re && nsub != sub {
+                               // Start a copy, carrying over the unchanged children seen so far.
+                               nre = new(Regexp)
+                               *nre = *re
+                               nre.Rune = nil
+                               nre.Sub = append(nre.Sub0[:0], re.Sub[:i]...)
+                       }
+                       if nre != re {
+                               nre.Sub = append(nre.Sub, nsub)
+                       }
+               }
+               return nre
+
+       case OpStar, OpPlus, OpQuest:
+               sub := re.Sub[0].Simplify()
+               return simplify1(re.Op, re.Flags, sub, re)
+
+       case OpRepeat:
+               // Special special case: x{0} matches the empty string
+               // and doesn't even need to consider x.
+               if re.Min == 0 && re.Max == 0 {
+                       return &Regexp{Op: OpEmptyMatch}
+               }
+
+               // The fun begins.
+               sub := re.Sub[0].Simplify()
+
+               // x{n,} means at least n matches of x.
+               if re.Max == -1 {
+                       // Special case: x{0,} is x*.
+                       if re.Min == 0 {
+                               return simplify1(OpStar, re.Flags, sub, nil)
+                       }
+
+                       // Special case: x{1,} is x+.
+                       if re.Min == 1 {
+                               return simplify1(OpPlus, re.Flags, sub, nil)
+                       }
+
+                       // General case: x{4,} is xxxx+.
+                       nre := &Regexp{Op: OpConcat}
+                       nre.Sub = nre.Sub0[:0]
+                       for i := 0; i < re.Min-1; i++ {
+                               nre.Sub = append(nre.Sub, sub)
+                       }
+                       nre.Sub = append(nre.Sub, simplify1(OpPlus, re.Flags, sub, nil))
+                       return nre
+               }
+
+               // Special case x{0} handled above.
+
+               // Special case: x{1} is just x.
+               if re.Min == 1 && re.Max == 1 {
+                       return sub
+               }
+
+               // General case: x{n,m} means n copies of x and m-n copies of x?
+               // The machine will do less work if we nest the final m-n copies,
+               // so that x{2,5} = xx(x(x(x)?)?)?
+
+               // Build leading prefix: xx.
+               var prefix *Regexp
+               if re.Min > 0 {
+                       prefix = &Regexp{Op: OpConcat}
+                       prefix.Sub = prefix.Sub0[:0]
+                       for i := 0; i < re.Min; i++ {
+                               prefix.Sub = append(prefix.Sub, sub)
+                       }
+               }
+
+               // Build and attach suffix: (x(x(x)?)?)?
+               if re.Max > re.Min {
+                       suffix := simplify1(OpQuest, re.Flags, sub, nil)
+                       for i := re.Min + 1; i < re.Max; i++ {
+                               nre2 := &Regexp{Op: OpConcat}
+                               nre2.Sub = append(nre2.Sub0[:0], sub, suffix)
+                               suffix = simplify1(OpQuest, re.Flags, nre2, nil)
+                       }
+                       if prefix == nil {
+                               return suffix
+                       }
+                       prefix.Sub = append(prefix.Sub, suffix)
+               }
+               if prefix != nil {
+                       return prefix
+               }
+
+               // Reached only when re.Min <= 0 and re.Max <= re.Min, so that neither
+               // a prefix nor a suffix was built.  Handle as impossible match.
+               return &Regexp{Op: OpNoMatch}
+       }
+
+       return re
+}
+
+// simplify1 implements Simplify for the unary OpStar,
+// OpPlus, and OpQuest operators.  It returns the simple regexp
+// equivalent to
+//
+//     Regexp{Op: op, Flags: flags, Sub: {sub}}
+//
+// under the assumption that sub is already simple, and
+// without first allocating that structure.  If the regexp
+// to be returned turns out to be equivalent to re, simplify1
+// returns re instead.
+//
+// simplify1 is factored out of Simplify because the implementation
+// for other operators generates these unary expressions.
+// Letting them call simplify1 makes sure the expressions they
+// generate are simple.
+func simplify1(op Op, flags Flags, sub, re *Regexp) *Regexp {
+       // Special case: repeat the empty string as much as
+       // you want, but it's still the empty string.
+       if sub.Op == OpEmptyMatch {
+               return sub
+       }
+       // The operators are idempotent if the flags match.
+       if op == sub.Op && flags&NonGreedy == sub.Flags&NonGreedy {
+               return sub
+       }
+       if re != nil && re.Op == op && re.Flags&NonGreedy == flags&NonGreedy && sub == re.Sub[0] {
+               return re
+       }
+
+       re = &Regexp{Op: op, Flags: flags}
+       re.Sub = append(re.Sub0[:0], sub)
+       return re
+}
diff --git a/src/pkg/exp/regexp/syntax/simplify_test.go b/src/pkg/exp/regexp/syntax/simplify_test.go

new file mode 100644 (file)

index 0000000..c8cec21
--- /dev/null
+++ b/src/pkg/exp/regexp/syntax/simplify_test.go
@@ -0,0 +1,151 @@
+// Copyright 2011 The Go Authors.  All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package syntax
+
+import "testing"
+
+var simplifyTests = []struct { // table of cases run by TestSimplify
+       Regexp string // pattern handed to Parse
+       Simple string // expected String() of the simplified parse
+}{
+       // Already-simple constructs
+       {`a`, `a`},
+       {`ab`, `ab`},
+       {`a|b`, `[a-b]`},
+       {`ab|cd`, `ab|cd`},
+       {`(ab)*`, `(ab)*`},
+       {`(ab)+`, `(ab)+`},
+       {`(ab)?`, `(ab)?`},
+       {`.`, `.`},
+       {`^`, `^`},
+       {`$`, `$`},
+       {`[ac]`, `[ac]`},
+       {`[^ac]`, `[^ac]`},
+
+       // Posix character classes
+       {`[[:alnum:]]`, `[0-9A-Za-z]`},
+       {`[[:alpha:]]`, `[A-Za-z]`},
+       {`[[:blank:]]`, `[\t ]`},
+       {`[[:cntrl:]]`, `[\x00-\x1f\x7f]`},
+       {`[[:digit:]]`, `[0-9]`},
+       {`[[:graph:]]`, `[!-~]`},
+       {`[[:lower:]]`, `[a-z]`},
+       {`[[:print:]]`, `[ -~]`},
+       {`[[:punct:]]`, "[!-/:-@\\[-`\\{-~]"},
+       {`[[:space:]]`, `[\t-\r ]`},
+       {`[[:upper:]]`, `[A-Z]`},
+       {`[[:xdigit:]]`, `[0-9A-Fa-f]`},
+
+       // Perl character classes
+       {`\d`, `[0-9]`},
+       {`\s`, `[\t-\n\f-\r ]`},
+       {`\w`, `[0-9A-Z_a-z]`},
+       {`\D`, `[^0-9]`},
+       {`\S`, `[^\t-\n\f-\r ]`},
+       {`\W`, `[^0-9A-Z_a-z]`},
+       {`[\d]`, `[0-9]`},
+       {`[\s]`, `[\t-\n\f-\r ]`},
+       {`[\w]`, `[0-9A-Z_a-z]`},
+       {`[\D]`, `[^0-9]`},
+       {`[\S]`, `[^\t-\n\f-\r ]`},
+       {`[\W]`, `[^0-9A-Z_a-z]`},
+
+       // Posix repetitions
+       {`a{1}`, `a`},
+       {`a{2}`, `aa`},
+       {`a{5}`, `aaaaa`},
+       {`a{0,1}`, `a?`},
+       // The next three are illegible because Simplify inserts (?:)
+       // parens instead of () parens to avoid creating extra
+       // captured subexpressions.  The comments show a version with fewer parens.
+       {`(a){0,2}`, `(?:(a)(a)?)?`},                       //       (aa?)?
+       {`(a){0,4}`, `(?:(a)(?:(a)(?:(a)(a)?)?)?)?`},       //   (a(a(aa?)?)?)?
+       {`(a){2,6}`, `(a)(a)(?:(a)(?:(a)(?:(a)(a)?)?)?)?`}, // aa(a(a(aa?)?)?)?
+       {`a{0,2}`, `(?:aa?)?`},                             //       (aa?)?
+       {`a{0,4}`, `(?:a(?:a(?:aa?)?)?)?`},                 //   (a(a(aa?)?)?)?
+       {`a{2,6}`, `aa(?:a(?:a(?:aa?)?)?)?`},               // aa(a(a(aa?)?)?)?
+       {`a{0,}`, `a*`},
+       {`a{1,}`, `a+`},
+       {`a{2,}`, `aa+`},
+       {`a{5,}`, `aaaaa+`},
+
+       // Test that operators simplify their arguments.
+       {`(?:a{1,}){1,}`, `a+`},
+       {`(a{1,}b{1,})`, `(a+b+)`},
+       {`a{1,}|b{1,}`, `a+|b+`},
+       {`(?:a{1,})*`, `(?:a+)*`},
+       {`(?:a{1,})+`, `a+`},
+       {`(?:a{1,})?`, `(?:a+)?`},
+       {``, `(?:)`},
+       {`a{0}`, `(?:)`},
+
+       // Character class simplification
+       {`[ab]`, `[a-b]`},
+       {`[a-za-za-z]`, `[a-z]`},
+       {`[A-Za-zA-Za-z]`, `[A-Za-z]`},
+       {`[ABCDEFGH]`, `[A-H]`},
+       {`[AB-CD-EF-GH]`, `[A-H]`},
+       {`[W-ZP-XE-R]`, `[E-Z]`},
+       {`[a-ee-gg-m]`, `[a-m]`},
+       {`[a-ea-ha-m]`, `[a-m]`},
+       {`[a-ma-ha-e]`, `[a-m]`},
+       {`[a-zA-Z0-9 -~]`, `[ -~]`},
+
+       // Empty character classes
+       {`[^[:cntrl:][:^cntrl:]]`, `[^\x00-\x{10FFFF}]`},
+
+       // Full character classes
+       {`[[:cntrl:][:^cntrl:]]`, `.`},
+
+       // Unicode case folding.
+       {`(?i)A`, `(?i:A)`},
+       {`(?i)a`, `(?i:a)`},
+       {`(?i)[A]`, `(?i:A)`},
+       {`(?i)[a]`, `(?i:A)`},
+       {`(?i)K`, `(?i:K)`},
+       {`(?i)k`, `(?i:k)`},
+       {`(?i)\x{212a}`, "(?i:\u212A)"},
+       {`(?i)[K]`, "[Kk\u212A]"},
+       {`(?i)[k]`, "[Kk\u212A]"},
+       {`(?i)[\x{212a}]`, "[Kk\u212A]"},
+       {`(?i)[a-z]`, "[A-Za-z\u017F\u212A]"},
+       {`(?i)[\x00-\x{FFFD}]`, "[\\x00-\uFFFD]"},
+       {`(?i)[\x00-\x{10FFFF}]`, `.`},
+
+       // Empty string as a regular expression.
+       // The empty string must be preserved inside parens in order
+       // to make submatches work right, so these tests are less
+       // interesting than they might otherwise be.  String inserts
+       // explicit (?:) in place of non-parenthesized empty strings,
+       // to make them easier to spot for other parsers.
+       {`(a|b|)`, `([a-b]|(?:))`},
+       {`(|)`, `()`},
+       {`a()`, `a()`},
+       {`(()|())`, `(()|())`},
+       {`(a|)`, `(a|(?:))`},
+       {`ab()cd()`, `ab()cd()`},
+       {`()`, `()`},
+       {`()*`, `()*`},
+       {`()+`, `()+`},
+       {`()?`, `()?`},
+       {`(){0}`, `(?:)`},
+       {`(){1}`, `()`},
+       {`(){1,}`, `()+`},
+       {`(){0,2}`, `(?:()()?)?`},
+}
+
+func TestSimplify(t *testing.T) {
+       for _, tt := range simplifyTests {
+               re, err := Parse(tt.Regexp, MatchNL|Perl&^OneLine)
+               if err != nil {
+                       t.Errorf("Parse(%#q) = error %v", tt.Regexp, err)
+                       continue
+               }
+               s := re.Simplify().String()
+               if s != tt.Simple {
+                       t.Errorf("Simplify(%#q) = %#q, want %#q", tt.Regexp, s, tt.Simple)
+               }
+       }
+}
author	Russ Cox <rsc@golang.org>
	Thu, 30 Jun 2011 14:26:22 +0000 (10:26 -0400)
committer	Russ Cox <rsc@golang.org>
	Thu, 30 Jun 2011 14:26:22 +0000 (10:26 -0400)
src/pkg/exp/regexp/syntax/parse.go		patch \| blob \| history
src/pkg/exp/regexp/syntax/parse_test.go		patch \| blob \| history
src/pkg/exp/regexp/syntax/regexp.go		patch \| blob \| history
src/pkg/exp/regexp/syntax/simplify.go	[new file with mode: 0644]	patch \| blob
src/pkg/exp/regexp/syntax/simplify_test.go	[new file with mode: 0644]	patch \| blob