exp/eval\
exp/gui\
exp/gui/x11\
+ exp/regexp/syntax\
expvar\
flag\
fmt\
--- /dev/null
+# Copyright 2011 The Go Authors. All rights reserved.
+# Use of this source code is governed by a BSD-style
+# license that can be found in the LICENSE file.
+
+include ../../../../Make.inc
+
+TARG=exp/regexp/syntax
+GOFILES=\
+ parse.go\
+ regexp.go\
+
+include ../../../../Make.pkg
--- /dev/null
+// Copyright 2011 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package syntax
+
+import (
+ "os"
+ "sort"
+ "unicode"
+ "utf8"
+)
+
+// An Error describes a failure to parse a regular expression
+// and gives the offending expression.
+type Error struct {
+ Code ErrorCode
+ Expr string
+}
+
+func (e *Error) String() string {
+ return "error parsing regexp: " + e.Code.String() + ": `" + e.Expr + "`"
+}
+
+// An ErrorCode describes a failure to parse a regular expression.
+type ErrorCode string
+
+const (
+ // Unexpected error
+ ErrInternalError ErrorCode = "regexp/syntax: internal error"
+
+ // Parse errors
+ ErrInvalidCharClass ErrorCode = "invalid character class"
+ ErrInvalidCharRange ErrorCode = "invalid character class range"
+ ErrInvalidEscape ErrorCode = "invalid escape sequence"
+ ErrInvalidNamedCapture ErrorCode = "invalid named capture"
+ ErrInvalidPerlOp ErrorCode = "invalid or unsupported Perl syntax"
+ ErrInvalidRepeatOp ErrorCode = "invalid nested repetition operator"
+ ErrInvalidRepeatSize ErrorCode = "invalid repeat count"
+ ErrInvalidUTF8 ErrorCode = "invalid UTF-8"
+ ErrMissingBracket ErrorCode = "missing closing ]"
+ ErrMissingParen ErrorCode = "missing closing )"
+ ErrMissingRepeatArgument ErrorCode = "missing argument to repetition operator"
+ ErrTrailingBackslash ErrorCode = "trailing backslash at end of expression"
+)
+
+func (e ErrorCode) String() string {
+ return string(e)
+}
+
+// Flags control the behavior of the parser and record information about regexp context.
+type Flags uint16
+
+const (
+ FoldCase Flags = 1 << iota // case-insensitive match
+ Literal // treat pattern as literal string
+ ClassNL // allow character classes like [^a-z] and [[:space:]] to match newline
+ DotNL // allow . to match newline
+ OneLine // treat ^ and $ as only matching at beginning and end of text
+ NonGreedy // make repetition operators default to non-greedy
+ PerlX // allow Perl extensions
+ UnicodeGroups // allow \p{Han}, \P{Han} for Unicode group and negation
+ WasDollar // regexp OpEndText was $, not \z
+ Simple // regexp contains no counted repetition
+
+ MatchNL = ClassNL | DotNL
+
+ Perl = ClassNL | OneLine | PerlX | UnicodeGroups // as close to Perl as possible
+ POSIX Flags = 0 // POSIX syntax
+)
+
+// Pseudo-ops for parsing stack.
+const (
+ opLeftParen = opPseudo + iota
+ opVerticalBar
+)
+
+type parser struct {
+ flags Flags // parse mode flags
+ stack []*Regexp // stack of parsed expressions
+ numCap int // number of capturing groups seen
+ wholeRegexp string
+}
+
+// Parse stack manipulation.
+
+// push pushes the regexp re onto the parse stack and returns the regexp.
+func (p *parser) push(re *Regexp) *Regexp {
+ // TODO: automatic concatenation
+ // TODO: turn character class into literal
+ // TODO: compute simple
+
+ p.stack = append(p.stack, re)
+ return re
+}
+
+// newLiteral returns a new OpLiteral Regexp with the given flags
+func newLiteral(r int, flags Flags) *Regexp {
+ re := &Regexp{
+ Op: OpLiteral,
+ Flags: flags,
+ }
+ re.Rune0[0] = r
+ re.Rune = re.Rune0[:1]
+ return re
+}
+
+// literal pushes a literal regexp for the rune r on the stack
+// and returns that regexp.
+func (p *parser) literal(r int) *Regexp {
+ return p.push(newLiteral(r, p.flags))
+}
+
+// op pushes a regexp with the given op onto the stack
+// and returns that regexp.
+func (p *parser) op(op Op) *Regexp {
+ return p.push(&Regexp{Op: op, Flags: p.flags})
+}
+
+// repeat replaces the top stack element with itself repeated
+// according to op.
+func (p *parser) repeat(op Op, opstr string) os.Error {
+ n := len(p.stack)
+ if n == 0 {
+ return &Error{ErrMissingRepeatArgument, opstr}
+ }
+ sub := p.stack[n-1]
+ re := &Regexp{
+ Op: op,
+ }
+ re.Sub = re.Sub0[:1]
+ re.Sub[0] = sub
+ p.stack[n-1] = re
+ return nil
+}
+
+// concat replaces the top of the stack (above the topmost '|' or '(') with its concatenation.
+func (p *parser) concat() *Regexp {
+ // TODO: Flatten concats.
+
+ // Scan down to find pseudo-operator | or (.
+ i := len(p.stack)
+ for i > 0 && p.stack[i-1].Op < opPseudo {
+ i--
+ }
+ sub := p.stack[i:]
+ p.stack = p.stack[:i]
+
+ var re *Regexp
+ switch len(sub) {
+ case 0:
+ re = &Regexp{Op: OpEmptyMatch}
+ case 1:
+ re = sub[0]
+ default:
+ re = &Regexp{Op: OpConcat}
+ re.Sub = append(re.Sub0[:0], sub...)
+ }
+ return p.push(re)
+}
+
+// alternate replaces the top of the stack (above the topmost '(') with its alternation.
+func (p *parser) alternate() *Regexp {
+ // TODO: Flatten alternates.
+
+ // Scan down to find pseudo-operator (.
+ // There are no | above (.
+ i := len(p.stack)
+ for i > 0 && p.stack[i-1].Op < opPseudo {
+ i--
+ }
+ sub := p.stack[i:]
+ p.stack = p.stack[:i]
+
+ var re *Regexp
+ switch len(sub) {
+ case 0:
+ re = &Regexp{Op: OpNoMatch}
+ case 1:
+ re = sub[0]
+ default:
+ re = &Regexp{Op: OpAlternate}
+ re.Sub = append(re.Sub0[:0], sub...)
+ }
+ return p.push(re)
+}
+
+// Parsing.
+
+func Parse(s string, flags Flags) (*Regexp, os.Error) {
+ if flags&Literal != 0 {
+ // Trivial parser for literal string.
+ if err := checkUTF8(s); err != nil {
+ return nil, err
+ }
+ re := &Regexp{
+ Op: OpLiteral,
+ Flags: flags,
+ }
+ re.Rune = re.Rune0[:0] // use local storage for small strings
+ for _, c := range s {
+ if len(re.Rune) >= cap(re.Rune) {
+ // string is too long to fit in Rune0. let Go handle it
+ re.Rune = []int(s)
+ break
+ }
+ re.Rune = append(re.Rune, c)
+ }
+ return re, nil
+ }
+
+ // Otherwise, must do real work.
+ var (
+ p parser
+ err os.Error
+ c int
+ op Op
+ )
+ p.flags = flags
+ p.wholeRegexp = s
+ t := s
+ for t != "" {
+ switch t[0] {
+ default:
+ if c, t, err = nextRune(t); err != nil {
+ return nil, err
+ }
+ p.literal(c)
+
+ case '(':
+ // TODO: Actual Perl flag parsing.
+ if len(t) >= 3 && t[1] == '?' && t[2] == ':' {
+ // non-capturing paren
+ p.op(opLeftParen)
+ t = t[3:]
+ break
+ }
+ p.numCap++
+ p.op(opLeftParen).Cap = p.numCap
+ t = t[1:]
+ case '|':
+ p.concat()
+ if err = p.parseVerticalBar(); err != nil {
+ return nil, err
+ }
+ t = t[1:]
+ case ')':
+ if err = p.parseRightParen(); err != nil {
+ return nil, err
+ }
+ t = t[1:]
+ case '^':
+ if p.flags&OneLine != 0 {
+ p.op(OpBeginText)
+ } else {
+ p.op(OpBeginLine)
+ }
+ t = t[1:]
+ case '$':
+ if p.flags&OneLine != 0 {
+ p.op(OpEndText).Flags |= WasDollar
+ } else {
+ p.op(OpEndLine)
+ }
+ t = t[1:]
+ case '.':
+ if p.flags&DotNL != 0 {
+ p.op(OpAnyChar)
+ } else {
+ p.op(OpAnyCharNotNL)
+ }
+ t = t[1:]
+ case '[':
+ if t, err = p.parseClass(t); err != nil {
+ return nil, err
+ }
+ case '*', '+', '?':
+ switch t[0] {
+ case '*':
+ op = OpStar
+ case '+':
+ op = OpPlus
+ case '?':
+ op = OpQuest
+ }
+ // TODO: greedy
+ if err = p.repeat(op, t[0:1]); err != nil {
+ return nil, err
+ }
+ t = t[1:]
+ case '{':
+ return nil, os.NewError("repeat not implemented")
+ case '\\':
+ return nil, os.NewError("escape not implemented")
+ }
+ }
+
+ p.concat()
+ if p.swapVerticalBar() {
+ // pop vertical bar
+ p.stack = p.stack[:len(p.stack)-1]
+ }
+ p.alternate()
+
+ n := len(p.stack)
+ if n != 1 {
+ return nil, &Error{ErrMissingParen, s}
+ }
+ return p.stack[0], nil
+}
+
+// parseVerticalBar handles a | in the input.
+func (p *parser) parseVerticalBar() os.Error {
+ p.concat()
+
+ // The concatenation we just parsed is on top of the stack.
+ // If it sits above an opVerticalBar, swap it below
+ // (things below an opVerticalBar become an alternation).
+ // Otherwise, push a new vertical bar.
+ if !p.swapVerticalBar() {
+ p.op(opVerticalBar)
+ }
+
+ return nil
+}
+
+// If the top of the stack is an element followed by an opVerticalBar
+// swapVerticalBar swaps the two and returns true.
+// Otherwise it returns false.
+func (p *parser) swapVerticalBar() bool {
+ if n := len(p.stack); n >= 2 {
+ re1 := p.stack[n-1]
+ re2 := p.stack[n-2]
+ if re2.Op == opVerticalBar {
+ p.stack[n-2] = re1
+ p.stack[n-1] = re2
+ return true
+ }
+ }
+ return false
+}
+
+// parseRightParen handles a ) in the input.
+func (p *parser) parseRightParen() os.Error {
+ p.concat()
+ if p.swapVerticalBar() {
+ // pop vertical bar
+ p.stack = p.stack[:len(p.stack)-1]
+ }
+ p.alternate()
+
+ n := len(p.stack)
+ if n < 2 {
+ return &Error{ErrInternalError, ""}
+ }
+ re1 := p.stack[n-1]
+ re2 := p.stack[n-2]
+ p.stack = p.stack[:n-2]
+ if re2.Op != opLeftParen {
+ return &Error{ErrMissingParen, p.wholeRegexp}
+ }
+ if re2.Cap == 0 {
+ // Just for grouping.
+ p.push(re1)
+ } else {
+ re2.Op = OpCapture
+ re2.Sub = re2.Sub0[:1]
+ re2.Sub[0] = re1
+ p.push(re2)
+ }
+ return nil
+}
+
+// parseClassChar parses a character class character at the beginning of s
+// and returns it.
+func (p *parser) parseClassChar(s, wholeClass string) (r int, rest string, err os.Error) {
+ if s == "" {
+ return 0, "", &Error{Code: ErrMissingBracket, Expr: wholeClass}
+ }
+
+ // TODO: Escapes
+
+ return nextRune(s)
+}
+
+// parseClass parses a character class at the beginning of s
+// and pushes it onto the parse stack.
+func (p *parser) parseClass(s string) (rest string, err os.Error) {
+ t := s[1:] // chop [
+ re := &Regexp{Op: OpCharClass, Flags: p.flags}
+ re.Rune = re.Rune0[:0]
+
+ sign := +1
+ if t != "" && t[0] == '^' {
+ sign = -1
+ t = t[1:]
+
+ // If character class does not match \n, add it here,
+ // so that negation later will do the right thing.
+ if p.flags&ClassNL == 0 {
+ re.Rune = append(re.Rune, '\n', '\n')
+ }
+ }
+
+ class := re.Rune
+ first := true // ] and - are okay as first char in class
+ for t == "" || t[0] != ']' || first {
+ // POSIX: - is only okay unescaped as first or last in class.
+ // Perl: - is okay anywhere.
+ if t != "" && t[0] == '-' && p.flags&PerlX == 0 && !first && (len(t) == 1 || t[1] != ']') {
+ _, size := utf8.DecodeRuneInString(t[1:])
+ return "", &Error{Code: ErrInvalidCharRange, Expr: t[:1+size]}
+ }
+ first = false
+
+ // TODO: Look for [:alnum:]
+ // TODO: Look for Unicode group.
+ // TODO: Look for Perl group.
+
+ // Single character or simple range.
+ rng := t
+ var lo, hi int
+ if lo, t, err = p.parseClassChar(t, s); err != nil {
+ return "", err
+ }
+ hi = lo
+ // [a-] means (a|-) so check for final ].
+ if len(t) >= 2 && t[0] == '-' && t[1] != ']' {
+ t = t[1:]
+ if hi, t, err = p.parseClassChar(t, s); err != nil {
+ return "", err
+ }
+ if hi < lo {
+ rng = rng[:len(rng)-len(t)]
+ return "", &Error{Code: ErrInvalidCharRange, Expr: rng}
+ }
+ }
+
+ // Expand last range if overlaps or abuts.
+ if n := len(class); n > 0 {
+ clo, chi := class[n-2], class[n-1]
+ if lo <= chi+1 && clo <= hi+1 {
+ if lo < clo {
+ class[n-2] = lo
+ }
+ if hi > chi {
+ class[n-1] = hi
+ }
+ continue
+ }
+ }
+
+ class = append(class, lo, hi)
+ }
+ t = t[1:] // chop ]
+
+ // Use &re.Rune instead of &class to avoid allocation.
+ re.Rune = class
+ class = cleanClass(&re.Rune)
+ if sign < 0 {
+ class = negateClass(class)
+ }
+ re.Rune = class
+ p.push(re)
+ return t, nil
+}
+
+// cleanClass sorts the ranges (pairs of elements of r),
+// merges them, and eliminates duplicates.
+func cleanClass(rp *[]int) []int {
+ // Sort by lo increasing, hi decreasing to break ties.
+ sort.Sort(ranges{rp})
+
+ r := *rp
+ // Merge abutting, overlapping.
+ w := 2 // write index
+ for i := 2; i < len(r); i += 2 {
+ lo, hi := r[i], r[i+1]
+ if lo <= r[w-1]+1 {
+ // merge with previous range
+ if hi > r[w-1] {
+ r[w-1] = hi
+ }
+ continue
+ }
+ // new disjoint range
+ r[w] = lo
+ r[w+1] = hi
+ w += 2
+ }
+
+ return r[:w]
+}
+
+// negateClass overwrites r and returns r's negation.
+// It assumes the class r is already clean.
+func negateClass(r []int) []int {
+ nextLo := 0 // lo end of next class to add
+ w := 0 // write index
+ for i := 0; i < len(r); i += 2 {
+ lo, hi := r[i], r[i+1]
+ if nextLo <= lo-1 {
+ r[w] = nextLo
+ r[w+1] = lo - 1
+ w += 2
+ }
+ nextLo = hi + 1
+ }
+ if nextLo <= unicode.MaxRune {
+ // It's possible for the negation to have one more
+ // range - this one - than the original class, so use append.
+ r = append(r[:w], nextLo, unicode.MaxRune)
+ }
+ return r
+}
+
+// ranges implements sort.Interface on a []rune.
+// The choice of receiver type definition is strange
+// but avoids an allocation since we already have
+// a *[]int.
+type ranges struct {
+ p *[]int
+}
+
+func (ra ranges) Less(i, j int) bool {
+ p := *ra.p
+ i *= 2
+ j *= 2
+ return p[i] < p[j] || p[i] == p[j] && p[i+1] > p[j+1]
+}
+
+func (ra ranges) Len() int {
+ return len(*ra.p) / 2
+}
+
+func (ra ranges) Swap(i, j int) {
+ p := *ra.p
+ i *= 2
+ j *= 2
+ p[i], p[i+1], p[j], p[j+1] = p[j], p[j+1], p[i], p[i+1]
+}
+
+
+func checkUTF8(s string) os.Error {
+ for s != "" {
+ rune, size := utf8.DecodeRuneInString(s)
+ if rune == utf8.RuneError && size == 1 {
+ return &Error{Code: ErrInvalidUTF8, Expr: s}
+ }
+ s = s[size:]
+ }
+ return nil
+}
+
+func nextRune(s string) (c int, t string, err os.Error) {
+ c, size := utf8.DecodeRuneInString(s)
+ if c == utf8.RuneError && size == 1 {
+ return 0, "", &Error{Code: ErrInvalidUTF8, Expr: s}
+ }
+ return c, s[size:], nil
+}
--- /dev/null
+// Copyright 2011 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package syntax
+
+import (
+ "bytes"
+ "fmt"
+ "testing"
+ "unicode"
+)
+
+var parseTests = []struct {
+ Regexp string
+ Dump string
+}{
+ // Base cases
+ {"a", "lit{a}"},
+ {"a.", "cat{lit{a}dot{}}"},
+ {"a.b", "cat{lit{a}dot{}lit{b}}"},
+ // { "ab", "str{ab}" },
+ {"ab", "cat{lit{a}lit{b}}"},
+ {"a.b.c", "cat{lit{a}dot{}lit{b}dot{}lit{c}}"},
+ // { "abc", "str{abc}" },
+ {"abc", "cat{lit{a}lit{b}lit{c}}"},
+ {"a|^", "alt{lit{a}bol{}}"},
+ // { "a|b", "cc{0x61-0x62}" },
+ {"a|b", "alt{lit{a}lit{b}}"},
+ {"(a)", "cap{lit{a}}"},
+ {"(a)|b", "alt{cap{lit{a}}lit{b}}"},
+ {"a*", "star{lit{a}}"},
+ {"a+", "plus{lit{a}}"},
+ {"a?", "que{lit{a}}"},
+ // { "a{2}", "rep{2,2 lit{a}}" },
+ // { "a{2,3}", "rep{2,3 lit{a}}" },
+ // { "a{2,}", "rep{2,-1 lit{a}}" },
+ // { "a*?", "nstar{lit{a}}" },
+ // { "a+?", "nplus{lit{a}}" },
+ // { "a??", "nque{lit{a}}" },
+ // { "a{2}?", "nrep{2,2 lit{a}}" },
+ // { "a{2,3}?", "nrep{2,3 lit{a}}" },
+ // { "a{2,}?", "nrep{2,-1 lit{a}}" },
+ {"", "emp{}"},
+ // { "|", "emp{}" }, // alt{emp{}emp{}} but got factored
+ // { "|", "alt{emp{}emp{}}" },
+ {"|x|", "alt{emp{}lit{x}emp{}}"},
+ {".", "dot{}"},
+ {"^", "bol{}"},
+ {"$", "eol{}"},
+ // { "\\|", "lit{|}" },
+ // { "\\(", "lit{(}" },
+ // { "\\)", "lit{)}" },
+ // { "\\*", "lit{*}" },
+ // { "\\+", "lit{+}" },
+ // { "\\?", "lit{?}" },
+ // { "{", "lit{{}" },
+ {"}", "lit{}}"},
+ // { "\\.", "lit{.}" },
+ // { "\\^", "lit{^}" },
+ // { "\\$", "lit{$}" },
+ // { "\\\\", "lit{\\}" },
+ {"[ace]", "cc{0x61 0x63 0x65}"},
+ {"[abc]", "cc{0x61-0x63}"},
+ {"[a-z]", "cc{0x61-0x7a}"},
+ // { "[a]", "lit{a}" },
+ {"[a]", "cc{0x61}"},
+ // { "\\-", "lit{-}" },
+ {"-", "lit{-}"},
+ // { "\\_", "lit{_}" },
+
+ // Posix and Perl extensions
+ // { "[[:lower:]]", "cc{0x61-0x7a}" },
+ // { "[a-z]", "cc{0x61-0x7a}" },
+ // { "[^[:lower:]]", "cc{0x0-0x60 0x7b-0x10ffff}" },
+ // { "[[:^lower:]]", "cc{0x0-0x60 0x7b-0x10ffff}" },
+ // { "(?i)[[:lower:]]", "cc{0x41-0x5a 0x61-0x7a 0x17f 0x212a}" },
+ // { "(?i)[a-z]", "cc{0x41-0x5a 0x61-0x7a 0x17f 0x212a}" },
+ // { "(?i)[^[:lower:]]", "cc{0x0-0x40 0x5b-0x60 0x7b-0x17e 0x180-0x2129 0x212b-0x10ffff}" },
+ // { "(?i)[[:^lower:]]", "cc{0x0-0x40 0x5b-0x60 0x7b-0x17e 0x180-0x2129 0x212b-0x10ffff}" },
+ // { "\\d", "cc{0x30-0x39}" },
+ // { "\\D", "cc{0x0-0x2f 0x3a-0x10ffff}" },
+ // { "\\s", "cc{0x9-0xa 0xc-0xd 0x20}" },
+ // { "\\S", "cc{0x0-0x8 0xb 0xe-0x1f 0x21-0x10ffff}" },
+ // { "\\w", "cc{0x30-0x39 0x41-0x5a 0x5f 0x61-0x7a}" },
+ // { "\\W", "cc{0x0-0x2f 0x3a-0x40 0x5b-0x5e 0x60 0x7b-0x10ffff}" },
+ // { "(?i)\\w", "cc{0x30-0x39 0x41-0x5a 0x5f 0x61-0x7a 0x17f 0x212a}" },
+ // { "(?i)\\W", "cc{0x0-0x2f 0x3a-0x40 0x5b-0x5e 0x60 0x7b-0x17e 0x180-0x2129 0x212b-0x10ffff}" },
+ // { "[^\\\\]", "cc{0x0-0x5b 0x5d-0x10ffff}" },
+ // { "\\C", "byte{}" },
+
+ // Unicode, negatives, and a double negative.
+ // { "\\p{Braille}", "cc{0x2800-0x28ff}" },
+ // { "\\P{Braille}", "cc{0x0-0x27ff 0x2900-0x10ffff}" },
+ // { "\\p{^Braille}", "cc{0x0-0x27ff 0x2900-0x10ffff}" },
+ // { "\\P{^Braille}", "cc{0x2800-0x28ff}" },
+
+ // More interesting regular expressions.
+ // { "a{,2}", "str{a{,2}}" },
+ // { "\\.\\^\\$\\\\", "str{.^$\\}" },
+ {"[a-zABC]", "cc{0x41-0x43 0x61-0x7a}"},
+ {"[^a]", "cc{0x0-0x60 0x62-0x10ffff}"},
+ {"[\xce\xb1-\xce\xb5\xe2\x98\xba]", "cc{0x3b1-0x3b5 0x263a}"}, // utf-8
+ // { "a*{", "cat{star{lit{a}}lit{{}}" },
+
+ // Test precedences
+ // { "(?:ab)*", "star{str{ab}}" },
+ // { "(ab)*", "star{cap{str{ab}}}" },
+ // { "ab|cd", "alt{str{ab}str{cd}}" },
+ // { "a(b|c)d", "cat{lit{a}cap{cc{0x62-0x63}}lit{d}}" },
+ {"(?:ab)*", "star{cat{lit{a}lit{b}}}"},
+ {"(ab)*", "star{cap{cat{lit{a}lit{b}}}}"},
+ {"ab|cd", "alt{cat{lit{a}lit{b}}cat{lit{c}lit{d}}}"},
+ {"a(b|c)d", "cat{lit{a}cap{alt{lit{b}lit{c}}}lit{d}}"},
+
+ // Test flattening.
+ // { "(?:a)", "lit{a}" },
+ // { "(?:ab)(?:cd)", "str{abcd}" },
+ // { "(?:a|b)|(?:c|d)", "cc{0x61-0x64}" },
+ // { "a|.", "dot{}" },
+ // { ".|a", "dot{}" },
+
+ // Test Perl quoted literals
+ // { "\\Q+|*?{[\\E", "str{+|*?{[}" },
+ // { "\\Q+\\E+", "plus{lit{+}}" },
+ // { "\\Q\\\\E", "lit{\\}" },
+ // { "\\Q\\\\\\E", "str{\\\\}" },
+
+ // Test Perl \A and \z
+ // { "(?m)^", "bol{}" },
+ // { "(?m)$", "eol{}" },
+ // { "(?-m)^", "bot{}" },
+ // { "(?-m)$", "eot{}" },
+ // { "(?m)\\A", "bot{}" },
+ // { "(?m)\\z", "eot{\\z}" },
+ // { "(?-m)\\A", "bot{}" },
+ // { "(?-m)\\z", "eot{\\z}" },
+
+ // Test named captures
+ // { "(?P<name>a)", "cap{name:lit{a}}" },
+
+ // Case-folded literals
+ // { "[Aa]", "litfold{a}" },
+
+ // Strings
+ // { "abcde", "str{abcde}" },
+ // { "[Aa][Bb]cd", "cat{strfold{ab}str{cd}}" },
+}
+
+const testFlags = MatchNL | PerlX | UnicodeGroups
+
+// Test Parse -> Dump.
+func TestParseDump(t *testing.T) {
+ for _, tt := range parseTests {
+ re, err := Parse(tt.Regexp, testFlags)
+ if err != nil {
+ t.Errorf("Parse(%#q): %v", tt.Regexp, err)
+ continue
+ }
+ d := dump(re)
+ if d != tt.Dump {
+ t.Errorf("Parse(%#q).Dump() = %#q want %#q", tt.Regexp, d, tt.Dump)
+ }
+ }
+}
+
+// dump prints a string representation of the regexp showing
+// the structure explicitly.
+func dump(re *Regexp) string {
+ var b bytes.Buffer
+ dumpRegexp(&b, re)
+ return b.String()
+}
+
+var opNames = []string{
+ OpNoMatch: "no",
+ OpEmptyMatch: "emp",
+ OpLiteral: "lit",
+ OpCharClass: "cc",
+ OpAnyCharNotNL: "dnl",
+ OpAnyChar: "dot",
+ OpBeginLine: "bol",
+ OpEndLine: "eol",
+ OpBeginText: "bot",
+ OpEndText: "eot",
+ OpWordBoundary: "wb",
+ OpNoWordBoundary: "nwb",
+ OpCapture: "cap",
+ OpStar: "star",
+ OpPlus: "plus",
+ OpQuest: "que",
+ OpRepeat: "rep",
+ OpConcat: "cat",
+ OpAlternate: "alt",
+}
+
+// dumpRegexp writes an encoding of the syntax tree for the regexp re to b.
+// It is used during testing to distinguish between parses that might print
+// the same using re's String method.
+func dumpRegexp(b *bytes.Buffer, re *Regexp) {
+ if int(re.Op) >= len(opNames) || opNames[re.Op] == "" {
+ fmt.Fprintf(b, "op%d", re.Op)
+ } else {
+ switch re.Op {
+ default:
+ b.WriteString(opNames[re.Op])
+ case OpStar, OpPlus, OpQuest, OpRepeat:
+ if re.Flags&NonGreedy != 0 {
+ b.WriteByte('n')
+ }
+ b.WriteString(opNames[re.Op])
+ case OpLiteral:
+ if len(re.Rune) > 1 {
+ b.WriteString("str")
+ } else {
+ b.WriteString("lit")
+ }
+ if re.Flags&FoldCase != 0 {
+ for _, r := range re.Rune {
+ if unicode.ToUpper(r) != r {
+ b.WriteString("fold")
+ }
+ }
+ }
+ }
+ }
+ b.WriteByte('{')
+ switch re.Op {
+ case OpEndText:
+ if re.Flags&WasDollar == 0 {
+ b.WriteString(`\z`)
+ }
+ case OpLiteral:
+ for _, r := range re.Rune {
+ b.WriteRune(r)
+ }
+ case OpConcat, OpAlternate:
+ for _, sub := range re.Sub {
+ dumpRegexp(b, sub)
+ }
+ case OpStar, OpPlus, OpQuest:
+ dumpRegexp(b, re.Sub[0])
+ case OpRepeat:
+ fmt.Fprintf(b, "%d,%d ", re.Min, re.Max)
+ dumpRegexp(b, re.Sub[0])
+ case OpCapture:
+ if re.Name != "" {
+ b.WriteString(re.Name)
+ b.WriteByte(':')
+ }
+ dumpRegexp(b, re.Sub[0])
+ case OpCharClass:
+ sep := ""
+ for i := 0; i < len(re.Rune); i += 2 {
+ b.WriteString(sep)
+ sep = " "
+ lo, hi := re.Rune[i], re.Rune[i+1]
+ if lo == hi {
+ fmt.Fprintf(b, "%#x", lo)
+ } else {
+ fmt.Fprintf(b, "%#x-%#x", lo, hi)
+ }
+ }
+ }
+ b.WriteByte('}')
+}
--- /dev/null
+// Copyright 2011 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// Package syntax parses regular expressions into syntax trees.
+// WORK IN PROGRESS.
+package syntax
+
+// Note to implementers:
+// In this package, re is always a *Regexp and r is always a rune.
+
+import (
+ "bytes"
+ "strconv"
+ "strings"
+ "unicode"
+)
+
+// A Regexp is a node in a regular expression syntax tree.
+type Regexp struct {
+ Op Op // operator
+ Flags Flags
+ Sub []*Regexp // subexpressions, if any
+ Sub0 [1]*Regexp // storage for short Sub
+ Rune []int // matched runes, for OpLiteral, OpCharClass
+ Rune0 [2]int // storage for short Rune
+ Min, Max int // min, max for OpRepeat
+ Cap int // capturing index, for OpCapture
+ Name string // capturing name, for OpCapture
+}
+
+// An Op is a single regular expression operator.
+type Op uint8
+
+// Operators are listed in precedence order, tightest binding to weakest.
+
+const (
+ OpNoMatch Op = 1 + iota // matches no strings
+ OpEmptyMatch // matches empty string
+ OpLiteral // matches Runes sequence
+ OpCharClass // matches Runes interpreted as range pair list
+ OpAnyCharNotNL // matches any character
+ OpAnyChar // matches any character
+ OpBeginLine // matches empty string at beginning of line
+ OpEndLine // matches empty string at end of line
+ OpBeginText // matches empty string at beginning of text
+ OpEndText // matches empty string at end of text
+ OpWordBoundary // matches word boundary `\b`
+ OpNoWordBoundary // matches word non-boundary `\B`
+ OpCapture // capturing subexpression with index Cap, optional name Name
+ OpStar // matches Sub[0] zero or more times
+ OpPlus // matches Sub[0] one or more times
+ OpQuest // matches Sub[0] zero or one times
+ OpRepeat // matches Sub[0] at least Min times, at most Max (Max == -1 is no limit)
+ OpConcat // matches concatenation of Subs
+ OpAlternate // matches alternation of Subs
+)
+
+const opPseudo Op = 128 // where pseudo-ops start
+
+// writeRegexp writes the Perl syntax for the regular expression re to b.
+func writeRegexp(b *bytes.Buffer, re *Regexp) {
+ switch re.Op {
+ default:
+ b.WriteString("<invalid op" + strconv.Itoa(int(re.Op)) + ">")
+ case OpNoMatch:
+ b.WriteString(`[^\x00-\x{10FFFF}]`)
+ case OpEmptyMatch:
+ b.WriteString(`(?:)`)
+ case OpLiteral:
+ for _, r := range re.Rune {
+ escape(b, r, false)
+ }
+ case OpCharClass:
+ if len(re.Rune)%2 != 0 {
+ b.WriteString(`[invalid char class]`)
+ break
+ }
+ b.WriteRune('[')
+ if len(re.Rune) > 0 && re.Rune[0] == 0 && re.Rune[len(re.Rune)-1] == unicode.MaxRune {
+ // Contains 0 and MaxRune. Probably a negated class.
+ // Print the gaps.
+ b.WriteRune('^')
+ for i := 1; i < len(re.Rune)-1; i += 2 {
+ lo, hi := re.Rune[i]+1, re.Rune[i+1]-1
+ escape(b, lo, lo == '-')
+ if lo != hi {
+ b.WriteRune('-')
+ escape(b, hi, hi == '-')
+ }
+ }
+ } else {
+ for i := 0; i < len(re.Rune); i += 2 {
+ lo, hi := re.Rune[i], re.Rune[i+1]
+ escape(b, lo, lo == '-')
+ if lo != hi {
+ b.WriteRune('-')
+ escape(b, hi, hi == '-')
+ }
+ }
+ }
+ b.WriteRune(']')
+ case OpAnyCharNotNL:
+ b.WriteString(`[^\n]`)
+ case OpAnyChar:
+ b.WriteRune('.')
+ case OpBeginLine:
+ b.WriteRune('^')
+ case OpEndLine:
+ b.WriteRune('$')
+ case OpBeginText:
+ b.WriteString(`\A`)
+ case OpEndText:
+ b.WriteString(`\z`)
+ case OpWordBoundary:
+ b.WriteString(`\b`)
+ case OpNoWordBoundary:
+ b.WriteString(`\B`)
+ case OpCapture:
+ if re.Name != "" {
+ b.WriteString(`(?P<`)
+ b.WriteString(re.Name)
+ b.WriteRune('>')
+ } else {
+ b.WriteRune('(')
+ }
+ writeRegexp(b, re.Sub[0])
+ b.WriteRune(')')
+ case OpStar, OpPlus, OpQuest, OpRepeat:
+ if sub := re.Sub[0]; sub.Op > OpCapture {
+ b.WriteString(`(?:`)
+ writeRegexp(b, sub)
+ b.WriteString(`)`)
+ } else {
+ writeRegexp(b, sub)
+ }
+ switch re.Op {
+ case OpStar:
+ b.WriteRune('*')
+ case OpPlus:
+ b.WriteRune('+')
+ case OpQuest:
+ b.WriteRune('?')
+ case OpRepeat:
+ b.WriteRune('{')
+ b.WriteString(strconv.Itoa(re.Min))
+ if re.Max != re.Min {
+ b.WriteRune(',')
+ if re.Max >= 0 {
+ b.WriteString(strconv.Itoa(re.Max))
+ }
+ }
+ b.WriteRune('}')
+ }
+ case OpConcat:
+ for _, sub := range re.Sub {
+ if sub.Op == OpAlternate {
+ b.WriteString(`(?:`)
+ writeRegexp(b, sub)
+ b.WriteString(`)`)
+ } else {
+ writeRegexp(b, sub)
+ }
+ }
+ case OpAlternate:
+ for i, sub := range re.Sub {
+ if i > 0 {
+ b.WriteRune('|')
+ }
+ writeRegexp(b, sub)
+ }
+ }
+}
+
+func (re *Regexp) String() string {
+ var b bytes.Buffer
+ writeRegexp(&b, re)
+ return b.String()
+}
+
+const meta = `\.+*?()|[]{}^$`
+
+func escape(b *bytes.Buffer, r int, force bool) {
+ if unicode.IsPrint(r) {
+ if strings.IndexRune(meta, r) >= 0 || force {
+ b.WriteRune('\\')
+ }
+ b.WriteRune(r)
+ return
+ }
+
+ switch r {
+ case '\a':
+ b.WriteString(`\a`)
+ case '\f':
+ b.WriteString(`\f`)
+ case '\n':
+ b.WriteString(`\n`)
+ case '\r':
+ b.WriteString(`\r`)
+ case '\t':
+ b.WriteString(`\t`)
+ case '\v':
+ b.WriteString(`\v`)
+ default:
+ b.WriteString(`\x{`)
+ b.WriteString(strconv.Itob(r, 16))
+ b.WriteString(`}`)
+ }
+}