From b96c3477f8edd414b9be5670f3912f7cb15eab1e Mon Sep 17 00:00:00 2001 From: Russ Cox Date: Mon, 13 Jun 2011 09:20:23 -0400 Subject: [PATCH] exp/regexp/syntax: syntax data structures, parser Parser is a work in progress but can populate most of the interesting parts of the data structure, so a good checkpoint. All the complicated Perl syntax is missing, as are various important optimizations made during parsing to the syntax tree. The plan is that exp/regexp's API will mimic regexp, and exp/regexp/syntax provides the parser directly for programs that need it (and for implementing exp/regexp). Once finished, exp/regexp will replace regexp. R=r, sam.thorogood, kevlar, edsrzf CC=golang-dev https://golang.org/cl/4538123 --- src/pkg/Makefile | 1 + src/pkg/exp/regexp/syntax/Makefile | 12 + src/pkg/exp/regexp/syntax/parse.go | 561 ++++++++++++++++++++++++ src/pkg/exp/regexp/syntax/parse_test.go | 266 +++++++++++ src/pkg/exp/regexp/syntax/regexp.go | 210 +++++++++ 5 files changed, 1050 insertions(+) create mode 100644 src/pkg/exp/regexp/syntax/Makefile create mode 100644 src/pkg/exp/regexp/syntax/parse.go create mode 100644 src/pkg/exp/regexp/syntax/parse_test.go create mode 100644 src/pkg/exp/regexp/syntax/regexp.go diff --git a/src/pkg/Makefile b/src/pkg/Makefile index a04ddc1103..c4be1da497 100644 --- a/src/pkg/Makefile +++ b/src/pkg/Makefile @@ -81,6 +81,7 @@ DIRS=\ exp/eval\ exp/gui\ exp/gui/x11\ + exp/regexp/syntax\ expvar\ flag\ fmt\ diff --git a/src/pkg/exp/regexp/syntax/Makefile b/src/pkg/exp/regexp/syntax/Makefile new file mode 100644 index 0000000000..d688a3f975 --- /dev/null +++ b/src/pkg/exp/regexp/syntax/Makefile @@ -0,0 +1,12 @@ +# Copyright 2011 The Go Authors. All rights reserved. +# Use of this source code is governed by a BSD-style +# license that can be found in the LICENSE file. + +include ../../../../Make.inc + +TARG=exp/regexp/syntax +GOFILES=\ + parse.go\ + regexp.go\ + +include ../../../../Make.pkg diff --git a/src/pkg/exp/regexp/syntax/parse.go b/src/pkg/exp/regexp/syntax/parse.go new file mode 100644 index 0000000000..0cc4620938 --- /dev/null +++ b/src/pkg/exp/regexp/syntax/parse.go @@ -0,0 +1,561 @@ +// Copyright 2011 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package syntax + +import ( + "os" + "sort" + "unicode" + "utf8" +) + +// An Error describes a failure to parse a regular expression +// and gives the offending expression. +type Error struct { + Code ErrorCode + Expr string +} + +func (e *Error) String() string { + return "error parsing regexp: " + e.Code.String() + ": `" + e.Expr + "`" +} + +// An ErrorCode describes a failure to parse a regular expression. +type ErrorCode string + +const ( + // Unexpected error + ErrInternalError ErrorCode = "regexp/syntax: internal error" + + // Parse errors + ErrInvalidCharClass ErrorCode = "invalid character class" + ErrInvalidCharRange ErrorCode = "invalid character class range" + ErrInvalidEscape ErrorCode = "invalid escape sequence" + ErrInvalidNamedCapture ErrorCode = "invalid named capture" + ErrInvalidPerlOp ErrorCode = "invalid or unsupported Perl syntax" + ErrInvalidRepeatOp ErrorCode = "invalid nested repetition operator" + ErrInvalidRepeatSize ErrorCode = "invalid repeat count" + ErrInvalidUTF8 ErrorCode = "invalid UTF-8" + ErrMissingBracket ErrorCode = "missing closing ]" + ErrMissingParen ErrorCode = "missing closing )" + ErrMissingRepeatArgument ErrorCode = "missing argument to repetition operator" + ErrTrailingBackslash ErrorCode = "trailing backslash at end of expression" +) + +func (e ErrorCode) String() string { + return string(e) +} + +// Flags control the behavior of the parser and record information about regexp context. +type Flags uint16 + +const ( + FoldCase Flags = 1 << iota // case-insensitive match + Literal // treat pattern as literal string + ClassNL // allow character classes like [^a-z] and [[:space:]] to match newline + DotNL // allow . to match newline + OneLine // treat ^ and $ as only matching at beginning and end of text + NonGreedy // make repetition operators default to non-greedy + PerlX // allow Perl extensions + UnicodeGroups // allow \p{Han}, \P{Han} for Unicode group and negation + WasDollar // regexp OpEndText was $, not \z + Simple // regexp contains no counted repetition + + MatchNL = ClassNL | DotNL + + Perl = ClassNL | OneLine | PerlX | UnicodeGroups // as close to Perl as possible + POSIX Flags = 0 // POSIX syntax +) + +// Pseudo-ops for parsing stack. +const ( + opLeftParen = opPseudo + iota + opVerticalBar +) + +type parser struct { + flags Flags // parse mode flags + stack []*Regexp // stack of parsed expressions + numCap int // number of capturing groups seen + wholeRegexp string +} + +// Parse stack manipulation. + +// push pushes the regexp re onto the parse stack and returns the regexp. +func (p *parser) push(re *Regexp) *Regexp { + // TODO: automatic concatenation + // TODO: turn character class into literal + // TODO: compute simple + + p.stack = append(p.stack, re) + return re +} + +// newLiteral returns a new OpLiteral Regexp with the given flags +func newLiteral(r int, flags Flags) *Regexp { + re := &Regexp{ + Op: OpLiteral, + Flags: flags, + } + re.Rune0[0] = r + re.Rune = re.Rune0[:1] + return re +} + +// literal pushes a literal regexp for the rune r on the stack +// and returns that regexp. +func (p *parser) literal(r int) *Regexp { + return p.push(newLiteral(r, p.flags)) +} + +// op pushes a regexp with the given op onto the stack +// and returns that regexp. +func (p *parser) op(op Op) *Regexp { + return p.push(&Regexp{Op: op, Flags: p.flags}) +} + +// repeat replaces the top stack element with itself repeated +// according to op. +func (p *parser) repeat(op Op, opstr string) os.Error { + n := len(p.stack) + if n == 0 { + return &Error{ErrMissingRepeatArgument, opstr} + } + sub := p.stack[n-1] + re := &Regexp{ + Op: op, + } + re.Sub = re.Sub0[:1] + re.Sub[0] = sub + p.stack[n-1] = re + return nil +} + +// concat replaces the top of the stack (above the topmost '|' or '(') with its concatenation. +func (p *parser) concat() *Regexp { + // TODO: Flatten concats. + + // Scan down to find pseudo-operator | or (. + i := len(p.stack) + for i > 0 && p.stack[i-1].Op < opPseudo { + i-- + } + sub := p.stack[i:] + p.stack = p.stack[:i] + + var re *Regexp + switch len(sub) { + case 0: + re = &Regexp{Op: OpEmptyMatch} + case 1: + re = sub[0] + default: + re = &Regexp{Op: OpConcat} + re.Sub = append(re.Sub0[:0], sub...) + } + return p.push(re) +} + +// alternate replaces the top of the stack (above the topmost '(') with its alternation. +func (p *parser) alternate() *Regexp { + // TODO: Flatten alternates. + + // Scan down to find pseudo-operator (. + // There are no | above (. + i := len(p.stack) + for i > 0 && p.stack[i-1].Op < opPseudo { + i-- + } + sub := p.stack[i:] + p.stack = p.stack[:i] + + var re *Regexp + switch len(sub) { + case 0: + re = &Regexp{Op: OpNoMatch} + case 1: + re = sub[0] + default: + re = &Regexp{Op: OpAlternate} + re.Sub = append(re.Sub0[:0], sub...) + } + return p.push(re) +} + +// Parsing. + +func Parse(s string, flags Flags) (*Regexp, os.Error) { + if flags&Literal != 0 { + // Trivial parser for literal string. + if err := checkUTF8(s); err != nil { + return nil, err + } + re := &Regexp{ + Op: OpLiteral, + Flags: flags, + } + re.Rune = re.Rune0[:0] // use local storage for small strings + for _, c := range s { + if len(re.Rune) >= cap(re.Rune) { + // string is too long to fit in Rune0. let Go handle it + re.Rune = []int(s) + break + } + re.Rune = append(re.Rune, c) + } + return re, nil + } + + // Otherwise, must do real work. + var ( + p parser + err os.Error + c int + op Op + ) + p.flags = flags + p.wholeRegexp = s + t := s + for t != "" { + switch t[0] { + default: + if c, t, err = nextRune(t); err != nil { + return nil, err + } + p.literal(c) + + case '(': + // TODO: Actual Perl flag parsing. + if len(t) >= 3 && t[1] == '?' && t[2] == ':' { + // non-capturing paren + p.op(opLeftParen) + t = t[3:] + break + } + p.numCap++ + p.op(opLeftParen).Cap = p.numCap + t = t[1:] + case '|': + p.concat() + if err = p.parseVerticalBar(); err != nil { + return nil, err + } + t = t[1:] + case ')': + if err = p.parseRightParen(); err != nil { + return nil, err + } + t = t[1:] + case '^': + if p.flags&OneLine != 0 { + p.op(OpBeginText) + } else { + p.op(OpBeginLine) + } + t = t[1:] + case '$': + if p.flags&OneLine != 0 { + p.op(OpEndText).Flags |= WasDollar + } else { + p.op(OpEndLine) + } + t = t[1:] + case '.': + if p.flags&DotNL != 0 { + p.op(OpAnyChar) + } else { + p.op(OpAnyCharNotNL) + } + t = t[1:] + case '[': + if t, err = p.parseClass(t); err != nil { + return nil, err + } + case '*', '+', '?': + switch t[0] { + case '*': + op = OpStar + case '+': + op = OpPlus + case '?': + op = OpQuest + } + // TODO: greedy + if err = p.repeat(op, t[0:1]); err != nil { + return nil, err + } + t = t[1:] + case '{': + return nil, os.NewError("repeat not implemented") + case '\\': + return nil, os.NewError("escape not implemented") + } + } + + p.concat() + if p.swapVerticalBar() { + // pop vertical bar + p.stack = p.stack[:len(p.stack)-1] + } + p.alternate() + + n := len(p.stack) + if n != 1 { + return nil, &Error{ErrMissingParen, s} + } + return p.stack[0], nil +} + +// parseVerticalBar handles a | in the input. +func (p *parser) parseVerticalBar() os.Error { + p.concat() + + // The concatenation we just parsed is on top of the stack. + // If it sits above an opVerticalBar, swap it below + // (things below an opVerticalBar become an alternation). + // Otherwise, push a new vertical bar. + if !p.swapVerticalBar() { + p.op(opVerticalBar) + } + + return nil +} + +// If the top of the stack is an element followed by an opVerticalBar +// swapVerticalBar swaps the two and returns true. +// Otherwise it returns false. +func (p *parser) swapVerticalBar() bool { + if n := len(p.stack); n >= 2 { + re1 := p.stack[n-1] + re2 := p.stack[n-2] + if re2.Op == opVerticalBar { + p.stack[n-2] = re1 + p.stack[n-1] = re2 + return true + } + } + return false +} + +// parseRightParen handles a ) in the input. +func (p *parser) parseRightParen() os.Error { + p.concat() + if p.swapVerticalBar() { + // pop vertical bar + p.stack = p.stack[:len(p.stack)-1] + } + p.alternate() + + n := len(p.stack) + if n < 2 { + return &Error{ErrInternalError, ""} + } + re1 := p.stack[n-1] + re2 := p.stack[n-2] + p.stack = p.stack[:n-2] + if re2.Op != opLeftParen { + return &Error{ErrMissingParen, p.wholeRegexp} + } + if re2.Cap == 0 { + // Just for grouping. + p.push(re1) + } else { + re2.Op = OpCapture + re2.Sub = re2.Sub0[:1] + re2.Sub[0] = re1 + p.push(re2) + } + return nil +} + +// parseClassChar parses a character class character at the beginning of s +// and returns it. +func (p *parser) parseClassChar(s, wholeClass string) (r int, rest string, err os.Error) { + if s == "" { + return 0, "", &Error{Code: ErrMissingBracket, Expr: wholeClass} + } + + // TODO: Escapes + + return nextRune(s) +} + +// parseClass parses a character class at the beginning of s +// and pushes it onto the parse stack. +func (p *parser) parseClass(s string) (rest string, err os.Error) { + t := s[1:] // chop [ + re := &Regexp{Op: OpCharClass, Flags: p.flags} + re.Rune = re.Rune0[:0] + + sign := +1 + if t != "" && t[0] == '^' { + sign = -1 + t = t[1:] + + // If character class does not match \n, add it here, + // so that negation later will do the right thing. + if p.flags&ClassNL == 0 { + re.Rune = append(re.Rune, '\n', '\n') + } + } + + class := re.Rune + first := true // ] and - are okay as first char in class + for t == "" || t[0] != ']' || first { + // POSIX: - is only okay unescaped as first or last in class. + // Perl: - is okay anywhere. + if t != "" && t[0] == '-' && p.flags&PerlX == 0 && !first && (len(t) == 1 || t[1] != ']') { + _, size := utf8.DecodeRuneInString(t[1:]) + return "", &Error{Code: ErrInvalidCharRange, Expr: t[:1+size]} + } + first = false + + // TODO: Look for [:alnum:] + // TODO: Look for Unicode group. + // TODO: Look for Perl group. + + // Single character or simple range. + rng := t + var lo, hi int + if lo, t, err = p.parseClassChar(t, s); err != nil { + return "", err + } + hi = lo + // [a-] means (a|-) so check for final ]. + if len(t) >= 2 && t[0] == '-' && t[1] != ']' { + t = t[1:] + if hi, t, err = p.parseClassChar(t, s); err != nil { + return "", err + } + if hi < lo { + rng = rng[:len(rng)-len(t)] + return "", &Error{Code: ErrInvalidCharRange, Expr: rng} + } + } + + // Expand last range if overlaps or abuts. + if n := len(class); n > 0 { + clo, chi := class[n-2], class[n-1] + if lo <= chi+1 && clo <= hi+1 { + if lo < clo { + class[n-2] = lo + } + if hi > chi { + class[n-1] = hi + } + continue + } + } + + class = append(class, lo, hi) + } + t = t[1:] // chop ] + + // Use &re.Rune instead of &class to avoid allocation. + re.Rune = class + class = cleanClass(&re.Rune) + if sign < 0 { + class = negateClass(class) + } + re.Rune = class + p.push(re) + return t, nil +} + +// cleanClass sorts the ranges (pairs of elements of r), +// merges them, and eliminates duplicates. +func cleanClass(rp *[]int) []int { + // Sort by lo increasing, hi decreasing to break ties. + sort.Sort(ranges{rp}) + + r := *rp + // Merge abutting, overlapping. + w := 2 // write index + for i := 2; i < len(r); i += 2 { + lo, hi := r[i], r[i+1] + if lo <= r[w-1]+1 { + // merge with previous range + if hi > r[w-1] { + r[w-1] = hi + } + continue + } + // new disjoint range + r[w] = lo + r[w+1] = hi + w += 2 + } + + return r[:w] +} + +// negateClass overwrites r and returns r's negation. +// It assumes the class r is already clean. +func negateClass(r []int) []int { + nextLo := 0 // lo end of next class to add + w := 0 // write index + for i := 0; i < len(r); i += 2 { + lo, hi := r[i], r[i+1] + if nextLo <= lo-1 { + r[w] = nextLo + r[w+1] = lo - 1 + w += 2 + } + nextLo = hi + 1 + } + if nextLo <= unicode.MaxRune { + // It's possible for the negation to have one more + // range - this one - than the original class, so use append. + r = append(r[:w], nextLo, unicode.MaxRune) + } + return r +} + +// ranges implements sort.Interface on a []rune. +// The choice of receiver type definition is strange +// but avoids an allocation since we already have +// a *[]int. +type ranges struct { + p *[]int +} + +func (ra ranges) Less(i, j int) bool { + p := *ra.p + i *= 2 + j *= 2 + return p[i] < p[j] || p[i] == p[j] && p[i+1] > p[j+1] +} + +func (ra ranges) Len() int { + return len(*ra.p) / 2 +} + +func (ra ranges) Swap(i, j int) { + p := *ra.p + i *= 2 + j *= 2 + p[i], p[i+1], p[j], p[j+1] = p[j], p[j+1], p[i], p[i+1] +} + + +func checkUTF8(s string) os.Error { + for s != "" { + rune, size := utf8.DecodeRuneInString(s) + if rune == utf8.RuneError && size == 1 { + return &Error{Code: ErrInvalidUTF8, Expr: s} + } + s = s[size:] + } + return nil +} + +func nextRune(s string) (c int, t string, err os.Error) { + c, size := utf8.DecodeRuneInString(s) + if c == utf8.RuneError && size == 1 { + return 0, "", &Error{Code: ErrInvalidUTF8, Expr: s} + } + return c, s[size:], nil +} diff --git a/src/pkg/exp/regexp/syntax/parse_test.go b/src/pkg/exp/regexp/syntax/parse_test.go new file mode 100644 index 0000000000..4ae184c773 --- /dev/null +++ b/src/pkg/exp/regexp/syntax/parse_test.go @@ -0,0 +1,266 @@ +// Copyright 2011 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package syntax + +import ( + "bytes" + "fmt" + "testing" + "unicode" +) + +var parseTests = []struct { + Regexp string + Dump string +}{ + // Base cases + {"a", "lit{a}"}, + {"a.", "cat{lit{a}dot{}}"}, + {"a.b", "cat{lit{a}dot{}lit{b}}"}, + // { "ab", "str{ab}" }, + {"ab", "cat{lit{a}lit{b}}"}, + {"a.b.c", "cat{lit{a}dot{}lit{b}dot{}lit{c}}"}, + // { "abc", "str{abc}" }, + {"abc", "cat{lit{a}lit{b}lit{c}}"}, + {"a|^", "alt{lit{a}bol{}}"}, + // { "a|b", "cc{0x61-0x62}" }, + {"a|b", "alt{lit{a}lit{b}}"}, + {"(a)", "cap{lit{a}}"}, + {"(a)|b", "alt{cap{lit{a}}lit{b}}"}, + {"a*", "star{lit{a}}"}, + {"a+", "plus{lit{a}}"}, + {"a?", "que{lit{a}}"}, + // { "a{2}", "rep{2,2 lit{a}}" }, + // { "a{2,3}", "rep{2,3 lit{a}}" }, + // { "a{2,}", "rep{2,-1 lit{a}}" }, + // { "a*?", "nstar{lit{a}}" }, + // { "a+?", "nplus{lit{a}}" }, + // { "a??", "nque{lit{a}}" }, + // { "a{2}?", "nrep{2,2 lit{a}}" }, + // { "a{2,3}?", "nrep{2,3 lit{a}}" }, + // { "a{2,}?", "nrep{2,-1 lit{a}}" }, + {"", "emp{}"}, + // { "|", "emp{}" }, // alt{emp{}emp{}} but got factored + // { "|", "alt{emp{}emp{}}" }, + {"|x|", "alt{emp{}lit{x}emp{}}"}, + {".", "dot{}"}, + {"^", "bol{}"}, + {"$", "eol{}"}, + // { "\\|", "lit{|}" }, + // { "\\(", "lit{(}" }, + // { "\\)", "lit{)}" }, + // { "\\*", "lit{*}" }, + // { "\\+", "lit{+}" }, + // { "\\?", "lit{?}" }, + // { "{", "lit{{}" }, + {"}", "lit{}}"}, + // { "\\.", "lit{.}" }, + // { "\\^", "lit{^}" }, + // { "\\$", "lit{$}" }, + // { "\\\\", "lit{\\}" }, + {"[ace]", "cc{0x61 0x63 0x65}"}, + {"[abc]", "cc{0x61-0x63}"}, + {"[a-z]", "cc{0x61-0x7a}"}, + // { "[a]", "lit{a}" }, + {"[a]", "cc{0x61}"}, + // { "\\-", "lit{-}" }, + {"-", "lit{-}"}, + // { "\\_", "lit{_}" }, + + // Posix and Perl extensions + // { "[[:lower:]]", "cc{0x61-0x7a}" }, + // { "[a-z]", "cc{0x61-0x7a}" }, + // { "[^[:lower:]]", "cc{0x0-0x60 0x7b-0x10ffff}" }, + // { "[[:^lower:]]", "cc{0x0-0x60 0x7b-0x10ffff}" }, + // { "(?i)[[:lower:]]", "cc{0x41-0x5a 0x61-0x7a 0x17f 0x212a}" }, + // { "(?i)[a-z]", "cc{0x41-0x5a 0x61-0x7a 0x17f 0x212a}" }, + // { "(?i)[^[:lower:]]", "cc{0x0-0x40 0x5b-0x60 0x7b-0x17e 0x180-0x2129 0x212b-0x10ffff}" }, + // { "(?i)[[:^lower:]]", "cc{0x0-0x40 0x5b-0x60 0x7b-0x17e 0x180-0x2129 0x212b-0x10ffff}" }, + // { "\\d", "cc{0x30-0x39}" }, + // { "\\D", "cc{0x0-0x2f 0x3a-0x10ffff}" }, + // { "\\s", "cc{0x9-0xa 0xc-0xd 0x20}" }, + // { "\\S", "cc{0x0-0x8 0xb 0xe-0x1f 0x21-0x10ffff}" }, + // { "\\w", "cc{0x30-0x39 0x41-0x5a 0x5f 0x61-0x7a}" }, + // { "\\W", "cc{0x0-0x2f 0x3a-0x40 0x5b-0x5e 0x60 0x7b-0x10ffff}" }, + // { "(?i)\\w", "cc{0x30-0x39 0x41-0x5a 0x5f 0x61-0x7a 0x17f 0x212a}" }, + // { "(?i)\\W", "cc{0x0-0x2f 0x3a-0x40 0x5b-0x5e 0x60 0x7b-0x17e 0x180-0x2129 0x212b-0x10ffff}" }, + // { "[^\\\\]", "cc{0x0-0x5b 0x5d-0x10ffff}" }, + // { "\\C", "byte{}" }, + + // Unicode, negatives, and a double negative. + // { "\\p{Braille}", "cc{0x2800-0x28ff}" }, + // { "\\P{Braille}", "cc{0x0-0x27ff 0x2900-0x10ffff}" }, + // { "\\p{^Braille}", "cc{0x0-0x27ff 0x2900-0x10ffff}" }, + // { "\\P{^Braille}", "cc{0x2800-0x28ff}" }, + + // More interesting regular expressions. + // { "a{,2}", "str{a{,2}}" }, + // { "\\.\\^\\$\\\\", "str{.^$\\}" }, + {"[a-zABC]", "cc{0x41-0x43 0x61-0x7a}"}, + {"[^a]", "cc{0x0-0x60 0x62-0x10ffff}"}, + {"[\xce\xb1-\xce\xb5\xe2\x98\xba]", "cc{0x3b1-0x3b5 0x263a}"}, // utf-8 + // { "a*{", "cat{star{lit{a}}lit{{}}" }, + + // Test precedences + // { "(?:ab)*", "star{str{ab}}" }, + // { "(ab)*", "star{cap{str{ab}}}" }, + // { "ab|cd", "alt{str{ab}str{cd}}" }, + // { "a(b|c)d", "cat{lit{a}cap{cc{0x62-0x63}}lit{d}}" }, + {"(?:ab)*", "star{cat{lit{a}lit{b}}}"}, + {"(ab)*", "star{cap{cat{lit{a}lit{b}}}}"}, + {"ab|cd", "alt{cat{lit{a}lit{b}}cat{lit{c}lit{d}}}"}, + {"a(b|c)d", "cat{lit{a}cap{alt{lit{b}lit{c}}}lit{d}}"}, + + // Test flattening. + // { "(?:a)", "lit{a}" }, + // { "(?:ab)(?:cd)", "str{abcd}" }, + // { "(?:a|b)|(?:c|d)", "cc{0x61-0x64}" }, + // { "a|.", "dot{}" }, + // { ".|a", "dot{}" }, + + // Test Perl quoted literals + // { "\\Q+|*?{[\\E", "str{+|*?{[}" }, + // { "\\Q+\\E+", "plus{lit{+}}" }, + // { "\\Q\\\\E", "lit{\\}" }, + // { "\\Q\\\\\\E", "str{\\\\}" }, + + // Test Perl \A and \z + // { "(?m)^", "bol{}" }, + // { "(?m)$", "eol{}" }, + // { "(?-m)^", "bot{}" }, + // { "(?-m)$", "eot{}" }, + // { "(?m)\\A", "bot{}" }, + // { "(?m)\\z", "eot{\\z}" }, + // { "(?-m)\\A", "bot{}" }, + // { "(?-m)\\z", "eot{\\z}" }, + + // Test named captures + // { "(?Pa)", "cap{name:lit{a}}" }, + + // Case-folded literals + // { "[Aa]", "litfold{a}" }, + + // Strings + // { "abcde", "str{abcde}" }, + // { "[Aa][Bb]cd", "cat{strfold{ab}str{cd}}" }, +} + +const testFlags = MatchNL | PerlX | UnicodeGroups + +// Test Parse -> Dump. +func TestParseDump(t *testing.T) { + for _, tt := range parseTests { + re, err := Parse(tt.Regexp, testFlags) + if err != nil { + t.Errorf("Parse(%#q): %v", tt.Regexp, err) + continue + } + d := dump(re) + if d != tt.Dump { + t.Errorf("Parse(%#q).Dump() = %#q want %#q", tt.Regexp, d, tt.Dump) + } + } +} + +// dump prints a string representation of the regexp showing +// the structure explicitly. +func dump(re *Regexp) string { + var b bytes.Buffer + dumpRegexp(&b, re) + return b.String() +} + +var opNames = []string{ + OpNoMatch: "no", + OpEmptyMatch: "emp", + OpLiteral: "lit", + OpCharClass: "cc", + OpAnyCharNotNL: "dnl", + OpAnyChar: "dot", + OpBeginLine: "bol", + OpEndLine: "eol", + OpBeginText: "bot", + OpEndText: "eot", + OpWordBoundary: "wb", + OpNoWordBoundary: "nwb", + OpCapture: "cap", + OpStar: "star", + OpPlus: "plus", + OpQuest: "que", + OpRepeat: "rep", + OpConcat: "cat", + OpAlternate: "alt", +} + +// dumpRegexp writes an encoding of the syntax tree for the regexp re to b. +// It is used during testing to distinguish between parses that might print +// the same using re's String method. +func dumpRegexp(b *bytes.Buffer, re *Regexp) { + if int(re.Op) >= len(opNames) || opNames[re.Op] == "" { + fmt.Fprintf(b, "op%d", re.Op) + } else { + switch re.Op { + default: + b.WriteString(opNames[re.Op]) + case OpStar, OpPlus, OpQuest, OpRepeat: + if re.Flags&NonGreedy != 0 { + b.WriteByte('n') + } + b.WriteString(opNames[re.Op]) + case OpLiteral: + if len(re.Rune) > 1 { + b.WriteString("str") + } else { + b.WriteString("lit") + } + if re.Flags&FoldCase != 0 { + for _, r := range re.Rune { + if unicode.ToUpper(r) != r { + b.WriteString("fold") + } + } + } + } + } + b.WriteByte('{') + switch re.Op { + case OpEndText: + if re.Flags&WasDollar == 0 { + b.WriteString(`\z`) + } + case OpLiteral: + for _, r := range re.Rune { + b.WriteRune(r) + } + case OpConcat, OpAlternate: + for _, sub := range re.Sub { + dumpRegexp(b, sub) + } + case OpStar, OpPlus, OpQuest: + dumpRegexp(b, re.Sub[0]) + case OpRepeat: + fmt.Fprintf(b, "%d,%d ", re.Min, re.Max) + dumpRegexp(b, re.Sub[0]) + case OpCapture: + if re.Name != "" { + b.WriteString(re.Name) + b.WriteByte(':') + } + dumpRegexp(b, re.Sub[0]) + case OpCharClass: + sep := "" + for i := 0; i < len(re.Rune); i += 2 { + b.WriteString(sep) + sep = " " + lo, hi := re.Rune[i], re.Rune[i+1] + if lo == hi { + fmt.Fprintf(b, "%#x", lo) + } else { + fmt.Fprintf(b, "%#x-%#x", lo, hi) + } + } + } + b.WriteByte('}') +} diff --git a/src/pkg/exp/regexp/syntax/regexp.go b/src/pkg/exp/regexp/syntax/regexp.go new file mode 100644 index 0000000000..a0c465967f --- /dev/null +++ b/src/pkg/exp/regexp/syntax/regexp.go @@ -0,0 +1,210 @@ +// Copyright 2011 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// Package syntax parses regular expressions into syntax trees. +// WORK IN PROGRESS. +package syntax + +// Note to implementers: +// In this package, re is always a *Regexp and r is always a rune. + +import ( + "bytes" + "strconv" + "strings" + "unicode" +) + +// A Regexp is a node in a regular expression syntax tree. +type Regexp struct { + Op Op // operator + Flags Flags + Sub []*Regexp // subexpressions, if any + Sub0 [1]*Regexp // storage for short Sub + Rune []int // matched runes, for OpLiteral, OpCharClass + Rune0 [2]int // storage for short Rune + Min, Max int // min, max for OpRepeat + Cap int // capturing index, for OpCapture + Name string // capturing name, for OpCapture +} + +// An Op is a single regular expression operator. +type Op uint8 + +// Operators are listed in precedence order, tightest binding to weakest. + +const ( + OpNoMatch Op = 1 + iota // matches no strings + OpEmptyMatch // matches empty string + OpLiteral // matches Runes sequence + OpCharClass // matches Runes interpreted as range pair list + OpAnyCharNotNL // matches any character + OpAnyChar // matches any character + OpBeginLine // matches empty string at beginning of line + OpEndLine // matches empty string at end of line + OpBeginText // matches empty string at beginning of text + OpEndText // matches empty string at end of text + OpWordBoundary // matches word boundary `\b` + OpNoWordBoundary // matches word non-boundary `\B` + OpCapture // capturing subexpression with index Cap, optional name Name + OpStar // matches Sub[0] zero or more times + OpPlus // matches Sub[0] one or more times + OpQuest // matches Sub[0] zero or one times + OpRepeat // matches Sub[0] at least Min times, at most Max (Max == -1 is no limit) + OpConcat // matches concatenation of Subs + OpAlternate // matches alternation of Subs +) + +const opPseudo Op = 128 // where pseudo-ops start + +// writeRegexp writes the Perl syntax for the regular expression re to b. +func writeRegexp(b *bytes.Buffer, re *Regexp) { + switch re.Op { + default: + b.WriteString("") + case OpNoMatch: + b.WriteString(`[^\x00-\x{10FFFF}]`) + case OpEmptyMatch: + b.WriteString(`(?:)`) + case OpLiteral: + for _, r := range re.Rune { + escape(b, r, false) + } + case OpCharClass: + if len(re.Rune)%2 != 0 { + b.WriteString(`[invalid char class]`) + break + } + b.WriteRune('[') + if len(re.Rune) > 0 && re.Rune[0] == 0 && re.Rune[len(re.Rune)-1] == unicode.MaxRune { + // Contains 0 and MaxRune. Probably a negated class. + // Print the gaps. + b.WriteRune('^') + for i := 1; i < len(re.Rune)-1; i += 2 { + lo, hi := re.Rune[i]+1, re.Rune[i+1]-1 + escape(b, lo, lo == '-') + if lo != hi { + b.WriteRune('-') + escape(b, hi, hi == '-') + } + } + } else { + for i := 0; i < len(re.Rune); i += 2 { + lo, hi := re.Rune[i], re.Rune[i+1] + escape(b, lo, lo == '-') + if lo != hi { + b.WriteRune('-') + escape(b, hi, hi == '-') + } + } + } + b.WriteRune(']') + case OpAnyCharNotNL: + b.WriteString(`[^\n]`) + case OpAnyChar: + b.WriteRune('.') + case OpBeginLine: + b.WriteRune('^') + case OpEndLine: + b.WriteRune('$') + case OpBeginText: + b.WriteString(`\A`) + case OpEndText: + b.WriteString(`\z`) + case OpWordBoundary: + b.WriteString(`\b`) + case OpNoWordBoundary: + b.WriteString(`\B`) + case OpCapture: + if re.Name != "" { + b.WriteString(`(?P<`) + b.WriteString(re.Name) + b.WriteRune('>') + } else { + b.WriteRune('(') + } + writeRegexp(b, re.Sub[0]) + b.WriteRune(')') + case OpStar, OpPlus, OpQuest, OpRepeat: + if sub := re.Sub[0]; sub.Op > OpCapture { + b.WriteString(`(?:`) + writeRegexp(b, sub) + b.WriteString(`)`) + } else { + writeRegexp(b, sub) + } + switch re.Op { + case OpStar: + b.WriteRune('*') + case OpPlus: + b.WriteRune('+') + case OpQuest: + b.WriteRune('?') + case OpRepeat: + b.WriteRune('{') + b.WriteString(strconv.Itoa(re.Min)) + if re.Max != re.Min { + b.WriteRune(',') + if re.Max >= 0 { + b.WriteString(strconv.Itoa(re.Max)) + } + } + b.WriteRune('}') + } + case OpConcat: + for _, sub := range re.Sub { + if sub.Op == OpAlternate { + b.WriteString(`(?:`) + writeRegexp(b, sub) + b.WriteString(`)`) + } else { + writeRegexp(b, sub) + } + } + case OpAlternate: + for i, sub := range re.Sub { + if i > 0 { + b.WriteRune('|') + } + writeRegexp(b, sub) + } + } +} + +func (re *Regexp) String() string { + var b bytes.Buffer + writeRegexp(&b, re) + return b.String() +} + +const meta = `\.+*?()|[]{}^$` + +func escape(b *bytes.Buffer, r int, force bool) { + if unicode.IsPrint(r) { + if strings.IndexRune(meta, r) >= 0 || force { + b.WriteRune('\\') + } + b.WriteRune(r) + return + } + + switch r { + case '\a': + b.WriteString(`\a`) + case '\f': + b.WriteString(`\f`) + case '\n': + b.WriteString(`\n`) + case '\r': + b.WriteString(`\r`) + case '\t': + b.WriteString(`\t`) + case '\v': + b.WriteString(`\v`) + default: + b.WriteString(`\x{`) + b.WriteString(strconv.Itob(r, 16)) + b.WriteString(`}`) + } +} -- 2.48.1