regexp: hide one-pass code from exported API

author Russ Cox <rsc@golang.org>

Wed, 28 May 2014 18:08:44 +0000 (14:08 -0400)

committer Russ Cox <rsc@golang.org>

Wed, 28 May 2014 18:08:44 +0000 (14:08 -0400)
author Russ Cox <rsc@golang.org>
Wed, 28 May 2014 18:08:44 +0000 (14:08 -0400)
committer Russ Cox <rsc@golang.org>
Wed, 28 May 2014 18:08:44 +0000 (14:08 -0400)
diff --git a/api/next.txt b/api/next.txt

index eac2d203d748523de940feb8f03752521f42c35a..cdbac57f22d3de5133fda37c12c20860a15cd085 100644 (file)
--- a/api/next.txt
+++ b/api/next.txt
@@ -330,12 +330,7 @@ pkg net/http, type Server struct, ConnState func(net.Conn, ConnState)
  pkg net/http, type Server struct, ErrorLog *log.Logger
  pkg net/http, type Transport struct, TLSHandshakeTimeout time.Duration
  pkg regexp/syntax, method (*Inst) MatchRunePos(int32) int
-pkg regexp/syntax, method (*Inst) OnePassNext(int32) uint32
-pkg regexp/syntax, method (*Prog) CompileOnePass() *Prog
-pkg regexp/syntax, method (*Prog) OnePassPrefix() (string, bool, uint32)
  pkg regexp/syntax, method (InstOp) String() string
-pkg regexp/syntax, type Inst struct, Next []uint32
-pkg regexp/syntax, var NotOnePass *Prog
  pkg runtime/debug, func SetPanicOnFault(bool) bool
  pkg runtime/debug, func WriteHeapDump(uintptr)
  pkg sync, method (*Pool) Get() interface{}
diff --git a/src/pkg/regexp/exec.go b/src/pkg/regexp/exec.go

index 22c33cfa5680d091146b528648f0d8001f5cd6a4..c4cb201f64251402978118a6f1b6d831954fe412 100644 (file)
--- a/src/pkg/regexp/exec.go
+++ b/src/pkg/regexp/exec.go
@@ -37,7 +37,7 @@ type thread struct {
  type machine struct {
         re       *Regexp      // corresponding Regexp
         p        *syntax.Prog // compiled program
-       op       *syntax.Prog // compiled onepass program, or syntax.NotOnePass
+       op       *onePassProg // compiled onepass program, or notOnePass
         q0, q1   queue        // two queues for runq, nextq
         pool     []*thread    // pool of available threads
         matched  bool         // whether a match was found
@@ -67,7 +67,7 @@ func (m *machine) newInputReader(r io.RuneReader) input {
  }
  
  // progMachine returns a new machine running the prog p.
-func progMachine(p, op *syntax.Prog) *machine {
+func progMachine(p *syntax.Prog, op *onePassProg) *machine {
         m := &machine{p: p, op: op}
         n := len(m.p.Inst)
         m.q0 = queue{make([]uint32, n), make([]entry, 0, n)}
@@ -382,7 +382,7 @@ func (m *machine) onepass(i input, pos int) bool {
                         }
                 // peek at the input rune to see which branch of the Alt to take
                 case syntax.InstAlt, syntax.InstAltMatch:
-                       pc = int(inst.OnePassNext(r))
+                       pc = int(onePassNext(&inst, r))
                         continue
                 case syntax.InstFail:
                         return m.matched
@@ -429,7 +429,7 @@ func (re *Regexp) doExecute(r io.RuneReader, b []byte, s string, pos int, ncap i
         } else {
                 i = m.newInputString(s)
         }
-       if m.op != syntax.NotOnePass {
+       if m.op != notOnePass {
                 if !m.onepass(i, pos) {
                         re.put(m)
                         return nil
diff --git a/src/pkg/regexp/onepass.go b/src/pkg/regexp/onepass.go

new file mode 100644 (file)

index 0000000..501fb28
--- /dev/null
+++ b/src/pkg/regexp/onepass.go
@@ -0,0 +1,582 @@
+// Copyright 2014 The Go Authors.  All rights reserved.
+
+package regexp
+
+import (
+       "bytes"
+       "regexp/syntax"
+       "sort"
+       "unicode"
+)
+
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// "One-pass" regexp execution.
+// Some regexps can be analyzed to determine that they never need
+// backtracking: they are guaranteed to run in one pass over the string
+// without bothering to save all the usual NFA state.
+// Detect those and execute them more quickly.
+
+// A onePassProg is a compiled one-pass regular expression program.
+// It is the same as syntax.Prog except for the use of onePassInst.
+type onePassProg struct {
+       Inst   []onePassInst
+       Start  int // index of start instruction
+       NumCap int // number of InstCapture insts in re
+}
+
+// A onePassInst is a single instruction in a one-pass regular expression program.
+// It is the same as syntax.Inst except for the new 'Next' field.
+type onePassInst struct {
+       syntax.Inst
+       Next []uint32
+}
+
+// OnePassPrefix returns a literal string that all matches for the
+// regexp must start with.  Complete is true if the prefix
+// is the entire match. Pc is the index of the last rune instruction
+// in the string. The OnePassPrefix skips over the mandatory
+// EmptyBeginText
+func onePassPrefix(p *syntax.Prog) (prefix string, complete bool, pc uint32) {
+       i := &p.Inst[p.Start]
+       if i.Op != syntax.InstEmptyWidth || (syntax.EmptyOp(i.Arg))&syntax.EmptyBeginText == 0 {
+               return "", i.Op == syntax.InstMatch, uint32(p.Start)
+       }
+       pc = i.Out
+       i = &p.Inst[pc]
+       for i.Op == syntax.InstNop {
+               pc = i.Out
+               i = &p.Inst[pc]
+       }
+       // Avoid allocation of buffer if prefix is empty.
+       if iop(i) != syntax.InstRune || len(i.Rune) != 1 {
+               return "", i.Op == syntax.InstMatch, uint32(p.Start)
+       }
+
+       // Have prefix; gather characters.
+       var buf bytes.Buffer
+       for iop(i) == syntax.InstRune && len(i.Rune) == 1 && syntax.Flags(i.Arg)&syntax.FoldCase == 0 {
+               buf.WriteRune(i.Rune[0])
+               pc, i = i.Out, &p.Inst[i.Out]
+       }
+       return buf.String(), i.Op == syntax.InstEmptyWidth && (syntax.EmptyOp(i.Arg))&syntax.EmptyBeginText != 0, pc
+}
+
+// OnePassNext selects the next actionable state of the prog, based on the input character.
+// It should only be called when i.Op == InstAlt or InstAltMatch, and from the one-pass machine.
+// One of the alternates may ultimately lead without input to end of line. If the instruction
+// is InstAltMatch the path to the InstMatch is in i.Out, the normal node in i.Next.
+func onePassNext(i *onePassInst, r rune) uint32 {
+       next := i.MatchRunePos(r)
+       if next >= 0 {
+               return i.Next[next]
+       }
+       if i.Op == syntax.InstAltMatch {
+               return i.Out
+       }
+       return 0
+}
+
+func iop(i *syntax.Inst) syntax.InstOp {
+       op := i.Op
+       switch op {
+       case syntax.InstRune1, syntax.InstRuneAny, syntax.InstRuneAnyNotNL:
+               op = syntax.InstRune
+       }
+       return op
+}
+
+// Sparse Array implementation is used as a queueOnePass.
+type queueOnePass struct {
+       sparse          []uint32
+       dense           []uint32
+       size, nextIndex uint32
+}
+
+func (q *queueOnePass) empty() bool {
+       return q.nextIndex >= q.size
+}
+
+func (q *queueOnePass) next() (n uint32) {
+       n = q.dense[q.nextIndex]
+       q.nextIndex++
+       return
+}
+
+func (q *queueOnePass) clear() {
+       q.size = 0
+       q.nextIndex = 0
+}
+
+func (q *queueOnePass) reset() {
+       q.nextIndex = 0
+}
+
+func (q *queueOnePass) contains(u uint32) bool {
+       if u >= uint32(len(q.sparse)) {
+               return false
+       }
+       return q.sparse[u] < q.size && q.dense[q.sparse[u]] == u
+}
+
+func (q *queueOnePass) insert(u uint32) {
+       if !q.contains(u) {
+               q.insertNew(u)
+       }
+}
+
+func (q *queueOnePass) insertNew(u uint32) {
+       if u >= uint32(len(q.sparse)) {
+               return
+       }
+       q.sparse[u] = q.size
+       q.dense[q.size] = u
+       q.size++
+}
+
+func newQueue(size int) (q *queueOnePass) {
+       return &queueOnePass{
+               sparse: make([]uint32, size),
+               dense:  make([]uint32, size),
+       }
+}
+
+// mergeRuneSets merges two non-intersecting runesets, and returns the merged result,
+// and a NextIp array. The idea is that if a rune matches the OnePassRunes at index
+// i, NextIp[i/2] is the target. If the input sets intersect, an empty runeset and a
+// NextIp array with the single element mergeFailed is returned.
+// The code assumes that both inputs contain ordered and non-intersecting rune pairs.
+const mergeFailed = uint32(0xffffffff)
+
+var (
+       noRune = []rune{}
+       noNext = []uint32{mergeFailed}
+)
+
+func mergeRuneSets(leftRunes, rightRunes *[]rune, leftPC, rightPC uint32) ([]rune, []uint32) {
+       leftLen := len(*leftRunes)
+       rightLen := len(*rightRunes)
+       if leftLen&0x1 != 0 || rightLen&0x1 != 0 {
+               panic("mergeRuneSets odd length []rune")
+       }
+       var (
+               lx, rx int
+       )
+       merged := make([]rune, 0)
+       next := make([]uint32, 0)
+       ok := true
+       defer func() {
+               if !ok {
+                       merged = nil
+                       next = nil
+               }
+       }()
+
+       ix := -1
+       extend := func(newLow *int, newArray *[]rune, pc uint32) bool {
+               if ix > 0 && (*newArray)[*newLow] <= merged[ix] {
+                       return false
+               }
+               merged = append(merged, (*newArray)[*newLow], (*newArray)[*newLow+1])
+               *newLow += 2
+               ix += 2
+               next = append(next, pc)
+               return true
+       }
+
+       for lx < leftLen || rx < rightLen {
+               switch {
+               case rx >= rightLen:
+                       ok = extend(&lx, leftRunes, leftPC)
+               case lx >= leftLen:
+                       ok = extend(&rx, rightRunes, rightPC)
+               case (*rightRunes)[rx] < (*leftRunes)[lx]:
+                       ok = extend(&rx, rightRunes, rightPC)
+               default:
+                       ok = extend(&lx, leftRunes, leftPC)
+               }
+               if !ok {
+                       return noRune, noNext
+               }
+       }
+       return merged, next
+}
+
+// cleanupOnePass drops working memory, and restores certain shortcut instructions.
+func cleanupOnePass(prog *onePassProg, original *syntax.Prog) {
+       for ix, instOriginal := range original.Inst {
+               switch instOriginal.Op {
+               case syntax.InstAlt, syntax.InstAltMatch, syntax.InstRune:
+               case syntax.InstCapture, syntax.InstEmptyWidth, syntax.InstNop, syntax.InstMatch, syntax.InstFail:
+                       prog.Inst[ix].Next = nil
+               case syntax.InstRune1, syntax.InstRuneAny, syntax.InstRuneAnyNotNL:
+                       prog.Inst[ix].Next = nil
+                       prog.Inst[ix] = onePassInst{Inst: instOriginal}
+               }
+       }
+}
+
+// onePassCopy creates a copy of the original Prog, as we'll be modifying it
+func onePassCopy(prog *syntax.Prog) *onePassProg {
+       p := &onePassProg{
+               Start:  prog.Start,
+               NumCap: prog.NumCap,
+       }
+       for _, inst := range prog.Inst {
+               p.Inst = append(p.Inst, onePassInst{Inst: inst})
+       }
+
+       // rewrites one or more common Prog constructs that enable some otherwise
+       // non-onepass Progs to be onepass. A:BD (for example) means an InstAlt at
+       // ip A, that points to ips B & C.
+       // A:BC + B:DA => A:BC + B:CD
+       // A:BC + B:DC => A:DC + B:DC
+       for pc := range p.Inst {
+               switch p.Inst[pc].Op {
+               default:
+                       continue
+               case syntax.InstAlt, syntax.InstAltMatch:
+                       // A:Bx + B:Ay
+                       p_A_Other := &p.Inst[pc].Out
+                       p_A_Alt := &p.Inst[pc].Arg
+                       // make sure a target is another Alt
+                       instAlt := p.Inst[*p_A_Alt]
+                       if !(instAlt.Op == syntax.InstAlt || instAlt.Op == syntax.InstAltMatch) {
+                               p_A_Alt, p_A_Other = p_A_Other, p_A_Alt
+                               instAlt = p.Inst[*p_A_Alt]
+                               if !(instAlt.Op == syntax.InstAlt || instAlt.Op == syntax.InstAltMatch) {
+                                       continue
+                               }
+                       }
+                       instOther := p.Inst[*p_A_Other]
+                       // Analyzing both legs pointing to Alts is for another day
+                       if instOther.Op == syntax.InstAlt || instOther.Op == syntax.InstAltMatch {
+                               // too complicated
+                               continue
+                       }
+                       // simple empty transition loop
+                       // A:BC + B:DA => A:BC + B:DC
+                       p_B_Alt := &p.Inst[*p_A_Alt].Out
+                       p_B_Other := &p.Inst[*p_A_Alt].Arg
+                       patch := false
+                       if instAlt.Out == uint32(pc) {
+                               patch = true
+                       } else if instAlt.Arg == uint32(pc) {
+                               patch = true
+                               p_B_Alt, p_B_Other = p_B_Other, p_B_Alt
+                       }
+                       if patch {
+                               *p_B_Alt = *p_A_Other
+                       }
+
+                       // empty transition to common target
+                       // A:BC + B:DC => A:DC + B:DC
+                       if *p_A_Other == *p_B_Alt {
+                               *p_A_Alt = *p_B_Other
+                       }
+               }
+       }
+       return p
+}
+
+// runeSlice exists to permit sorting the case-folded rune sets.
+type runeSlice []rune
+
+func (p runeSlice) Len() int           { return len(p) }
+func (p runeSlice) Less(i, j int) bool { return p[i] < p[j] }
+func (p runeSlice) Swap(i, j int)      { p[i], p[j] = p[j], p[i] }
+
+// Sort is a convenience method.
+func (p runeSlice) Sort() {
+       sort.Sort(p)
+}
+
+var anyRuneNotNL = []rune{0, '\n' - 1, '\n' + 1, unicode.MaxRune}
+var anyRune = []rune{0, unicode.MaxRune}
+
+// makeOnePass creates a onepass Prog, if possible. It is possible if at any alt,
+// the match engine can always tell which branch to take. The routine may modify
+// p if it is turned into a onepass Prog. If it isn't possible for this to be a
+// onepass Prog, the Prog notOnePass is returned. makeOnePass is recursive
+// to the size of the Prog.
+func makeOnePass(p *onePassProg) *onePassProg {
+       // If the machine is very long, it's not worth the time to check if we can use one pass.
+       if len(p.Inst) >= 1000 {
+               return notOnePass
+       }
+
+       var (
+               instQueue    = newQueue(len(p.Inst))
+               visitQueue   = newQueue(len(p.Inst))
+               build        func(uint32, *queueOnePass)
+               check        func(uint32, map[uint32]bool) bool
+               onePassRunes = make([][]rune, len(p.Inst))
+       )
+       build = func(pc uint32, q *queueOnePass) {
+               if q.contains(pc) {
+                       return
+               }
+               inst := p.Inst[pc]
+               switch inst.Op {
+               case syntax.InstAlt, syntax.InstAltMatch:
+                       q.insert(inst.Out)
+                       build(inst.Out, q)
+                       q.insert(inst.Arg)
+               case syntax.InstMatch, syntax.InstFail:
+               default:
+                       q.insert(inst.Out)
+               }
+       }
+
+       // check that paths from Alt instructions are unambiguous, and rebuild the new
+       // program as a onepass program
+       check = func(pc uint32, m map[uint32]bool) (ok bool) {
+               ok = true
+               inst := &p.Inst[pc]
+               if visitQueue.contains(pc) {
+                       return
+               }
+               visitQueue.insert(pc)
+               switch inst.Op {
+               case syntax.InstAlt, syntax.InstAltMatch:
+                       ok = check(inst.Out, m) && check(inst.Arg, m)
+                       // check no-input paths to InstMatch
+                       matchOut := m[inst.Out]
+                       matchArg := m[inst.Arg]
+                       if matchOut && matchArg {
+                               ok = false
+                               break
+                       }
+                       // Match on empty goes in inst.Out
+                       if matchArg {
+                               inst.Out, inst.Arg = inst.Arg, inst.Out
+                               matchOut, matchArg = matchArg, matchOut
+                       }
+                       if matchOut {
+                               m[pc] = true
+                               inst.Op = syntax.InstAltMatch
+                       }
+
+                       // build a dispatch operator from the two legs of the alt.
+                       onePassRunes[pc], inst.Next = mergeRuneSets(
+                               &onePassRunes[inst.Out], &onePassRunes[inst.Arg], inst.Out, inst.Arg)
+                       if len(inst.Next) > 0 && inst.Next[0] == mergeFailed {
+                               ok = false
+                               break
+                       }
+               case syntax.InstCapture, syntax.InstNop:
+                       ok = check(inst.Out, m)
+                       m[pc] = m[inst.Out]
+                       // pass matching runes back through these no-ops.
+                       onePassRunes[pc] = append([]rune{}, onePassRunes[inst.Out]...)
+                       inst.Next = []uint32{}
+                       for i := len(onePassRunes[pc]) / 2; i >= 0; i-- {
+                               inst.Next = append(inst.Next, inst.Out)
+                       }
+               case syntax.InstEmptyWidth:
+                       ok = check(inst.Out, m)
+                       m[pc] = m[inst.Out]
+                       onePassRunes[pc] = append([]rune{}, onePassRunes[inst.Out]...)
+                       inst.Next = []uint32{}
+                       for i := len(onePassRunes[pc]) / 2; i >= 0; i-- {
+                               inst.Next = append(inst.Next, inst.Out)
+                       }
+               case syntax.InstMatch, syntax.InstFail:
+                       m[pc] = inst.Op == syntax.InstMatch
+                       break
+               case syntax.InstRune:
+                       ok = check(inst.Out, m)
+                       m[pc] = false
+                       if len(inst.Next) > 0 {
+                               break
+                       }
+                       if len(inst.Rune) == 0 {
+                               onePassRunes[pc] = []rune{}
+                               inst.Next = []uint32{inst.Out}
+                               break
+                       }
+                       runes := make([]rune, 0)
+                       if len(inst.Rune) == 1 && syntax.Flags(inst.Arg)&syntax.FoldCase != 0 {
+                               r0 := inst.Rune[0]
+                               runes = append(runes, r0, r0)
+                               for r1 := unicode.SimpleFold(r0); r1 != r0; r1 = unicode.SimpleFold(r1) {
+                                       runes = append(runes, r1, r1)
+                               }
+                               sort.Sort(runeSlice(runes))
+                       } else {
+                               runes = append(runes, inst.Rune...)
+                       }
+                       onePassRunes[pc] = runes
+                       inst.Next = []uint32{}
+                       for i := len(onePassRunes[pc]) / 2; i >= 0; i-- {
+                               inst.Next = append(inst.Next, inst.Out)
+                       }
+                       inst.Op = syntax.InstRune
+               case syntax.InstRune1:
+                       ok = check(inst.Out, m)
+                       m[pc] = false
+                       if len(inst.Next) > 0 {
+                               break
+                       }
+                       runes := []rune{}
+                       // expand case-folded runes
+                       if syntax.Flags(inst.Arg)&syntax.FoldCase != 0 {
+                               r0 := inst.Rune[0]
+                               runes = append(runes, r0, r0)
+                               for r1 := unicode.SimpleFold(r0); r1 != r0; r1 = unicode.SimpleFold(r1) {
+                                       runes = append(runes, r1, r1)
+                               }
+                               sort.Sort(runeSlice(runes))
+                       } else {
+                               runes = append(runes, inst.Rune[0], inst.Rune[0])
+                       }
+                       onePassRunes[pc] = runes
+                       inst.Next = []uint32{}
+                       for i := len(onePassRunes[pc]) / 2; i >= 0; i-- {
+                               inst.Next = append(inst.Next, inst.Out)
+                       }
+                       inst.Op = syntax.InstRune
+               case syntax.InstRuneAny:
+                       ok = check(inst.Out, m)
+                       m[pc] = false
+                       if len(inst.Next) > 0 {
+                               break
+                       }
+                       onePassRunes[pc] = append([]rune{}, anyRune...)
+                       inst.Next = []uint32{inst.Out}
+               case syntax.InstRuneAnyNotNL:
+                       ok = check(inst.Out, m)
+                       m[pc] = false
+                       if len(inst.Next) > 0 {
+                               break
+                       }
+                       onePassRunes[pc] = append([]rune{}, anyRuneNotNL...)
+                       inst.Next = []uint32{}
+                       for i := len(onePassRunes[pc]) / 2; i >= 0; i-- {
+                               inst.Next = append(inst.Next, inst.Out)
+                       }
+               }
+               return
+       }
+
+       instQueue.clear()
+       instQueue.insert(uint32(p.Start))
+       m := make(map[uint32]bool, len(p.Inst))
+       for !instQueue.empty() {
+               pc := instQueue.next()
+               inst := p.Inst[pc]
+               visitQueue.clear()
+               if !check(uint32(pc), m) {
+                       p = notOnePass
+                       break
+               }
+               switch inst.Op {
+               case syntax.InstAlt, syntax.InstAltMatch:
+                       instQueue.insert(inst.Out)
+                       instQueue.insert(inst.Arg)
+               case syntax.InstCapture, syntax.InstEmptyWidth, syntax.InstNop:
+                       instQueue.insert(inst.Out)
+               case syntax.InstMatch:
+               case syntax.InstFail:
+               case syntax.InstRune, syntax.InstRune1, syntax.InstRuneAny, syntax.InstRuneAnyNotNL:
+               default:
+               }
+       }
+       if p != notOnePass {
+               for i, _ := range p.Inst {
+                       p.Inst[i].Rune = onePassRunes[i]
+               }
+       }
+       return p
+}
+
+// walk visits each Inst in the prog once, and applies the argument
+// function(ip, next), in pre-order.
+func walk(prog *syntax.Prog, funcs ...func(ip, next uint32)) {
+       var walk1 func(uint32)
+       progQueue := newQueue(len(prog.Inst))
+       walk1 = func(ip uint32) {
+               if progQueue.contains(ip) {
+                       return
+               }
+               progQueue.insert(ip)
+               inst := prog.Inst[ip]
+               switch inst.Op {
+               case syntax.InstAlt, syntax.InstAltMatch:
+                       for _, f := range funcs {
+                               f(ip, inst.Out)
+                               f(ip, inst.Arg)
+                       }
+                       walk1(inst.Out)
+                       walk1(inst.Arg)
+               default:
+                       for _, f := range funcs {
+                               f(ip, inst.Out)
+                       }
+                       walk1(inst.Out)
+               }
+       }
+       walk1(uint32(prog.Start))
+}
+
+// find returns the Insts that match the argument predicate function
+func find(prog *syntax.Prog, f func(*syntax.Prog, int) bool) (matches []uint32) {
+       matches = []uint32{}
+
+       for ip := range prog.Inst {
+               if f(prog, ip) {
+                       matches = append(matches, uint32(ip))
+               }
+       }
+       return
+}
+
+var notOnePass *onePassProg = nil
+
+// compileOnePass returns a new *syntax.Prog suitable for onePass execution if the original Prog
+// can be recharacterized as a one-pass regexp program, or syntax.notOnePass if the
+// Prog cannot be converted. For a one pass prog, the fundamental condition that must
+// be true is: at any InstAlt, there must be no ambiguity about what branch to  take.
+func compileOnePass(prog *syntax.Prog) (p *onePassProg) {
+       if prog.Start == 0 {
+               return notOnePass
+       }
+       // onepass regexp is anchored
+       if prog.Inst[prog.Start].Op != syntax.InstEmptyWidth ||
+               syntax.EmptyOp(prog.Inst[prog.Start].Arg)&syntax.EmptyBeginText != syntax.EmptyBeginText {
+               return notOnePass
+       }
+       // every instruction leading to InstMatch must be EmptyEndText
+       for _, inst := range prog.Inst {
+               opOut := prog.Inst[inst.Out].Op
+               switch inst.Op {
+               default:
+                       if opOut == syntax.InstMatch {
+                               return notOnePass
+                       }
+               case syntax.InstAlt, syntax.InstAltMatch:
+                       if opOut == syntax.InstMatch || prog.Inst[inst.Arg].Op == syntax.InstMatch {
+                               return notOnePass
+                       }
+               case syntax.InstEmptyWidth:
+                       if opOut == syntax.InstMatch {
+                               if syntax.EmptyOp(inst.Arg)&syntax.EmptyEndText == syntax.EmptyEndText {
+                                       continue
+                               }
+                               return notOnePass
+                       }
+               }
+       }
+       // Creates a slightly optimized copy of the original Prog
+       // that cleans up some Prog idioms that block valid onepass programs
+       p = onePassCopy(prog)
+
+       // checkAmbiguity on InstAlts, build onepass Prog if possible
+       p = makeOnePass(p)
+
+       if p != notOnePass {
+               cleanupOnePass(p, prog)
+       }
+       return p
+}
diff --git a/src/pkg/regexp/onepass_test.go b/src/pkg/regexp/onepass_test.go

new file mode 100644 (file)

index 0000000..7b2beea
--- /dev/null
+++ b/src/pkg/regexp/onepass_test.go
@@ -0,0 +1,208 @@
+// Copyright 2014 The Go Authors.  All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package regexp
+
+import (
+       "reflect"
+       "regexp/syntax"
+       "testing"
+)
+
+var runeMergeTests = []struct {
+       left, right, merged []rune
+       next                []uint32
+       leftPC, rightPC     uint32
+}{
+       {
+               // empty rhs
+               []rune{69, 69},
+               []rune{},
+               []rune{69, 69},
+               []uint32{1},
+               1, 2,
+       },
+       {
+               // identical runes, identical targets
+               []rune{69, 69},
+               []rune{69, 69},
+               []rune{},
+               []uint32{mergeFailed},
+               1, 1,
+       },
+       {
+               // identical runes, different targets
+               []rune{69, 69},
+               []rune{69, 69},
+               []rune{},
+               []uint32{mergeFailed},
+               1, 2,
+       },
+       {
+               // append right-first
+               []rune{69, 69},
+               []rune{71, 71},
+               []rune{69, 69, 71, 71},
+               []uint32{1, 2},
+               1, 2,
+       },
+       {
+               // append, left-first
+               []rune{71, 71},
+               []rune{69, 69},
+               []rune{69, 69, 71, 71},
+               []uint32{2, 1},
+               1, 2,
+       },
+       {
+               // successful interleave
+               []rune{60, 60, 71, 71, 101, 101},
+               []rune{69, 69, 88, 88},
+               []rune{60, 60, 69, 69, 71, 71, 88, 88, 101, 101},
+               []uint32{1, 2, 1, 2, 1},
+               1, 2,
+       },
+       {
+               // left surrounds right
+               []rune{69, 74},
+               []rune{71, 71},
+               []rune{},
+               []uint32{mergeFailed},
+               1, 2,
+       },
+       {
+               // right surrounds left
+               []rune{69, 74},
+               []rune{68, 75},
+               []rune{},
+               []uint32{mergeFailed},
+               1, 2,
+       },
+       {
+               // overlap at interval begin
+               []rune{69, 74},
+               []rune{74, 75},
+               []rune{},
+               []uint32{mergeFailed},
+               1, 2,
+       },
+       {
+               // overlap ar interval end
+               []rune{69, 74},
+               []rune{65, 69},
+               []rune{},
+               []uint32{mergeFailed},
+               1, 2,
+       },
+       {
+               // overlap from above
+               []rune{69, 74},
+               []rune{71, 74},
+               []rune{},
+               []uint32{mergeFailed},
+               1, 2,
+       },
+       {
+               // overlap from below
+               []rune{69, 74},
+               []rune{65, 71},
+               []rune{},
+               []uint32{mergeFailed},
+               1, 2,
+       },
+       {
+               // out of order []rune
+               []rune{69, 74, 60, 65},
+               []rune{66, 67},
+               []rune{},
+               []uint32{mergeFailed},
+               1, 2,
+       },
+}
+
+func TestMergeRuneSet(t *testing.T) {
+       for ix, test := range runeMergeTests {
+               merged, next := mergeRuneSets(&test.left, &test.right, test.leftPC, test.rightPC)
+               if !reflect.DeepEqual(merged, test.merged) {
+                       t.Errorf("mergeRuneSet :%d (%v, %v) merged\n have\n%v\nwant\n%v", ix, test.left, test.right, merged, test.merged)
+               }
+               if !reflect.DeepEqual(next, test.next) {
+                       t.Errorf("mergeRuneSet :%d(%v, %v) next\n have\n%v\nwant\n%v", ix, test.left, test.right, next, test.next)
+               }
+       }
+}
+
+const noStr = `!`
+
+var onePass = &onePassProg{}
+
+var onePassTests = []struct {
+       re      string
+       onePass *onePassProg
+       prog    string
+}{
+       {`^(?:a|(?:a*))$`, notOnePass, noStr},
+       {`^(?:(a)|(?:a*))$`, notOnePass, noStr},
+       {`^(?:(?:(?:.(?:$))?))$`, onePass, `a`},
+       {`^abcd$`, onePass, `abcd`},
+       {`^abcd$`, onePass, `abcde`},
+       {`^(?:(?:a{0,})*?)$`, onePass, `a`},
+       {`^(?:(?:a+)*)$`, onePass, ``},
+       {`^(?:(?:a|(?:aa)))$`, onePass, ``},
+       {`^(?:[^\s\S])$`, onePass, ``},
+       {`^(?:(?:a{3,4}){0,})$`, notOnePass, `aaaaaa`},
+       {`^(?:(?:a+)*)$`, onePass, `a`},
+       {`^(?:(?:(?:a*)+))$`, onePass, noStr},
+       {`^(?:(?:a+)*)$`, onePass, ``},
+       {`^[a-c]+$`, onePass, `abc`},
+       {`^[a-c]*$`, onePass, `abcdabc`},
+       {`^(?:a*)$`, onePass, `aaaaaaa`},
+       {`^(?:(?:aa)|a)$`, onePass, `a`},
+       {`^[a-c]*`, notOnePass, `abcdabc`},
+       {`^[a-c]*$`, onePass, `abc`},
+       {`^...$`, onePass, ``},
+       {`^(?:a|(?:aa))$`, onePass, `a`},
+       {`^[a-c]*`, notOnePass, `abcabc`},
+       {`^a((b))c$`, onePass, noStr},
+       {`^a.[l-nA-Cg-j]?e$`, onePass, noStr},
+       {`^a((b))$`, onePass, noStr},
+       {`^a(?:(b)|(c))c$`, onePass, noStr},
+       {`^a(?:(b*)|(c))c$`, notOnePass, noStr},
+       {`^a(?:b|c)$`, onePass, noStr},
+       {`^a(?:b?|c)$`, onePass, noStr},
+       {`^a(?:b?|c?)$`, notOnePass, noStr},
+       {`^a(?:b?|c+)$`, onePass, noStr},
+       {`^a(?:b+|(bc))d$`, notOnePass, noStr},
+       {`^a(?:bc)+$`, onePass, noStr},
+       {`^a(?:[bcd])+$`, onePass, noStr},
+       {`^a((?:[bcd])+)$`, onePass, noStr},
+       {`^a(:?b|c)*d$`, onePass, `abbbccbbcbbd"`},
+       {`^.bc(d|e)*$`, onePass, `abcddddddeeeededd`},
+       {`^(?:(?:aa)|.)$`, notOnePass, `a`},
+       {`^(?:(?:a{1,2}){1,2})$`, notOnePass, `aaaa`},
+}
+
+func TestCompileOnePass(t *testing.T) {
+       var (
+               p   *syntax.Prog
+               re  *syntax.Regexp
+               err error
+       )
+       for _, test := range onePassTests {
+               if re, err = syntax.Parse(test.re, syntax.Perl); err != nil {
+                       t.Errorf("Parse(%q) got err:%s, want success", test.re, err)
+                       continue
+               }
+               // needs to be done before compile...
+               re = re.Simplify()
+               if p, err = syntax.Compile(re); err != nil {
+                       t.Errorf("Compile(%q) got err:%s, want success", test.re, err)
+                       continue
+               }
+               onePass = compileOnePass(p)
+               if (onePass == notOnePass) != (test.onePass == notOnePass) {
+                       t.Errorf("CompileOnePass(%q) got %v, expected %v", test.re, onePass, test.onePass)
+               }
+       }
+}
diff --git a/src/pkg/regexp/regexp.go b/src/pkg/regexp/regexp.go

index cca21e750e2d4d5e805ccc611fa6644fef737410..0b8336a04fbba9be3168ae2762892f4113267b0a 100644 (file)
--- a/src/pkg/regexp/regexp.go
+++ b/src/pkg/regexp/regexp.go
@@ -83,7 +83,7 @@ type Regexp struct {
         // read-only after Compile
         expr           string         // as passed to Compile
         prog           *syntax.Prog   // compiled program
-       onepass        *syntax.Prog   // onpass program or nil
+       onepass        *onePassProg   // onpass program or nil
         prefix         string         // required prefix in unanchored matches
         prefixBytes    []byte         // prefix, as a []byte
         prefixComplete bool           // prefix is the entire regexp
@@ -165,16 +165,16 @@ func compile(expr string, mode syntax.Flags, longest bool) (*Regexp, error) {
         regexp := &Regexp{
                 expr:        expr,
                 prog:        prog,
-               onepass:     prog.CompileOnePass(),
+               onepass:     compileOnePass(prog),
                 numSubexp:   maxCap,
                 subexpNames: capNames,
                 cond:        prog.StartCond(),
                 longest:     longest,
         }
-       if regexp.onepass == syntax.NotOnePass {
+       if regexp.onepass == notOnePass {
                 regexp.prefix, regexp.prefixComplete = prog.Prefix()
         } else {
-               regexp.prefix, regexp.prefixComplete, regexp.prefixEnd = prog.OnePassPrefix()
+               regexp.prefix, regexp.prefixComplete, regexp.prefixEnd = onePassPrefix(prog)
         }
         if regexp.prefix != "" {
                 // TODO(rsc): Remove this allocation by adding
diff --git a/src/pkg/regexp/syntax/prog.go b/src/pkg/regexp/syntax/prog.go

index c1d6a121f00b54665944c3d3cbedcb512b9b6488..29bd282d0d96c7c0c235037b9f0bcac33c592bfe 100644 (file)
--- a/src/pkg/regexp/syntax/prog.go
+++ b/src/pkg/regexp/syntax/prog.go
@@ -6,7 +6,6 @@ package syntax
  
  import (
         "bytes"
-       "sort"
         "strconv"
         "unicode"
  )
@@ -115,7 +114,6 @@ type Inst struct {
         Out  uint32 // all but InstMatch, InstFail
         Arg  uint32 // InstAlt, InstAltMatch, InstCapture, InstEmptyWidth
         Rune []rune
-       Next []uint32 // If input rune matches
  }
  
  func (p *Prog) String() string {
@@ -165,36 +163,6 @@ func (p *Prog) Prefix() (prefix string, complete bool) {
         return buf.String(), i.Op == InstMatch
  }
  
-// OnePassPrefix returns a literal string that all matches for the
-// regexp must start with.  Complete is true if the prefix
-// is the entire match. Pc is the index of the last rune instruction
-// in the string. The OnePassPrefix skips over the mandatory
-// EmptyBeginText
-func (p *Prog) OnePassPrefix() (prefix string, complete bool, pc uint32) {
-       i := &p.Inst[p.Start]
-       if i.Op != InstEmptyWidth || (EmptyOp(i.Arg))&EmptyBeginText == 0 {
-               return "", i.Op == InstMatch, uint32(p.Start)
-       }
-       pc = i.Out
-       i = &p.Inst[pc]
-       for i.Op == InstNop {
-               pc = i.Out
-               i = &p.Inst[pc]
-       }
-       // Avoid allocation of buffer if prefix is empty.
-       if i.op() != InstRune || len(i.Rune) != 1 {
-               return "", i.Op == InstMatch, uint32(p.Start)
-       }
-
-       // Have prefix; gather characters.
-       var buf bytes.Buffer
-       for i.op() == InstRune && len(i.Rune) == 1 && Flags(i.Arg)&FoldCase == 0 {
-               buf.WriteRune(i.Rune[0])
-               pc, i = i.Out, &p.Inst[i.Out]
-       }
-       return buf.String(), i.Op == InstEmptyWidth && (EmptyOp(i.Arg))&EmptyBeginText != 0, pc
-}
-
  // StartCond returns the leading empty-width conditions that must
  // be true in any match.  It returns ^EmptyOp(0) if no matches are possible.
  func (p *Prog) StartCond() EmptyOp {
@@ -221,29 +189,17 @@ Loop:
  
  const noMatch = -1
  
-// OnePassNext selects the next actionable state of the prog, based on the input character.
-// It should only be called when i.Op == InstAlt or InstAltMatch, and from the one-pass machine.
-// One of the alternates may ultimately lead without input to end of line. If the instruction
-// is InstAltMatch the path to the InstMatch is in i.Out, the normal node in i.Next.
-func (i *Inst) OnePassNext(r rune) uint32 {
-       next := i.MatchRunePos(r)
-       if next != noMatch {
-               return i.Next[next]
-       }
-       if i.Op == InstAltMatch {
-               return i.Out
-       }
-       return 0
-}
-
  // MatchRune returns true if the instruction matches (and consumes) r.
  // It should only be called when i.Op == InstRune.
  func (i *Inst) MatchRune(r rune) bool {
         return i.MatchRunePos(r) != noMatch
  }
  
-// MatchRunePos returns the index of the rune pair if the instruction matches.
-// It should only be called when i.Op == InstRune.
+// MatchRunePos checks whether the instruction matches (and consumes) r.
+// If so, MatchRunePos returns the index of the matching rune pair
+// (or, when len(i.Rune) == 1, rune singleton).
+// If not, MatchRunePos returns -1.
+// MatchRunePos should only be called when i.Op == InstRune.
  func (i *Inst) MatchRunePos(r rune) int {
         rune := i.Rune
  
@@ -387,496 +343,3 @@ func dumpInst(b *bytes.Buffer, i *Inst) {
                 bw(b, "anynotnl -> ", u32(i.Out))
         }
  }
-
-// Sparse Array implementation is used as a queue.
-type queue struct {
-       sparse          []uint32
-       dense           []uint32
-       size, nextIndex uint32
-}
-
-func (q *queue) empty() bool {
-       return q.nextIndex >= q.size
-}
-
-func (q *queue) next() (n uint32) {
-       n = q.dense[q.nextIndex]
-       q.nextIndex++
-       return
-}
-
-func (q *queue) clear() {
-       q.size = 0
-       q.nextIndex = 0
-}
-
-func (q *queue) reset() {
-       q.nextIndex = 0
-}
-
-func (q *queue) contains(u uint32) bool {
-       if u >= uint32(len(q.sparse)) {
-               return false
-       }
-       return q.sparse[u] < q.size && q.dense[q.sparse[u]] == u
-}
-
-func (q *queue) insert(u uint32) {
-       if !q.contains(u) {
-               q.insertNew(u)
-       }
-}
-
-func (q *queue) insertNew(u uint32) {
-       if u >= uint32(len(q.sparse)) {
-               return
-       }
-       q.sparse[u] = q.size
-       q.dense[q.size] = u
-       q.size++
-}
-
-func newQueue(size int) (q *queue) {
-       return &queue{
-               sparse: make([]uint32, size),
-               dense:  make([]uint32, size),
-       }
-}
-
-// mergeRuneSets merges two non-intersecting runesets, and returns the merged result,
-// and a NextIp array. The idea is that if a rune matches the OnePassRunes at index
-// i, NextIp[i/2] is the target. If the input sets intersect, an empty runeset and a
-// NextIp array with the single element mergeFailed is returned.
-// The code assumes that both inputs contain ordered and non-intersecting rune pairs.
-const mergeFailed = uint32(0xffffffff)
-
-var (
-       noRune = []rune{}
-       noNext = []uint32{mergeFailed}
-)
-
-func mergeRuneSets(leftRunes, rightRunes *[]rune, leftPC, rightPC uint32) ([]rune, []uint32) {
-       leftLen := len(*leftRunes)
-       rightLen := len(*rightRunes)
-       if leftLen&0x1 != 0 || rightLen&0x1 != 0 {
-               panic("mergeRuneSets odd length []rune")
-       }
-       var (
-               lx, rx int
-       )
-       merged := make([]rune, 0)
-       next := make([]uint32, 0)
-       ok := true
-       defer func() {
-               if !ok {
-                       merged = nil
-                       next = nil
-               }
-       }()
-
-       ix := -1
-       extend := func(newLow *int, newArray *[]rune, pc uint32) bool {
-               if ix > 0 && (*newArray)[*newLow] <= merged[ix] {
-                       return false
-               }
-               merged = append(merged, (*newArray)[*newLow], (*newArray)[*newLow+1])
-               *newLow += 2
-               ix += 2
-               next = append(next, pc)
-               return true
-       }
-
-       for lx < leftLen || rx < rightLen {
-               switch {
-               case rx >= rightLen:
-                       ok = extend(&lx, leftRunes, leftPC)
-               case lx >= leftLen:
-                       ok = extend(&rx, rightRunes, rightPC)
-               case (*rightRunes)[rx] < (*leftRunes)[lx]:
-                       ok = extend(&rx, rightRunes, rightPC)
-               default:
-                       ok = extend(&lx, leftRunes, leftPC)
-               }
-               if !ok {
-                       return noRune, noNext
-               }
-       }
-       return merged, next
-}
-
-// cleanupOnePass drops working memory, and restores certain shortcut instructions.
-func (prog *Prog) cleanupOnePass(pOriginal *Prog) {
-       for ix, instOriginal := range pOriginal.Inst {
-               switch instOriginal.Op {
-               case InstAlt, InstAltMatch, InstRune:
-               case InstCapture, InstEmptyWidth, InstNop, InstMatch, InstFail:
-                       prog.Inst[ix].Next = nil
-               case InstRune1, InstRuneAny, InstRuneAnyNotNL:
-                       prog.Inst[ix].Next = nil
-                       prog.Inst[ix] = instOriginal
-               }
-       }
-}
-
-// onePassCopy creates a copy of the original Prog, as we'll be modifying it
-func (prog *Prog) onePassCopy() *Prog {
-       p := &Prog{
-               Inst:   append([]Inst{}[:], prog.Inst...),
-               Start:  prog.Start,
-               NumCap: prog.NumCap,
-       }
-       for _, inst := range p.Inst {
-               inst.Next = make([]uint32, 0)
-       }
-
-       // rewrites one or more common Prog constructs that enable some otherwise
-       // non-onepass Progs to be onepass. A:BD (for example) means an InstAlt at
-       // ip A, that points to ips B & C.
-       // A:BC + B:DA => A:BC + B:CD
-       // A:BC + B:DC => A:DC + B:DC
-       for pc := range p.Inst {
-               switch p.Inst[pc].Op {
-               default:
-                       continue
-               case InstAlt, InstAltMatch:
-                       // A:Bx + B:Ay
-                       p_A_Other := &p.Inst[pc].Out
-                       p_A_Alt := &p.Inst[pc].Arg
-                       // make sure a target is another Alt
-                       instAlt := p.Inst[*p_A_Alt]
-                       if !(instAlt.Op == InstAlt || instAlt.Op == InstAltMatch) {
-                               p_A_Alt, p_A_Other = p_A_Other, p_A_Alt
-                               instAlt = p.Inst[*p_A_Alt]
-                               if !(instAlt.Op == InstAlt || instAlt.Op == InstAltMatch) {
-                                       continue
-                               }
-                       }
-                       instOther := p.Inst[*p_A_Other]
-                       // Analyzing both legs pointing to Alts is for another day
-                       if instOther.Op == InstAlt || instOther.Op == InstAltMatch {
-                               // too complicated
-                               continue
-                       }
-                       // simple empty transition loop
-                       // A:BC + B:DA => A:BC + B:DC
-                       p_B_Alt := &p.Inst[*p_A_Alt].Out
-                       p_B_Other := &p.Inst[*p_A_Alt].Arg
-                       patch := false
-                       if instAlt.Out == uint32(pc) {
-                               patch = true
-                       } else if instAlt.Arg == uint32(pc) {
-                               patch = true
-                               p_B_Alt, p_B_Other = p_B_Other, p_B_Alt
-                       }
-                       if patch {
-                               *p_B_Alt = *p_A_Other
-                       }
-
-                       // empty transition to common target
-                       // A:BC + B:DC => A:DC + B:DC
-                       if *p_A_Other == *p_B_Alt {
-                               *p_A_Alt = *p_B_Other
-                       }
-               }
-       }
-       return p
-}
-
-// runeSlice exists to permit sorting the case-folded rune sets.
-type runeSlice []rune
-
-func (p runeSlice) Len() int           { return len(p) }
-func (p runeSlice) Less(i, j int) bool { return p[i] < p[j] }
-func (p runeSlice) Swap(i, j int)      { p[i], p[j] = p[j], p[i] }
-
-// Sort is a convenience method.
-func (p runeSlice) Sort() {
-       sort.Sort(p)
-}
-
-// makeOnePass creates a onepass Prog, if possible. It is possible if at any alt,
-// the match engine can always tell which branch to take. The routine may modify
-// p if it is turned into a onepass Prog. If it isn't possible for this to be a
-// onepass Prog, the Prog syntax.NotOnePass is returned. makeOnePass is recursive
-// to the size of the Prog
-func (p *Prog) makeOnePass() *Prog {
-       // If the machine is very long, it's not worth the time to check if we can use one pass.
-       if len(p.Inst) >= 1000 {
-               return NotOnePass
-       }
-
-       var (
-               instQueue    = newQueue(len(p.Inst))
-               visitQueue   = newQueue(len(p.Inst))
-               build        func(uint32, *queue)
-               check        func(uint32, map[uint32]bool) bool
-               onePassRunes = make([][]rune, len(p.Inst))
-       )
-       build = func(pc uint32, q *queue) {
-               if q.contains(pc) {
-                       return
-               }
-               inst := p.Inst[pc]
-               switch inst.Op {
-               case InstAlt, InstAltMatch:
-                       q.insert(inst.Out)
-                       build(inst.Out, q)
-                       q.insert(inst.Arg)
-               case InstMatch, InstFail:
-               default:
-                       q.insert(inst.Out)
-               }
-       }
-
-       // check that paths from Alt instructions are unambiguous, and rebuild the new
-       // program as a onepass program
-       check = func(pc uint32, m map[uint32]bool) (ok bool) {
-               ok = true
-               inst := &p.Inst[pc]
-               if visitQueue.contains(pc) {
-                       return
-               }
-               visitQueue.insert(pc)
-               switch inst.Op {
-               case InstAlt, InstAltMatch:
-                       ok = check(inst.Out, m) && check(inst.Arg, m)
-                       // check no-input paths to InstMatch
-                       matchOut := m[inst.Out]
-                       matchArg := m[inst.Arg]
-                       if matchOut && matchArg {
-                               ok = false
-                               break
-                       }
-                       // Match on empty goes in inst.Out
-                       if matchArg {
-                               inst.Out, inst.Arg = inst.Arg, inst.Out
-                               matchOut, matchArg = matchArg, matchOut
-                       }
-                       if matchOut {
-                               m[pc] = true
-                               inst.Op = InstAltMatch
-                       }
-
-                       // build a dispatch operator from the two legs of the alt.
-                       onePassRunes[pc], inst.Next = mergeRuneSets(
-                               &onePassRunes[inst.Out], &onePassRunes[inst.Arg], inst.Out, inst.Arg)
-                       if len(inst.Next) > 0 && inst.Next[0] == mergeFailed {
-                               ok = false
-                               break
-                       }
-               case InstCapture, InstNop:
-                       ok = check(inst.Out, m)
-                       m[pc] = m[inst.Out]
-                       // pass matching runes back through these no-ops.
-                       onePassRunes[pc] = append([]rune{}[:], onePassRunes[inst.Out][:]...)
-                       inst.Next = []uint32{}
-                       for i := len(onePassRunes[pc]) / 2; i >= 0; i-- {
-                               inst.Next = append(inst.Next, inst.Out)
-                       }
-               case InstEmptyWidth:
-                       ok = check(inst.Out, m)
-                       m[pc] = m[inst.Out]
-                       onePassRunes[pc] = append([]rune{}[:], onePassRunes[inst.Out][:]...)
-                       inst.Next = []uint32{}
-                       for i := len(onePassRunes[pc]) / 2; i >= 0; i-- {
-                               inst.Next = append(inst.Next, inst.Out)
-                       }
-               case InstMatch, InstFail:
-                       m[pc] = inst.Op == InstMatch
-                       break
-               case InstRune:
-                       ok = check(inst.Out, m)
-                       m[pc] = false
-                       if len(inst.Next) > 0 {
-                               break
-                       }
-                       if len(inst.Rune) == 0 {
-                               onePassRunes[pc] = []rune{}[:]
-                               inst.Next = []uint32{inst.Out}
-                               break
-                       }
-                       runes := make([]rune, 0)
-                       if len(inst.Rune) == 1 && Flags(inst.Arg)&FoldCase != 0 {
-                               r0 := inst.Rune[0]
-                               runes = append(runes, r0, r0)
-                               for r1 := unicode.SimpleFold(r0); r1 != r0; r1 = unicode.SimpleFold(r1) {
-                                       runes = append(runes, r1, r1)
-                               }
-                               sort.Sort(runeSlice(runes))
-                       } else {
-                               runes = append(runes, inst.Rune...)
-                       }
-                       onePassRunes[pc] = runes
-                       inst.Next = []uint32{}
-                       for i := len(onePassRunes[pc]) / 2; i >= 0; i-- {
-                               inst.Next = append(inst.Next, inst.Out)
-                       }
-                       inst.Op = InstRune
-               case InstRune1:
-                       ok = check(inst.Out, m)
-                       m[pc] = false
-                       if len(inst.Next) > 0 {
-                               break
-                       }
-                       runes := []rune{}[:]
-                       // expand case-folded runes
-                       if Flags(inst.Arg)&FoldCase != 0 {
-                               r0 := inst.Rune[0]
-                               runes = append(runes, r0, r0)
-                               for r1 := unicode.SimpleFold(r0); r1 != r0; r1 = unicode.SimpleFold(r1) {
-                                       runes = append(runes, r1, r1)
-                               }
-                               sort.Sort(runeSlice(runes))
-                       } else {
-                               runes = append(runes, inst.Rune[0], inst.Rune[0])
-                       }
-                       onePassRunes[pc] = runes
-                       inst.Next = []uint32{}
-                       for i := len(onePassRunes[pc]) / 2; i >= 0; i-- {
-                               inst.Next = append(inst.Next, inst.Out)
-                       }
-                       inst.Op = InstRune
-               case InstRuneAny:
-                       ok = check(inst.Out, m)
-                       m[pc] = false
-                       if len(inst.Next) > 0 {
-                               break
-                       }
-                       onePassRunes[pc] = append([]rune{}[:], anyRune[:]...)
-                       inst.Next = []uint32{inst.Out}
-               case InstRuneAnyNotNL:
-                       ok = check(inst.Out, m)
-                       m[pc] = false
-                       if len(inst.Next) > 0 {
-                               break
-                       }
-                       onePassRunes[pc] = append([]rune{}[:], anyRuneNotNL[:]...)
-                       inst.Next = []uint32{}
-                       for i := len(onePassRunes[pc]) / 2; i >= 0; i-- {
-                               inst.Next = append(inst.Next, inst.Out)
-                       }
-               }
-               return
-       }
-
-       instQueue.clear()
-       instQueue.insert(uint32(p.Start))
-       m := make(map[uint32]bool, len(p.Inst))
-       for !instQueue.empty() {
-               pc := instQueue.next()
-               inst := p.Inst[pc]
-               visitQueue.clear()
-               if !check(uint32(pc), m) {
-                       p = NotOnePass
-                       break
-               }
-               switch inst.Op {
-               case InstAlt, InstAltMatch:
-                       instQueue.insert(inst.Out)
-                       instQueue.insert(inst.Arg)
-               case InstCapture, InstEmptyWidth, InstNop:
-                       instQueue.insert(inst.Out)
-               case InstMatch:
-               case InstFail:
-               case InstRune, InstRune1, InstRuneAny, InstRuneAnyNotNL:
-               default:
-               }
-       }
-       if p != NotOnePass {
-               for i, _ := range p.Inst {
-                       p.Inst[i].Rune = onePassRunes[i][:]
-               }
-       }
-       return p
-}
-
-// walk visits each Inst in the prog once, and applies the argument
-// function(ip, next), in pre-order.
-func (prog *Prog) walk(funcs ...func(ip, next uint32)) {
-       var walk1 func(uint32)
-       progQueue := newQueue(len(prog.Inst))
-       walk1 = func(ip uint32) {
-               if progQueue.contains(ip) {
-                       return
-               }
-               progQueue.insert(ip)
-               inst := prog.Inst[ip]
-               switch inst.Op {
-               case InstAlt, InstAltMatch:
-                       for _, f := range funcs {
-                               f(ip, inst.Out)
-                               f(ip, inst.Arg)
-                       }
-                       walk1(inst.Out)
-                       walk1(inst.Arg)
-               default:
-                       for _, f := range funcs {
-                               f(ip, inst.Out)
-                       }
-                       walk1(inst.Out)
-               }
-       }
-       walk1(uint32(prog.Start))
-}
-
-// find returns the Insts that match the argument predicate function
-func (prog *Prog) find(f func(*Prog, int) bool) (matches []uint32) {
-       matches = []uint32{}
-
-       for ip := range prog.Inst {
-               if f(prog, ip) {
-                       matches = append(matches, uint32(ip))
-               }
-       }
-       return
-}
-
-var NotOnePass *Prog = nil
-var debug = false
-
-// CompileOnePass returns a new *Prog suitable for onePass execution if the original Prog
-// can be recharacterized as a one-pass regexp program, or syntax.NotOnePass if the
-// Prog cannot be converted. For a one pass prog, the fundamental condition that must
-// be true is: at any InstAlt, there must be no ambiguity about what branch to  take.
-func (prog *Prog) CompileOnePass() (p *Prog) {
-       if prog.Start == 0 {
-               return NotOnePass
-       }
-       // onepass regexp is anchored
-       if prog.Inst[prog.Start].Op != InstEmptyWidth ||
-               EmptyOp(prog.Inst[prog.Start].Arg)&EmptyBeginText != EmptyBeginText {
-               return NotOnePass
-       }
-       // every instruction leading to InstMatch must be EmptyEndText
-       for _, inst := range prog.Inst {
-               opOut := prog.Inst[inst.Out].Op
-               switch inst.Op {
-               default:
-                       if opOut == InstMatch {
-                               return NotOnePass
-                       }
-               case InstAlt, InstAltMatch:
-                       if opOut == InstMatch || prog.Inst[inst.Arg].Op == InstMatch {
-                               return NotOnePass
-                       }
-               case InstEmptyWidth:
-                       if opOut == InstMatch {
-                               if EmptyOp(inst.Arg)&EmptyEndText == EmptyEndText {
-                                       continue
-                               }
-                               return NotOnePass
-                       }
-               }
-       }
-       // Creates a slightly optimized copy of the original Prog
-       // that cleans up some Prog idioms that block valid onepass programs
-       p = prog.onePassCopy()
-
-       // checkAmbiguity on InstAlts, build onepass Prog if possible
-       p = p.makeOnePass()
-
-       if p != NotOnePass {
-               p.cleanupOnePass(prog)
-       }
-       return p
-}
diff --git a/src/pkg/regexp/syntax/prog_test.go b/src/pkg/regexp/syntax/prog_test.go

index 66beb7435b5db91a43e8d31c20e8039e9fd7760f..50bfa3d4bef6d4643c1c6b4ff49673c6ec8fa244 100644 (file)
--- a/src/pkg/regexp/syntax/prog_test.go
+++ b/src/pkg/regexp/syntax/prog_test.go
@@ -4,10 +4,7 @@
  
  package syntax
  
-import (
-       "reflect"
-       "testing"
-)
+import "testing"
  
  var compileTests = []struct {
         Regexp string
@@ -115,200 +112,3 @@ func BenchmarkEmptyOpContext(b *testing.B) {
                 EmptyOpContext(r1, -1)
         }
  }
-
-var runeMergeTests = []struct {
-       left, right, merged []rune
-       next                []uint32
-       leftPC, rightPC     uint32
-}{
-       {
-               // empty rhs
-               []rune{69, 69},
-               []rune{},
-               []rune{69, 69},
-               []uint32{1},
-               1, 2,
-       },
-       {
-               // identical runes, identical targets
-               []rune{69, 69},
-               []rune{69, 69},
-               []rune{},
-               []uint32{mergeFailed},
-               1, 1,
-       },
-       {
-               // identical runes, different targets
-               []rune{69, 69},
-               []rune{69, 69},
-               []rune{},
-               []uint32{mergeFailed},
-               1, 2,
-       },
-       {
-               // append right-first
-               []rune{69, 69},
-               []rune{71, 71},
-               []rune{69, 69, 71, 71},
-               []uint32{1, 2},
-               1, 2,
-       },
-       {
-               // append, left-first
-               []rune{71, 71},
-               []rune{69, 69},
-               []rune{69, 69, 71, 71},
-               []uint32{2, 1},
-               1, 2,
-       },
-       {
-               // successful interleave
-               []rune{60, 60, 71, 71, 101, 101},
-               []rune{69, 69, 88, 88},
-               []rune{60, 60, 69, 69, 71, 71, 88, 88, 101, 101},
-               []uint32{1, 2, 1, 2, 1},
-               1, 2,
-       },
-       {
-               // left surrounds right
-               []rune{69, 74},
-               []rune{71, 71},
-               []rune{},
-               []uint32{mergeFailed},
-               1, 2,
-       },
-       {
-               // right surrounds left
-               []rune{69, 74},
-               []rune{68, 75},
-               []rune{},
-               []uint32{mergeFailed},
-               1, 2,
-       },
-       {
-               // overlap at interval begin
-               []rune{69, 74},
-               []rune{74, 75},
-               []rune{},
-               []uint32{mergeFailed},
-               1, 2,
-       },
-       {
-               // overlap ar interval end
-               []rune{69, 74},
-               []rune{65, 69},
-               []rune{},
-               []uint32{mergeFailed},
-               1, 2,
-       },
-       {
-               // overlap from above
-               []rune{69, 74},
-               []rune{71, 74},
-               []rune{},
-               []uint32{mergeFailed},
-               1, 2,
-       },
-       {
-               // overlap from below
-               []rune{69, 74},
-               []rune{65, 71},
-               []rune{},
-               []uint32{mergeFailed},
-               1, 2,
-       },
-       {
-               // out of order []rune
-               []rune{69, 74, 60, 65},
-               []rune{66, 67},
-               []rune{},
-               []uint32{mergeFailed},
-               1, 2,
-       },
-}
-
-func TestMergeRuneSet(t *testing.T) {
-       for ix, test := range runeMergeTests {
-               merged, next := mergeRuneSets(&test.left, &test.right, test.leftPC, test.rightPC)
-               if !reflect.DeepEqual(merged, test.merged) {
-                       t.Errorf("mergeRuneSet :%d (%v, %v) merged\n have\n%v\nwant\n%v", ix, test.left, test.right, merged, test.merged)
-               }
-               if !reflect.DeepEqual(next, test.next) {
-                       t.Errorf("mergeRuneSet :%d(%v, %v) next\n have\n%v\nwant\n%v", ix, test.left, test.right, next, test.next)
-               }
-       }
-}
-
-const noStr = `!`
-
-var onePass = &Prog{}
-
-var onePassTests = []struct {
-       re      string
-       onePass *Prog
-       prog    string
-}{
-       {`^(?:a|(?:a*))$`, NotOnePass, noStr},
-       {`^(?:(a)|(?:a*))$`, NotOnePass, noStr},
-       {`^(?:(?:(?:.(?:$))?))$`, onePass, `a`},
-       {`^abcd$`, onePass, `abcd`},
-       {`^abcd$`, onePass, `abcde`},
-       {`^(?:(?:a{0,})*?)$`, onePass, `a`},
-       {`^(?:(?:a+)*)$`, onePass, ``},
-       {`^(?:(?:a|(?:aa)))$`, onePass, ``},
-       {`^(?:[^\s\S])$`, onePass, ``},
-       {`^(?:(?:a{3,4}){0,})$`, NotOnePass, `aaaaaa`},
-       {`^(?:(?:a+)*)$`, onePass, `a`},
-       {`^(?:(?:(?:a*)+))$`, onePass, noStr},
-       {`^(?:(?:a+)*)$`, onePass, ``},
-       {`^[a-c]+$`, onePass, `abc`},
-       {`^[a-c]*$`, onePass, `abcdabc`},
-       {`^(?:a*)$`, onePass, `aaaaaaa`},
-       {`^(?:(?:aa)|a)$`, onePass, `a`},
-       {`^[a-c]*`, NotOnePass, `abcdabc`},
-       {`^[a-c]*$`, onePass, `abc`},
-       {`^...$`, onePass, ``},
-       {`^(?:a|(?:aa))$`, onePass, `a`},
-       {`^[a-c]*`, NotOnePass, `abcabc`},
-       {`^a((b))c$`, onePass, noStr},
-       {`^a.[l-nA-Cg-j]?e$`, onePass, noStr},
-       {`^a((b))$`, onePass, noStr},
-       {`^a(?:(b)|(c))c$`, onePass, noStr},
-       {`^a(?:(b*)|(c))c$`, NotOnePass, noStr},
-       {`^a(?:b|c)$`, onePass, noStr},
-       {`^a(?:b?|c)$`, onePass, noStr},
-       {`^a(?:b?|c?)$`, NotOnePass, noStr},
-       {`^a(?:b?|c+)$`, onePass, noStr},
-       {`^a(?:b+|(bc))d$`, NotOnePass, noStr},
-       {`^a(?:bc)+$`, onePass, noStr},
-       {`^a(?:[bcd])+$`, onePass, noStr},
-       {`^a((?:[bcd])+)$`, onePass, noStr},
-       {`^a(:?b|c)*d$`, onePass, `abbbccbbcbbd"`},
-       {`^.bc(d|e)*$`, onePass, `abcddddddeeeededd`},
-       {`^(?:(?:aa)|.)$`, NotOnePass, `a`},
-       {`^(?:(?:a{1,2}){1,2})$`, NotOnePass, `aaaa`},
-}
-
-func TestCompileOnePass(t *testing.T) {
-       var (
-               p   *Prog
-               re  *Regexp
-               err error
-       )
-       for _, test := range onePassTests {
-               if re, err = Parse(test.re, Perl); err != nil {
-                       t.Errorf("Parse(%q) got err:%s, want success", test.re, err)
-                       continue
-               }
-               // needs to be done before compile...
-               re = re.Simplify()
-               if p, err = Compile(re); err != nil {
-                       t.Errorf("Compile(%q) got err:%s, want success", test.re, err)
-                       continue
-               }
-               onePass = p.CompileOnePass()
-               if (onePass == NotOnePass) != (test.onePass == NotOnePass) {
-                       t.Errorf("CompileOnePass(%q) got %v, expected %v", test.re, onePass, test.onePass)
-               }
-       }
-}
author	Russ Cox <rsc@golang.org>
	Wed, 28 May 2014 18:08:44 +0000 (14:08 -0400)
committer	Russ Cox <rsc@golang.org>
	Wed, 28 May 2014 18:08:44 +0000 (14:08 -0400)
api/next.txt		patch \| blob \| history
src/pkg/regexp/exec.go		patch \| blob \| history
src/pkg/regexp/onepass.go	[new file with mode: 0644]	patch \| blob
src/pkg/regexp/onepass_test.go	[new file with mode: 0644]	patch \| blob
src/pkg/regexp/regexp.go		patch \| blob \| history
src/pkg/regexp/syntax/prog.go		patch \| blob \| history
src/pkg/regexp/syntax/prog_test.go		patch \| blob \| history