From: Russ Cox Date: Wed, 28 May 2014 18:08:44 +0000 (-0400) Subject: regexp: hide one-pass code from exported API X-Git-Tag: go1.3rc1~29 X-Git-Url: http://www.git.cypherpunks.su/?a=commitdiff_plain;h=6c10e64a906039678f5dffda8b453005ffcdb92e;p=gostls13.git regexp: hide one-pass code from exported API Update #8112 Hide one-pass regexp API. This means moving the code from regexp/syntax to regexp, but it avoids being locked into the specific API chosen for the implementation. It also removes a slice field from the syntax.Inst, which should avoid bloating the memory footprint of a non-one-pass regexp unnecessarily. LGTM=r R=golang-codereviews, r CC=golang-codereviews, iant https://golang.org/cl/98610046 --- diff --git a/api/next.txt b/api/next.txt index eac2d203d7..cdbac57f22 100644 --- a/api/next.txt +++ b/api/next.txt @@ -330,12 +330,7 @@ pkg net/http, type Server struct, ConnState func(net.Conn, ConnState) pkg net/http, type Server struct, ErrorLog *log.Logger pkg net/http, type Transport struct, TLSHandshakeTimeout time.Duration pkg regexp/syntax, method (*Inst) MatchRunePos(int32) int -pkg regexp/syntax, method (*Inst) OnePassNext(int32) uint32 -pkg regexp/syntax, method (*Prog) CompileOnePass() *Prog -pkg regexp/syntax, method (*Prog) OnePassPrefix() (string, bool, uint32) pkg regexp/syntax, method (InstOp) String() string -pkg regexp/syntax, type Inst struct, Next []uint32 -pkg regexp/syntax, var NotOnePass *Prog pkg runtime/debug, func SetPanicOnFault(bool) bool pkg runtime/debug, func WriteHeapDump(uintptr) pkg sync, method (*Pool) Get() interface{} diff --git a/src/pkg/regexp/exec.go b/src/pkg/regexp/exec.go index 22c33cfa56..c4cb201f64 100644 --- a/src/pkg/regexp/exec.go +++ b/src/pkg/regexp/exec.go @@ -37,7 +37,7 @@ type thread struct { type machine struct { re *Regexp // corresponding Regexp p *syntax.Prog // compiled program - op *syntax.Prog // compiled onepass program, or syntax.NotOnePass + op *onePassProg // compiled onepass program, or notOnePass q0, q1 queue // two queues for runq, nextq pool []*thread // pool of available threads matched bool // whether a match was found @@ -67,7 +67,7 @@ func (m *machine) newInputReader(r io.RuneReader) input { } // progMachine returns a new machine running the prog p. -func progMachine(p, op *syntax.Prog) *machine { +func progMachine(p *syntax.Prog, op *onePassProg) *machine { m := &machine{p: p, op: op} n := len(m.p.Inst) m.q0 = queue{make([]uint32, n), make([]entry, 0, n)} @@ -382,7 +382,7 @@ func (m *machine) onepass(i input, pos int) bool { } // peek at the input rune to see which branch of the Alt to take case syntax.InstAlt, syntax.InstAltMatch: - pc = int(inst.OnePassNext(r)) + pc = int(onePassNext(&inst, r)) continue case syntax.InstFail: return m.matched @@ -429,7 +429,7 @@ func (re *Regexp) doExecute(r io.RuneReader, b []byte, s string, pos int, ncap i } else { i = m.newInputString(s) } - if m.op != syntax.NotOnePass { + if m.op != notOnePass { if !m.onepass(i, pos) { re.put(m) return nil diff --git a/src/pkg/regexp/onepass.go b/src/pkg/regexp/onepass.go new file mode 100644 index 0000000000..501fb28af6 --- /dev/null +++ b/src/pkg/regexp/onepass.go @@ -0,0 +1,582 @@ +// Copyright 2014 The Go Authors. All rights reserved. + +package regexp + +import ( + "bytes" + "regexp/syntax" + "sort" + "unicode" +) + +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// "One-pass" regexp execution. +// Some regexps can be analyzed to determine that they never need +// backtracking: they are guaranteed to run in one pass over the string +// without bothering to save all the usual NFA state. +// Detect those and execute them more quickly. + +// A onePassProg is a compiled one-pass regular expression program. +// It is the same as syntax.Prog except for the use of onePassInst. +type onePassProg struct { + Inst []onePassInst + Start int // index of start instruction + NumCap int // number of InstCapture insts in re +} + +// A onePassInst is a single instruction in a one-pass regular expression program. +// It is the same as syntax.Inst except for the new 'Next' field. +type onePassInst struct { + syntax.Inst + Next []uint32 +} + +// OnePassPrefix returns a literal string that all matches for the +// regexp must start with. Complete is true if the prefix +// is the entire match. Pc is the index of the last rune instruction +// in the string. The OnePassPrefix skips over the mandatory +// EmptyBeginText +func onePassPrefix(p *syntax.Prog) (prefix string, complete bool, pc uint32) { + i := &p.Inst[p.Start] + if i.Op != syntax.InstEmptyWidth || (syntax.EmptyOp(i.Arg))&syntax.EmptyBeginText == 0 { + return "", i.Op == syntax.InstMatch, uint32(p.Start) + } + pc = i.Out + i = &p.Inst[pc] + for i.Op == syntax.InstNop { + pc = i.Out + i = &p.Inst[pc] + } + // Avoid allocation of buffer if prefix is empty. + if iop(i) != syntax.InstRune || len(i.Rune) != 1 { + return "", i.Op == syntax.InstMatch, uint32(p.Start) + } + + // Have prefix; gather characters. + var buf bytes.Buffer + for iop(i) == syntax.InstRune && len(i.Rune) == 1 && syntax.Flags(i.Arg)&syntax.FoldCase == 0 { + buf.WriteRune(i.Rune[0]) + pc, i = i.Out, &p.Inst[i.Out] + } + return buf.String(), i.Op == syntax.InstEmptyWidth && (syntax.EmptyOp(i.Arg))&syntax.EmptyBeginText != 0, pc +} + +// OnePassNext selects the next actionable state of the prog, based on the input character. +// It should only be called when i.Op == InstAlt or InstAltMatch, and from the one-pass machine. +// One of the alternates may ultimately lead without input to end of line. If the instruction +// is InstAltMatch the path to the InstMatch is in i.Out, the normal node in i.Next. +func onePassNext(i *onePassInst, r rune) uint32 { + next := i.MatchRunePos(r) + if next >= 0 { + return i.Next[next] + } + if i.Op == syntax.InstAltMatch { + return i.Out + } + return 0 +} + +func iop(i *syntax.Inst) syntax.InstOp { + op := i.Op + switch op { + case syntax.InstRune1, syntax.InstRuneAny, syntax.InstRuneAnyNotNL: + op = syntax.InstRune + } + return op +} + +// Sparse Array implementation is used as a queueOnePass. +type queueOnePass struct { + sparse []uint32 + dense []uint32 + size, nextIndex uint32 +} + +func (q *queueOnePass) empty() bool { + return q.nextIndex >= q.size +} + +func (q *queueOnePass) next() (n uint32) { + n = q.dense[q.nextIndex] + q.nextIndex++ + return +} + +func (q *queueOnePass) clear() { + q.size = 0 + q.nextIndex = 0 +} + +func (q *queueOnePass) reset() { + q.nextIndex = 0 +} + +func (q *queueOnePass) contains(u uint32) bool { + if u >= uint32(len(q.sparse)) { + return false + } + return q.sparse[u] < q.size && q.dense[q.sparse[u]] == u +} + +func (q *queueOnePass) insert(u uint32) { + if !q.contains(u) { + q.insertNew(u) + } +} + +func (q *queueOnePass) insertNew(u uint32) { + if u >= uint32(len(q.sparse)) { + return + } + q.sparse[u] = q.size + q.dense[q.size] = u + q.size++ +} + +func newQueue(size int) (q *queueOnePass) { + return &queueOnePass{ + sparse: make([]uint32, size), + dense: make([]uint32, size), + } +} + +// mergeRuneSets merges two non-intersecting runesets, and returns the merged result, +// and a NextIp array. The idea is that if a rune matches the OnePassRunes at index +// i, NextIp[i/2] is the target. If the input sets intersect, an empty runeset and a +// NextIp array with the single element mergeFailed is returned. +// The code assumes that both inputs contain ordered and non-intersecting rune pairs. +const mergeFailed = uint32(0xffffffff) + +var ( + noRune = []rune{} + noNext = []uint32{mergeFailed} +) + +func mergeRuneSets(leftRunes, rightRunes *[]rune, leftPC, rightPC uint32) ([]rune, []uint32) { + leftLen := len(*leftRunes) + rightLen := len(*rightRunes) + if leftLen&0x1 != 0 || rightLen&0x1 != 0 { + panic("mergeRuneSets odd length []rune") + } + var ( + lx, rx int + ) + merged := make([]rune, 0) + next := make([]uint32, 0) + ok := true + defer func() { + if !ok { + merged = nil + next = nil + } + }() + + ix := -1 + extend := func(newLow *int, newArray *[]rune, pc uint32) bool { + if ix > 0 && (*newArray)[*newLow] <= merged[ix] { + return false + } + merged = append(merged, (*newArray)[*newLow], (*newArray)[*newLow+1]) + *newLow += 2 + ix += 2 + next = append(next, pc) + return true + } + + for lx < leftLen || rx < rightLen { + switch { + case rx >= rightLen: + ok = extend(&lx, leftRunes, leftPC) + case lx >= leftLen: + ok = extend(&rx, rightRunes, rightPC) + case (*rightRunes)[rx] < (*leftRunes)[lx]: + ok = extend(&rx, rightRunes, rightPC) + default: + ok = extend(&lx, leftRunes, leftPC) + } + if !ok { + return noRune, noNext + } + } + return merged, next +} + +// cleanupOnePass drops working memory, and restores certain shortcut instructions. +func cleanupOnePass(prog *onePassProg, original *syntax.Prog) { + for ix, instOriginal := range original.Inst { + switch instOriginal.Op { + case syntax.InstAlt, syntax.InstAltMatch, syntax.InstRune: + case syntax.InstCapture, syntax.InstEmptyWidth, syntax.InstNop, syntax.InstMatch, syntax.InstFail: + prog.Inst[ix].Next = nil + case syntax.InstRune1, syntax.InstRuneAny, syntax.InstRuneAnyNotNL: + prog.Inst[ix].Next = nil + prog.Inst[ix] = onePassInst{Inst: instOriginal} + } + } +} + +// onePassCopy creates a copy of the original Prog, as we'll be modifying it +func onePassCopy(prog *syntax.Prog) *onePassProg { + p := &onePassProg{ + Start: prog.Start, + NumCap: prog.NumCap, + } + for _, inst := range prog.Inst { + p.Inst = append(p.Inst, onePassInst{Inst: inst}) + } + + // rewrites one or more common Prog constructs that enable some otherwise + // non-onepass Progs to be onepass. A:BD (for example) means an InstAlt at + // ip A, that points to ips B & C. + // A:BC + B:DA => A:BC + B:CD + // A:BC + B:DC => A:DC + B:DC + for pc := range p.Inst { + switch p.Inst[pc].Op { + default: + continue + case syntax.InstAlt, syntax.InstAltMatch: + // A:Bx + B:Ay + p_A_Other := &p.Inst[pc].Out + p_A_Alt := &p.Inst[pc].Arg + // make sure a target is another Alt + instAlt := p.Inst[*p_A_Alt] + if !(instAlt.Op == syntax.InstAlt || instAlt.Op == syntax.InstAltMatch) { + p_A_Alt, p_A_Other = p_A_Other, p_A_Alt + instAlt = p.Inst[*p_A_Alt] + if !(instAlt.Op == syntax.InstAlt || instAlt.Op == syntax.InstAltMatch) { + continue + } + } + instOther := p.Inst[*p_A_Other] + // Analyzing both legs pointing to Alts is for another day + if instOther.Op == syntax.InstAlt || instOther.Op == syntax.InstAltMatch { + // too complicated + continue + } + // simple empty transition loop + // A:BC + B:DA => A:BC + B:DC + p_B_Alt := &p.Inst[*p_A_Alt].Out + p_B_Other := &p.Inst[*p_A_Alt].Arg + patch := false + if instAlt.Out == uint32(pc) { + patch = true + } else if instAlt.Arg == uint32(pc) { + patch = true + p_B_Alt, p_B_Other = p_B_Other, p_B_Alt + } + if patch { + *p_B_Alt = *p_A_Other + } + + // empty transition to common target + // A:BC + B:DC => A:DC + B:DC + if *p_A_Other == *p_B_Alt { + *p_A_Alt = *p_B_Other + } + } + } + return p +} + +// runeSlice exists to permit sorting the case-folded rune sets. +type runeSlice []rune + +func (p runeSlice) Len() int { return len(p) } +func (p runeSlice) Less(i, j int) bool { return p[i] < p[j] } +func (p runeSlice) Swap(i, j int) { p[i], p[j] = p[j], p[i] } + +// Sort is a convenience method. +func (p runeSlice) Sort() { + sort.Sort(p) +} + +var anyRuneNotNL = []rune{0, '\n' - 1, '\n' + 1, unicode.MaxRune} +var anyRune = []rune{0, unicode.MaxRune} + +// makeOnePass creates a onepass Prog, if possible. It is possible if at any alt, +// the match engine can always tell which branch to take. The routine may modify +// p if it is turned into a onepass Prog. If it isn't possible for this to be a +// onepass Prog, the Prog notOnePass is returned. makeOnePass is recursive +// to the size of the Prog. +func makeOnePass(p *onePassProg) *onePassProg { + // If the machine is very long, it's not worth the time to check if we can use one pass. + if len(p.Inst) >= 1000 { + return notOnePass + } + + var ( + instQueue = newQueue(len(p.Inst)) + visitQueue = newQueue(len(p.Inst)) + build func(uint32, *queueOnePass) + check func(uint32, map[uint32]bool) bool + onePassRunes = make([][]rune, len(p.Inst)) + ) + build = func(pc uint32, q *queueOnePass) { + if q.contains(pc) { + return + } + inst := p.Inst[pc] + switch inst.Op { + case syntax.InstAlt, syntax.InstAltMatch: + q.insert(inst.Out) + build(inst.Out, q) + q.insert(inst.Arg) + case syntax.InstMatch, syntax.InstFail: + default: + q.insert(inst.Out) + } + } + + // check that paths from Alt instructions are unambiguous, and rebuild the new + // program as a onepass program + check = func(pc uint32, m map[uint32]bool) (ok bool) { + ok = true + inst := &p.Inst[pc] + if visitQueue.contains(pc) { + return + } + visitQueue.insert(pc) + switch inst.Op { + case syntax.InstAlt, syntax.InstAltMatch: + ok = check(inst.Out, m) && check(inst.Arg, m) + // check no-input paths to InstMatch + matchOut := m[inst.Out] + matchArg := m[inst.Arg] + if matchOut && matchArg { + ok = false + break + } + // Match on empty goes in inst.Out + if matchArg { + inst.Out, inst.Arg = inst.Arg, inst.Out + matchOut, matchArg = matchArg, matchOut + } + if matchOut { + m[pc] = true + inst.Op = syntax.InstAltMatch + } + + // build a dispatch operator from the two legs of the alt. + onePassRunes[pc], inst.Next = mergeRuneSets( + &onePassRunes[inst.Out], &onePassRunes[inst.Arg], inst.Out, inst.Arg) + if len(inst.Next) > 0 && inst.Next[0] == mergeFailed { + ok = false + break + } + case syntax.InstCapture, syntax.InstNop: + ok = check(inst.Out, m) + m[pc] = m[inst.Out] + // pass matching runes back through these no-ops. + onePassRunes[pc] = append([]rune{}, onePassRunes[inst.Out]...) + inst.Next = []uint32{} + for i := len(onePassRunes[pc]) / 2; i >= 0; i-- { + inst.Next = append(inst.Next, inst.Out) + } + case syntax.InstEmptyWidth: + ok = check(inst.Out, m) + m[pc] = m[inst.Out] + onePassRunes[pc] = append([]rune{}, onePassRunes[inst.Out]...) + inst.Next = []uint32{} + for i := len(onePassRunes[pc]) / 2; i >= 0; i-- { + inst.Next = append(inst.Next, inst.Out) + } + case syntax.InstMatch, syntax.InstFail: + m[pc] = inst.Op == syntax.InstMatch + break + case syntax.InstRune: + ok = check(inst.Out, m) + m[pc] = false + if len(inst.Next) > 0 { + break + } + if len(inst.Rune) == 0 { + onePassRunes[pc] = []rune{} + inst.Next = []uint32{inst.Out} + break + } + runes := make([]rune, 0) + if len(inst.Rune) == 1 && syntax.Flags(inst.Arg)&syntax.FoldCase != 0 { + r0 := inst.Rune[0] + runes = append(runes, r0, r0) + for r1 := unicode.SimpleFold(r0); r1 != r0; r1 = unicode.SimpleFold(r1) { + runes = append(runes, r1, r1) + } + sort.Sort(runeSlice(runes)) + } else { + runes = append(runes, inst.Rune...) + } + onePassRunes[pc] = runes + inst.Next = []uint32{} + for i := len(onePassRunes[pc]) / 2; i >= 0; i-- { + inst.Next = append(inst.Next, inst.Out) + } + inst.Op = syntax.InstRune + case syntax.InstRune1: + ok = check(inst.Out, m) + m[pc] = false + if len(inst.Next) > 0 { + break + } + runes := []rune{} + // expand case-folded runes + if syntax.Flags(inst.Arg)&syntax.FoldCase != 0 { + r0 := inst.Rune[0] + runes = append(runes, r0, r0) + for r1 := unicode.SimpleFold(r0); r1 != r0; r1 = unicode.SimpleFold(r1) { + runes = append(runes, r1, r1) + } + sort.Sort(runeSlice(runes)) + } else { + runes = append(runes, inst.Rune[0], inst.Rune[0]) + } + onePassRunes[pc] = runes + inst.Next = []uint32{} + for i := len(onePassRunes[pc]) / 2; i >= 0; i-- { + inst.Next = append(inst.Next, inst.Out) + } + inst.Op = syntax.InstRune + case syntax.InstRuneAny: + ok = check(inst.Out, m) + m[pc] = false + if len(inst.Next) > 0 { + break + } + onePassRunes[pc] = append([]rune{}, anyRune...) + inst.Next = []uint32{inst.Out} + case syntax.InstRuneAnyNotNL: + ok = check(inst.Out, m) + m[pc] = false + if len(inst.Next) > 0 { + break + } + onePassRunes[pc] = append([]rune{}, anyRuneNotNL...) + inst.Next = []uint32{} + for i := len(onePassRunes[pc]) / 2; i >= 0; i-- { + inst.Next = append(inst.Next, inst.Out) + } + } + return + } + + instQueue.clear() + instQueue.insert(uint32(p.Start)) + m := make(map[uint32]bool, len(p.Inst)) + for !instQueue.empty() { + pc := instQueue.next() + inst := p.Inst[pc] + visitQueue.clear() + if !check(uint32(pc), m) { + p = notOnePass + break + } + switch inst.Op { + case syntax.InstAlt, syntax.InstAltMatch: + instQueue.insert(inst.Out) + instQueue.insert(inst.Arg) + case syntax.InstCapture, syntax.InstEmptyWidth, syntax.InstNop: + instQueue.insert(inst.Out) + case syntax.InstMatch: + case syntax.InstFail: + case syntax.InstRune, syntax.InstRune1, syntax.InstRuneAny, syntax.InstRuneAnyNotNL: + default: + } + } + if p != notOnePass { + for i, _ := range p.Inst { + p.Inst[i].Rune = onePassRunes[i] + } + } + return p +} + +// walk visits each Inst in the prog once, and applies the argument +// function(ip, next), in pre-order. +func walk(prog *syntax.Prog, funcs ...func(ip, next uint32)) { + var walk1 func(uint32) + progQueue := newQueue(len(prog.Inst)) + walk1 = func(ip uint32) { + if progQueue.contains(ip) { + return + } + progQueue.insert(ip) + inst := prog.Inst[ip] + switch inst.Op { + case syntax.InstAlt, syntax.InstAltMatch: + for _, f := range funcs { + f(ip, inst.Out) + f(ip, inst.Arg) + } + walk1(inst.Out) + walk1(inst.Arg) + default: + for _, f := range funcs { + f(ip, inst.Out) + } + walk1(inst.Out) + } + } + walk1(uint32(prog.Start)) +} + +// find returns the Insts that match the argument predicate function +func find(prog *syntax.Prog, f func(*syntax.Prog, int) bool) (matches []uint32) { + matches = []uint32{} + + for ip := range prog.Inst { + if f(prog, ip) { + matches = append(matches, uint32(ip)) + } + } + return +} + +var notOnePass *onePassProg = nil + +// compileOnePass returns a new *syntax.Prog suitable for onePass execution if the original Prog +// can be recharacterized as a one-pass regexp program, or syntax.notOnePass if the +// Prog cannot be converted. For a one pass prog, the fundamental condition that must +// be true is: at any InstAlt, there must be no ambiguity about what branch to take. +func compileOnePass(prog *syntax.Prog) (p *onePassProg) { + if prog.Start == 0 { + return notOnePass + } + // onepass regexp is anchored + if prog.Inst[prog.Start].Op != syntax.InstEmptyWidth || + syntax.EmptyOp(prog.Inst[prog.Start].Arg)&syntax.EmptyBeginText != syntax.EmptyBeginText { + return notOnePass + } + // every instruction leading to InstMatch must be EmptyEndText + for _, inst := range prog.Inst { + opOut := prog.Inst[inst.Out].Op + switch inst.Op { + default: + if opOut == syntax.InstMatch { + return notOnePass + } + case syntax.InstAlt, syntax.InstAltMatch: + if opOut == syntax.InstMatch || prog.Inst[inst.Arg].Op == syntax.InstMatch { + return notOnePass + } + case syntax.InstEmptyWidth: + if opOut == syntax.InstMatch { + if syntax.EmptyOp(inst.Arg)&syntax.EmptyEndText == syntax.EmptyEndText { + continue + } + return notOnePass + } + } + } + // Creates a slightly optimized copy of the original Prog + // that cleans up some Prog idioms that block valid onepass programs + p = onePassCopy(prog) + + // checkAmbiguity on InstAlts, build onepass Prog if possible + p = makeOnePass(p) + + if p != notOnePass { + cleanupOnePass(p, prog) + } + return p +} diff --git a/src/pkg/regexp/onepass_test.go b/src/pkg/regexp/onepass_test.go new file mode 100644 index 0000000000..7b2beea67f --- /dev/null +++ b/src/pkg/regexp/onepass_test.go @@ -0,0 +1,208 @@ +// Copyright 2014 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package regexp + +import ( + "reflect" + "regexp/syntax" + "testing" +) + +var runeMergeTests = []struct { + left, right, merged []rune + next []uint32 + leftPC, rightPC uint32 +}{ + { + // empty rhs + []rune{69, 69}, + []rune{}, + []rune{69, 69}, + []uint32{1}, + 1, 2, + }, + { + // identical runes, identical targets + []rune{69, 69}, + []rune{69, 69}, + []rune{}, + []uint32{mergeFailed}, + 1, 1, + }, + { + // identical runes, different targets + []rune{69, 69}, + []rune{69, 69}, + []rune{}, + []uint32{mergeFailed}, + 1, 2, + }, + { + // append right-first + []rune{69, 69}, + []rune{71, 71}, + []rune{69, 69, 71, 71}, + []uint32{1, 2}, + 1, 2, + }, + { + // append, left-first + []rune{71, 71}, + []rune{69, 69}, + []rune{69, 69, 71, 71}, + []uint32{2, 1}, + 1, 2, + }, + { + // successful interleave + []rune{60, 60, 71, 71, 101, 101}, + []rune{69, 69, 88, 88}, + []rune{60, 60, 69, 69, 71, 71, 88, 88, 101, 101}, + []uint32{1, 2, 1, 2, 1}, + 1, 2, + }, + { + // left surrounds right + []rune{69, 74}, + []rune{71, 71}, + []rune{}, + []uint32{mergeFailed}, + 1, 2, + }, + { + // right surrounds left + []rune{69, 74}, + []rune{68, 75}, + []rune{}, + []uint32{mergeFailed}, + 1, 2, + }, + { + // overlap at interval begin + []rune{69, 74}, + []rune{74, 75}, + []rune{}, + []uint32{mergeFailed}, + 1, 2, + }, + { + // overlap ar interval end + []rune{69, 74}, + []rune{65, 69}, + []rune{}, + []uint32{mergeFailed}, + 1, 2, + }, + { + // overlap from above + []rune{69, 74}, + []rune{71, 74}, + []rune{}, + []uint32{mergeFailed}, + 1, 2, + }, + { + // overlap from below + []rune{69, 74}, + []rune{65, 71}, + []rune{}, + []uint32{mergeFailed}, + 1, 2, + }, + { + // out of order []rune + []rune{69, 74, 60, 65}, + []rune{66, 67}, + []rune{}, + []uint32{mergeFailed}, + 1, 2, + }, +} + +func TestMergeRuneSet(t *testing.T) { + for ix, test := range runeMergeTests { + merged, next := mergeRuneSets(&test.left, &test.right, test.leftPC, test.rightPC) + if !reflect.DeepEqual(merged, test.merged) { + t.Errorf("mergeRuneSet :%d (%v, %v) merged\n have\n%v\nwant\n%v", ix, test.left, test.right, merged, test.merged) + } + if !reflect.DeepEqual(next, test.next) { + t.Errorf("mergeRuneSet :%d(%v, %v) next\n have\n%v\nwant\n%v", ix, test.left, test.right, next, test.next) + } + } +} + +const noStr = `!` + +var onePass = &onePassProg{} + +var onePassTests = []struct { + re string + onePass *onePassProg + prog string +}{ + {`^(?:a|(?:a*))$`, notOnePass, noStr}, + {`^(?:(a)|(?:a*))$`, notOnePass, noStr}, + {`^(?:(?:(?:.(?:$))?))$`, onePass, `a`}, + {`^abcd$`, onePass, `abcd`}, + {`^abcd$`, onePass, `abcde`}, + {`^(?:(?:a{0,})*?)$`, onePass, `a`}, + {`^(?:(?:a+)*)$`, onePass, ``}, + {`^(?:(?:a|(?:aa)))$`, onePass, ``}, + {`^(?:[^\s\S])$`, onePass, ``}, + {`^(?:(?:a{3,4}){0,})$`, notOnePass, `aaaaaa`}, + {`^(?:(?:a+)*)$`, onePass, `a`}, + {`^(?:(?:(?:a*)+))$`, onePass, noStr}, + {`^(?:(?:a+)*)$`, onePass, ``}, + {`^[a-c]+$`, onePass, `abc`}, + {`^[a-c]*$`, onePass, `abcdabc`}, + {`^(?:a*)$`, onePass, `aaaaaaa`}, + {`^(?:(?:aa)|a)$`, onePass, `a`}, + {`^[a-c]*`, notOnePass, `abcdabc`}, + {`^[a-c]*$`, onePass, `abc`}, + {`^...$`, onePass, ``}, + {`^(?:a|(?:aa))$`, onePass, `a`}, + {`^[a-c]*`, notOnePass, `abcabc`}, + {`^a((b))c$`, onePass, noStr}, + {`^a.[l-nA-Cg-j]?e$`, onePass, noStr}, + {`^a((b))$`, onePass, noStr}, + {`^a(?:(b)|(c))c$`, onePass, noStr}, + {`^a(?:(b*)|(c))c$`, notOnePass, noStr}, + {`^a(?:b|c)$`, onePass, noStr}, + {`^a(?:b?|c)$`, onePass, noStr}, + {`^a(?:b?|c?)$`, notOnePass, noStr}, + {`^a(?:b?|c+)$`, onePass, noStr}, + {`^a(?:b+|(bc))d$`, notOnePass, noStr}, + {`^a(?:bc)+$`, onePass, noStr}, + {`^a(?:[bcd])+$`, onePass, noStr}, + {`^a((?:[bcd])+)$`, onePass, noStr}, + {`^a(:?b|c)*d$`, onePass, `abbbccbbcbbd"`}, + {`^.bc(d|e)*$`, onePass, `abcddddddeeeededd`}, + {`^(?:(?:aa)|.)$`, notOnePass, `a`}, + {`^(?:(?:a{1,2}){1,2})$`, notOnePass, `aaaa`}, +} + +func TestCompileOnePass(t *testing.T) { + var ( + p *syntax.Prog + re *syntax.Regexp + err error + ) + for _, test := range onePassTests { + if re, err = syntax.Parse(test.re, syntax.Perl); err != nil { + t.Errorf("Parse(%q) got err:%s, want success", test.re, err) + continue + } + // needs to be done before compile... + re = re.Simplify() + if p, err = syntax.Compile(re); err != nil { + t.Errorf("Compile(%q) got err:%s, want success", test.re, err) + continue + } + onePass = compileOnePass(p) + if (onePass == notOnePass) != (test.onePass == notOnePass) { + t.Errorf("CompileOnePass(%q) got %v, expected %v", test.re, onePass, test.onePass) + } + } +} diff --git a/src/pkg/regexp/regexp.go b/src/pkg/regexp/regexp.go index cca21e750e..0b8336a04f 100644 --- a/src/pkg/regexp/regexp.go +++ b/src/pkg/regexp/regexp.go @@ -83,7 +83,7 @@ type Regexp struct { // read-only after Compile expr string // as passed to Compile prog *syntax.Prog // compiled program - onepass *syntax.Prog // onpass program or nil + onepass *onePassProg // onpass program or nil prefix string // required prefix in unanchored matches prefixBytes []byte // prefix, as a []byte prefixComplete bool // prefix is the entire regexp @@ -165,16 +165,16 @@ func compile(expr string, mode syntax.Flags, longest bool) (*Regexp, error) { regexp := &Regexp{ expr: expr, prog: prog, - onepass: prog.CompileOnePass(), + onepass: compileOnePass(prog), numSubexp: maxCap, subexpNames: capNames, cond: prog.StartCond(), longest: longest, } - if regexp.onepass == syntax.NotOnePass { + if regexp.onepass == notOnePass { regexp.prefix, regexp.prefixComplete = prog.Prefix() } else { - regexp.prefix, regexp.prefixComplete, regexp.prefixEnd = prog.OnePassPrefix() + regexp.prefix, regexp.prefixComplete, regexp.prefixEnd = onePassPrefix(prog) } if regexp.prefix != "" { // TODO(rsc): Remove this allocation by adding diff --git a/src/pkg/regexp/syntax/prog.go b/src/pkg/regexp/syntax/prog.go index c1d6a121f0..29bd282d0d 100644 --- a/src/pkg/regexp/syntax/prog.go +++ b/src/pkg/regexp/syntax/prog.go @@ -6,7 +6,6 @@ package syntax import ( "bytes" - "sort" "strconv" "unicode" ) @@ -115,7 +114,6 @@ type Inst struct { Out uint32 // all but InstMatch, InstFail Arg uint32 // InstAlt, InstAltMatch, InstCapture, InstEmptyWidth Rune []rune - Next []uint32 // If input rune matches } func (p *Prog) String() string { @@ -165,36 +163,6 @@ func (p *Prog) Prefix() (prefix string, complete bool) { return buf.String(), i.Op == InstMatch } -// OnePassPrefix returns a literal string that all matches for the -// regexp must start with. Complete is true if the prefix -// is the entire match. Pc is the index of the last rune instruction -// in the string. The OnePassPrefix skips over the mandatory -// EmptyBeginText -func (p *Prog) OnePassPrefix() (prefix string, complete bool, pc uint32) { - i := &p.Inst[p.Start] - if i.Op != InstEmptyWidth || (EmptyOp(i.Arg))&EmptyBeginText == 0 { - return "", i.Op == InstMatch, uint32(p.Start) - } - pc = i.Out - i = &p.Inst[pc] - for i.Op == InstNop { - pc = i.Out - i = &p.Inst[pc] - } - // Avoid allocation of buffer if prefix is empty. - if i.op() != InstRune || len(i.Rune) != 1 { - return "", i.Op == InstMatch, uint32(p.Start) - } - - // Have prefix; gather characters. - var buf bytes.Buffer - for i.op() == InstRune && len(i.Rune) == 1 && Flags(i.Arg)&FoldCase == 0 { - buf.WriteRune(i.Rune[0]) - pc, i = i.Out, &p.Inst[i.Out] - } - return buf.String(), i.Op == InstEmptyWidth && (EmptyOp(i.Arg))&EmptyBeginText != 0, pc -} - // StartCond returns the leading empty-width conditions that must // be true in any match. It returns ^EmptyOp(0) if no matches are possible. func (p *Prog) StartCond() EmptyOp { @@ -221,29 +189,17 @@ Loop: const noMatch = -1 -// OnePassNext selects the next actionable state of the prog, based on the input character. -// It should only be called when i.Op == InstAlt or InstAltMatch, and from the one-pass machine. -// One of the alternates may ultimately lead without input to end of line. If the instruction -// is InstAltMatch the path to the InstMatch is in i.Out, the normal node in i.Next. -func (i *Inst) OnePassNext(r rune) uint32 { - next := i.MatchRunePos(r) - if next != noMatch { - return i.Next[next] - } - if i.Op == InstAltMatch { - return i.Out - } - return 0 -} - // MatchRune returns true if the instruction matches (and consumes) r. // It should only be called when i.Op == InstRune. func (i *Inst) MatchRune(r rune) bool { return i.MatchRunePos(r) != noMatch } -// MatchRunePos returns the index of the rune pair if the instruction matches. -// It should only be called when i.Op == InstRune. +// MatchRunePos checks whether the instruction matches (and consumes) r. +// If so, MatchRunePos returns the index of the matching rune pair +// (or, when len(i.Rune) == 1, rune singleton). +// If not, MatchRunePos returns -1. +// MatchRunePos should only be called when i.Op == InstRune. func (i *Inst) MatchRunePos(r rune) int { rune := i.Rune @@ -387,496 +343,3 @@ func dumpInst(b *bytes.Buffer, i *Inst) { bw(b, "anynotnl -> ", u32(i.Out)) } } - -// Sparse Array implementation is used as a queue. -type queue struct { - sparse []uint32 - dense []uint32 - size, nextIndex uint32 -} - -func (q *queue) empty() bool { - return q.nextIndex >= q.size -} - -func (q *queue) next() (n uint32) { - n = q.dense[q.nextIndex] - q.nextIndex++ - return -} - -func (q *queue) clear() { - q.size = 0 - q.nextIndex = 0 -} - -func (q *queue) reset() { - q.nextIndex = 0 -} - -func (q *queue) contains(u uint32) bool { - if u >= uint32(len(q.sparse)) { - return false - } - return q.sparse[u] < q.size && q.dense[q.sparse[u]] == u -} - -func (q *queue) insert(u uint32) { - if !q.contains(u) { - q.insertNew(u) - } -} - -func (q *queue) insertNew(u uint32) { - if u >= uint32(len(q.sparse)) { - return - } - q.sparse[u] = q.size - q.dense[q.size] = u - q.size++ -} - -func newQueue(size int) (q *queue) { - return &queue{ - sparse: make([]uint32, size), - dense: make([]uint32, size), - } -} - -// mergeRuneSets merges two non-intersecting runesets, and returns the merged result, -// and a NextIp array. The idea is that if a rune matches the OnePassRunes at index -// i, NextIp[i/2] is the target. If the input sets intersect, an empty runeset and a -// NextIp array with the single element mergeFailed is returned. -// The code assumes that both inputs contain ordered and non-intersecting rune pairs. -const mergeFailed = uint32(0xffffffff) - -var ( - noRune = []rune{} - noNext = []uint32{mergeFailed} -) - -func mergeRuneSets(leftRunes, rightRunes *[]rune, leftPC, rightPC uint32) ([]rune, []uint32) { - leftLen := len(*leftRunes) - rightLen := len(*rightRunes) - if leftLen&0x1 != 0 || rightLen&0x1 != 0 { - panic("mergeRuneSets odd length []rune") - } - var ( - lx, rx int - ) - merged := make([]rune, 0) - next := make([]uint32, 0) - ok := true - defer func() { - if !ok { - merged = nil - next = nil - } - }() - - ix := -1 - extend := func(newLow *int, newArray *[]rune, pc uint32) bool { - if ix > 0 && (*newArray)[*newLow] <= merged[ix] { - return false - } - merged = append(merged, (*newArray)[*newLow], (*newArray)[*newLow+1]) - *newLow += 2 - ix += 2 - next = append(next, pc) - return true - } - - for lx < leftLen || rx < rightLen { - switch { - case rx >= rightLen: - ok = extend(&lx, leftRunes, leftPC) - case lx >= leftLen: - ok = extend(&rx, rightRunes, rightPC) - case (*rightRunes)[rx] < (*leftRunes)[lx]: - ok = extend(&rx, rightRunes, rightPC) - default: - ok = extend(&lx, leftRunes, leftPC) - } - if !ok { - return noRune, noNext - } - } - return merged, next -} - -// cleanupOnePass drops working memory, and restores certain shortcut instructions. -func (prog *Prog) cleanupOnePass(pOriginal *Prog) { - for ix, instOriginal := range pOriginal.Inst { - switch instOriginal.Op { - case InstAlt, InstAltMatch, InstRune: - case InstCapture, InstEmptyWidth, InstNop, InstMatch, InstFail: - prog.Inst[ix].Next = nil - case InstRune1, InstRuneAny, InstRuneAnyNotNL: - prog.Inst[ix].Next = nil - prog.Inst[ix] = instOriginal - } - } -} - -// onePassCopy creates a copy of the original Prog, as we'll be modifying it -func (prog *Prog) onePassCopy() *Prog { - p := &Prog{ - Inst: append([]Inst{}[:], prog.Inst...), - Start: prog.Start, - NumCap: prog.NumCap, - } - for _, inst := range p.Inst { - inst.Next = make([]uint32, 0) - } - - // rewrites one or more common Prog constructs that enable some otherwise - // non-onepass Progs to be onepass. A:BD (for example) means an InstAlt at - // ip A, that points to ips B & C. - // A:BC + B:DA => A:BC + B:CD - // A:BC + B:DC => A:DC + B:DC - for pc := range p.Inst { - switch p.Inst[pc].Op { - default: - continue - case InstAlt, InstAltMatch: - // A:Bx + B:Ay - p_A_Other := &p.Inst[pc].Out - p_A_Alt := &p.Inst[pc].Arg - // make sure a target is another Alt - instAlt := p.Inst[*p_A_Alt] - if !(instAlt.Op == InstAlt || instAlt.Op == InstAltMatch) { - p_A_Alt, p_A_Other = p_A_Other, p_A_Alt - instAlt = p.Inst[*p_A_Alt] - if !(instAlt.Op == InstAlt || instAlt.Op == InstAltMatch) { - continue - } - } - instOther := p.Inst[*p_A_Other] - // Analyzing both legs pointing to Alts is for another day - if instOther.Op == InstAlt || instOther.Op == InstAltMatch { - // too complicated - continue - } - // simple empty transition loop - // A:BC + B:DA => A:BC + B:DC - p_B_Alt := &p.Inst[*p_A_Alt].Out - p_B_Other := &p.Inst[*p_A_Alt].Arg - patch := false - if instAlt.Out == uint32(pc) { - patch = true - } else if instAlt.Arg == uint32(pc) { - patch = true - p_B_Alt, p_B_Other = p_B_Other, p_B_Alt - } - if patch { - *p_B_Alt = *p_A_Other - } - - // empty transition to common target - // A:BC + B:DC => A:DC + B:DC - if *p_A_Other == *p_B_Alt { - *p_A_Alt = *p_B_Other - } - } - } - return p -} - -// runeSlice exists to permit sorting the case-folded rune sets. -type runeSlice []rune - -func (p runeSlice) Len() int { return len(p) } -func (p runeSlice) Less(i, j int) bool { return p[i] < p[j] } -func (p runeSlice) Swap(i, j int) { p[i], p[j] = p[j], p[i] } - -// Sort is a convenience method. -func (p runeSlice) Sort() { - sort.Sort(p) -} - -// makeOnePass creates a onepass Prog, if possible. It is possible if at any alt, -// the match engine can always tell which branch to take. The routine may modify -// p if it is turned into a onepass Prog. If it isn't possible for this to be a -// onepass Prog, the Prog syntax.NotOnePass is returned. makeOnePass is recursive -// to the size of the Prog -func (p *Prog) makeOnePass() *Prog { - // If the machine is very long, it's not worth the time to check if we can use one pass. - if len(p.Inst) >= 1000 { - return NotOnePass - } - - var ( - instQueue = newQueue(len(p.Inst)) - visitQueue = newQueue(len(p.Inst)) - build func(uint32, *queue) - check func(uint32, map[uint32]bool) bool - onePassRunes = make([][]rune, len(p.Inst)) - ) - build = func(pc uint32, q *queue) { - if q.contains(pc) { - return - } - inst := p.Inst[pc] - switch inst.Op { - case InstAlt, InstAltMatch: - q.insert(inst.Out) - build(inst.Out, q) - q.insert(inst.Arg) - case InstMatch, InstFail: - default: - q.insert(inst.Out) - } - } - - // check that paths from Alt instructions are unambiguous, and rebuild the new - // program as a onepass program - check = func(pc uint32, m map[uint32]bool) (ok bool) { - ok = true - inst := &p.Inst[pc] - if visitQueue.contains(pc) { - return - } - visitQueue.insert(pc) - switch inst.Op { - case InstAlt, InstAltMatch: - ok = check(inst.Out, m) && check(inst.Arg, m) - // check no-input paths to InstMatch - matchOut := m[inst.Out] - matchArg := m[inst.Arg] - if matchOut && matchArg { - ok = false - break - } - // Match on empty goes in inst.Out - if matchArg { - inst.Out, inst.Arg = inst.Arg, inst.Out - matchOut, matchArg = matchArg, matchOut - } - if matchOut { - m[pc] = true - inst.Op = InstAltMatch - } - - // build a dispatch operator from the two legs of the alt. - onePassRunes[pc], inst.Next = mergeRuneSets( - &onePassRunes[inst.Out], &onePassRunes[inst.Arg], inst.Out, inst.Arg) - if len(inst.Next) > 0 && inst.Next[0] == mergeFailed { - ok = false - break - } - case InstCapture, InstNop: - ok = check(inst.Out, m) - m[pc] = m[inst.Out] - // pass matching runes back through these no-ops. - onePassRunes[pc] = append([]rune{}[:], onePassRunes[inst.Out][:]...) - inst.Next = []uint32{} - for i := len(onePassRunes[pc]) / 2; i >= 0; i-- { - inst.Next = append(inst.Next, inst.Out) - } - case InstEmptyWidth: - ok = check(inst.Out, m) - m[pc] = m[inst.Out] - onePassRunes[pc] = append([]rune{}[:], onePassRunes[inst.Out][:]...) - inst.Next = []uint32{} - for i := len(onePassRunes[pc]) / 2; i >= 0; i-- { - inst.Next = append(inst.Next, inst.Out) - } - case InstMatch, InstFail: - m[pc] = inst.Op == InstMatch - break - case InstRune: - ok = check(inst.Out, m) - m[pc] = false - if len(inst.Next) > 0 { - break - } - if len(inst.Rune) == 0 { - onePassRunes[pc] = []rune{}[:] - inst.Next = []uint32{inst.Out} - break - } - runes := make([]rune, 0) - if len(inst.Rune) == 1 && Flags(inst.Arg)&FoldCase != 0 { - r0 := inst.Rune[0] - runes = append(runes, r0, r0) - for r1 := unicode.SimpleFold(r0); r1 != r0; r1 = unicode.SimpleFold(r1) { - runes = append(runes, r1, r1) - } - sort.Sort(runeSlice(runes)) - } else { - runes = append(runes, inst.Rune...) - } - onePassRunes[pc] = runes - inst.Next = []uint32{} - for i := len(onePassRunes[pc]) / 2; i >= 0; i-- { - inst.Next = append(inst.Next, inst.Out) - } - inst.Op = InstRune - case InstRune1: - ok = check(inst.Out, m) - m[pc] = false - if len(inst.Next) > 0 { - break - } - runes := []rune{}[:] - // expand case-folded runes - if Flags(inst.Arg)&FoldCase != 0 { - r0 := inst.Rune[0] - runes = append(runes, r0, r0) - for r1 := unicode.SimpleFold(r0); r1 != r0; r1 = unicode.SimpleFold(r1) { - runes = append(runes, r1, r1) - } - sort.Sort(runeSlice(runes)) - } else { - runes = append(runes, inst.Rune[0], inst.Rune[0]) - } - onePassRunes[pc] = runes - inst.Next = []uint32{} - for i := len(onePassRunes[pc]) / 2; i >= 0; i-- { - inst.Next = append(inst.Next, inst.Out) - } - inst.Op = InstRune - case InstRuneAny: - ok = check(inst.Out, m) - m[pc] = false - if len(inst.Next) > 0 { - break - } - onePassRunes[pc] = append([]rune{}[:], anyRune[:]...) - inst.Next = []uint32{inst.Out} - case InstRuneAnyNotNL: - ok = check(inst.Out, m) - m[pc] = false - if len(inst.Next) > 0 { - break - } - onePassRunes[pc] = append([]rune{}[:], anyRuneNotNL[:]...) - inst.Next = []uint32{} - for i := len(onePassRunes[pc]) / 2; i >= 0; i-- { - inst.Next = append(inst.Next, inst.Out) - } - } - return - } - - instQueue.clear() - instQueue.insert(uint32(p.Start)) - m := make(map[uint32]bool, len(p.Inst)) - for !instQueue.empty() { - pc := instQueue.next() - inst := p.Inst[pc] - visitQueue.clear() - if !check(uint32(pc), m) { - p = NotOnePass - break - } - switch inst.Op { - case InstAlt, InstAltMatch: - instQueue.insert(inst.Out) - instQueue.insert(inst.Arg) - case InstCapture, InstEmptyWidth, InstNop: - instQueue.insert(inst.Out) - case InstMatch: - case InstFail: - case InstRune, InstRune1, InstRuneAny, InstRuneAnyNotNL: - default: - } - } - if p != NotOnePass { - for i, _ := range p.Inst { - p.Inst[i].Rune = onePassRunes[i][:] - } - } - return p -} - -// walk visits each Inst in the prog once, and applies the argument -// function(ip, next), in pre-order. -func (prog *Prog) walk(funcs ...func(ip, next uint32)) { - var walk1 func(uint32) - progQueue := newQueue(len(prog.Inst)) - walk1 = func(ip uint32) { - if progQueue.contains(ip) { - return - } - progQueue.insert(ip) - inst := prog.Inst[ip] - switch inst.Op { - case InstAlt, InstAltMatch: - for _, f := range funcs { - f(ip, inst.Out) - f(ip, inst.Arg) - } - walk1(inst.Out) - walk1(inst.Arg) - default: - for _, f := range funcs { - f(ip, inst.Out) - } - walk1(inst.Out) - } - } - walk1(uint32(prog.Start)) -} - -// find returns the Insts that match the argument predicate function -func (prog *Prog) find(f func(*Prog, int) bool) (matches []uint32) { - matches = []uint32{} - - for ip := range prog.Inst { - if f(prog, ip) { - matches = append(matches, uint32(ip)) - } - } - return -} - -var NotOnePass *Prog = nil -var debug = false - -// CompileOnePass returns a new *Prog suitable for onePass execution if the original Prog -// can be recharacterized as a one-pass regexp program, or syntax.NotOnePass if the -// Prog cannot be converted. For a one pass prog, the fundamental condition that must -// be true is: at any InstAlt, there must be no ambiguity about what branch to take. -func (prog *Prog) CompileOnePass() (p *Prog) { - if prog.Start == 0 { - return NotOnePass - } - // onepass regexp is anchored - if prog.Inst[prog.Start].Op != InstEmptyWidth || - EmptyOp(prog.Inst[prog.Start].Arg)&EmptyBeginText != EmptyBeginText { - return NotOnePass - } - // every instruction leading to InstMatch must be EmptyEndText - for _, inst := range prog.Inst { - opOut := prog.Inst[inst.Out].Op - switch inst.Op { - default: - if opOut == InstMatch { - return NotOnePass - } - case InstAlt, InstAltMatch: - if opOut == InstMatch || prog.Inst[inst.Arg].Op == InstMatch { - return NotOnePass - } - case InstEmptyWidth: - if opOut == InstMatch { - if EmptyOp(inst.Arg)&EmptyEndText == EmptyEndText { - continue - } - return NotOnePass - } - } - } - // Creates a slightly optimized copy of the original Prog - // that cleans up some Prog idioms that block valid onepass programs - p = prog.onePassCopy() - - // checkAmbiguity on InstAlts, build onepass Prog if possible - p = p.makeOnePass() - - if p != NotOnePass { - p.cleanupOnePass(prog) - } - return p -} diff --git a/src/pkg/regexp/syntax/prog_test.go b/src/pkg/regexp/syntax/prog_test.go index 66beb7435b..50bfa3d4be 100644 --- a/src/pkg/regexp/syntax/prog_test.go +++ b/src/pkg/regexp/syntax/prog_test.go @@ -4,10 +4,7 @@ package syntax -import ( - "reflect" - "testing" -) +import "testing" var compileTests = []struct { Regexp string @@ -115,200 +112,3 @@ func BenchmarkEmptyOpContext(b *testing.B) { EmptyOpContext(r1, -1) } } - -var runeMergeTests = []struct { - left, right, merged []rune - next []uint32 - leftPC, rightPC uint32 -}{ - { - // empty rhs - []rune{69, 69}, - []rune{}, - []rune{69, 69}, - []uint32{1}, - 1, 2, - }, - { - // identical runes, identical targets - []rune{69, 69}, - []rune{69, 69}, - []rune{}, - []uint32{mergeFailed}, - 1, 1, - }, - { - // identical runes, different targets - []rune{69, 69}, - []rune{69, 69}, - []rune{}, - []uint32{mergeFailed}, - 1, 2, - }, - { - // append right-first - []rune{69, 69}, - []rune{71, 71}, - []rune{69, 69, 71, 71}, - []uint32{1, 2}, - 1, 2, - }, - { - // append, left-first - []rune{71, 71}, - []rune{69, 69}, - []rune{69, 69, 71, 71}, - []uint32{2, 1}, - 1, 2, - }, - { - // successful interleave - []rune{60, 60, 71, 71, 101, 101}, - []rune{69, 69, 88, 88}, - []rune{60, 60, 69, 69, 71, 71, 88, 88, 101, 101}, - []uint32{1, 2, 1, 2, 1}, - 1, 2, - }, - { - // left surrounds right - []rune{69, 74}, - []rune{71, 71}, - []rune{}, - []uint32{mergeFailed}, - 1, 2, - }, - { - // right surrounds left - []rune{69, 74}, - []rune{68, 75}, - []rune{}, - []uint32{mergeFailed}, - 1, 2, - }, - { - // overlap at interval begin - []rune{69, 74}, - []rune{74, 75}, - []rune{}, - []uint32{mergeFailed}, - 1, 2, - }, - { - // overlap ar interval end - []rune{69, 74}, - []rune{65, 69}, - []rune{}, - []uint32{mergeFailed}, - 1, 2, - }, - { - // overlap from above - []rune{69, 74}, - []rune{71, 74}, - []rune{}, - []uint32{mergeFailed}, - 1, 2, - }, - { - // overlap from below - []rune{69, 74}, - []rune{65, 71}, - []rune{}, - []uint32{mergeFailed}, - 1, 2, - }, - { - // out of order []rune - []rune{69, 74, 60, 65}, - []rune{66, 67}, - []rune{}, - []uint32{mergeFailed}, - 1, 2, - }, -} - -func TestMergeRuneSet(t *testing.T) { - for ix, test := range runeMergeTests { - merged, next := mergeRuneSets(&test.left, &test.right, test.leftPC, test.rightPC) - if !reflect.DeepEqual(merged, test.merged) { - t.Errorf("mergeRuneSet :%d (%v, %v) merged\n have\n%v\nwant\n%v", ix, test.left, test.right, merged, test.merged) - } - if !reflect.DeepEqual(next, test.next) { - t.Errorf("mergeRuneSet :%d(%v, %v) next\n have\n%v\nwant\n%v", ix, test.left, test.right, next, test.next) - } - } -} - -const noStr = `!` - -var onePass = &Prog{} - -var onePassTests = []struct { - re string - onePass *Prog - prog string -}{ - {`^(?:a|(?:a*))$`, NotOnePass, noStr}, - {`^(?:(a)|(?:a*))$`, NotOnePass, noStr}, - {`^(?:(?:(?:.(?:$))?))$`, onePass, `a`}, - {`^abcd$`, onePass, `abcd`}, - {`^abcd$`, onePass, `abcde`}, - {`^(?:(?:a{0,})*?)$`, onePass, `a`}, - {`^(?:(?:a+)*)$`, onePass, ``}, - {`^(?:(?:a|(?:aa)))$`, onePass, ``}, - {`^(?:[^\s\S])$`, onePass, ``}, - {`^(?:(?:a{3,4}){0,})$`, NotOnePass, `aaaaaa`}, - {`^(?:(?:a+)*)$`, onePass, `a`}, - {`^(?:(?:(?:a*)+))$`, onePass, noStr}, - {`^(?:(?:a+)*)$`, onePass, ``}, - {`^[a-c]+$`, onePass, `abc`}, - {`^[a-c]*$`, onePass, `abcdabc`}, - {`^(?:a*)$`, onePass, `aaaaaaa`}, - {`^(?:(?:aa)|a)$`, onePass, `a`}, - {`^[a-c]*`, NotOnePass, `abcdabc`}, - {`^[a-c]*$`, onePass, `abc`}, - {`^...$`, onePass, ``}, - {`^(?:a|(?:aa))$`, onePass, `a`}, - {`^[a-c]*`, NotOnePass, `abcabc`}, - {`^a((b))c$`, onePass, noStr}, - {`^a.[l-nA-Cg-j]?e$`, onePass, noStr}, - {`^a((b))$`, onePass, noStr}, - {`^a(?:(b)|(c))c$`, onePass, noStr}, - {`^a(?:(b*)|(c))c$`, NotOnePass, noStr}, - {`^a(?:b|c)$`, onePass, noStr}, - {`^a(?:b?|c)$`, onePass, noStr}, - {`^a(?:b?|c?)$`, NotOnePass, noStr}, - {`^a(?:b?|c+)$`, onePass, noStr}, - {`^a(?:b+|(bc))d$`, NotOnePass, noStr}, - {`^a(?:bc)+$`, onePass, noStr}, - {`^a(?:[bcd])+$`, onePass, noStr}, - {`^a((?:[bcd])+)$`, onePass, noStr}, - {`^a(:?b|c)*d$`, onePass, `abbbccbbcbbd"`}, - {`^.bc(d|e)*$`, onePass, `abcddddddeeeededd`}, - {`^(?:(?:aa)|.)$`, NotOnePass, `a`}, - {`^(?:(?:a{1,2}){1,2})$`, NotOnePass, `aaaa`}, -} - -func TestCompileOnePass(t *testing.T) { - var ( - p *Prog - re *Regexp - err error - ) - for _, test := range onePassTests { - if re, err = Parse(test.re, Perl); err != nil { - t.Errorf("Parse(%q) got err:%s, want success", test.re, err) - continue - } - // needs to be done before compile... - re = re.Simplify() - if p, err = Compile(re); err != nil { - t.Errorf("Compile(%q) got err:%s, want success", test.re, err) - continue - } - onePass = p.CompileOnePass() - if (onePass == NotOnePass) != (test.onePass == NotOnePass) { - t.Errorf("CompileOnePass(%q) got %v, expected %v", test.re, onePass, test.onePass) - } - } -}