return r.r.WriteString(w, s)
}
-// genericReplacer is the fully generic (and least optimized) algorithm.
+// trieNode is a node in a lookup trie for prioritized key/value pairs. Keys
+// and values may be empty. For example, the trie containing keys "ax", "ay",
+// "bcbc", "x" and "xy" could have eight nodes:
+//
+// n0 -
+// n1 a-
+// n2 .x+
+// n3 .y+
+// n4 b-
+// n5 .cbc+
+// n6 x+
+// n7 .y+
+//
+// n0 is the root node, and its children are n1, n4 and n6; n1's children are
+// n2 and n3; n4's child is n5; n6's child is n7. Nodes n0, n1 and n4 (marked
+// with a trailing "-") are partial keys, and nodes n2, n3, n5, n6 and n7
+// (marked with a trailing "+") are complete keys.
+type trieNode struct {
+ // value is the value of the trie node's key/value pair. It is empty if
+ // this node is not a complete key.
+ // A complete key may also carry an empty value, so add and lookup test
+ // priority, not value, to decide whether a node holds a key.
+ value string
+ // priority is the priority (higher is more important) of the trie node's
+ // key/value pair; keys are not necessarily matched shortest- or longest-
+ // first. Priority is positive if this node is a complete key, and zero
+ // otherwise. In the example above, positive/zero priorities are marked
+ // with a trailing "+" or "-".
+ priority int
+
+ // A trie node may have zero, one or more child nodes:
+ // * if the remaining fields are zero, there are no children.
+ // * if prefix and next are non-zero, there is one child in next.
+ // * if table is non-zero, it defines all the children.
+ //
+ // Prefixes are preferred over tables when there is one child, but the
+ // root node always uses a table for lookup efficiency.
+
+ // prefix is the difference in keys between this trie node and the next.
+ // In the example above, node n4 has prefix "cbc" and n4's next node is n5.
+ // Node n5 has no children and so has zero prefix, next and table fields.
+ prefix string
+ // next is the single child reached after matching prefix. add resets it
+ // to nil at the point where it clears prefix and switches to a table.
+ next *trieNode
+
+ // table is a lookup table indexed by the next byte in the key, after
+ // remapping that byte through genericReplacer.mapping to create a dense
+ // index. In the example above, the keys only use 'a', 'b', 'c', 'x' and
+ // 'y', which remap to 0, 1, 2, 3 and 4. All other bytes remap to 5, and
+ // genericReplacer.tableSize will be 5. Node n0's table will be
+ // []*trieNode{ 0:n1, 1:n4, 3:n6 }, where the 0, 1 and 3 are the remapped
+ // 'a', 'b' and 'x'.
+ table []*trieNode
+}
+
+// add inserts the key/value pair into the subtrie rooted at t, splitting an
+// existing prefix or filling a lookup table as needed. If the key is already
+// present (its node has non-zero priority), the existing value and priority
+// are kept and the new pair is dropped. r supplies the byte mapping and the
+// table size used whenever a lookup table is created or indexed.
+func (t *trieNode) add(key, val string, priority int, r *genericReplacer) {
+ if key == "" {
+ // The whole key has been consumed: t is the node for this key. Only
+ // the first pair added for a given key is recorded.
+ if t.priority == 0 {
+ t.value = val
+ t.priority = priority
+ }
+ return
+ }
+
+ if t.prefix != "" {
+ // Need to split the prefix among multiple nodes.
+ var n int // length of the longest common prefix
+ for ; n < len(t.prefix) && n < len(key); n++ {
+ if t.prefix[n] != key[n] {
+ break
+ }
+ }
+ if n == len(t.prefix) {
+ // The key starts with the whole prefix: continue in the child.
+ t.next.add(key[n:], val, priority, r)
+ } else if n == 0 {
+ // First byte differs, start a new lookup table here. Looking up
+ // what is currently t.prefix[0] will lead to prefixNode, and
+ // looking up key[0] will lead to keyNode.
+ var prefixNode *trieNode
+ if len(t.prefix) == 1 {
+ prefixNode = t.next
+ } else {
+ prefixNode = &trieNode{
+ prefix: t.prefix[1:],
+ next: t.next,
+ }
+ }
+ keyNode := new(trieNode)
+ t.table = make([]*trieNode, r.tableSize)
+ t.table[r.mapping[t.prefix[0]]] = prefixNode
+ t.table[r.mapping[key[0]]] = keyNode
+ t.prefix = ""
+ t.next = nil
+ keyNode.add(key[1:], val, priority, r)
+ } else {
+ // Insert new node after the common section of the prefix.
+ next := &trieNode{
+ prefix: t.prefix[n:],
+ next: t.next,
+ }
+ t.prefix = t.prefix[:n]
+ t.next = next
+ next.add(key[n:], val, priority, r)
+ }
+ } else if t.table != nil {
+ // Insert into existing table.
+ m := r.mapping[key[0]]
+ if t.table[m] == nil {
+ t.table[m] = new(trieNode)
+ }
+ t.table[m].add(key[1:], val, priority, r)
+ } else {
+ // t has no children yet: keep the rest of the key as a single prefix
+ // leading to a fresh node that holds the value.
+ t.prefix = key
+ t.next = new(trieNode)
+ t.next.add("", val, priority, r)
+ }
+}
+
+// lookup walks the trie along s and returns the value and key length of the
+// highest-priority complete key that is a prefix of s. found reports whether
+// any such key exists. If ignoreRoot is true, the key/value stored at the
+// root node (the empty key) is not considered.
+func (r *genericReplacer) lookup(s string, ignoreRoot bool) (val string, keylen int, found bool) {
+ // Iterate down the trie to the end, and grab the value and keylen with
+ // the highest priority.
+ bestPriority := 0
+ node := &r.root
+ n := 0
+ for node != nil {
+ // Incomplete keys have priority zero and so never beat bestPriority.
+ if node.priority > bestPriority && !(ignoreRoot && node == &r.root) {
+ bestPriority = node.priority
+ val = node.value
+ keylen = n
+ found = true
+ }
+
+ if s == "" {
+ break
+ }
+ if node.table != nil {
+ index := r.mapping[s[0]]
+ // makeGenericReplacer remaps bytes that occur in no key to
+ // tableSize, so no key can continue with this byte.
+ if int(index) == r.tableSize {
+ break
+ }
+ // A nil table entry ends the walk via the loop condition.
+ node = node.table[index]
+ s = s[1:]
+ n++
+ } else if node.prefix != "" && HasPrefix(s, node.prefix) {
+ n += len(node.prefix)
+ s = s[len(node.prefix):]
+ node = node.next
+ } else {
+ break
+ }
+ }
+ return
+}
+
+// genericReplacer is the fully generic algorithm.
// It's used as a fallback when nothing faster can be used.
type genericReplacer struct {
- p []pair
+ // root is the root node of the trie built by makeGenericReplacer from
+ // the old strings (keys) and their replacements (values).
+ root trieNode
+ // tableSize is the size of a trie node's lookup table. It is the number
+ // of unique key bytes.
+ tableSize int
+ // mapping maps from key bytes to a dense index for trieNode.table.
+ mapping [256]byte
}
-type pair struct{ old, new string }
-
+// makeGenericReplacer builds a genericReplacer from oldnew, which holds
+// alternating old and new strings. It first computes the dense byte mapping
+// and table size, then adds every old/new pair to the trie.
func makeGenericReplacer(oldnew []string) *genericReplacer {
- gen := &genericReplacer{
- p: make([]pair, len(oldnew)/2),
+ r := new(genericReplacer)
+ // Find each byte used, then assign them each an index.
+ for i := 0; i < len(oldnew); i += 2 {
+ key := oldnew[i]
+ for j := 0; j < len(key); j++ {
+ r.mapping[key[j]] = 1
+ }
+ }
+
+ // Each used byte was marked with 1, so the sum is the number of
+ // distinct key bytes.
+ for _, b := range r.mapping {
+ r.tableSize += int(b)
}
+
+ // Used bytes get consecutive indexes in byte order; unused bytes get
+ // tableSize, which lookup treats as "no child". If all 256 byte values
+ // are used, no entry is zero here, so byte(r.tableSize) is never
+ // evaluated with the out-of-range value 256.
+ var index byte
+ for i, b := range r.mapping {
+ if b == 0 {
+ r.mapping[i] = byte(r.tableSize)
+ } else {
+ r.mapping[i] = index
+ index++
+ }
+ }
+ // Ensure root node uses a lookup table (for performance).
+ r.root.table = make([]*trieNode, r.tableSize)
+
+ // len(oldnew)-i is positive and shrinks as i grows, so pairs listed
+ // earlier get a higher priority than later ones.
for i := 0; i < len(oldnew); i += 2 {
- gen.p[i/2] = pair{oldnew[i], oldnew[i+1]}
+ r.root.add(oldnew[i], oldnew[i+1], len(oldnew)-i, r)
}
- return gen
+ return r
}
-type appendSliceWriter struct {
- b []byte
-}
+// appendSliceWriter is a byte slice that accumulates everything written to
+// it. Its methods never return an error.
+type appendSliceWriter []byte
+// Write writes to the buffer to satisfy io.Writer.
func (w *appendSliceWriter) Write(p []byte) (int, error) {
- w.b = append(w.b, p...)
+ *w = append(*w, p...)
return len(p), nil
}
+// WriteString writes to the buffer without string->[]byte->string allocations.
+func (w *appendSliceWriter) WriteString(s string) (int, error) {
+ *w = append(*w, s...)
+ return len(s), nil
+}
+
+// stringWriter adapts an io.Writer that has no WriteString method of its
+// own, so that genericReplacer.WriteString can treat every writer uniformly.
+type stringWriter struct {
+ w io.Writer
+}
+
+// WriteString converts s to a byte slice and passes it to the wrapped
+// writer's Write, returning that call's result unchanged.
+func (w stringWriter) WriteString(s string) (int, error) {
+ return w.w.Write([]byte(s))
+}
+
+// Replace returns a copy of s with all replacements performed. The output is
+// collected in an appendSliceWriter that starts with capacity len(s) and
+// grows if the result is longer.
func (r *genericReplacer) Replace(s string) string {
- // TODO(bradfitz): optimized version
- n, _ := r.WriteString(discard, s)
- w := appendSliceWriter{make([]byte, 0, n)}
- r.WriteString(&w, s)
- return string(w.b)
+ buf := make(appendSliceWriter, 0, len(s))
+ // The result is ignored: appendSliceWriter's methods always return a
+ // nil error.
+ r.WriteString(&buf, s)
+ return string(buf)
}
+// WriteString applies the replacements to s and writes the result to w. It
+// returns the number of bytes written and the first write error, if any;
+// writing stops at the first error.
func (r *genericReplacer) WriteString(w io.Writer, s string) (n int, err error) {
- lastEmpty := false // the last replacement was of the empty string
-Input:
- // TODO(bradfitz): optimized version
- for i := 0; i < len(s); {
- for _, p := range r.p {
- if p.old == "" && lastEmpty {
- // Don't let old match twice in a row.
- // (it doesn't advance the input and
- // would otherwise loop forever)
- continue
+ // Use the writer's own WriteString when it has one; otherwise wrap it
+ // in stringWriter, which converts each string to a byte slice.
+ sw, ok := w.(interface {
+ WriteString(string) (int, error)
+ })
+ if !ok {
+ sw = stringWriter{w}
+ }
+
+ // last is the start of the unmatched text in s that has not been
+ // written yet; wn is the byte count of the most recent write.
+ var last, wn int
+ var prevMatchEmpty bool
+ // i runs up to and including len(s) so that an empty-key match can also
+ // fire at the very end of the input.
+ for i := 0; i <= len(s); {
+ // Ignore the empty match iff the previous loop found the empty match.
+ val, keylen, match := r.lookup(s[i:], prevMatchEmpty)
+ prevMatchEmpty = match && keylen == 0
+ if match {
+ wn, err = sw.WriteString(s[last:i])
+ n += wn
+ if err != nil {
+ return
}
- if HasPrefix(s[i:], p.old) {
- if p.new != "" {
- wn, err := w.Write([]byte(p.new))
- n += wn
- if err != nil {
- return n, err
- }
- }
- i += len(p.old)
- lastEmpty = p.old == ""
- continue Input
+ wn, err = sw.WriteString(val)
+ n += wn
+ if err != nil {
+ return
}
- }
- wn, err := w.Write([]byte{s[i]})
- n += wn
- if err != nil {
- return n, err
+ // An empty-key match leaves i unchanged. The next iteration then
+ // ignores the root's empty key, so it either finds a non-empty
+ // key or falls through to i++ below.
+ i += keylen
+ last = i
+ continue
}
i++
}
-
- // Final empty match at end.
- for _, p := range r.p {
- if p.old == "" {
- if p.new != "" {
- wn, err := w.Write([]byte(p.new))
- n += wn
- if err != nil {
- return n, err
- }
- }
- break
- }
+ // Flush the unmatched tail of s, if any.
+ if last != len(s) {
+ wn, err = sw.WriteString(s[last:])
+ n += wn
}
-
- return n, nil
+ return
}
// byteReplacer is the implementation that's used when all the "old"
}
return n, nil
}
-
-// strings is too low-level to import io/ioutil
-var discard io.Writer = devNull(0)
-
-type devNull int
-
-func (devNull) Write(p []byte) (int, error) {
- return len(p), nil
-}
blankToX1 := NewReplacer("", "X")
blankToX2 := NewReplacer("", "X", "", "")
- blankToXOToO := NewReplacer("", "X", "o", "O")
+ blankHighPriority := NewReplacer("", "X", "o", "O")
+ blankLowPriority := NewReplacer("o", "O", "", "X")
blankNoOp1 := NewReplacer("", "")
blankNoOp2 := NewReplacer("", "", "", "A")
blankFoo := NewReplacer("", "X", "foobar", "R", "foobaz", "Z")
testCases = append(testCases,
- testCase{blankToX1, "foo", "XfooX"}, // TODO: should this be "XfXoXoX"?
+ testCase{blankToX1, "foo", "XfXoXoX"},
testCase{blankToX1, "", "X"},
- testCase{blankToX2, "foo", "XfooX"}, // TODO: should this be "XfXoXoX"?
+ testCase{blankToX2, "foo", "XfXoXoX"},
testCase{blankToX2, "", "X"},
- testCase{blankToXOToO, "oo", "XOXOX"},
- testCase{blankToXOToO, "ii", "XiiX"}, // TODO: should this be "XiXiX"?
- testCase{blankToXOToO, "iooi", "XiOXOXiX"}, // TODO: should this be "XiXOXOXiX"?
- testCase{blankToXOToO, "", "X"},
+ testCase{blankHighPriority, "oo", "XOXOX"},
+ testCase{blankHighPriority, "ii", "XiXiX"},
+ testCase{blankHighPriority, "oiio", "XOXiXiXOX"},
+ testCase{blankHighPriority, "iooi", "XiXOXOXiX"},
+ testCase{blankHighPriority, "", "X"},
+
+ testCase{blankLowPriority, "oo", "OOX"},
+ testCase{blankLowPriority, "ii", "XiXiX"},
+ testCase{blankLowPriority, "oiio", "OXiXiOX"},
+ testCase{blankLowPriority, "iooi", "XiOOXiX"},
+ testCase{blankLowPriority, "", "X"},
testCase{blankNoOp1, "foo", "foo"},
testCase{blankNoOp1, "", ""},
testCase{blankNoOp2, "", ""},
testCase{blankFoo, "foobarfoobaz", "XRXZX"},
- testCase{blankFoo, "foobar-foobaz", "XRX-ZX"}, // TODO: should this be "XRX-XZX"?
+ testCase{blankFoo, "foobar-foobaz", "XRX-XZX"},
testCase{blankFoo, "", "X"},
)
}
}
+// TestGenericTrieBuilding verifies the structure of the generated trie. There
+// is one node per line, and the key ending with the current line is in the
+// trie if it ends with a "+".
+// Each test input is a ";"-separated list of keys; the expected output is
+// compared after removing tab characters from it.
+func TestGenericTrieBuilding(t *testing.T) {
+ testCases := []struct{ in, out string }{
+ {"abc;abdef;abdefgh;xx;xy;z", `-
+ a-
+ .b-
+ ..c+
+ ..d-
+ ...ef+
+ .....gh+
+ x-
+ .x+
+ .y+
+ z+
+ `},
+ {"abracadabra;abracadabrakazam;abraham;abrasion", `-
+ a-
+ .bra-
+ ....c-
+ .....adabra+
+ ...........kazam+
+ ....h-
+ .....am+
+ ....s-
+ .....ion+
+ `},
+ {"aaa;aa;a;i;longerst;longer;long;xx;x;X;Y", `-
+ X+
+ Y+
+ a+
+ .a+
+ ..a+
+ i+
+ l-
+ .ong+
+ ....er+
+ ......st+
+ x+
+ .x+
+ `},
+ {"foo;;foo;foo1", `+
+ f-
+ .oo+
+ ...1+
+ `},
+ }
+
+ for _, tc := range testCases {
+ keys := Split(tc.in, ";")
+ // Build the old/new argument list with every replacement left as the
+ // empty string; only the keys are set here.
+ args := make([]string, len(keys)*2)
+ for i, key := range keys {
+ args[i*2] = key
+ }
+
+ // NOTE(review): PrintTrie is defined outside this view; presumably it
+ // renders the trie in the one-node-per-line format above - confirm.
+ got := NewReplacer(args...).PrintTrie()
+ // Remove tabs from tc.out
+ wantbuf := make([]byte, 0, len(tc.out))
+ for i := 0; i < len(tc.out); i++ {
+ if tc.out[i] != '\t' {
+ wantbuf = append(wantbuf, tc.out[i])
+ }
+ }
+ want := string(wantbuf)
+
+ if got != want {
+ t.Errorf("PrintTrie(%q)\ngot\n%swant\n%s", tc.in, got, want)
+ }
+ }
+}
+
func BenchmarkGenericNoMatch(b *testing.B) {
str := Repeat("A", 100) + Repeat("B", 100)
generic := NewReplacer("a", "A", "b", "B", "12", "123") // varying lengths forces generic