strings: implement a faster generic Replacer

author Eric Eisner <eric.d.eisner@gmail.com>

Mon, 17 Sep 2012 01:50:15 +0000 (11:50 +1000)

committer Nigel Tao <nigeltao@golang.org>

Mon, 17 Sep 2012 01:50:15 +0000 (11:50 +1000)
author Eric Eisner <eric.d.eisner@gmail.com>
Mon, 17 Sep 2012 01:50:15 +0000 (11:50 +1000)
committer Nigel Tao <nigeltao@golang.org>
Mon, 17 Sep 2012 01:50:15 +0000 (11:50 +1000)
diff --git a/src/pkg/strings/export_test.go b/src/pkg/strings/export_test.go

index dcfec513ccf2a9bc77a578f3e69c24e89131ae24..3a8c85140656b03c3b51e39b00f95cbd0dd3aa8a 100644 (file)
--- a/src/pkg/strings/export_test.go
+++ b/src/pkg/strings/export_test.go
@@ -7,3 +7,30 @@ package strings
  func (r *Replacer) Replacer() interface{} {
         return r.r
  }
+
+func (r *Replacer) PrintTrie() string {
+       gen := r.r.(*genericReplacer)
+       return gen.printNode(&gen.root, 0)
+}
+
+func (r *genericReplacer) printNode(t *trieNode, depth int) (s string) {
+       if t.priority > 0 {
+               s += "+"
+       } else {
+               s += "-"
+       }
+       s += "\n"
+
+       if t.prefix != "" {
+               s += Repeat(".", depth) + t.prefix
+               s += r.printNode(t.next, depth+len(t.prefix))
+       } else if t.table != nil {
+               for b, m := range r.mapping {
+                       if int(m) != r.tableSize && t.table[m] != nil {
+                               s += Repeat(".", depth) + string([]byte{byte(b)})
+                               s += r.printNode(t.table[m], depth+1)
+                       }
+               }
+       }
+       return
+}
diff --git a/src/pkg/strings/replace.go b/src/pkg/strings/replace.go

index 3a1322badeb066b12245ad5f2d76c2d8b295b610..d863eeb5c63217a6ef00fbe59dd14b0a8c447274 100644 (file)
--- a/src/pkg/strings/replace.go
+++ b/src/pkg/strings/replace.go
@@ -80,89 +80,264 @@ func (r *Replacer) WriteString(w io.Writer, s string) (n int, err error) {
         return r.r.WriteString(w, s)
  }
  
-// genericReplacer is the fully generic (and least optimized) algorithm.
+// trieNode is a node in a lookup trie for prioritized key/value pairs. Keys
+// and values may be empty. For example, the trie containing keys "ax", "ay",
+// "bcbc", "x" and "xy" could have eight nodes:
+//
+//  n0  -
+//  n1  a-
+//  n2  .x+
+//  n3  .y+
+//  n4  b-
+//  n5  .cbc+
+//  n6  x+
+//  n7  .y+
+//
+// n0 is the root node, and its children are n1, n4 and n6; n1's children are
+// n2 and n3; n4's child is n5; n6's child is n7. Nodes n0, n1 and n4 (marked
+// with a trailing "-") are partial keys, and nodes n2, n3, n5, n6 and n7
+// (marked with a trailing "+") are complete keys.
+type trieNode struct {
+       // value is the value of the trie node's key/value pair. It is empty if
+       // this node is not a complete key.
+       value string
+       // priority is the priority (higher is more important) of the trie node's
+       // key/value pair; keys are not necessarily matched shortest- or longest-
+       // first. Priority is positive if this node is a complete key, and zero
+       // otherwise. In the example above, positive/zero priorities are marked
+       // with a trailing "+" or "-".
+       priority int
+
+       // A trie node may have zero, one or more child nodes:
+       //  * if the remaining fields are zero, there are no children.
+       //  * if prefix and next are non-zero, there is one child in next.
+       //  * if table is non-zero, it defines all the children.
+       //
+       // Prefixes are preferred over tables when there is one child, but the
+       // root node always uses a table for lookup efficiency.
+
+       // prefix is the difference in keys between this trie node and the next.
+       // In the example above, node n4 has prefix "cbc" and n4's next node is n5.
+       // Node n5 has no children and so has zero prefix, next and table fields.
+       prefix string
+       next   *trieNode
+
+       // table is a lookup table indexed by the next byte in the key, after
+       // remapping that byte through genericReplacer.mapping to create a dense
+       // index. In the example above, the keys only use 'a', 'b', 'c', 'x' and
+       // 'y', which remap to 0, 1, 2, 3 and 4. All other bytes remap to 5, and
+       // genericReplacer.tableSize will be 5. Node n0's table will be
+       // []*trieNode{ 0:n1, 1:n4, 3:n6 }, where the 0, 1 and 3 are the remapped
+       // 'a', 'b' and 'x'.
+       table []*trieNode
+}
+
+func (t *trieNode) add(key, val string, priority int, r *genericReplacer) {
+       if key == "" {
+               if t.priority == 0 {
+                       t.value = val
+                       t.priority = priority
+               }
+               return
+       }
+
+       if t.prefix != "" {
+               // Need to split the prefix among multiple nodes.
+               var n int // length of the longest common prefix
+               for ; n < len(t.prefix) && n < len(key); n++ {
+                       if t.prefix[n] != key[n] {
+                               break
+                       }
+               }
+               if n == len(t.prefix) {
+                       t.next.add(key[n:], val, priority, r)
+               } else if n == 0 {
+                       // First byte differs, start a new lookup table here. Looking up
+                       // what is currently t.prefix[0] will lead to prefixNode, and
+                       // looking up key[0] will lead to keyNode.
+                       var prefixNode *trieNode
+                       if len(t.prefix) == 1 {
+                               prefixNode = t.next
+                       } else {
+                               prefixNode = &trieNode{
+                                       prefix: t.prefix[1:],
+                                       next:   t.next,
+                               }
+                       }
+                       keyNode := new(trieNode)
+                       t.table = make([]*trieNode, r.tableSize)
+                       t.table[r.mapping[t.prefix[0]]] = prefixNode
+                       t.table[r.mapping[key[0]]] = keyNode
+                       t.prefix = ""
+                       t.next = nil
+                       keyNode.add(key[1:], val, priority, r)
+               } else {
+                       // Insert new node after the common section of the prefix.
+                       next := &trieNode{
+                               prefix: t.prefix[n:],
+                               next:   t.next,
+                       }
+                       t.prefix = t.prefix[:n]
+                       t.next = next
+                       next.add(key[n:], val, priority, r)
+               }
+       } else if t.table != nil {
+               // Insert into existing table.
+               m := r.mapping[key[0]]
+               if t.table[m] == nil {
+                       t.table[m] = new(trieNode)
+               }
+               t.table[m].add(key[1:], val, priority, r)
+       } else {
+               t.prefix = key
+               t.next = new(trieNode)
+               t.next.add("", val, priority, r)
+       }
+}
+
+func (r *genericReplacer) lookup(s string, ignoreRoot bool) (val string, keylen int, found bool) {
+       // Iterate down the trie to the end, and grab the value and keylen with
+       // the highest priority.
+       bestPriority := 0
+       node := &r.root
+       n := 0
+       for node != nil {
+               if node.priority > bestPriority && !(ignoreRoot && node == &r.root) {
+                       bestPriority = node.priority
+                       val = node.value
+                       keylen = n
+                       found = true
+               }
+
+               if s == "" {
+                       break
+               }
+               if node.table != nil {
+                       index := r.mapping[s[0]]
+                       if int(index) == r.tableSize {
+                               break
+                       }
+                       node = node.table[index]
+                       s = s[1:]
+                       n++
+               } else if node.prefix != "" && HasPrefix(s, node.prefix) {
+                       n += len(node.prefix)
+                       s = s[len(node.prefix):]
+                       node = node.next
+               } else {
+                       break
+               }
+       }
+       return
+}
+
+// genericReplacer is the fully generic algorithm.
  // It's used as a fallback when nothing faster can be used.
  type genericReplacer struct {
-       p []pair
+       root trieNode
+       // tableSize is the size of a trie node's lookup table. It is the number
+       // of unique key bytes.
+       tableSize int
+       // mapping maps from key bytes to a dense index for trieNode.table.
+       mapping [256]byte
  }
  
-type pair struct{ old, new string }
-
  func makeGenericReplacer(oldnew []string) *genericReplacer {
-       gen := &genericReplacer{
-               p: make([]pair, len(oldnew)/2),
+       r := new(genericReplacer)
+       // Find each byte used, then assign them each an index.
+       for i := 0; i < len(oldnew); i += 2 {
+               key := oldnew[i]
+               for j := 0; j < len(key); j++ {
+                       r.mapping[key[j]] = 1
+               }
+       }
+
+       for _, b := range r.mapping {
+               r.tableSize += int(b)
         }
+
+       var index byte
+       for i, b := range r.mapping {
+               if b == 0 {
+                       r.mapping[i] = byte(r.tableSize)
+               } else {
+                       r.mapping[i] = index
+                       index++
+               }
+       }
+       // Ensure root node uses a lookup table (for performance).
+       r.root.table = make([]*trieNode, r.tableSize)
+
         for i := 0; i < len(oldnew); i += 2 {
-               gen.p[i/2] = pair{oldnew[i], oldnew[i+1]}
+               r.root.add(oldnew[i], oldnew[i+1], len(oldnew)-i, r)
         }
-       return gen
+       return r
  }
  
-type appendSliceWriter struct {
-       b []byte
-}
+type appendSliceWriter []byte
  
+// Write writes to the buffer to satisfy io.Writer.
  func (w *appendSliceWriter) Write(p []byte) (int, error) {
-       w.b = append(w.b, p...)
+       *w = append(*w, p...)
         return len(p), nil
  }
  
+// WriteString writes to the buffer without string->[]byte->string allocations.
+func (w *appendSliceWriter) WriteString(s string) (int, error) {
+       *w = append(*w, s...)
+       return len(s), nil
+}
+
+type stringWriter struct {
+       w io.Writer
+}
+
+func (w stringWriter) WriteString(s string) (int, error) {
+       return w.w.Write([]byte(s))
+}
+
  func (r *genericReplacer) Replace(s string) string {
-       // TODO(bradfitz): optimized version
-       n, _ := r.WriteString(discard, s)
-       w := appendSliceWriter{make([]byte, 0, n)}
-       r.WriteString(&w, s)
-       return string(w.b)
+       buf := make(appendSliceWriter, 0, len(s))
+       r.WriteString(&buf, s)
+       return string(buf)
  }
  
  func (r *genericReplacer) WriteString(w io.Writer, s string) (n int, err error) {
-       lastEmpty := false // the last replacement was of the empty string
-Input:
-       // TODO(bradfitz): optimized version
-       for i := 0; i < len(s); {
-               for _, p := range r.p {
-                       if p.old == "" && lastEmpty {
-                               // Don't let old match twice in a row.
-                               // (it doesn't advance the input and
-                               // would otherwise loop forever)
-                               continue
+       sw, ok := w.(interface {
+               WriteString(string) (int, error)
+       })
+       if !ok {
+               sw = stringWriter{w}
+       }
+
+       var last, wn int
+       var prevMatchEmpty bool
+       for i := 0; i <= len(s); {
+               // Ignore the empty match iff the previous loop found the empty match.
+               val, keylen, match := r.lookup(s[i:], prevMatchEmpty)
+               prevMatchEmpty = match && keylen == 0
+               if match {
+                       wn, err = sw.WriteString(s[last:i])
+                       n += wn
+                       if err != nil {
+                               return
                         }
-                       if HasPrefix(s[i:], p.old) {
-                               if p.new != "" {
-                                       wn, err := w.Write([]byte(p.new))
-                                       n += wn
-                                       if err != nil {
-                                               return n, err
-                                       }
-                               }
-                               i += len(p.old)
-                               lastEmpty = p.old == ""
-                               continue Input
+                       wn, err = sw.WriteString(val)
+                       n += wn
+                       if err != nil {
+                               return
                         }
-               }
-               wn, err := w.Write([]byte{s[i]})
-               n += wn
-               if err != nil {
-                       return n, err
+                       i += keylen
+                       last = i
+                       continue
                 }
                 i++
         }
-
-       // Final empty match at end.
-       for _, p := range r.p {
-               if p.old == "" {
-                       if p.new != "" {
-                               wn, err := w.Write([]byte(p.new))
-                               n += wn
-                               if err != nil {
-                                       return n, err
-                               }
-                       }
-                       break
-               }
+       if last != len(s) {
+               wn, err = sw.WriteString(s[last:])
+               n += wn
         }
-
-       return n, nil
+       return
  }
  
  // byteReplacer is the implementation that's used when all the "old"
@@ -305,12 +480,3 @@ func (r *byteStringReplacer) WriteString(w io.Writer, s string) (n int, err erro
         }
         return n, nil
  }
-
-// strings is too low-level to import io/ioutil
-var discard io.Writer = devNull(0)
-
-type devNull int
-
-func (devNull) Write(p []byte) (int, error) {
-       return len(p), nil
-}
diff --git a/src/pkg/strings/replace_test.go b/src/pkg/strings/replace_test.go

index 7a960986bbe06a5ab8d0f36ab1c6ebaf9b96afa8..edc990c18b8f21c8e28626f8575a80c314fda884 100644 (file)
--- a/src/pkg/strings/replace_test.go
+++ b/src/pkg/strings/replace_test.go
@@ -219,21 +219,29 @@ func TestReplacer(t *testing.T) {
  
         blankToX1 := NewReplacer("", "X")
         blankToX2 := NewReplacer("", "X", "", "")
-       blankToXOToO := NewReplacer("", "X", "o", "O")
+       blankHighPriority := NewReplacer("", "X", "o", "O")
+       blankLowPriority := NewReplacer("o", "O", "", "X")
         blankNoOp1 := NewReplacer("", "")
         blankNoOp2 := NewReplacer("", "", "", "A")
         blankFoo := NewReplacer("", "X", "foobar", "R", "foobaz", "Z")
         testCases = append(testCases,
-               testCase{blankToX1, "foo", "XfooX"}, // TODO: should this be "XfXoXoX"?
+               testCase{blankToX1, "foo", "XfXoXoX"},
                 testCase{blankToX1, "", "X"},
  
-               testCase{blankToX2, "foo", "XfooX"}, // TODO: should this be "XfXoXoX"?
+               testCase{blankToX2, "foo", "XfXoXoX"},
                 testCase{blankToX2, "", "X"},
  
-               testCase{blankToXOToO, "oo", "XOXOX"},
-               testCase{blankToXOToO, "ii", "XiiX"},       // TODO: should this be "XiXiX"?
-               testCase{blankToXOToO, "iooi", "XiOXOXiX"}, // TODO: should this be "XiXOXOXiX"?
-               testCase{blankToXOToO, "", "X"},
+               testCase{blankHighPriority, "oo", "XOXOX"},
+               testCase{blankHighPriority, "ii", "XiXiX"},
+               testCase{blankHighPriority, "oiio", "XOXiXiXOX"},
+               testCase{blankHighPriority, "iooi", "XiXOXOXiX"},
+               testCase{blankHighPriority, "", "X"},
+
+               testCase{blankLowPriority, "oo", "OOX"},
+               testCase{blankLowPriority, "ii", "XiXiX"},
+               testCase{blankLowPriority, "oiio", "OXiXiOX"},
+               testCase{blankLowPriority, "iooi", "XiOOXiX"},
+               testCase{blankLowPriority, "", "X"},
  
                 testCase{blankNoOp1, "foo", "foo"},
                 testCase{blankNoOp1, "", ""},
@@ -242,7 +250,7 @@ func TestReplacer(t *testing.T) {
                 testCase{blankNoOp2, "", ""},
  
                 testCase{blankFoo, "foobarfoobaz", "XRXZX"},
-               testCase{blankFoo, "foobar-foobaz", "XRX-ZX"}, // TODO: should this be "XRX-XZX"?
+               testCase{blankFoo, "foobar-foobaz", "XRX-XZX"},
                 testCase{blankFoo, "", "X"},
         )
  
@@ -298,6 +306,78 @@ func TestPickAlgorithm(t *testing.T) {
         }
  }
  
+// TestGenericTrieBuilding verifies the structure of the generated trie. There
+// is one node per line, and the key ending with the current line is in the
+// trie if it ends with a "+".
+func TestGenericTrieBuilding(t *testing.T) {
+       testCases := []struct{ in, out string }{
+               {"abc;abdef;abdefgh;xx;xy;z", `-
+                       a-
+                       .b-
+                       ..c+
+                       ..d-
+                       ...ef+
+                       .....gh+
+                       x-
+                       .x+
+                       .y+
+                       z+
+                       `},
+               {"abracadabra;abracadabrakazam;abraham;abrasion", `-
+                       a-
+                       .bra-
+                       ....c-
+                       .....adabra+
+                       ...........kazam+
+                       ....h-
+                       .....am+
+                       ....s-
+                       .....ion+
+                       `},
+               {"aaa;aa;a;i;longerst;longer;long;xx;x;X;Y", `-
+                       X+
+                       Y+
+                       a+
+                       .a+
+                       ..a+
+                       i+
+                       l-
+                       .ong+
+                       ....er+
+                       ......st+
+                       x+
+                       .x+
+                       `},
+               {"foo;;foo;foo1", `+
+                       f-
+                       .oo+
+                       ...1+
+                       `},
+       }
+
+       for _, tc := range testCases {
+               keys := Split(tc.in, ";")
+               args := make([]string, len(keys)*2)
+               for i, key := range keys {
+                       args[i*2] = key
+               }
+
+               got := NewReplacer(args...).PrintTrie()
+               // Remove tabs from tc.out
+               wantbuf := make([]byte, 0, len(tc.out))
+               for i := 0; i < len(tc.out); i++ {
+                       if tc.out[i] != '\t' {
+                               wantbuf = append(wantbuf, tc.out[i])
+                       }
+               }
+               want := string(wantbuf)
+
+               if got != want {
+                       t.Errorf("PrintTrie(%q)\ngot\n%swant\n%s", tc.in, got, want)
+               }
+       }
+}
+
  func BenchmarkGenericNoMatch(b *testing.B) {
         str := Repeat("A", 100) + Repeat("B", 100)
         generic := NewReplacer("a", "A", "b", "B", "12", "123") // varying lengths forces generic
author	Eric Eisner <eric.d.eisner@gmail.com>
	Mon, 17 Sep 2012 01:50:15 +0000 (11:50 +1000)
committer	Nigel Tao <nigeltao@golang.org>
	Mon, 17 Sep 2012 01:50:15 +0000 (11:50 +1000)
src/pkg/strings/export_test.go		patch \| blob \| history
src/pkg/strings/replace.go		patch \| blob \| history
src/pkg/strings/replace_test.go		patch \| blob \| history