strings: speed-up replace for byteStringReplacer case

author Ilya Tocar <ilya.tocar@intel.com>

Mon, 26 Feb 2018 20:00:57 +0000 (14:00 -0600)

committer Brad Fitzpatrick <bradfitz@golang.org>

Mon, 2 Apr 2018 22:34:38 +0000 (22:34 +0000)
author Ilya Tocar <ilya.tocar@intel.com>
Mon, 26 Feb 2018 20:00:57 +0000 (14:00 -0600)
committer Brad Fitzpatrick <bradfitz@golang.org>
Mon, 2 Apr 2018 22:34:38 +0000 (22:34 +0000)
diff --git a/src/strings/replace.go b/src/strings/replace.go

index 4752641be0c9008d311581fcd028074ec258350d..bb0524427b3c51477ad4e0664ec139ec09fffb7b 100644 (file)
--- a/src/strings/replace.go
+++ b/src/strings/replace.go
@@ -54,13 +54,21 @@ func NewReplacer(oldnew ...string) *Replacer {
                 return &Replacer{r: &r}
         }
  
-       r := byteStringReplacer{}
+       r := byteStringReplacer{toReplace: make([]string, 0, len(oldnew)/2)}
         // The first occurrence of old->new map takes precedence
         // over the others with the same old string.
         for i := len(oldnew) - 2; i >= 0; i -= 2 {
                 o := oldnew[i][0]
                 n := oldnew[i+1]
-               r[o] = []byte(n)
+               // To avoid counting repetitions multiple times.
+               if r.replacements[o] == nil {
+                       // We need to use string([]byte{o}) instead of string(o),
+                       // to avoid utf8 encoding of o.
+                       // E. g. byte(150) produces string of length 2.
+                       r.toReplace = append(r.toReplace, string([]byte{o}))
+               }
+               r.replacements[o] = []byte(n)
+
         }
         return &Replacer{r: &r}
  }
@@ -454,34 +462,60 @@ func (r *byteReplacer) WriteString(w io.Writer, s string) (n int, err error) {
  
  // byteStringReplacer is the implementation that's used when all the
  // "old" values are single ASCII bytes but the "new" values vary in size.
-// The array contains replacement byte slices indexed by old byte.
-// A nil []byte means that the old byte should not be replaced.
-type byteStringReplacer [256][]byte
+type byteStringReplacer struct {
+       // replacements contains replacement byte slices indexed by old byte.
+       // A nil []byte means that the old byte should not be replaced.
+       replacements [256][]byte
+       // toReplace keeps a list of bytes to replace. Depending on length of toReplace
+       // and length of target string it may be faster to use Count, or a plain loop.
+       // We store single byte as a string, because Count takes a string.
+       toReplace []string
+}
+
+// countCutOff controls the ratio of a string length to a number of replacements
+// at which (*byteStringReplacer).Replace switches algorithms.
+// For strings with higher ration of length to replacements than that value,
+// we call Count, for each replacement from toReplace.
+// For strings, with a lower ratio we use simple loop, because of Count overhead.
+// countCutOff is an empirically determined overhead multiplier.
+// TODO(tocarip) revisit once we have register-based abi/mid-stack inlining.
+const countCutOff = 8
  
  func (r *byteStringReplacer) Replace(s string) string {
         newSize := len(s)
         anyChanges := false
-       for i := 0; i < len(s); i++ {
-               b := s[i]
-               if r[b] != nil {
-                       anyChanges = true
-                       // The -1 is because we are replacing 1 byte with len(r[b]) bytes.
-                       newSize += len(r[b]) - 1
+       // Is it faster to use Count?
+       if len(r.toReplace)*countCutOff <= len(s) {
+               for _, x := range r.toReplace {
+                       if c := Count(s, x); c != 0 {
+                               // The -1 is because we are replacing 1 byte with len(replacements[b]) bytes.
+                               newSize += c * (len(r.replacements[x[0]]) - 1)
+                               anyChanges = true
+                       }
+
+               }
+       } else {
+               for i := 0; i < len(s); i++ {
+                       b := s[i]
+                       if r.replacements[b] != nil {
+                               // See above for explanation of -1
+                               newSize += len(r.replacements[b]) - 1
+                               anyChanges = true
+                       }
                 }
         }
         if !anyChanges {
                 return s
         }
         buf := make([]byte, newSize)
-       bi := buf
+       j := 0
         for i := 0; i < len(s); i++ {
                 b := s[i]
-               if r[b] != nil {
-                       n := copy(bi, r[b])
-                       bi = bi[n:]
+               if r.replacements[b] != nil {
+                       j += copy(buf[j:], r.replacements[b])
                 } else {
-                       bi[0] = b
-                       bi = bi[1:]
+                       buf[j] = b
+                       j++
                 }
         }
         return string(buf)
@@ -492,7 +526,7 @@ func (r *byteStringReplacer) WriteString(w io.Writer, s string) (n int, err erro
         last := 0
         for i := 0; i < len(s); i++ {
                 b := s[i]
-               if r[b] == nil {
+               if r.replacements[b] == nil {
                         continue
                 }
                 if last != i {
@@ -503,7 +537,7 @@ func (r *byteStringReplacer) WriteString(w io.Writer, s string) (n int, err erro
                         }
                 }
                 last = i + 1
-               nw, err := w.Write(r[b])
+               nw, err := w.Write(r.replacements[b])
                 n += nw
                 if err != nil {
                         return n, err
author	Ilya Tocar <ilya.tocar@intel.com>
	Mon, 26 Feb 2018 20:00:57 +0000 (14:00 -0600)
committer	Brad Fitzpatrick <bradfitz@golang.org>
	Mon, 2 Apr 2018 22:34:38 +0000 (22:34 +0000)