html: speed up UnescapeString

author Ingo Oeser <nightlyone@googlemail.com>

Sat, 9 May 2015 15:55:05 +0000 (17:55 +0200)

committer Brad Fitzpatrick <bradfitz@golang.org>

Sat, 22 Aug 2015 12:45:38 +0000 (12:45 +0000)
author Ingo Oeser <nightlyone@googlemail.com>
Sat, 9 May 2015 15:55:05 +0000 (17:55 +0200)
committer Brad Fitzpatrick <bradfitz@golang.org>
Sat, 22 Aug 2015 12:45:38 +0000 (12:45 +0000)
diff --git a/src/html/escape.go b/src/html/escape.go

index f50a4b937a79b3add3565027877bf22cad348cb8..ab6fd1c7b4b3460d29d2a47a693463c2666b1e25 100644 (file)
--- a/src/html/escape.go
+++ b/src/html/escape.go
@@ -57,8 +57,9 @@ var replacementTable = [...]rune{
  // unescapeEntity reads an entity like "&lt;" from b[src:] and writes the
  // corresponding "<" to b[dst:], returning the incremented dst and src cursors.
  // Precondition: b[src] == '&' && dst <= src.
-// attribute should be true if parsing an attribute value.
-func unescapeEntity(b []byte, dst, src int, attribute bool) (dst1, src1 int) {
+func unescapeEntity(b []byte, dst, src int) (dst1, src1 int) {
+       const attribute = false
+
         // http://www.whatwg.org/specs/web-apps/current-work/multipage/tokenization.html#consume-a-character-reference
  
         // i starts at 1 because we already know that s[0] == '&'.
@@ -139,14 +140,14 @@ func unescapeEntity(b []byte, dst, src int, attribute bool) (dst1, src1 int) {
                 break
         }
  
-       entityName := string(s[1:i])
-       if entityName == "" {
+       entityName := s[1:i]
+       if len(entityName) == 0 {
                 // No-op.
         } else if attribute && entityName[len(entityName)-1] != ';' && len(s) > i && s[i] == '=' {
                 // No-op.
-       } else if x := entity[entityName]; x != 0 {
+       } else if x := entity[string(entityName)]; x != 0 {
                 return dst + utf8.EncodeRune(b[dst:], x), src + i
-       } else if x := entity2[entityName]; x[0] != 0 {
+       } else if x := entity2[string(entityName)]; x[0] != 0 {
                 dst1 := dst + utf8.EncodeRune(b[dst:], x[0])
                 return dst1 + utf8.EncodeRune(b[dst1:], x[1]), src + i
         } else if !attribute {
@@ -155,7 +156,7 @@ func unescapeEntity(b []byte, dst, src int, attribute bool) (dst1, src1 int) {
                         maxLen = longestEntityWithoutSemicolon
                 }
                 for j := maxLen; j > 1; j-- {
-                       if x := entity[entityName[:j]]; x != 0 {
+                       if x := entity[string(entityName[:j])]; x != 0 {
                                 return dst + utf8.EncodeRune(b[dst:], x), src + j + 1
                         }
                 }
@@ -166,26 +167,6 @@ func unescapeEntity(b []byte, dst, src int, attribute bool) (dst1, src1 int) {
         return dst1, src1
  }
  
-// unescape unescapes b's entities in-place, so that "a&lt;b" becomes "a<b".
-func unescape(b []byte) []byte {
-       for i, c := range b {
-               if c == '&' {
-                       dst, src := unescapeEntity(b, i, i, false)
-                       for src < len(b) {
-                               c := b[src]
-                               if c == '&' {
-                                       dst, src = unescapeEntity(b, dst, src, false)
-                               } else {
-                                       b[dst] = c
-                                       dst, src = dst+1, src+1
-                               }
-                       }
-                       return b[0:dst]
-               }
-       }
-       return b
-}
-
  var htmlEscaper = strings.NewReplacer(
         `&`, "&amp;",
         `'`, "&#39;", // "&#39;" is shorter than "&apos;" and apos was not in HTML until HTML5.
@@ -208,8 +189,29 @@ func EscapeString(s string) string {
  // UnescapeString(EscapeString(s)) == s always holds, but the converse isn't
  // always true.
  func UnescapeString(s string) string {
-       if !strings.Contains(s, "&") {
+       i := strings.IndexByte(s, '&')
+
+       if i < 0 {
                 return s
         }
-       return string(unescape([]byte(s)))
+
+       b := []byte(s)
+       dst, src := unescapeEntity(b, i, i)
+       for len(s[src:]) > 0 {
+               if s[src] == '&' {
+                       i = 0
+               } else {
+                       i = strings.IndexByte(s[src:], '&')
+               }
+               if i < 0 {
+                       dst += copy(b[dst:], s[src:])
+                       break
+               }
+
+               if i > 0 {
+                       copy(b[dst:], s[src:src+i])
+               }
+               dst, src = unescapeEntity(b, dst+i, src+i)
+       }
+       return string(b[:dst])
  }
diff --git a/src/html/escape_test.go b/src/html/escape_test.go

index 3702626a3dccfdb8315925eab22869ac2aefcda4..8b51a55409fa55ee04149b31bafbe4657312d138 100644 (file)
--- a/src/html/escape_test.go
+++ b/src/html/escape_test.go
@@ -118,8 +118,10 @@ func TestUnescapeEscape(t *testing.T) {
  }
  
  var (
-       benchEscapeData = strings.Repeat("AAAAA < BBBBB > CCCCC & DDDDD ' EEEEE \" ", 100)
-       benchEscapeNone = strings.Repeat("AAAAA x BBBBB x CCCCC x DDDDD x EEEEE x ", 100)
+       benchEscapeData     = strings.Repeat("AAAAA < BBBBB > CCCCC & DDDDD ' EEEEE \" ", 100)
+       benchEscapeNone     = strings.Repeat("AAAAA x BBBBB x CCCCC x DDDDD x EEEEE x ", 100)
+       benchUnescapeSparse = strings.Repeat(strings.Repeat("AAAAA x BBBBB x CCCCC x DDDDD x EEEEE x ", 10)+"&amp;", 10)
+       benchUnescapeDense  = strings.Repeat("&amp;&lt; &amp; &lt;", 100)
  )
  
  func BenchmarkEscape(b *testing.B) {
@@ -151,3 +153,17 @@ func BenchmarkUnescapeNone(b *testing.B) {
                 n += len(UnescapeString(s))
         }
  }
+
+func BenchmarkUnescapeSparse(b *testing.B) {
+       n := 0
+       for i := 0; i < b.N; i++ {
+               n += len(UnescapeString(benchUnescapeSparse))
+       }
+}
+
+func BenchmarkUnescapeDense(b *testing.B) {
+       n := 0
+       for i := 0; i < b.N; i++ {
+               n += len(UnescapeString(benchUnescapeDense))
+       }
+}
author	Ingo Oeser <nightlyone@googlemail.com>
	Sat, 9 May 2015 15:55:05 +0000 (17:55 +0200)
committer	Brad Fitzpatrick <bradfitz@golang.org>
	Sat, 22 Aug 2015 12:45:38 +0000 (12:45 +0000)
src/html/escape.go		patch \| blob \| history
src/html/escape_test.go		patch \| blob \| history