Small performance improvements to the HTML tokenizer, addressing the TODO(nigeltao) comments in escape.go and token.go.

author Kyle Consalus <consalus@gmail.com>

Wed, 11 Aug 2010 23:45:34 +0000 (09:45 +1000)

committer Nigel Tao <nigeltao@golang.org>

Wed, 11 Aug 2010 23:45:34 +0000 (09:45 +1000)
author Kyle Consalus <consalus@gmail.com>
Wed, 11 Aug 2010 23:45:34 +0000 (09:45 +1000)
committer Nigel Tao <nigeltao@golang.org>
Wed, 11 Aug 2010 23:45:34 +0000 (09:45 +1000)
diff --git a/src/pkg/html/escape.go b/src/pkg/html/escape.go

index f9fdf8c4d9c954bdc1d011372417fd9f7b913eb9..f30086f3678f27f167df834e719ccaf6d9af9de0 100644 (file)
--- a/src/pkg/html/escape.go
+++ b/src/pkg/html/escape.go
@@ -5,6 +5,7 @@
  package html
  
  import (
+       "bytes"
         "strings"
         "utf8"
  )
@@ -60,18 +61,45 @@ func unescape(b []byte) []byte {
         return b
  }
  
+const escapedChars = `&'<>"`
+
+func escape(buf *bytes.Buffer, s string) {
+       i := strings.IndexAny(s, escapedChars)
+       for i != -1 {
+               buf.WriteString(s[0:i])
+               var esc string
+               switch s[i] {
+               case '&':
+                       esc = "&amp;"
+               case '\'':
+                       esc = "&apos;"
+               case '<':
+                       esc = "&lt;"
+               case '>':
+                       esc = "&gt;"
+               case '"':
+                       esc = "&quot;"
+               default:
+                       panic("unrecognized escape character")
+               }
+               s = s[i+1:]
+               buf.WriteString(esc)
+               i = strings.IndexAny(s, escapedChars)
+       }
+       buf.WriteString(s)
+}
+
  // EscapeString escapes special characters like "<" to become "&lt;". It
  // escapes only five such characters: amp, apos, lt, gt and quot.
  // UnescapeString(EscapeString(s)) == s always holds, but the converse isn't
  // always true.
  func EscapeString(s string) string {
-       // TODO(nigeltao): Do this much more efficiently.
-       s = strings.Replace(s, `&`, `&amp;`, -1)
-       s = strings.Replace(s, `'`, `&apos;`, -1)
-       s = strings.Replace(s, `<`, `&lt;`, -1)
-       s = strings.Replace(s, `>`, `&gt;`, -1)
-       s = strings.Replace(s, `"`, `&quot;`, -1)
-       return s
+       if strings.IndexAny(s, escapedChars) == -1 {
+               return s
+       }
+       buf := bytes.NewBuffer(nil)
+       escape(buf, s)
+       return buf.String()
  }
  
  // UnescapeString unescapes entities like "&lt;" to become "<". It unescapes a
diff --git a/src/pkg/html/token.go b/src/pkg/html/token.go

index 0681af44a425099448fd066159366f6d9d1b20e8..39f6700321c43dbef4e0c4bc03b8cf7d42ecf58c 100644 (file)
--- a/src/pkg/html/token.go
+++ b/src/pkg/html/token.go
@@ -5,6 +5,7 @@
  package html
  
  import (
+       "bytes"
         "io"
         "log"
         "os"
@@ -68,12 +69,19 @@ type Token struct {
  
  // tagString returns a string representation of a tag Token's Data and Attr.
  func (t Token) tagString() string {
-       // TODO(nigeltao): Don't use string concatenation; it is inefficient.
-       s := string(t.Data)
+       if len(t.Attr) == 0 {
+               return t.Data
+       }
+       buf := bytes.NewBuffer(nil)
+       buf.WriteString(t.Data)
         for _, a := range t.Attr {
-               s += ` ` + a.Key + `="` + EscapeString(a.Val) + `"`
+               buf.WriteByte(' ')
+               buf.WriteString(a.Key)
+               buf.WriteString(`="`)
+               escape(buf, a.Val)
+               buf.WriteByte('"')
         }
-       return s
+       return buf.String()
  }
  
  // String returns a string representation of the Token.
author	Kyle Consalus <consalus@gmail.com>
	Wed, 11 Aug 2010 23:45:34 +0000 (09:45 +1000)
committer	Nigel Tao <nigeltao@golang.org>
	Wed, 11 Aug 2010 23:45:34 +0000 (09:45 +1000)
src/pkg/html/escape.go		patch | blob | history
src/pkg/html/token.go		patch | blob | history