html: tokenize HTML comments.

author Nigel Tao <nigeltao@golang.org>

Wed, 16 Feb 2011 23:45:30 +0000 (10:45 +1100)

committer Nigel Tao <nigeltao@golang.org>

Wed, 16 Feb 2011 23:45:30 +0000 (10:45 +1100)
author Nigel Tao <nigeltao@golang.org>
Wed, 16 Feb 2011 23:45:30 +0000 (10:45 +1100)
committer Nigel Tao <nigeltao@golang.org>
Wed, 16 Feb 2011 23:45:30 +0000 (10:45 +1100)
diff --git a/src/pkg/html/doc.go b/src/pkg/html/doc.go

index c5338d0781dbf76cdc32948f50f9df0b9a7a276a..4f5dee72da32a135a16bb5ba631f4b4f42473d3a 100644 (file)
--- a/src/pkg/html/doc.go
+++ b/src/pkg/html/doc.go
@@ -69,6 +69,9 @@ call to Next. For example, to extract an HTML page's anchor text:
                 }
         }
  
+A Tokenizer typically skips over HTML comments. To return comment tokens, set
+Tokenizer.ReturnComments to true before looping over calls to Next.
+
  Parsing is done by calling Parse with an io.Reader, which returns the root of
  the parse tree (the document element) as a *Node. It is the caller's
  responsibility to ensure that the Reader provides UTF-8 encoded HTML. For
diff --git a/src/pkg/html/token.go b/src/pkg/html/token.go

index d6388385051b3df9a0cfc5e16aaf64ee22425a21..8d8d1276485f885fa3c7a8b11cfbde8c65fef5c5 100644 (file)
--- a/src/pkg/html/token.go
+++ b/src/pkg/html/token.go
@@ -25,6 +25,8 @@ const (
         EndTagToken
         // A SelfClosingTagToken tag looks like <br/>.
         SelfClosingTagToken
+       // A CommentToken looks like <!--x-->.
+       CommentToken
  )
  
  // String returns a string representation of the TokenType.
@@ -40,6 +42,8 @@ func (t TokenType) String() string {
                 return "EndTag"
         case SelfClosingTagToken:
                 return "SelfClosingTag"
+       case CommentToken:
+               return "Comment"
         }
         return "Invalid(" + strconv.Itoa(int(t)) + ")"
  }
@@ -52,8 +56,8 @@ type Attribute struct {
  }
  
  // A Token consists of a TokenType and some Data (tag name for start and end
-// tags, content for text). A tag Token may also contain a slice of Attributes.
-// Data is unescaped for both tag and text Tokens (it looks like "a<b" rather
+// tags, content for text and comments). A tag Token may also contain a slice
+// of Attributes. Data is unescaped for all Tokens (it looks like "a<b" rather
  // than "a&lt;b").
  type Token struct {
         Type TokenType
@@ -91,12 +95,18 @@ func (t Token) String() string {
                 return "</" + t.tagString() + ">"
         case SelfClosingTagToken:
                 return "<" + t.tagString() + "/>"
+       case CommentToken:
+               return "<!--" + EscapeString(t.Data) + "-->"
         }
         return "Invalid(" + strconv.Itoa(int(t.Type)) + ")"
  }
  
  // A Tokenizer returns a stream of HTML Tokens.
  type Tokenizer struct {
+       // If ReturnComments is set, Next returns comment tokens;
+       // otherwise it skips over comments (default).
+       ReturnComments bool
+
         // r is the source of the HTML text.
         r io.Reader
         // tt is the TokenType of the most recently read token. If tt == Error
@@ -176,6 +186,39 @@ func (z *Tokenizer) readTo(x uint8) os.Error {
         panic("unreachable")
  }
  
+// nextMarkupDeclaration returns the next TokenType starting with "<!".
+func (z *Tokenizer) nextMarkupDeclaration() (TokenType, os.Error) {
+       // TODO: check for <!DOCTYPE ... >, don't just assume that it's a comment.
+       for i := 0; i < 2; i++ {
+               c, err := z.readByte()
+               if err != nil {
+                       return TextToken, err
+               }
+               if c != '-' {
+                       return z.nextText(), nil
+               }
+       }
+       // <!--> is a valid comment.
+       for dashCount := 2; ; {
+               c, err := z.readByte()
+               if err != nil {
+                       return TextToken, err
+               }
+               switch c {
+               case '-':
+                       dashCount++
+               case '>':
+                       if dashCount >= 2 {
+                               return CommentToken, nil
+                       }
+                       fallthrough
+               default:
+                       dashCount = 0
+               }
+       }
+       panic("unreachable")
+}
+
  // nextTag returns the next TokenType starting from the tag open state.
  func (z *Tokenizer) nextTag() (tt TokenType, err os.Error) {
         c, err := z.readByte()
@@ -189,7 +232,7 @@ func (z *Tokenizer) nextTag() (tt TokenType, err os.Error) {
         case 'a' <= c && c <= 'z' || 'A' <= c && c <= 'Z':
                 tt = StartTagToken
         case c == '!':
-               return ErrorToken, os.NewError("html: TODO(nigeltao): implement comments")
+               return z.nextMarkupDeclaration()
         case c == '?':
                 return ErrorToken, os.NewError("html: TODO(nigeltao): implement XML processing instructions")
         default:
@@ -221,22 +264,8 @@ func (z *Tokenizer) nextTag() (tt TokenType, err os.Error) {
         panic("unreachable")
  }
  
-// Next scans the next token and returns its type.
-func (z *Tokenizer) Next() TokenType {
-       if z.err != nil {
-               z.tt = ErrorToken
-               return z.tt
-       }
-       z.p0 = z.p1
-       c, err := z.readByte()
-       if err != nil {
-               z.tt, z.err = ErrorToken, err
-               return z.tt
-       }
-       if c == '<' {
-               z.tt, z.err = z.nextTag()
-               return z.tt
-       }
+// nextText reads all text up until an '<'.
+func (z *Tokenizer) nextText() TokenType {
         for {
                 c, err := z.readByte()
                 if err != nil {
@@ -255,6 +284,31 @@ func (z *Tokenizer) Next() TokenType {
         panic("unreachable")
  }
  
+// Next scans the next token and returns its type.
+func (z *Tokenizer) Next() TokenType {
+       for {
+               if z.err != nil {
+                       z.tt = ErrorToken
+                       return z.tt
+               }
+               z.p0 = z.p1
+               c, err := z.readByte()
+               if err != nil {
+                       z.tt, z.err = ErrorToken, err
+                       return z.tt
+               }
+               if c == '<' {
+                       z.tt, z.err = z.nextTag()
+                       if z.tt == CommentToken && !z.ReturnComments {
+                               continue
+                       }
+                       return z.tt
+               }
+               return z.nextText()
+       }
+       panic("unreachable")
+}
+
  // trim returns the largest j such that z.buf[i:j] contains only white space,
  // or only white space plus the final ">" or "/>" of the raw data.
  func (z *Tokenizer) trim(i int) int {
@@ -299,12 +353,27 @@ loop:
         return z.buf[i0:i], z.trim(i)
  }
  
-// Text returns the raw data after unescaping.
+// Text returns the unescaped text of a TextToken or a CommentToken.
  // The contents of the returned slice may change on the next call to Next.
  func (z *Tokenizer) Text() []byte {
-       s := unescape(z.Raw())
-       z.p0 = z.p1
-       return s
+       switch z.tt {
+       case TextToken:
+               s := unescape(z.Raw())
+               z.p0 = z.p1
+               return s
+       case CommentToken:
+               // We trim the "<!--" from the left and the "-->" from the right.
+               // "<!-->" is a valid comment, so the adjusted endpoints might overlap.
+               i0 := z.p0 + 4
+               i1 := z.p1 - 3
+               z.p0 = z.p1
+               var s []byte
+               if i0 < i1 {
+                       s = unescape(z.buf[i0:i1])
+               }
+               return s
+       }
+       return nil
  }
  
  // TagName returns the lower-cased name of a tag token (the `img` out of
@@ -372,7 +441,7 @@ loop:
  func (z *Tokenizer) Token() Token {
         t := Token{Type: z.tt}
         switch z.tt {
-       case TextToken:
+       case TextToken, CommentToken:
                 t.Data = string(z.Text())
         case StartTagToken, EndTagToken, SelfClosingTagToken:
                 var attr []Attribute
diff --git a/src/pkg/html/token_test.go b/src/pkg/html/token_test.go

index e07999ca5addd5df5d6264722053a8e6ce8889ef..5cf1f6dac30b9df19b899a7503c5ce2421c7941e 100644 (file)
--- a/src/pkg/html/token_test.go
+++ b/src/pkg/html/token_test.go
@@ -7,6 +7,7 @@ package html
  import (
         "bytes"
         "os"
+       "strings"
         "testing"
  )
  
@@ -15,8 +16,8 @@ type tokenTest struct {
         desc string
         // The HTML to parse.
         html string
-       // The string representations of the expected tokens.
-       tokens []string
+       // The string representations of the expected tokens, joined by '$'.
+       golden string
  }
  
  var tokenTests = []tokenTest{
@@ -25,61 +26,86 @@ var tokenTests = []tokenTest{
         {
                 "text",
                 "foo  bar",
-               []string{
-                       "foo  bar",
-               },
+               "foo  bar",
         },
         // An entity.
         {
                 "entity",
                 "one &lt; two",
-               []string{
-                       "one &lt; two",
-               },
+               "one &lt; two",
         },
         // A start, self-closing and end tag. The tokenizer does not care if the start
         // and end tokens don't match; that is the job of the parser.
         {
                 "tags",
                 "<a>b<c/>d</e>",
-               []string{
-                       "<a>",
-                       "b",
-                       "<c/>",
-                       "d",
-                       "</e>",
-               },
+               "<a>$b$<c/>$d$</e>",
+       },
+       // Comments.
+       {
+               "comment0",
+               "abc<b><!-- skipme --></b>def",
+               "abc$<b>$</b>$def",
+       },
+       {
+               "comment1",
+               "a<!-->z",
+               "a$z",
+       },
+       {
+               "comment2",
+               "a<!--->z",
+               "a$z",
+       },
+       {
+               "comment3",
+               "a<!--x>-->z",
+               "a$z",
+       },
+       {
+               "comment4",
+               "a<!--x->-->z",
+               "a$z",
+       },
+       {
+               "comment5",
+               "a<!>z",
+               "a$&lt;!&gt;z",
+       },
+       {
+               "comment6",
+               "a<!->z",
+               "a$&lt;!-&gt;z",
+       },
+       {
+               "comment7",
+               "a<!---<>z",
+               "a$&lt;!---&lt;&gt;z",
+       },
+       {
+               "comment8",
+               "a<!--z",
+               "a$&lt;!--z",
         },
         // An attribute with a backslash.
         {
                 "backslash",
                 `<p id="a\"b">`,
-               []string{
-                       `<p id="a&quot;b">`,
-               },
+               `<p id="a&quot;b">`,
         },
         // Entities, tag name and attribute key lower-casing, and whitespace
         // normalization within a tag.
         {
                 "tricky",
                 "<p \t\n iD=\"a&quot;B\"  foo=\"bar\"><EM>te&lt;&amp;;xt</em></p>",
-               []string{
-                       `<p id="a&quot;B" foo="bar">`,
-                       "<em>",
-                       "te&lt;&amp;;xt",
-                       "</em>",
-                       "</p>",
-               },
+               `<p id="a&quot;B" foo="bar">$<em>$te&lt;&amp;;xt$</em>$</p>`,
         },
         // A non-existant entity. Tokenizing and converting back to a string should
         // escape the "&" to become "&amp;".
         {
                 "noSuchEntity",
                 `<a b="c&noSuchEntity;d">&lt;&alsoDoesntExist;&`,
-               []string{
-                       `<a b="c&amp;noSuchEntity;d">`,
-                       "&lt;&amp;alsoDoesntExist;&amp;",
-               },
+               `<a b="c&amp;noSuchEntity;d">$&lt;&amp;alsoDoesntExist;&amp;`,
         },
  }
  
@@ -87,7 +113,7 @@ func TestTokenizer(t *testing.T) {
  loop:
         for _, tt := range tokenTests {
                 z := NewTokenizer(bytes.NewBuffer([]byte(tt.html)))
-               for i, s := range tt.tokens {
+               for i, s := range strings.Split(tt.golden, "$", -1) {
                         if z.Next() == ErrorToken {
                                 t.Errorf("%s token %d: want %q got error %v", tt.desc, i, s, z.Error())
                                 continue loop
author	Nigel Tao <nigeltao@golang.org>
	Wed, 16 Feb 2011 23:45:30 +0000 (10:45 +1100)
committer	Nigel Tao <nigeltao@golang.org>
	Wed, 16 Feb 2011 23:45:30 +0000 (10:45 +1100)
src/pkg/html/doc.go		patch \| blob \| history
src/pkg/html/token.go		patch \| blob \| history
src/pkg/html/token_test.go		patch \| blob \| history