From a5ff8ad9db409bfa35322c8887b2bae964fec210 Mon Sep 17 00:00:00 2001
From: Nigel Tao <nigeltao@golang.org>
Date: Thu, 17 Feb 2011 10:45:30 +1100
Subject: [PATCH] html: tokenize HTML comments.

I'm not sure if it's 100% correct wrt the HTML5 specification, but the
test suite has plenty of HTML comment test cases, and we'll shake out
any tokenization bugs as the parser improves its coverage.

R=gri
CC=golang-dev
https://golang.org/cl/4186055
---
 src/pkg/html/doc.go        |   3 +
 src/pkg/html/token.go      | 117 +++++++++++++++++++++++++++++--------
 src/pkg/html/token_test.go |  86 +++++++++++++++++----------
 3 files changed, 152 insertions(+), 54 deletions(-)
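
(Note, not part of the change: a minimal usage sketch of the new knob,
assuming the package's existing NewTokenizer constructor that the doc.go
example above this hunk already uses. With ReturnComments set, Next surfaces
CommentToken and Token().Data carries the comment text with the "<!--" and
"-->" delimiters already trimmed off:

	package main

	import (
		"fmt"
		"html"
		"strings"
	)

	func main() {
		z := html.NewTokenizer(strings.NewReader("a<!--x-->z"))
		z.ReturnComments = true
		for {
			tt := z.Next()
			if tt == html.ErrorToken {
				return // os.EOF once the input is exhausted.
			}
			if tt == html.CommentToken {
				fmt.Println("comment:", z.Token().Data) // prints "comment: x"
			}
		}
	}

Left unset, the default, Next never returns CommentToken, which keeps the
old behavior for existing callers.)
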
diff --git a/src/pkg/html/doc.go b/src/pkg/html/doc.go
index c5338d0781..4f5dee72da 100644
--- a/src/pkg/html/doc.go
+++ b/src/pkg/html/doc.go
@@ -69,6 +69,9 @@ call to Next. For example, to extract an HTML page's anchor text:
 		}
 	}
 
+A Tokenizer typically skips over HTML comments. To return comment tokens, set
+Tokenizer.ReturnComments to true before looping over calls to Next.
+
 Parsing is done by calling Parse with an io.Reader, which returns the root of
 the parse tree (the document element) as a *Node. It is the caller's
 responsibility to ensure that the Reader provides UTF-8 encoded HTML. For
diff --git a/src/pkg/html/token.go b/src/pkg/html/token.go
index d638838505..8d8d127648 100644
--- a/src/pkg/html/token.go
+++ b/src/pkg/html/token.go
@@ -25,6 +25,8 @@ const (
 	EndTagToken
 	// A SelfClosingTagToken tag looks like <br/>.
 	SelfClosingTagToken
+	// A CommentToken looks like <!--x-->.
+	CommentToken
 )
 
 // String returns a string representation of the TokenType.
@@ -40,6 +42,8 @@ func (t TokenType) String() string {
 		return "EndTag"
 	case SelfClosingTagToken:
 		return "SelfClosingTag"
+	case CommentToken:
+		return "Comment"
 	}
 	return "Invalid(" + strconv.Itoa(int(t)) + ")"
 }
@@ -52,8 +56,8 @@ type Attribute struct {
 }
 
 // A Token consists of a TokenType and some Data (tag name for start and end
-// tags, content for text). A tag Token may also contain a slice of Attributes.
-// Data is unescaped for both tag and text Tokens (it looks like "a<b" rather
-// than "a&lt;b").
+// tags, content for text and comments). A tag Token may also contain a slice
+// of Attributes. Data is unescaped for all Tokens (it looks like "a<b" rather
+// than "a&lt;b").
 type Token struct {
 	Type TokenType
@@ -79,12 +83,18 @@ func (t Token) String() string {
 		return "</" + t.tagString() + ">"
 	case SelfClosingTagToken:
 		return "<" + t.tagString() + "/>"
+	case CommentToken:
+		return "<!--" + t.Data + "-->"
 	}
 	return "Invalid(" + strconv.Itoa(int(t.Type)) + ")"
 }
 
 // A Tokenizer returns a stream of HTML Tokens.
 type Tokenizer struct {
+	// If ReturnComments is set, Next returns comment tokens;
+	// otherwise it skips over comments (default).
+	ReturnComments bool
+
 	// r is the source of the HTML text.
 	r io.Reader
 	// tt is the TokenType of the most recently read token. If tt == Error
@@ -176,6 +186,39 @@ func (z *Tokenizer) readTo(x uint8) os.Error {
 	panic("unreachable")
 }
 
+// nextMarkupDeclaration returns the next TokenType starting with "<!".
+func (z *Tokenizer) nextMarkupDeclaration() (tt TokenType, err os.Error) {
+	// TODO: check for <!DOCTYPE ... >, don't just assume that it's a comment.
+	for i := 0; i < 2; i++ {
+		c, err := z.readByte()
+		if err != nil {
+			return TextToken, err
+		}
+		if c != '-' {
+			return z.nextText(), nil
+		}
+	}
+	// <!--> is a valid comment.
+	for dashCount := 2; ; {
+		c, err := z.readByte()
+		if err != nil {
+			return TextToken, err
+		}
+		switch c {
+		case '-':
+			dashCount++
+		case '>':
+			if dashCount >= 2 {
+				return CommentToken, nil
+			}
+			fallthrough
+		default:
+			dashCount = 0
+		}
+	}
+	panic("unreachable")
+}
+
 // nextTag returns the next TokenType starting from the tag open state.
 func (z *Tokenizer) nextTag() (tt TokenType, err os.Error) {
 	c, err := z.readByte()
@@ -189,7 +232,7 @@ func (z *Tokenizer) nextTag() (tt TokenType, err os.Error) {
 	case 'a' <= c && c <= 'z' || 'A' <= c && c <= 'Z':
 		tt = StartTagToken
 	case c == '!':
-		return ErrorToken, os.NewError("html: TODO(nigeltao): implement comments")
+		return z.nextMarkupDeclaration()
 	case c == '?':
 		return ErrorToken, os.NewError("html: TODO(nigeltao): implement XML processing instructions")
 	default:
@@ -221,22 +264,8 @@ func (z *Tokenizer) nextTag() (tt TokenType, err os.Error) {
 	panic("unreachable")
 }
 
-// Next scans the next token and returns its type.
-func (z *Tokenizer) Next() TokenType {
-	if z.err != nil {
-		z.tt = ErrorToken
-		return z.tt
-	}
-	z.p0 = z.p1
-	c, err := z.readByte()
-	if err != nil {
-		z.tt, z.err = ErrorToken, err
-		return z.tt
-	}
-	if c == '<' {
-		z.tt, z.err = z.nextTag()
-		return z.tt
-	}
+// nextText reads all text up until an '<'.
+func (z *Tokenizer) nextText() TokenType {
 	for {
 		c, err := z.readByte()
 		if err != nil {
@@ -255,6 +284,31 @@ func (z *Tokenizer) Next() TokenType {
 	panic("unreachable")
 }
 
+// Next scans the next token and returns its type.
+func (z *Tokenizer) Next() TokenType {
+	for {
+		if z.err != nil {
+			z.tt = ErrorToken
+			return z.tt
+		}
+		z.p0 = z.p1
+		c, err := z.readByte()
+		if err != nil {
+			z.tt, z.err = ErrorToken, err
+			return z.tt
+		}
+		if c == '<' {
+			z.tt, z.err = z.nextTag()
+			if z.tt == CommentToken && !z.ReturnComments {
+				continue
+			}
+			return z.tt
+		}
+		return z.nextText()
+	}
+	panic("unreachable")
+}
+
 // trim returns the largest j such that z.buf[i:j] contains only white space,
 // or only white space plus the final ">" or "/>" of the raw data.
 func (z *Tokenizer) trim(i int) int {
@@ -299,12 +353,27 @@ loop:
 	return z.buf[i0:i], z.trim(i)
 }
 
-// Text returns the raw data after unescaping.
+// Text returns the unescaped text of a TextToken or a CommentToken.
 // The contents of the returned slice may change on the next call to Next.
 func (z *Tokenizer) Text() []byte {
-	s := unescape(z.Raw())
-	z.p0 = z.p1
-	return s
+	switch z.tt {
+	case TextToken:
+		s := unescape(z.Raw())
+		z.p0 = z.p1
+		return s
+	case CommentToken:
+		// We trim the "<!--" from the left and the "-->" from the right.
+		// "<!-->" is a valid comment, so the adjusted endpoints might overlap.
+		i0 := z.p0 + 4
+		i1 := z.p1 - 3
+		z.p0 = z.p1
+		var s []byte
+		if i0 < i1 {
+			s = unescape(z.buf[i0:i1])
+		}
+		return s
+	}
+	return nil
 }
 
 // TagName returns the lower-cased name of a tag token (the `img` out of
@@ -372,7 +441,7 @@ loop:
 func (z *Tokenizer) Token() Token {
 	t := Token{Type: z.tt}
 	switch z.tt {
-	case TextToken:
+	case TextToken, CommentToken:
 		t.Data = string(z.Text())
 	case StartTagToken, EndTagToken, SelfClosingTagToken:
 		var attr []Attribute
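
(Note, not part of the change: nextMarkupDeclaration's inner loop is a small
state machine over dashCount. A '>' terminates the comment only when at least
two '-'s immediately precede it, which is how "<!-->" and "<!--x>-->" both
scan as complete comments. A standalone sketch of just that loop, for
illustration only; scanComment is a hypothetical helper, not Tokenizer API:

	package main

	import "fmt"

	// scanComment consumes input that follows a "<!--" opener and reports
	// how many bytes it takes to reach a well-formed terminator, mirroring
	// the dashCount loop above. The opener itself already supplies two
	// dashes, so "<!-->" terminates immediately.
	func scanComment(s string) (n int, terminated bool) {
		dashCount := 2
		for i := 0; i < len(s); i++ {
			switch s[i] {
			case '-':
				dashCount++
			case '>':
				if dashCount >= 2 {
					return i + 1, true
				}
				dashCount = 0
			default:
				dashCount = 0
			}
		}
		return len(s), false // ran out of input, like the os.EOF case.
	}

	func main() {
		for _, s := range []string{">z", "->z", "x>-->z", "x->-->z", "z"} {
			n, ok := scanComment(s)
			fmt.Printf("%q: consumed %d bytes, terminated %v\n", s, n, ok)
		}
	}

That behavior is exactly what the comment1 through comment4 golden strings in
the tests below encode.)
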
diff --git a/src/pkg/html/token_test.go b/src/pkg/html/token_test.go
index e07999ca5a..5cf1f6dac3 100644
--- a/src/pkg/html/token_test.go
+++ b/src/pkg/html/token_test.go
@@ -7,6 +7,7 @@ package html
 import (
 	"bytes"
 	"os"
+	"strings"
 	"testing"
 )
 
@@ -15,8 +16,8 @@ type tokenTest struct {
 	desc string
 	// The HTML to parse.
 	html string
-	// The string representations of the expected tokens.
-	tokens []string
+	// The string representations of the expected tokens, joined by '$'.
+	golden string
 }
 
 var tokenTests = []tokenTest{
@@ -25,61 +26,86 @@ var tokenTests = []tokenTest{
 	{
 		"text",
 		"foo  bar",
-		[]string{
-			"foo  bar",
-		},
+		"foo  bar",
 	},
 	// An entity.
 	{
 		"entity",
 		"one &lt; two",
-		[]string{
-			"one &lt; two",
-		},
+		"one &lt; two",
 	},
 	// A start, self-closing and end tag. The tokenizer does not care if the start
 	// and end tokens don't match; that is the job of the parser.
 	{
 		"tags",
 		"<a>b<c/>d</a>",
-		[]string{
-			"<a>",
-			"b",
-			"<c/>",
-			"d",
-			"</a>",
-		},
+		"<a>$b$<c/>$d$</a>",
 	},
+	// Comments.
+	{
+		"comment0",
+		"abc<b><!-- skipme --></b>def",
+		"abc$<b>$</b>$def",
+	},
+	{
+		"comment1",
+		"a<!-->z",
+		"a$z",
+	},
+	{
+		"comment2",
+		"a<!--->z",
+		"a$z",
+	},
+	{
+		"comment3",
+		"a<!--x>-->z",
+		"a$z",
+	},
+	{
+		"comment4",
+		"a<!--x->-->z",
+		"a$z",
+	},
+	{
+		"comment5",
+		"a<!>z",
+		"a$&lt;!&gt;z",
+	},
+	{
+		"comment6",
+		"a<!->z",
+		"a$&lt;!-&gt;z",
+	},
+	{
+		"comment7",
+		"a
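
(Note, not part of the change: the endpoint arithmetic in Text's CommentToken
arm, run on concrete raw tokens. The real method also unescapes, which is
elided here; commentText is a hypothetical helper for illustration. The point
is that "<!-->" and "<!--->" trim to overlapping endpoints and so yield empty
text rather than slicing out of bounds:

	package main

	import "fmt"

	// commentText mirrors Text's CommentToken case: drop the 4-byte "<!--"
	// prefix and the 3-byte "-->" suffix from the raw comment token.
	func commentText(raw string) string {
		i0 := 4            // after "<!--"
		i1 := len(raw) - 3 // before "-->"
		if i0 < i1 {
			return raw[i0:i1]
		}
		return "" // endpoints overlap, e.g. for "<!-->".
	}

	func main() {
		fmt.Printf("%q\n", commentText("<!--x-->")) // "x"
		fmt.Printf("%q\n", commentText("<!-->"))    // ""
		fmt.Printf("%q\n", commentText("<!--->"))   // ""
	}

This matches the Tokenizer treating "<!-->" and "<!--->" as complete, empty
comments in the tests above.)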