html: first cut at a parser.
author    Nigel Tao <nigeltao@golang.org>
          Tue, 7 Dec 2010 01:02:36 +0000 (12:02 +1100)
committer Nigel Tao <nigeltao@golang.org>
          Tue, 7 Dec 2010 01:02:36 +0000 (12:02 +1100)
R=gri
CC=golang-dev
https://golang.org/cl/3355041

src/pkg/html/Makefile
src/pkg/html/doc.go
src/pkg/html/parse.go [new file with mode: 0644]
src/pkg/html/parse_test.go [new file with mode: 0644]
src/pkg/html/token.go
src/pkg/html/token_test.go

diff --git a/src/pkg/html/Makefile b/src/pkg/html/Makefile
index 4bbd98a9368c7f70b80806d6281207e69820e605..00e1c05508dbc276b902e557fc7ad91788ec2a47 100644
@@ -9,6 +9,7 @@ GOFILES=\
        doc.go\
        entity.go\
        escape.go\
+       parse.go\
        token.go\
 
 include ../../Make.pkg
diff --git a/src/pkg/html/doc.go b/src/pkg/html/doc.go
index 9f5d478b42c7c0e652cad6d504c0a4cf36141cd3..c5338d0781dbf76cdc32948f50f9df0b9a7a276a 100644
@@ -15,7 +15,7 @@ which parses the next token and returns its type, or an error:
 
        for {
                tt := z.Next()
-               if tt == html.Error {
+               if tt == html.ErrorToken {
                        // ...
                        return ...
                }
@@ -34,7 +34,7 @@ Entities (such as "&lt;") are unescaped, tag names and attribute keys are
 lower-cased, and attributes are collected into a []Attribute. For example:
 
        for {
-               if z.Next() == html.Error {
+               if z.Next() == html.ErrorToken {
                        // Returning os.EOF indicates success.
                        return z.Error()
                }
@@ -49,15 +49,15 @@ call to Next. For example, to extract an HTML page's anchor text:
        for {
                tt := z.Next()
                switch tt {
-               case Error:
+               case ErrorToken:
                        return z.Error()
-               case Text:
+               case TextToken:
                        if depth > 0 {
                                // emitBytes should copy the []byte it receives,
                                // if it doesn't process it immediately.
                                emitBytes(z.Text())
                        }
-               case StartTag, EndTag:
+               case StartTagToken, EndTagToken:
                        tn, _ := z.TagName()
                        if len(tn) == 1 && tn[0] == 'a' {
-                               if tt == StartTag {
+                               if tt == StartTagToken {
@@ -69,6 +69,26 @@ call to Next. For example, to extract an HTML page's anchor text:
                }
        }
 
+Parsing is done by calling Parse with an io.Reader, which returns the root of
+the parse tree (the document element) as a *Node. It is the caller's
+responsibility to ensure that the Reader provides UTF-8 encoded HTML. For
+example, to process each anchor node in depth-first order:
+
+       doc, err := html.Parse(r)
+       if err != nil {
+               // ...
+       }
+       var f func(*html.Node)
+       f = func(n *html.Node) {
+               if n.Type == html.ElementNode && n.Data == "a" {
+                       // Do something with n...
+               }
+               for _, c := range n.Child {
+                       f(c)
+               }
+       }
+       f(doc)
+
 The relevant specifications include:
 http://www.whatwg.org/specs/web-apps/current-work/multipage/syntax.html and
 http://www.whatwg.org/specs/web-apps/current-work/multipage/tokenization.html
@@ -82,6 +102,5 @@ package html
 // node. Specification compliance is verified by checking expected and actual
 // outputs over a test suite rather than aiming for algorithmic fidelity.
 
-// TODO(nigeltao): Implement a parser, not just a tokenizer.
 // TODO(nigeltao): Does a DOM API belong in this package or a separate one?
 // TODO(nigeltao): How does parsing interact with a JavaScript engine?
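
For comparison with the tokenizer-based anchor loop in the documentation
above, the same kind of text extraction can be phrased against the tree that
Parse returns. This is a minimal sketch against the API added in this change;
collectText is a hypothetical helper, not part of the package, and it assumes
the bytes and html packages are imported:

        // collectText appends to buf, in depth-first order, the data of
        // every text node under n.
        func collectText(n *html.Node, buf *bytes.Buffer) {
                if n.Type == html.TextNode {
                        buf.WriteString(n.Data)
                }
                for _, c := range n.Child {
                        collectText(c, buf)
                }
        }
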
diff --git a/src/pkg/html/parse.go b/src/pkg/html/parse.go
new file mode 100644
index 0000000..d3c1f12
--- /dev/null
@@ -0,0 +1,414 @@
+// Copyright 2010 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package html
+
+import (
+       "io"
+       "os"
+)
+
+// A NodeType is the type of a Node.
+type NodeType int
+
+const (
+       ErrorNode NodeType = iota
+       TextNode
+       DocumentNode
+       ElementNode
+       CommentNode
+)
+
+// A Node consists of a NodeType and some Data (tag name for element nodes,
+// content for text) and is part of a tree of Nodes. Element nodes may also
+// contain a slice of Attributes. Data is unescaped, so that it looks like
+// "a<b" rather than "a&lt;b".
+type Node struct {
+       Parent *Node
+       Child  []*Node
+       Type   NodeType
+       Data   string
+       Attr   []Attribute
+}
+
+// An insertion mode (section 10.2.3.1) is the state transition function from
+// a particular state in the HTML5 parser's state machine. In addition to
+// returning the next state, it also returns whether the token was consumed.
+type insertionMode func(*parser) (insertionMode, bool)
+
+// A parser implements the HTML5 parsing algorithm:
+// http://www.whatwg.org/specs/web-apps/current-work/multipage/tokenization.html#tree-construction
+type parser struct {
+       // tokenizer provides the tokens for the parser.
+       tokenizer *Tokenizer
+       // tok is the most recently read token.
+       tok Token
+       // Self-closing tags like <hr/> are re-interpreted as a two-token sequence:
+       // <hr> followed by </hr>. hasSelfClosingToken is true if we have just read
+       // the synthetic start tag and the next one due is the matching end tag.
+       hasSelfClosingToken bool
+       // doc is the document root element.
+       doc *Node
+       // The stack of open elements (section 10.2.3.2).
+       stack []*Node
+       // Element pointers (section 10.2.3.4).
+       head, form *Node
+       // Other parsing state flags (section 10.2.3.5).
+       scripting, framesetOK bool
+}
+
+// pop pops the top of the stack of open elements.
+// It will panic if the stack is empty.
+func (p *parser) pop() *Node {
+       n := len(p.stack)
+       ret := p.stack[n-1]
+       p.stack = p.stack[:n-1]
+       return ret
+}
+
+// push pushes onto the stack of open elements.
+func (p *parser) push(n *Node) {
+       p.stack = append(p.stack, n)
+}
+
+// top returns the top of the stack of open elements.
+// This is also known as the current node.
+func (p *parser) top() *Node {
+       if n := len(p.stack); n > 0 {
+               return p.stack[n-1]
+       }
+       return p.doc
+}
+
+// addChild adds a child node n to the top element, and pushes n
+// if it is an element node (text nodes do not have children).
+func (p *parser) addChild(n *Node) {
+       m := p.top()
+       m.Child = append(m.Child, n)
+       if n.Type == ElementNode {
+               p.push(n)
+       }
+}
+
+// addText adds text to the current node.
+func (p *parser) addText(s string) {
+       // TODO(nigeltao): merge s with previous text, if the preceding node is a text node.
+       // TODO(nigeltao): distinguish whitespace text from others.
+       p.addChild(&Node{
+               Type: TextNode,
+               Data: s,
+       })
+}
+
+// Section 10.2.3.3.
+func (p *parser) addFormattingElement(n *Node) {
+       p.addChild(n)
+       // TODO.
+}
+
+// Section 10.2.3.3.
+func (p *parser) reconstructActiveFormattingElements() {
+       // TODO.
+}
+
+// read reads the next token. This is usually from the tokenizer, but it may
+// be the synthesized end tag implied by a self-closing tag.
+func (p *parser) read() os.Error {
+       if p.hasSelfClosingToken {
+               p.hasSelfClosingToken = false
+               p.tok.Type = EndTagToken
+               p.tok.Attr = nil
+               return nil
+       }
+       if tokenType := p.tokenizer.Next(); tokenType == ErrorToken {
+               return p.tokenizer.Error()
+       }
+       p.tok = p.tokenizer.Token()
+       if p.tok.Type == SelfClosingTagToken {
+               p.hasSelfClosingToken = true
+               p.tok.Type = StartTagToken
+       }
+       return nil
+}
+
+// Section 10.2.4.
+func (p *parser) acknowledgeSelfClosingTag() {
+       p.hasSelfClosingToken = false
+}
+
+// Section 10.2.5.4.
+func initialInsertionMode(p *parser) (insertionMode, bool) {
+       // TODO(nigeltao): check p.tok for DOCTYPE.
+       return beforeHTMLInsertionMode, false
+}
+
+// Section 10.2.5.5.
+func beforeHTMLInsertionMode(p *parser) (insertionMode, bool) {
+       var (
+               add     bool
+               attr    []Attribute
+               implied bool
+       )
+       switch p.tok.Type {
+       case TextToken:
+               // TODO(nigeltao): distinguish whitespace text from others.
+               implied = true
+       case StartTagToken:
+               if p.tok.Data == "html" {
+                       add = true
+                       attr = p.tok.Attr
+               } else {
+                       implied = true
+               }
+       case EndTagToken:
+               // TODO.
+       }
+       if add || implied {
+               p.addChild(&Node{
+                       Type: ElementNode,
+                       Data: "html",
+                       Attr: attr,
+               })
+       }
+       return beforeHeadInsertionMode, !implied
+}
+
+// Section 10.2.5.6.
+func beforeHeadInsertionMode(p *parser) (insertionMode, bool) {
+       var (
+               add     bool
+               attr    []Attribute
+               implied bool
+       )
+       switch p.tok.Type {
+       case TextToken:
+               // TODO(nigeltao): distinguish whitespace text from others.
+               implied = true
+       case StartTagToken:
+               switch p.tok.Data {
+               case "head":
+                       add = true
+                       attr = p.tok.Attr
+               case "html":
+                       // TODO.
+               default:
+                       implied = true
+               }
+       case EndTagToken:
+               // TODO.
+       }
+       if add || implied {
+               p.addChild(&Node{
+                       Type: ElementNode,
+                       Data: "head",
+                       Attr: attr,
+               })
+       }
+       return inHeadInsertionMode, !implied
+}
+
+// Section 10.2.5.7.
+func inHeadInsertionMode(p *parser) (insertionMode, bool) {
+       var (
+               pop     bool
+               implied bool
+       )
+       switch p.tok.Type {
+       case TextToken:
+               implied = true
+       case StartTagToken:
+               switch p.tok.Data {
+               case "meta":
+                       // TODO.
+               case "script":
+                       // TODO.
+               default:
+                       implied = true
+               }
+       case EndTagToken:
+               if p.tok.Data == "head" {
+                       pop = true
+               }
+               // TODO.
+       }
+       if pop || implied {
+               n := p.pop()
+               if n.Data != "head" {
+                       panic("html: bad parser state")
+               }
+               return afterHeadInsertionMode, !implied
+       }
+       return inHeadInsertionMode, !implied
+}
+
+// Section 10.2.5.9.
+func afterHeadInsertionMode(p *parser) (insertionMode, bool) {
+       var (
+               add        bool
+               attr       []Attribute
+               framesetOK bool
+               implied    bool
+       )
+       switch p.tok.Type {
+       case TextToken:
+               implied = true
+               framesetOK = true
+       case StartTagToken:
+               switch p.tok.Data {
+               case "html":
+                       // TODO.
+               case "body":
+                       add = true
+                       attr = p.tok.Attr
+                       framesetOK = false
+               case "frameset":
+                       // TODO.
+               case "base", "basefont", "bgsound", "link", "meta", "noframes", "script", "style", "title":
+                       // TODO.
+               case "head":
+                       // TODO.
+               default:
+                       implied = true
+                       framesetOK = true
+               }
+       case EndTagToken:
+               // TODO.
+       }
+       if add || implied {
+               p.addChild(&Node{
+                       Type: ElementNode,
+                       Data: "body",
+                       Attr: attr,
+               })
+               p.framesetOK = framesetOK
+       }
+       return inBodyInsertionMode, !implied
+}
+
+// Section 10.2.5.10.
+func inBodyInsertionMode(p *parser) (insertionMode, bool) {
+       var endP bool
+       switch p.tok.Type {
+       case TextToken:
+               p.addText(p.tok.Data)
+               p.framesetOK = false
+       case StartTagToken:
+               switch p.tok.Data {
+               case "address", "article", "aside", "blockquote", "center", "details", "dir", "div", "dl", "fieldset", "figcaption", "figure", "footer", "header", "hgroup", "menu", "nav", "ol", "p", "section", "summary", "ul":
+                       // TODO(nigeltao): Do the proper "does the stack of open elements have a p element in button scope" algorithm in section 10.2.3.2.
+                       n := p.top()
+                       if n.Type == ElementNode && n.Data == "p" {
+                               endP = true
+                       } else {
+                               p.addChild(&Node{
+                                       Type: ElementNode,
+                                       Data: p.tok.Data,
+                                       Attr: p.tok.Attr,
+                               })
+                       }
+               case "b", "big", "code", "em", "font", "i", "s", "small", "strike", "strong", "tt", "u":
+                       p.reconstructActiveFormattingElements()
+                       p.addFormattingElement(&Node{
+                               Type: ElementNode,
+                               Data: p.tok.Data,
+                               Attr: p.tok.Attr,
+                       })
+               case "area", "br", "embed", "img", "input", "keygen", "wbr":
+                       p.reconstructActiveFormattingElements()
+                       p.addChild(&Node{
+                               Type: ElementNode,
+                               Data: p.tok.Data,
+                               Attr: p.tok.Attr,
+                       })
+                       p.pop()
+                       p.acknowledgeSelfClosingTag()
+                       p.framesetOK = false
+               case "hr":
+                       // TODO(nigeltao): auto-insert </p> if necessary.
+                       p.addChild(&Node{
+                               Type: ElementNode,
+                               Data: p.tok.Data,
+                               Attr: p.tok.Attr,
+                       })
+                       p.pop()
+                       p.acknowledgeSelfClosingTag()
+                       p.framesetOK = false
+               default:
+                       // TODO.
+               }
+       case EndTagToken:
+               switch p.tok.Data {
+               case "body":
+                       // TODO(nigeltao): autoclose the stack of open elements.
+                       return afterBodyInsertionMode, true
+               case "a", "b", "big", "code", "em", "font", "i", "nobr", "s", "small", "strike", "strong", "tt", "u":
+                       // TODO(nigeltao): implement the "adoption agency" algorithm:
+                       // http://www.whatwg.org/specs/web-apps/current-work/multipage/tokenization.html#adoptionAgency
+                       p.pop()
+               default:
+                       // TODO.
+               }
+       }
+       if endP {
+               // TODO(nigeltao): do the proper algorithm.
+               n := p.pop()
+               if n.Type != ElementNode || n.Data != "p" {
+                       panic("unreachable")
+               }
+       }
+       return inBodyInsertionMode, !endP
+}
+
+// Section 10.2.5.22.
+func afterBodyInsertionMode(p *parser) (insertionMode, bool) {
+       switch p.tok.Type {
+       case TextToken:
+               // TODO.
+       case StartTagToken:
+               // TODO.
+       case EndTagToken:
+               switch p.tok.Data {
+               case "html":
+                       // TODO(nigeltao): autoclose the stack of open elements.
+                       return afterAfterBodyInsertionMode, true
+               default:
+                       // TODO.
+               }
+       }
+       return afterBodyInsertionMode, true
+}
+
+// Section 10.2.5.25.
+func afterAfterBodyInsertionMode(p *parser) (insertionMode, bool) {
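+       // Reprocess the token in the "in body" insertion mode (a sketch of
+       // section 10.2.5.25); returning false leaves the token unconsumed.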
+       return inBodyInsertionMode, false
+}
+
+// Parse returns the parse tree for the HTML from the given Reader.
+// The input is assumed to be UTF-8 encoded.
+func Parse(r io.Reader) (*Node, os.Error) {
+       p := &parser{
+               tokenizer: NewTokenizer(r),
+               doc: &Node{
+                       Type: DocumentNode,
+               },
+               scripting:  true,
+               framesetOK: true,
+       }
+       im, consumed := initialInsertionMode, true
+       for {
+               if consumed {
+                       if err := p.read(); err != nil {
+                               if err == os.EOF {
+                                       break
+                               }
+                               return nil, err
+                       }
+               }
+               im, consumed = im(p)
+       }
+       // TODO(nigeltao): clean up, depending on the value of im.
+       // The specification's algorithm does clean up on reading an EOF 'token',
+       // but in Go we represent EOF by an os.Error instead.
+       return p.doc, nil
+}
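
A note on reading the insertion modes above: read re-interprets a
self-closing tag as a start tag plus a synthetic end tag, so the insertion
modes never see SelfClosingTagToken directly. An illustrative sketch of the
token stream they would be handed for the input "<b><hr/>x":

        StartTagToken  "b"
        StartTagToken  "hr"    (read sets hasSelfClosingToken)
        EndTagToken    "hr"    (synthesized by the next call to read)
        TextToken      "x"
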
diff --git a/src/pkg/html/parse_test.go b/src/pkg/html/parse_test.go
new file mode 100644
index 0000000..7fa4f42
--- /dev/null
@@ -0,0 +1,153 @@
+// Copyright 2010 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package html
+
+import (
+       "bufio"
+       "bytes"
+       "fmt"
+       "io"
+       "io/ioutil"
+       "os"
+       "testing"
+)
+
+type devNull struct{}
+
+func (devNull) Write(p []byte) (int, os.Error) {
+       return len(p), nil
+}
+
+func pipeErr(err os.Error) io.Reader {
+       pr, pw := io.Pipe()
+       pw.CloseWithError(err)
+       return pr
+}
+
+func readDat(filename string, c chan io.Reader) {
+       f, err := os.Open("testdata/webkit/"+filename, os.O_RDONLY, 0600)
+       if err != nil {
+               c <- pipeErr(err)
+               return
+       }
+       defer f.Close()
+
+       // Loop through the lines of the file. Each line beginning with "#" denotes
+       // a new section, which is returned as a separate io.Reader.
+       r := bufio.NewReader(f)
+       var pw *io.PipeWriter
+       for {
+               line, err := r.ReadSlice('\n')
+               if err != nil {
+                       if pw != nil {
+                               pw.CloseWithError(err)
+                               pw = nil
+                       } else {
+                               c <- pipeErr(err)
+                       }
+                       return
+               }
+               if len(line) == 0 {
+                       continue
+               }
+               if line[0] == '#' {
+                       if pw != nil {
+                               pw.Close()
+                       }
+                       var pr *io.PipeReader
+                       pr, pw = io.Pipe()
+                       c <- pr
+                       continue
+               }
+               if line[0] != '|' {
+                       // Strip the trailing '\n'.
+                       line = line[:len(line)-1]
+               }
+               if pw != nil {
+                       if _, err := pw.Write(line); err != nil {
+                               pw.CloseWithError(err)
+                               pw = nil
+                       }
+               }
+       }
+}
+
+func dumpLevel(w io.Writer, n *Node, level int) os.Error {
+       io.WriteString(w, "| ")
+       for i := 0; i < level; i++ {
+               io.WriteString(w, "  ")
+       }
+       switch n.Type {
+       case ErrorNode:
+               return os.NewError("unexpected ErrorNode")
+       case DocumentNode:
+               return os.NewError("unexpected DocumentNode")
+       case ElementNode:
+               fmt.Fprintf(w, "<%s>", EscapeString(n.Data))
+       case TextNode:
+               fmt.Fprintf(w, "%q", EscapeString(n.Data))
+       case CommentNode:
+               return os.NewError("COMMENT")
+       default:
+               return os.NewError("unknown node type")
+       }
+       io.WriteString(w, "\n")
+       for _, c := range n.Child {
+               if err := dumpLevel(w, c, level+1); err != nil {
+                       return err
+               }
+       }
+       return nil
+}
+
+func dump(n *Node) (string, os.Error) {
+       if n == nil || len(n.Child) == 0 {
+               return "", nil
+       }
+       if len(n.Child) > 1 {
+               return "too many children", nil
+       }
+       b := bytes.NewBuffer(nil)
+       if err := dumpLevel(b, n.Child[0], 0); err != nil {
+               return "", err
+       }
+       return b.String(), nil
+}
+
+func TestParser(t *testing.T) {
+       // TODO(nigeltao): Process all the .dat files, not just the first one.
+       filenames := []string{
+               "tests1.dat",
+       }
+       for _, filename := range filenames {
+               rc := make(chan io.Reader)
+               go readDat(filename, rc)
+               // TODO(nigeltao): Process all test cases, not just the first three.
+               for i := 0; i < 3; i++ {
+                       // Parse the #data section.
+                       doc, err := Parse(<-rc)
+                       if err != nil {
+                               t.Fatal(err)
+                       }
+                       actual, err := dump(doc)
+                       if err != nil {
+                               t.Fatal(err)
+                       }
+                       // Skip the #errors section.
+                       if _, err := io.Copy(devNull{}, <-rc); err != nil {
+                               t.Fatal(err)
+                       }
+                       // Compare the parsed tree to the #document section.
+                       b, err := ioutil.ReadAll(<-rc)
+                       if err != nil {
+                               t.Fatal(err)
+                       }
+                       expected := string(b)
+                       if actual != expected {
+                               t.Errorf("%s test #%d, actual vs expected:\n----\n%s----\n%s----", filename, i, actual, expected)
+                       }
+               }
+       }
+}
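
The WebKit test files that readDat splits are divided into sections
introduced by lines starting with "#": the test parses the #data section,
skips the #errors section, and compares the dump of the resulting tree
against the #document section, whose "| "-prefixed lines are exactly the
format dumpLevel emits. An illustrative sketch of one such case (not a
verbatim excerpt from tests1.dat):

        #data
        <b>One</b>
        #errors
        #document
        | <html>
        |   <head>
        |   <body>
        |     <b>
        |       "One"
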
diff --git a/src/pkg/html/token.go b/src/pkg/html/token.go
index 0d4de254308de33463b2f148afb27738213dfeba..dc2a6ec5c317ebd3ef4aa60ab591f26eab0560e0 100644
@@ -15,30 +15,30 @@ import (
 type TokenType int
 
 const (
-       // Error means that an error occurred during tokenization.
-       Error TokenType = iota
-       // Text means a text node.
-       Text
-       // A StartTag looks like <a>.
-       StartTag
-       // An EndTag looks like </a>.
-       EndTag
-       // A SelfClosingTag tag looks like <br/>.
-       SelfClosingTag
+       // ErrorToken means that an error occurred during tokenization.
+       ErrorToken TokenType = iota
+       // TextToken means a text node.
+       TextToken
+       // A StartTagToken looks like <a>.
+       StartTagToken
+       // An EndTagToken looks like </a>.
+       EndTagToken
+       // A SelfClosingTagToken looks like <br/>.
+       SelfClosingTagToken
 )
 
 // String returns a string representation of the TokenType.
 func (t TokenType) String() string {
        switch t {
-       case Error:
+       case ErrorToken:
                return "Error"
-       case Text:
+       case TextToken:
                return "Text"
-       case StartTag:
+       case StartTagToken:
                return "StartTag"
-       case EndTag:
+       case EndTagToken:
                return "EndTag"
-       case SelfClosingTag:
+       case SelfClosingTagToken:
                return "SelfClosingTag"
        }
        return "Invalid(" + strconv.Itoa(int(t)) + ")"
@@ -81,15 +81,15 @@ func (t Token) tagString() string {
 // String returns a string representation of the Token.
 func (t Token) String() string {
        switch t.Type {
-       case Error:
+       case ErrorToken:
                return ""
-       case Text:
+       case TextToken:
                return EscapeString(t.Data)
-       case StartTag:
+       case StartTagToken:
                return "<" + t.tagString() + ">"
-       case EndTag:
+       case EndTagToken:
                return "</" + t.tagString() + ">"
-       case SelfClosingTag:
+       case SelfClosingTagToken:
                return "<" + t.tagString() + "/>"
        }
        return "Invalid(" + strconv.Itoa(int(t.Type)) + ")"
@@ -109,10 +109,10 @@ type Tokenizer struct {
        buf    []byte
 }
 
-// Error returns the error associated with the most recent Error token. This is
-// typically os.EOF, meaning the end of tokenization.
+// Error returns the error associated with the most recent ErrorToken.
+// This is typically os.EOF, meaning the end of tokenization.
 func (z *Tokenizer) Error() os.Error {
-       if z.tt != Error {
+       if z.tt != ErrorToken {
                return nil
        }
        return z.err
@@ -180,40 +180,40 @@ func (z *Tokenizer) readTo(x uint8) os.Error {
 func (z *Tokenizer) nextTag() (tt TokenType, err os.Error) {
        c, err := z.readByte()
        if err != nil {
-               return Error, err
+               return ErrorToken, err
        }
        switch {
        case c == '/':
-               tt = EndTag
+               tt = EndTagToken
        // Lower-cased characters are more common in tag names, so we check for them first.
        case 'a' <= c && c <= 'z' || 'A' <= c && c <= 'Z':
-               tt = StartTag
+               tt = StartTagToken
        case c == '!':
-               return Error, os.NewError("html: TODO(nigeltao): implement comments")
+               return ErrorToken, os.NewError("html: TODO(nigeltao): implement comments")
        case c == '?':
-               return Error, os.NewError("html: TODO(nigeltao): implement XML processing instructions")
+               return ErrorToken, os.NewError("html: TODO(nigeltao): implement XML processing instructions")
        default:
-               return Error, os.NewError("html: TODO(nigeltao): handle malformed tags")
+               return ErrorToken, os.NewError("html: TODO(nigeltao): handle malformed tags")
        }
        for {
                c, err := z.readByte()
                if err != nil {
-                       return Text, err
+                       return TextToken, err
                }
                switch c {
                case '"':
                        err = z.readTo('"')
                        if err != nil {
-                               return Text, err
+                               return TextToken, err
                        }
                case '\'':
                        err = z.readTo('\'')
                        if err != nil {
-                               return Text, err
+                               return TextToken, err
                        }
                case '>':
-                       if z.buf[z.p1-2] == '/' && tt == StartTag {
-                               return SelfClosingTag, nil
+                       if z.buf[z.p1-2] == '/' && tt == StartTagToken {
+                               return SelfClosingTagToken, nil
                        }
                        return tt, nil
                }
@@ -224,13 +224,13 @@ func (z *Tokenizer) nextTag() (tt TokenType, err os.Error) {
 // Next scans the next token and returns its type.
 func (z *Tokenizer) Next() TokenType {
        if z.err != nil {
-               z.tt = Error
+               z.tt = ErrorToken
                return z.tt
        }
        z.p0 = z.p1
        c, err := z.readByte()
        if err != nil {
-               z.tt, z.err = Error, err
+               z.tt, z.err = ErrorToken, err
                return z.tt
        }
        if c == '<' {
@@ -240,15 +240,15 @@ func (z *Tokenizer) Next() TokenType {
        for {
                c, err := z.readByte()
                if err != nil {
-                       z.tt, z.err = Error, err
+                       z.tt, z.err = ErrorToken, err
                        if err == os.EOF {
-                               z.tt = Text
+                               z.tt = TextToken
                        }
                        return z.tt
                }
                if c == '<' {
                        z.p1--
-                       z.tt = Text
+                       z.tt = TextToken
                        return z.tt
                }
        }
@@ -371,9 +371,9 @@ loop:
 func (z *Tokenizer) Token() Token {
        t := Token{Type: z.tt}
        switch z.tt {
-       case Text:
+       case TextToken:
                t.Data = string(z.Text())
-       case StartTag, EndTag, SelfClosingTag:
+       case StartTagToken, EndTagToken, SelfClosingTagToken:
                var attr []Attribute
                name, remaining := z.TagName()
                for remaining {
diff --git a/src/pkg/html/token_test.go b/src/pkg/html/token_test.go
index 5759476eab433313516c510a4b2fa4bfc64b6381..7dbe13ddfe60c0f2203898a1e4127592b02cd819 100644
@@ -88,7 +88,7 @@ loop:
        for _, tt := range tokenTests {
                z := NewTokenizer(bytes.NewBuffer([]byte(tt.html)))
                for i, s := range tt.tokens {
-                       if z.Next() == Error {
+                       if z.Next() == ErrorToken {
                                t.Errorf("%s token %d: want %q got error %v", tt.desc, i, s, z.Error())
                                continue loop
                        }
@@ -134,19 +134,19 @@ loop:
        for {
                tt := z.Next()
                switch tt {
-               case Error:
+               case ErrorToken:
                        if z.Error() != os.EOF {
                                t.Error(z.Error())
                        }
                        break loop
-               case Text:
+               case TextToken:
                        if depth > 0 {
                                result.Write(z.Text())
                        }
-               case StartTag, EndTag:
+               case StartTagToken, EndTagToken:
                        tn, _ := z.TagName()
                        if len(tn) == 1 && tn[0] == 'a' {
-                               if tt == StartTag {
+                               if tt == StartTagToken {
                                        depth++
                                } else {
                                        depth--