html: handle unexpected EOF during parsing.

author Nigel Tao <nigeltao@golang.org>

Tue, 7 Dec 2010 21:59:20 +0000 (08:59 +1100)

committer Nigel Tao <nigeltao@golang.org>

Tue, 7 Dec 2010 21:59:20 +0000 (08:59 +1100)
author Nigel Tao <nigeltao@golang.org>
Tue, 7 Dec 2010 21:59:20 +0000 (08:59 +1100)
committer Nigel Tao <nigeltao@golang.org>
Tue, 7 Dec 2010 21:59:20 +0000 (08:59 +1100)
diff --git a/src/pkg/html/parse.go b/src/pkg/html/parse.go

index d3c1f12135f6600eab7c4e56268680b1662bf680..acc3eccbcc7134278e0cd616d6b423f4c0db7afb 100644 (file)
--- a/src/pkg/html/parse.go
+++ b/src/pkg/html/parse.go
@@ -32,11 +32,6 @@ type Node struct {
         Attr   []Attribute
  }
  
-// An insertion mode (section 10.2.3.1) is the state transition function from
-// a particular state in the HTML5 parser's state machine. In addition to
-// returning the next state, it also returns whether the token was consumed.
-type insertionMode func(*parser) (insertionMode, bool)
-
  // A parser implements the HTML5 parsing algorithm:
  // http://www.whatwg.org/specs/web-apps/current-work/multipage/tokenization.html#tree-construction
  type parser struct {
@@ -121,11 +116,12 @@ func (p *parser) read() os.Error {
                 p.tok.Attr = nil
                 return nil
         }
-       if tokenType := p.tokenizer.Next(); tokenType == ErrorToken {
-               return p.tokenizer.Error()
-       }
+       p.tokenizer.Next()
         p.tok = p.tokenizer.Token()
-       if p.tok.Type == SelfClosingTagToken {
+       switch p.tok.Type {
+       case ErrorToken:
+               return p.tokenizer.Error()
+       case SelfClosingTagToken:
                 p.hasSelfClosingToken = true
                 p.tok.Type = StartTagToken
         }
@@ -137,6 +133,13 @@ func (p *parser) acknowledgeSelfClosingTag() {
         p.hasSelfClosingToken = false
  }
  
+// An insertion mode (section 10.2.3.1) is the state transition function from
+// a particular state in the HTML5 parser's state machine. It updates the
+// parser's fields depending on parser.tok (where ErrorToken means EOF). In
+// addition to returning the next insertionMode state, it also returns whether
+// the token was consumed.
+type insertionMode func(*parser) (insertionMode, bool)
+
  // Section 10.2.5.4.
  func initialInsertionMode(p *parser) (insertionMode, bool) {
         // TODO(nigeltao): check p.tok for DOCTYPE.
@@ -151,6 +154,8 @@ func beforeHTMLInsertionMode(p *parser) (insertionMode, bool) {
                 implied bool
         )
         switch p.tok.Type {
+       case ErrorToken:
+               implied = true
         case TextToken:
                 // TODO(nigeltao): distinguish whitespace text from others.
                 implied = true
@@ -162,7 +167,12 @@ func beforeHTMLInsertionMode(p *parser) (insertionMode, bool) {
                         implied = true
                 }
         case EndTagToken:
-               // TODO.
+               switch p.tok.Data {
+               case "head", "body", "html", "br":
+                       implied = true
+               default:
+                       // Ignore the token.
+               }
         }
         if add || implied {
                 p.addChild(&Node{
@@ -182,6 +192,8 @@ func beforeHeadInsertionMode(p *parser) (insertionMode, bool) {
                 implied bool
         )
         switch p.tok.Type {
+       case ErrorToken:
+               implied = true
         case TextToken:
                 // TODO(nigeltao): distinguish whitespace text from others.
                 implied = true
@@ -191,12 +203,17 @@ func beforeHeadInsertionMode(p *parser) (insertionMode, bool) {
                         add = true
                         attr = p.tok.Attr
                 case "html":
-                       // TODO.
+                       return inBodyInsertionMode, false
                 default:
                         implied = true
                 }
         case EndTagToken:
-               // TODO.
+               switch p.tok.Data {
+               case "head", "body", "html", "br":
+                       implied = true
+               default:
+                       // Ignore the token.
+               }
         }
         if add || implied {
                 p.addChild(&Node{
@@ -215,7 +232,7 @@ func inHeadInsertionMode(p *parser) (insertionMode, bool) {
                 implied bool
         )
         switch p.tok.Type {
-       case TextToken:
+       case ErrorToken, TextToken:
                 implied = true
         case StartTagToken:
                 switch p.tok.Data {
@@ -251,7 +268,7 @@ func afterHeadInsertionMode(p *parser) (insertionMode, bool) {
                 implied    bool
         )
         switch p.tok.Type {
-       case TextToken:
+       case ErrorToken, TextToken:
                 implied = true
                 framesetOK = true
         case StartTagToken:
@@ -290,6 +307,8 @@ func afterHeadInsertionMode(p *parser) (insertionMode, bool) {
  func inBodyInsertionMode(p *parser) (insertionMode, bool) {
         var endP bool
         switch p.tok.Type {
+       case ErrorToken:
+               // No-op.
         case TextToken:
                 p.addText(p.tok.Data)
                 p.framesetOK = false
@@ -363,6 +382,8 @@ func inBodyInsertionMode(p *parser) (insertionMode, bool) {
  // Section 10.2.5.22.
  func afterBodyInsertionMode(p *parser) (insertionMode, bool) {
         switch p.tok.Type {
+       case ErrorToken:
+               // TODO.
         case TextToken:
                 // TODO.
         case StartTagToken:
@@ -395,6 +416,7 @@ func Parse(r io.Reader) (*Node, os.Error) {
                 scripting:  true,
                 framesetOK: true,
         }
+       // Iterate until EOF. Any other error will cause an early return.
         im, consumed := initialInsertionMode, true
         for {
                 if consumed {
@@ -407,8 +429,11 @@ func Parse(r io.Reader) (*Node, os.Error) {
                 }
                 im, consumed = im(p)
         }
-       // TODO(nigeltao): clean up, depending on the value of im.
-       // The specification's algorithm does clean up on reading an EOF 'token',
-       // but in go we represent EOF by an os.Error instead.
+       // Loop until the final token (the ErrorToken signifying EOF) is consumed.
+       for {
+               if im, consumed = im(p); consumed {
+                       break
+               }
+       }
         return p.doc, nil
  }
diff --git a/src/pkg/html/parse_test.go b/src/pkg/html/parse_test.go

index 7fa4f427671d8414dde79eebbeacfb8597e772d3..839a034b7d59690830c51b9293fd29678dceb691 100644 (file)
--- a/src/pkg/html/parse_test.go
+++ b/src/pkg/html/parse_test.go
@@ -106,12 +106,11 @@ func dump(n *Node) (string, os.Error) {
         if n == nil || len(n.Child) == 0 {
                 return "", nil
         }
-       if len(n.Child) > 1 {
-               return "too many children", nil
-       }
         b := bytes.NewBuffer(nil)
-       if err := dumpLevel(b, n.Child[0], 0); err != nil {
-               return "", err
+       for _, child := range n.Child {
+               if err := dumpLevel(b, child, 0); err != nil {
+                       return "", err
+               }
         }
         return b.String(), nil
  }
@@ -124,8 +123,8 @@ func TestParser(t *testing.T) {
         for _, filename := range filenames {
                 rc := make(chan io.Reader)
                 go readDat(filename, rc)
-               // TODO(nigeltao): Process all test cases, not just the first three.
-               for i := 0; i < 3; i++ {
+               // TODO(nigeltao): Process all test cases, not just a subset.
+               for i := 0; i < 19; i++ {
                         // Parse the #data section.
                         doc, err := Parse(<-rc)
                         if err != nil {
author	Nigel Tao <nigeltao@golang.org>
	Tue, 7 Dec 2010 21:59:20 +0000 (08:59 +1100)
committer	Nigel Tao <nigeltao@golang.org>
	Tue, 7 Dec 2010 21:59:20 +0000 (08:59 +1100)
src/pkg/html/parse.go		patch \| blob \| history
src/pkg/html/parse_test.go		patch \| blob \| history