From 631a575fd92b711854930f3b03b40a2bf66bbd29 Mon Sep 17 00:00:00 2001 From: Andrew Balholm Date: Sun, 13 Nov 2011 12:39:41 +1100 Subject: [PATCH] html: store the current insertion mode in the parser Currently, the state transition functions in the HTML parser return the next insertion mode and whether the token is consumed. This works well except for when one insertion mode needs to use the rules for another insertion mode. Then the useTheRulesFor function needs to patch things up. This requires comparing functions for equality, which is going to stop working. Adding a field to the parser structure to store the current insertion mode eliminates the need for useTheRulesFor; one insertion mode function can now just call the other directly. The insertion mode will be changed only if it needs to be. This CL is an alternative to CL 5372078. R=nigeltao, rsc CC=golang-dev https://golang.org/cl/5372079 --- src/pkg/html/parse.go | 323 +++++++++++++++++++++++------------------- 1 file changed, 174 insertions(+), 149 deletions(-) diff --git a/src/pkg/html/parse.go b/src/pkg/html/parse.go index 9dd5a4091c..d6505c6913 100644 --- a/src/pkg/html/parse.go +++ b/src/pkg/html/parse.go @@ -29,6 +29,8 @@ type parser struct { head, form *Node // Other parsing state flags (section 11.2.3.5). scripting, framesetOK bool + // im is the current insertion mode. + im insertionMode // originalIM is the insertion mode to go back to after completing a text // or inTableText insertion mode. originalIM insertionMode @@ -265,37 +267,22 @@ func (p *parser) acknowledgeSelfClosingTag() { // An insertion mode (section 11.2.3.1) is the state transition function from // a particular state in the HTML5 parser's state machine. It updates the -// parser's fields depending on parser.token (where ErrorToken means EOF). In -// addition to returning the next insertionMode state, it also returns whether -// the token was consumed. -type insertionMode func(*parser) (insertionMode, bool) - -// useTheRulesFor runs the delegate insertionMode over p, returning the actual -// insertionMode unless the delegate caused a state transition. -// Section 11.2.3.1, "using the rules for". -func useTheRulesFor(p *parser, actual, delegate insertionMode) (insertionMode, bool) { - im, consumed := delegate(p) - if p.originalIM == delegate { - p.originalIM = actual - } - if im != delegate { - return im, consumed - } - return actual, consumed -} +// parser's fields depending on parser.tok (where ErrorToken means EOF). +// It returns whether the token was consumed. +type insertionMode func(*parser) bool // setOriginalIM sets the insertion mode to return to after completing a text or // inTableText insertion mode. // Section 11.2.3.1, "using the rules for". -func (p *parser) setOriginalIM(im insertionMode) { +func (p *parser) setOriginalIM() { if p.originalIM != nil { panic("html: bad parser state: originalIM was set twice") } - p.originalIM = im + p.originalIM = p.im } // Section 11.2.3.1, "reset the insertion mode". -func (p *parser) resetInsertionMode() insertionMode { +func (p *parser) resetInsertionMode() { for i := len(p.oe) - 1; i >= 0; i-- { n := p.oe[i] if i == 0 { @@ -303,60 +290,66 @@ func (p *parser) resetInsertionMode() insertionMode { } switch n.Data { case "select": - return inSelectIM + p.im = inSelectIM case "td", "th": - return inCellIM + p.im = inCellIM case "tr": - return inRowIM + p.im = inRowIM case "tbody", "thead", "tfoot": - return inTableBodyIM + p.im = inTableBodyIM case "caption": - // TODO: return inCaptionIM + // TODO: p.im = inCaptionIM case "colgroup": - return inColumnGroupIM + p.im = inColumnGroupIM case "table": - return inTableIM + p.im = inTableIM case "head": - return inBodyIM + p.im = inBodyIM case "body": - return inBodyIM + p.im = inBodyIM case "frameset": - return inFramesetIM + p.im = inFramesetIM case "html": - return beforeHeadIM + p.im = beforeHeadIM + default: + continue } + return } - return inBodyIM + p.im = inBodyIM } // Section 11.2.5.4.1. -func initialIM(p *parser) (insertionMode, bool) { +func initialIM(p *parser) bool { switch p.tok.Type { case CommentToken: p.doc.Add(&Node{ Type: CommentNode, Data: p.tok.Data, }) - return initialIM, true + return true case DoctypeToken: p.doc.Add(&Node{ Type: DoctypeNode, Data: p.tok.Data, }) - return beforeHTMLIM, true + p.im = beforeHTMLIM + return true } // TODO: set "quirks mode"? It's defined in the DOM spec instead of HTML5 proper, // and so switching on "quirks mode" might belong in a different package. - return beforeHTMLIM, false + p.im = beforeHTMLIM + return false } // Section 11.2.5.4.2. -func beforeHTMLIM(p *parser) (insertionMode, bool) { +func beforeHTMLIM(p *parser) bool { switch p.tok.Type { case StartTagToken: if p.tok.Data == "html" { p.addElement(p.tok.Data, p.tok.Attr) - return beforeHeadIM, true + p.im = beforeHeadIM + return true } case EndTagToken: switch p.tok.Data { @@ -364,22 +357,23 @@ func beforeHTMLIM(p *parser) (insertionMode, bool) { // Drop down to creating an implied tag. default: // Ignore the token. - return beforeHTMLIM, true + return true } case CommentToken: p.doc.Add(&Node{ Type: CommentNode, Data: p.tok.Data, }) - return beforeHTMLIM, true + return true } // Create an implied tag. p.addElement("html", nil) - return beforeHeadIM, false + p.im = beforeHeadIM + return false } // Section 11.2.5.4.3. -func beforeHeadIM(p *parser) (insertionMode, bool) { +func beforeHeadIM(p *parser) bool { var ( add bool attr []Attribute @@ -397,7 +391,7 @@ func beforeHeadIM(p *parser) (insertionMode, bool) { add = true attr = p.tok.Attr case "html": - return useTheRulesFor(p, beforeHeadIM, inBodyIM) + return inBodyIM(p) default: implied = true } @@ -413,19 +407,20 @@ func beforeHeadIM(p *parser) (insertionMode, bool) { Type: CommentNode, Data: p.tok.Data, }) - return beforeHeadIM, true + return true } if add || implied { p.addElement("head", attr) p.head = p.top() } - return inHeadIM, !implied + p.im = inHeadIM + return !implied } const whitespace = " \t\r\n\f" // Section 11.2.5.4.4. -func inHeadIM(p *parser) (insertionMode, bool) { +func inHeadIM(p *parser) bool { var ( pop bool implied bool @@ -439,7 +434,7 @@ func inHeadIM(p *parser) (insertionMode, bool) { // Add the initial whitespace to the current node. p.addText(p.tok.Data[:len(p.tok.Data)-len(s)]) if s == "" { - return inHeadIM, true + return true } p.tok.Data = s } @@ -452,8 +447,9 @@ func inHeadIM(p *parser) (insertionMode, bool) { p.acknowledgeSelfClosingTag() case "script", "title", "noscript", "noframes", "style": p.addElement(p.tok.Data, p.tok.Attr) - p.setOriginalIM(inHeadIM) - return textIM, true + p.setOriginalIM() + p.im = textIM + return true default: implied = true } @@ -465,27 +461,28 @@ func inHeadIM(p *parser) (insertionMode, bool) { implied = true default: // Ignore the token. - return inHeadIM, true + return true } case CommentToken: p.addChild(&Node{ Type: CommentNode, Data: p.tok.Data, }) - return inHeadIM, true + return true } if pop || implied { n := p.oe.pop() if n.Data != "head" { panic("html: bad parser state: element not found, in the in-head insertion mode") } - return afterHeadIM, !implied + p.im = afterHeadIM + return !implied } - return inHeadIM, true + return true } // Section 11.2.5.4.6. -func afterHeadIM(p *parser) (insertionMode, bool) { +func afterHeadIM(p *parser) bool { var ( add bool attr []Attribute @@ -506,11 +503,12 @@ func afterHeadIM(p *parser) (insertionMode, bool) { framesetOK = false case "frameset": p.addElement(p.tok.Data, p.tok.Attr) - return inFramesetIM, true + p.im = inFramesetIM + return true case "base", "basefont", "bgsound", "link", "meta", "noframes", "script", "style", "title": p.oe = append(p.oe, p.head) defer p.oe.pop() - return useTheRulesFor(p, afterHeadIM, inHeadIM) + return inHeadIM(p) case "head": // TODO. default: @@ -524,20 +522,21 @@ func afterHeadIM(p *parser) (insertionMode, bool) { framesetOK = true default: // Ignore the token. - return afterHeadIM, true + return true } case CommentToken: p.addChild(&Node{ Type: CommentNode, Data: p.tok.Data, }) - return afterHeadIM, true + return true } if add || implied { p.addElement("body", attr) p.framesetOK = framesetOK } - return inBodyIM, !implied + p.im = inBodyIM + return !implied } // copyAttributes copies attributes of src not found on dst to dst. @@ -558,7 +557,7 @@ func copyAttributes(dst *Node, src Token) { } // Section 11.2.5.4.7. -func inBodyIM(p *parser) (insertionMode, bool) { +func inBodyIM(p *parser) bool { switch p.tok.Type { case TextToken: p.reconstructActiveFormattingElements() @@ -605,7 +604,8 @@ func inBodyIM(p *parser) (insertionMode, bool) { p.popUntil(buttonScopeStopTags, "p") // TODO: skip this step in quirks mode. p.addElement(p.tok.Data, p.tok.Attr) p.framesetOK = false - return inTableIM, true + p.im = inTableIM + return true case "hr": p.popUntil(buttonScopeStopTags, "p") p.addElement(p.tok.Data, p.tok.Attr) @@ -617,7 +617,8 @@ func inBodyIM(p *parser) (insertionMode, bool) { p.addElement(p.tok.Data, p.tok.Attr) p.framesetOK = false // TODO: detect