head, form *Node
// Other parsing state flags (section 11.2.3.5).
scripting, framesetOK bool
+ // originalIM is the insertion mode to go back to after completing a text
+ // or inTableText insertion mode.
+ originalIM insertionMode
}
func (p *parser) top() *Node {
// Section 11.2.3.1, "using the rules for".
+// useTheRulesFor calls the delegate insertion mode on p. If delegate returns
+// an insertion mode other than itself, that mode is returned; otherwise
+// actual is returned. The bool result is passed through unchanged from
+// delegate.
func useTheRulesFor(p *parser, actual, delegate insertionMode) (insertionMode, bool) {
im, consumed := delegate(p)
+ // TODO: do we need to update p.originalIM if it equals delegate?
if im != delegate {
return im, consumed
}
return actual, consumed
}
+// setOriginalIM records im as the insertion mode to resume once the current
+// text or inTableText insertion mode has finished. It panics if an original
+// insertion mode is already recorded, since that indicates bad parser state.
+// Section 11.2.3.1, "using the rules for".
+func (p *parser) setOriginalIM(im insertionMode) {
+	if p.originalIM == nil {
+		p.originalIM = im
+		return
+	}
+	panic("html: bad parser state: originalIM was set twice")
+}
+
// Section 11.2.5.4.1.
func initialIM(p *parser) (insertionMode, bool) {
if p.tok.Type == DoctypeToken {
switch p.tok.Data {
case "meta":
// TODO.
- case "script":
- // TODO.
+ case "script", "title":
+ p.addElement(p.tok.Data, p.tok.Attr)
+ p.setOriginalIM(inHeadIM)
+ return textIM, true
default:
implied = true
}
}
}
+// textIM is the "text" insertion mode. A text token is added via p.addText
+// and the parser stays in textIM. For any other token the saved original
+// insertion mode is cleared and returned; an end tag additionally pops p.oe
+// and is reported as consumed, while every other token is reported as not
+// consumed.
+// Section 11.2.5.4.8.
+func textIM(p *parser) (insertionMode, bool) {
+	if p.tok.Type == TextToken {
+		p.addText(p.tok.Data)
+		return textIM, true
+	}
+	isEndTag := p.tok.Type == EndTagToken
+	if isEndTag {
+		p.oe.pop()
+	}
+	// Hand control back to the saved mode and clear it so that it can be set
+	// again later.
+	next := p.originalIM
+	p.originalIM = nil
+	return next, isEndTag
+}
+
// Section 11.2.5.4.9.
func inTableIM(p *parser) (insertionMode, bool) {
var (
case DocumentNode:
return os.NewError("unexpected DocumentNode")
case ElementNode:
- fmt.Fprintf(w, "<%s>", EscapeString(n.Data))
+ fmt.Fprintf(w, "<%s>", n.Data)
case TextNode:
- fmt.Fprintf(w, "%q", EscapeString(n.Data))
+ fmt.Fprintf(w, "%q", n.Data)
case CommentNode:
return os.NewError("COMMENT")
case DoctypeNode:
- fmt.Fprintf(w, "<!DOCTYPE %s>", EscapeString(n.Data))
+ fmt.Fprintf(w, "<!DOCTYPE %s>", n.Data)
case scopeMarkerNode:
return os.NewError("unexpected scopeMarkerNode")
default:
rc := make(chan io.Reader)
go readDat(filename, rc)
// TODO(nigeltao): Process all test cases, not just a subset.
- for i := 0; i < 26; i++ {
+ for i := 0; i < 27; i++ {
// Parse the #data section.
b, err := ioutil.ReadAll(<-rc)
if err != nil {
return os.NewError("html: unknown node type")
}
- // TODO: figure out what to do with <script>, <style>, <noembed>,
- // <noframes> and <noscript> elements. A tentative plan:
- // 1. render the <xxx> opening tag as normal.
- // 2. maybe error out if any child is not a text node.
- // 3. render the text nodes (without escaping??).
- // 4. maybe error out if `</xxx` is a case-insensitive substring of the
- // concatenation of the children's data.
- // 5. maybe error out if the concatenation of the children's data contains an
- // unbalanced escaping text span start ("<!--") not followed by an end ("-->").
- // 6. render the closing tag as normal.
-
// Render the <xxx> opening tag.
if err := w.WriteByte('<'); err != nil {
return err
}
// Render any child nodes.
- for _, c := range n.Child {
- if err := render(w, c); err != nil {
- return err
+ switch n.Data {
+ case "noembed", "noframes", "noscript", "script", "style":
+ for _, c := range n.Child {
+ if c.Type != TextNode {
+ return fmt.Errorf("html: raw text element <%s> has non-text child node", n.Data)
+ }
+ if _, err := w.WriteString(c.Data); err != nil {
+ return err
+ }
+ }
+ case "textarea", "title":
+ for _, c := range n.Child {
+ if c.Type != TextNode {
+ return fmt.Errorf("html: RCDATA element <%s> has non-text child node", n.Data)
+ }
+ if err := render(w, c); err != nil {
+ return err
+ }
+ }
+ default:
+ for _, c := range n.Child {
+ if err := render(w, c); err != nil {
+ return err
+ }
}
}
"io"
"os"
"strconv"
+ "strings"
)
// A TokenType is the type of a Token.
pendingAttr [2]span
attr [][2]span
nAttrReturned int
+ // rawTag is the "script" in "</script>" that closes the next token. If
+ // non-empty, the subsequent call to Next will return a raw or RCDATA text
+ // token: one that treats "<p>" as text instead of an element.
+ // rawTag's contents are lower-cased.
+ rawTag string
+ // textIsRaw is whether the current text token's data is not escaped.
+ textIsRaw bool
}
// Error returns the error associated with the most recent ErrorToken token.
}
}
+// readRawOrRCDATA reads until the next "</foo>", where "foo" is z.rawTag and
+// is typically something like "script" or "textarea". The end tag name is
+// matched case-insensitively against the lower-cased z.rawTag and only counts
+// if it is followed by whitespace, '/' or '>'. On return the text token's
+// data span ends just before that end tag (or wherever reading stopped with
+// z.err set), z.textIsRaw records whether the text must not be unescaped,
+// and z.rawTag is cleared.
+func (z *Tokenizer) readRawOrRCDATA() {
+loop:
+	for {
+		c := z.readByte()
+		if z.err != nil {
+			break loop
+		}
+		if c != '<' {
+			continue loop
+		}
+		c = z.readByte()
+		if z.err != nil {
+			break loop
+		}
+		if c != '/' {
+			// Step back one so that c is re-examined: it may itself be the
+			// '<' that starts the real end tag, as in "<</foo>".
+			z.raw.end--
+			continue loop
+		}
+		for i := 0; i < len(z.rawTag); i++ {
+			c = z.readByte()
+			if z.err != nil {
+				break loop
+			}
+			// z.rawTag is lower-case; also accept the upper-case letter.
+			if c != z.rawTag[i] && c != z.rawTag[i]-('a'-'A') {
+				// Step back one, to catch "</fo</foo>".
+				z.raw.end--
+				continue loop
+			}
+		}
+		c = z.readByte()
+		if z.err != nil {
+			break loop
+		}
+		switch c {
+		case ' ', '\n', '\r', '\t', '\f', '/', '>':
+			// The 3 is 2 for the leading "</" plus 1 for the trailing character c.
+			z.raw.end -= 3 + len(z.rawTag)
+			break loop
+		case '<':
+			// Step back one, to catch "</foo</foo>".
+			z.raw.end--
+		}
+	}
+	z.data.end = z.raw.end
+	// A textarea's or title's RCDATA can contain escaped entities.
+	z.textIsRaw = z.rawTag != "textarea" && z.rawTag != "title"
+	z.rawTag = ""
+}
+
// readComment reads the next comment token starting with "<!--". The opening
// "<!--" has already been consumed.
func (z *Tokenizer) readComment() {
break
}
}
+ // Any "<noembed>", "<noframes>", "<noscript>", "<script>", "<style>",
+ // "<textarea>" or "<title>" tag flags the tokenizer's next token as raw.
+ // The tag name lengths of these special cases range over [5, 8].
+ if x := z.data.end - z.data.start; 5 <= x && x <= 8 {
+ switch z.buf[z.data.start] {
+ case 'n', 's', 't', 'N', 'S', 'T':
+ switch s := strings.ToLower(string(z.buf[z.data.start:z.data.end])); s {
+ case "noembed", "noframes", "noscript", "script", "style", "textarea", "title":
+ z.rawTag = s
+ }
+ }
+ }
+ // Look for a self-closing token like "<br/>".
if z.err == nil && z.buf[z.raw.end-2] == '/' {
return SelfClosingTagToken
}
z.raw.start = z.raw.end
z.data.start = z.raw.end
z.data.end = z.raw.end
+ if z.rawTag != "" {
+ z.readRawOrRCDATA()
+ return TextToken
+ }
+ z.textIsRaw = false
loop:
for {
s := z.buf[z.data.start:z.data.end]
z.data.start = z.raw.end
z.data.end = z.raw.end
- return unescape(s)
+ if !z.textIsRaw {
+ s = unescape(s)
+ }
+ return s
}
return nil
}
`<p id="0"</p>`,
`<p id="0" <="" p="">`,
},
+ // Raw text and RCDATA.
+ {
+ "basic raw text",
+ "<script><a></b></script>",
+ "<script>$<a></b>$</script>",
+ },
+ {
+ "unfinished script end tag",
+ "<SCRIPT>a</SCR",
+ "<script>$a</SCR",
+ },
+ {
+ "broken script end tag",
+ "<SCRIPT>a</SCR ipt>",
+ "<script>$a</SCR ipt>",
+ },
+ {
+ "EOF in script end tag",
+ "<SCRIPT>a</SCRipt",
+ "<script>$a</SCRipt",
+ },
+ {
+ "scriptx end tag",
+ "<SCRIPT>a</SCRiptx",
+ "<script>$a</SCRiptx",
+ },
+ {
+ "' ' completes script end tag",
+ "<SCRIPT>a</SCRipt ",
+ "<script>$a$</script>",
+ },
+ {
+ "'>' completes script end tag",
+ "<SCRIPT>a</SCRipt>",
+ "<script>$a$</script>",
+ },
+ {
+ "self-closing script end tag",
+ "<SCRIPT>a</SCRipt/>",
+ "<script>$a$</script>",
+ },
+ {
+ "nested script tag",
+ "<SCRIPT>a</SCRipt<script>",
+ "<script>$a</SCRipt<script>",
+ },
+ {
+ "script end tag after unfinished",
+ "<SCRIPT>a</SCRipt</script>",
+ "<script>$a</SCRipt$</script>",
+ },
+ {
+ "script/style mismatched tags",
+ "<script>a</style>",
+ "<script>$a</style>",
+ },
+ {
+ "style element with entity",
+ "<style>'",
+ "<style>$&apos;",
+ },
+ {
+ "textarea with tag",
+ "<textarea><div></textarea>",
+ "<textarea>$<div>$</textarea>",
+ },
+ {
+ "title with tag and entity",
+ "<title><b>K&R C</b></title>",
+ "<title>$<b>K&R C</b>$</title>",
+ },
// DOCTYPE tests.
{
"Proper DOCTYPE",