exp/html: make the parser manipulate the tokenizer via exported methods

author Nigel Tao <nigeltao@golang.org>

Mon, 20 Aug 2012 01:04:36 +0000 (11:04 +1000)

committer Nigel Tao <nigeltao@golang.org>

Mon, 20 Aug 2012 01:04:36 +0000 (11:04 +1000)
author Nigel Tao <nigeltao@golang.org>
Mon, 20 Aug 2012 01:04:36 +0000 (11:04 +1000)
committer Nigel Tao <nigeltao@golang.org>
Mon, 20 Aug 2012 01:04:36 +0000 (11:04 +1000)
diff --git a/src/pkg/exp/html/parse.go b/src/pkg/exp/html/parse.go

index d4b12277689707935fbac6d0a1c7e617ad997322..2ef0241debd1e6ac4fbe2fae2bd1985459a16d1a 100644 (file)
--- a/src/pkg/exp/html/parse.go
+++ b/src/pkg/exp/html/parse.go
@@ -402,7 +402,7 @@ func (p *parser) reconstructActiveFormattingElements() {
  func (p *parser) read() error {
         // CDATA sections are allowed only in foreign content.
         n := p.oe.top()
-       p.tokenizer.cdataOK = n != nil && n.Namespace != ""
+       p.tokenizer.AllowCDATA(n != nil && n.Namespace != "")
  
         p.tokenizer.Next()
         p.tok = p.tokenizer.Token()
@@ -1613,9 +1613,9 @@ func inSelectIM(p *parser) bool {
                                 p.parseImpliedToken(EndTagToken, a.Select, a.Select.String())
                                 return false
                         }
-                       // Ignore the token.
                         // In order to properly ignore <textarea>, we need to change the tokenizer mode.
-                       p.tokenizer.rawTag = ""
+                       p.tokenizer.NextIsNotRawText()
+                       // Ignore the token.
                         return true
                 case a.Script:
                         return inHeadIM(p)
@@ -1921,7 +1921,7 @@ func parseForeignContent(p *parser) bool {
                 if namespace != "" {
                         // Don't let the tokenizer go into raw text mode in foreign content
                         // (e.g. in an SVG <title> tag).
-                       p.tokenizer.rawTag = ""
+                       p.tokenizer.NextIsNotRawText()
                 }
                 if p.hasSelfClosingToken {
                         p.oe.pop()
@@ -2046,16 +2046,7 @@ func Parse(r io.Reader) (*Node, error) {
  // found. If the fragment is the InnerHTML for an existing element, pass that
  // element in context.
  func ParseFragment(r io.Reader, context *Node) ([]*Node, error) {
-       p := &parser{
-               tokenizer: NewTokenizer(r),
-               doc: &Node{
-                       Type: DocumentNode,
-               },
-               scripting: true,
-               fragment:  true,
-               context:   context,
-       }
-
+       contextTag := ""
         if context != nil {
                 if context.Type != ElementNode {
                         return nil, errors.New("html: ParseFragment of non-element Node")
@@ -2066,10 +2057,16 @@ func ParseFragment(r io.Reader, context *Node) ([]*Node, error) {
                 if context.DataAtom != a.Lookup([]byte(context.Data)) {
                         return nil, fmt.Errorf("html: inconsistent Node: DataAtom=%q, Data=%q", context.DataAtom, context.Data)
                 }
-               switch context.DataAtom {
-               case a.Iframe, a.Noembed, a.Noframes, a.Noscript, a.Plaintext, a.Script, a.Style, a.Title, a.Textarea, a.Xmp:
-                       p.tokenizer.rawTag = context.DataAtom.String()
-               }
+               contextTag = context.DataAtom.String()
+       }
+       p := &parser{
+               tokenizer: NewTokenizerFragment(r, contextTag),
+               doc: &Node{
+                       Type: DocumentNode,
+               },
+               scripting: true,
+               fragment:  true,
+               context:   context,
         }
  
         root := &Node{
diff --git a/src/pkg/exp/html/token.go b/src/pkg/exp/html/token.go

index 7bc77329d8d091992e3e407aa885714252469884..517bd5d3ee8dd39ebbba38aea80f7a06fe5886c9 100644 (file)
--- a/src/pkg/exp/html/token.go
+++ b/src/pkg/exp/html/token.go
@@ -155,8 +155,54 @@ type Tokenizer struct {
         // convertNUL is whether NUL bytes in the current token's data should
         // be converted into \ufffd replacement characters.
         convertNUL bool
-       // cdataOK is whether CDATA sections are allowed in the current context.
-       cdataOK bool
+       // allowCDATA is whether CDATA sections are allowed in the current context.
+       allowCDATA bool
+}
+
+// AllowCDATA sets whether or not the tokenizer recognizes <![CDATA[foo]]> as
+// the text "foo". The default value is false, which means to recognize it as
+// a bogus comment "<!-- [CDATA[foo]] -->" instead.
+//
+// Strictly speaking, an HTML5 compliant tokenizer should allow CDATA if and
+// only if tokenizing foreign content, such as MathML and SVG. However,
+// tracking foreign-contentness is difficult to do purely in the tokenizer,
+// as opposed to the parser, due to HTML integration points: an <svg> element
+// can contain a <foreignObject> that is foreign-to-SVG but not foreign-to-
+// HTML. For strict compliance with the HTML5 tokenization algorithm, it is the
+// responsibility of the user of a tokenizer to call AllowCDATA as appropriate.
+// In practice, if using the tokenizer without caring whether MathML or SVG
+// CDATA is text or comments, such as tokenizing HTML to find all the anchor
+// text, it is acceptable to ignore this responsibility.
+func (z *Tokenizer) AllowCDATA(allowCDATA bool) {
+       z.allowCDATA = allowCDATA
+}
+
+// NextIsNotRawText instructs the tokenizer that the next token should not be
+// considered as 'raw text'. Some elements, such as script and title elements,
+// normally require the next token after the opening tag to be 'raw text' that
+// has no child elements. For example, tokenizing "<title>a<b>c</b>d</title>"
+// yields a start tag token for "<title>", a text token for "a<b>c</b>d", and
+// an end tag token for "</title>". There are no distinct start tag or end tag
+// tokens for the "<b>" and "</b>".
+//
+// This tokenizer implementation will generally look for raw text at the right
+// times. Strictly speaking, an HTML5 compliant tokenizer should not look for
+// raw text if in foreign content: <title> generally needs raw text, but a
+// <title> inside an <svg> does not. Another example is that a <textarea>
+// generally needs raw text, but a <textarea> is not allowed as an immediate
+// child of a <select>; in normal parsing, a <textarea> implies </select>, but
+// one cannot close the implicit element when parsing a <select>'s InnerHTML.
+// Similarly to AllowCDATA, tracking the correct moment to override raw-text-
+// ness is difficult to do purely in the tokenizer, as opposed to the parser.
+// For strict compliance with the HTML5 tokenization algorithm, it is the
+// responsibility of the user of a tokenizer to call NextIsNotRawText as
+// appropriate. In practice, like AllowCDATA, it is acceptable to ignore this
+// responsibility for basic usage.
+//
+// Note that this 'raw text' concept is different from the one offered by the
+// Tokenizer.Raw method.
+func (z *Tokenizer) NextIsNotRawText() {
+       z.rawTag = ""
  }
  
  // Err returns the error associated with the most recent ErrorToken token.
@@ -592,7 +638,7 @@ func (z *Tokenizer) readMarkupDeclaration() TokenType {
         if z.readDoctype() {
                 return DoctypeToken
         }
-       if z.cdataOK && z.readCDATA() {
+       if z.allowCDATA && z.readCDATA() {
                 z.convertNUL = true
                 return TextToken
         }
@@ -1101,8 +1147,27 @@ func (z *Tokenizer) Token() Token {
  // NewTokenizer returns a new HTML Tokenizer for the given Reader.
  // The input is assumed to be UTF-8 encoded.
  func NewTokenizer(r io.Reader) *Tokenizer {
-       return &Tokenizer{
+       return NewTokenizerFragment(r, "")
+}
+
+// NewTokenizerFragment returns a new HTML Tokenizer for the given Reader, for
+// tokenizing an exisitng element's InnerHTML fragment. contextTag is that
+// element's tag, such as "div" or "iframe".
+//
+// For example, how the InnerHTML "a<b" is tokenized depends on whether it is
+// for a <p> tag or a <script> tag.
+//
+// The input is assumed to be UTF-8 encoded.
+func NewTokenizerFragment(r io.Reader, contextTag string) *Tokenizer {
+       z := &Tokenizer{
                 r:   r,
                 buf: make([]byte, 0, 4096),
         }
+       if contextTag != "" {
+               switch s := strings.ToLower(contextTag); s {
+               case "iframe", "noembed", "noframes", "noscript", "plaintext", "script", "style", "title", "textarea", "xmp":
+                       z.rawTag = s
+               }
+       }
+       return z
  }
author	Nigel Tao <nigeltao@golang.org>
	Mon, 20 Aug 2012 01:04:36 +0000 (11:04 +1000)
committer	Nigel Tao <nigeltao@golang.org>
	Mon, 20 Aug 2012 01:04:36 +0000 (11:04 +1000)
src/pkg/exp/html/parse.go		patch \| blob \| history
src/pkg/exp/html/token.go		patch \| blob \| history