exp/html: make the tokenizer return atoms for tag tokens.

author Nigel Tao <nigeltao@golang.org>

Thu, 7 Jun 2012 03:05:35 +0000 (13:05 +1000)

committer Nigel Tao <nigeltao@golang.org>

Thu, 7 Jun 2012 03:05:35 +0000 (13:05 +1000)
author Nigel Tao <nigeltao@golang.org>
Thu, 7 Jun 2012 03:05:35 +0000 (13:05 +1000)
committer Nigel Tao <nigeltao@golang.org>
Thu, 7 Jun 2012 03:05:35 +0000 (13:05 +1000)
diff --git a/src/pkg/exp/html/node.go b/src/pkg/exp/html/node.go

index c105a4e709aeae2afd07035d795daefb1decf342..65fa558b2487075b994b7965e8823dca9d3a0a42 100644 (file)
--- a/src/pkg/exp/html/node.go
+++ b/src/pkg/exp/html/node.go
@@ -4,8 +4,12 @@
  
  package html
  
+import (
+       "exp/html/atom"
+)
+
  // A NodeType is the type of a Node.
-type NodeType int
+type NodeType uint32
  
  const (
         ErrorNode NodeType = iota
@@ -25,7 +29,8 @@ var scopeMarker = Node{Type: scopeMarkerNode}
  // A Node consists of a NodeType and some Data (tag name for element nodes,
  // content for text) and are part of a tree of Nodes. Element nodes may also
  // have a Namespace and contain a slice of Attributes. Data is unescaped, so
-// that it looks like "a<b" rather than "a&lt;b".
+// that it looks like "a<b" rather than "a&lt;b". For element nodes, DataAtom
+// is the atom for Data, or zero if Data is not a known tag name.
  //
  // An empty Namespace implies a "http://www.w3.org/1999/xhtml" namespace.
  // Similarly, "math" is short for "http://www.w3.org/1998/Math/MathML", and
@@ -34,6 +39,7 @@ type Node struct {
         Parent    *Node
         Child     []*Node
         Type      NodeType
+       DataAtom  atom.Atom
         Data      string
         Namespace string
         Attr      []Attribute
@@ -83,9 +89,10 @@ func reparentChildren(dst, src *Node) {
  // The clone has no parent and no children.
  func (n *Node) clone() *Node {
         m := &Node{
-               Type: n.Type,
-               Data: n.Data,
-               Attr: make([]Attribute, len(n.Attr)),
+               Type:     n.Type,
+               DataAtom: n.DataAtom,
+               Data:     n.Data,
+               Attr:     make([]Attribute, len(n.Attr)),
         }
         copy(m.Attr, n.Attr)
         return m
diff --git a/src/pkg/exp/html/parse.go b/src/pkg/exp/html/parse.go

index e1bfcd9aa5c02d6a3959f219c8b7634ec2f4e295..eb063268fc6034add91f1567ae6ef6ee44ec3061 100644 (file)
--- a/src/pkg/exp/html/parse.go
+++ b/src/pkg/exp/html/parse.go
@@ -5,6 +5,7 @@
  package html
  
  import (
+       a "exp/html/atom"
         "io"
         "strings"
  )
@@ -280,7 +281,7 @@ func (p *parser) addText(text string) {
  func (p *parser) addElement(tag string, attr []Attribute) {
         p.addChild(&Node{
                 Type: ElementNode,
-               Data: tag,
+               Data: tag, // TODO: also set DataAtom.
                 Attr: attr,
         })
  }
@@ -310,9 +311,9 @@ findIdenticalElements:
                         continue
                 }
         compareAttributes:
-               for _, a := range n.Attr {
-                       for _, b := range attr {
-                               if a.Key == b.Key && a.Namespace == b.Namespace && a.Val == b.Val {
+               for _, t0 := range n.Attr {
+                       for _, t1 := range attr {
+                               if t0.Key == t1.Key && t0.Namespace == t1.Namespace && t0.Val == t1.Val {
                                         // Found a match for this attribute, continue with the next attribute.
                                         continue compareAttributes
                                 }
@@ -676,13 +677,13 @@ func copyAttributes(dst *Node, src Token) {
                 return
         }
         attr := map[string]string{}
-       for _, a := range dst.Attr {
-               attr[a.Key] = a.Val
+       for _, t := range dst.Attr {
+               attr[t.Key] = t.Val
         }
-       for _, a := range src.Attr {
-               if _, ok := attr[a.Key]; !ok {
-                       dst.Attr = append(dst.Attr, a)
-                       attr[a.Key] = a.Val
+       for _, t := range src.Attr {
+               if _, ok := attr[t.Key]; !ok {
+                       dst.Attr = append(dst.Attr, t)
+                       attr[t.Key] = t.Val
                 }
         }
  }
@@ -843,9 +844,9 @@ func inBodyIM(p *parser) bool {
                         p.oe.pop()
                         p.acknowledgeSelfClosingTag()
                         if p.tok.Data == "input" {
-                               for _, a := range p.tok.Attr {
-                                       if a.Key == "type" {
-                                               if strings.ToLower(a.Val) == "hidden" {
+                               for _, t := range p.tok.Attr {
+                                       if t.Key == "type" {
+                                               if strings.ToLower(t.Val) == "hidden" {
                                                         // Skip setting framesetOK = false
                                                         return true
                                                 }
@@ -874,16 +875,16 @@ func inBodyIM(p *parser) bool {
                         action := ""
                         prompt := "This is a searchable index. Enter search keywords: "
                         attr := []Attribute{{Key: "name", Val: "isindex"}}
-                       for _, a := range p.tok.Attr {
-                               switch a.Key {
+                       for _, t := range p.tok.Attr {
+                               switch t.Key {
                                 case "action":
-                                       action = a.Val
+                                       action = t.Val
                                 case "name":
                                         // Ignore the attribute.
                                 case "prompt":
-                                       prompt = a.Val
+                                       prompt = t.Val
                                 default:
-                                       attr = append(attr, a)
+                                       attr = append(attr, t)
                                 }
                         }
                         p.acknowledgeSelfClosingTag()
@@ -1231,8 +1232,8 @@ func inTableIM(p *parser) bool {
                 case "style", "script":
                         return inHeadIM(p)
                 case "input":
-                       for _, a := range p.tok.Attr {
-                               if a.Key == "type" && strings.ToLower(a.Val) == "hidden" {
+                       for _, t := range p.tok.Attr {
+                               if t.Key == "type" && strings.ToLower(t.Val) == "hidden" {
                                         p.addElement(p.tok.Data, p.tok.Attr)
                                         p.oe.pop()
                                         return true
@@ -1863,6 +1864,7 @@ func parseForeignContent(p *parser) bool {
                         // Adjust SVG tag names. The tokenizer lower-cases tag names, but
                         // SVG wants e.g. "foreignObject" with a capital second "O".
                         if x := svgTagNameAdjustments[p.tok.Data]; x != "" {
+                               p.tok.DataAtom = a.Lookup([]byte(x))
                                 p.tok.Data = x
                         }
                         adjustAttributeNames(p.tok.Attr, svgAttributeAdjustments)
@@ -1929,7 +1931,7 @@ func (p *parser) parseImpliedToken(t TokenType, data string, attr []Attribute) {
         realToken, selfClosing := p.tok, p.hasSelfClosingToken
         p.tok = Token{
                 Type: t,
-               Data: data,
+               Data: data, // TODO: also set DataAtom.
                 Attr: attr,
         }
         p.hasSelfClosingToken = false
@@ -2014,7 +2016,7 @@ func ParseFragment(r io.Reader, context *Node) ([]*Node, error) {
  
         root := &Node{
                 Type: ElementNode,
-               Data: "html",
+               Data: "html", // TODO: also set DataAtom.
         }
         p.doc.Add(root)
         p.oe = nodeStack{root}
diff --git a/src/pkg/exp/html/parse_test.go b/src/pkg/exp/html/parse_test.go

index ffd1660541447c8eb81da0b390e5e60af584d2c9..234191ef14cc16eed7b7c17ec067663e4d559877 100644 (file)
--- a/src/pkg/exp/html/parse_test.go
+++ b/src/pkg/exp/html/parse_test.go
@@ -8,6 +8,7 @@ import (
         "bufio"
         "bytes"
         "errors"
+       "exp/html/atom"
         "flag"
         "fmt"
         "io"
@@ -320,8 +321,9 @@ func testParseCase(text, want, context string) (result parseTestResult, err erro
                 }
         } else {
                 contextNode := &Node{
-                       Type: ElementNode,
-                       Data: context,
+                       Type:     ElementNode,
+                       DataAtom: atom.Lookup([]byte(context)),
+                       Data:     context,
                 }
                 nodes, err := ParseFragment(strings.NewReader(text), contextNode)
                 if err != nil {
diff --git a/src/pkg/exp/html/token.go b/src/pkg/exp/html/token.go

index 632ba8d2f2ef8ece1d67fdc7267e6e4f6e0b424c..4ca0a90b344210778b18271bcb24513b50ccacb8 100644 (file)
--- a/src/pkg/exp/html/token.go
+++ b/src/pkg/exp/html/token.go
@@ -13,7 +13,7 @@ import (
  )
  
  // A TokenType is the type of a Token.
-type TokenType int
+type TokenType uint32
  
  const (
         // ErrorToken means that an error occurred during tokenization.
@@ -66,11 +66,13 @@ type Attribute struct {
  // A Token consists of a TokenType and some Data (tag name for start and end
  // tags, content for text, comments and doctypes). A tag Token may also contain
  // a slice of Attributes. Data is unescaped for all Tokens (it looks like "a<b"
-// rather than "a&lt;b").
+// rather than "a&lt;b"). For tag Tokens, DataAtom is the atom for Data, or
+// zero if Data is not a known tag name.
  type Token struct {
-       Type TokenType
-       Data string
-       Attr []Attribute
+       Type     TokenType
+       DataAtom atom.Atom
+       Data     string
+       Attr     []Attribute
  }
  
  // tagString returns a string representation of a tag Token's Data and Attr.
@@ -794,11 +796,19 @@ func (z *Tokenizer) Token() Token {
                         key, val, moreAttr = z.TagAttr()
                         attr = append(attr, Attribute{"", atom.String(key), string(val)})
                 }
-               t.Data = atom.String(name)
+               if a := atom.Lookup(name); a != 0 {
+                       t.DataAtom, t.Data = a, a.String()
+               } else {
+                       t.DataAtom, t.Data = 0, string(name)
+               }
                 t.Attr = attr
         case EndTagToken:
                 name, _ := z.TagName()
-               t.Data = atom.String(name)
+               if a := atom.Lookup(name); a != 0 {
+                       t.DataAtom, t.Data = a, a.String()
+               } else {
+                       t.DataAtom, t.Data = 0, string(name)
+               }
         }
         return t
  }
author	Nigel Tao <nigeltao@golang.org>
	Thu, 7 Jun 2012 03:05:35 +0000 (13:05 +1000)
committer	Nigel Tao <nigeltao@golang.org>
	Thu, 7 Jun 2012 03:05:35 +0000 (13:05 +1000)
src/pkg/exp/html/node.go		patch \| blob \| history
src/pkg/exp/html/parse.go		patch \| blob \| history
src/pkg/exp/html/parse_test.go		patch \| blob \| history
src/pkg/exp/html/token.go		patch \| blob \| history