Cypherpunks repositories - gostls13.git/commitdiff
html: parse DOCTYPE into name and public and system identifiers
author: Andrew Balholm <andybalholm@gmail.com>
Wed, 23 Nov 2011 22:28:58 +0000 (09:28 +1100)
committer: Nigel Tao <nigeltao@golang.org>
Wed, 23 Nov 2011 22:28:58 +0000 (09:28 +1100)
Pass tests2.dat, test 59:
<!DOCTYPE <!DOCTYPE HTML>><!--<!--x-->-->

| <!DOCTYPE <!doctype>
| <html>
|   <head>
|   <body>
|     ">"
|     <!-- <!--x -->
|     "-->"

Pass all the tests in doctype01.dat.

Also pass tests2.dat, test 60:
<!doctype html><div><form></form><div></div></div>

R=nigeltao
CC=golang-dev
https://golang.org/cl/5437045

src/pkg/html/parse.go
src/pkg/html/parse_test.go
src/pkg/html/render.go

index 36204895b9c55a6158541a905a0d52d7a7713625..041c5390edaa7b5ccb12beddc81feaf0b443f4d9 100644 (file)
@@ -321,6 +321,59 @@ func (p *parser) resetInsertionMode() {
 
 const whitespace = " \t\r\n\f"
 
+// parseDoctype parses the data from a DoctypeToken into a name,
+// public identifier, and system identifier. It returns a Node whose Type 
+// is DoctypeNode, whose Data is the name, and which has attributes
+// named "system" and "public" for the two identifiers if they were present.
+func parseDoctype(s string) *Node {
+       n := &Node{Type: DoctypeNode}
+
+       // Find the name.
+       space := strings.IndexAny(s, whitespace)
+       if space == -1 {
+               space = len(s)
+       }
+       n.Data = strings.ToLower(s[:space])
+       s = strings.TrimLeft(s[space:], whitespace)
+
+       if len(s) < 6 {
+               // It can't start with "PUBLIC" or "SYSTEM".
+               // Ignore the rest of the string.
+               return n
+       }
+
+       key := strings.ToLower(s[:6])
+       s = s[6:]
+       for key == "public" || key == "system" {
+               s = strings.TrimLeft(s, whitespace)
+               if s == "" {
+                       break
+               }
+               quote := s[0]
+               if quote != '"' && quote != '\'' {
+                       break
+               }
+               s = s[1:]
+               q := strings.IndexRune(s, rune(quote))
+               var id string
+               if q == -1 {
+                       id = s
+                       s = ""
+               } else {
+                       id = s[:q]
+                       s = s[q+1:]
+               }
+               n.Attr = append(n.Attr, Attribute{Key: key, Val: id})
+               if key == "public" {
+                       key = "system"
+               } else {
+                       key = ""
+               }
+       }
+
+       return n
+}
+
 // Section 11.2.5.4.1.
 func initialIM(p *parser) bool {
        switch p.tok.Type {
@@ -337,10 +390,7 @@ func initialIM(p *parser) bool {
                })
                return true
        case DoctypeToken:
-               p.doc.Add(&Node{
-                       Type: DoctypeNode,
-                       Data: p.tok.Data,
-               })
+               p.doc.Add(parseDoctype(p.tok.Data))
                p.im = beforeHTMLIM
                return true
        }
index c1347c9dc1c0b861e2dea59f7334948219f6163c..90d3f46c61b9935080dd3fe297f6d6d3681baf6c 100644 (file)
@@ -97,7 +97,23 @@ func dumpLevel(w io.Writer, n *Node, level int) error {
        case CommentNode:
                fmt.Fprintf(w, "<!-- %s -->", n.Data)
        case DoctypeNode:
-               fmt.Fprintf(w, "<!DOCTYPE %s>", n.Data)
+               fmt.Fprintf(w, "<!DOCTYPE %s", n.Data)
+               if n.Attr != nil {
+                       var p, s string
+                       for _, a := range n.Attr {
+                               switch a.Key {
+                               case "public":
+                                       p = a.Val
+                               case "system":
+                                       s = a.Val
+                               }
+                       }
+                       if p != "" || s != "" {
+                               fmt.Fprintf(w, ` "%s"`, p)
+                               fmt.Fprintf(w, ` "%s"`, s)
+                       }
+               }
+               io.WriteString(w, ">")
        case scopeMarkerNode:
                return errors.New("unexpected scopeMarkerNode")
        default:
@@ -133,8 +149,9 @@ func TestParser(t *testing.T) {
                n int
        }{
                // TODO(nigeltao): Process all the test cases from all the .dat files.
+               {"doctype01.dat", -1},
                {"tests1.dat", -1},
-               {"tests2.dat", 59},
+               {"tests2.dat", -1},
                {"tests3.dat", 0},
        }
        for _, tf := range testFiles {
index 92c349fb32c890addd69796b02b03279641b0eff..57d78beef1c9e82accf61e503451a60681f02734 100644 (file)
@@ -9,6 +9,7 @@ import (
        "errors"
        "fmt"
        "io"
+       "strings"
 )
 
 type writer interface {
@@ -98,6 +99,40 @@ func render1(w writer, n *Node) error {
                if _, err := w.WriteString(n.Data); err != nil {
                        return err
                }
+               if n.Attr != nil {
+                       var p, s string
+                       for _, a := range n.Attr {
+                               switch a.Key {
+                               case "public":
+                                       p = a.Val
+                               case "system":
+                                       s = a.Val
+                               }
+                       }
+                       if p != "" {
+                               if _, err := w.WriteString(" PUBLIC "); err != nil {
+                                       return err
+                               }
+                               if err := writeQuoted(w, p); err != nil {
+                                       return err
+                               }
+                               if s != "" {
+                                       if err := w.WriteByte(' '); err != nil {
+                                               return err
+                                       }
+                                       if err := writeQuoted(w, s); err != nil {
+                                               return err
+                                       }
+                               }
+                       } else if s != "" {
+                               if _, err := w.WriteString(" SYSTEM "); err != nil {
+                                       return err
+                               }
+                               if err := writeQuoted(w, s); err != nil {
+                                       return err
+                               }
+                       }
+               }
                return w.WriteByte('>')
        default:
                return errors.New("html: unknown node type")
@@ -181,6 +216,27 @@ func render1(w writer, n *Node) error {
        return w.WriteByte('>')
 }
 
+// writeQuoted writes s to w surrounded by quotes. Normally it will use double
+// quotes, but if s contains a double quote, it will use single quotes.
+// It is used for writing the identifiers in a doctype declaration.
+// In valid HTML, they can't contain both types of quotes.
+func writeQuoted(w writer, s string) error {
+       var q byte = '"'
+       if strings.Contains(s, `"`) {
+               q = '\''
+       }
+       if err := w.WriteByte(q); err != nil {
+               return err
+       }
+       if _, err := w.WriteString(s); err != nil {
+               return err
+       }
+       if err := w.WriteByte(q); err != nil {
+               return err
+       }
+       return nil
+}
+
 // Section 13.1.2, "Elements", gives this list of void elements. Void elements
 // are those that can't have any contents.
 var voidElements = map[string]bool{