]> Cypherpunks repositories - gostls13.git/commitdiff
html: implement fragment parsing algorithm
authorAndrew Balholm <andybalholm@gmail.com>
Thu, 1 Dec 2011 01:47:57 +0000 (12:47 +1100)
committerNigel Tao <nigeltao@golang.org>
Thu, 1 Dec 2011 01:47:57 +0000 (12:47 +1100)
Pass the tests in tests4.dat.

R=nigeltao
CC=golang-dev
https://golang.org/cl/5447055

src/pkg/html/parse.go
src/pkg/html/parse_test.go

index 45dc19150cf88c87efd8485f63d0928b876effda..97fbc514d827abc75ba8b81a79c325afe6ace497 100644 (file)
@@ -39,6 +39,9 @@ type parser struct {
        fosterParenting bool
        // quirks is whether the parser is operating in "quirks mode."
        quirks bool
+       // context is the context element when parsing an HTML fragment
+       // (section 11.4).
+       context *Node
 }
 
 func (p *parser) top() *Node {
@@ -287,9 +290,10 @@ func (p *parser) setOriginalIM() {
 func (p *parser) resetInsertionMode() {
        for i := len(p.oe) - 1; i >= 0; i-- {
                n := p.oe[i]
-               if i == 0 {
-                       // TODO: set n to the context element, for HTML fragment parsing.
+               if i == 0 && p.context != nil {
+                       n = p.context
                }
+
                switch n.Data {
                case "select":
                        p.im = inSelectIM
@@ -1516,18 +1520,7 @@ func afterAfterFramesetIM(p *parser) bool {
        return true
 }
 
-// Parse returns the parse tree for the HTML from the given Reader.
-// The input is assumed to be UTF-8 encoded.
-func Parse(r io.Reader) (*Node, error) {
-       p := &parser{
-               tokenizer: NewTokenizer(r),
-               doc: &Node{
-                       Type: DocumentNode,
-               },
-               scripting:  true,
-               framesetOK: true,
-               im:         initialIM,
-       }
+func (p *parser) parse() error {
        // Iterate until EOF. Any other error will cause an early return.
        consumed := true
        for {
@@ -1536,7 +1529,7 @@ func Parse(r io.Reader) (*Node, error) {
                                if err == io.EOF {
                                        break
                                }
-                               return nil, err
+                               return err
                        }
                }
                consumed = p.im(p)
@@ -1547,5 +1540,77 @@ func Parse(r io.Reader) (*Node, error) {
                        break
                }
        }
+       return nil
+}
+
+// Parse returns the parse tree for the HTML from the given Reader.
+// The input is assumed to be UTF-8 encoded.
+func Parse(r io.Reader) (*Node, error) {
+       p := &parser{
+               tokenizer: NewTokenizer(r),
+               doc: &Node{
+                       Type: DocumentNode,
+               },
+               scripting:  true,
+               framesetOK: true,
+               im:         initialIM,
+       }
+       err := p.parse()
+       if err != nil {
+               return nil, err
+       }
        return p.doc, nil
 }
+
+// ParseFragment parses a fragment of HTML and returns the nodes that were 
+// found. If the fragment is the InnerHTML for an existing element, pass that
+// element in context.
+func ParseFragment(r io.Reader, context *Node) ([]*Node, error) {
+       p := &parser{
+               tokenizer: NewTokenizer(r),
+               doc: &Node{
+                       Type: DocumentNode,
+               },
+               scripting: true,
+               context:   context,
+       }
+
+       if context != nil {
+               switch context.Data {
+               case "iframe", "noembed", "noframes", "noscript", "plaintext", "script", "style", "title", "textarea", "xmp":
+                       p.tokenizer.rawTag = context.Data
+               }
+       }
+
+       root := &Node{
+               Type: ElementNode,
+               Data: "html",
+       }
+       p.doc.Add(root)
+       p.oe = nodeStack{root}
+       p.resetInsertionMode()
+
+       for n := context; n != nil; n = n.Parent {
+               if n.Type == ElementNode && n.Data == "form" {
+                       p.form = n
+                       break
+               }
+       }
+
+       err := p.parse()
+       if err != nil {
+               return nil, err
+       }
+
+       parent := p.doc
+       if context != nil {
+               parent = root
+       }
+
+       result := parent.Child
+       parent.Child = nil
+       for _, n := range result {
+               n.Parent = nil
+       }
+       return result, nil
+}
index ea72557a0b04ce5be692da79936129b5908f5024..e0c19cff6da5f91deccd66de4cfe7fd694dcb764 100644 (file)
@@ -16,21 +16,21 @@ import (
 )
 
 // readParseTest reads a single test case from r.
-func readParseTest(r *bufio.Reader) (text, want string, err error) {
+func readParseTest(r *bufio.Reader) (text, want, context string, err error) {
        line, err := r.ReadSlice('\n')
        if err != nil {
-               return "", "", err
+               return "", "", "", err
        }
        var b []byte
 
        // Read the HTML.
        if string(line) != "#data\n" {
-               return "", "", fmt.Errorf(`got %q want "#data\n"`, line)
+               return "", "", "", fmt.Errorf(`got %q want "#data\n"`, line)
        }
        for {
                line, err = r.ReadSlice('\n')
                if err != nil {
-                       return "", "", err
+                       return "", "", "", err
                }
                if line[0] == '#' {
                        break
@@ -42,33 +42,45 @@ func readParseTest(r *bufio.Reader) (text, want string, err error) {
 
        // Skip the error list.
        if string(line) != "#errors\n" {
-               return "", "", fmt.Errorf(`got %q want "#errors\n"`, line)
+               return "", "", "", fmt.Errorf(`got %q want "#errors\n"`, line)
        }
        for {
                line, err = r.ReadSlice('\n')
                if err != nil {
-                       return "", "", err
+                       return "", "", "", err
                }
                if line[0] == '#' {
                        break
                }
        }
 
+       if string(line) == "#document-fragment\n" {
+               line, err = r.ReadSlice('\n')
+               if err != nil {
+                       return "", "", "", err
+               }
+               context = strings.TrimSpace(string(line))
+               line, err = r.ReadSlice('\n')
+               if err != nil {
+                       return "", "", "", err
+               }
+       }
+
        // Read the dump of what the parse tree should be.
        if string(line) != "#document\n" {
-               return "", "", fmt.Errorf(`got %q want "#document\n"`, line)
+               return "", "", "", fmt.Errorf(`got %q want "#document\n"`, line)
        }
        for {
                line, err = r.ReadSlice('\n')
                if err != nil && err != io.EOF {
-                       return "", "", err
+                       return "", "", "", err
                }
                if len(line) == 0 || len(line) == 1 && line[0] == '\n' {
                        break
                }
                b = append(b, line...)
        }
-       return text, string(b), nil
+       return text, string(b), context, nil
 }
 
 func dumpIndent(w io.Writer, level int) {
@@ -153,7 +165,7 @@ func TestParser(t *testing.T) {
                {"tests1.dat", -1},
                {"tests2.dat", -1},
                {"tests3.dat", -1},
-               // tests4.dat is fragment cases.
+               {"tests4.dat", -1},
                {"tests5.dat", -1},
        }
        for _, tf := range testFiles {
@@ -164,17 +176,37 @@ func TestParser(t *testing.T) {
                defer f.Close()
                r := bufio.NewReader(f)
                for i := 0; i != tf.n; i++ {
-                       text, want, err := readParseTest(r)
+                       text, want, context, err := readParseTest(r)
                        if err == io.EOF && tf.n == -1 {
                                break
                        }
                        if err != nil {
                                t.Fatal(err)
                        }
-                       doc, err := Parse(strings.NewReader(text))
-                       if err != nil {
-                               t.Fatal(err)
+
+                       var doc *Node
+                       if context == "" {
+                               doc, err = Parse(strings.NewReader(text))
+                               if err != nil {
+                                       t.Fatal(err)
+                               }
+                       } else {
+                               contextNode := &Node{
+                                       Type: ElementNode,
+                                       Data: context,
+                               }
+                               nodes, err := ParseFragment(strings.NewReader(text), contextNode)
+                               if err != nil {
+                                       t.Fatal(err)
+                               }
+                               doc = &Node{
+                                       Type: DocumentNode,
+                               }
+                               for _, n := range nodes {
+                                       doc.Add(n)
+                               }
                        }
+
                        got, err := dump(doc)
                        if err != nil {
                                t.Fatal(err)
@@ -184,7 +216,7 @@ func TestParser(t *testing.T) {
                                t.Errorf("%s test #%d %q, got vs want:\n----\n%s----\n%s----", tf.filename, i, text, got, want)
                                continue
                        }
-                       if renderTestBlacklist[text] {
+                       if renderTestBlacklist[text] || context != "" {
                                continue
                        }
                        // Check that rendering and re-parsing results in an identical tree.