From ce27b00f48bf3b90445bb4bcd28f6115c129d75b Mon Sep 17 00:00:00 2001 From: Andrew Balholm Date: Thu, 1 Dec 2011 12:47:57 +1100 Subject: [PATCH] html: implement fragment parsing algorithm Pass the tests in tests4.dat. R=nigeltao CC=golang-dev https://golang.org/cl/5447055 --- src/pkg/html/parse.go | 95 ++++++++++++++++++++++++++++++++------ src/pkg/html/parse_test.go | 62 +++++++++++++++++++------ 2 files changed, 127 insertions(+), 30 deletions(-) diff --git a/src/pkg/html/parse.go b/src/pkg/html/parse.go index 45dc19150c..97fbc514d8 100644 --- a/src/pkg/html/parse.go +++ b/src/pkg/html/parse.go @@ -39,6 +39,9 @@ type parser struct { fosterParenting bool // quirks is whether the parser is operating in "quirks mode." quirks bool + // context is the context element when parsing an HTML fragment + // (section 11.4). + context *Node } func (p *parser) top() *Node { @@ -287,9 +290,10 @@ func (p *parser) setOriginalIM() { func (p *parser) resetInsertionMode() { for i := len(p.oe) - 1; i >= 0; i-- { n := p.oe[i] - if i == 0 { - // TODO: set n to the context element, for HTML fragment parsing. + if i == 0 && p.context != nil { + n = p.context } + switch n.Data { case "select": p.im = inSelectIM @@ -1516,18 +1520,7 @@ func afterAfterFramesetIM(p *parser) bool { return true } -// Parse returns the parse tree for the HTML from the given Reader. -// The input is assumed to be UTF-8 encoded. -func Parse(r io.Reader) (*Node, error) { - p := &parser{ - tokenizer: NewTokenizer(r), - doc: &Node{ - Type: DocumentNode, - }, - scripting: true, - framesetOK: true, - im: initialIM, - } +func (p *parser) parse() error { // Iterate until EOF. Any other error will cause an early return. consumed := true for { @@ -1536,7 +1529,7 @@ func Parse(r io.Reader) (*Node, error) { if err == io.EOF { break } - return nil, err + return err } } consumed = p.im(p) @@ -1547,5 +1540,77 @@ func Parse(r io.Reader) (*Node, error) { break } } + return nil +} + +// Parse returns the parse tree for the HTML from the given Reader. +// The input is assumed to be UTF-8 encoded. +func Parse(r io.Reader) (*Node, error) { + p := &parser{ + tokenizer: NewTokenizer(r), + doc: &Node{ + Type: DocumentNode, + }, + scripting: true, + framesetOK: true, + im: initialIM, + } + err := p.parse() + if err != nil { + return nil, err + } return p.doc, nil } + +// ParseFragment parses a fragment of HTML and returns the nodes that were +// found. If the fragment is the InnerHTML for an existing element, pass that +// element in context. +func ParseFragment(r io.Reader, context *Node) ([]*Node, error) { + p := &parser{ + tokenizer: NewTokenizer(r), + doc: &Node{ + Type: DocumentNode, + }, + scripting: true, + context: context, + } + + if context != nil { + switch context.Data { + case "iframe", "noembed", "noframes", "noscript", "plaintext", "script", "style", "title", "textarea", "xmp": + p.tokenizer.rawTag = context.Data + } + } + + root := &Node{ + Type: ElementNode, + Data: "html", + } + p.doc.Add(root) + p.oe = nodeStack{root} + p.resetInsertionMode() + + for n := context; n != nil; n = n.Parent { + if n.Type == ElementNode && n.Data == "form" { + p.form = n + break + } + } + + err := p.parse() + if err != nil { + return nil, err + } + + parent := p.doc + if context != nil { + parent = root + } + + result := parent.Child + parent.Child = nil + for _, n := range result { + n.Parent = nil + } + return result, nil +} diff --git a/src/pkg/html/parse_test.go b/src/pkg/html/parse_test.go index ea72557a0b..e0c19cff6d 100644 --- a/src/pkg/html/parse_test.go +++ b/src/pkg/html/parse_test.go @@ -16,21 +16,21 @@ import ( ) // readParseTest reads a single test case from r. -func readParseTest(r *bufio.Reader) (text, want string, err error) { +func readParseTest(r *bufio.Reader) (text, want, context string, err error) { line, err := r.ReadSlice('\n') if err != nil { - return "", "", err + return "", "", "", err } var b []byte // Read the HTML. if string(line) != "#data\n" { - return "", "", fmt.Errorf(`got %q want "#data\n"`, line) + return "", "", "", fmt.Errorf(`got %q want "#data\n"`, line) } for { line, err = r.ReadSlice('\n') if err != nil { - return "", "", err + return "", "", "", err } if line[0] == '#' { break @@ -42,33 +42,45 @@ func readParseTest(r *bufio.Reader) (text, want string, err error) { // Skip the error list. if string(line) != "#errors\n" { - return "", "", fmt.Errorf(`got %q want "#errors\n"`, line) + return "", "", "", fmt.Errorf(`got %q want "#errors\n"`, line) } for { line, err = r.ReadSlice('\n') if err != nil { - return "", "", err + return "", "", "", err } if line[0] == '#' { break } } + if string(line) == "#document-fragment\n" { + line, err = r.ReadSlice('\n') + if err != nil { + return "", "", "", err + } + context = strings.TrimSpace(string(line)) + line, err = r.ReadSlice('\n') + if err != nil { + return "", "", "", err + } + } + // Read the dump of what the parse tree should be. if string(line) != "#document\n" { - return "", "", fmt.Errorf(`got %q want "#document\n"`, line) + return "", "", "", fmt.Errorf(`got %q want "#document\n"`, line) } for { line, err = r.ReadSlice('\n') if err != nil && err != io.EOF { - return "", "", err + return "", "", "", err } if len(line) == 0 || len(line) == 1 && line[0] == '\n' { break } b = append(b, line...) } - return text, string(b), nil + return text, string(b), context, nil } func dumpIndent(w io.Writer, level int) { @@ -153,7 +165,7 @@ func TestParser(t *testing.T) { {"tests1.dat", -1}, {"tests2.dat", -1}, {"tests3.dat", -1}, - // tests4.dat is fragment cases. + {"tests4.dat", -1}, {"tests5.dat", -1}, } for _, tf := range testFiles { @@ -164,17 +176,37 @@ func TestParser(t *testing.T) { defer f.Close() r := bufio.NewReader(f) for i := 0; i != tf.n; i++ { - text, want, err := readParseTest(r) + text, want, context, err := readParseTest(r) if err == io.EOF && tf.n == -1 { break } if err != nil { t.Fatal(err) } - doc, err := Parse(strings.NewReader(text)) - if err != nil { - t.Fatal(err) + + var doc *Node + if context == "" { + doc, err = Parse(strings.NewReader(text)) + if err != nil { + t.Fatal(err) + } + } else { + contextNode := &Node{ + Type: ElementNode, + Data: context, + } + nodes, err := ParseFragment(strings.NewReader(text), contextNode) + if err != nil { + t.Fatal(err) + } + doc = &Node{ + Type: DocumentNode, + } + for _, n := range nodes { + doc.Add(n) + } } + got, err := dump(doc) if err != nil { t.Fatal(err) @@ -184,7 +216,7 @@ func TestParser(t *testing.T) { t.Errorf("%s test #%d %q, got vs want:\n----\n%s----\n%s----", tf.filename, i, text, got, want) continue } - if renderTestBlacklist[text] { + if renderTestBlacklist[text] || context != "" { continue } // Check that rendering and re-parsing results in an identical tree. -- 2.48.1