]> Cypherpunks repositories - gostls13.git/commitdiff
html: improve parsing of comments and "bogus comments"
authorAndrew Balholm <andybalholm@gmail.com>
Sat, 15 Oct 2011 01:22:08 +0000 (12:22 +1100)
committerNigel Tao <nigeltao@golang.org>
Sat, 15 Oct 2011 01:22:08 +0000 (12:22 +1100)
R=nigeltao
CC=golang-dev
https://golang.org/cl/5279044

src/pkg/html/token.go
src/pkg/html/token_test.go

index dcece8cacb439528975d4ef8ccb5b09a719c2a3e..a02b968dc2881d6bb13c52697da45758dfb1a6db 100644 (file)
@@ -100,9 +100,9 @@ func (t Token) String() string {
        case SelfClosingTagToken:
                return "<" + t.tagString() + "/>"
        case CommentToken:
-               return "<!--" + EscapeString(t.Data) + "-->"
+               return "<!--" + t.Data + "-->"
        case DoctypeToken:
-               return "<!DOCTYPE " + EscapeString(t.Data) + ">"
+               return "<!DOCTYPE " + t.Data + ">"
        }
        return "Invalid(" + strconv.Itoa(int(t.Type)) + ")"
 }
@@ -227,30 +227,62 @@ func (z *Tokenizer) skipWhiteSpace() {
 
 // nextComment reads the next token starting with "<!--".
 // The opening "<!--" has already been consumed.
-// Pre-condition: z.tt == TextToken && z.err == nil &&
+// Pre-condition: z.tt == CommentToken && z.err == nil &&
 //   z.raw.start + 4 <= z.raw.end.
 func (z *Tokenizer) nextComment() {
-       // <!--> is a valid comment.
+       z.data.start = z.raw.end
+       defer func() {
+               if z.data.end < z.data.start {
+                       // It's a comment with no data, like <!-->.
+                       z.data.end = z.data.start
+               }
+       }()
        for dashCount := 2; ; {
                c := z.readByte()
                if z.err != nil {
-                       z.data = z.raw
+                       z.data.end = z.raw.end
                        return
                }
                switch c {
                case '-':
                        dashCount++
+                       continue
                case '>':
                        if dashCount >= 2 {
-                               z.tt = CommentToken
-                               // TODO: adjust z.data to be only the "x" in "<!--x-->".
-                               // Note that "<!>" is also a valid HTML5 comment.
-                               z.data = z.raw
+                               z.data.end = z.raw.end - len("-->")
                                return
                        }
-                       dashCount = 0
-               default:
-                       dashCount = 0
+               case '!':
+                       if dashCount >= 2 {
+                               c = z.readByte()
+                               if z.err != nil {
+                                       z.data.end = z.raw.end
+                                       return
+                               }
+                               if c == '>' {
+                                       z.data.end = z.raw.end - len("--!>")
+                                       return
+                               }
+                       }
+               }
+               dashCount = 0
+       }
+}
+
+// nextBogusComment reads text until the next ">" and treats it as a comment.
+// Pre-condition: z.err == nil && z.raw.end is before the first comment byte.
+func (z *Tokenizer) nextBogusComment() {
+       z.tt = CommentToken
+       z.data.start = z.raw.end
+       for {
+               c := z.readByte()
+               if z.err != nil {
+                       z.data.end = z.raw.end
+                       return
+               }
+               if c == '>' {
+                       z.data.end = z.raw.end - len(">")
+                       return
                }
        }
 }
@@ -258,13 +290,15 @@ func (z *Tokenizer) nextComment() {
 // nextMarkupDeclaration reads the next token starting with "<!".
 // It might be a "<!--comment-->", a "<!DOCTYPE foo>", or "<!malformed text".
 // The opening "<!" has already been consumed.
-// Pre-condition: z.tt == TextToken && z.err == nil &&
-//   z.raw.start + 2 <= z.raw.end.
+// Pre-condition: z.err == nil && z.raw.start + 2 <= z.raw.end.
 func (z *Tokenizer) nextMarkupDeclaration() {
+       z.tt = CommentToken
+       z.data.start = z.raw.end
        var c [2]byte
        for i := 0; i < 2; i++ {
                c[i] = z.readByte()
                if z.err != nil {
+                       z.data.end = z.raw.end
                        return
                }
        }
@@ -273,27 +307,35 @@ func (z *Tokenizer) nextMarkupDeclaration() {
                return
        }
        z.raw.end -= 2
-       const s = "DOCTYPE "
-       for i := 0; ; i++ {
+       const s = "DOCTYPE"
+       for i := 0; i < len(s); i++ {
                c := z.readByte()
                if z.err != nil {
-                       z.data = z.raw
+                       z.data.end = z.raw.end
                        return
                }
-               // Capitalize c.
-               if 'a' <= c && c <= 'z' {
-                       c = 'A' + (c - 'a')
+               if c != s[i] && c != s[i]+('a'-'A') {
+                       // Back up to read the fragment of "DOCTYPE" again.
+                       z.raw.end = z.data.start
+                       z.nextBogusComment()
+                       return
                }
-               if i < len(s) && c != s[i] {
-                       z.nextText()
+       }
+       z.tt = DoctypeToken
+       if z.skipWhiteSpace(); z.err != nil {
+               z.data.start = z.raw.end
+               z.data.end = z.raw.end
+               return
+       }
+       z.data.start = z.raw.end
+       for {
+               c := z.readByte()
+               if z.err != nil {
+                       z.data.end = z.raw.end
                        return
                }
                if c == '>' {
-                       if i >= len(s) {
-                               z.tt = DoctypeToken
-                               z.data.start = z.raw.start + len("<!DOCTYPE ")
-                               z.data.end = z.raw.end - len(">")
-                       }
+                       z.data.end = z.raw.end - len(">")
                        return
                }
        }
@@ -311,8 +353,18 @@ func (z *Tokenizer) nextTag() {
                return
        }
        switch {
-       // TODO: check that the "</" is followed by something in A-Za-z.
        case c == '/':
+               // Check that the "</" is followed by something in A-Za-z.
+               c = z.readByte()
+               if z.err != nil {
+                       z.data = z.raw
+                       return
+               }
+               z.raw.end--
+               if !('a' <= c && c <= 'z' || 'A' <= c && c <= 'Z') {
+                       z.nextBogusComment()
+                       return
+               }
                z.tt = EndTagToken
                z.data.start += len("</")
        // Lower-cased characters are more common in tag names, so we check for them first.
@@ -323,7 +375,8 @@ func (z *Tokenizer) nextTag() {
                z.nextMarkupDeclaration()
                return
        case c == '?':
-               z.tt, z.err = ErrorToken, os.NewError("html: TODO: implement XML processing instructions")
+               z.raw.end--
+               z.nextBogusComment()
                return
        default:
                z.tt, z.err = ErrorToken, os.NewError("html: TODO: handle malformed tags")
index f1082fce430e9bc3fa3559fec71173d43804f193..986e6cb748af04defd0ed1fd062f3356f2077227 100644 (file)
@@ -87,51 +87,88 @@ var tokenTests = []tokenTest{
                `<p id="0"</p>`,
                `<p id="0" <="" p="">`,
        },
+       // DOCTYPE tests.
+       {
+               "Proper DOCTYPE",
+               "<!DOCTYPE html>",
+               "<!DOCTYPE html>",
+       },
+       {
+               "DOCTYPE with no space",
+               "<!doctypehtml>",
+               "<!DOCTYPE html>",
+       },
+       {
+               "DOCTYPE with two spaces",
+               "<!doctype  html>",
+               "<!DOCTYPE html>",
+       },
+       {
+               "looks like DOCTYPE but isn't",
+               "<!DOCUMENT html>",
+               "<!--DOCUMENT html-->",
+       },
+       {
+               "DOCTYPE at EOF",
+               "<!DOCtype",
+               "<!DOCTYPE >",
+       },
+       // XML processing instructions.
+       {
+               "XML processing instruction",
+               "<?xml?>",
+               "<!--?xml?-->",
+       },
        // Comments.
        {
                "comment0",
                "abc<b><!-- skipme --></b>def",
-               "abc$<b>$</b>$def",
+               "abc$<b>$<!-- skipme -->$</b>$def",
        },
        {
                "comment1",
                "a<!-->z",
-               "a$z",
+               "a$<!---->$z",
        },
        {
                "comment2",
                "a<!--->z",
-               "a$z",
+               "a$<!---->$z",
        },
        {
                "comment3",
                "a<!--x>-->z",
-               "a$z",
+               "a$<!--x>-->$z",
        },
        {
                "comment4",
                "a<!--x->-->z",
-               "a$z",
+               "a$<!--x->-->$z",
        },
        {
                "comment5",
                "a<!>z",
-               "a$&lt;!&gt;z",
+               "a$<!---->$z",
        },
        {
                "comment6",
                "a<!->z",
-               "a$&lt;!-&gt;z",
+               "a$<!----->$z",
        },
        {
                "comment7",
                "a<!---<>z",
-               "a$&lt;!---&lt;&gt;z",
+               "a$<!---<>z-->",
        },
        {
                "comment8",
                "a<!--z",
-               "a$&lt;!--z",
+               "a$<!--z-->",
+       },
+       {
+               "comment9",
+               "a<!--x--!>z",
+               "a$<!--x-->$z",
        },
        // An attribute with a backslash.
        {
@@ -229,6 +266,7 @@ func TestTokenizer(t *testing.T) {
 loop:
        for _, tt := range tokenTests {
                z := NewTokenizer(bytes.NewBuffer([]byte(tt.html)))
+               z.ReturnComments = true
                for i, s := range strings.Split(tt.golden, "$") {
                        if z.Next() == ErrorToken {
                                t.Errorf("%s token %d: want %q got error %v", tt.desc, i, s, z.Error())