html: parse malformed tags missing a '>', such as `<p id=0</p>`.

author Nigel Tao <nigeltao@golang.org>

Wed, 10 Aug 2011 03:39:07 +0000 (13:39 +1000)

committer Nigel Tao <nigeltao@golang.org>

Wed, 10 Aug 2011 03:39:07 +0000 (13:39 +1000)
author Nigel Tao <nigeltao@golang.org>
Wed, 10 Aug 2011 03:39:07 +0000 (13:39 +1000)
committer Nigel Tao <nigeltao@golang.org>
Wed, 10 Aug 2011 03:39:07 +0000 (13:39 +1000)
diff --git a/src/pkg/html/token.go b/src/pkg/html/token.go

index d280ff225609c145e1231d383b599b7cc9723eaf..fddc922d6016d9903fbfb9fc2c2cf69fdd74b4c0 100644 (file)
--- a/src/pkg/html/token.go
+++ b/src/pkg/html/token.go
@@ -276,13 +276,12 @@ func (z *Tokenizer) nextTag() {
         if z.err != nil {
                 return
         }
-       var tt TokenType
         switch {
         case c == '/':
-               tt = EndTagToken
+               z.tt = EndTagToken
         // Lower-cased characters are more common in tag names, so we check for them first.
         case 'a' <= c && c <= 'z' || 'A' <= c && c <= 'Z':
-               tt = StartTagToken
+               z.tt = StartTagToken
         case c == '!':
                 z.nextMarkupDeclaration()
                 return
@@ -305,8 +304,7 @@ func (z *Tokenizer) nextTag() {
                                 return
                         }
                 case '>':
-                       z.tt = tt
-                       if z.buf[z.p1-2] == '/' && tt == StartTagToken {
+                       if z.buf[z.p1-2] == '/' && z.tt == StartTagToken {
                                 z.tt = SelfClosingTagToken
                         }
                         return
@@ -379,37 +377,53 @@ func (z *Tokenizer) trim(i int) int {
         return k
  }
  
-// word finds the largest alphabetic [0-9A-Za-z]* word at the start
-// of z.buf[i:] and returns that word (optionally lower-cased), as
-// well as the trimmed cursor location after that word.
-func (z *Tokenizer) word(i int, lower bool) ([]byte, int) {
+// tagName finds the tag name at the start of z.buf[i:] and returns that name
+// lower-cased, as well as the trimmed cursor location afterwards.
+func (z *Tokenizer) tagName(i int) ([]byte, int) {
         i0 := i
  loop:
         for ; i < z.p1; i++ {
                 c := z.buf[i]
-               switch {
-               case '0' <= c && c <= '9':
-                       // No-op.
-               case 'A' <= c && c <= 'Z':
-                       if lower {
-                               z.buf[i] = c + 'a' - 'A'
-                       }
-               case 'a' <= c && c <= 'z':
-                       // No-op.
-               default:
+               switch c {
+               case ' ', '\n', '\t', '\f', '/', '>':
                         break loop
                 }
+               if 'A' <= c && c <= 'Z' {
+                       z.buf[i] = c + 'a' - 'A'
+               }
+       }
+       return z.buf[i0:i], z.trim(i)
+}
+
+// unquotedAttrVal finds the unquoted attribute value at the start of z.buf[i:]
+// and returns that value, as well as the trimmed cursor location afterwards.
+func (z *Tokenizer) unquotedAttrVal(i int) ([]byte, int) {
+       i0 := i
+loop:
+       for ; i < z.p1; i++ {
+               switch z.buf[i] {
+               case ' ', '\n', '\t', '\f', '>':
+                       break loop
+               case '&':
+                       // TODO: unescape the entity.
+               }
         }
         return z.buf[i0:i], z.trim(i)
  }
  
  // attrName finds the largest attribute name at the start
  // of z.buf[i:] and returns it lower-cased, as well
-// as the trimmed cursor location after that word.
+// as the trimmed cursor location after that name.
  //
  // http://dev.w3.org/html5/spec/Overview.html#syntax-attribute-name
  // TODO: unicode characters
  func (z *Tokenizer) attrName(i int) ([]byte, int) {
+       for z.buf[i] == '/' {
+               i++
+               if z.buf[i] == '>' {
+                       return nil, z.trim(i)
+               }
+       }
         i0 := i
  loop:
         for ; i < z.p1; i++ {
@@ -469,7 +483,7 @@ func (z *Tokenizer) TagName() (name []byte, hasAttr bool) {
         if z.buf[i] == '/' {
                 i++
         }
-       name, z.p0 = z.word(i, true)
+       name, z.p0 = z.tagName(i)
         hasAttr = z.p0 != z.p1
         return
  }
@@ -496,7 +510,7 @@ func (z *Tokenizer) TagAttr() (key, val []byte, moreAttr bool) {
         }
         closeQuote := z.buf[i]
         if closeQuote != '\'' && closeQuote != '"' {
-               val, z.p0 = z.word(i, false)
+               val, z.p0 = z.unquotedAttrVal(i)
                 moreAttr = z.p0 != z.p1
                 return
         }
diff --git a/src/pkg/html/token_test.go b/src/pkg/html/token_test.go

index c8dcc88648def6a23c9ed825bc28b70bc4a8ee3d..1330f3247a15ecef2fc30c1a03f57572ee32ac68 100644 (file)
--- a/src/pkg/html/token_test.go
+++ b/src/pkg/html/token_test.go
@@ -41,6 +41,22 @@ var tokenTests = []tokenTest{
                 "<a>b<c/>d</e>",
                 "<a>$b$<c/>$d$</e>",
         },
+       // Some malformed tags that are missing a '>'.
+       {
+               "malformed tag #0",
+               `<p</p>`,
+               `<p< p="">`,
+       },
+       {
+               "malformed tag #1",
+               `<p id=0</p>`,
+               `<p id="0&lt;/p">`,
+       },
+       {
+               "malformed tag #2",
+               `<p id="0</p>`,
+               `<p id="0&lt;/p&gt;">`,
+       },
         // Comments.
         {
                 "comment0",
@@ -117,7 +133,6 @@ var tokenTests = []tokenTest{
                 "&frac12;",
                 "½",
         },
-
         // Attribute tests:
         // http://dev.w3.org/html5/spec/Overview.html#attributes-0
         {
author	Nigel Tao <nigeltao@golang.org>
	Wed, 10 Aug 2011 03:39:07 +0000 (13:39 +1000)
committer	Nigel Tao <nigeltao@golang.org>
	Wed, 10 Aug 2011 03:39:07 +0000 (13:39 +1000)
src/pkg/html/token.go		patch \| blob \| history
src/pkg/html/token_test.go		patch \| blob \| history