html: tokenize "a < b" as one whole text token.
author    Nigel Tao <nigeltao@golang.org>
          Sun, 16 Oct 2011 09:50:11 +0000 (20:50 +1100)
committer Nigel Tao <nigeltao@golang.org>
          Sun, 16 Oct 2011 09:50:11 +0000 (20:50 +1100)
R=andybalholm
CC=golang-dev
https://golang.org/cl/5284042
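
The behavioral change, seen through the tokenizer API that token_test.go
below exercises (a minimal sketch, assuming the pre-Go1 src/pkg/html
package of this era): before this CL, the stray '<' in "a < b" hit the
"TODO: handle malformed tags" error path; after it, the whole input comes
back as a single text token.

    z := NewTokenizer(bytes.NewBuffer([]byte("a < b")))
    if z.Next() == TextToken {
            // The '<' comes back escaped when the token is printed.
            fmt.Println(z.Token().String()) // "a &lt; b"
    }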

src/pkg/html/token.go
src/pkg/html/token_test.go

diff --git a/src/pkg/html/token.go b/src/pkg/html/token.go
index a02b968dc2881d6bb13c52697da45758dfb1a6db..2105cc6f5610fdd523c689348e8a8431c94682b6 100644
--- a/src/pkg/html/token.go
+++ b/src/pkg/html/token.go
@@ -379,15 +379,16 @@ func (z *Tokenizer) nextTag() {
                z.nextBogusComment()
                return
        default:
-               z.tt, z.err = ErrorToken, os.NewError("html: TODO: handle malformed tags")
+               z.nextText()
                return
        }
        // Read the tag name and attribute key/value pairs.
        z.readTagName()
+       if z.skipWhiteSpace(); z.err != nil {
+               z.tt = ErrorToken
+               return
+       }
        for {
-               if z.skipWhiteSpace(); z.err != nil {
-                       break
-               }
                c := z.readByte()
                if z.err != nil || c == '>' {
                        break
@@ -399,6 +400,9 @@ func (z *Tokenizer) nextTag() {
                if z.pendingAttr[0].start != z.pendingAttr[0].end {
                        z.attr = append(z.attr, z.pendingAttr)
                }
+               if z.skipWhiteSpace(); z.err != nil {
+                       break
+               }
        }
        // Check for a self-closing token.
        if z.err == nil && z.tt == StartTagToken && z.buf[z.raw.end-2] == '/' {
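
One consequence of hoisting skipWhiteSpace out of the loop head: a read
error while skipping the whitespace after a tag name now surfaces as an
explicit ErrorToken before any attribute parsing runs, which the new
"tag name eof" test cases below pin down. A small sketch (hypothetical
driver, same API as the tests):

    z := NewTokenizer(bytes.NewBuffer([]byte("<a ")))
    if z.Next() == ErrorToken {
            // z.Error() reports the underlying read error (EOF here),
            // matching the "tag name eof #1" case in token_test.go.
    }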
@@ -510,21 +514,40 @@ func (z *Tokenizer) readTagAttrVal() {
        }
 }
 
-// nextText reads all text up until an '<'.
-// Pre-condition: z.tt == TextToken && z.err == nil && z.raw.start + 1 <= z.raw.end.
+// nextText reads all text up until a start tag "<a", end tag "</a", comment
+// "<!" or XML processing instruction "<?".
+// Pre-condition: z.tt == TextToken && z.err == nil &&
+//   z.raw.start + 1 <= z.raw.end.
 func (z *Tokenizer) nextText() {
        for {
                c := z.readByte()
                if z.err != nil {
-                       z.data = z.raw
-                       return
+                       break
                }
-               if c == '<' {
-                       z.raw.end--
-                       z.data = z.raw
-                       return
+               if c != '<' {
+                       continue
+               }
+               c = z.readByte()
+               if z.err != nil {
+                       break
+               }
+               if 'a' <= c && c <= 'z' || 'A' <= c && c <= 'Z' || c == '!' || c == '?' {
+                       z.raw.end -= 2
+                       break
+               }
+               if c != '/' {
+                       continue
+               }
+               c = z.readByte()
+               if z.err != nil {
+                       break
+               }
+               if 'a' <= c && c <= 'z' || 'A' <= c && c <= 'Z' {
+                       z.raw.end -= 3
+                       break
                }
        }
+       z.data = z.raw
 }
 
 // Next scans the next token and returns its type.
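
For reference, the lookahead that nextText now performs after reading a
'<' can be restated as a small predicate. This is an illustrative
paraphrase of the checks above, not code from the CL:

    // looksLikeMarkup reports whether the byte c, read after "<" (or
    // after "</" when slash is true), begins a start tag, end tag,
    // comment "<!" or processing instruction "<?". Anything else leaves
    // the '<' inside the current text token.
    func looksLikeMarkup(c byte, slash bool) bool {
            letter := 'a' <= c && c <= 'z' || 'A' <= c && c <= 'Z'
            if slash {
                    return letter
            }
            return letter || c == '!' || c == '?'
    }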
diff --git a/src/pkg/html/token_test.go b/src/pkg/html/token_test.go
index 986e6cb748af04defd0ed1fd062f3356f2077227..09bb75be15712a65f6eff96c7b8ba2b68d1c9b61 100644
--- a/src/pkg/html/token_test.go
+++ b/src/pkg/html/token_test.go
@@ -21,6 +21,11 @@ type tokenTest struct {
 }
 
 var tokenTests = []tokenTest{
+       {
+               "empty",
+               "",
+               "",
+       },
        // A single text node. The tokenizer should not break text nodes on whitespace,
        // nor should it normalize whitespace within a text node.
        {
@@ -41,6 +46,81 @@ var tokenTests = []tokenTest{
                "<a>b<c/>d</e>",
                "<a>$b$<c/>$d$</e>",
        },
+       // Angle brackets that aren't a tag.
+       {
+               "not a tag #0",
+               "<",
+               "&lt;",
+       },
+       {
+               "not a tag #1",
+               "</",
+               "&lt;/",
+       },
+       /*
+               // TODO: re-enable these tests when we tokenize them correctly.
+               {
+                       "not a tag #2",
+                       "</>",
+                       "",
+               },
+               {
+                       "not a tag #3",
+                       "a</>b",
+                       "a$b",
+               },
+       */
+       {
+               "not a tag #4",
+               "</ >",
+               "<!-- -->",
+       },
+       {
+               "not a tag #5",
+               "a < b",
+               "a &lt; b",
+       },
+       {
+               "not a tag #6",
+               "<.>",
+               "&lt;.&gt;",
+       },
+       {
+               "not a tag #7",
+               "a<<<b>>>c",
+               "a&lt;&lt;$<b>$&gt;&gt;c",
+       },
+       {
+               "not a tag #8",
+               "if x<0 and y < 0 then x*y>0",
+               "if x&lt;0 and y &lt; 0 then x*y&gt;0",
+       },
+       // EOF in a tag name.
+       {
+               "tag name eof #0",
+               "<a",
+               "",
+       },
+       {
+               "tag name eof #1",
+               "<a ",
+               "",
+       },
+       {
+               "tag name eof #2",
+               "a<b",
+               "a",
+       },
+       {
+               "tag name eof #3",
+               "<a><b",
+               "<a>",
+       },
+       {
+               "tag name eof #4",
+               `<a x`,
+               `<a x="">`,
+       },
        // Some malformed tags that are missing a '>'.
        {
                "malformed tag #0",
@@ -257,8 +337,8 @@ var tokenTests = []tokenTest{
        },
        {
                "Attributes with a solitary single quote",
-               "<p id=can't><p id=won't>",
-               "<p id=\"can&apos;t\">$<p id=\"won&apos;t\">",
+               `<p id=can't><p id=won't>`,
+               `<p id="can&apos;t">$<p id="won&apos;t">`,
        },
 }
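
A note on the golden strings in this table: "$" separates consecutive
expected tokens, and text is written in its escaped form, exactly as the
test loop below consumes it via strings.Split. For example, "not a tag #7"
expands to three expected tokens:

    want := strings.Split("a&lt;&lt;$<b>$&gt;&gt;c", "$")
    // want == []string{"a&lt;&lt;", "<b>", "&gt;&gt;c"}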
 
@@ -267,15 +347,17 @@ loop:
        for _, tt := range tokenTests {
                z := NewTokenizer(bytes.NewBuffer([]byte(tt.html)))
                z.ReturnComments = true
-               for i, s := range strings.Split(tt.golden, "$") {
-                       if z.Next() == ErrorToken {
-                               t.Errorf("%s token %d: want %q got error %v", tt.desc, i, s, z.Error())
-                               continue loop
-                       }
-                       actual := z.Token().String()
-                       if s != actual {
-                               t.Errorf("%s token %d: want %q got %q", tt.desc, i, s, actual)
-                               continue loop
+               if tt.golden != "" {
+                       for i, s := range strings.Split(tt.golden, "$") {
+                               if z.Next() == ErrorToken {
+                                       t.Errorf("%s token %d: want %q got error %v", tt.desc, i, s, z.Error())
+                                       continue loop
+                               }
+                               actual := z.Token().String()
+                               if s != actual {
+                                       t.Errorf("%s token %d: want %q got %q", tt.desc, i, s, actual)
+                                       continue loop
+                               }
                        }
                }
                z.Next()
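
The tt.golden != "" guard added above is needed because strings.Split
never returns an empty slice: splitting the empty string still yields a
single empty element, so the new "empty" test case would otherwise expect
one empty token rather than an immediate ErrorToken.

    // strings.Split("", "$") == []string{""}: length 1, not 0.
    fmt.Println(len(strings.Split("", "$"))) // 1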