html: parse empty, unquoted, and single-quoted attribute values

author Brad Fitzpatrick <bradfitz@golang.org>

Thu, 12 May 2011 23:11:35 +0000 (16:11 -0700)

committer Brad Fitzpatrick <bradfitz@golang.org>

Thu, 12 May 2011 23:11:35 +0000 (16:11 -0700)
author Brad Fitzpatrick <bradfitz@golang.org>
Thu, 12 May 2011 23:11:35 +0000 (16:11 -0700)
committer Brad Fitzpatrick <bradfitz@golang.org>
Thu, 12 May 2011 23:11:35 +0000 (16:11 -0700)
diff --git a/src/pkg/html/token.go b/src/pkg/html/token.go

index ad03241ed920adce0c49540fc7fd7b4a2a09da00..6d8eb604ef4f0faa219a84031c01dd1a3fa5a974 100644 (file)
--- a/src/pkg/html/token.go
+++ b/src/pkg/html/token.go
@@ -331,10 +331,10 @@ func (z *Tokenizer) trim(i int) int {
         return k
  }
  
-// lower finds the largest alphabetic [0-9A-Za-z]* word at the start of z.buf[i:]
-// and returns that word lower-cased, as well as the trimmed cursor location
-// after that word.
-func (z *Tokenizer) lower(i int) ([]byte, int) {
+// word finds the largest alphabetic [0-9A-Za-z]* word at the start
+// of z.buf[i:] and returns that word (optionally lower-cased), as
+// well as the trimmed cursor location after that word.
+func (z *Tokenizer) word(i int, lower bool) ([]byte, int) {
         i0 := i
  loop:
         for ; i < z.p1; i++ {
@@ -343,7 +343,9 @@ loop:
                 case '0' <= c && c <= '9':
                         // No-op.
                 case 'A' <= c && c <= 'Z':
-                       z.buf[i] = c + 'a' - 'A'
+                       if lower {
+                               z.buf[i] = c + 'a' - 'A'
+                       }
                 case 'a' <= c && c <= 'z':
                         // No-op.
                 default:
@@ -388,7 +390,7 @@ func (z *Tokenizer) TagName() (name []byte, hasAttr bool) {
         if z.buf[i] == '/' {
                 i++
         }
-       name, z.p0 = z.lower(i)
+       name, z.p0 = z.word(i, true)
         hasAttr = z.p0 != z.p1
         return
  }
@@ -397,23 +399,36 @@ func (z *Tokenizer) TagName() (name []byte, hasAttr bool) {
  // attribute for the current tag token and whether there are more attributes.
  // The contents of the returned slices may change on the next call to Next.
  func (z *Tokenizer) TagAttr() (key, val []byte, moreAttr bool) {
-       key, i := z.lower(z.p0)
-       // Get past the "=\"".
-       if i == z.p1 || z.buf[i] != '=' {
+       key, i := z.word(z.p0, true)
+       // Check for an empty attribute value.
+       if i == z.p1 {
+               z.p0 = i
+               return
+       }
+       // Get past the equals and quote characters.
+       if z.buf[i] != '=' {
+               z.p0, moreAttr = i, true
                 return
         }
         i = z.trim(i + 1)
-       if i == z.p1 || z.buf[i] != '"' {
+       if i == z.p1 {
+               z.p0 = i
+               return
+       }
+       closeQuote := z.buf[i]
+       if closeQuote != '\'' && closeQuote != '"' {
+               val, z.p0 = z.word(i, false)
+               moreAttr = z.p0 != z.p1
                 return
         }
         i = z.trim(i + 1)
-       // Copy and unescape everything up to the closing '"'.
+       // Copy and unescape everything up to the closing quote.
         dst, src := i, i
  loop:
         for src < z.p1 {
                 c := z.buf[src]
                 switch c {
-               case '"':
+               case closeQuote:
                         src++
                         break loop
                 case '&':
diff --git a/src/pkg/html/token_test.go b/src/pkg/html/token_test.go

index 5cf1f6dac30b9df19b899a7503c5ce2421c7941e..e474c92812549965d2fdc249eb753b5fa19db5f0 100644 (file)
--- a/src/pkg/html/token_test.go
+++ b/src/pkg/html/token_test.go
@@ -107,6 +107,44 @@ var tokenTests = []tokenTest{
                 `<a b="c&noSuchEntity;d">&lt;&alsoDoesntExist;&`,
                 `<a b="c&amp;noSuchEntity;d">$&lt;&amp;alsoDoesntExist;&amp;`,
         },
+
+       // Attribute tests:
+       // http://dev.w3.org/html5/spec/Overview.html#attributes-0
+       {
+               "Empty attribute",
+               `<input disabled FOO>`,
+               `<input disabled="" foo="">`,
+       },
+       {
+               "Empty attribute, whitespace",
+               `<input disabled FOO >`,
+               `<input disabled="" foo="">`,
+       },
+       {
+               "Unquoted attribute value",
+               `<input value=yes FOO=BAR>`,
+               `<input value="yes" foo="BAR">`,
+       },
+       {
+               "Unquoted attribute value, trailing space",
+               `<input value=yes FOO=BAR >`,
+               `<input value="yes" foo="BAR">`,
+       },
+       {
+               "Single-quoted attribute value",
+               `<input value='yes' FOO='BAR'>`,
+               `<input value="yes" foo="BAR">`,
+       },
+       {
+               "Single-quoted attribute value, trailing space",
+               `<input value='yes' FOO='BAR' >`,
+               `<input value="yes" foo="BAR">`,
+       },
+       {
+               "Double-quoted attribute value",
+               `<input value="I'm an attribute" FOO="BAR">`,
+               `<input value="I&apos;m an attribute" foo="BAR">`,
+       },
  }
  
  func TestTokenizer(t *testing.T) {
author	Brad Fitzpatrick <bradfitz@golang.org>
	Thu, 12 May 2011 23:11:35 +0000 (16:11 -0700)
committer	Brad Fitzpatrick <bradfitz@golang.org>
	Thu, 12 May 2011 23:11:35 +0000 (16:11 -0700)
src/pkg/html/token.go		patch \| blob \| history
src/pkg/html/token_test.go		patch \| blob \| history