html: improve attribute parsing, note package status

author Brad Fitzpatrick <bradfitz@golang.org>

Mon, 6 Jun 2011 22:56:15 +0000 (15:56 -0700)

committer Brad Fitzpatrick <bradfitz@golang.org>

Mon, 6 Jun 2011 22:56:15 +0000 (15:56 -0700)
author Brad Fitzpatrick <bradfitz@golang.org>
Mon, 6 Jun 2011 22:56:15 +0000 (15:56 -0700)
committer Brad Fitzpatrick <bradfitz@golang.org>
Mon, 6 Jun 2011 22:56:15 +0000 (15:56 -0700)
diff --git a/src/pkg/html/doc.go b/src/pkg/html/doc.go

index 55135c3d05fb198324a9884f6ca8463a66438c79..5bc0630861ae5565e70a16d27cb36388a14e8181 100644 (file)
--- a/src/pkg/html/doc.go
+++ b/src/pkg/html/doc.go
@@ -4,6 +4,7 @@
  
  /*
  Package html implements an HTML5-compliant tokenizer and parser.
+INCOMPLETE.
  
  Tokenization is done by creating a Tokenizer for an io.Reader r. It is the
  caller's responsibility to ensure that r provides UTF-8 encoded HTML.
diff --git a/src/pkg/html/token.go b/src/pkg/html/token.go

index 6d8eb604ef4f0faa219a84031c01dd1a3fa5a974..23c95ece6f6806ddb5034e4a7ee2f9cd68f5107b 100644 (file)
--- a/src/pkg/html/token.go
+++ b/src/pkg/html/token.go
@@ -355,6 +355,33 @@ loop:
         return z.buf[i0:i], z.trim(i)
  }
  
+// attrName finds the largest attribute name at the start
+// of z.buf[i:] and returns it lower-cased, as well
+// as the trimmed cursor location after that word.
+//
+// http://dev.w3.org/html5/spec/Overview.html#syntax-attribute-name
+// TODO: unicode characters
+func (z *Tokenizer) attrName(i int) ([]byte, int) {
+       i0 := i
+loop:
+       for ; i < z.p1; i++ {
+               c := z.buf[i]
+               switch c {
+               case '<', '>', '"', '\'', '/', '=':
+                       break loop
+               }
+               switch {
+               case 'A' <= c && c <= 'Z':
+                       z.buf[i] = c + 'a' - 'A'
+               case c > ' ' && c < 0x7f:
+                       // No-op.
+               default:
+                       break loop
+               }
+       }
+       return z.buf[i0:i], z.trim(i)
+}
+
  // Text returns the unescaped text of a TextToken or a CommentToken.
  // The contents of the returned slice may change on the next call to Next.
  func (z *Tokenizer) Text() []byte {
@@ -399,7 +426,7 @@ func (z *Tokenizer) TagName() (name []byte, hasAttr bool) {
  // attribute for the current tag token and whether there are more attributes.
  // The contents of the returned slices may change on the next call to Next.
  func (z *Tokenizer) TagAttr() (key, val []byte, moreAttr bool) {
-       key, i := z.word(z.p0, true)
+       key, i := z.attrName(z.p0)
         // Check for an empty attribute value.
         if i == z.p1 {
                 z.p0 = i
diff --git a/src/pkg/html/token_test.go b/src/pkg/html/token_test.go

index 6291af600579d2d534fcbd18e250b554eca7614e..c17b436aab4070f6fbb79b38b3868bf13ff68516 100644 (file)
--- a/src/pkg/html/token_test.go
+++ b/src/pkg/html/token_test.go
@@ -125,6 +125,11 @@ var tokenTests = []tokenTest{
                 `<input value=yes FOO=BAR>`,
                 `<input value="yes" foo="BAR">`,
         },
+       {
+               "Unquoted attribute value, spaces",
+               `<input value = yes FOO = BAR>`,
+               `<input value="yes" foo="BAR">`,
+       },
         {
                 "Unquoted attribute value, trailing space",
                 `<input value=yes FOO=BAR >`,
@@ -145,6 +150,11 @@ var tokenTests = []tokenTest{
                 `<input value="I'm an attribute" FOO="BAR">`,
                 `<input value="I&apos;m an attribute" foo="BAR">`,
         },
+       {
+               "Attribute name characters",
+               `<meta http-equiv="content-type">`,
+               `<meta http-equiv="content-type">`,
+       },
  }
  
  func TestTokenizer(t *testing.T) {
author	Brad Fitzpatrick <bradfitz@golang.org>
	Mon, 6 Jun 2011 22:56:15 +0000 (15:56 -0700)
committer	Brad Fitzpatrick <bradfitz@golang.org>
	Mon, 6 Jun 2011 22:56:15 +0000 (15:56 -0700)
src/pkg/html/doc.go		patch \| blob \| history
src/pkg/html/token.go		patch \| blob \| history
src/pkg/html/token_test.go		patch \| blob \| history