From 9f3b00579eca946337d486776797b78aaf3bc55b Mon Sep 17 00:00:00 2001
From: Andrew Balholm <andybalholm@gmail.com>
Date: Wed, 1 Aug 2012 09:35:02 +1000
Subject: [PATCH] exp/html: tokenize attributes of end tags

If an end tag has an attribute that is a quoted string containing '>',
the tokenizer would end the tag prematurely. Now it reads the attributes
on end tags just as it does on start tags, but the high-level interface
still doesn't return them, because their presence is a parse error.

Pass 1 additional test.

R=nigeltao
CC=golang-dev
https://golang.org/cl/6457060
---
 .../exp/html/testlogs/scriptdata01.dat.log    |  2 +-
 src/pkg/exp/html/token.go                     | 71 ++++++++-----------
 2 files changed, 32 insertions(+), 41 deletions(-)
diff --git a/src/pkg/exp/html/testlogs/scriptdata01.dat.log b/src/pkg/exp/html/testlogs/scriptdata01.dat.log
index d5c9d6e331..85b9284d51 100644
--- a/src/pkg/exp/html/testlogs/scriptdata01.dat.log
+++ b/src/pkg/exp/html/testlogs/scriptdata01.dat.log
@@ -4,7 +4,7 @@ PASS "FOO<script></script >BAR"
 PASS "FOO<script></script/>BAR"
 PASS "FOO<script></script/ >BAR"
 PASS "FOO<script type=\"text/plain\"></scriptx>BAR"
-FAIL "FOO<script></script foo=\">\" dd>BAR"
+PASS "FOO<script></script foo=\">\" dd>BAR"
 PASS "FOO<script>'<'</script>BAR"
 PASS "FOO<script>'<!'</script>BAR"
 PASS "FOO<script>'<!-'</script>BAR"
diff --git a/src/pkg/exp/html/token.go b/src/pkg/exp/html/token.go
index d4867fc173..7e431c21ef 100644
--- a/src/pkg/exp/html/token.go
+++ b/src/pkg/exp/html/token.go
@@ -468,29 +468,10 @@ loop:
 // readStartTag reads the next start tag token. The opening "<a" has already
 // been consumed, where 'a' means anything in [A-Za-z].
 func (z *Tokenizer) readStartTag() TokenType {
-	z.attr = z.attr[:0]
-	z.nAttrReturned = 0
-	// Read the tag name and attribute key/value pairs.
-	z.readTagName()
-	if z.skipWhiteSpace(); z.err != nil {
+	z.readTag()
+	if z.err != nil && len(z.attr) == 0 {
 		return ErrorToken
 	}
-	for {
-		c := z.readByte()
-		if z.err != nil || c == '>' {
-			break
-		}
-		z.raw.end--
-		z.readTagAttrKey()
-		z.readTagAttrVal()
-		// Save pendingAttr if it has a non-empty key.
-		if z.pendingAttr[0].start != z.pendingAttr[0].end {
-			z.attr = append(z.attr, z.pendingAttr)
-		}
-		if z.skipWhiteSpace(); z.err != nil {
-			break
-		}
-	}
 	// Several tags flag the tokenizer's next token as raw.
 	c, raw := z.buf[z.data.start], false
 	if 'A' <= c && c <= 'Z' {
@@ -520,16 +501,30 @@ func (z *Tokenizer) readStartTag() TokenType {
 	return StartTagToken
 }
 
-// readEndTag reads the next end tag token. The opening "</a" has already
-// been consumed, where 'a' means anything in [A-Za-z].
-func (z *Tokenizer) readEndTag() {
+// readTag reads the next tag token. The opening "<a" or "</a" has already been
+// consumed, where 'a' means anything in [A-Za-z].
+func (z *Tokenizer) readTag() {
 	z.attr = z.attr[:0]
 	z.nAttrReturned = 0
+	// Read the tag name and attribute key/value pairs.
 	z.readTagName()
+	if z.skipWhiteSpace(); z.err != nil {
+		return
+	}
 	for {
 		c := z.readByte()
 		if z.err != nil || c == '>' {
-			return
+			break
+		}
+		z.raw.end--
+		z.readTagAttrKey()
+		z.readTagAttrVal()
+		// Save pendingAttr if it has a non-empty key.
+		if z.pendingAttr[0].start != z.pendingAttr[0].end {
+			z.attr = append(z.attr, z.pendingAttr)
+		}
+		if z.skipWhiteSpace(); z.err != nil {
+			break
 		}
 	}
 }
@@ -727,7 +722,7 @@ loop:
 				continue loop
 			}
 			if 'a' <= c && c <= 'z' || 'A' <= c && c <= 'Z' {
-				z.readEndTag()
+				z.readTag()
 				z.tt = EndTagToken
 				return z.tt
 			}
@@ -858,22 +853,18 @@ func (z *Tokenizer) Token() Token {
 	switch z.tt {
 	case TextToken, CommentToken, DoctypeToken:
 		t.Data = string(z.Text())
-	case StartTagToken, SelfClosingTagToken:
-		var attr []Attribute
+	case StartTagToken, SelfClosingTagToken, EndTagToken:
 		name, moreAttr := z.TagName()
-		for moreAttr {
-			var key, val []byte
-			key, val, moreAttr = z.TagAttr()
-			attr = append(attr, Attribute{"", atom.String(key), string(val)})
-		}
-		if a := atom.Lookup(name); a != 0 {
-			t.DataAtom, t.Data = a, a.String()
-		} else {
-			t.DataAtom, t.Data = 0, string(name)
+		// Since end tags should not have attributes, the high-level tokenizer
+		// interface will not return attributes for an end tag token even if
+		// it looks like </br foo="bar">.
+		if z.tt != EndTagToken {
+			for moreAttr {
+				var key, val []byte
+				key, val, moreAttr = z.TagAttr()
+				t.Attr = append(t.Attr, Attribute{"", atom.String(key), string(val)})
+			}
 		}
-		t.Attr = attr
-	case EndTagToken:
-		name, _ := z.TagName()
 		if a := atom.Lookup(name); a != 0 {
 			t.DataAtom, t.Data = a, a.String()
 		} else {
-- 
2.50.0