z.nextBogusComment()
return
default:
- z.tt, z.err = ErrorToken, os.NewError("html: TODO: handle malformed tags")
+ z.nextText()
return
}
// Read the tag name and attribute key/value pairs.
z.readTagName()
+ if z.skipWhiteSpace(); z.err != nil {
+ z.tt = ErrorToken
+ return
+ }
for {
- if z.skipWhiteSpace(); z.err != nil {
- break
- }
c := z.readByte()
if z.err != nil || c == '>' {
break
if z.pendingAttr[0].start != z.pendingAttr[0].end {
z.attr = append(z.attr, z.pendingAttr)
}
+ if z.skipWhiteSpace(); z.err != nil {
+ break
+ }
}
// Check for a self-closing token.
if z.err == nil && z.tt == StartTagToken && z.buf[z.raw.end-2] == '/' {
}
}
-// nextText reads all text up until an '<'.
-// Pre-condition: z.tt == TextToken && z.err == nil && z.raw.start + 1 <= z.raw.end.
+// nextText reads all text up until a start tag "<a", end tag "</a", comment
+// "<!" or XML processing instruction "<?".
+// Pre-condition: z.tt == TextToken && z.err == nil &&
+// z.raw.start + 1 <= z.raw.end.
func (z *Tokenizer) nextText() {
for {
c := z.readByte()
if z.err != nil {
- z.data = z.raw
- return
+ break
}
- if c == '<' {
- z.raw.end--
- z.data = z.raw
- return
+ if c != '<' {
+ continue
+ }
+ c = z.readByte()
+ if z.err != nil {
+ break
+ }
+ if 'a' <= c && c <= 'z' || 'A' <= c && c <= 'Z' || c == '!' || c == '?' {
+ z.raw.end -= 2
+ break
+ }
+ if c != '/' {
+ continue
+ }
+ c = z.readByte()
+ if z.err != nil {
+ break
+ }
+ if 'a' <= c && c <= 'z' || 'A' <= c && c <= 'Z' {
+ z.raw.end -= 3
+ break
}
}
+ z.data = z.raw
}
// Next scans the next token and returns its type.
}
var tokenTests = []tokenTest{
+ {
+ "empty",
+ "",
+ "",
+ },
// A single text node. The tokenizer should not break text nodes on whitespace,
// nor should it normalize whitespace within a text node.
{
"<a>b<c/>d</e>",
"<a>$b$<c/>$d$</e>",
},
+ // Angle brackets that aren't a tag.
+ {
+ "not a tag #0",
+ "<",
+ "<",
+ },
+ {
+ "not a tag #1",
+ "</",
+ "</",
+ },
+ /*
+ // TODO: re-enable these tests when we tokenize them correctly.
+ {
+ "not a tag #2",
+ "</>",
+ "",
+ },
+ {
+ "not a tag #3",
+ "a</>b",
+ "a$b",
+ },
+ */
+ {
+ "not a tag #4",
+ "</ >",
+ "<!-- -->",
+ },
+ {
+ "not a tag #5",
+ "a < b",
+ "a < b",
+ },
+ {
+ "not a tag #6",
+ "<.>",
+ "<.>",
+ },
+ {
+ "not a tag #7",
+ "a<<<b>>>c",
+ "a<<$<b>$>>c",
+ },
+ {
+ "not a tag #8",
+ "if x<0 and y < 0 then x*y>0",
+ "if x<0 and y < 0 then x*y>0",
+ },
+ // EOF in a tag name.
+ {
+ "tag name eof #0",
+ "<a",
+ "",
+ },
+ {
+ "tag name eof #1",
+ "<a ",
+ "",
+ },
+ {
+ "tag name eof #2",
+ "a<b",
+ "a",
+ },
+ {
+ "tag name eof #3",
+ "<a><b",
+ "<a>",
+ },
+ {
+ "tag name eof #4",
+ `<a x`,
+ `<a x="">`,
+ },
// Some malformed tags that are missing a '>'.
{
"malformed tag #0",
},
{
"Attributes with a solitary single quote",
- "<p id=can't><p id=won't>",
- "<p id=\"can't\">$<p id=\"won't\">",
+ `<p id=can't><p id=won't>`,
+ `<p id="can't">$<p id="won't">`,
},
}
for _, tt := range tokenTests {
z := NewTokenizer(bytes.NewBuffer([]byte(tt.html)))
z.ReturnComments = true
- for i, s := range strings.Split(tt.golden, "$") {
- if z.Next() == ErrorToken {
- t.Errorf("%s token %d: want %q got error %v", tt.desc, i, s, z.Error())
- continue loop
- }
- actual := z.Token().String()
- if s != actual {
- t.Errorf("%s token %d: want %q got %q", tt.desc, i, s, actual)
- continue loop
+ if tt.golden != "" {
+ for i, s := range strings.Split(tt.golden, "$") {
+ if z.Next() == ErrorToken {
+ t.Errorf("%s token %d: want %q got error %v", tt.desc, i, s, z.Error())
+ continue loop
+ }
+ actual := z.Token().String()
+ if s != actual {
+ t.Errorf("%s token %d: want %q got %q", tt.desc, i, s, actual)
+ continue loop
+ }
}
}
z.Next()