return k
}
-// lower finds the largest alphabetic [0-9A-Za-z]* word at the start of z.buf[i:]
-// and returns that word lower-cased, as well as the trimmed cursor location
-// after that word.
-func (z *Tokenizer) lower(i int) ([]byte, int) {
+// word finds the longest alphanumeric [0-9A-Za-z]* word at the start
+// of z.buf[i:] and returns that word (optionally lower-cased), as
+// well as the trimmed cursor location after that word.
+func (z *Tokenizer) word(i int, lower bool) ([]byte, int) {
i0 := i
loop:
for ; i < z.p1; i++ {
case '0' <= c && c <= '9':
// No-op.
case 'A' <= c && c <= 'Z':
- z.buf[i] = c + 'a' - 'A'
+ if lower {
+ z.buf[i] = c + 'a' - 'A'
+ }
case 'a' <= c && c <= 'z':
// No-op.
default:
if z.buf[i] == '/' {
i++
}
- name, z.p0 = z.lower(i)
+ name, z.p0 = z.word(i, true)
hasAttr = z.p0 != z.p1
return
}
// attribute for the current tag token and whether there are more attributes.
// The contents of the returned slices may change on the next call to Next.
func (z *Tokenizer) TagAttr() (key, val []byte, moreAttr bool) {
- key, i := z.lower(z.p0)
- // Get past the "=\"".
- if i == z.p1 || z.buf[i] != '=' {
+ key, i := z.word(z.p0, true)
+ // Check for an empty attribute value.
+ if i == z.p1 {
+ z.p0 = i
+ return
+ }
+ // Get past the equals and quote characters.
+ if z.buf[i] != '=' {
+ z.p0, moreAttr = i, true
return
}
i = z.trim(i + 1)
- if i == z.p1 || z.buf[i] != '"' {
+ if i == z.p1 {
+ z.p0 = i
+ return
+ }
+ closeQuote := z.buf[i]
+ if closeQuote != '\'' && closeQuote != '"' {
+ val, z.p0 = z.word(i, false)
+ moreAttr = z.p0 != z.p1
return
}
i = z.trim(i + 1)
- // Copy and unescape everything up to the closing '"'.
+ // Copy and unescape everything up to the closing quote.
dst, src := i, i
loop:
for src < z.p1 {
c := z.buf[src]
switch c {
- case '"':
+ case closeQuote:
src++
break loop
case '&':
`<a b="c&noSuchEntity;d"><&alsoDoesntExist;&`,
`<a b="c&noSuchEntity;d">$<&alsoDoesntExist;&`,
},
+
+ // Attribute tests:
+ // http://dev.w3.org/html5/spec/Overview.html#attributes-0
+ {
+ "Empty attribute",
+ `<input disabled FOO>`,
+ `<input disabled="" foo="">`,
+ },
+ {
+ "Empty attribute, whitespace",
+ `<input disabled FOO >`,
+ `<input disabled="" foo="">`,
+ },
+ {
+ "Unquoted attribute value",
+ `<input value=yes FOO=BAR>`,
+ `<input value="yes" foo="BAR">`,
+ },
+ {
+ "Unquoted attribute value, trailing space",
+ `<input value=yes FOO=BAR >`,
+ `<input value="yes" foo="BAR">`,
+ },
+ {
+ "Single-quoted attribute value",
+ `<input value='yes' FOO='BAR'>`,
+ `<input value="yes" foo="BAR">`,
+ },
+ {
+ "Single-quoted attribute value, trailing space",
+ `<input value='yes' FOO='BAR' >`,
+ `<input value="yes" foo="BAR">`,
+ },
+ {
+ "Double-quoted attribute value",
+ `<input value="I'm an attribute" FOO="BAR">`,
+ `<input value="I'm an attribute" foo="BAR">`,
+ },
}
func TestTokenizer(t *testing.T) {