return b
}
-const escapedChars = `&'<>"`
+const escapedChars = "&'<>\"\r"
func escape(w writer, s string) error {
i := strings.IndexAny(s, escapedChars)
case '"':
// "&#34;" is shorter than "&quot;".
esc = """
+ case '\r':
+ esc = " "
default:
panic("unrecognized escape character")
}
FAIL "<svg>\x00a</svg><frameset>"
PASS "<svg><path></path></svg><frameset>"
PASS "<svg><p><frameset>"
-FAIL "<!DOCTYPE html><pre>\r\n\r\nA</pre>"
-FAIL "<!DOCTYPE html><pre>\r\rA</pre>"
+PASS "<!DOCTYPE html><pre>\r\n\r\nA</pre>"
+PASS "<!DOCTYPE html><pre>\r\rA</pre>"
PASS "<!DOCTYPE html><pre>\rA</pre>"
PASS "<!DOCTYPE html><table><tr><td><math><mtext>\x00a"
PASS "<!DOCTYPE html><table><tr><td><svg><foreignObject>\x00a"
return z.buf[z.raw.start:z.raw.end]
}
+// convertNewlines converts "\r" and "\r\n" in s to "\n".
+// The conversion happens in place, but the resulting slice may be shorter.
+//
+// The bytes of s are overwritten, so the caller's slice is modified. The
+// returned slice is either s itself (when no "\r\n" pair was present, so the
+// length is unchanged) or a prefix of s that is one byte shorter for every
+// "\r\n" pair that was collapsed.
+func convertNewlines(s []byte) []byte {
+ for i, c := range s {
+ if c != '\r' {
+ continue
+ }
+
+ // A '\r' that is not followed by '\n' is rewritten where it stands;
+ // nothing has to move, so scanning simply continues.
+ src := i + 1
+ if src >= len(s) || s[src] != '\n' {
+ s[i] = '\n'
+ continue
+ }
+
+ // First "\r\n" found: from here on the output is shorter than the
+ // input. src reads ahead (it starts on the '\n' of the pair) while dst
+ // writes behind it (it starts on the '\r'), compacting the rest of s.
+ dst := i
+ for src < len(s) {
+ if s[src] == '\r' {
+ // Skip the '\r' of a "\r\n" pair so that the pair yields a single
+ // '\n'; a lone '\r' also becomes '\n'.
+ if src+1 < len(s) && s[src+1] == '\n' {
+ src++
+ }
+ s[dst] = '\n'
+ } else {
+ s[dst] = s[src]
+ }
+ src++
+ dst++
+ }
+ // The remainder of s was handled by the inner loop, so return directly
+ // with the length trimmed to what was written.
+ return s[:dst]
+ }
+ // No "\r\n" pair was seen: any lone '\r' bytes were replaced in place.
+ return s
+}
+
// Text returns the unescaped text of a text, comment or doctype token. The
// contents of the returned slice may change on the next call to Next.
func (z *Tokenizer) Text() []byte {
s := z.buf[z.data.start:z.data.end]
z.data.start = z.raw.end
z.data.end = z.raw.end
+ s = convertNewlines(s)
if !z.textIsRaw {
s = unescape(s)
}
z.nAttrReturned++
key = z.buf[x[0].start:x[0].end]
val = z.buf[x[1].start:x[1].end]
- return lower(key), unescape(val), z.nAttrReturned < len(z.attr)
+ return lower(key), unescape(convertNewlines(val)), z.nAttrReturned < len(z.attr)
}
}
return nil, nil, false
}
}
+// TestConvertNewlines checks convertNewlines against a table that maps each
+// input to its expected output: mixed "\r", "\r\n" and "\n" line endings,
+// the empty string, runs of consecutive '\r' bytes, a '\r' at the very end,
+// and input that contains no newline at all.
+func TestConvertNewlines(t *testing.T) {
+ // Keys are inputs, values are the expected converted strings.
+ testCases := map[string]string{
+ "Mac\rDOS\r\nUnix\n": "Mac\nDOS\nUnix\n",
+ "Unix\nMac\rDOS\r\n": "Unix\nMac\nDOS\n",
+ "DOS\r\nDOS\r\nDOS\r\n": "DOS\nDOS\nDOS\n",
+ "": "",
+ "\n": "\n",
+ "\n\r": "\n\n",
+ "\r": "\n",
+ "\r\n": "\n",
+ "\r\n\n": "\n\n",
+ "\r\n\r": "\n\n",
+ "\r\n\r\n": "\n\n",
+ "\r\r": "\n\n",
+ "\r\r\n": "\n\n",
+ "\r\r\n\n": "\n\n\n",
+ "\r\r\r\n": "\n\n\n",
+ "\r \n": "\n \n",
+ "xyz": "xyz",
+ }
+ for in, want := range testCases {
+ // convertNewlines works in place, so each case gets a fresh byte slice.
+ if got := string(convertNewlines([]byte(in))); got != want {
+ t.Errorf("input %q: got %q, want %q", in, got, want)
+ }
+ }
+}
+
const (
rawLevel = iota
lowLevel