// unescapeEntity reads an entity like "<" from b[src:] and writes the
// corresponding "<" to b[dst:], returning the incremented dst and src cursors.
// Precondition: b[src] == '&' && dst <= src.
-func unescapeEntity(b []byte, dst, src int) (dst1, src1 int) {
+// attribute should be true if parsing an attribute value.
+func unescapeEntity(b []byte, dst, src int, attribute bool) (dst1, src1 int) {
// http://www.whatwg.org/specs/web-apps/current-work/multipage/tokenization.html#consume-a-character-reference
// i starts at 1 because we already know that s[0] == '&'.
// Consume the maximum number of characters possible, with the
// consumed characters matching one of the named references.
- // TODO(nigeltao): unescape("¬it;") should be "¬it;"
for i < len(s) {
c := s[i]
i++
// Lower-cased characters are more common in entities, so we check for them first.
- if 'a' <= c && c <= 'z' || 'A' <= c && c <= 'Z' {
+ if 'a' <= c && c <= 'z' || 'A' <= c && c <= 'Z' || '0' <= c && c <= '9' {
continue
}
if c != ';' {
}
entityName := string(s[1:i])
- if x := entity[entityName]; x != 0 {
+ if entityName == "" {
+ // No-op.
+ } else if attribute && entityName[len(entityName)-1] != ';' && len(s) > i && s[i] == '=' {
+ // No-op.
+ } else if x := entity[entityName]; x != 0 {
return dst + utf8.EncodeRune(b[dst:], x), src + i
- } else if x := entity2[entityName]; x[0] != 0 { // Check if it's a two-character entity.
+ } else if x := entity2[entityName]; x[0] != 0 {
dst1 := dst + utf8.EncodeRune(b[dst:], x[0])
return dst1 + utf8.EncodeRune(b[dst1:], x[1]), src + i
+ } else if !attribute {
+ maxLen := len(entityName) - 1
+ if maxLen > longestEntityWithoutSemicolon {
+ maxLen = longestEntityWithoutSemicolon
+ }
+ for j := maxLen; j > 1; j-- {
+ if x := entity[entityName[:j]]; x != 0 {
+ return dst + utf8.EncodeRune(b[dst:], x), src + j + 1
+ }
+ }
}
dst1, src1 = dst+i, src+i
func unescape(b []byte) []byte {
for i, c := range b {
if c == '&' {
- dst, src := unescapeEntity(b, i, i)
+ dst, src := unescapeEntity(b, i, i, false)
for src < len(b) {
c := b[src]
if c == '&' {
- dst, src = unescapeEntity(b, dst, src)
+ dst, src = unescapeEntity(b, dst, src, false)
} else {
b[dst] = c
dst, src = dst+1, src+1