ErrorCount int // number of errors encountered
}
+const bom = 0xFEFF // byte order mark (U+FEFF); only permitted as the very first character of the source
+
// Read the next Unicode char into s.ch.
// s.ch < 0 means end-of-file.
//
r, w = utf8.DecodeRune(s.src[s.rdOffset:])
if r == utf8.RuneError && w == 1 {
s.error(s.offset, "illegal UTF-8 encoding")
+ } else if r == bom && s.offset > 0 {
+ s.error(s.offset, "illegal byte order mark")
}
}
s.rdOffset += w
s.ErrorCount = 0
s.next()
- if s.ch == '\uFEFF' {
- s.next() // ignore BOM
+ if s.ch == bom {
+ s.next() // ignore BOM at file beginning
}
}
case '|':
tok = s.switch3(token.OR, token.OR_ASSIGN, '|', token.LOR)
default:
- s.error(s.file.Offset(pos), fmt.Sprintf("illegal character %#U", ch))
+ // next reports unexpected BOMs - don't repeat
+ if ch != bom {
+ s.error(s.file.Offset(pos), fmt.Sprintf("illegal character %#U", ch))
+ }
insertSemi = s.insertSemi // preserve insertSemi info
tok = token.ILLEGAL
lit = string(ch)
{"0X", token.INT, 0, "illegal hexadecimal number"},
{"\"abc\x00def\"", token.STRING, 4, "illegal character NUL"},
{"\"abc\x80def\"", token.STRING, 4, "illegal UTF-8 encoding"},
- {"\ufeff\ufeff", token.ILLEGAL, 3, "illegal character U+FEFF"}, // only first BOM is ignored
+ {"\ufeff\ufeff", token.ILLEGAL, 3, "illegal byte order mark"}, // only first BOM is ignored
+ {"//\ufeff", token.COMMENT, 2, "illegal byte order mark"}, // only first BOM is ignored
+ {"'\ufeff" + `'`, token.CHAR, 1, "illegal byte order mark"}, // only first BOM is ignored
+ {`"` + "abc\ufeffdef" + `"`, token.STRING, 4, "illegal byte order mark"}, // only first BOM is ignored
}
func TestScanErrors(t *testing.T) {