// not ASCII
r, w = utf8.DecodeRune(s.src[s.rdOffset:])
if r == utf8.RuneError && w == 1 {
- s.error(s.offset, "illegal UTF-8 encoding")
+ in := s.src[s.rdOffset:]
+ if s.offset == 0 &&
+ len(in) >= 2 &&
+ (in[0] == 0xFF && in[1] == 0xFE || in[0] == 0xFE && in[1] == 0xFF) {
+ // U+FEFF BOM at start of file, encoded as big- or little-endian
+ // UCS-2 (i.e. 2-byte UTF-16). Give specific error (go.dev/issue/71950).
+ s.error(s.offset, "illegal UTF-8 encoding (got UTF-16)")
+ s.rdOffset += len(in) // consume all input to avoid error cascade
+ } else {
+ s.error(s.offset, "illegal UTF-8 encoding")
+ }
} else if r == bom && s.offset > 0 {
s.error(s.offset, "illegal byte order mark")
}
package scanner
import (
+ "fmt"
"go/token"
"os"
"path/filepath"
"runtime"
+ "slices"
"strings"
"testing"
)
}
}
+func TestUTF16(t *testing.T) {
+ // Check that UTF-16 input starting with a BOM gets the specific "(got UTF-16)" error.
+ // This test doesn't fit within TestScanErrors because the latter assumes only one error.
+ for _, src := range []string{
+ "\xfe\xff\x00p\x00a\x00c\x00k\x00a\x00g\x00e\x00 \x00p", // BOM (FE FF) + "package p" encoded as UTF-16 BE
+ "\xff\xfep\x00a\x00c\x00k\x00a\x00g\x00e\x00 \x00p\x00", // BOM (FF FE) + "package p" encoded as UTF-16 LE
+ } {
+ var got []string // error messages in the order they were reported
+ eh := func(posn token.Position, msg string) {
+ got = append(got, fmt.Sprintf("#%d: %s", posn.Offset, msg)) // formatted as "#<byte offset>: <message>"
+ }
+ var sc Scanner
+ sc.Init(fset.AddFile("", fset.Base(), len(src)), []byte(src), eh, 0)
+ sc.Scan()
+
+ // We expect exactly two errors, both reported at offset 0:
+ // the decoder's UTF-16 diagnosis, then the scanner's illegal-character error.
+ want := []string{
+ "#0: illegal UTF-8 encoding (got UTF-16)",
+ "#0: illegal character U+FFFD '�'",
+ }
+ if !slices.Equal(got, want) {
+ t.Errorf("Scan(%q) returned errors %q, want %q", src, got, want)
+ }
+ }
+}
+
// Verify that no comments show up as literal values when skipping comments.
func TestIssue10213(t *testing.T) {
const src = `