go/scanner: report specific error for UCS-2 encoded files

author Alan Donovan <adonovan@google.com>

Tue, 25 Feb 2025 20:25:56 +0000 (15:25 -0500)

committer Gopher Robot <gobot@golang.org>

Wed, 26 Feb 2025 03:23:05 +0000 (19:23 -0800)
author Alan Donovan <adonovan@google.com>
Tue, 25 Feb 2025 20:25:56 +0000 (15:25 -0500)
committer Gopher Robot <gobot@golang.org>
Wed, 26 Feb 2025 03:23:05 +0000 (19:23 -0800)
diff --git a/src/go/scanner/scanner.go b/src/go/scanner/scanner.go

index 8ca74667fc42a462449a00ac57b1220f5bd487a1..153252b5cc3ebdd8c159c6d723f4a3e5e13435ef 100644 (file)
--- a/src/go/scanner/scanner.go
+++ b/src/go/scanner/scanner.go
@@ -71,7 +71,17 @@ func (s *Scanner) next() {
                         // not ASCII
                         r, w = utf8.DecodeRune(s.src[s.rdOffset:])
                         if r == utf8.RuneError && w == 1 {
-                               s.error(s.offset, "illegal UTF-8 encoding")
+                               in := s.src[s.rdOffset:]
+                               if s.offset == 0 &&
+                                       len(in) >= 2 &&
+                                       (in[0] == 0xFF && in[1] == 0xFE || in[0] == 0xFE && in[1] == 0xFF) {
+                                       // U+FEFF BOM at start of file, encoded as big- or little-endian
+                                       // UCS-2 (i.e. 2-byte UTF-16). Give specific error (go.dev/issue/71950).
+                                       s.error(s.offset, "illegal UTF-8 encoding (got UTF-16)")
+                                       s.rdOffset += len(in) // consume all input to avoid error cascade
+                               } else {
+                                       s.error(s.offset, "illegal UTF-8 encoding")
+                               }
                         } else if r == bom && s.offset > 0 {
                                 s.error(s.offset, "illegal byte order mark")
                         }
diff --git a/src/go/scanner/scanner_test.go b/src/go/scanner/scanner_test.go

index 916a40a8744ed77513bd112b2ea044bff7fb2b3c..98036bea4e0969cd2ffac57487bd0144b8257a7c 100644 (file)
--- a/src/go/scanner/scanner_test.go
+++ b/src/go/scanner/scanner_test.go
@@ -5,10 +5,12 @@
  package scanner
  
  import (
+       "fmt"
         "go/token"
         "os"
         "path/filepath"
         "runtime"
+       "slices"
         "strings"
         "testing"
  )
@@ -822,6 +824,33 @@ func TestScanErrors(t *testing.T) {
         }
  }
  
+func TestUTF16(t *testing.T) {
+       // Checks that a leading UTF-16 BOM yields the "(got UTF-16)" error. Kept out
+       // of TestScanErrors because the latter assumes that there is only one error.
+       for _, src := range []string{
+               "\xfe\xff\x00p\x00a\x00c\x00k\x00a\x00g\x00e\x00 \x00p", // BOM (FE FF) + "package p" encoded as UTF-16 BE
+               "\xff\xfep\x00a\x00c\x00k\x00a\x00g\x00e\x00 \x00p\x00", // BOM (FF FE) + "package p" encoded as UTF-16 LE
+       } {
+               var got []string
+               eh := func(posn token.Position, msg string) { // record each error as "#<offset>: <message>"
+                       got = append(got, fmt.Sprintf("#%d: %s", posn.Offset, msg))
+               }
+               var sc Scanner
+               sc.Init(fset.AddFile("", fset.Base(), len(src)), []byte(src), eh, 0)
+               sc.Scan() // a single Scan call is expected to report both errors
+
+               // We expect exactly two errors, both at offset 0: the specific
+               // UTF-16 report from the decoder, then one from the scanner.
+               want := []string{
+                       "#0: illegal UTF-8 encoding (got UTF-16)",
+                       "#0: illegal character U+FFFD '�'",
+               }
+               if !slices.Equal(got, want) {
+                       t.Errorf("Scan(%q) returned errors %q, want %q", src, got, want)
+               }
+       }
+}
+
  // Verify that no comments show up as literal values when skipping comments.
  func TestIssue10213(t *testing.T) {
         const src = `
author	Alan Donovan <adonovan@google.com>
	Tue, 25 Feb 2025 20:25:56 +0000 (15:25 -0500)
committer	Gopher Robot <gobot@golang.org>
	Wed, 26 Feb 2025 03:23:05 +0000 (19:23 -0800)
src/go/scanner/scanner.go		patch | blob | history
src/go/scanner/scanner_test.go		patch | blob | history