Cypherpunks repositories - gostls13.git/commitdiff
go/scanner: report specific error for UCS-2 encoded files
author Alan Donovan <adonovan@google.com>
Tue, 25 Feb 2025 20:25:56 +0000 (15:25 -0500)
committer Gopher Robot <gobot@golang.org>
Wed, 26 Feb 2025 03:23:05 +0000 (19:23 -0800)
Windows text files may be encoded as UCS-2 (i.e. 2-byte UTF-16).
This CL causes the scanner to emit a better error when it reads
a file in this encoding.

+ test

Fixes #71950

Change-Id: Ia65bbf9a60e36984b0f3e4865591aa6978d2bde2
Reviewed-on: https://go-review.googlesource.com/c/go/+/652515
Reviewed-by: Rob Pike <r@golang.org>
Reviewed-by: Robert Griesemer <gri@google.com>
LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com>
Auto-Submit: Alan Donovan <adonovan@google.com>
Commit-Queue: Alan Donovan <adonovan@google.com>

src/go/scanner/scanner.go
src/go/scanner/scanner_test.go

index 8ca74667fc42a462449a00ac57b1220f5bd487a1..153252b5cc3ebdd8c159c6d723f4a3e5e13435ef 100644 (file)
@@ -71,7 +71,17 @@ func (s *Scanner) next() {
                        // not ASCII
                        r, w = utf8.DecodeRune(s.src[s.rdOffset:])
                        if r == utf8.RuneError && w == 1 {
-                               s.error(s.offset, "illegal UTF-8 encoding")
+                               in := s.src[s.rdOffset:]
+                               if s.offset == 0 &&
+                                       len(in) >= 2 &&
+                                       (in[0] == 0xFF && in[1] == 0xFE || in[0] == 0xFE && in[1] == 0xFF) {
+                                       // U+FEFF BOM at start of file, encoded as big- or little-endian
+                                       // UCS-2 (i.e. 2-byte UTF-16). Give specific error (go.dev/issue/71950).
+                                       s.error(s.offset, "illegal UTF-8 encoding (got UTF-16)")
+                                       s.rdOffset += len(in) // consume all input to avoid error cascade
+                               } else {
+                                       s.error(s.offset, "illegal UTF-8 encoding")
+                               }
                        } else if r == bom && s.offset > 0 {
                                s.error(s.offset, "illegal byte order mark")
                        }
index 916a40a8744ed77513bd112b2ea044bff7fb2b3c..98036bea4e0969cd2ffac57487bd0144b8257a7c 100644 (file)
@@ -5,10 +5,12 @@
 package scanner
 
 import (
+       "fmt"
        "go/token"
        "os"
        "path/filepath"
        "runtime"
+       "slices"
        "strings"
        "testing"
 )
@@ -822,6 +824,33 @@ func TestScanErrors(t *testing.T) {
        }
 }
 
+func TestUTF16(t *testing.T) {
+       // Check the errors reported for BOM-prefixed UTF-16 input. This lives outside
+       // TestScanErrors because the latter assumes that there was only one error.
+       for _, src := range []string{
+               "\xfe\xff\x00p\x00a\x00c\x00k\x00a\x00g\x00e\x00 \x00p", // BOM + "package p" encoded as UTF-16 BE
+               "\xff\xfep\x00a\x00c\x00k\x00a\x00g\x00e\x00 \x00p\x00", // BOM + "package p" encoded as UTF-16 LE
+       } {
+               var got []string
+               eh := func(posn token.Position, msg string) {
+                       got = append(got, fmt.Sprintf("#%d: %s", posn.Offset, msg))
+               }
+               var sc Scanner
+               sc.Init(fset.AddFile("", fset.Base(), len(src)), []byte(src), eh, 0)
+               sc.Scan() // only one Scan call; eh records every error it reports
+
+               // We expect two errors, both reported at offset 0:
+               // one from the decoder, one from the scanner.
+               want := []string{
+                       "#0: illegal UTF-8 encoding (got UTF-16)",
+                       "#0: illegal character U+FFFD '�'",
+               }
+               if !slices.Equal(got, want) {
+                       t.Errorf("Scan(%q) returned errors %q, want %q", src, got, want)
+               }
+       }
+}
+
 // Verify that no comments show up as literal values when skipping comments.
 func TestIssue10213(t *testing.T) {
        const src = `