From 974236bda9b9aad87b4b10ec9af2cc01b14e382f Mon Sep 17 00:00:00 2001 From: qmuntal Date: Fri, 5 May 2023 18:17:18 +0200 Subject: [PATCH] os, syscall: support ill-formed UTF-16 strings on Windows Windows UTF-16 strings can contain unpaired surrogates, which can't be decoded into a valid UTF-8 string. This file defines a set of functions that can be used to encode and decode potentially ill-formed UTF-16 strings by using the [the WTF-8 encoding](https://simonsapin.github.io/wtf-8/). WTF-8 is a strict superset of UTF-8, i.e. any string that is well-formed in UTF-8 is also well-formed in WTF-8 and the content is unchanged. Also, the conversion never fails and is lossless. The benefit of using WTF-8 instead of UTF-8 when decoding a UTF-16 string is that the conversion is lossless even for ill-formed UTF-16 strings. This property allows to read an ill-formed UTF-16 string, convert it to a Go string, and convert it back to the same original UTF-16 string. Fixes #59971 Change-Id: Id6007f6e537844913402b233e73d698688cd5ba6 Reviewed-on: https://go-review.googlesource.com/c/go/+/493036 TryBot-Result: Gopher Robot Reviewed-by: Bryan Mills Run-TryBot: Quim Muntal Reviewed-by: Cherry Mui Reviewed-by: Paul Hampson --- .../syscall/execenv/execenv_windows.go | 3 +- .../syscall/windows/registry/value.go | 2 +- .../syscall/windows/syscall_windows.go | 7 +- src/os/dir_windows.go | 3 +- src/os/exec/lp_windows_test.go | 3 +- src/os/file_windows.go | 3 +- src/os/os_windows_test.go | 69 ++++++ src/syscall/env_windows.go | 5 +- src/syscall/export_windows_test.go | 3 + src/syscall/syscall_windows.go | 48 +++-- src/syscall/wtf8_windows.go | 92 ++++++++ src/syscall/wtf8_windows_test.go | 200 ++++++++++++++++++ 12 files changed, 402 insertions(+), 36 deletions(-) create mode 100644 src/syscall/wtf8_windows.go create mode 100644 src/syscall/wtf8_windows_test.go diff --git a/src/internal/syscall/execenv/execenv_windows.go b/src/internal/syscall/execenv/execenv_windows.go index 46ba12efc5..2a89ed1f58 100644 --- a/src/internal/syscall/execenv/execenv_windows.go +++ b/src/internal/syscall/execenv/execenv_windows.go @@ -9,7 +9,6 @@ package execenv import ( "internal/syscall/windows" "syscall" - "unicode/utf16" "unsafe" ) @@ -41,7 +40,7 @@ func Default(sys *syscall.SysProcAttr) (env []string, err error) { } entry := unsafe.Slice(blockp, (uintptr(end)-uintptr(unsafe.Pointer(blockp)))/2) - env = append(env, string(utf16.Decode(entry))) + env = append(env, syscall.UTF16ToString(entry)) blockp = (*uint16)(unsafe.Add(end, size)) } return diff --git a/src/internal/syscall/windows/registry/value.go b/src/internal/syscall/windows/registry/value.go index 025574015f..7dfee0330f 100644 --- a/src/internal/syscall/windows/registry/value.go +++ b/src/internal/syscall/windows/registry/value.go @@ -217,7 +217,7 @@ func (k Key) GetStringsValue(name string) (val []string, valtype uint32, err err from := 0 for i, c := range p { if c == 0 { - val = append(val, string(utf16.Decode(p[from:i]))) + val = append(val, syscall.UTF16ToString(p[from:i])) from = i + 1 } } diff --git a/src/internal/syscall/windows/syscall_windows.go b/src/internal/syscall/windows/syscall_windows.go index cfe4695258..53d32a14a0 100644 --- a/src/internal/syscall/windows/syscall_windows.go +++ b/src/internal/syscall/windows/syscall_windows.go @@ -7,7 +7,6 @@ package windows import ( "sync" "syscall" - "unicode/utf16" "unsafe" ) @@ -17,17 +16,13 @@ func UTF16PtrToString(p *uint16) string { if p == nil { return "" } - // Find NUL terminator. end := unsafe.Pointer(p) n := 0 for *(*uint16)(end) != 0 { end = unsafe.Pointer(uintptr(end) + unsafe.Sizeof(*p)) n++ } - // Turn *uint16 into []uint16. - s := unsafe.Slice(p, n) - // Decode []uint16 into string. - return string(utf16.Decode(s)) + return syscall.UTF16ToString(unsafe.Slice(p, n)) } const ( diff --git a/src/os/dir_windows.go b/src/os/dir_windows.go index cee05cc729..7792d03040 100644 --- a/src/os/dir_windows.go +++ b/src/os/dir_windows.go @@ -11,7 +11,6 @@ import ( "runtime" "sync" "syscall" - "unicode/utf16" "unsafe" ) @@ -104,7 +103,7 @@ func (file *File) readdir(n int, mode readdirMode) (names []string, dirents []Di d.bufp = 0 } nameslice := unsafe.Slice(&info.FileName[0], info.FileNameLength/2) - name := string(utf16.Decode(nameslice)) + name := syscall.UTF16ToString(nameslice) if name == "." || name == ".." { // Useless names continue } diff --git a/src/os/exec/lp_windows_test.go b/src/os/exec/lp_windows_test.go index 50d522948a..4d85a5f415 100644 --- a/src/os/exec/lp_windows_test.go +++ b/src/os/exec/lp_windows_test.go @@ -587,7 +587,6 @@ package main import ( "os" "syscall" - "unicode/utf16" "unsafe" ) @@ -599,7 +598,7 @@ func getMyName() (string, error) { if n == 0 { return "", err } - return string(utf16.Decode(b[0:n])), nil + return syscall.UTF16ToString(b[0:n]), nil } func main() { diff --git a/src/os/file_windows.go b/src/os/file_windows.go index f5a436e235..37db3f931c 100644 --- a/src/os/file_windows.go +++ b/src/os/file_windows.go @@ -11,7 +11,6 @@ import ( "runtime" "sync" "syscall" - "unicode/utf16" "unsafe" ) @@ -259,7 +258,7 @@ func tempDir() string { // Otherwise remove terminating \. n-- } - return string(utf16.Decode(b[:n])) + return syscall.UTF16ToString(b[:n]) } } diff --git a/src/os/os_windows_test.go b/src/os/os_windows_test.go index 21a8c21d1e..fbc8cc1b9f 100644 --- a/src/os/os_windows_test.go +++ b/src/os/os_windows_test.go @@ -18,6 +18,7 @@ import ( "path/filepath" "reflect" "runtime" + "slices" "sort" "strings" "syscall" @@ -1377,3 +1378,71 @@ func TestAppExecLinkStat(t *testing.T) { t.Errorf("exec.LookPath(%q) = %q; want %q", pythonPath, p, pythonPath) } } + +func TestIllformedUTF16FileName(t *testing.T) { + dir := t.TempDir() + const sep = string(os.PathSeparator) + if !strings.HasSuffix(dir, sep) { + dir += sep + } + + // This UTF-16 file name is ill-formed as it contains low surrogates that are not preceded by high surrogates ([1:5]). + namew := []uint16{0x2e, 0xdc6d, 0xdc73, 0xdc79, 0xdc73, 0x30, 0x30, 0x30, 0x31, 0} + + // Create a file whose name contains unpaired surrogates. + // Use syscall.CreateFile instead of os.Create to simulate a file that is created by + // a non-Go program so the file name hasn't gone through syscall.UTF16FromString. + dirw := utf16.Encode([]rune(dir)) + pathw := append(dirw, namew...) + fd, err := syscall.CreateFile(&pathw[0], syscall.GENERIC_ALL, 0, nil, syscall.CREATE_NEW, 0, 0) + if err != nil { + t.Fatal(err) + } + syscall.CloseHandle(fd) + + name := syscall.UTF16ToString(namew) + path := filepath.Join(dir, name) + // Verify that os.Lstat can query the file. + fi, err := os.Lstat(path) + if err != nil { + t.Fatal(err) + } + if got := fi.Name(); got != name { + t.Errorf("got %q, want %q", got, name) + } + // Verify that File.Readdirnames lists the file. + f, err := os.Open(dir) + if err != nil { + t.Fatal(err) + } + files, err := f.Readdirnames(0) + f.Close() + if err != nil { + t.Fatal(err) + } + if !slices.Contains(files, name) { + t.Error("file not listed") + } + // Verify that os.RemoveAll can remove the directory + // and that it doesn't hang. + err = os.RemoveAll(dir) + if err != nil { + t.Error(err) + } +} + +func TestUTF16Alloc(t *testing.T) { + allowsPerRun := func(want int, f func()) { + t.Helper() + got := int(testing.AllocsPerRun(5, f)) + if got != want { + t.Errorf("got %d allocs, want %d", got, want) + } + } + allowsPerRun(1, func() { + syscall.UTF16ToString([]uint16{'a', 'b', 'c'}) + }) + allowsPerRun(1, func() { + syscall.UTF16FromString("abc") + }) +} diff --git a/src/syscall/env_windows.go b/src/syscall/env_windows.go index 94364f930c..20d74b51e0 100644 --- a/src/syscall/env_windows.go +++ b/src/syscall/env_windows.go @@ -7,7 +7,6 @@ package syscall import ( - "unicode/utf16" "unsafe" ) @@ -24,7 +23,7 @@ func Getenv(key string) (value string, found bool) { return "", false } if n <= uint32(len(b)) { - return string(utf16.Decode(b[:n])), true + return UTF16ToString(b[:n]), true } } } @@ -90,7 +89,7 @@ func Environ() []string { } entry := unsafe.Slice(envp, (uintptr(end)-uintptr(unsafe.Pointer(envp)))/size) - r = append(r, string(utf16.Decode(entry))) + r = append(r, UTF16ToString(entry)) envp = (*uint16)(unsafe.Add(end, size)) } return r diff --git a/src/syscall/export_windows_test.go b/src/syscall/export_windows_test.go index a72a1ee391..eccf1bccac 100644 --- a/src/syscall/export_windows_test.go +++ b/src/syscall/export_windows_test.go @@ -9,3 +9,6 @@ var UpdateProcThreadAttribute = updateProcThreadAttribute var DeleteProcThreadAttributeList = deleteProcThreadAttributeList const PROC_THREAD_ATTRIBUTE_HANDLE_LIST = _PROC_THREAD_ATTRIBUTE_HANDLE_LIST + +var EncodeWTF16 = encodeWTF16 +var DecodeWTF16 = decodeWTF16 diff --git a/src/syscall/syscall_windows.go b/src/syscall/syscall_windows.go index 1f7753663b..c3fa415832 100644 --- a/src/syscall/syscall_windows.go +++ b/src/syscall/syscall_windows.go @@ -14,7 +14,6 @@ import ( "internal/race" "runtime" "sync" - "unicode/utf16" "unsafe" ) @@ -37,7 +36,8 @@ func StringToUTF16(s string) []uint16 { // UTF16FromString returns the UTF-16 encoding of the UTF-8 string // s, with a terminating NUL added. If s contains a NUL byte at any -// location, it returns (nil, EINVAL). +// location, it returns (nil, EINVAL). Unpaired surrogates +// are encoded using WTF-8. func UTF16FromString(s string) ([]uint16, error) { if bytealg.IndexByteString(s, 0) != -1 { return nil, EINVAL @@ -49,22 +49,37 @@ func UTF16FromString(s string) ([]uint16, error) { // equal than the number of UTF-16 code units. // Also account for the terminating NUL character. buf := make([]uint16, 0, len(s)+1) - for _, r := range s { - buf = utf16.AppendRune(buf, r) - } - return utf16.AppendRune(buf, '\x00'), nil + buf = encodeWTF16(s, buf) + return append(buf, 0), nil } // UTF16ToString returns the UTF-8 encoding of the UTF-16 sequence s, -// with a terminating NUL removed. +// with a terminating NUL removed. Unpaired surrogates are decoded +// using WTF-8 instead of UTF-8 encoding. func UTF16ToString(s []uint16) string { + maxLen := 0 for i, v := range s { if v == 0 { s = s[0:i] break } + switch { + case v <= rune1Max: + maxLen += 1 + case v <= rune2Max: + maxLen += 2 + default: + // r is a non-surrogate that decodes to 3 bytes, + // or is an unpaired surrogate (also 3 bytes in WTF-8), + // or is one half of a valid surrogate pair. + // If it is half of a pair, we will add 3 for the second surrogate + // (total of 6) and overestimate by 2 bytes for the pair, + // since the resulting rune only requires 4 bytes. + maxLen += 3 + } } - return string(utf16.Decode(s)) + buf := decodeWTF16(s, make([]byte, 0, maxLen)) + return unsafe.String(unsafe.SliceData(buf), len(buf)) } // utf16PtrToString is like UTF16ToString, but takes *uint16 @@ -73,17 +88,13 @@ func utf16PtrToString(p *uint16) string { if p == nil { return "" } - // Find NUL terminator. end := unsafe.Pointer(p) n := 0 for *(*uint16)(end) != 0 { end = unsafe.Pointer(uintptr(end) + unsafe.Sizeof(*p)) n++ } - // Turn *uint16 into []uint16. - s := unsafe.Slice(p, n) - // Decode []uint16 into string. - return string(utf16.Decode(s)) + return UTF16ToString(unsafe.Slice(p, n)) } // StringToUTF16Ptr returns pointer to the UTF-16 encoding of @@ -97,6 +108,7 @@ func StringToUTF16Ptr(s string) *uint16 { return &StringToUTF16(s)[0] } // UTF16PtrFromString returns pointer to the UTF-16 encoding of // the UTF-8 string s, with a terminating NUL added. If s // contains a NUL byte at any location, it returns (nil, EINVAL). +// Unpaired surrogates are encoded using WTF-8. func UTF16PtrFromString(s string) (*uint16, error) { a, err := UTF16FromString(s) if err != nil { @@ -143,7 +155,7 @@ func (e Errno) Error() string { // trim terminating \r and \n for ; n > 0 && (b[n-1] == '\n' || b[n-1] == '\r'); n-- { } - return string(utf16.Decode(b[:n])) + return UTF16ToString(b[:n]) } const ( @@ -525,7 +537,7 @@ func Getwd() (wd string, err error) { if e != nil { return "", e } - return string(utf16.Decode(b[0:n])), nil + return UTF16ToString(b[0:n]), nil } func Chdir(path string) (err error) { @@ -573,13 +585,13 @@ func Rename(oldpath, newpath string) (err error) { } func ComputerName() (name string, err error) { - var n uint32 = MAX_COMPUTERNAME_LENGTH + 1 - b := make([]uint16, n) + b := make([]uint16, MAX_COMPUTERNAME_LENGTH+1) + var n uint32 e := GetComputerName(&b[0], &n) if e != nil { return "", e } - return string(utf16.Decode(b[0:n])), nil + return UTF16ToString(b[:n]), nil } func Ftruncate(fd Handle, length int64) (err error) { diff --git a/src/syscall/wtf8_windows.go b/src/syscall/wtf8_windows.go new file mode 100644 index 0000000000..f166021b7c --- /dev/null +++ b/src/syscall/wtf8_windows.go @@ -0,0 +1,92 @@ +// Copyright 2023 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +// Windows UTF-16 strings can contain unpaired surrogates, which can't be +// decoded into a valid UTF-8 string. This file defines a set of functions +// that can be used to encode and decode potentially ill-formed UTF-16 strings +// by using the [the WTF-8 encoding](https://simonsapin.github.io/wtf-8/). +// +// WTF-8 is a strict superset of UTF-8, i.e. any string that is +// well-formed in UTF-8 is also well-formed in WTF-8 and the content +// is unchanged. Also, the conversion never fails and is lossless. +// +// The benefit of using WTF-8 instead of UTF-8 when decoding a UTF-16 string +// is that the conversion is lossless even for ill-formed UTF-16 strings. +// This property allows to read an ill-formed UTF-16 string, convert it +// to a Go string, and convert it back to the same original UTF-16 string. +// +// See go.dev/issues/59971 for more info. + +package syscall + +import ( + "unicode/utf16" + "unicode/utf8" +) + +const ( + surr1 = 0xd800 + surr2 = 0xdc00 + surr3 = 0xe000 + + tx = 0b10000000 + t3 = 0b11100000 + maskx = 0b00111111 + mask3 = 0b00001111 + + rune1Max = 1<<7 - 1 + rune2Max = 1<<11 - 1 +) + +// encodeWTF16 returns the potentially ill-formed +// UTF-16 encoding of s. +func encodeWTF16(s string, buf []uint16) []uint16 { + for i := 0; i < len(s); { + // Cannot use 'for range s' because it expects valid + // UTF-8 runes. + r, size := utf8.DecodeRuneInString(s[i:]) + if r == utf8.RuneError { + // Check if s[i:] contains a valid WTF-8 encoded surrogate. + if sc := s[i:]; len(sc) >= 3 && sc[0] == 0xED && 0xA0 <= sc[1] && sc[1] <= 0xBF && 0x80 <= sc[2] && sc[2] <= 0xBF { + r = rune(sc[0]&mask3)<<12 + rune(sc[1]&maskx)<<6 + rune(sc[2]&maskx) + buf = append(buf, uint16(r)) + i += 3 + continue + } + } + i += size + buf = utf16.AppendRune(buf, r) + } + return buf +} + +// decodeWTF16 returns the WTF-8 encoding of +// the potentially ill-formed UTF-16 s. +func decodeWTF16(s []uint16, buf []byte) []byte { + for i := 0; i < len(s); i++ { + var ar rune + switch r := s[i]; { + case r < surr1, surr3 <= r: + // normal rune + ar = rune(r) + case surr1 <= r && r < surr2 && i+1 < len(s) && + surr2 <= s[i+1] && s[i+1] < surr3: + // valid surrogate sequence + ar = utf16.DecodeRune(rune(r), rune(s[i+1])) + i++ + default: + // WTF-8 fallback. + // This only handles the 3-byte case of utf8.AppendRune, + // as surrogates always fall in that case. + ar = rune(r) + if ar > utf8.MaxRune { + ar = utf8.RuneError + } + buf = append(buf, t3|byte(ar>>12), tx|byte(ar>>6)&maskx, tx|byte(ar)&maskx) + continue + } + buf = utf8.AppendRune(buf, ar) + } + return buf +} diff --git a/src/syscall/wtf8_windows_test.go b/src/syscall/wtf8_windows_test.go new file mode 100644 index 0000000000..077f718fd5 --- /dev/null +++ b/src/syscall/wtf8_windows_test.go @@ -0,0 +1,200 @@ +// Copyright 2023 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package syscall_test + +import ( + "fmt" + "slices" + "syscall" + "testing" + "unicode/utf16" + "unicode/utf8" + "unsafe" +) + +var wtf8tests = []struct { + str string + wstr []uint16 +}{ + { + str: "\x00", + wstr: []uint16{0x00}, + }, + { + str: "\x5C", + wstr: []uint16{0x5C}, + }, + { + str: "\x7F", + wstr: []uint16{0x7F}, + }, + + // 2-byte + { + str: "\xC2\x80", + wstr: []uint16{0x80}, + }, + { + str: "\xD7\x8A", + wstr: []uint16{0x05CA}, + }, + { + str: "\xDF\xBF", + wstr: []uint16{0x07FF}, + }, + + // 3-byte + { + str: "\xE0\xA0\x80", + wstr: []uint16{0x0800}, + }, + { + str: "\xE2\xB0\xBC", + wstr: []uint16{0x2C3C}, + }, + { + str: "\xEF\xBF\xBF", + wstr: []uint16{0xFFFF}, + }, + // unmatched surrogate halves + // high surrogates: 0xD800 to 0xDBFF + { + str: "\xED\xA0\x80", + wstr: []uint16{0xD800}, + }, + { + // "High surrogate followed by another high surrogate" + str: "\xED\xA0\x80\xED\xA0\x80", + wstr: []uint16{0xD800, 0xD800}, + }, + { + // "High surrogate followed by a symbol that is not a surrogate" + str: string([]byte{0xED, 0xA0, 0x80, 0xA}), + wstr: []uint16{0xD800, 0xA}, + }, + { + // "Unmatched high surrogate, followed by a surrogate pair, followed by an unmatched high surrogate" + str: string([]byte{0xED, 0xA0, 0x80, 0xF0, 0x9D, 0x8C, 0x86, 0xED, 0xA0, 0x80}), + wstr: []uint16{0xD800, 0xD834, 0xDF06, 0xD800}, + }, + { + str: "\xED\xA6\xAF", + wstr: []uint16{0xD9AF}, + }, + { + str: "\xED\xAF\xBF", + wstr: []uint16{0xDBFF}, + }, + // low surrogates: 0xDC00 to 0xDFFF + { + str: "\xED\xB0\x80", + wstr: []uint16{0xDC00}, + }, + { + // "Low surrogate followed by another low surrogate" + str: "\xED\xB0\x80\xED\xB0\x80", + wstr: []uint16{0xDC00, 0xDC00}, + }, + { + // "Low surrogate followed by a symbol that is not a surrogate" + str: string([]byte{0xED, 0xB0, 0x80, 0xA}), + wstr: []uint16{0xDC00, 0xA}, + }, + { + // "Unmatched low surrogate, followed by a surrogate pair, followed by an unmatched low surrogate" + str: string([]byte{0xED, 0xB0, 0x80, 0xF0, 0x9D, 0x8C, 0x86, 0xED, 0xB0, 0x80}), + wstr: []uint16{0xDC00, 0xD834, 0xDF06, 0xDC00}, + }, + { + str: "\xED\xBB\xAE", + wstr: []uint16{0xDEEE}, + }, + { + str: "\xED\xBF\xBF", + wstr: []uint16{0xDFFF}, + }, + + // 4-byte + { + str: "\xF0\x90\x80\x80", + wstr: []uint16{0xD800, 0xDC00}, + }, + { + str: "\xF0\x9D\x8C\x86", + wstr: []uint16{0xD834, 0xDF06}, + }, + { + str: "\xF4\x8F\xBF\xBF", + wstr: []uint16{0xDBFF, 0xDFFF}, + }, +} + +func TestWTF16Rountrip(t *testing.T) { + for _, tt := range wtf8tests { + t.Run(fmt.Sprintf("%X", tt.str), func(t *testing.T) { + got := syscall.EncodeWTF16(tt.str, nil) + got2 := string(syscall.DecodeWTF16(got, nil)) + if got2 != tt.str { + t.Errorf("got:\n%s\nwant:\n%s", got2, tt.str) + } + }) + } +} + +func TestWTF16Golden(t *testing.T) { + for _, tt := range wtf8tests { + t.Run(fmt.Sprintf("%X", tt.str), func(t *testing.T) { + got := syscall.EncodeWTF16(tt.str, nil) + if !slices.Equal(got, tt.wstr) { + t.Errorf("got:\n%v\nwant:\n%v", got, tt.wstr) + } + }) + } +} + +func FuzzEncodeWTF16(f *testing.F) { + for _, tt := range wtf8tests { + f.Add(tt.str) + } + f.Fuzz(func(t *testing.T, b string) { + // test that there are no panics + got := syscall.EncodeWTF16(b, nil) + syscall.DecodeWTF16(got, nil) + if utf8.ValidString(b) { + // if the input is a valid UTF-8 string, then + // test that syscall.EncodeWTF16 behaves as + // utf16.Encode + want := utf16.Encode([]rune(b)) + if !slices.Equal(got, want) { + t.Errorf("got:\n%v\nwant:\n%v", got, want) + } + } + }) +} + +func FuzzDecodeWTF16(f *testing.F) { + for _, tt := range wtf8tests { + b := unsafe.Slice((*uint8)(unsafe.Pointer(unsafe.SliceData(tt.wstr))), len(tt.wstr)*2) + f.Add(b) + } + f.Fuzz(func(t *testing.T, b []byte) { + u16 := unsafe.Slice((*uint16)(unsafe.Pointer(unsafe.SliceData(b))), len(b)/2) + got := syscall.DecodeWTF16(u16, nil) + if utf8.Valid(got) { + // if the input is a valid UTF-8 string, then + // test that syscall.DecodeWTF16 behaves as + // utf16.Decode + want := utf16.Decode(u16) + if string(got) != string(want) { + t.Errorf("got:\n%s\nwant:\n%s", string(got), string(want)) + } + } + // WTF-8 should always roundtrip + got2 := syscall.EncodeWTF16(string(got), nil) + if !slices.Equal(got2, u16) { + t.Errorf("got:\n%v\nwant:\n%v", got2, u16) + } + }) +} -- 2.48.1