import (
"internal/syscall/windows"
"syscall"
- "unicode/utf16"
"unsafe"
)
}
entry := unsafe.Slice(blockp, (uintptr(end)-uintptr(unsafe.Pointer(blockp)))/2)
- env = append(env, string(utf16.Decode(entry)))
+ env = append(env, syscall.UTF16ToString(entry))
blockp = (*uint16)(unsafe.Add(end, size))
}
return
from := 0
for i, c := range p {
if c == 0 {
- val = append(val, string(utf16.Decode(p[from:i])))
+ val = append(val, syscall.UTF16ToString(p[from:i]))
from = i + 1
}
}
import (
"sync"
"syscall"
- "unicode/utf16"
"unsafe"
)
if p == nil {
return ""
}
- // Find NUL terminator.
end := unsafe.Pointer(p)
n := 0
for *(*uint16)(end) != 0 {
end = unsafe.Pointer(uintptr(end) + unsafe.Sizeof(*p))
n++
}
- // Turn *uint16 into []uint16.
- s := unsafe.Slice(p, n)
- // Decode []uint16 into string.
- return string(utf16.Decode(s))
+ return syscall.UTF16ToString(unsafe.Slice(p, n))
}
const (
"runtime"
"sync"
"syscall"
- "unicode/utf16"
"unsafe"
)
d.bufp = 0
}
nameslice := unsafe.Slice(&info.FileName[0], info.FileNameLength/2)
- name := string(utf16.Decode(nameslice))
+ name := syscall.UTF16ToString(nameslice)
if name == "." || name == ".." { // Useless names
continue
}
import (
"os"
"syscall"
- "unicode/utf16"
"unsafe"
)
if n == 0 {
return "", err
}
- return string(utf16.Decode(b[0:n])), nil
+ return syscall.UTF16ToString(b[0:n]), nil
}
func main() {
"runtime"
"sync"
"syscall"
- "unicode/utf16"
"unsafe"
)
// Otherwise remove terminating \.
n--
}
- return string(utf16.Decode(b[:n]))
+ return syscall.UTF16ToString(b[:n])
}
}
"path/filepath"
"reflect"
"runtime"
+ "slices"
"sort"
"strings"
"syscall"
t.Errorf("exec.LookPath(%q) = %q; want %q", pythonPath, p, pythonPath)
}
}
+
+// TestIllformedUTF16FileName checks that a file whose UTF-16 name contains
+// unpaired surrogates can be stat'ed, listed and removed through package os.
+func TestIllformedUTF16FileName(t *testing.T) {
+	const sep = string(os.PathSeparator)
+	dir := t.TempDir()
+	if !strings.HasSuffix(dir, sep) {
+		dir += sep
+	}
+
+	// Elements [1:5] are low surrogates with no preceding high surrogate,
+	// which makes this NUL-terminated UTF-16 file name ill-formed.
+	namew := []uint16{0x2e, 0xdc6d, 0xdc73, 0xdc79, 0xdc73, 0x30, 0x30, 0x30, 0x31, 0}
+
+	// Call syscall.CreateFile directly rather than os.Create so that the
+	// name never passes through syscall.UTF16FromString, simulating a file
+	// created by a non-Go program.
+	pathw := append(utf16.Encode([]rune(dir)), namew...)
+	h, err := syscall.CreateFile(&pathw[0], syscall.GENERIC_ALL, 0, nil, syscall.CREATE_NEW, 0, 0)
+	if err != nil {
+		t.Fatal(err)
+	}
+	syscall.CloseHandle(h)
+
+	name := syscall.UTF16ToString(namew)
+
+	// os.Lstat must be able to query the file and report the same name.
+	fi, err := os.Lstat(filepath.Join(dir, name))
+	if err != nil {
+		t.Fatal(err)
+	}
+	if got := fi.Name(); got != name {
+		t.Errorf("got %q, want %q", got, name)
+	}
+
+	// File.Readdirnames must list the file.
+	d, err := os.Open(dir)
+	if err != nil {
+		t.Fatal(err)
+	}
+	names, err := d.Readdirnames(0)
+	d.Close()
+	if err != nil {
+		t.Fatal(err)
+	}
+	if !slices.Contains(names, name) {
+		t.Error("file not listed")
+	}
+
+	// os.RemoveAll must remove the directory, and must not hang doing so.
+	if err := os.RemoveAll(dir); err != nil {
+		t.Error(err)
+	}
+}
+
+// TestUTF16Alloc verifies that syscall.UTF16ToString and
+// syscall.UTF16FromString each perform exactly one allocation
+// when converting a short ASCII value.
+func TestUTF16Alloc(t *testing.T) {
+	// checkAllocs fails the test when f does not allocate exactly want times.
+	checkAllocs := func(want int, f func()) {
+		t.Helper()
+		if got := int(testing.AllocsPerRun(5, f)); got != want {
+			t.Errorf("got %d allocs, want %d", got, want)
+		}
+	}
+	checkAllocs(1, func() {
+		syscall.UTF16ToString([]uint16{'a', 'b', 'c'})
+	})
+	checkAllocs(1, func() {
+		syscall.UTF16FromString("abc")
+	})
+}
package syscall
import (
- "unicode/utf16"
"unsafe"
)
return "", false
}
if n <= uint32(len(b)) {
- return string(utf16.Decode(b[:n])), true
+ return UTF16ToString(b[:n]), true
}
}
}
}
entry := unsafe.Slice(envp, (uintptr(end)-uintptr(unsafe.Pointer(envp)))/size)
- r = append(r, string(utf16.Decode(entry)))
+ r = append(r, UTF16ToString(entry))
envp = (*uint16)(unsafe.Add(end, size))
}
return r
var DeleteProcThreadAttributeList = deleteProcThreadAttributeList
const PROC_THREAD_ATTRIBUTE_HANDLE_LIST = _PROC_THREAD_ATTRIBUTE_HANDLE_LIST
+
+var EncodeWTF16 = encodeWTF16 // alias of the unexported encodeWTF16; referenced as syscall.EncodeWTF16 by the syscall_test tests
+var DecodeWTF16 = decodeWTF16 // alias of the unexported decodeWTF16; referenced as syscall.DecodeWTF16 by the syscall_test tests
"internal/race"
"runtime"
"sync"
- "unicode/utf16"
"unsafe"
)
// UTF16FromString returns the UTF-16 encoding of the UTF-8 string
// s, with a terminating NUL added. If s contains a NUL byte at any
-// location, it returns (nil, EINVAL).
+// location, it returns (nil, EINVAL). Unpaired surrogates
+// are encoded using WTF-8.
func UTF16FromString(s string) ([]uint16, error) {
if bytealg.IndexByteString(s, 0) != -1 {
return nil, EINVAL
// equal than the number of UTF-16 code units.
// Also account for the terminating NUL character.
buf := make([]uint16, 0, len(s)+1)
- for _, r := range s {
- buf = utf16.AppendRune(buf, r)
- }
- return utf16.AppendRune(buf, '\x00'), nil
+ buf = encodeWTF16(s, buf)
+ return append(buf, 0), nil
}
// UTF16ToString returns the UTF-8 encoding of the UTF-16 sequence s,
-// with a terminating NUL removed.
+// with a terminating NUL removed. Unpaired surrogates are decoded
+// using WTF-8 instead of UTF-8 encoding.
func UTF16ToString(s []uint16) string {
+ maxLen := 0 // upper bound on the encoded byte length; used as the capacity of the output buffer
for i, v := range s {
if v == 0 {
s = s[0:i]
break
}
+ switch {
+ case v <= rune1Max:
+ maxLen += 1
+ case v <= rune2Max:
+ maxLen += 2
+ default:
+ // v is a non-surrogate that decodes to 3 bytes,
+ // or is an unpaired surrogate (also 3 bytes in WTF-8),
+ // or is one half of a valid surrogate pair.
+ // If it is half of a pair, we will add 3 for the second surrogate
+ // (total of 6) and overestimate by 2 bytes for the pair,
+ // since the resulting rune only requires 4 bytes.
+ maxLen += 3
+ }
}
- return string(utf16.Decode(s))
+ buf := decodeWTF16(s, make([]byte, 0, maxLen))
+ return unsafe.String(unsafe.SliceData(buf), len(buf)) // buf is not used after this point, so the string reuses its bytes instead of copying them
}
// utf16PtrToString is like UTF16ToString, but takes *uint16
if p == nil {
return ""
}
- // Find NUL terminator.
end := unsafe.Pointer(p)
n := 0
for *(*uint16)(end) != 0 {
end = unsafe.Pointer(uintptr(end) + unsafe.Sizeof(*p))
n++
}
- // Turn *uint16 into []uint16.
- s := unsafe.Slice(p, n)
- // Decode []uint16 into string.
- return string(utf16.Decode(s))
+ return UTF16ToString(unsafe.Slice(p, n))
}
// StringToUTF16Ptr returns pointer to the UTF-16 encoding of
// UTF16PtrFromString returns pointer to the UTF-16 encoding of
// the UTF-8 string s, with a terminating NUL added. If s
// contains a NUL byte at any location, it returns (nil, EINVAL).
+// Unpaired surrogates are encoded using WTF-8.
func UTF16PtrFromString(s string) (*uint16, error) {
a, err := UTF16FromString(s)
if err != nil {
// trim terminating \r and \n
for ; n > 0 && (b[n-1] == '\n' || b[n-1] == '\r'); n-- {
}
- return string(utf16.Decode(b[:n]))
+ return UTF16ToString(b[:n])
}
const (
if e != nil {
return "", e
}
- return string(utf16.Decode(b[0:n])), nil
+ return UTF16ToString(b[0:n]), nil
}
func Chdir(path string) (err error) {
}
func ComputerName() (name string, err error) {
- var n uint32 = MAX_COMPUTERNAME_LENGTH + 1
- b := make([]uint16, n)
+ b := make([]uint16, MAX_COMPUTERNAME_LENGTH+1)
+ var n uint32
e := GetComputerName(&b[0], &n)
if e != nil {
return "", e
}
- return string(utf16.Decode(b[0:n])), nil
+ return UTF16ToString(b[:n]), nil
}
func Ftruncate(fd Handle, length int64) (err error) {
--- /dev/null
+// Copyright 2023 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// Windows UTF-16 strings can contain unpaired surrogates, which can't be
+// decoded into a valid UTF-8 string. This file defines a set of functions
+// that can be used to encode and decode potentially ill-formed UTF-16 strings
+// by using the [WTF-8 encoding](https://simonsapin.github.io/wtf-8/).
+//
+// WTF-8 is a strict superset of UTF-8, i.e. any string that is
+// well-formed in UTF-8 is also well-formed in WTF-8 and the content
+// is unchanged. Also, the conversion never fails and is lossless.
+//
+// The benefit of using WTF-8 instead of UTF-8 when decoding a UTF-16 string
+// is that the conversion is lossless even for ill-formed UTF-16 strings.
+// This property makes it possible to read an ill-formed UTF-16 string,
+// convert it to a Go string, and convert it back to the same original
+// UTF-16 string.
+//
+// See go.dev/issues/59971 for more info.
+
+package syscall
+
+import (
+ "unicode/utf16"
+ "unicode/utf8"
+)
+
+const (
+ surr1 = 0xd800 // lowest surrogate; decodeWTF16 treats [surr1, surr2) as the first half of a pair
+ surr2 = 0xdc00 // decodeWTF16 treats [surr2, surr3) as the second half of a pair
+ surr3 = 0xe000 // first code unit above the surrogate range
+
+ tx = 0b10000000 // marker bits OR'ed into the 2nd and 3rd bytes of the 3-byte form
+ t3 = 0b11100000 // marker bits OR'ed into the 1st byte of the 3-byte form
+ maskx = 0b00111111 // payload bits (low 6) of a 2nd or 3rd byte
+ mask3 = 0b00001111 // payload bits (low 4) of the 1st byte of the 3-byte form
+
+ rune1Max = 1<<7 - 1 // UTF16ToString counts code units up to this value as 1 output byte
+ rune2Max = 1<<11 - 1 // UTF16ToString counts code units up to this value as 2 output bytes
+)
+
+// encodeWTF16 appends the potentially ill-formed UTF-16 encoding of the
+// WTF-8 string s to buf and returns the extended slice.
+//
+// Sequences accepted by utf8.DecodeRuneInString are encoded with
+// utf16.AppendRune. A three-byte sequence of the form ED A0..BF 80..BF,
+// which is how WTF-8 spells a surrogate, becomes that single surrogate
+// code unit. Any other undecodable byte is passed to utf16.AppendRune
+// as utf8.RuneError.
+func encodeWTF16(s string, buf []uint16) []uint16 {
+	for len(s) > 0 {
+		// Decode by hand: 'for range s' only understands valid UTF-8
+		// and would not expose the bytes of a WTF-8 surrogate.
+		r, size := utf8.DecodeRuneInString(s)
+		if r == utf8.RuneError && len(s) >= 3 &&
+			s[0] == 0xED && 0xA0 <= s[1] && s[1] <= 0xBF && 0x80 <= s[2] && s[2] <= 0xBF {
+			// s starts with a WTF-8 encoded surrogate: rebuild the 16-bit
+			// code unit from the payload bits of its three bytes.
+			unit := uint16(s[0]&mask3)<<12 | uint16(s[1]&maskx)<<6 | uint16(s[2]&maskx)
+			buf = append(buf, unit)
+			s = s[3:]
+			continue
+		}
+		buf = utf16.AppendRune(buf, r)
+		s = s[size:]
+	}
+	return buf
+}
+
+// decodeWTF16 returns the WTF-8 encoding of
+// the potentially ill-formed UTF-16 s.
+func decodeWTF16(s []uint16, buf []byte) []byte {
+ for i := 0; i < len(s); i++ {
+ var ar rune
+ switch r := s[i]; {
+ case r < surr1, surr3 <= r:
+ // normal rune
+ ar = rune(r)
+ case surr1 <= r && r < surr2 && i+1 < len(s) &&
+ surr2 <= s[i+1] && s[i+1] < surr3:
+ // valid surrogate sequence
+ ar = utf16.DecodeRune(rune(r), rune(s[i+1]))
+ i++
+ default:
+ // WTF-8 fallback.
+ // This only handles the 3-byte case of utf8.AppendRune,
+ // as surrogates always fall in that case.
+ ar = rune(r)
+ if ar > utf8.MaxRune {
+ ar = utf8.RuneError
+ }
+ buf = append(buf, t3|byte(ar>>12), tx|byte(ar>>6)&maskx, tx|byte(ar)&maskx)
+ continue
+ }
+ buf = utf8.AppendRune(buf, ar)
+ }
+ return buf
+}
--- /dev/null
+// Copyright 2023 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package syscall_test
+
+import (
+ "fmt"
+ "slices"
+ "syscall"
+ "testing"
+ "unicode/utf16"
+ "unicode/utf8"
+ "unsafe"
+)
+
+// wtf8tests pairs WTF-8 strings (str) with the UTF-16 code units (wstr)
+// that syscall.EncodeWTF16 is expected to produce for them.
+var wtf8tests = []struct {
+	str  string
+	wstr []uint16
+}{
+	// 1-byte
+	{str: "\x00", wstr: []uint16{0x00}},
+	{str: "\x5C", wstr: []uint16{0x5C}},
+	{str: "\x7F", wstr: []uint16{0x7F}},
+
+	// 2-byte
+	{str: "\xC2\x80", wstr: []uint16{0x80}},
+	{str: "\xD7\x8A", wstr: []uint16{0x05CA}},
+	{str: "\xDF\xBF", wstr: []uint16{0x07FF}},
+
+	// 3-byte
+	{str: "\xE0\xA0\x80", wstr: []uint16{0x0800}},
+	{str: "\xE2\xB0\xBC", wstr: []uint16{0x2C3C}},
+	{str: "\xEF\xBF\xBF", wstr: []uint16{0xFFFF}},
+
+	// Unmatched surrogate halves.
+	// High surrogates: 0xD800 to 0xDBFF.
+	{str: "\xED\xA0\x80", wstr: []uint16{0xD800}},
+	// High surrogate followed by another high surrogate.
+	{str: "\xED\xA0\x80\xED\xA0\x80", wstr: []uint16{0xD800, 0xD800}},
+	// High surrogate followed by a symbol that is not a surrogate.
+	{str: string([]byte{0xED, 0xA0, 0x80, 0xA}), wstr: []uint16{0xD800, 0xA}},
+	// Unmatched high surrogate, followed by a surrogate pair, followed by an unmatched high surrogate.
+	{str: string([]byte{0xED, 0xA0, 0x80, 0xF0, 0x9D, 0x8C, 0x86, 0xED, 0xA0, 0x80}), wstr: []uint16{0xD800, 0xD834, 0xDF06, 0xD800}},
+	{str: "\xED\xA6\xAF", wstr: []uint16{0xD9AF}},
+	{str: "\xED\xAF\xBF", wstr: []uint16{0xDBFF}},
+
+	// Low surrogates: 0xDC00 to 0xDFFF.
+	{str: "\xED\xB0\x80", wstr: []uint16{0xDC00}},
+	// Low surrogate followed by another low surrogate.
+	{str: "\xED\xB0\x80\xED\xB0\x80", wstr: []uint16{0xDC00, 0xDC00}},
+	// Low surrogate followed by a symbol that is not a surrogate.
+	{str: string([]byte{0xED, 0xB0, 0x80, 0xA}), wstr: []uint16{0xDC00, 0xA}},
+	// Unmatched low surrogate, followed by a surrogate pair, followed by an unmatched low surrogate.
+	{str: string([]byte{0xED, 0xB0, 0x80, 0xF0, 0x9D, 0x8C, 0x86, 0xED, 0xB0, 0x80}), wstr: []uint16{0xDC00, 0xD834, 0xDF06, 0xDC00}},
+	{str: "\xED\xBB\xAE", wstr: []uint16{0xDEEE}},
+	{str: "\xED\xBF\xBF", wstr: []uint16{0xDFFF}},
+
+	// 4-byte
+	{str: "\xF0\x90\x80\x80", wstr: []uint16{0xD800, 0xDC00}},
+	{str: "\xF0\x9D\x8C\x86", wstr: []uint16{0xD834, 0xDF06}},
+	{str: "\xF4\x8F\xBF\xBF", wstr: []uint16{0xDBFF, 0xDFFF}},
+}
+
+// TestWTF16Rountrip checks that every string in wtf8tests is unchanged
+// after being encoded with syscall.EncodeWTF16 and decoded again with
+// syscall.DecodeWTF16.
+func TestWTF16Rountrip(t *testing.T) {
+	for _, tc := range wtf8tests {
+		want := tc.str
+		t.Run(fmt.Sprintf("%X", want), func(t *testing.T) {
+			encoded := syscall.EncodeWTF16(want, nil)
+			if back := string(syscall.DecodeWTF16(encoded, nil)); back != want {
+				t.Errorf("got:\n%s\nwant:\n%s", back, want)
+			}
+		})
+	}
+}
+
+// TestWTF16Golden checks that syscall.EncodeWTF16 produces exactly the
+// UTF-16 code units recorded in wtf8tests for each input string.
+func TestWTF16Golden(t *testing.T) {
+	for _, tc := range wtf8tests {
+		want := tc.wstr
+		t.Run(fmt.Sprintf("%X", tc.str), func(t *testing.T) {
+			if got := syscall.EncodeWTF16(tc.str, nil); !slices.Equal(got, want) {
+				t.Errorf("got:\n%v\nwant:\n%v", got, want)
+			}
+		})
+	}
+}
+
+// FuzzEncodeWTF16 feeds arbitrary strings to syscall.EncodeWTF16, seeded
+// with the strings from wtf8tests.
+func FuzzEncodeWTF16(f *testing.F) {
+	for _, tc := range wtf8tests {
+		f.Add(tc.str)
+	}
+	f.Fuzz(func(t *testing.T, in string) {
+		// Encoding, and decoding the result, must never panic.
+		got := syscall.EncodeWTF16(in, nil)
+		syscall.DecodeWTF16(got, nil)
+		if !utf8.ValidString(in) {
+			return
+		}
+		// For valid UTF-8 input, syscall.EncodeWTF16 must agree with
+		// utf16.Encode.
+		if want := utf16.Encode([]rune(in)); !slices.Equal(got, want) {
+			t.Errorf("got:\n%v\nwant:\n%v", got, want)
+		}
+	})
+}
+
+// FuzzDecodeWTF16 feeds arbitrary UTF-16 sequences to syscall.DecodeWTF16,
+// seeded with the code units from wtf8tests.
+func FuzzDecodeWTF16(f *testing.F) {
+	for _, tc := range wtf8tests {
+		// The corpus stores each UTF-16 sequence as its raw bytes.
+		raw := unsafe.Slice((*uint8)(unsafe.Pointer(unsafe.SliceData(tc.wstr))), len(tc.wstr)*2)
+		f.Add(raw)
+	}
+	f.Fuzz(func(t *testing.T, b []byte) {
+		// Reinterpret the fuzz input as UTF-16 code units; an odd
+		// trailing byte is left out.
+		units := unsafe.Slice((*uint16)(unsafe.Pointer(unsafe.SliceData(b))), len(b)/2)
+		decoded := syscall.DecodeWTF16(units, nil)
+		if utf8.Valid(decoded) {
+			// When the decoded output is valid UTF-8,
+			// syscall.DecodeWTF16 must agree with utf16.Decode.
+			if want := string(utf16.Decode(units)); string(decoded) != want {
+				t.Errorf("got:\n%s\nwant:\n%s", string(decoded), want)
+			}
+		}
+		// WTF-8 should always roundtrip.
+		if back := syscall.EncodeWTF16(string(decoded), nil); !slices.Equal(back, units) {
+			t.Errorf("got:\n%v\nwant:\n%v", back, units)
+		}
+	})
+}