return Map(c.ToTitle, s)
}
+// ToValidUTF8 treats s as UTF-8-encoded bytes and returns a copy with each run of bytes
+// representing invalid UTF-8 replaced with the bytes in replacement, which may be empty.
+func ToValidUTF8(s, replacement []byte) []byte {
+ b := make([]byte, 0, len(s)+len(replacement))
+ invalid := false // previous byte was from an invalid UTF-8 sequence
+ for i := 0; i < len(s); {
+ c := s[i]
+ if c < utf8.RuneSelf {
+ i++
+ invalid = false
+ b = append(b, byte(c))
+ continue
+ }
+ _, wid := utf8.DecodeRune(s[i:])
+ if wid == 1 {
+ i++
+ if !invalid {
+ invalid = true
+ b = append(b, replacement...)
+ }
+ continue
+ }
+ invalid = false
+ b = append(b, s[i:i+wid]...)
+ i += wid
+ }
+ return b
+}
+
// isSeparator reports whether the rune could mark a word boundary.
// TODO: update when package unicode captures more of the properties.
func isSeparator(r rune) bool {
}
}
+var toValidUTF8Tests = []struct {
+ in string
+ repl string
+ out string
+}{
+ {"", "\uFFFD", ""},
+ {"abc", "\uFFFD", "abc"},
+ {"\uFDDD", "\uFFFD", "\uFDDD"},
+ {"a\xffb", "\uFFFD", "a\uFFFDb"},
+ {"a\xffb\uFFFD", "X", "aXb\uFFFD"},
+ {"a☺\xffb☺\xC0\xAFc☺\xff", "", "a☺b☺c☺"},
+ {"a☺\xffb☺\xC0\xAFc☺\xff", "日本語", "a☺日本語b☺日本語c☺日本語"},
+ {"\xC0\xAF", "\uFFFD", "\uFFFD"},
+ {"\xE0\x80\xAF", "\uFFFD", "\uFFFD"},
+ {"\xed\xa0\x80", "abc", "abc"},
+ {"\xed\xbf\xbf", "\uFFFD", "\uFFFD"},
+ {"\xF0\x80\x80\xaf", "☺", "☺"},
+ {"\xF8\x80\x80\x80\xAF", "\uFFFD", "\uFFFD"},
+ {"\xFC\x80\x80\x80\x80\xAF", "\uFFFD", "\uFFFD"},
+}
+
+func TestToValidUTF8(t *testing.T) {
+ for _, tc := range toValidUTF8Tests {
+ got := ToValidUTF8([]byte(tc.in), []byte(tc.repl))
+ if !Equal(got, []byte(tc.out)) {
+ t.Errorf("ToValidUTF8(%q, %q) = %q; want %q", tc.in, tc.repl, got, tc.out)
+ }
+ }
+}
+
func TestTrimSpace(t *testing.T) { runStringTests(t, TrimSpace, "TrimSpace", trimSpaceTests) }
type RepeatTest struct {
}
}
+func BenchmarkToValidUTF8(b *testing.B) {
+ tests := []struct {
+ name string
+ input []byte
+ }{
+ {"Valid", []byte("typical")},
+ {"InvalidASCII", []byte("foo\xffbar")},
+ {"InvalidNonASCII", []byte("日本語\xff日本語")},
+ }
+ replacement := []byte("\uFFFD")
+ b.ResetTimer()
+ for _, test := range tests {
+ b.Run(test.name, func(b *testing.B) {
+ for i := 0; i < b.N; i++ {
+ ToValidUTF8(test.input, replacement)
+ }
+ })
+ }
+}
+
func makeBenchInputHard() []byte {
tokens := [...]string{
"<a>", "<p>", "<b>", "<strong>",
return Map(c.ToTitle, s)
}
+// ToValidUTF8 returns a copy of the string s with each run of invalid UTF-8 byte sequences
+// replaced by the replacement string, which may be empty.
+func ToValidUTF8(s, replacement string) string {
+ var b Builder
+
+ for i, c := range s {
+ if c != utf8.RuneError {
+ continue
+ }
+
+ _, wid := utf8.DecodeRuneInString(s[i:])
+ if wid == 1 {
+ b.Grow(len(s) + len(replacement))
+ b.WriteString(s[:i])
+ s = s[i:]
+ break
+ }
+ }
+
+ // Fast path for unchanged input
+ if b.Cap() == 0 { // didn't call b.Grow above
+ return s
+ }
+
+ invalid := false // previous byte was from an invalid UTF-8 sequence
+ for i := 0; i < len(s); {
+ c := s[i]
+ if c < utf8.RuneSelf {
+ i++
+ invalid = false
+ b.WriteByte(c)
+ continue
+ }
+ _, wid := utf8.DecodeRuneInString(s[i:])
+ if wid == 1 {
+ i++
+ if !invalid {
+ invalid = true
+ b.WriteString(replacement)
+ }
+ continue
+ }
+ invalid = false
+ b.WriteString(s[i : i+wid])
+ i += wid
+ }
+
+ return b.String()
+}
+
// isSeparator reports whether the rune could mark a word boundary.
// TODO: update when package unicode captures more of the properties.
func isSeparator(r rune) bool {
func TestToLower(t *testing.T) { runStringTests(t, ToLower, "ToLower", lowerTests) }
+var toValidUTF8Tests = []struct {
+ in string
+ repl string
+ out string
+}{
+ {"", "\uFFFD", ""},
+ {"abc", "\uFFFD", "abc"},
+ {"\uFDDD", "\uFFFD", "\uFDDD"},
+ {"a\xffb", "\uFFFD", "a\uFFFDb"},
+ {"a\xffb\uFFFD", "X", "aXb\uFFFD"},
+ {"a☺\xffb☺\xC0\xAFc☺\xff", "", "a☺b☺c☺"},
+ {"a☺\xffb☺\xC0\xAFc☺\xff", "日本語", "a☺日本語b☺日本語c☺日本語"},
+ {"\xC0\xAF", "\uFFFD", "\uFFFD"},
+ {"\xE0\x80\xAF", "\uFFFD", "\uFFFD"},
+ {"\xed\xa0\x80", "abc", "abc"},
+ {"\xed\xbf\xbf", "\uFFFD", "\uFFFD"},
+ {"\xF0\x80\x80\xaf", "☺", "☺"},
+ {"\xF8\x80\x80\x80\xAF", "\uFFFD", "\uFFFD"},
+ {"\xFC\x80\x80\x80\x80\xAF", "\uFFFD", "\uFFFD"},
+}
+
+func TestToValidUTF8(t *testing.T) {
+ for _, tc := range toValidUTF8Tests {
+ got := ToValidUTF8(tc.in, tc.repl)
+ if got != tc.out {
+ t.Errorf("ToValidUTF8(%q, %q) = %q; want %q", tc.in, tc.repl, got, tc.out)
+ }
+ }
+}
+
func BenchmarkToUpper(b *testing.B) {
for _, tc := range upperTests {
b.Run(tc.in, func(b *testing.B) {
}
}
+func BenchmarkToValidUTF8(b *testing.B) {
+ tests := []struct {
+ name string
+ input string
+ }{
+ {"Valid", "typical"},
+ {"InvalidASCII", "foo\xffbar"},
+ {"InvalidNonASCII", "日本語\xff日本語"},
+ }
+ replacement := "\uFFFD"
+ b.ResetTimer()
+ for _, test := range tests {
+ b.Run(test.name, func(b *testing.B) {
+ for i := 0; i < b.N; i++ {
+ ToValidUTF8(test.input, replacement)
+ }
+ })
+ }
+}
+
type predicate struct {
f func(rune) bool
name string