From f371b30f326b66e4c5c13c7ea51358a42c431752 Mon Sep 17 00:00:00 2001 From: Joe Tsai Date: Wed, 11 Aug 2021 23:51:09 -0700 Subject: [PATCH] unicode/utf8: add AppendRune MIME-Version: 1.0 Content-Type: text/plain; charset=utf8 Content-Transfer-Encoding: 8bit AppendRune appends the UTF-8 encoding of a rune to a []byte. It is a generally more user friendly than EncodeRune. EncodeASCIIRune-4 2.35ns ± 2% EncodeJapaneseRune-4 4.60ns ± 2% AppendASCIIRune-4 0.30ns ± 3% AppendJapaneseRune-4 4.70ns ± 2% The ASCII case is written to be inlineable. Fixes #47609 Change-Id: If4f71eedffd2bd4ef0d7f960cb55b41c637eec54 Reviewed-on: https://go-review.googlesource.com/c/go/+/345571 Trust: Joe Tsai Reviewed-by: Rob Pike Run-TryBot: Rob Pike TryBot-Result: Go Bot --- src/cmd/compile/internal/test/inl_test.go | 1 + src/unicode/utf8/utf8.go | 26 +++++++++++++++++++++++ src/unicode/utf8/utf8_test.go | 25 ++++++++++++++++++++++ 3 files changed, 52 insertions(+) diff --git a/src/cmd/compile/internal/test/inl_test.go b/src/cmd/compile/internal/test/inl_test.go index bbdbe0c37c..4f20ec1bd1 100644 --- a/src/cmd/compile/internal/test/inl_test.go +++ b/src/cmd/compile/internal/test/inl_test.go @@ -122,6 +122,7 @@ func TestIntendedInlining(t *testing.T) { "FullRune", "FullRuneInString", "RuneLen", + "AppendRune", "ValidRune", }, "reflect": { diff --git a/src/unicode/utf8/utf8.go b/src/unicode/utf8/utf8.go index 557e8a7770..6938c7e6a7 100644 --- a/src/unicode/utf8/utf8.go +++ b/src/unicode/utf8/utf8.go @@ -369,6 +369,32 @@ func EncodeRune(p []byte, r rune) int { } } +// AppendRune appends the UTF-8 encoding of r to the end of p and +// returns the extended buffer. If the rune is out of range, +// it appends the encoding of RuneError. +func AppendRune(p []byte, r rune) []byte { + // This function is inlineable for fast handling of ASCII. + if uint32(r) <= rune1Max { + return append(p, byte(r)) + } + return appendRuneNonASCII(p, r) +} + +func appendRuneNonASCII(p []byte, r rune) []byte { + // Negative values are erroneous. Making it unsigned addresses the problem. + switch i := uint32(r); { + case i <= rune2Max: + return append(p, t2|byte(r>>6), tx|byte(r)&maskx) + case i > MaxRune, surrogateMin <= i && i <= surrogateMax: + r = RuneError + fallthrough + case i <= rune3Max: + return append(p, t3|byte(r>>12), tx|byte(r>>6)&maskx, tx|byte(r)&maskx) + default: + return append(p, t4|byte(r>>18), tx|byte(r>>12)&maskx, tx|byte(r>>6)&maskx, tx|byte(r)&maskx) + } +} + // RuneCount returns the number of runes in p. Erroneous and short // encodings are treated as single runes of width 1 byte. func RuneCount(p []byte) int { diff --git a/src/unicode/utf8/utf8_test.go b/src/unicode/utf8/utf8_test.go index eaf1b5ffee..a60040ecfd 100644 --- a/src/unicode/utf8/utf8_test.go +++ b/src/unicode/utf8/utf8_test.go @@ -127,6 +127,17 @@ func TestEncodeRune(t *testing.T) { } } +func TestAppendRune(t *testing.T) { + for _, m := range utf8map { + if buf := AppendRune(nil, m.r); string(buf) != m.str { + t.Errorf("AppendRune(nil, %#04x) = %s, want %s", m.r, buf, m.str) + } + if buf := AppendRune([]byte("init"), m.r); string(buf) != "init"+m.str { + t.Errorf("AppendRune(nil, %#04x) = %s, want %s", m.r, buf, "init"+m.str) + } + } +} + func TestDecodeRune(t *testing.T) { for _, m := range utf8map { b := []byte(m.str) @@ -583,6 +594,20 @@ func BenchmarkEncodeJapaneseRune(b *testing.B) { } } +func BenchmarkAppendASCIIRune(b *testing.B) { + buf := make([]byte, UTFMax) + for i := 0; i < b.N; i++ { + AppendRune(buf[:0], 'a') + } +} + +func BenchmarkAppendJapaneseRune(b *testing.B) { + buf := make([]byte, UTFMax) + for i := 0; i < b.N; i++ { + AppendRune(buf[:0], '本') + } +} + func BenchmarkDecodeASCIIRune(b *testing.B) { a := []byte{'a'} for i := 0; i < b.N; i++ { -- 2.50.0