Cypherpunks repositories - gostls13.git/commitdiff
unicode/utf16: add func RuneLen
author Jes Cok <xigua67damn@gmail.com>
Thu, 7 Mar 2024 13:36:47 +0000 (21:36 +0800)
committer Gopher Robot <gobot@golang.org>
Thu, 7 Mar 2024 19:08:48 +0000 (19:08 +0000)
This CL adds func RuneLen and, while here, also uses RuneLen to simplify
the code in Encode.

Fixes #44940

Change-Id: Ifd3b537f69880dfd32a69a6733d8d3c2b5d4ecba
Reviewed-on: https://go-review.googlesource.com/c/go/+/569755
Reviewed-by: Ian Lance Taylor <iant@google.com>
Reviewed-by: Michael Knyszek <mknyszek@google.com>
Commit-Queue: Ian Lance Taylor <iant@google.com>
LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com>
Auto-Submit: Ian Lance Taylor <iant@google.com>

api/next/44940.txt [new file with mode: 0644]
doc/next/6-stdlib/99-minor/unicode/utf16/44940.md [new file with mode: 0644]
src/unicode/utf16/export_test.go
src/unicode/utf16/utf16.go
src/unicode/utf16/utf16_test.go

diff --git a/api/next/44940.txt b/api/next/44940.txt
new file mode 100644 (file)
index 0000000..4efb7c5
--- /dev/null
+++ b/api/next/44940.txt
@@ -0,0 +1 @@
+pkg unicode/utf16, func RuneLen(int32) int #44940
diff --git a/doc/next/6-stdlib/99-minor/unicode/utf16/44940.md b/doc/next/6-stdlib/99-minor/unicode/utf16/44940.md
new file mode 100644 (file)
index 0000000..79a36cd
--- /dev/null
+++ b/doc/next/6-stdlib/99-minor/unicode/utf16/44940.md
@@ -0,0 +1,3 @@
+The [`unicode/utf16.RuneLen`](/pkg/unicode/utf16#RuneLen) function returns
+the number of 16-bit words in the UTF-16 encoding of the rune. It returns -1
+if the rune is not a valid value to encode in UTF-16.
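diff --git a/src/unicode/utf16/export_test.go b/src/unicode/utf16/export_test.go
index e0c57f52aef5410f7d3d0bb9c76af77f3afe8137..74a89bf39a3f65492645ffa2da381765c89d08dc 100644 (file)
--- a/src/unicode/utf16/export_test.go
+++ b/src/unicode/utf16/export_test.go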
@@ -6,6 +6,9 @@ package utf16
 
 // Extra names for constants so we can validate them during testing.
 const (
+       Surr1           = surr1
+       Surr3           = surr3
+       SurrSelf        = surrSelf
        MaxRune         = maxRune
        ReplacementChar = replacementChar
 )
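diff --git a/src/unicode/utf16/utf16.go b/src/unicode/utf16/utf16.go
index 1c6d2c66c30c40f14bc20266468da429563fbcac..0293bbf639bc84f3a543f71c63464f35aff1e924 100644 (file)
--- a/src/unicode/utf16/utf16.go
+++ b/src/unicode/utf16/utf16.go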
@@ -52,6 +52,19 @@ func EncodeRune(r rune) (r1, r2 rune) {
        return surr1 + (r>>10)&0x3ff, surr2 + r&0x3ff
 }
 
+// RuneLen returns the number of 16-bit words in the UTF-16 encoding of the rune.
+// It returns -1 if the rune is not a valid value to encode in UTF-16.
+func RuneLen(r rune) int {
+       switch {
+       case 0 <= r && r < surr1, surr3 <= r && r < surrSelf:
+               return 1
+       case surrSelf <= r && r <= maxRune:
+               return 2
+       default:
+               return -1
+       }
+}
+
 // Encode returns the UTF-16 encoding of the Unicode code point sequence s.
 func Encode(s []rune) []uint16 {
        n := len(s)
@@ -64,13 +77,11 @@ func Encode(s []rune) []uint16 {
        a := make([]uint16, n)
        n = 0
        for _, v := range s {
-               switch {
-               case 0 <= v && v < surr1, surr3 <= v && v < surrSelf:
-                       // normal rune
+               switch RuneLen(v) {
+               case 1: // normal rune
                        a[n] = uint16(v)
                        n++
-               case surrSelf <= v && v <= maxRune:
-                       // needs surrogate sequence
+               case 2: // needs surrogate sequence
                        r1, r2 := EncodeRune(v)
                        a[n] = uint16(r1)
                        a[n+1] = uint16(r2)
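diff --git a/src/unicode/utf16/utf16_test.go b/src/unicode/utf16/utf16_test.go
index a5a503d3874bb214a6733582c4e3d303b156d5c5..74a4a6746b2cd323909d5d58a5d63b352be28f40 100644 (file)
--- a/src/unicode/utf16/utf16_test.go
+++ b/src/unicode/utf16/utf16_test.go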
@@ -22,6 +22,26 @@ func TestConstants(t *testing.T) {
        }
 }
 
+func TestRuneLen(t *testing.T) {
+       for _, tt := range []struct {
+               r      rune
+               length int
+       }{
+               {0, 1},
+               {Surr1 - 1, 1},
+               {Surr3, 1},
+               {SurrSelf - 1, 1},
+               {SurrSelf, 2},
+               {MaxRune, 2},
+               {MaxRune + 1, -1},
+               {-1, -1},
+       } {
+               if length := RuneLen(tt.r); length != tt.length {
+                       t.Errorf("RuneLen(%#U) = %d, want %d", tt.r, length, tt.length)
+               }
+       }
+}
+
 type encodeTest struct {
        in  []rune
        out []uint16