unicode/utf8: optimize Valid to parity with ValidString

author Alan Donovan <alan@alandonovan.net>

Wed, 5 Jan 2022 14:20:15 +0000 (09:20 -0500)

committer Daniel Martí <mvdan@mvdan.cc>

Wed, 2 Mar 2022 11:33:18 +0000 (11:33 +0000)
author Alan Donovan <alan@alandonovan.net>
Wed, 5 Jan 2022 14:20:15 +0000 (09:20 -0500)
committer Daniel Martí <mvdan@mvdan.cc>
Wed, 2 Mar 2022 11:33:18 +0000 (11:33 +0000)
diff --git a/src/unicode/utf8/utf8.go b/src/unicode/utf8/utf8.go

index 6938c7e6a7742655d8d09bcae767bfd8fb1bf659..1e9f666e235d3540a0233fec52d3cdc40b34e475 100644 (file)
--- a/src/unicode/utf8/utf8.go
+++ b/src/unicode/utf8/utf8.go
@@ -475,6 +475,11 @@ func RuneStart(b byte) bool { return b&0xC0 != 0x80 }
  
  // Valid reports whether p consists entirely of valid UTF-8-encoded runes.
  func Valid(p []byte) bool {
+       // This optimization avoids the need to recompute the capacity
+       // when generating code for p[8:], bringing it to parity with
+       // ValidString, which was 20% faster on long ASCII strings.
+       p = p[:len(p):len(p)]
+
         // Fast path. Check for and skip 8 bytes of ASCII characters per iteration.
         for len(p) >= 8 {
                 // Combining two 32 bit loads allows the same code to be used
diff --git a/src/unicode/utf8/utf8_test.go b/src/unicode/utf8/utf8_test.go

index e9be4d2d636eec84bcb8dc9efe55047a83bf2fd3..e7c31222cca090541f7059b9174cd9ff05114972 100644 (file)
--- a/src/unicode/utf8/utf8_test.go
+++ b/src/unicode/utf8/utf8_test.go
@@ -6,6 +6,7 @@ package utf8_test
  
  import (
         "bytes"
+       "strings"
         "testing"
         "unicode"
         . "unicode/utf8"
@@ -554,6 +555,8 @@ func BenchmarkRuneCountInStringTenJapaneseChars(b *testing.B) {
         }
  }
  
+var ascii100000 = strings.Repeat("0123456789", 10000)
+
  func BenchmarkValidTenASCIIChars(b *testing.B) {
         s := []byte("0123456789")
         for i := 0; i < b.N; i++ {
@@ -561,12 +564,32 @@ func BenchmarkValidTenASCIIChars(b *testing.B) {
         }
  }
  
+func BenchmarkValid100KASCIIChars(b *testing.B) {
+       s := []byte(ascii100000)
+       for i := 0; i < b.N; i++ {
+               Valid(s)
+       }
+}
+
  func BenchmarkValidTenJapaneseChars(b *testing.B) {
         s := []byte("日本語日本語日本語日")
         for i := 0; i < b.N; i++ {
                 Valid(s)
         }
  }
+func BenchmarkValidLongMostlyASCII(b *testing.B) {
+       longMostlyASCII := []byte(longStringMostlyASCII)
+       for i := 0; i < b.N; i++ {
+               Valid(longMostlyASCII)
+       }
+}
+
+func BenchmarkValidLongJapanese(b *testing.B) {
+       longJapanese := []byte(longStringJapanese)
+       for i := 0; i < b.N; i++ {
+               Valid(longJapanese)
+       }
+}
  
  func BenchmarkValidStringTenASCIIChars(b *testing.B) {
         for i := 0; i < b.N; i++ {
@@ -574,12 +597,47 @@ func BenchmarkValidStringTenASCIIChars(b *testing.B) {
         }
  }
  
+func BenchmarkValidString100KASCIIChars(b *testing.B) {
+       for i := 0; i < b.N; i++ {
+               ValidString(ascii100000)
+       }
+}
+
  func BenchmarkValidStringTenJapaneseChars(b *testing.B) {
         for i := 0; i < b.N; i++ {
                 ValidString("日本語日本語日本語日")
         }
  }
  
+func BenchmarkValidStringLongMostlyASCII(b *testing.B) {
+       for i := 0; i < b.N; i++ {
+               ValidString(longStringMostlyASCII)
+       }
+}
+
+func BenchmarkValidStringLongJapanese(b *testing.B) {
+       for i := 0; i < b.N; i++ {
+               ValidString(longStringJapanese)
+       }
+}
+
+var longStringMostlyASCII string // ~100KB, ~97% ASCII
+var longStringJapanese string    // ~100KB, non-ASCII
+
+func init() {
+       const japanese = "日本語日本語日本語日"
+       var b bytes.Buffer
+       for i := 0; b.Len() < 100_000; i++ {
+               if i%100 == 0 {
+                       b.WriteString(japanese)
+               } else {
+                       b.WriteString("0123456789")
+               }
+       }
+       longStringMostlyASCII = b.String()
+       longStringJapanese = strings.Repeat(japanese, 100_000/len(japanese))
+}
+
  func BenchmarkEncodeASCIIRune(b *testing.B) {
         buf := make([]byte, UTFMax)
         for i := 0; i < b.N; i++ {
author	Alan Donovan <alan@alandonovan.net>
	Wed, 5 Jan 2022 14:20:15 +0000 (09:20 -0500)
committer	Daniel Martí <mvdan@mvdan.cc>
	Wed, 2 Mar 2022 11:33:18 +0000 (11:33 +0000)
src/unicode/utf8/utf8.go		patch | blob | history
src/unicode/utf8/utf8_test.go		patch | blob | history