From 7b9de668bd68f366d87ba50e9aeb1ba1d0bdb8e5 Mon Sep 17 00:00:00 2001
From: Keith Randall <khr@golang.org>
Date: Sun, 22 Jun 2025 11:48:57 -0700
Subject: [PATCH] unicode/utf8: skip ahead during ascii runs in
 Valid/ValidString
MIME-Version: 1.0
Content-Type: text/plain; charset=utf8
Content-Transfer-Encoding: 8bit

When we see an ASCII character, we will probably see many.
Grab & check increasingly large chunks of the string for ASCII-only-ness.

Also redo some of the non-ASCII code to make it more optimizer friendly.

goos: linux
goarch: amd64
pkg: unicode/utf8
cpu: 12th Gen Intel(R) Core(TM) i7-12700
                               │     base     │                 exp                 │
                               │    sec/op    │   sec/op     vs base                │
ValidTenASCIIChars-20             3.596n ± 3%   2.522n ± 1%  -29.86% (p=0.000 n=10)
Valid100KASCIIChars-20            6.094µ ± 2%   2.115µ ± 1%  -65.29% (p=0.000 n=10)
ValidTenJapaneseChars-20          21.02n ± 0%   18.61n ± 2%  -11.44% (p=0.000 n=10)
ValidLongMostlyASCII-20          51.774µ ± 0%   3.836µ ± 1%  -92.59% (p=0.000 n=10)
ValidLongJapanese-20             102.40µ ± 1%   50.95µ ± 1%  -50.24% (p=0.000 n=10)
ValidStringTenASCIIChars-20       2.640n ± 3%   2.526n ± 1%   -4.34% (p=0.000 n=10)
ValidString100KASCIIChars-20      5.585µ ± 7%   2.118µ ± 1%  -62.07% (p=0.000 n=10)
ValidStringTenJapaneseChars-20    21.29n ± 2%   18.67n ± 1%  -12.31% (p=0.000 n=10)
ValidStringLongMostlyASCII-20    52.431µ ± 1%   3.841µ ± 0%  -92.67% (p=0.000 n=10)
ValidStringLongJapanese-20       102.66µ ± 1%   50.90µ ± 1%  -50.42% (p=0.000 n=10)
geomean                           1.152µ        454.8n       -60.53%

This is an attempt to see if we can get enough performance that we don't
need to consider assembly like that in CL 681695.

Change-Id: I8250feb797a6b4e7d335c23929f6e3acc8b24840
Reviewed-on: https://go-review.googlesource.com/c/go/+/682778
Reviewed-by: Cuong Manh Le <cuong.manhle.vn@gmail.com>
Reviewed-by: Michael Knyszek <mknyszek@google.com>
Auto-Submit: Keith Randall <khr@golang.org>
Reviewed-by: Keith Randall <khr@google.com>
LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com>
---
 src/unicode/utf8/utf8.go      | 154 ++++++++++++++++++----------------
 src/unicode/utf8/utf8_test.go |  10 +++
 2 files changed, 93 insertions(+), 71 deletions(-)

diff --git a/src/unicode/utf8/utf8.go b/src/unicode/utf8/utf8.go
index 3be2f15e8a..01cad1cc81 100644
--- a/src/unicode/utf8/utf8.go
+++ b/src/unicode/utf8/utf8.go
@@ -430,99 +430,111 @@ func RuneCountInString(s string) (n int) {
 // bits set to 10.
 func RuneStart(b byte) bool { return b&0xC0 != 0x80 }
 
+const ptrSize = 4 << (^uintptr(0) >> 63)
+const hiBits = 0x8080808080808080 >> (64 - 8*ptrSize)
+
+func word[T string | []byte](s T) uintptr {
+	if ptrSize == 4 {
+		return uintptr(s[0]) | uintptr(s[1])<<8 | uintptr(s[2])<<16 | uintptr(s[3])<<24
+	}
+	return uintptr(uint64(s[0]) | uint64(s[1])<<8 | uint64(s[2])<<16 | uint64(s[3])<<24 | uint64(s[4])<<32 | uint64(s[5])<<40 | uint64(s[6])<<48 | uint64(s[7])<<56)
+}
+
 // Valid reports whether p consists entirely of valid UTF-8-encoded runes.
 func Valid(p []byte) bool {
 	// This optimization avoids the need to recompute the capacity
-	// when generating code for p[8:], bringing it to parity with
+	// when generating code for slicing p, bringing it to parity with
 	// ValidString, which was 20% faster on long ASCII strings.
 	p = p[:len(p):len(p)]
 
-	// Fast path. Check for and skip 8 bytes of ASCII characters per iteration.
-	for len(p) >= 8 {
-		// Combining two 32 bit loads allows the same code to be used
-		// for 32 and 64 bit platforms.
-		// The compiler can generate a 32bit load for first32 and second32
-		// on many platforms. See test/codegen/memcombine.go.
-		first32 := uint32(p[0]) | uint32(p[1])<<8 | uint32(p[2])<<16 | uint32(p[3])<<24
-		second32 := uint32(p[4]) | uint32(p[5])<<8 | uint32(p[6])<<16 | uint32(p[7])<<24
-		if (first32|second32)&0x80808080 != 0 {
-			// Found a non ASCII byte (>= RuneSelf).
-			break
-		}
-		p = p[8:]
-	}
-	n := len(p)
-	for i := 0; i < n; {
-		pi := p[i]
-		if pi < RuneSelf {
-			i++
+	for len(p) > 0 {
+		p0 := p[0]
+		if p0 < RuneSelf {
+			p = p[1:]
+			// If there's one ASCII byte, there are probably more.
+			// Advance quickly through ASCII-only data.
+			// Note: using > instead of >= here is intentional. That avoids
+			// needing pointing-past-the-end fixup on the slice operations.
+			if len(p) > ptrSize && word(p)&hiBits == 0 {
+				p = p[ptrSize:]
+				if len(p) > 2*ptrSize && (word(p)|word(p[ptrSize:]))&hiBits == 0 {
+					p = p[2*ptrSize:]
+					for len(p) > 4*ptrSize && ((word(p)|word(p[ptrSize:]))|(word(p[2*ptrSize:])|word(p[3*ptrSize:])))&hiBits == 0 {
+						p = p[4*ptrSize:]
+					}
+				}
+			}
 			continue
 		}
-		x := first[pi]
-		if x == xx {
-			return false // Illegal starter byte.
-		}
+		x := first[p0]
 		size := int(x & 7)
-		if i+size > n {
-			return false // Short or invalid.
-		}
 		accept := acceptRanges[x>>4]
-		if c := p[i+1]; c < accept.lo || accept.hi < c {
-			return false
-		} else if size == 2 {
-		} else if c := p[i+2]; c < locb || hicb < c {
-			return false
-		} else if size == 3 {
-		} else if c := p[i+3]; c < locb || hicb < c {
-			return false
+		switch size {
+		case 2:
+			if len(p) < 2 || p[1] < accept.lo || accept.hi < p[1] {
+				return false
+			}
+			p = p[2:]
+		case 3:
+			if len(p) < 3 || p[1] < accept.lo || accept.hi < p[1] || p[2] < locb || hicb < p[2] {
+				return false
+			}
+			p = p[3:]
+		case 4:
+			if len(p) < 4 || p[1] < accept.lo || accept.hi < p[1] || p[2] < locb || hicb < p[2] || p[3] < locb || hicb < p[3] {
+				return false
+			}
+			p = p[4:]
+		default:
+			return false // illegal starter byte
 		}
-		i += size
 	}
 	return true
 }
 
 // ValidString reports whether s consists entirely of valid UTF-8-encoded runes.
 func ValidString(s string) bool {
-	// Fast path. Check for and skip 8 bytes of ASCII characters per iteration.
-	for len(s) >= 8 {
-		// Combining two 32 bit loads allows the same code to be used
-		// for 32 and 64 bit platforms.
-		// The compiler can generate a 32bit load for first32 and second32
-		// on many platforms. See test/codegen/memcombine.go.
-		first32 := uint32(s[0]) | uint32(s[1])<<8 | uint32(s[2])<<16 | uint32(s[3])<<24
-		second32 := uint32(s[4]) | uint32(s[5])<<8 | uint32(s[6])<<16 | uint32(s[7])<<24
-		if (first32|second32)&0x80808080 != 0 {
-			// Found a non ASCII byte (>= RuneSelf).
-			break
-		}
-		s = s[8:]
-	}
-	n := len(s)
-	for i := 0; i < n; {
-		si := s[i]
-		if si < RuneSelf {
-			i++
+	for len(s) > 0 {
+		s0 := s[0]
+		if s0 < RuneSelf {
+			s = s[1:]
+			// If there's one ASCII byte, there are probably more.
+			// Advance quickly through ASCII-only data.
+			// Note: using > instead of >= here is intentional. That avoids
+			// needing pointing-past-the-end fixup on the slice operations.
+			if len(s) > ptrSize && word(s)&hiBits == 0 {
+				s = s[ptrSize:]
+				if len(s) > 2*ptrSize && (word(s)|word(s[ptrSize:]))&hiBits == 0 {
+					s = s[2*ptrSize:]
+					for len(s) > 4*ptrSize && ((word(s)|word(s[ptrSize:]))|(word(s[2*ptrSize:])|word(s[3*ptrSize:])))&hiBits == 0 {
+						s = s[4*ptrSize:]
+					}
+				}
+			}
 			continue
 		}
-		x := first[si]
-		if x == xx {
-			return false // Illegal starter byte.
-		}
+		x := first[s0]
 		size := int(x & 7)
-		if i+size > n {
-			return false // Short or invalid.
-		}
 		accept := acceptRanges[x>>4]
-		if c := s[i+1]; c < accept.lo || accept.hi < c {
-			return false
-		} else if size == 2 {
-		} else if c := s[i+2]; c < locb || hicb < c {
-			return false
-		} else if size == 3 {
-		} else if c := s[i+3]; c < locb || hicb < c {
-			return false
+		switch size {
+		case 2:
+			if len(s) < 2 || s[1] < accept.lo || accept.hi < s[1] {
+				return false
+			}
+			s = s[2:]
+		case 3:
+			if len(s) < 3 || s[1] < accept.lo || accept.hi < s[1] || s[2] < locb || hicb < s[2] {
+				return false
+			}
+			s = s[3:]
+		case 4:
+			if len(s) < 4 || s[1] < accept.lo || accept.hi < s[1] || s[2] < locb || hicb < s[2] || s[3] < locb || hicb < s[3] {
+				return false
+			}
+			s = s[4:]
+		default:
+			return false // illegal starter byte
 		}
-		i += size
 	}
 	return true
 }
diff --git a/src/unicode/utf8/utf8_test.go b/src/unicode/utf8/utf8_test.go
index 865167731f..aece0fab73 100644
--- a/src/unicode/utf8/utf8_test.go
+++ b/src/unicode/utf8/utf8_test.go
@@ -489,6 +489,16 @@ var validTests = []ValidTest{
 	{string("\xed\xbf\xbf"), false},         // U+DFFF low surrogate (sic)
 }
 
+func init() {
+	for i := range 100 {
+		validTests = append(validTests, ValidTest{in: strings.Repeat("a", i), out: true})
+		validTests = append(validTests, ValidTest{in: strings.Repeat("a", i) + "Ð", out: true})
+		validTests = append(validTests, ValidTest{in: strings.Repeat("a", i) + "\xe2", out: false})
+		validTests = append(validTests, ValidTest{in: strings.Repeat("a", i) + "Ð" + strings.Repeat("b", i), out: true})
+		validTests = append(validTests, ValidTest{in: strings.Repeat("a", i) + "\xe2" + strings.Repeat("b", i), out: false})
+	}
+}
+
 func TestValid(t *testing.T) {
 	for _, tt := range validTests {
 		if Valid([]byte(tt.in)) != tt.out {
-- 
2.52.0