From e0414d74fe2d38a6de1cadbbc135d578b11a27af Mon Sep 17 00:00:00 2001
From: Corentin Chary <corentin.chary@datadoghq.com>
Date: Wed, 28 Jan 2026 22:17:38 +0100
Subject: [PATCH] bytes, strings: replace asciiSet bitset with lookup table
MIME-Version: 1.0
Content-Type: text/plain; charset=utf8
Content-Transfer-Encoding: 8bit

Replace the 32-byte bitset implementation with a 256-byte lookup table
for simpler and faster ASCII character membership testing.

The bitset implementation used bit manipulation (shifts, masks, AND/OR)
requiring multiple CPU operations per lookup. The lookup table uses
direct array indexing with a single load and compare, reducing CPU
overhead significantly.

Using [256]bool instead of [256]byte allows the compiler to eliminate
the comparison instruction entirely, as bool values are guaranteed to be
either 0 or 1.

The full 256-element array (rather than 128 elements) is used because it
eliminates branches entirely. Testing shows [256]bool is 68% faster than
[128]bool with an explicit bounds check (488Âµs vs 821Âµs) due to avoiding
branch misprediction penalties in the hot path. Using [128]bool with bit
masking (c&0x7f) eliminates bounds checks but still costs ~10% performance
due to the AND operation.

The 224-byte increase in memory usage is acceptable for modern systems,
and the simpler implementation is easier to understand and maintain.

Full benchmark results demonstrating ~1.5x improvements across all
affected functions are available at:
https://github.com/golang/go/issues/77194#issuecomment-3814095806

This supersedes CL 737920 with a simpler approach that improves
performance for all architectures without requiring SIMD instructions.

Updates #77194

Change-Id: I272ee6de05b963a8efc62e7e8838735fb0c4f41b
Reviewed-on: https://go-review.googlesource.com/c/go/+/739982
Auto-Submit: Keith Randall <khr@golang.org>
LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com>
Reviewed-by: Alan Donovan <adonovan@google.com>
Reviewed-by: Keith Randall <khr@golang.org>
Reviewed-by: Keith Randall <khr@google.com>
---
 src/bytes/bytes.go     | 43 +++++++++++++++++++++++++++++-------------
 src/strings/strings.go | 43 +++++++++++++++++++++++++++++-------------
 2 files changed, 60 insertions(+), 26 deletions(-)

diff --git a/src/bytes/bytes.go b/src/bytes/bytes.go
index 87183c0195..9576350bf6 100644
--- a/src/bytes/bytes.go
+++ b/src/bytes/bytes.go
@@ -246,7 +246,7 @@ func IndexAny(s []byte, chars string) int {
 		}
 		return IndexRune(s, r)
 	}
-	if len(s) > 8 {
+	if shouldUseASCIISet(len(s)) {
 		if as, isASCII := makeASCIISet(chars); isASCII {
 			for i, c := range s {
 				if as.contains(c) {
@@ -301,7 +301,7 @@ func LastIndexAny(s []byte, chars string) int {
 		// Avoid scanning all of s.
 		return -1
 	}
-	if len(s) > 8 {
+	if shouldUseASCIISet(len(s)) {
 		if as, isASCII := makeASCIISet(chars); isASCII {
 			for i := len(s) - 1; i >= 0; i-- {
 				if as.contains(s[i]) {
@@ -935,15 +935,22 @@ func lastIndexFunc(s []byte, f func(r rune) bool, truth bool) int {
 	return -1
 }
 
-// asciiSet is a 32-byte value, where each bit represents the presence of a
-// given ASCII character in the set. The 128-bits of the lower 16 bytes,
-// starting with the least-significant bit of the lowest word to the
-// most-significant bit of the highest word, map to the full range of all
-// 128 ASCII characters. The 128-bits of the upper 16 bytes will be zeroed,
-// ensuring that any non-ASCII character will be reported as not in the set.
-// This allocates a total of 32 bytes even though the upper half
-// is unused to avoid bounds checks in asciiSet.contains.
-type asciiSet [8]uint32
+// asciiSet is a 256-byte lookup table for fast ASCII character membership testing.
+// Each element corresponds to an ASCII character value, with true indicating the
+// character is in the set. Using bool instead of byte allows the compiler to
+// eliminate the comparison instruction, as bool values are guaranteed to be 0 or 1.
+//
+// The full 256-element table is used rather than a 128-element table to avoid
+// additional operations in the lookup path. Alternative approaches were tested:
+//   - [128]bool with explicit bounds check (if c >= 128): introduces branches
+//     that cause pipeline stalls, resulting in ~70% slower performance
+//   - [128]bool with masking (c&0x7f): eliminates bounds checks but the AND
+//     operation still costs ~10% performance compared to direct indexing
+//
+// The 256-element array allows direct indexing with no bounds checks, no branches,
+// and no masking operations, providing optimal performance. The additional 128 bytes
+// of memory is a worthwhile tradeoff for the simpler, faster code.
+type asciiSet [256]bool
 
 // makeASCIISet creates a set of ASCII characters and reports whether all
 // characters in chars are ASCII.
@@ -953,14 +960,24 @@ func makeASCIISet(chars string) (as asciiSet, ok bool) {
 		if c >= utf8.RuneSelf {
 			return as, false
 		}
-		as[c/32] |= 1 << (c % 32)
+		as[c] = true
 	}
 	return as, true
 }
 
 // contains reports whether c is inside the set.
 func (as *asciiSet) contains(c byte) bool {
-	return (as[c/32] & (1 << (c % 32))) != 0
+	return as[c]
+}
+
+// shouldUseASCIISet returns whether to use the lookup table optimization.
+// The threshold of 8 bytes balances initialization cost against per-byte
+// search cost, performing well across all charset sizes.
+//
+// More complex heuristics (e.g., different thresholds per charset size)
+// add branching overhead that eats away any theoretical improvements.
+func shouldUseASCIISet(bufLen int) bool {
+	return bufLen > 8
 }
 
 // containsRune is a simplified version of strings.ContainsRune
diff --git a/src/strings/strings.go b/src/strings/strings.go
index 56d4d07b01..28289a35bf 100644
--- a/src/strings/strings.go
+++ b/src/strings/strings.go
@@ -193,7 +193,7 @@ func IndexAny(s, chars string) int {
 		}
 		return IndexRune(s, r)
 	}
-	if len(s) > 8 {
+	if shouldUseASCIISet(len(s)) {
 		if as, isASCII := makeASCIISet(chars); isASCII {
 			for i := 0; i < len(s); i++ {
 				if as.contains(s[i]) {
@@ -229,7 +229,7 @@ func LastIndexAny(s, chars string) int {
 		}
 		return -1
 	}
-	if len(s) > 8 {
+	if shouldUseASCIISet(len(s)) {
 		if as, isASCII := makeASCIISet(chars); isASCII {
 			for i := len(s) - 1; i >= 0; i-- {
 				if as.contains(s[i]) {
@@ -931,15 +931,22 @@ func lastIndexFunc(s string, f func(rune) bool, truth bool) int {
 	return -1
 }
 
-// asciiSet is a 32-byte value, where each bit represents the presence of a
-// given ASCII character in the set. The 128-bits of the lower 16 bytes,
-// starting with the least-significant bit of the lowest word to the
-// most-significant bit of the highest word, map to the full range of all
-// 128 ASCII characters. The 128-bits of the upper 16 bytes will be zeroed,
-// ensuring that any non-ASCII character will be reported as not in the set.
-// This allocates a total of 32 bytes even though the upper half
-// is unused to avoid bounds checks in asciiSet.contains.
-type asciiSet [8]uint32
+// asciiSet is a 256-byte lookup table for fast ASCII character membership testing.
+// Each element corresponds to an ASCII character value, with true indicating the
+// character is in the set. Using bool instead of byte allows the compiler to
+// eliminate the comparison instruction, as bool values are guaranteed to be 0 or 1.
+//
+// The full 256-element table is used rather than a 128-element table to avoid
+// additional operations in the lookup path. Alternative approaches were tested:
+//   - [128]bool with explicit bounds check (if c >= 128): introduces branches
+//     that cause pipeline stalls, resulting in ~70% slower performance
+//   - [128]bool with masking (c&0x7f): eliminates bounds checks but the AND
+//     operation still costs ~10% performance compared to direct indexing
+//
+// The 256-element array allows direct indexing with no bounds checks, no branches,
+// and no masking operations, providing optimal performance. The additional 128 bytes
+// of memory is a worthwhile tradeoff for the simpler, faster code.
+type asciiSet [256]bool
 
 // makeASCIISet creates a set of ASCII characters and reports whether all
 // characters in chars are ASCII.
@@ -949,14 +956,24 @@ func makeASCIISet(chars string) (as asciiSet, ok bool) {
 		if c >= utf8.RuneSelf {
 			return as, false
 		}
-		as[c/32] |= 1 << (c % 32)
+		as[c] = true
 	}
 	return as, true
 }
 
 // contains reports whether c is inside the set.
 func (as *asciiSet) contains(c byte) bool {
-	return (as[c/32] & (1 << (c % 32))) != 0
+	return as[c]
+}
+
+// shouldUseASCIISet returns whether to use the lookup table optimization.
+// The threshold of 8 bytes balances initialization cost against per-byte
+// search cost, performing well across all charset sizes.
+//
+// More complex heuristics (e.g., different thresholds per charset size)
+// add branching overhead that eats away any theoretical improvements.
+func shouldUseASCIISet(bufLen int) bool {
+	return bufLen > 8
 }
 
 // Trim returns a slice of the string s with all leading and
-- 
2.52.0