From e0414d74fe2d38a6de1cadbbc135d578b11a27af Mon Sep 17 00:00:00 2001 From: Corentin Chary Date: Wed, 28 Jan 2026 22:17:38 +0100 Subject: [PATCH] bytes, strings: replace asciiSet bitset with lookup table MIME-Version: 1.0 Content-Type: text/plain; charset=utf8 Content-Transfer-Encoding: 8bit Replace the 32-byte bitset implementation with a 256-byte lookup table for simpler and faster ASCII character membership testing. The bitset implementation used bit manipulation (shifts, masks, AND/OR) requiring multiple CPU operations per lookup. The lookup table uses direct array indexing with a single load and compare, reducing CPU overhead significantly. Using [256]bool instead of [256]byte allows the compiler to eliminate the comparison instruction entirely, as bool values are guaranteed to be either 0 or 1. The full 256-element array (rather than 128 elements) is used because it eliminates branches entirely. Testing shows [256]bool is 68% faster than [128]bool with an explicit bounds check (488µs vs 821µs) due to avoiding branch misprediction penalties in the hot path. Using [128]bool with bit masking (c&0x7f) eliminates bounds checks but still costs ~10% performance due to the AND operation. The 224-byte increase in memory usage is acceptable for modern systems, and the simpler implementation is easier to understand and maintain. Full benchmark results demonstrating ~1.5x improvements across all affected functions are available at: https://github.com/golang/go/issues/77194#issuecomment-3814095806 This supersedes CL 737920 with a simpler approach that improves performance for all architectures without requiring SIMD instructions. Updates #77194 Change-Id: I272ee6de05b963a8efc62e7e8838735fb0c4f41b Reviewed-on: https://go-review.googlesource.com/c/go/+/739982 Auto-Submit: Keith Randall LUCI-TryBot-Result: Go LUCI Reviewed-by: Alan Donovan Reviewed-by: Keith Randall Reviewed-by: Keith Randall --- src/bytes/bytes.go | 43 +++++++++++++++++++++++++++++------------- src/strings/strings.go | 43 +++++++++++++++++++++++++++++------------- 2 files changed, 60 insertions(+), 26 deletions(-) diff --git a/src/bytes/bytes.go b/src/bytes/bytes.go index 87183c0195..9576350bf6 100644 --- a/src/bytes/bytes.go +++ b/src/bytes/bytes.go @@ -246,7 +246,7 @@ func IndexAny(s []byte, chars string) int { } return IndexRune(s, r) } - if len(s) > 8 { + if shouldUseASCIISet(len(s)) { if as, isASCII := makeASCIISet(chars); isASCII { for i, c := range s { if as.contains(c) { @@ -301,7 +301,7 @@ func LastIndexAny(s []byte, chars string) int { // Avoid scanning all of s. return -1 } - if len(s) > 8 { + if shouldUseASCIISet(len(s)) { if as, isASCII := makeASCIISet(chars); isASCII { for i := len(s) - 1; i >= 0; i-- { if as.contains(s[i]) { @@ -935,15 +935,22 @@ func lastIndexFunc(s []byte, f func(r rune) bool, truth bool) int { return -1 } -// asciiSet is a 32-byte value, where each bit represents the presence of a -// given ASCII character in the set. The 128-bits of the lower 16 bytes, -// starting with the least-significant bit of the lowest word to the -// most-significant bit of the highest word, map to the full range of all -// 128 ASCII characters. The 128-bits of the upper 16 bytes will be zeroed, -// ensuring that any non-ASCII character will be reported as not in the set. -// This allocates a total of 32 bytes even though the upper half -// is unused to avoid bounds checks in asciiSet.contains. -type asciiSet [8]uint32 +// asciiSet is a 256-byte lookup table for fast ASCII character membership testing. +// Each element corresponds to an ASCII character value, with true indicating the +// character is in the set. Using bool instead of byte allows the compiler to +// eliminate the comparison instruction, as bool values are guaranteed to be 0 or 1. +// +// The full 256-element table is used rather than a 128-element table to avoid +// additional operations in the lookup path. Alternative approaches were tested: +// - [128]bool with explicit bounds check (if c >= 128): introduces branches +// that cause pipeline stalls, resulting in ~70% slower performance +// - [128]bool with masking (c&0x7f): eliminates bounds checks but the AND +// operation still costs ~10% performance compared to direct indexing +// +// The 256-element array allows direct indexing with no bounds checks, no branches, +// and no masking operations, providing optimal performance. The additional 128 bytes +// of memory is a worthwhile tradeoff for the simpler, faster code. +type asciiSet [256]bool // makeASCIISet creates a set of ASCII characters and reports whether all // characters in chars are ASCII. @@ -953,14 +960,24 @@ func makeASCIISet(chars string) (as asciiSet, ok bool) { if c >= utf8.RuneSelf { return as, false } - as[c/32] |= 1 << (c % 32) + as[c] = true } return as, true } // contains reports whether c is inside the set. func (as *asciiSet) contains(c byte) bool { - return (as[c/32] & (1 << (c % 32))) != 0 + return as[c] +} + +// shouldUseASCIISet returns whether to use the lookup table optimization. +// The threshold of 8 bytes balances initialization cost against per-byte +// search cost, performing well across all charset sizes. +// +// More complex heuristics (e.g., different thresholds per charset size) +// add branching overhead that eats away any theoretical improvements. +func shouldUseASCIISet(bufLen int) bool { + return bufLen > 8 } // containsRune is a simplified version of strings.ContainsRune diff --git a/src/strings/strings.go b/src/strings/strings.go index 56d4d07b01..28289a35bf 100644 --- a/src/strings/strings.go +++ b/src/strings/strings.go @@ -193,7 +193,7 @@ func IndexAny(s, chars string) int { } return IndexRune(s, r) } - if len(s) > 8 { + if shouldUseASCIISet(len(s)) { if as, isASCII := makeASCIISet(chars); isASCII { for i := 0; i < len(s); i++ { if as.contains(s[i]) { @@ -229,7 +229,7 @@ func LastIndexAny(s, chars string) int { } return -1 } - if len(s) > 8 { + if shouldUseASCIISet(len(s)) { if as, isASCII := makeASCIISet(chars); isASCII { for i := len(s) - 1; i >= 0; i-- { if as.contains(s[i]) { @@ -931,15 +931,22 @@ func lastIndexFunc(s string, f func(rune) bool, truth bool) int { return -1 } -// asciiSet is a 32-byte value, where each bit represents the presence of a -// given ASCII character in the set. The 128-bits of the lower 16 bytes, -// starting with the least-significant bit of the lowest word to the -// most-significant bit of the highest word, map to the full range of all -// 128 ASCII characters. The 128-bits of the upper 16 bytes will be zeroed, -// ensuring that any non-ASCII character will be reported as not in the set. -// This allocates a total of 32 bytes even though the upper half -// is unused to avoid bounds checks in asciiSet.contains. -type asciiSet [8]uint32 +// asciiSet is a 256-byte lookup table for fast ASCII character membership testing. +// Each element corresponds to an ASCII character value, with true indicating the +// character is in the set. Using bool instead of byte allows the compiler to +// eliminate the comparison instruction, as bool values are guaranteed to be 0 or 1. +// +// The full 256-element table is used rather than a 128-element table to avoid +// additional operations in the lookup path. Alternative approaches were tested: +// - [128]bool with explicit bounds check (if c >= 128): introduces branches +// that cause pipeline stalls, resulting in ~70% slower performance +// - [128]bool with masking (c&0x7f): eliminates bounds checks but the AND +// operation still costs ~10% performance compared to direct indexing +// +// The 256-element array allows direct indexing with no bounds checks, no branches, +// and no masking operations, providing optimal performance. The additional 128 bytes +// of memory is a worthwhile tradeoff for the simpler, faster code. +type asciiSet [256]bool // makeASCIISet creates a set of ASCII characters and reports whether all // characters in chars are ASCII. @@ -949,14 +956,24 @@ func makeASCIISet(chars string) (as asciiSet, ok bool) { if c >= utf8.RuneSelf { return as, false } - as[c/32] |= 1 << (c % 32) + as[c] = true } return as, true } // contains reports whether c is inside the set. func (as *asciiSet) contains(c byte) bool { - return (as[c/32] & (1 << (c % 32))) != 0 + return as[c] +} + +// shouldUseASCIISet returns whether to use the lookup table optimization. +// The threshold of 8 bytes balances initialization cost against per-byte +// search cost, performing well across all charset sizes. +// +// More complex heuristics (e.g., different thresholds per charset size) +// add branching overhead that eats away any theoretical improvements. +func shouldUseASCIISet(bufLen int) bool { + return bufLen > 8 } // Trim returns a slice of the string s with all leading and -- 2.52.0