From bfc209518e3f6a15407139d70f31ab4a5dd646c7 Mon Sep 17 00:00:00 2001
From: thepudds <thepudds1460@gmail.com>
Date: Sat, 7 Dec 2024 16:17:33 -0500
Subject: [PATCH] internal/runtime/maps: speed up small map lookups ~1.7x for
 unpredictable keys
MIME-Version: 1.0
Content-Type: text/plain; charset=utf8
Content-Transfer-Encoding: 8bit

On master, lookups on small Swiss Table maps (<= 8 elements) for
non-specialized key types are seemingly a performance regression
compared to the Go 1.23 map implementation (reported in #70849).
Currently, a linear scan is used for gets in these cases.

This CL changes (*Map).getWithKeySmall to instead use the SIMD or SWAR
match on the control bytes to then jump to candidate matching slots,
with sample results below for a 16-byte key. This especially helps the
hit case when the key is unpredictable, which previously had to scan an
unpredictable number of control bytes to find a candidate slot when the
key is unpredictable.

Separately, other CLs in this stack modify the main Swiss Table
benchmarks to randomize lookup key order (vs. previously most of the
benchmarks had a repeating lookup key ordering, which likely is
predictable until the map is too big). We have sample results for the
randomized key order benchmarks followed by results from the older
benchmarks.

The first table below is with randomized key order. For hits, the older
results get slower as there are more elements. With this CL, we see hits
for unpredictable key ordering (sizes 2-8) get a ~1.7x speedup from
~25ns to ~14ns, with a now consistent lookup time for the different
sizes. (The 1 element size map has a predictable key ordering because
there is only one key, and that reports a modest ~0.5ns or ~3%
performance penalty). Misses for unpredictable key order get a ~1.3x
speedup, from ~13ns to ~10ns, with similar results for the 1 element
size.

                                                   â no-fix-new-bmarks  â fix-with-new-bmarks   â
                                                   â     sec/op         â  sec/op       vs base â
MapSmallAccessHit/Key=smallType/Elem=int32/len=1-4        13.26n Â±  0%   13.64n Â±  0%   +2.90% (p=0.000 n=20)
MapSmallAccessHit/Key=smallType/Elem=int32/len=2-4        19.47n Â±  0%   13.62n Â±  0%  -30.05% (p=0.000 n=20)
MapSmallAccessHit/Key=smallType/Elem=int32/len=3-4        22.23n Â±  0%   13.64n Â±  0%  -38.68% (p=0.000 n=20)
MapSmallAccessHit/Key=smallType/Elem=int32/len=4-4        23.98n Â±  0%   13.64n Â±  0%  -43.11% (p=0.000 n=20)
MapSmallAccessHit/Key=smallType/Elem=int32/len=5-4        25.02n Â±  0%   13.67n Â±  0%  -45.35% (p=0.000 n=20)
MapSmallAccessHit/Key=smallType/Elem=int32/len=6-4        25.77n Â±  1%   13.68n Â±  2%  -46.89% (p=0.000 n=20)
MapSmallAccessHit/Key=smallType/Elem=int32/len=7-4        26.38n Â±  0%   13.64n Â±  0%  -48.28% (p=0.000 n=20)
MapSmallAccessHit/Key=smallType/Elem=int32/len=8-4        26.31n Â±  0%   13.71n Â± 21%  -47.90% (p=0.000 n=20)
MapSmallAccessMiss/Key=smallType/Elem=int32/len=1-4      13.055n Â±  0%   9.815n Â±  0%  -24.82% (p=0.000 n=20)
MapSmallAccessMiss/Key=smallType/Elem=int32/len=2-4      13.070n Â±  0%   9.813n Â±  0%  -24.92% (p=0.000 n=20)
MapSmallAccessMiss/Key=smallType/Elem=int32/len=3-4      13.060n Â±  0%   9.819n Â±  0%  -24.82% (p=0.000 n=20)
MapSmallAccessMiss/Key=smallType/Elem=int32/len=4-4      13.075n Â±  0%   9.816n Â±  0%  -24.92% (p=0.000 n=20)
MapSmallAccessMiss/Key=smallType/Elem=int32/len=5-4      13.060n Â±  0%   9.826n Â±  0%  -24.76% (p=0.000 n=20)
MapSmallAccessMiss/Key=smallType/Elem=int32/len=6-4      13.095n Â± 19%   9.834n Â± 31%  -24.90% (p=0.000 n=20)
MapSmallAccessMiss/Key=smallType/Elem=int32/len=7-4      13.075n Â± 19%   9.822n Â± 27%  -24.88% (p=0.000 n=20)
MapSmallAccessMiss/Key=smallType/Elem=int32/len=8-4       13.11n Â± 16%   12.14n Â± 19%   -7.43% (p=0.000 n=20)

The next table uses the original benchmarks from just before this CL
stack (i.e., without shuffling lookup keys).

With this CL, we see improvement that is directionally similar to the
above results but not as large, presumably because the branches in the
linear scan are fairly predictable with predictable keys. (The numbers
here also include the time from a mod in the benchmark code, which
seemed to take around ~1/3 of CPU time based on spot checking a couple
of examples, vs. the modified benchmarks shown above have removed that
mod).

                                                  â master-8c3e391573 â   just-fix-with-old-bmarks       â
                                                  â      sec/op       â    sec/op     vs base            â
MapSmallAccessHit/Key=smallType/Elem=int32/len=1-4      20.85n Â±  0%   21.69n Â±  0%   +4.03% (p=0.000 n=20)
MapSmallAccessHit/Key=smallType/Elem=int32/len=2-4      21.22n Â±  0%   21.70n Â±  0%   +2.24% (p=0.000 n=20)
MapSmallAccessHit/Key=smallType/Elem=int32/len=3-4      21.73n Â±  0%   21.71n Â±  0%        ~ (p=0.158 n=20)
MapSmallAccessHit/Key=smallType/Elem=int32/len=4-4      22.06n Â±  0%   21.71n Â±  0%   -1.56% (p=0.000 n=20)
MapSmallAccessHit/Key=smallType/Elem=int32/len=5-4      22.41n Â±  0%   21.73n Â±  0%   -3.01% (p=0.000 n=20)
MapSmallAccessHit/Key=smallType/Elem=int32/len=6-4      22.71n Â±  0%   21.72n Â±  0%   -4.38% (p=0.000 n=20)
MapSmallAccessHit/Key=smallType/Elem=int32/len=7-4      22.98n Â±  0%   21.71n Â±  0%   -5.53% (p=0.000 n=20)
MapSmallAccessHit/Key=smallType/Elem=int32/len=8-4      23.20n Â±  0%   21.72n Â±  0%   -6.36% (p=0.000 n=20)
MapSmallAccessMiss/Key=smallType/Elem=int32/len=1-4     19.95n Â±  0%   17.30n Â±  0%  -13.28% (p=0.000 n=20)
MapSmallAccessMiss/Key=smallType/Elem=int32/len=2-4     19.96n Â±  0%   17.31n Â±  0%  -13.28% (p=0.000 n=20)
MapSmallAccessMiss/Key=smallType/Elem=int32/len=3-4     19.95n Â±  0%   17.29n Â±  0%  -13.33% (p=0.000 n=20)
MapSmallAccessMiss/Key=smallType/Elem=int32/len=4-4     19.95n Â±  0%   17.30n Â±  0%  -13.29% (p=0.000 n=20)
MapSmallAccessMiss/Key=smallType/Elem=int32/len=5-4     19.96n Â± 25%   17.32n Â±  0%  -13.22% (p=0.000 n=20)
MapSmallAccessMiss/Key=smallType/Elem=int32/len=6-4     19.99n Â± 24%   17.29n Â±  0%  -13.51% (p=0.000 n=20)
MapSmallAccessMiss/Key=smallType/Elem=int32/len=7-4     19.97n Â± 20%   17.34n Â± 16%  -13.14% (p=0.000 n=20)
MapSmallAccessMiss/Key=smallType/Elem=int32/len=8-4     20.02n Â± 11%   17.33n Â± 14%  -13.44% (p=0.000 n=20)
geomean                                                 21.02n         19.39n         -7.78%

See #70849 for additional benchmark results, including results for arm64
(which also means without SIMD support).

Updates #54766
Updates #70700
Fixes #70849

Change-Id: Ic2361bb6fc15b4436d1d1d5be7e4712e547f611b
Reviewed-on: https://go-review.googlesource.com/c/go/+/634396
Reviewed-by: Michael Pratt <mpratt@google.com>
Reviewed-by: Dmitri Shuralyov <dmitshur@google.com>
LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com>
---
 src/internal/runtime/maps/map.go  | 15 +++++++--------
 src/runtime/map_benchmark_test.go |  3 +++
 2 files changed, 10 insertions(+), 8 deletions(-)

diff --git a/src/internal/runtime/maps/map.go b/src/internal/runtime/maps/map.go
index b4db522978..94000a942d 100644
--- a/src/internal/runtime/maps/map.go
+++ b/src/internal/runtime/maps/map.go
@@ -439,15 +439,10 @@ func (m *Map) getWithKeySmall(typ *abi.SwissMapType, hash uintptr, key unsafe.Po
 		data: m.dirPtr,
 	}
 
-	h2 := uint8(h2(hash))
-	ctrls := *g.ctrls()
+	match := g.ctrls().matchH2(h2(hash))
 
-	for i := uintptr(0); i < abi.SwissMapGroupSlots; i++ {
-		c := uint8(ctrls)
-		ctrls >>= 8
-		if c != h2 {
-			continue
-		}
+	for match != 0 {
+		i := match.first()
 
 		slotKey := g.key(typ, i)
 		if typ.IndirectKey() {
@@ -461,8 +456,12 @@ func (m *Map) getWithKeySmall(typ *abi.SwissMapType, hash uintptr, key unsafe.Po
 			}
 			return slotKey, slotElem, true
 		}
+
+		match = match.removeFirst()
 	}
 
+	// No match here means key is not in the map.
+	// (A single group means no need to probe or check for empty).
 	return nil, nil, false
 }
 
diff --git a/src/runtime/map_benchmark_test.go b/src/runtime/map_benchmark_test.go
index 43c8f0bb61..bf195fa30d 100644
--- a/src/runtime/map_benchmark_test.go
+++ b/src/runtime/map_benchmark_test.go
@@ -1182,9 +1182,12 @@ func BenchmarkMapSmallAccessHit(b *testing.B) {
 	b.Run("Key=int32/Elem=int32", smallBenchSizes(benchmarkMapAccessHit[int32, int32]))
 	b.Run("Key=int64/Elem=int64", smallBenchSizes(benchmarkMapAccessHit[int64, int64]))
 	b.Run("Key=string/Elem=string", smallBenchSizes(benchmarkMapAccessHit[string, string]))
+	b.Run("Key=smallType/Elem=int32", smallBenchSizes(benchmarkMapAccessHit[smallType, int32]))
 }
+
 func BenchmarkMapSmallAccessMiss(b *testing.B) {
 	b.Run("Key=int32/Elem=int32", smallBenchSizes(benchmarkMapAccessMiss[int32, int32]))
 	b.Run("Key=int64/Elem=int64", smallBenchSizes(benchmarkMapAccessMiss[int64, int64]))
 	b.Run("Key=string/Elem=string", smallBenchSizes(benchmarkMapAccessMiss[string, string]))
+	b.Run("Key=smallType/Elem=int32", smallBenchSizes(benchmarkMapAccessMiss[smallType, int32]))
 }
-- 
2.51.0