runtime: reduce linear search through pcvalue cache
author    Josh Bleecher Snyder <josharian@gmail.com>
          Sun, 1 Apr 2018 23:39:34 +0000 (16:39 -0700)
committer Josh Bleecher Snyder <josharian@gmail.com>
          Fri, 9 Nov 2018 16:06:56 +0000 (16:06 +0000)
This change introduces two optimizations together,
one for recursive and one for non-recursive stacks.

For recursive stacks, we introduce the new entry
at the beginning of the cache, so it can be found first.
This adds an extra read and write.
While we're here, switch from fastrandn, which does a multiply,
to fastrand % n, which the compiler reduces to a cheap mask,
since n is a constant power of two.
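
In code, the new insertion path (condensed from the symtab.go hunk
below; pcvalueCacheKey is the row selector added by the second
optimization, described next) looks like:

    x := pcvalueCacheKey(targetpc)
    e := &cache.entries[x]
    ci := fastrand() % uint32(len(cache.entries[x]))
    e[ci] = e[0] // evict a random slot by copying the front entry over it
    e[0] = pcvalueCacheEnt{targetpc: targetpc, off: off, val: val}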

For non-recursive stacks, split the cache from [16]pcvalueCacheEnt
into [2][8]pcvalueCacheEnt, and add a very cheap associative lookup.
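
The cheap associative lookup is the pcvalueCacheKey helper added below:
divide targetpc by sys.PtrSize and reduce mod 2 to pick a row, so a
lookup scans at most 8 entries instead of 16. With two hypothetical PCs
on a 64-bit system (sys.PtrSize == 8):

    pcvalueCacheKey(0x4521b0) == (0x4521b0 / 8) % 2 == 0 // probes cache.entries[0]
    pcvalueCacheKey(0x4521b8) == (0x4521b8 / 8) % 2 == 1 // probes cache.entries[1]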

name                old time/op  new time/op  delta
StackCopyPtr-8       118ms ± 1%   106ms ± 2%  -9.56%  (p=0.000 n=17+18)
StackCopy-8         95.8ms ± 1%  87.0ms ± 3%  -9.11%  (p=0.000 n=19+20)
StackCopyNoCache-8   135ms ± 2%   139ms ± 1%  +3.06%  (p=0.000 n=19+18)

During make.bash, the association function pcvalueCacheKey has this return value distribution:

percent count  return value
 53.23% 678797 1
 46.74% 596094 0

It is definitely not perfect, but it is pretty good,
and that's all we need.
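
For anyone who wants to eyeball a distribution like this outside the
runtime, here is a minimal standalone sketch. The PC sample is synthetic
(a ramp with an odd stride), not the PCs observed during make.bash, and
ptrSize/cacheKey merely mirror sys.PtrSize and pcvalueCacheKey from the
patch:

    package main

    import "fmt"

    const ptrSize = 8 // stands in for sys.PtrSize on a 64-bit system

    // cacheKey mirrors pcvalueCacheKey: align to the pointer size,
    // then reduce mod the number of cache rows (2).
    func cacheKey(targetpc uintptr) uintptr {
            return (targetpc / ptrSize) % 2
    }

    func main() {
            // Synthetic stand-in for return PCs seen during a real workload.
            var counts [2]int
            for pc := uintptr(0x401000); pc < 0x481000; pc += 13 {
                    counts[cacheKey(pc)]++
            }
            total := counts[0] + counts[1]
            fmt.Println("percent count  return value")
            for v, n := range counts {
                    fmt.Printf("%6.2f%% %6d %d\n", 100*float64(n)/float64(total), n, v)
            }
    }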

Change-Id: I2cabb1d26b99c5111bc28f427016a2a5e6c620fd
Reviewed-on: https://go-review.googlesource.com/c/110564
Run-TryBot: Josh Bleecher Snyder <josharian@gmail.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Austin Clements <austin@google.com>
src/cmd/compile/internal/gc/inl_test.go
src/runtime/symtab.go

diff --git a/src/cmd/compile/internal/gc/inl_test.go b/src/cmd/compile/internal/gc/inl_test.go
index ac86cda2b87ae63d7849acb106edb38a5f55925c..5a8c19e2cb447a67ca3a2f0e9f1ea097088288a3 100644
--- a/src/cmd/compile/internal/gc/inl_test.go
+++ b/src/cmd/compile/internal/gc/inl_test.go
@@ -55,6 +55,7 @@ func TestIntendedInlining(t *testing.T) {
                        "isDirectIface",
                        "itabHashFunc",
                        "noescape",
+                       "pcvalueCacheKey",
                        "readUnaligned32",
                        "readUnaligned64",
                        "releasem",
diff --git a/src/runtime/symtab.go b/src/runtime/symtab.go
index 1dc7ab740eef57207a12bed91b5f70f154d00683..edda45c669f67e71493e69330b04788e79a7cafe 100644
--- a/src/runtime/symtab.go
+++ b/src/runtime/symtab.go
@@ -698,7 +698,7 @@ func findfunc(pc uintptr) funcInfo {
 }
 
 type pcvalueCache struct {
-       entries [16]pcvalueCacheEnt
+       entries [2][8]pcvalueCacheEnt
 }
 
 type pcvalueCacheEnt struct {
@@ -709,6 +709,14 @@ type pcvalueCacheEnt struct {
        val int32
 }
 
+// pcvalueCacheKey returns the outermost index in a pcvalueCache to use for targetpc.
+// It must be very cheap to calculate.
+// For now, align to sys.PtrSize and reduce mod the number of entries.
+// In practice, this appears to be fairly randomly and evenly distributed.
+func pcvalueCacheKey(targetpc uintptr) uintptr {
+       return (targetpc / sys.PtrSize) % uintptr(len(pcvalueCache{}.entries))
+}
+
 func pcvalue(f funcInfo, off int32, targetpc uintptr, cache *pcvalueCache, strict bool) int32 {
        if off == 0 {
                return -1
@@ -721,13 +729,14 @@ func pcvalue(f funcInfo, off int32, targetpc uintptr, cache *pcvalueCache, stric
        // cheaper than doing the hashing for a less associative
        // cache.
        if cache != nil {
-               for i := range cache.entries {
+               x := pcvalueCacheKey(targetpc)
+               for i := range cache.entries[x] {
                        // We check off first because we're more
                        // likely to have multiple entries with
                        // different offsets for the same targetpc
                        // than the other way around, so we'll usually
                        // fail in the first clause.
-                       ent := &cache.entries[i]
+                       ent := &cache.entries[x][i]
                        if ent.off == off && ent.targetpc == targetpc {
                                return ent.val
                        }
@@ -756,9 +765,14 @@ func pcvalue(f funcInfo, off int32, targetpc uintptr, cache *pcvalueCache, stric
                        // replacement prevents a performance cliff if
                        // a recursive stack's cycle is slightly
                        // larger than the cache.
+                       // Put the new element at the beginning,
+                       // since it is the most likely to be newly used.
                        if cache != nil {
-                               ci := fastrandn(uint32(len(cache.entries)))
-                               cache.entries[ci] = pcvalueCacheEnt{
+                               x := pcvalueCacheKey(targetpc)
+                               e := &cache.entries[x]
+                               ci := fastrand() % uint32(len(cache.entries[x]))
+                               e[ci] = e[0]
+                               e[0] = pcvalueCacheEnt{
                                        targetpc: targetpc,
                                        off:      off,
                                        val:      val,
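
Taken together, the two hunks above amount to a two-row, eight-way cache
with front insertion and random eviction. A toy, self-contained model of
that policy (plain Go, using math/rand in place of the runtime's
fastrand; the types and sizes mirror the patch, but none of this is
runtime code):

    package main

    import (
            "fmt"
            "math/rand"
    )

    type ent struct {
            targetpc uintptr
            off, val int32
    }

    // Two rows of eight entries, as in the patched pcvalueCache.
    type cache struct {
            entries [2][8]ent
    }

    const ptrSize = 8

    func key(targetpc uintptr) uintptr {
            return (targetpc / ptrSize) % 2
    }

    func (c *cache) lookup(targetpc uintptr, off int32) (int32, bool) {
            x := key(targetpc)
            for i := range c.entries[x] {
                    e := &c.entries[x][i]
                    if e.off == off && e.targetpc == targetpc {
                            return e.val, true
                    }
            }
            return -1, false
    }

    func (c *cache) insert(targetpc uintptr, off, val int32) {
            x := key(targetpc)
            e := &c.entries[x]
            ci := rand.Intn(len(e))
            e[ci] = e[0]                   // copy the front entry over a random slot
            e[0] = ent{targetpc, off, val} // newest entry goes to the front
    }

    func main() {
            var c cache
            c.insert(0x4521b0, 1, 42)
            fmt.Println(c.lookup(0x4521b0, 1)) // 42 true
    }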