From: Josh Bleecher Snyder
Date: Wed, 10 May 2017 17:19:43 +0000 (-0700)
Subject: runtime: speed up stack copying
X-Git-Tag: go1.9beta1~22
X-Git-Url: http://www.git.cypherpunks.su/?a=commitdiff_plain;h=aafd96408feef0785d32fd3e1c5a67d4159a98e7;p=gostls13.git

runtime: speed up stack copying

I was surprised to see readvarint show up in a CPU profile.

Use a few simple optimizations to speed up stack copying:

* Avoid making a copy of the cache.entries array or any of its elements.
* Use a shift instead of a signed division in stackmapdata.
* Change readvarint to return the number of bytes consumed
  rather than an updated slice.
* Make some minor optimizations to readvarint to help the compiler.
* Avoid calling readvarint when the value fits in a single byte.

The first and last optimizations are the most significant,
although they all contribute a little. A standalone sketch of
the single-byte fast path follows.
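For illustration only, a hedged sketch rather than code from this CL:
pcdata varints use the standard little-endian base-128 encoding, so any
value below 0x80 fits in one byte with its continuation bit clear, and
checking that bit up front is what lets step skip the decode loop in the
common case. The decodeVarint helper below is hypothetical; it only
mirrors the shape of the new readvarint, including returning a byte
count instead of a resliced buffer.

package main

import "fmt"

// decodeVarint is a hypothetical stand-in for runtime.readvarint.
// It returns the number of bytes consumed and the decoded value,
// so the caller can advance its slice once instead of reslicing
// on every byte.
func decodeVarint(p []byte) (read, val uint32) {
	var shift, n uint32
	for {
		b := p[n]
		n++
		// Masking the shift count (shift & 31) tells the compiler the
		// shift amount is always in range, so it can drop its
		// oversized-shift fix-up code -- one of the "minor
		// optimizations" mentioned above.
		val |= uint32(b&0x7F) << (shift & 31)
		if b&0x80 == 0 {
			return n, val
		}
		shift += 7
	}
}

func main() {
	small := []byte{0x05}       // 5: high bit clear, a one-byte varint
	large := []byte{0x96, 0x01} // 150: needs two bytes

	// Fast path: a clear continuation bit means the byte is the value.
	if small[0]&0x80 == 0 {
		fmt.Println("fast path:", uint32(small[0])) // fast path: 5
	}

	n, v := decodeVarint(large)
	fmt.Println("slow path:", v, "consumed:", n) // slow path: 150 consumed: 2
}

Per the comment added in step below, roughly 70% of deltas take the
one-byte path, which is why the early-out pays for itself.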
Add a benchmark for stack copying that includes lots of
different functions in a recursive loop, to bust the cache.

This might speed up other runtime operations as well;
I only benchmarked stack copying.

name                old time/op  new time/op  delta
StackCopy-8         96.4ms ± 2%  82.7ms ± 1%  -14.24%  (p=0.000 n=20+19)
StackCopyNoCache-8   167ms ± 1%   131ms ± 1%  -21.58%  (p=0.000 n=20+20)

Change-Id: I13d5c455c65073c73b656acad86cf8e8e3c9807b
Reviewed-on: https://go-review.googlesource.com/43150
Run-TryBot: Josh Bleecher Snyder
TryBot-Result: Gobot Gobot
Reviewed-by: Austin Clements
---

diff --git a/src/runtime/stack_test.go b/src/runtime/stack_test.go
index 965c4e6838..485e327c41 100644
--- a/src/runtime/stack_test.go
+++ b/src/runtime/stack_test.go
@@ -453,3 +453,175 @@ func count(n int) int {
 	}
 	return 1 + count(n-1)
 }
+
+func BenchmarkStackCopyNoCache(b *testing.B) {
+	c := make(chan bool)
+	for i := 0; i < b.N; i++ {
+		go func() {
+			count1(1000000)
+			c <- true
+		}()
+		<-c
+	}
+}
+
+func count1(n int) int {
+	if n == 0 {
+		return 0
+	}
+	return 1 + count2(n-1)
+}
+
+func count2(n int) int {
+	if n == 0 {
+		return 0
+	}
+	return 1 + count3(n-1)
+}
+
+func count3(n int) int {
+	if n == 0 {
+		return 0
+	}
+	return 1 + count4(n-1)
+}
+
+func count4(n int) int {
+	if n == 0 {
+		return 0
+	}
+	return 1 + count5(n-1)
+}
+
+func count5(n int) int {
+	if n == 0 {
+		return 0
+	}
+	return 1 + count6(n-1)
+}
+
+func count6(n int) int {
+	if n == 0 {
+		return 0
+	}
+	return 1 + count7(n-1)
+}
+
+func count7(n int) int {
+	if n == 0 {
+		return 0
+	}
+	return 1 + count8(n-1)
+}
+
+func count8(n int) int {
+	if n == 0 {
+		return 0
+	}
+	return 1 + count9(n-1)
+}
+
+func count9(n int) int {
+	if n == 0 {
+		return 0
+	}
+	return 1 + count10(n-1)
+}
+
+func count10(n int) int {
+	if n == 0 {
+		return 0
+	}
+	return 1 + count11(n-1)
+}
+
+func count11(n int) int {
+	if n == 0 {
+		return 0
+	}
+	return 1 + count12(n-1)
+}
+
+func count12(n int) int {
+	if n == 0 {
+		return 0
+	}
+	return 1 + count13(n-1)
+}
+
+func count13(n int) int {
+	if n == 0 {
+		return 0
+	}
+	return 1 + count14(n-1)
+}
+
+func count14(n int) int {
+	if n == 0 {
+		return 0
+	}
+	return 1 + count15(n-1)
+}
+
+func count15(n int) int {
+	if n == 0 {
+		return 0
+	}
+	return 1 + count16(n-1)
+}
+
+func count16(n int) int {
+	if n == 0 {
+		return 0
+	}
+	return 1 + count17(n-1)
+}
+
+func count17(n int) int {
+	if n == 0 {
+		return 0
+	}
+	return 1 + count18(n-1)
+}
+
+func count18(n int) int {
+	if n == 0 {
+		return 0
+	}
+	return 1 + count19(n-1)
+}
+
+func count19(n int) int {
+	if n == 0 {
+		return 0
+	}
+	return 1 + count20(n-1)
+}
+
+func count20(n int) int {
+	if n == 0 {
+		return 0
+	}
+	return 1 + count21(n-1)
+}
+
+func count21(n int) int {
+	if n == 0 {
+		return 0
+	}
+	return 1 + count22(n-1)
+}
+
+func count22(n int) int {
+	if n == 0 {
+		return 0
+	}
+	return 1 + count23(n-1)
+}
+
+func count23(n int) int {
+	if n == 0 {
+		return 0
+	}
+	return 1 + count1(n-1)
+}
diff --git a/src/runtime/symtab.go b/src/runtime/symtab.go
index 8fb3d3ca94..029c2f15af 100644
--- a/src/runtime/symtab.go
+++ b/src/runtime/symtab.go
@@ -686,12 +686,13 @@ func pcvalue(f funcInfo, off int32, targetpc uintptr, cache *pcvalueCache, stric
 	// cheaper than doing the hashing for a less associative
 	// cache.
 	if cache != nil {
-		for _, ent := range cache.entries {
+		for i := range cache.entries {
 			// We check off first because we're more
 			// likely to have multiple entries with
 			// different offsets for the same targetpc
 			// than the other way around, so we'll usually
 			// fail in the first clause.
+			ent := &cache.entries[i]
 			if ent.off == off && ent.targetpc == targetpc {
 				return ent.val
 			}
@@ -836,35 +837,47 @@ func funcdata(f funcInfo, i int32) unsafe.Pointer {
 
 // step advances to the next pc, value pair in the encoded table.
 func step(p []byte, pc *uintptr, val *int32, first bool) (newp []byte, ok bool) {
-	p, uvdelta := readvarint(p)
+	// For both uvdelta and pcdelta, the common case (~70%)
+	// is that they are a single byte. If so, avoid calling readvarint.
+	uvdelta := uint32(p[0])
 	if uvdelta == 0 && !first {
 		return nil, false
 	}
+	n := uint32(1)
+	if uvdelta&0x80 != 0 {
+		n, uvdelta = readvarint(p)
+	}
+	p = p[n:]
 	if uvdelta&1 != 0 {
 		uvdelta = ^(uvdelta >> 1)
 	} else {
 		uvdelta >>= 1
 	}
 	vdelta := int32(uvdelta)
-	p, pcdelta := readvarint(p)
+	pcdelta := uint32(p[0])
+	n = 1
+	if pcdelta&0x80 != 0 {
+		n, pcdelta = readvarint(p)
+	}
+	p = p[n:]
 	*pc += uintptr(pcdelta * sys.PCQuantum)
 	*val += vdelta
 	return p, true
 }
 
 // readvarint reads a varint from p.
-func readvarint(p []byte) (newp []byte, val uint32) {
-	var v, shift uint32
+func readvarint(p []byte) (read uint32, val uint32) {
+	var v, shift, n uint32
 	for {
-		b := p[0]
-		p = p[1:]
-		v |= (uint32(b) & 0x7F) << shift
+		b := p[n]
+		n++
+		v |= uint32(b&0x7F) << (shift & 31)
 		if b&0x80 == 0 {
 			break
 		}
 		shift += 7
 	}
-	return p, v
+	return n, v
 }
 
 type stackmap struct {
@@ -878,7 +891,7 @@ func stackmapdata(stkmap *stackmap, n int32) bitvector {
 	if n < 0 || n >= stkmap.n {
 		throw("stackmapdata: index out of range")
 	}
-	return bitvector{stkmap.nbit, (*byte)(add(unsafe.Pointer(&stkmap.bytedata), uintptr(n*((stkmap.nbit+7)/8))))}
+	return bitvector{stkmap.nbit, (*byte)(add(unsafe.Pointer(&stkmap.bytedata), uintptr(n*((stkmap.nbit+7)>>3))))}
 }
 
 // inlinedCall is the encoding of entries in the FUNCDATA_InlTree table.
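
A note on the stackmapdata change, with a hedged standalone
illustration (nothing here is from the CL beyond the /8 vs >>3 swap):
Go's integer division truncates toward zero, while arithmetic right
shift floors, so n/8 and n>>3 agree only for non-negative n. The
compiler therefore has to wrap a signed division by 8 in extra fix-up
instructions; stkmap.nbit is never negative, so hand-writing the shift
drops them.

package main

import "fmt"

func main() {
	// Division truncates toward zero; shift floors. The two agree
	// only for non-negative values, which is why the compiler
	// cannot rewrite a signed n/8 as n>>3 on its own.
	for _, n := range []int32{15, 16, -15} {
		fmt.Printf("n=%d  n/8=%d  n>>3=%d\n", n, n/8, n>>3)
	}
	// Output:
	// n=15  n/8=1  n>>3=1
	// n=16  n/8=2  n>>3=2
	// n=-15  n/8=-1  n>>3=-2
}

An alternative with the same effect would be to make nbit unsigned,
since unsigned division by a power of two already compiles to a plain
shift.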