From: Josh Bleecher Snyder
Date: Wed, 10 May 2017 17:19:43 +0000 (-0700)
Subject: runtime: speed up stack copying
X-Git-Tag: go1.9beta1~22
X-Git-Url: http://www.git.cypherpunks.su/?a=commitdiff_plain;h=aafd96408feef0785d32fd3e1c5a67d4159a98e7;p=gostls13.git

runtime: speed up stack copying

I was surprised to see readvarint show up in a CPU profile.

Use a few simple optimizations to speed up stack copying:

* Avoid making a copy of the cache.entries array or any of its elements.
* Use a shift instead of a signed division in stackmapdata.
* Change readvarint to return the number of bytes consumed
  rather than an updated slice.
* Make some minor optimizations to readvarint to help the compiler.
* Avoid calling readvarint when the value fits in a single byte.

The first and last optimizations are the most significant,
although they all contribute a little. A standalone sketch of
the single-byte fast path follows.
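For illustration only, a hedged sketch rather than code from this CL:
pcdata varints use the standard little-endian base-128 encoding, so any
value below 0x80 fits in one byte with its continuation bit clear, and
checking that bit up front is what lets step skip the decode loop in the
common case. The decodeVarint helper below is hypothetical; it only
mirrors the shape of the new readvarint, including returning a byte
count instead of a resliced buffer.

package main

import "fmt"

// decodeVarint is a hypothetical stand-in for runtime.readvarint.
// It returns the number of bytes consumed and the decoded value,
// so the caller can advance its slice once instead of reslicing
// on every byte.
func decodeVarint(p []byte) (read, val uint32) {
	var shift, n uint32
	for {
		b := p[n]
		n++
		// Masking the shift count (shift & 31) tells the compiler the
		// shift amount is always in range, so it can drop its
		// oversized-shift fix-up code -- one of the "minor
		// optimizations" mentioned above.
		val |= uint32(b&0x7F) << (shift & 31)
		if b&0x80 == 0 {
			return n, val
		}
		shift += 7
	}
}

func main() {
	small := []byte{0x05}       // 5: high bit clear, a one-byte varint
	large := []byte{0x96, 0x01} // 150: needs two bytes

	// Fast path: a clear continuation bit means the byte is the value.
	if small[0]&0x80 == 0 {
		fmt.Println("fast path:", uint32(small[0])) // fast path: 5
	}

	n, v := decodeVarint(large)
	fmt.Println("slow path:", v, "consumed:", n) // slow path: 150 consumed: 2
}

Per the comment added in step below, roughly 70% of deltas take the
one-byte path, which is why the early-out pays for itself.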
Add a benchmark for stack copying that includes lots of
different functions in a recursive loop, to bust the cache.

This might speed up other runtime operations as well;
I only benchmarked stack copying.

name                old time/op  new time/op  delta
StackCopy-8         96.4ms ± 2%  82.7ms ± 1%  -14.24%  (p=0.000 n=20+19)
StackCopyNoCache-8   167ms ± 1%   131ms ± 1%  -21.58%  (p=0.000 n=20+20)

Change-Id: I13d5c455c65073c73b656acad86cf8e8e3c9807b
Reviewed-on: https://go-review.googlesource.com/43150
Run-TryBot: Josh Bleecher Snyder
TryBot-Result: Gobot Gobot
Reviewed-by: Austin Clements
---

diff --git a/src/runtime/stack_test.go b/src/runtime/stack_test.go
index 965c4e6838..485e327c41 100644
--- a/src/runtime/stack_test.go
+++ b/src/runtime/stack_test.go
@@ -453,3 +453,175 @@ func count(n int) int {
 	}
 	return 1 + count(n-1)
 }
+
+func BenchmarkStackCopyNoCache(b *testing.B) {
+	c := make(chan bool)
+	for i := 0; i < b.N; i++ {
+		go func() {
+			count1(1000000)
+			c <- true
+		}()
+		<-c
+	}
+}
+
+func count1(n int) int {
+	if n == 0 {
+		return 0
+	}
+	return 1 + count2(n-1)
+}
+
+func count2(n int) int {
+	if n == 0 {
+		return 0
+	}
+	return 1 + count3(n-1)
+}
+
+func count3(n int) int {
+	if n == 0 {
+		return 0
+	}
+	return 1 + count4(n-1)
+}
+
+func count4(n int) int {
+	if n == 0 {
+		return 0
+	}
+	return 1 + count5(n-1)
+}
+
+func count5(n int) int {
+	if n == 0 {
+		return 0
+	}
+	return 1 + count6(n-1)
+}
+
+func count6(n int) int {
+	if n == 0 {
+		return 0
+	}
+	return 1 + count7(n-1)
+}
+
+func count7(n int) int {
+	if n == 0 {
+		return 0
+	}
+	return 1 + count8(n-1)
+}
+
+func count8(n int) int {
+	if n == 0 {
+		return 0
+	}
+	return 1 + count9(n-1)
+}
+
+func count9(n int) int {
+	if n == 0 {
+		return 0
+	}
+	return 1 + count10(n-1)
+}
+
+func count10(n int) int {
+	if n == 0 {
+		return 0
+	}
+	return 1 + count11(n-1)
+}
+
+func count11(n int) int {
+	if n == 0 {
+		return 0
+	}
+	return 1 + count12(n-1)
+}
+
+func count12(n int) int {
+	if n == 0 {
+		return 0
+	}
+	return 1 + count13(n-1)
+}
+
+func count13(n int) int {
+	if n == 0 {
+		return 0
+	}
+	return 1 + count14(n-1)
+}
+
+func count14(n int) int {
+	if n == 0 {
+		return 0
+	}
+	return 1 + count15(n-1)
+}
+
+func count15(n int) int {
+	if n == 0 {
+		return 0
+	}
+	return 1 + count16(n-1)
+}
+
+func count16(n int) int {
+	if n == 0 {
+		return 0
+	}
+	return 1 + count17(n-1)
+}
+
+func count17(n int) int {
+	if n == 0 {
+		return 0
+	}
+	return 1 + count18(n-1)
+}
+
+func count18(n int) int {
+	if n == 0 {
+		return 0
+	}
+	return 1 + count19(n-1)
+}
+
+func count19(n int) int {
+	if n == 0 {
+		return 0
+	}
+	return 1 + count20(n-1)
+}
+
+func count20(n int) int {
+	if n == 0 {
+		return 0
+	}
+	return 1 + count21(n-1)
+}
+
+func count21(n int) int {
+	if n == 0 {
+		return 0
+	}
+	return 1 + count22(n-1)
+}
+
+func count22(n int) int {
+	if n == 0 {
+		return 0
+	}
+	return 1 + count23(n-1)
+}
+
+func count23(n int) int {
+	if n == 0 {
+		return 0
+	}
+	return 1 + count1(n-1)
+}
diff --git a/src/runtime/symtab.go b/src/runtime/symtab.go
index 8fb3d3ca94..029c2f15af 100644
--- a/src/runtime/symtab.go
+++ b/src/runtime/symtab.go
@@ -686,12 +686,13 @@ func pcvalue(f funcInfo, off int32, targetpc uintptr, cache *pcvalueCache, stric
 	// cheaper than doing the hashing for a less associative
 	// cache.
 	if cache != nil {
-		for _, ent := range cache.entries {
+		for i := range cache.entries {
 			// We check off first because we're more
 			// likely to have multiple entries with
 			// different offsets for the same targetpc
 			// than the other way around, so we'll usually
 			// fail in the first clause.
+			ent := &cache.entries[i]
 			if ent.off == off && ent.targetpc == targetpc {
 				return ent.val
 			}
@@ -836,35 +837,47 @@ func funcdata(f funcInfo, i int32) unsafe.Pointer {
 
 // step advances to the next pc, value pair in the encoded table.
 func step(p []byte, pc *uintptr, val *int32, first bool) (newp []byte, ok bool) {
-	p, uvdelta := readvarint(p)
+	// For both uvdelta and pcdelta, the common case (~70%)
+	// is that they are a single byte. If so, avoid calling readvarint.
+	uvdelta := uint32(p[0])
 	if uvdelta == 0 && !first {
 		return nil, false
 	}
+	n := uint32(1)
+	if uvdelta&0x80 != 0 {
+		n, uvdelta = readvarint(p)
+	}
+	p = p[n:]
 	if uvdelta&1 != 0 {
 		uvdelta = ^(uvdelta >> 1)
 	} else {
 		uvdelta >>= 1
 	}
 	vdelta := int32(uvdelta)
-	p, pcdelta := readvarint(p)
+	pcdelta := uint32(p[0])
+	n = 1
+	if pcdelta&0x80 != 0 {
+		n, pcdelta = readvarint(p)
+	}
+	p = p[n:]
 	*pc += uintptr(pcdelta * sys.PCQuantum)
 	*val += vdelta
 	return p, true
 }
 
 // readvarint reads a varint from p.
-func readvarint(p []byte) (newp []byte, val uint32) {
-	var v, shift uint32
+func readvarint(p []byte) (read uint32, val uint32) {
+	var v, shift, n uint32
 	for {
-		b := p[0]
-		p = p[1:]
-		v |= (uint32(b) & 0x7F) << shift
+		b := p[n]
+		n++
+		v |= uint32(b&0x7F) << (shift & 31)
 		if b&0x80 == 0 {
 			break
 		}
 		shift += 7
 	}
-	return p, v
+	return n, v
 }
 
 type stackmap struct {
@@ -878,7 +891,7 @@ func stackmapdata(stkmap *stackmap, n int32) bitvector {
 	if n < 0 || n >= stkmap.n {
 		throw("stackmapdata: index out of range")
 	}
-	return bitvector{stkmap.nbit, (*byte)(add(unsafe.Pointer(&stkmap.bytedata), uintptr(n*((stkmap.nbit+7)/8))))}
+	return bitvector{stkmap.nbit, (*byte)(add(unsafe.Pointer(&stkmap.bytedata), uintptr(n*((stkmap.nbit+7)>>3))))}
 }
 
 // inlinedCall is the encoding of entries in the FUNCDATA_InlTree table.
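
A note on the stackmapdata change, with a hedged standalone
illustration (nothing here is from the CL beyond the /8 vs >>3 swap):
Go's integer division truncates toward zero, while arithmetic right
shift floors, so n/8 and n>>3 agree only for non-negative n. The
compiler therefore has to wrap a signed division by 8 in extra fix-up
instructions; stkmap.nbit is never negative, so hand-writing the shift
drops them.

package main

import "fmt"

func main() {
	// Division truncates toward zero; shift floors. The two agree
	// only for non-negative values, which is why the compiler
	// cannot rewrite a signed n/8 as n>>3 on its own.
	for _, n := range []int32{15, 16, -15} {
		fmt.Printf("n=%d  n/8=%d  n>>3=%d\n", n, n/8, n>>3)
	}
	// Output:
	// n=15  n/8=1  n>>3=1
	// n=16  n/8=2  n>>3=2
	// n=-15  n/8=-1  n>>3=-2
}

An alternative with the same effect would be to make nbit unsigned,
since unsigned division by a power of two already compiles to a plain
shift.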