From 899a4ad47e452ede041fdb99204575a407dd94f2 Mon Sep 17 00:00:00 2001
From: Rick Hudson
Date: Wed, 15 Apr 2015 17:08:58 -0400
Subject: [PATCH] runtime: Speed up heapBitsForObject

Optimized heapBitsForObject by special-casing objects whose size is a
power of two. When a span holding such objects is initialized, I added
a mask that, when &ed with an interior pointer, yields the base of the
object containing that pointer.

For the garbage benchmark this resulted in CPU_CLK_UNHALTED in
heapBitsForObject going from 7.7% down to 5.9% of the total;
INST_RETIRED went from 12.2% to 8.7%.

Here are the benchmarks that changed by at least plus or minus 1%.

benchmark                        old ns/op     new ns/op     delta
BenchmarkFmtFprintfString        249           221           -11.24%
BenchmarkFmtFprintfInt           247           223           -9.72%
BenchmarkFmtFprintfEmpty         76.5          69.6          -9.02%
BenchmarkBinaryTree17            4106631412    3744550160    -8.82%
BenchmarkFmtFprintfFloat         424           399           -5.90%
BenchmarkGoParse                 4484421       4242115       -5.40%
BenchmarkGobEncode               8803668       8449107       -4.03%
BenchmarkFmtManyArgs             1494          1436          -3.88%
BenchmarkGobDecode               10431051      10032606      -3.82%
BenchmarkFannkuch11              2591306713    2517400464    -2.85%
BenchmarkTimeParse               361           371           +2.77%
BenchmarkJSONDecode              70620492      68830357      -2.53%
BenchmarkRegexpMatchMedium_1K    54693         53343         -2.47%
BenchmarkTemplate                90008879      91929940      +2.13%
BenchmarkTimeFormat              380           387           +1.84%
BenchmarkRegexpMatchEasy1_32     111           113           +1.80%
BenchmarkJSONEncode              21359159      21007583      -1.65%
BenchmarkRegexpMatchEasy1_1K     603           613           +1.66%
BenchmarkRegexpMatchEasy0_32     127           129           +1.57%
BenchmarkFmtFprintfIntInt        399           393           -1.50%
BenchmarkRegexpMatchEasy0_1K     373           378           +1.34%

Change-Id: I78e297161026f8b5cc7507c965fd3e486f81ed29
Reviewed-on: https://go-review.googlesource.com/8980
Reviewed-by: Austin Clements
---
 src/runtime/mbitmap.go | 41 ++++++++++++++++++++---------------------
 src/runtime/mheap.go   |  5 ++++-
 src/runtime/msize.go   | 17 ++++++++++++++---
 3 files changed, 38 insertions(+), 25 deletions(-)

diff --git a/src/runtime/mbitmap.go b/src/runtime/mbitmap.go
index 5dad2a0782..f0704bdb5d 100644
--- a/src/runtime/mbitmap.go
+++ b/src/runtime/mbitmap.go
@@ -154,17 +154,16 @@ func heapBitsForSpan(base uintptr) (hbits heapBits) {
 // return base == 0
 // otherwise return the base of the object.
 func heapBitsForObject(p uintptr) (base uintptr, hbits heapBits, s *mspan) {
-	if p < mheap_.arena_start || p >= mheap_.arena_used {
+	arenaStart := mheap_.arena_start
+	if p < arenaStart || p >= mheap_.arena_used {
 		return
 	}
-
+	off := p - arenaStart
+	idx := off >> _PageShift
 	// p points into the heap, but possibly to the middle of an object.
 	// Consult the span table to find the block beginning.
-	// TODO(rsc): Factor this out.
 	k := p >> _PageShift
-	x := k
-	x -= mheap_.arena_start >> _PageShift
-	s = h_spans[x]
+	s = h_spans[idx]
 	if s == nil || pageID(k) < s.start || p >= s.limit || s.state != mSpanInUse {
 		if s == nil || s.state == _MSpanStack {
 			// If s is nil, the virtual address has never been part of the heap.
@@ -188,23 +187,23 @@ func heapBitsForObject(p uintptr) (base uintptr, hbits heapBits, s *mspan) {
 			printunlock()
 			throw("objectstart: bad pointer in unexpected span")
 		}
-		return
 	}
-	base = s.base()
-	if p-base >= s.elemsize {
-		// n := (p - base) / s.elemsize, using division by multiplication
-		n := uintptr(uint64(p-base) >> s.divShift * uint64(s.divMul) >> s.divShift2)
-
-		const debugMagic = false
-		if debugMagic {
-			n2 := (p - base) / s.elemsize
-			if n != n2 {
-				println("runtime: bad div magic", (p - base), s.elemsize, s.divShift, s.divMul, s.divShift2)
-				throw("bad div magic")
-			}
+	// If this span holds objects of a power of 2 size, just mask off the bits to
+	// the interior of the object. Otherwise use the size to get the base.
+	if s.baseMask != 0 {
+		// optimize for power of 2 sized objects.
+		base = s.base()
+		base = base + (p-base)&s.baseMask
+		// base = p & s.baseMask is faster for small spans,
+		// but doesn't work for large spans.
+		// Overall, it's faster to use the more general computation above.
+	} else {
+		base = s.base()
+		if p-base >= s.elemsize {
+			// n := (p - base) / s.elemsize, using division by multiplication
+			n := uintptr(uint64(p-base) >> s.divShift * uint64(s.divMul) >> s.divShift2)
+			base += n * s.elemsize
 		}
-
-		base += n * s.elemsize
 	}
 	// Now that we know the actual base, compute heapBits to return to caller.
 	hbits = heapBitsForAddr(base)
diff --git a/src/runtime/mheap.go b/src/runtime/mheap.go
index c5de8218c2..fe44231e7b 100644
--- a/src/runtime/mheap.go
+++ b/src/runtime/mheap.go
@@ -24,7 +24,6 @@ type mheap struct {
 	nspan     uint32
 	sweepgen  uint32 // sweep generation, see comment in mspan
 	sweepdone uint32 // all spans are swept
-
 	// span lookup
 	spans        **mspan
 	spans_mapped uintptr
@@ -99,6 +98,7 @@ type mspan struct {
 	// if sweepgen == h->sweepgen - 1, the span is currently being swept
 	// if sweepgen == h->sweepgen, the span is swept and ready to use
 	// h->sweepgen is incremented by 2 after every GC
+
 	sweepgen uint32
 	divMul   uint32 // for divide by elemsize - divMagic.mul
 	ref      uint16 // capacity - number of objects in freelist
@@ -114,6 +114,7 @@ type mspan struct {
 	limit       uintptr  // end of data in span
 	speciallock mutex    // guards specials list
 	specials    *special // linked list of special records sorted by offset.
+	baseMask    uintptr  // if non-0, elemsize is a power of 2, & this will get object allocation base
 }
 
 func (s *mspan) base() uintptr {
@@ -384,12 +385,14 @@ func mHeap_Alloc_m(h *mheap, npage uintptr, sizeclass int32, large bool) *mspan
 		s.divShift = 0
 		s.divMul = 0
 		s.divShift2 = 0
+		s.baseMask = 0
 	} else {
 		s.elemsize = uintptr(class_to_size[sizeclass])
 		m := &class_to_divmagic[sizeclass]
 		s.divShift = m.shift
 		s.divMul = m.mul
 		s.divShift2 = m.shift2
+		s.baseMask = m.baseMask
 	}
 
 	// update stats, sweep lists
diff --git a/src/runtime/msize.go b/src/runtime/msize.go
index 9ba145dbf6..bc735beb42 100644
--- a/src/runtime/msize.go
+++ b/src/runtime/msize.go
@@ -215,14 +215,24 @@ func roundupsize(size uintptr) uintptr {
 // http://ridiculousfish.com/blog/posts/labor-of-division-episode-i.html
 // http://ridiculousfish.com/blog/posts/labor-of-division-episode-iii.html
 type divMagic struct {
-	shift  uint8
-	mul    uint32
-	shift2 uint8
+	shift    uint8
+	mul      uint32
+	shift2   uint8
+	baseMask uintptr
 }
 
 func computeDivMagic(d uint32) divMagic {
 	var m divMagic
 
+	// If the size is a power of two, heapBitsForObject can divide even faster by masking.
+	// Compute this mask.
+	if d&(d-1) == 0 {
+		// It is a power of 2 (assuming d != 1).
+		m.baseMask = ^(uintptr(d) - 1)
+	} else {
+		m.baseMask = 0
+	}
+
 	// Compute pre-shift by factoring power of 2 out of d.
 	for d&1 == 0 {
 		m.shift++
@@ -239,5 +249,6 @@ func computeDivMagic(d uint32) divMagic {
 	}
 	m.mul = uint32(((1 << k) + d64 - 1) / d64) // ⌈2^k / d⌉
 	m.shift2 = k
+
 	return m
 }
-- 
2.48.1
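The heart of the patch is the baseMask fast path in heapBitsForObject, and it is easiest to see outside the runtime. The following minimal, self-contained Go sketch is illustrative only: spanBase, elemSize, and p are made-up values standing in for s.base(), s.elemsize, and a heap pointer. Only the mask construction (^(size-1), as in computeDivMagic) and the expression shape (base + (p-base)&mask, as in heapBitsForObject) come from the patch.

// Illustrative sketch only (not runtime code): recover the base of the
// object containing an interior pointer p when the element size is a
// power of 2. spanBase, elemSize, and p are hypothetical values.
package main

import "fmt"

func main() {
	const (
		spanBase = uintptr(0x10000) // hypothetical span start
		elemSize = uintptr(64)      // power-of-2 object size
	)

	// As in computeDivMagic: for a power-of-2 size d, the mask is ^(d-1),
	// which clears the bits that index within a single object.
	baseMask := ^(elemSize - 1)

	// An interior pointer 17 bytes into the span's 4th object.
	p := spanBase + 3*elemSize + 17

	// The patched fast path: one subtract, one AND, one add.
	base := spanBase + (p-spanBase)&baseMask

	// The general path it bypasses: divide by the object size.
	base2 := spanBase + (p-spanBase)/elemSize*elemSize

	fmt.Printf("p=%#x base=%#x base2=%#x\n", p, base, base2) // bases agree
}

Masking p directly (p & baseMask) would save the subtraction, but as the new comment in heapBitsForObject notes, that is only correct when the span's base address is itself aligned to the element size, which holds for small spans but not for large ones; hence the span-relative form.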
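For the remaining, non-power-of-2 size classes, the patch keeps the existing division-by-multiplication path, n := uintptr(uint64(p-base) >> s.divShift * uint64(s.divMul) >> s.divShift2). The sketch below shows why that expression shape computes off/d: factor the powers of two out of d as a pre-shift (divMagic.shift), then replace division by the remaining odd factor with a multiply by ⌈2^k/odd⌉ (divMagic.mul) and a post-shift by k (divMagic.shift2). The fixed k = 32 and the brute-force verification are simplifications of this sketch; the runtime chooses k in computeDivMagic code outside this hunk.

// Illustrative sketch only: why ((off >> shift) * mul) >> shift2
// equals off/d for a non-power-of-2 divisor d.
package main

import "fmt"

func main() {
	const d = uint64(48) // e.g. a 48-byte size class

	// Pre-shift: factor powers of 2 out of d (plays divMagic.shift).
	shift := uint(0)
	odd := d
	for odd&1 == 0 {
		shift++
		odd >>= 1
	}

	// mul = ⌈2^k / odd⌉ (plays divMagic.mul); k plays divMagic.shift2.
	const k = 32
	mul := (uint64(1)<<k + odd - 1) / odd

	// Check against real division for every offset within a 64 KiB span.
	for off := uint64(0); off < 1<<16; off++ {
		if off>>shift*mul>>k != off/d { // same shape as the runtime expression
			fmt.Println("mismatch at off =", off)
			return
		}
	}
	fmt.Printf("off/%d == off>>%d * %d >> %d for all off < 1<<16\n", d, shift, mul, k)
}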