]> Cypherpunks repositories - gostls13.git/commitdiff
runtime: use multiply instead of divide in heapBitsForObject
authorRuss Cox <rsc@golang.org>
Wed, 4 Mar 2015 16:34:50 +0000 (11:34 -0500)
committerRuss Cox <rsc@golang.org>
Wed, 4 Mar 2015 17:46:47 +0000 (17:46 +0000)
These benchmarks show the effect of the combination of this change
and Rick's pending CL 6665. Code with interior pointers is helped
much more than code without, but even code without doesn't suffer
too badly.

benchmark                          old ns/op      new ns/op      delta
BenchmarkBinaryTree17              6989407768     6851728175     -1.97%
BenchmarkFannkuch11                4416250775     4405762558     -0.24%
BenchmarkFmtFprintfEmpty           134            130            -2.99%
BenchmarkFmtFprintfString          491            402            -18.13%
BenchmarkFmtFprintfInt             430            420            -2.33%
BenchmarkFmtFprintfIntInt          748            663            -11.36%
BenchmarkFmtFprintfPrefixedInt     602            534            -11.30%
BenchmarkFmtFprintfFloat           728            699            -3.98%
BenchmarkFmtManyArgs               2528           2507           -0.83%
BenchmarkGobDecode                 17448191       17749756       +1.73%
BenchmarkGobEncode                 14579824       14370183       -1.44%
BenchmarkGzip                      656489990      652669348      -0.58%
BenchmarkGunzip                    141254147      141099278      -0.11%
BenchmarkHTTPClientServer          94111          93738          -0.40%
BenchmarkJSONEncode                36305013       36696440       +1.08%
BenchmarkJSONDecode                124652000      128176454      +2.83%
BenchmarkMandelbrot200             6009333        5997093        -0.20%
BenchmarkGoParse                   7651583        7623494        -0.37%
BenchmarkRegexpMatchEasy0_32       213            213            +0.00%
BenchmarkRegexpMatchEasy0_1K       511            494            -3.33%
BenchmarkRegexpMatchEasy1_32       186            187            +0.54%
BenchmarkRegexpMatchEasy1_1K       1834           1827           -0.38%
BenchmarkRegexpMatchMedium_32      427            412            -3.51%
BenchmarkRegexpMatchMedium_1K      154841         153086         -1.13%
BenchmarkRegexpMatchHard_32        7473           7478           +0.07%
BenchmarkRegexpMatchHard_1K        233587         232272         -0.56%
BenchmarkRevcomp                   918797689      944528032      +2.80%
BenchmarkTemplate                  167665081      167773121      +0.06%
BenchmarkTimeParse                 631            636            +0.79%
BenchmarkTimeFormat                672            666            -0.89%

Change-Id: Ia923de3cdb3993b640fe0a02cbe2c7babc16f32c
Reviewed-on: https://go-review.googlesource.com/6782
Reviewed-by: Rick Hudson <rlh@golang.org>
Reviewed-by: Austin Clements <austin@google.com>
src/runtime/mbitmap.go
src/runtime/mheap.go
src/runtime/msize.go

index 45920443639c9b776e448654905a2fbbd649fc82..6b46ad18cbfb1227cb302ae04c8e403138a73767 100644 (file)
@@ -202,7 +202,19 @@ func heapBitsForObject(p uintptr) (base uintptr, hbits heapBits) {
        }
        base = s.base()
        if p-base >= s.elemsize {
-               base += (p - base) / s.elemsize * s.elemsize
+               // n := (p - base) / s.elemsize, using division by multiplication
+               n := uintptr(uint64(p-base) >> s.divShift * uint64(s.divMul) >> s.divShift2)
+
+               const debugMagic = false
+               if debugMagic {
+                       n2 := (p - base) / s.elemsize
+                       if n != n2 {
+                               println("runtime: bad div magic", (p - base), s.elemsize, s.divShift, s.divMul, s.divShift2)
+                               throw("bad div magic")
+                       }
+               }
+
+               base += n * s.elemsize
        }
        if base == p {
                print("runtime: failed to find block beginning for ", hex(p), " s=", hex(s.start*_PageSize), " s.limit=", hex(s.limit), "\n")
index 94ef4de56aae74eb7ed1adabb82a8afe54750518..fc4dfeea975c6016eba34fa31d5776fa7665d0da 100644 (file)
@@ -101,11 +101,14 @@ type mspan struct {
        // if sweepgen == h->sweepgen, the span is swept and ready to use
        // h->sweepgen is incremented by 2 after every GC
        sweepgen    uint32
+       divMul      uint32   // for divide by elemsize - divMagic.mul
        ref         uint16   // capacity - number of objects in freelist
        sizeclass   uint8    // size class
        incache     bool     // being used by an mcache
        state       uint8    // mspaninuse etc
        needzero    uint8    // needs to be zeroed before allocation
+       divShift    uint8    // for divide by elemsize - divMagic.shift
+       divShift2   uint8    // for divide by elemsize - divMagic.shift2
        elemsize    uintptr  // computed from sizeclass or from npages
        unusedsince int64    // first time spotted by gc in mspanfree state
        npreleased  uintptr  // number of pages released to the os
@@ -385,8 +388,15 @@ func mHeap_Alloc_m(h *mheap, npage uintptr, sizeclass int32, large bool) *mspan
                s.sizeclass = uint8(sizeclass)
                if sizeclass == 0 {
                        s.elemsize = s.npages << _PageShift
+                       s.divShift = 0
+                       s.divMul = 0
+                       s.divShift2 = 0
                } else {
                        s.elemsize = uintptr(class_to_size[sizeclass])
+                       m := &class_to_divmagic[sizeclass]
+                       s.divShift = m.shift
+                       s.divMul = m.mul
+                       s.divShift2 = m.shift2
                }
 
                // update stats, sweep lists
index 370cae629e237b29031dde73c9927685bf643a59..f2a7cb9dddf0ce9d84b95b3edd686ab1975a2632 100644 (file)
@@ -48,6 +48,8 @@ package runtime
 
 var class_to_size [_NumSizeClasses]int32
 var class_to_allocnpages [_NumSizeClasses]int32
+var class_to_divmagic [_NumSizeClasses]divMagic
+
 var size_to_class8 [1024/8 + 1]int8
 var size_to_class128 [(_MaxSmallSize-1024)/128 + 1]int8
 
@@ -144,6 +146,11 @@ func initSizes() {
        for i := 0; i < len(class_to_size); i++ {
                memstats.by_size[i].size = uint32(class_to_size[i])
        }
+
+       for i := 1; i < len(class_to_size); i++ {
+               class_to_divmagic[i] = computeDivMagic(uint32(class_to_size[i]))
+       }
+
        return
 
 dump:
@@ -182,3 +189,55 @@ func roundupsize(size uintptr) uintptr {
        }
        return round(size, _PageSize)
 }
+
+// divMagic holds magic constants to implement division
+// by a particular constant as a shift, multiply, and shift.
+// That is, given
+//     m = computeMagic(d)
+// then
+//     n/d == ((n>>m.shift) * m.mul) >> m.shift2
+//
+// The magic computation picks m such that
+//     d = d₁*d₂
+//     d₂= 2^m.shift
+//     m.mul = ⌈2^m.shift2 / d₁⌉
+//
+// The magic computation here is tailored for malloc block sizes
+// and does not handle arbitrary d correctly. Malloc block sizes d are
+// always even, so the first shift implements the factors of 2 in d
+// and then the mul and second shift implement the odd factor
+// that remains. Because the first shift divides n by at least 2 (actually 8)
+// before the multiply gets involved, the huge corner cases that
+// require additional adjustment are impossible, so the usual
+// fixup is not needed.
+//
+// For more details see Hacker's Delight, Chapter 10, and
+// http://ridiculousfish.com/blog/posts/labor-of-division-episode-i.html
+// http://ridiculousfish.com/blog/posts/labor-of-division-episode-iii.html
+type divMagic struct {
+       shift  uint8
+       mul    uint32
+       shift2 uint8
+}
+
+func computeDivMagic(d uint32) divMagic {
+       var m divMagic
+
+       // Compute pre-shift by factoring power of 2 out of d.
+       for d&1 == 0 {
+               m.shift++
+               d >>= 1
+       }
+
+       // Compute largest k such that ⌈2^k / d⌉ fits in a 32-bit int.
+       // This is always a good enough approximation.
+       // We could use smaller k for some divisors but there's no point.
+       k := uint8(63)
+       d64 := uint64(d)
+       for ((1<<k)+d64-1)/d64 >= 1<<32 {
+               k--
+       }
+       m.mul = uint32(((1 << k) + d64 - 1) / d64) //  ⌈2^k / d⌉
+       m.shift2 = k
+       return m
+}