From: Russ Cox <rsc@golang.org>
Date: Mon, 4 May 2015 15:30:10 +0000 (-0400)
Subject: runtime: optimize heapBitsSetType
X-Git-Tag: go1.5beta1~659
X-Git-Url: http://www.git.cypherpunks.su/?a=commitdiff_plain;h=feb8a3b6169504040024e22b3eef5959554f9dbd;p=gostls13.git

runtime: optimize heapBitsSetType

For the conversion of the heap bitmap from 4-bit to 2-bit fields,
I replaced heapBitsSetType with the dumbest thing that could possibly
work: two atomic operations (atomicand8+atomicor8) per 2-bit field.

This CL replaces that code with a proper implementation that avoids
the atomics whenever possible. Benchmarks vs base CL (before the
conversion to 2-bit heap bitmap) and vs Go 1.4 below.

Compared to Go 1.4, SetTypePtr (a 1-pointer allocation) is 10ns slower
because a race against the concurrent GC requires the use of an
atomicor8 that used to be an ordinary write. This slowdown was present
even in the base CL.

Compared to both Go 1.4 and base, SetTypeNode8 (a 10-word allocation)
is 10ns slower because it too needs a new atomic: with the denser
representation, the byte at the end of the allocation is now shared
with the object next to it; this was not true with the 4-bit
representation.

Excluding these two (fundamental) slowdowns due to the use of atomics,
the new code is noticeably faster than both Go 1.4 and the base CL.

The next CL will reintroduce the ``typeDead'' optimization.

Stats are from 5 runs on a MacBookPro10,2 (late 2012 Core i5).

Compared to base CL (** = new atomic)
name                  old mean              new mean              delta
SetTypePtr            14.1ns × (0.99,1.02)  14.7ns × (0.93,1.10)  ~        (p=0.175)
SetTypePtr8           18.4ns × (1.00,1.01)  18.6ns × (0.81,1.21)  ~        (p=0.866)
SetTypePtr16          28.7ns × (1.00,1.00)  22.4ns × (0.90,1.27)  -21.88%  (p=0.015)
SetTypePtr32          52.3ns × (1.00,1.00)  33.8ns × (0.93,1.24)  -35.37%  (p=0.001)
SetTypePtr64          79.2ns × (1.00,1.00)  55.1ns × (1.00,1.01)  -30.43%  (p=0.000)
SetTypePtr126          118ns × (1.00,1.00)   100ns × (1.00,1.00)  -15.97%  (p=0.000)
SetTypePtr128          130ns × (0.92,1.19)    98ns × (1.00,1.00)  -24.36%  (p=0.008)
SetTypePtrSlice        726ns × (0.96,1.08)   760ns × (1.00,1.00)  ~        (p=0.152)
SetTypeNode1          14.1ns × (0.94,1.15)  12.0ns × (1.00,1.01)  -14.60%  (p=0.020)
SetTypeNode1Slice      135ns × (0.96,1.07)    88ns × (1.00,1.00)  -34.53%  (p=0.000)
SetTypeNode8          20.9ns × (1.00,1.01)  32.6ns × (1.00,1.00)  +55.37%  (p=0.000) **
SetTypeNode8Slice      414ns × (0.99,1.02)   244ns × (1.00,1.00)  -41.09%  (p=0.000)
SetTypeNode64         80.0ns × (1.00,1.00)  57.4ns × (1.00,1.00)  -28.23%  (p=0.000)
SetTypeNode64Slice    2.15µs × (1.00,1.01)  1.56µs × (1.00,1.00)  -27.43%  (p=0.000)
SetTypeNode124         119ns × (0.99,1.00)   100ns × (1.00,1.00)  -16.11%  (p=0.000)
SetTypeNode124Slice   3.40µs × (1.00,1.00)  2.93µs × (1.00,1.00)  -13.80%  (p=0.000)
SetTypeNode126         120ns × (1.00,1.01)    98ns × (1.00,1.00)  -18.19%  (p=0.000)
SetTypeNode126Slice   3.53µs × (0.98,1.08)  3.02µs × (1.00,1.00)  -14.49%  (p=0.002)
SetTypeNode1024        726ns × (0.97,1.09)   740ns × (1.00,1.00)  ~        (p=0.451)
SetTypeNode1024Slice  24.9µs × (0.89,1.37)  23.1µs × (1.00,1.00)  ~        (p=0.476)

Compared to Go 1.4 (** = new atomic)
name                 old mean              new mean               delta
SetTypePtr           5.71ns × (0.89,1.19)  14.68ns × (0.93,1.10)  +157.24%  (p=0.000) **
SetTypePtr8          19.3ns × (0.96,1.10)   18.6ns × (0.81,1.21)  ~         (p=0.638)
SetTypePtr16         30.7ns × (0.99,1.03)   22.4ns × (0.90,1.27)  -26.88%   (p=0.005)
SetTypePtr32         51.5ns × (1.00,1.00)   33.8ns × (0.93,1.24)  -34.40%   (p=0.001)
SetTypePtr64         83.6ns × (0.94,1.12)   55.1ns × (1.00,1.01)  -34.12%   (p=0.001)
SetTypePtr126         137ns × (0.87,1.26)    100ns × (1.00,1.00)  -27.10%   (p=0.028)
SetTypePtrSlice       865ns × (0.80,1.23)    760ns × (1.00,1.00)  ~         (p=0.243)
SetTypeNode1         15.2ns × (0.88,1.12)   12.0ns × (1.00,1.01)  -20.89%   (p=0.014)
SetTypeNode1Slice     156ns × (0.93,1.16)     88ns × (1.00,1.00)  -43.57%   (p=0.001)
SetTypeNode8         23.8ns × (0.90,1.18)   32.6ns × (1.00,1.00)  +36.76%   (p=0.003) **
SetTypeNode8Slice     502ns × (0.92,1.10)    244ns × (1.00,1.00)  -51.46%   (p=0.000)
SetTypeNode64        85.6ns × (0.94,1.11)   57.4ns × (1.00,1.00)  -32.89%   (p=0.001)
SetTypeNode64Slice   2.36µs × (0.91,1.14)   1.56µs × (1.00,1.00)  -33.96%   (p=0.002)
SetTypeNode124        130ns × (0.91,1.12)    100ns × (1.00,1.00)  -23.49%   (p=0.004)
SetTypeNode124Slice  3.81µs × (0.90,1.22)   2.93µs × (1.00,1.00)  -23.09%   (p=0.025)

There are fewer benchmarks vs Go 1.4 because unrolling directly into
the heap bitmap is not yet implemented, so those would not be
meaningful comparisons. These benchmarks were not present in Go 1.4
as distributed. The backport to Go 1.4 is in github.com/rsc/go's
go14bench branch, commit 71d5ee5.

Change-Id: I95ed05a22bf484b0fc9efad549279e766c98d2b6
Reviewed-on: https://go-review.googlesource.com/9704
Reviewed-by: Rick Hudson <rlh@golang.org>
---
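As background for the patch below: in the 2-bit heap bitmap, each word of heap
data gets one 2-bit entry, a pointer bit and a mark bit, and four entries are
packed into each bitmap byte. The entries for a small object can therefore sit
in the same byte as a neighboring object's entries. The following sketch is
illustrative only, not runtime code: orBits is a hypothetical stand-in for the
runtime's atomicor8, operating on a uint32 because sync/atomic exposes no
8-bit operations. It shows the layout and why a write into a shared byte must
be atomic:

package main

import (
	"fmt"
	"sync/atomic"
)

const (
	bitPointer    = 1 << 0 // low bit of an entry: this word holds a pointer
	bitMarked     = 1 << 1 // high bit of an entry: mark / "object continues"
	heapBitsWidth = 2      // bits of bitmap per word of heap data
)

// orBits ORs a 2-bit entry for word i into a shared bitmap word.
// The allocator and the concurrent GC marker may touch different
// entries of the same byte at the same time, so the update must be
// atomic (here a load/CAS loop standing in for atomicor8).
func orBits(bm *uint32, i uint, bits uint32) {
	shift := (i % 16) * heapBitsWidth // 16 two-bit entries fit in a uint32
	for {
		old := atomic.LoadUint32(bm)
		if atomic.CompareAndSwapUint32(bm, old, old|bits<<shift) {
			return
		}
	}
}

func main() {
	var bm uint32
	orBits(&bm, 0, bitPointer)           // word 0: pointer; mark bit not ours to set
	orBits(&bm, 1, bitPointer)           // word 1: pointer; mark bit not ours to set
	orBits(&bm, 2, bitPointer|bitMarked) // word 2: pointer, marked "continues"
	fmt.Printf("%08b\n", bm)             // prints 00110101
}

This byte sharing is exactly why SetTypePtr pays for an atomicor8 in the
numbers above: a one-word object's entry always shares its bitmap byte with
entries belonging to up to three other words.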
diff --git a/src/runtime/mbitmap.go b/src/runtime/mbitmap.go
index dea6879adc..b866d7f732 100644
--- a/src/runtime/mbitmap.go
+++ b/src/runtime/mbitmap.go
@@ -476,12 +476,33 @@ func heapBitsSweepSpan(base, size, n uintptr, f func(uintptr)) {
 // (The number of values is given by dataSize / typ.size.)
 // If dataSize < size, the fragment [x+dataSize, x+size) is
 // recorded as non-pointer data.
+// It is known that the type has pointers somewhere;
+// malloc does not call heapBitsSetType when there are no pointers,
+// because all free objects are marked as noscan during
+// heapBitsSweepSpan.
+// There can only be one allocation from a given span active at a time,
+// so this code is not racing with other instances of itself,
+// and we don't allocate from a span until it has been swept,
+// so this code is not racing with heapBitsSweepSpan.
+// It is, however, racing with the concurrent GC mark phase,
+// which can be setting the mark bit in the leading 2-bit entry
+// of an allocated block. The block we are modifying is not quite
+// allocated yet, so the GC marker is not racing with updates to x's bits,
+// but if the start or end of x shares a bitmap byte with an adjacent
+// object, the GC marker is racing with updates to those objects' mark bits.
 func heapBitsSetType(x, size, dataSize uintptr, typ *_type) {
 	// From here till marked label marking the object as allocated
 	// and storing type info in the GC bitmap.
 	h := heapBitsForAddr(x)
-	var ptrmask *uint8
+
+	// dataSize is always size rounded up to the next malloc size class,
+	// except in the case of allocating a defer block, in which case
+	// size is sizeof(_defer{}) (at least 6 words) and dataSize may be
+	// arbitrarily larger.
+	//
+	// The checks for size == ptrSize and size == 2*ptrSize can therefore
+	// assume that dataSize == size without checking it explicitly.
+
 	if size == ptrSize {
 		// It's one word and it has pointers, it must be a pointer.
 		// The bitmap byte is shared with the one-word object
@@ -494,6 +515,8 @@ func heapBitsSetType(x, size, dataSize uintptr, typ *_type) {
 		atomicor8(h.bitp, bitPointer<<h.shift)
 		return
 	}
 
+	ptrmask := (*uint8)(unsafe.Pointer(typ.gc[0])) // pointer to unrolled mask
+
+	// Phase 1: Prepare the ptrmask input and emit the first byte or
+	// half-byte of the bitmap, which may be shared with another object.
+	var (
+		hb    uintptr // bits being prepared for *hbitp
+		b     uintptr // ptrmask bits already loaded
+		nb    uintptr // number of bits in b at next read
+		p     *uint8  // next ptrmask byte to read
+		endp  *uint8  // final ptrmask byte to read (then repeat)
+		endnb uintptr // number of valid bits in *endp
+		pbits uintptr // alternate source of bits, for short repetitions
+	)
+	nptr := typ.size / ptrSize
+	b = uintptr(*ptrmask)
+	nb = 8
+	p = addb(ptrmask, 1)
+	endp = addb(ptrmask, (nptr+7)/8-1)
+	endnb = nptr - 8*((nptr+7)/8-1)
+	hbitp := h.bitp          // next heap bitmap byte to write
+	nw := dataSize / ptrSize // number of words to describe
+	w := uintptr(0)          // number of words processed
-	nptr := typ.size / ptrSize
-	for i := uintptr(0); i < dataSize/ptrSize; i++ {
-		j := i % nptr
-		if (*addb(ptrmask, j/8)>>(j%8))&1 != 0 {
-			atomicor8(h.bitp, bitPointer<<h.shift)
-		} else {
-			atomicand8(h.bitp, ^(bitPointer << h.shift))
-		}
+	switch h.shift {
+	case 0:
+		// Ptrmask and heap bitmap are aligned.
+		// The first byte we write covers the first four words of the object.
+		// The mark bits of the first two words are not ours to set, so hb
+		// gets only their pointer bits, plus markers for words 2 and 3.
+		hb = b&1 | (b&2)<<(heapBitsWidth-1) | (b&4)<<(2*heapBitsWidth-2) | (b&8)<<(3*heapBitsWidth-3)
+		hb |= bitMarked<<(2*heapBitsWidth) | bitMarked<<(3*heapBitsWidth)
+		if w += 4; w >= nw {
+			goto Phase3
+		}
-		if i >= 2 {
-			atomicor8(h.bitp, bitMarked<<h.shift)
-		}
+		*hbitp = uint8(hb)
+		hbitp = subtractb(hbitp, 1)
+		b >>= 4
+		nb -= 4
+
+	case 4:
+		// Ptrmask and heap bitmap are misaligned.
+		// The bits for the first two words are in a byte shared with another object
+		// and must be updated atomically.
+		// NOTE(rsc): The atomic here may not be necessary.
+		// We took care of 1-word and 2-word objects above,
+		// so this is at least a 6-word object, so our start bits
+		// are shared only with the type bits of another object,
+		// not with its mark bit. Since there is only one allocation
+		// from a given span at a time, we should be able to set
+		// these bits non-atomically. Not worth the risk right now.
+		hb = (b&1)<<4 | (b&2)<<(4+heapBitsWidth-1) // bits being prepared for *h.bitp
+		b >>= 2
+		nb -= 2
+		// Note: no bitMarked in hb because the first two words don't get markers from us.
+		atomicor8(hbitp, uint8(hb))
+		hbitp = subtractb(hbitp, 1)
+
+		// Expand 8-bit chunks of ptrmask into pairs of heap bitmap bytes.
+		// We know the object size is a multiple of 2 words but not 4, so the
+		// object size minus the 2 words we just handled is a multiple of 4,
+		// so we can use non-atomic writes to the heap bitmap for the
+		// rest of this code, even for the final fragment or a trailing dead marker byte.
+
+		// Loop prepares bits for final byte but stops before writing them,
+		// so that in the case where we need to write only part of a byte,
+		// the code below the loop can truncate the bitMarked.
+		w += 2
+	}
+
+	// Phase 2: Full bytes in bitmap, up to but not including write to last byte (full or partial) in bitmap.
+	// The loop computes the bits for that last write but does not execute the write;
+	// it leaves the bits in hb for processing by Phase 3.
+	// To avoid repeated adjustment of nb, we subtract out the 4 bits we're going to
+	// use in the first half of the loop right now, and then we only adjust nb explicitly
+	// if the 8 bits used by each iteration aren't balanced by 8 bits loaded mid-loop.
+	nb -= 4
+	for {
+		// Emit bitmap byte.
+		// b has at least nb+4 bits, with one exception:
+		// if w+4 >= nw, then b has only nw-w bits,
+		// but we'll stop at the break and then truncate
+		// appropriately in Phase 3.
+		hb = b&1 | (b&2)<<(heapBitsWidth-1) | (b&4)<<(2*heapBitsWidth-2) | (b&8)<<(3*heapBitsWidth-3)
+		hb |= bitMarked | bitMarked<<heapBitsWidth | bitMarked<<(2*heapBitsWidth) | bitMarked<<(3*heapBitsWidth)
+		if w += 4; w >= nw {
+			break
+		}
-		h = h.next()
-	}
+		*hbitp = uint8(hb)
+		hbitp = subtractb(hbitp, 1)
+		b >>= 4
+
+		// Load more bits. b has nb right now.
+		if p != endp {
+			// Fast path: keep reading from ptrmask.
+			// nb unmodified: we just loaded 8 bits,
+			// and the next iteration will consume 8 bits,
+			// leaving us with the same nb the next time we're here.
+			b |= uintptr(*p) << nb
+			p = addb(p, 1)
+		} else if p == nil {
+			// Almost as fast path: track bit count and refill from pbits.
+			// For short repetitions.
+			if nb < 8 {
+				b |= pbits << nb
+				nb += endnb
+			}
+			nb -= 8 // for next iteration
+		} else {
+			// Slow path: reached end of ptrmask.
+			// Process final partial byte and rewind to start.
+			b |= uintptr(*p) << nb
+			nb += endnb
+			if nb < 8 {
+				b |= uintptr(*ptrmask) << nb
+				p = addb(ptrmask, 1)
+			} else {
+				nb -= 8
+				p = ptrmask
+			}
+		}
+
+		// Emit bitmap byte.
+		hb = b&1 | (b&2)<<(heapBitsWidth-1) | (b&4)<<(2*heapBitsWidth-2) | (b&8)<<(3*heapBitsWidth-3)
+		hb |= bitMarked | bitMarked<<heapBitsWidth | bitMarked<<(2*heapBitsWidth) | bitMarked<<(3*heapBitsWidth)
+		if w += 4; w >= nw {
+			break
+		}
+		*hbitp = uint8(hb)
+		hbitp = subtractb(hbitp, 1)
+		b >>= 4
+	}
-	if dataSize < size {
-		atomicand8(h.bitp, ^((bitPointer | bitMarked) << h.shift))
-	}
+
+Phase3:
+	// Phase 3: Special case for final byte or half-byte describing final fragment of data.
+	// If there are not four data words for this final fragment, we must clear the mark bits
+	// in the 2-bit entries for the missing words. Clearing them creates a ``dead'' entry
+	// to tell the GC scan to stop scanning this object early.
+	// If there are four words in the final fragment but there is more data,
+	// then we must write a ``dead'' entry to the next bitmap byte.
+	if frag := (nw - w) % 4; frag != 0 {
+		// Data ends at least one word early.
+		hb &= 1<<(heapBitsWidth*frag) - 1
+		if w*ptrSize <= size {
+			// We own the whole byte and get the dead marker for free.
+			*hbitp = uint8(hb)
+		} else {
+			// We only own the bottom half of the byte.
+			// If frag == 1, we get a dead marker for free.
+			// If frag == 2, no dead marker is needed (we've reached the end of the object).
+			atomicand8(hbitp, 0xf0)
+			atomicor8(hbitp, uint8(hb))
+		}
+	} else {
+		// Data ends with a full bitmap byte.
+		*hbitp = uint8(hb)
+		if w*ptrSize < size {
+			// There's more data in the allocated object.
+			// Write a dead marker in the next byte.
+			hbitp = subtractb(hbitp, 1)
+			if (w+4)*ptrSize <= size {
+				// We own the whole byte.
+				*hbitp = 0
+			} else {
+				// We only own the bottom half of the byte.
+				atomicand8(hbitp, 0xf0)
+			}
+		}
+	}
+
+	const test = false // slow but helpful
+	if test {
+		// Double-check that bits to be written were written correctly.
+		// Does not check that other bits were not written, unfortunately.
+		h := heapBitsForAddr(x)
+		nptr := typ.size / ptrSize
+		for i := uintptr(0); i <= dataSize/ptrSize; i++ {
+			j := i % nptr
+			var have, want uint8
+			if i == dataSize/ptrSize {
+				if dataSize >= size {
+					break
+				}
+				have = (*h.bitp >> h.shift) & 3
+				want = 0 // dead bits
+			} else {
+				have = (*h.bitp >> h.shift) & 3
+				if (*addb(ptrmask, j/8)>>(j%8))&1 != 0 {
+					want |= bitPointer
+				}
+				if i >= 2 {
+					want |= bitMarked
+				} else {
+					have &^= bitMarked
+				}
+			}
+			if have != want {
+				println("mismatch writing bits for", *typ._string, "x", dataSize/typ.size)
+				print("typ.size=", typ.size, " dataSize=", dataSize, " size=", size, "\n")
+				h = heapBitsForAddr(x)
+				print("initial bits h.bitp=", h.bitp, " h.shift=", h.shift, "\n")
+				print("p=", p, " endp=", endp, " endnb=", endnb, " pbits=", hex(pbits), " b=", hex(b), " nb=", nb, "\n")
+				println("at word", i, "offset", i*ptrSize, "have", have, "want", want)
+				throw("bad heapBitsSetType")
+			}
+			h = h.next()
+		}
+	}
 }
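Before the second file of the patch, it may help to see the Phase 2 transform
in isolation: four 1-bit ptrmask entries expand into one heap bitmap byte,
with the mark bit ORed into every entry. The sketch below mirrors the shift
arithmetic in the loop above, assuming the patch's apparent constants
(heapBitsWidth == 2, bitPointer == 1, bitMarked == 2); expandPtrmask is a
hypothetical name, not a runtime function.

package main

import "fmt"

const (
	bitPointer    = 1
	bitMarked     = 2
	heapBitsWidth = 2
)

// expandPtrmask spreads the low 4 bits of b (pointer/no-pointer for
// four consecutive words) into four 2-bit entries and ORs the mark
// bit into each entry, exactly as the Phase 2 loop computes hb.
func expandPtrmask(b uintptr) uint8 {
	hb := b&1 | (b&2)<<(heapBitsWidth-1) | (b&4)<<(2*heapBitsWidth-2) | (b&8)<<(3*heapBitsWidth-3)
	hb |= bitMarked | bitMarked<<heapBitsWidth | bitMarked<<(2*heapBitsWidth) | bitMarked<<(3*heapBitsWidth)
	return uint8(hb)
}

func main() {
	// Words 0 and 3 are pointers; words 1 and 2 are scalars.
	fmt.Printf("%08b\n", expandPtrmask(0b1001)) // prints 11101011
}

Phase 3 then only has to truncate such a byte: masking hb with
1<<(heapBitsWidth*frag) - 1 clears the mark bits of the missing words, which
is what creates the ``dead'' entries that stop the GC scan early.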
diff --git a/src/runtime/mgcmark.go b/src/runtime/mgcmark.go
index 917299c9df..bf21e47d83 100644
--- a/src/runtime/mgcmark.go
+++ b/src/runtime/mgcmark.go
@@ -601,18 +601,16 @@ func scanobject(b uintptr, gcw *gcWork) {
 		// in the type bit for the one word. The only one-word objects
 		// are pointers, or else they'd be merged with other non-pointer
 		// data into larger allocations.
-		if n != 1 {
-			b := hbits.bits()
-			if i >= 2*ptrSize && b&bitMarked == 0 {
-				break // no more pointers in this object
-			}
-			if b&bitPointer == 0 {
-				continue // not a pointer
-			}
+		bits := hbits.bits()
+		if i >= 2*ptrSize && bits&bitMarked == 0 {
+			break // no more pointers in this object
+		}
+		if bits&bitPointer == 0 {
+			continue // not a pointer
 		}
 
-		// Work here is duplicated in scanblock.
-		// If you make changes here, make changes there too.
+		// Work here is duplicated in scanblock and above.
+		// If you make changes here, make changes there too.
 		obj := *(*uintptr)(unsafe.Pointer(b + i))
 
 		// At this point we have extracted the next potential pointer.
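For completeness, the consuming side of these bits. The scanobject change
above stops scanning as soon as it sees an entry past the first two words
whose mark bit is clear (a ``dead'' entry), and it follows only entries whose
pointer bit is set. The sketch below decodes pre-extracted 2-bit entries;
scanWords is a hypothetical stand-in for the real loop, which reads them from
the bitmap via heapBits.

package main

import "fmt"

const (
	bitPointer = 1
	bitMarked  = 2
	ptrSize    = 8 // assuming a 64-bit platform
)

// scanWords walks one 2-bit entry per word of an object and returns
// the byte offsets of the words that must be followed as pointers,
// mirroring the break/continue structure in scanobject.
func scanWords(entries []uint8) []int {
	var offsets []int
	for i, bits := range entries {
		if i >= 2 && bits&bitMarked == 0 {
			break // dead entry: no more pointers in this object
		}
		if bits&bitPointer == 0 {
			continue // scalar word
		}
		offsets = append(offsets, i*ptrSize)
	}
	return offsets
}

func main() {
	// pointer, scalar, pointer+mark, dead
	entries := []uint8{bitPointer, 0, bitPointer | bitMarked, 0}
	fmt.Println(scanWords(entries)) // prints [0 16]
}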