From 8dda1c4c08adf8b2107dec1c0d70d24443269ccd Mon Sep 17 00:00:00 2001
From: Rick Hudson
Date: Wed, 2 Mar 2016 12:15:02 -0500
Subject: [PATCH] [dev.garbage] runtime: remove heapBitsSweepSpan

Prior to this CL the sweep phase was responsible for locating all
objects that were about to be freed and calling a function to process
each object. This was done by the function heapBitsSweepSpan. Part of
the processing included calls to tracefree and msanfree as well as
counting how many objects were freed.

The calls to tracefree and msanfree have been moved into the mallocgc
routine and are made when the object is about to be reallocated. The
counting of free objects has been optimized using an array-based
popcount algorithm, and if all the objects in a span are free the
span is freed.

Similarly, the code that locates the next free object has been
optimized to use an array-based ctz (count trailing zeros). Various
hot paths in the allocation logic have been optimized.

At this point the garbage benchmark is within 3% of the 1.6 release.

Change-Id: I00643c442e2ada1685c010c3447e4ea8537d2dfa
Reviewed-on: https://go-review.googlesource.com/20201
Reviewed-by: Austin Clements
---
 src/runtime/malloc.go   |  59 ++++++++-
 src/runtime/mbitmap.go  | 258 +++++++++++++++++++---------------------
 src/runtime/mgcsweep.go |  40 +------
 src/runtime/mheap.go    |  12 +-
 4 files changed, 191 insertions(+), 178 deletions(-)

diff --git a/src/runtime/malloc.go b/src/runtime/malloc.go
index 574ce3dafc..2da13f2073 100644
--- a/src/runtime/malloc.go
+++ b/src/runtime/malloc.go
@@ -496,6 +496,33 @@ const (
 	_FlagNoZero = 1 << 1 // don't zero memory
 )
 
+// nextFreeFast returns the next free object if one is quickly available.
+// Otherwise it returns 0.
+func (c *mcache) nextFreeFast(sizeclass int8) gclinkptr {
+	s := c.alloc[sizeclass]
+	ctzIndex := uint8(s.allocCache & 0xff)
+	if ctzIndex != 0 {
+		theBit := uint64(ctzVals[ctzIndex])
+		freeidx := s.freeindex // help the pre-SSA compiler out here with CSE.
+		result := freeidx + uintptr(theBit)
+		if result < s.nelems {
+			s.allocCache >>= (theBit + 1)
+			freeidx = result + 1
+			if freeidx%64 == 0 && freeidx != s.nelems {
+				// We just incremented s.freeindex so it isn't 0,
+				// so we are moving to the next aCache.
+				whichByte := freeidx / 8
+				s.refillAllocCache(whichByte)
+			}
+			s.freeindex = freeidx
+			v := gclinkptr(result*s.elemsize + s.base())
+			s.allocCount++
+			return v
+		}
+	}
+	return 0
+}
+
 // nextFree returns the next free object from the cached span if one is available.
 // Otherwise it refills the cache with a span with an available object and
 // returns that object along with a flag indicating that this was a heavy
@@ -508,9 +535,9 @@ func (c *mcache) nextFree(sizeclass int8) (v gclinkptr, shouldhelpgc bool) {
 	freeIndex := s.nextFreeIndex()
 	if freeIndex == s.nelems {
 		// The span is full.
-		if uintptr(s.allocCount) > s.nelems {
+		if uintptr(s.allocCount) != s.nelems {
 			println("runtime: s.allocCount=", s.allocCount, "s.nelems=", s.nelems)
-			throw("s.allocCount > s.nelems && freeIndex == s.nelems")
+			throw("s.allocCount != s.nelems && freeIndex == s.nelems")
 		}
 		systemstack(func() {
 			c.refill(int32(sizeclass))
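The fast path added above combines the two ideas from the commit message: a 64-bit cache of allocation bits and a byte-indexed count-trailing-zeros lookup. The standalone sketch below illustrates the same pattern with simplified, hypothetical types (toySpan, freeCache and ctz8 are illustrative names, not the runtime's); it assumes a set bit in the cached word stands for a free slot.

package main

import "fmt"

// ctz8 returns the number of trailing zero bits in b (8 for b == 0),
// mirroring the table-driven approach the patch uses via ctzVals.
func ctz8(b uint8) uint8 {
	var n uint8
	for ; n < 8 && b&1 == 0; n++ {
		b >>= 1
	}
	return n
}

type toySpan struct {
	freeindex uintptr // slots below this index are known to be allocated
	nelems    uintptr // number of slots in the span
	freeCache uint64  // bit i set => slot freeindex+i is free
}

// nextFreeFast returns the index of the next free slot, or -1 when the
// cached word cannot answer quickly (a real allocator would then fall
// back to a slower, refilling path such as nextFree).
func (s *toySpan) nextFreeFast() int {
	low := uint8(s.freeCache & 0xff)
	if low == 0 {
		return -1 // nothing free in the low byte; take the slow path
	}
	bit := uintptr(ctz8(low))
	idx := s.freeindex + bit
	if idx >= s.nelems {
		return -1
	}
	s.freeCache >>= bit + 1 // consume the bit we just used
	s.freeindex = idx + 1
	return int(idx)
}

func main() {
	s := &toySpan{nelems: 64, freeCache: 0b10110100}
	for {
		i := s.nextFreeFast()
		if i < 0 {
			break
		}
		fmt.Println("allocated slot", i)
	}
}

Running this hands out slots 2, 4, 5 and 7 and then reports the fast path exhausted, which is the point where mallocgc falls back to nextFree in the hunks that follow.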
@@ -644,7 +671,10 @@ func mallocgc(size uintptr, typ *_type, flags uint32) unsafe.Pointer {
 		}
 		// Allocate a new maxTinySize block.
 		var v gclinkptr
-		v, shouldhelpgc = c.nextFree(tinySizeClass)
+		v = c.nextFreeFast(tinySizeClass)
+		if v == 0 {
+			v, shouldhelpgc = c.nextFree(tinySizeClass)
+		}
 		x = unsafe.Pointer(v)
 		(*[2]uint64)(x)[0] = 0
 		(*[2]uint64)(x)[1] = 0
@@ -664,7 +694,10 @@ func mallocgc(size uintptr, typ *_type, flags uint32) unsafe.Pointer {
 		}
 		size = uintptr(class_to_size[sizeclass])
 		var v gclinkptr
-		v, shouldhelpgc = c.nextFree(sizeclass)
+		v = c.nextFreeFast(sizeclass)
+		if v == 0 {
+			v, shouldhelpgc = c.nextFree(sizeclass)
+		}
 		x = unsafe.Pointer(v)
 		if flags&flagNoZero == 0 {
 			memclr(unsafe.Pointer(v), size)
@@ -725,9 +758,27 @@ func mallocgc(size uintptr, typ *_type, flags uint32) unsafe.Pointer {
 		})
 	}
 
+	// The object x is about to be reused but tracefree and msanfree
+	// need to be informed.
+	// TODO:(rlh) It is quite possible that this object is being allocated
+	// out of a fresh span and that there is no preceding call to
+	// tracealloc with this object. If this is an issue then initialization
+	// of the fresh span needs to leave some crumbs around that can be used to
+	// avoid these calls. Furthermore these crumbs are likely the same as
+	// those needed to determine if the object needs to be zeroed.
+	// In the case of msanfree it does not make sense to call msanfree
+	// followed by msanmalloc. msanfree indicates that the bytes are not
+	// initialized but msanmalloc is about to indicate that they are.
+	// It makes no difference whether msanmalloc has been called on these
+	// bytes or not.
+	if debug.allocfreetrace != 0 {
+		tracefree(unsafe.Pointer(x), size)
+	}
+
 	if raceenabled {
 		racemalloc(x, size)
 	}
+
 	if msanenabled {
 		msanmalloc(x, size)
 	}
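The 256-entry ctzVals table added to mbitmap.go in the next file's diff is written out by hand. A throwaway generator like the one below (buildCtzTable is a hypothetical helper, not part of the patch) can produce and spot-check the same values.

package main

import "fmt"

// buildCtzTable returns a table where entry i holds the number of
// trailing zero bits in i, with entry 0 holding 8 (all eight bits zero),
// matching the convention used by ctzVals.
func buildCtzTable() [256]int8 {
	var t [256]int8
	for i := 1; i < 256; i++ {
		n := int8(0)
		for v := i; v&1 == 0; v >>= 1 {
			n++
		}
		t[i] = n
	}
	t[0] = 8
	return t
}

func main() {
	t := buildCtzTable()
	// Spot-check a few entries: 0 -> 8, 1 -> 0, 8 (0b1000) -> 3, 0x80 -> 7.
	fmt.Println(t[0], t[1], t[8], t[0x80]) // 8 0 3 7
}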
diff --git a/src/runtime/mbitmap.go b/src/runtime/mbitmap.go
index 910c4fa844..ea398904e3 100644
--- a/src/runtime/mbitmap.go
+++ b/src/runtime/mbitmap.go
@@ -186,17 +186,59 @@ func (s *mspan) allocBitsForIndex(allocBitIndex uintptr) markBits {
 	return markBits{&s.allocBits[whichByte], uint8(1 << whichBit), allocBitIndex}
 }
 
+// ctzVals contains the count of trailing zeros for each
+// index. Index 0 returns 8, indicating 8 zeros.
+var ctzVals = [256]int8{
+	8, 0, 1, 0, 2, 0, 1, 0,
+	3, 0, 1, 0, 2, 0, 1, 0,
+	4, 0, 1, 0, 2, 0, 1, 0,
+	3, 0, 1, 0, 2, 0, 1, 0,
+	5, 0, 1, 0, 2, 0, 1, 0,
+	3, 0, 1, 0, 2, 0, 1, 0,
+	4, 0, 1, 0, 2, 0, 1, 0,
+	3, 0, 1, 0, 2, 0, 1, 0,
+	6, 0, 1, 0, 2, 0, 1, 0,
+	3, 0, 1, 0, 2, 0, 1, 0,
+	4, 0, 1, 0, 2, 0, 1, 0,
+	3, 0, 1, 0, 2, 0, 1, 0,
+	5, 0, 1, 0, 2, 0, 1, 0,
+	3, 0, 1, 0, 2, 0, 1, 0,
+	4, 0, 1, 0, 2, 0, 1, 0,
+	3, 0, 1, 0, 2, 0, 1, 0,
+	7, 0, 1, 0, 2, 0, 1, 0,
+	3, 0, 1, 0, 2, 0, 1, 0,
+	4, 0, 1, 0, 2, 0, 1, 0,
+	3, 0, 1, 0, 2, 0, 1, 0,
+	5, 0, 1, 0, 2, 0, 1, 0,
+	3, 0, 1, 0, 2, 0, 1, 0,
+	4, 0, 1, 0, 2, 0, 1, 0,
+	3, 0, 1, 0, 2, 0, 1, 0,
+	6, 0, 1, 0, 2, 0, 1, 0,
+	3, 0, 1, 0, 2, 0, 1, 0,
+	4, 0, 1, 0, 2, 0, 1, 0,
+	3, 0, 1, 0, 2, 0, 1, 0,
+	5, 0, 1, 0, 2, 0, 1, 0,
+	3, 0, 1, 0, 2, 0, 1, 0,
+	4, 0, 1, 0, 2, 0, 1, 0,
+	3, 0, 1, 0, 2, 0, 1, 0}
+
 // A temporary stand in for the count trailing zero ctz instruction.
 // IA bsf works on 64 bit non-zero word.
 func ctz64(markBits uint64) uint64 {
-	if markBits == 0 {
+	ctz8 := ctzVals[markBits&0xff]
+	if ctz8 != 8 {
+		return uint64(ctz8)
+	} else if markBits == 0 { // low byte is zero, check full word.
 		return 64 // bits in 64 bit word, ensures loop terminates
 	}
-	// tz holds trailing zero count.
-	tz := uint64(0)
-	for mask := uint64(1); mask&markBits == 0; mask, tz = mask<<1, tz+1 {
+	result := uint64(8)
+	markBits >>= 8
+	for ctz8 = ctzVals[markBits&0xff]; ctz8 == 8; ctz8 = ctzVals[markBits&0xff] {
+		result += 8
+		markBits >>= 8
 	}
-	return tz
+	result += uint64(ctz8)
+	return result
 }
 
 // refillAllocCache takes 8 bytes s.allocBits starting at whichByte
@@ -222,10 +264,12 @@ func (s *mspan) refillAllocCache(whichByte uintptr) {
 // There are hardware instructions that can be used to make this
 // faster if profiling warrants it.
 func (s *mspan) nextFreeIndex() uintptr {
-	if s.freeindex == s.nelems {
-		return s.freeindex
+	sfreeindex := s.freeindex
+	snelems := s.nelems
+	if sfreeindex == snelems {
+		return sfreeindex
 	}
-	if s.freeindex > s.nelems {
+	if sfreeindex > snelems {
 		throw("s.freeindex > s.nelems")
 	}
 
@@ -233,37 +277,37 @@ func (s *mspan) nextFreeIndex() uintptr {
 	bitIndex := ctz64(aCache)
 	for bitIndex == 64 {
 		// Move index to start of next cached bits.
-		s.freeindex = (s.freeindex + 64) &^ (64 - 1)
-		if s.freeindex >= s.nelems {
-			s.freeindex = s.nelems
-			return s.freeindex
+		sfreeindex = (sfreeindex + 64) &^ (64 - 1)
+		if sfreeindex >= snelems {
+			s.freeindex = snelems
+			return snelems
 		}
-		whichByte := s.freeindex / 8
+		whichByte := sfreeindex / 8
 		// Refill s.allocCache with the next 64 alloc bits.
-		// Unlike in allocBits a 1 in s.allocCache means
-		// the object is not marked.
 		s.refillAllocCache(whichByte)
 		aCache = s.allocCache
 		bitIndex = ctz64(aCache)
 		// Nothing was available try again now allocCache has been refilled.
 	}
-	result := s.freeindex + uintptr(bitIndex)
-	if result >= s.nelems {
-		s.freeindex = s.nelems
-		return s.freeindex
+	result := sfreeindex + uintptr(bitIndex)
+	if result >= snelems {
+		s.freeindex = snelems
+		return snelems
 	}
-	s.allocCache >>= bitIndex + 1
-	s.freeindex = result + 1
-	if s.freeindex%64 == 0 && s.freeindex != s.nelems {
+	s.allocCache >>= (bitIndex + 1)
+	sfreeindex = result + 1
+
+	if sfreeindex%64 == 0 && sfreeindex != snelems {
 		// We just incremented s.freeindex so it isn't 0.
 		// As each 1 in s.allocCache was encountered and used for allocation
 		// it was shifted away. At this point s.allocCache contains all 0s.
 		// Refill s.allocCache so that it corresponds
 		// to the bits at s.allocBits starting at s.freeindex.
-		whichByte := s.freeindex / 8
+		whichByte := sfreeindex / 8
		s.refillAllocCache(whichByte)
 	}
+	s.freeindex = sfreeindex
 	return result
 }
 
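The new ctz64 above walks the 64-bit word one byte at a time through the 8-bit table. A self-contained version of that scan is sketched below; ctzTable and the test in main are illustrative (the runtime uses its hand-written ctzVals rather than a generated table).

package main

import "fmt"

// ctzTable[i] holds the number of trailing zero bits in i, with entry 0
// holding 8, built once at program start.
var ctzTable = func() (t [256]uint8) {
	t[0] = 8
	for i := 1; i < 256; i++ {
		for v := i; v&1 == 0; v >>= 1 {
			t[i]++
		}
	}
	return
}()

// ctz64 returns the number of trailing zero bits in x, or 64 if x == 0,
// scanning one byte at a time as the patch's ctz64 does.
func ctz64(x uint64) uint64 {
	if c := ctzTable[x&0xff]; c != 8 {
		return uint64(c) // answer is in the low byte
	}
	if x == 0 {
		return 64
	}
	result := uint64(8)
	x >>= 8
	for ctzTable[x&0xff] == 8 { // skip whole zero bytes
		result += 8
		x >>= 8
	}
	return result + uint64(ctzTable[x&0xff])
}

func main() {
	fmt.Println(ctz64(0))         // 64
	fmt.Println(ctz64(1))         // 0
	fmt.Println(ctz64(1 << 40))   // 40
	fmt.Println(ctz64(0xff << 8)) // 8
}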
@@ -760,120 +804,60 @@ func (h heapBits) clearCheckmarkSpan(size, n, total uintptr) {
 	}
 }
 
-// heapBitsSweepSpan coordinates the sweeping of a span and inspects
-// each freed object. If objects are being traced or if msan is enabled
-// then heapBitsSweepSpan calls f(p), where p is the object's base address.
-// When not tracing and msan is not enabled heapBitsSweepSpan is lightweight.
-// heapBitsSweepSpan never alters the pointer/scalar heapBit maps. HeapBit map
-// maintenance is the responsibility of the allocation routines.
-// TODO:(rlh) Deal with the checkmark bits but moving them
-// out of heap bitmap thus enabling bulk clearing.
-func heapBitsSweepSpan(s *mspan, f func(uintptr)) (nfree int) {
-	base := s.base()
-	size := s.elemsize
-	n := s.nelems
-	cl := s.sizeclass
-	doCall := debug.allocfreetrace != 0 || msanenabled || cl == 0
-	h := heapBitsForSpan(base)
-	switch {
-	default:
-		throw("heapBitsSweepSpan")
-	case sys.PtrSize == 8 && size == sys.PtrSize:
-		nfree = heapBitsSweep8BitPtrs(h, s, base, n, cl, doCall, f)
-	case size%(4*sys.PtrSize) == 0:
-		nfree = heapBitsSweepMap(h, s, base, size, n, cl, doCall, f)
-	case size%(4*sys.PtrSize) == 2*sys.PtrSize:
-		nfree = heapBitsSweepMap(h, s, base, size, n, cl, doCall, f)
-	}
-	return
-}
-
-func heapBitsSweep8BitPtrs(h heapBits, s *mspan, base, n uintptr, cl uint8, doCall bool, f func(uintptr)) (nfree int) {
-	mbits := s.markBitsForBase()
-	// Consider mark bits in all four 2-bit entries of each bitmap byte.
-	if cl == 0 {
-		throw("8BitPtrs are not in cl 0")
-	}
-	// Consider mark bits in all four 2-bit entries of each bitmap byte.
-	for i := uintptr(0); i < n; i++ {
-		// Note that unlike the other size cases, we leave the pointer bits set here.
-		// These are initialized during initSpan when the span is created and left
-		// in place the whole time the span is used for pointer-sized objects.
-		// That lets heapBitsSetType avoid an atomic update to set the pointer bit
-		// during allocation.
-		if !mbits.isMarked() {
-			nfree++
-			if mbits.index < s.freeindex {
-				f(base + i*sys.PtrSize)
-			} else if s.allocBits[mbits.index/8]&mbits.mask == 1 {
-				// it was marked in the previous cycle but not this cycle
-				// if it wasn't marked in the prvious cycle the call would be redundant.
-				f(base + i*sys.PtrSize)
-			}
-		}
-		mbits.advance()
-	}
-	return nfree
-}
-
-// nextFreed returns the next object that is being freed during this GC cycle.
-// If the mark bit is set then the object is free. If it is < s.freeindex
-// then either the object was freed during by this GC cycle.
-// If it is >= freeindex then if the allocBit is set then it was
-// freed during this GC cycle. If the allocBit is 0 it was freed
-// during a previous cycle so is not considered a freed.
-func (m *markBits) nextFreed(nelems uintptr, s *mspan, totalFree *int) bool {
-	mByte := *m.bytep
-	for {
-		for mByte == 0xff {
-			if m.index >= nelems {
-				return false
-			}
-			m.index = (m.index + 8) &^ (8 - 1)
-			m.mask = 1
-			m.bytep = add1(m.bytep)
-			mByte = *m.bytep
-			// Nothing free found totalFree remains the same.
-		}
-		if m.index >= nelems {
-			return false
-		}
-		for m.index < nelems {
-			if m.mask&mByte == 0 {
-				// At this point we have a free object so update totalFree
-				*totalFree++
-				if m.index < s.freeindex {
-					return true
-				}
-				if s.allocBits[m.index/8]&m.mask != 0 {
-					return true
-				}
-			}
-			if m.mask == 1<<7 {
-				m.mask = 1
-				m.bytep = add1(m.bytep)
-				mByte = *m.bytep
-				m.index++
-				break
-			} else {
-				m.mask = m.mask << 1
-				m.index++
-			}
-		}
-	}
-	return false
-}
-
-func heapBitsSweepMap(h heapBits, s *mspan, base, size, n uintptr, cl uint8, doCall bool, f func(uintptr)) int {
-	totalFree := 0
-	twobits := s.markBitsForBase()
-	for twobits.nextFreed(n, s, &totalFree) {
-		if doCall {
-			f(base + twobits.index*size)
-		}
-		twobits.advance()
-	}
-	return totalFree
+// oneBitCount is indexed by byte and produces the
+// number of 1 bits in that byte. For example, 128 has 1 bit set
+// and oneBitCount[128] holds 1.
+var oneBitCount = [256]uint8{
+	0, 1, 1, 2, 1, 2, 2, 3,
+	1, 2, 2, 3, 2, 3, 3, 4,
+	1, 2, 2, 3, 2, 3, 3, 4,
+	2, 3, 3, 4, 3, 4, 4, 5,
+	1, 2, 2, 3, 2, 3, 3, 4,
+	2, 3, 3, 4, 3, 4, 4, 5,
+	2, 3, 3, 4, 3, 4, 4, 5,
+	3, 4, 4, 5, 4, 5, 5, 6,
+	1, 2, 2, 3, 2, 3, 3, 4,
+	2, 3, 3, 4, 3, 4, 4, 5,
+	2, 3, 3, 4, 3, 4, 4, 5,
+	3, 4, 4, 5, 4, 5, 5, 6,
+	2, 3, 3, 4, 3, 4, 4, 5,
+	3, 4, 4, 5, 4, 5, 5, 6,
+	3, 4, 4, 5, 4, 5, 5, 6,
+	4, 5, 5, 6, 5, 6, 6, 7,
+	1, 2, 2, 3, 2, 3, 3, 4,
+	2, 3, 3, 4, 3, 4, 4, 5,
+	2, 3, 3, 4, 3, 4, 4, 5,
+	3, 4, 4, 5, 4, 5, 5, 6,
+	2, 3, 3, 4, 3, 4, 4, 5,
+	3, 4, 4, 5, 4, 5, 5, 6,
+	3, 4, 4, 5, 4, 5, 5, 6,
+	4, 5, 5, 6, 5, 6, 6, 7,
+	2, 3, 3, 4, 3, 4, 4, 5,
+	3, 4, 4, 5, 4, 5, 5, 6,
+	3, 4, 4, 5, 4, 5, 5, 6,
+	4, 5, 5, 6, 5, 6, 6, 7,
+	3, 4, 4, 5, 4, 5, 5, 6,
+	4, 5, 5, 6, 5, 6, 6, 7,
+	4, 5, 5, 6, 5, 6, 6, 7,
+	5, 6, 6, 7, 6, 7, 7, 8}
+
+// countFree runs through the mark bits in a span and counts the number of free objects
+// in the span.
+// TODO:(rlh) Use popcount intrinsic.
+func (s *mspan) countFree() int {
+	count := 0
+	maxIndex := s.nelems / 8
+	for i := uintptr(0); i < maxIndex; i++ {
+		count += int(oneBitCount[s.gcmarkBits[i]])
+	}
+
+	if bitsInLastByte := s.nelems % 8; bitsInLastByte != 0 {
+		markBits := uint8(s.gcmarkBits[maxIndex])
+		mask := uint8((1 << bitsInLastByte) - 1)
+		bits := markBits & mask
+		count += int(oneBitCount[bits])
+	}
+	return int(s.nelems) - count
 }
 
 // heapBitsSetType records that the new allocation [x, x+size)
diff --git a/src/runtime/mgcsweep.go b/src/runtime/mgcsweep.go
index c217ee8d86..1a6be6634d 100644
--- a/src/runtime/mgcsweep.go
+++ b/src/runtime/mgcsweep.go
@@ -8,7 +8,6 @@ package runtime
 
 import (
 	"runtime/internal/atomic"
-	"runtime/internal/sys"
 	"unsafe"
 )
 
@@ -252,40 +251,13 @@ func (s *mspan) sweep(preserve bool) bool {
 		}
 	}
 
-	// Sweep through n objects of given size starting at p.
-	// This thread owns the span now, so it can manipulate
-	// the block bitmap without atomic operations.
-
-	nfree = heapBitsSweepSpan(s, func(p uintptr) {
-		// At this point we know that we are looking at a garbage object
-		// that needs to be collected.
-		if debug.allocfreetrace != 0 {
-			tracefree(unsafe.Pointer(p), size)
-		}
-		if msanenabled {
-			msanfree(unsafe.Pointer(p), size)
-		}
-
-		// Reset to allocated+noscan.
-		if cl == 0 {
-			// Free large span.
-			if preserve {
-				throw("can't preserve large span")
-			}
-			s.needzero = 1
+	// Count the number of free objects in this span.
+	nfree = s.countFree()
+	if cl == 0 && nfree != 0 {
+		s.needzero = 1
+		freeToHeap = true
+	}
 
-			// Free the span after heapBitsSweepSpan
-			// returns, since it's not done with the span.
-			freeToHeap = true
-		} else {
-			// Free small object.
-			if size > 2*sys.PtrSize {
-				*(*uintptr)(unsafe.Pointer(p + sys.PtrSize)) = uintptrMask & 0xdeaddeaddeaddead // mark as "needs to be zeroed"
-			} else if size > sys.PtrSize {
-				*(*uintptr)(unsafe.Pointer(p + sys.PtrSize)) = 0
-			}
-		}
-	})
 	s.allocCount = uint16(s.nelems) - uint16(nfree)
 	wasempty := s.nextFreeIndex() == s.nelems
 
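countFree above replaces the per-object sweep callback with a single pass over gcmarkBits. The standalone sketch below shows the same byte-table popcount over a plain byte slice; popCount8 and this slice-based countFree are illustrative stand-ins, not the runtime's method.

package main

import "fmt"

// popCount8[i] holds the number of set bits in i, built once at startup
// instead of being written out by hand as oneBitCount is in the patch.
var popCount8 = func() (t [256]uint8) {
	for i := range t {
		for v := i; v != 0; v >>= 1 {
			t[i] += uint8(v & 1)
		}
	}
	return
}()

// countFree returns nelems minus the number of set bits in the first
// nelems bits of gcmarkBits, masking off the unused bits of a partial
// last byte just as the patch does.
func countFree(gcmarkBits []uint8, nelems int) int {
	marked := 0
	maxIndex := nelems / 8
	for i := 0; i < maxIndex; i++ {
		marked += int(popCount8[gcmarkBits[i]])
	}
	if rem := nelems % 8; rem != 0 {
		mask := byte(1)<<uint(rem) - 1
		marked += int(popCount8[gcmarkBits[maxIndex]&mask])
	}
	return nelems - marked
}

func main() {
	// 10 objects; bits 0, 3 and 8 are marked live, so 7 objects are free.
	bits := []uint8{0b00001001, 0b00000001}
	fmt.Println(countFree(bits, 10)) // 7
}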
diff --git a/src/runtime/mheap.go b/src/runtime/mheap.go
index 4be503315b..b0b3bbd957 100644
--- a/src/runtime/mheap.go
+++ b/src/runtime/mheap.go
@@ -137,6 +137,9 @@ type mspan struct {
 	//
 	// Object n starts at address n*elemsize + (start << pageShift).
 	freeindex uintptr
+	// TODO: Look up nelems from sizeclass and remove this field if it
+	// helps performance.
+	nelems uintptr // number of objects in the span.
 
 	// Cache of the allocBits at freeindex. allocCache is shifted
 	// such that the lowest bit corresponds to the bit freeindex.
@@ -147,9 +150,6 @@ type mspan struct {
 	allocCache uint64
 	allocBits  *[maxObjsPerSpan / 8]uint8
 	gcmarkBits *[maxObjsPerSpan / 8]uint8
-	nelems uintptr // number of object in the span.
-	// TODO(rlh) consider moving some of these fields into seperate arrays.
-	// Put another way is an array of structs a better idea than a struct of arrays.
 
 	// allocBits and gcmarkBits currently point to either markbits1
 	// or markbits2. At the end of a GC cycle allocBits and
@@ -753,6 +753,12 @@ func (h *mheap) freeSpan(s *mspan, acct int32) {
 		mp.mcache.local_scan = 0
 		memstats.tinyallocs += uint64(mp.mcache.local_tinyallocs)
 		mp.mcache.local_tinyallocs = 0
+		if msanenabled {
+			// Tell msan that this entire span is no longer in use.
+			base := unsafe.Pointer(s.base())
+			bytes := s.npages << _PageShift
+			msanfree(base, bytes)
+		}
 		if acct != 0 {
 			memstats.heap_objects--
 		}
-- 
2.48.1
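With heapBitsSweepSpan gone, mspan.sweep derives its bookkeeping purely from the mark-bit count, as in the mgcsweep.go hunk above. The toy sketch below mirrors that arithmetic with invented types and field names (spanInfo and sweepAccounting are illustrative, not runtime code): allocCount becomes nelems minus the free count, and a large (sizeclass 0) span with any free object is handed back to the heap.

package main

import "fmt"

type spanInfo struct {
	nelems     int
	sizeclass  int
	markedBits int // set bits in the mark bitmap, e.g. from a countFree-style scan
}

// sweepAccounting returns the new allocCount and whether the span should
// be freed back to the heap.
func sweepAccounting(s spanInfo) (allocCount int, freeToHeap bool) {
	nfree := s.nelems - s.markedBits
	allocCount = s.nelems - nfree
	if s.sizeclass == 0 && nfree != 0 {
		// A large span holds a single object; if it is unmarked, return
		// the whole span to the heap (it will also need zeroing).
		freeToHeap = true
	}
	return
}

func main() {
	fmt.Println(sweepAccounting(spanInfo{nelems: 1, sizeclass: 0, markedBits: 0}))   // 0 true
	fmt.Println(sweepAccounting(spanInfo{nelems: 64, sizeclass: 5, markedBits: 10})) // 10 false
}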