From: Keith Randall Date: Fri, 3 Jul 2020 18:28:50 +0000 (-0700) Subject: runtime: use bit-parallel operations to compute heap bit summaries X-Git-Tag: go1.16beta1~1305 X-Git-Url: http://www.git.cypherpunks.su/?a=commitdiff_plain;h=4e5ed83e8d2fbbbc8f6524f40ab3b6733dc57a38;p=gostls13.git runtime: use bit-parallel operations to compute heap bit summaries The new implementation is much faster in all cases. name old time/op new time/op delta PallocBitsSummarize/Unpacked00-16 142ns ± 1% 7ns ± 2% -94.75% (p=0.000 n=10+9) PallocBitsSummarize/UnpackedFFFFFFFFFFFFFFFF-16 172ns ± 0% 24ns ± 0% -86.02% (p=0.000 n=9+9) PallocBitsSummarize/UnpackedAA-16 145ns ± 0% 32ns ± 0% -78.16% (p=0.000 n=8+10) PallocBitsSummarize/UnpackedAAAAAAAAAAAAAAAA-16 172ns ± 0% 33ns ± 0% -80.95% (p=0.000 n=9+9) PallocBitsSummarize/Unpacked80000000AAAAAAAA-16 162ns ± 1% 60ns ± 0% -62.69% (p=0.000 n=10+9) PallocBitsSummarize/UnpackedAAAAAAAA00000001-16 163ns ± 0% 68ns ± 1% -58.47% (p=0.000 n=8+10) PallocBitsSummarize/UnpackedBBBBBBBBBBBBBBBB-16 172ns ± 0% 35ns ± 0% -79.70% (p=0.000 n=9+9) PallocBitsSummarize/Unpacked80000000BBBBBBBB-16 161ns ± 0% 63ns ± 0% -60.61% (p=0.000 n=8+10) PallocBitsSummarize/UnpackedBBBBBBBB00000001-16 163ns ± 0% 60ns ± 0% -63.14% (p=0.000 n=9+10) PallocBitsSummarize/UnpackedCCCCCCCCCCCCCCCC-16 172ns ± 0% 39ns ± 0% -77.41% (p=0.000 n=7+10) PallocBitsSummarize/Unpacked4444444444444444-16 172ns ± 0% 39ns ± 0% -77.42% (p=0.000 n=7+10) PallocBitsSummarize/Unpacked4040404040404040-16 173ns ± 2% 51ns ± 1% -70.55% (p=0.000 n=10+10) PallocBitsSummarize/Unpacked4000400040004000-16 160ns ± 1% 53ns ± 0% -66.78% (p=0.000 n=10+10) PallocBitsSummarize/Unpacked1000404044CCAAFF-16 169ns ± 1% 59ns ± 1% -65.28% (p=0.000 n=10+10) Change-Id: I94daa645b76a9cf9c93edeb2058d7132216fcb72 Reviewed-on: https://go-review.googlesource.com/c/go/+/240900 Run-TryBot: Keith Randall TryBot-Result: Gobot Gobot Reviewed-by: Michael Knyszek --- diff --git a/src/runtime/mpallocbits.go b/src/runtime/mpallocbits.go index a8011341bc..ff79bfbc1a 100644 --- a/src/runtime/mpallocbits.go +++ b/src/runtime/mpallocbits.go @@ -120,78 +120,99 @@ func (b *pageBits) popcntRange(i, n uint) (s uint) { // sake of documentation, 0s are free pages and 1s are allocated pages. type pallocBits pageBits -// consec8tab is a table containing the number of consecutive -// zero bits for any uint8 value. -// -// The table is generated by calling consec8(i) for each -// possible uint8 value, which is defined as: -// -// // consec8 counts the maximum number of consecutive 0 bits -// // in a uint8. -// func consec8(n uint8) int { -// n = ^n -// i := 0 -// for n != 0 { -// n &= (n << 1) -// i++ -// } -// return i -// } -var consec8tab = [256]uint{ - 8, 7, 6, 6, 5, 5, 5, 5, 4, 4, 4, 4, 4, 4, 4, 4, - 4, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, - 5, 4, 3, 3, 2, 2, 2, 2, 3, 2, 2, 2, 2, 2, 2, 2, - 4, 3, 2, 2, 2, 2, 2, 2, 3, 2, 2, 2, 2, 2, 2, 2, - 6, 5, 4, 4, 3, 3, 3, 3, 3, 2, 2, 2, 2, 2, 2, 2, - 4, 3, 2, 2, 2, 1, 1, 1, 3, 2, 1, 1, 2, 1, 1, 1, - 5, 4, 3, 3, 2, 2, 2, 2, 3, 2, 1, 1, 2, 1, 1, 1, - 4, 3, 2, 2, 2, 1, 1, 1, 3, 2, 1, 1, 2, 1, 1, 1, - 7, 6, 5, 5, 4, 4, 4, 4, 3, 3, 3, 3, 3, 3, 3, 3, - 4, 3, 2, 2, 2, 2, 2, 2, 3, 2, 2, 2, 2, 2, 2, 2, - 5, 4, 3, 3, 2, 2, 2, 2, 3, 2, 1, 1, 2, 1, 1, 1, - 4, 3, 2, 2, 2, 1, 1, 1, 3, 2, 1, 1, 2, 1, 1, 1, - 6, 5, 4, 4, 3, 3, 3, 3, 3, 2, 2, 2, 2, 2, 2, 2, - 4, 3, 2, 2, 2, 1, 1, 1, 3, 2, 1, 1, 2, 1, 1, 1, - 5, 4, 3, 3, 2, 2, 2, 2, 3, 2, 1, 1, 2, 1, 1, 1, - 4, 3, 2, 2, 2, 1, 1, 1, 3, 2, 1, 1, 2, 1, 1, 0, -} - // summarize returns a packed summary of the bitmap in pallocBits. func (b *pallocBits) summarize() pallocSum { - // TODO(mknyszek): There may be something more clever to be done - // here to make the summarize operation more efficient. For example, - // we can compute start and end with 64-bit wide operations easily, - // but max is a bit more complex. Perhaps there exists some way to - // leverage the 64-bit start and end to our advantage? - var start, max, end uint + var start, max, cur uint + const notSetYet = ^uint(0) // sentinel for start value + start = notSetYet for i := 0; i < len(b); i++ { - a := b[i] - for j := 0; j < 64; j += 8 { - k := uint8(a >> j) - - // Compute start. - si := uint(sys.TrailingZeros8(k)) - if start == uint(i*64+j) { - start += si - } + x := b[i] + if x == 0 { + cur += 64 + continue + } + t := uint(sys.TrailingZeros64(x)) + l := uint(sys.LeadingZeros64(x)) - // Compute max. - if end+si > max { - max = end + si - } - if mi := consec8tab[k]; mi > max { - max = mi + // Finish any region spanning the uint64s + cur += t + if start == notSetYet { + start = cur + } + if cur > max { + max = cur + } + // Final region that might span to next uint64 + cur = l + } + if start == notSetYet { + // Made it all the way through without finding a single 1 bit. + const n = uint(64 * len(b)) + return packPallocSum(n, n, n) + } + if cur > max { + max = cur + } + if max >= 64-2 { + // There is no way an internal run of zeros could beat max. + return packPallocSum(start, max, cur) + } + // Now look inside each uint64 for runs of zeros. + // All uint64s must be nonzero, or we would have aborted above. +outer: + for i := 0; i < len(b); i++ { + x := b[i] + + // Look inside this uint64. We have a pattern like + // 000000 1xxxxx1 000000 + // We need to look inside the 1xxxxx1 for any contiguous + // region of zeros. + + // We already know the trailing zeros are no larger than max. Remove them. + x >>= sys.TrailingZeros64(x) & 63 + if x&(x+1) == 0 { // no more zeros (except at the top). + continue + } + + // Strategy: shrink all runs of zeros by max. If any runs of zero + // remain, then we've identified a larger maxiumum zero run. + p := max // number of zeros we still need to shrink by. + k := uint(1) // current minimum length of runs of ones in x. + for { + // Shrink all runs of zeros by p places (except the top zeros). + for p > 0 { + if p <= k { + // Shift p ones down into the top of each run of zeros. + x |= x >> (p & 63) + if x&(x+1) == 0 { // no more zeros (except at the top). + continue outer + } + break + } + // Shift k ones down into the top of each run of zeros. + x |= x >> (k & 63) + if x&(x+1) == 0 { // no more zeros (except at the top). + continue outer + } + p -= k + // We've just doubled the minimum length of 1-runs. + // This allows us to shift farther in the next iteration. + k *= 2 } - // Compute end. - if k == 0 { - end += 8 - } else { - end = uint(sys.LeadingZeros8(k)) + // The length of the lowest-order zero run is an increment to our maximum. + j := uint(sys.TrailingZeros64(^x)) // count contiguous trailing ones + x >>= j & 63 // remove trailing ones + j = uint(sys.TrailingZeros64(x)) // count contiguous trailing zeros + x >>= j & 63 // remove zeros + max += j // we have a new maximum! + if x&(x+1) == 0 { // no more zeros (except at the top). + continue outer } + p = j // remove j more zeros from each zero run. } } - return packPallocSum(start, max, end) + return packPallocSum(start, max, cur) } // find searches for npages contiguous free pages in pallocBits and returns diff --git a/src/runtime/mpallocbits_test.go b/src/runtime/mpallocbits_test.go index 71a29f3b3a..42268a1698 100644 --- a/src/runtime/mpallocbits_test.go +++ b/src/runtime/mpallocbits_test.go @@ -101,7 +101,7 @@ func invertPallocBits(b *PallocBits) { // Ensures two packed summaries are identical, and reports a detailed description // of the difference if they're not. -func checkPallocSum(t *testing.T, got, want PallocSum) { +func checkPallocSum(t testing.TB, got, want PallocSum) { if got.Start() != want.Start() { t.Errorf("inconsistent start: got %d, want %d", got.Start(), want.Start()) } @@ -297,17 +297,29 @@ func TestPallocBitsSummarize(t *testing.T) { // Benchmarks how quickly we can summarize a PallocBits. func BenchmarkPallocBitsSummarize(b *testing.B) { - buf0 := new(PallocBits) - buf1 := new(PallocBits) - for i := 0; i < len(buf1); i++ { - buf1[i] = ^uint64(0) - } - bufa := new(PallocBits) - for i := 0; i < len(bufa); i++ { - bufa[i] = 0xaa - } - for _, buf := range []*PallocBits{buf0, buf1, bufa} { - b.Run(fmt.Sprintf("Unpacked%02X", buf[0]), func(b *testing.B) { + patterns := []uint64{ + 0, + ^uint64(0), + 0xaa, + 0xaaaaaaaaaaaaaaaa, + 0x80000000aaaaaaaa, + 0xaaaaaaaa00000001, + 0xbbbbbbbbbbbbbbbb, + 0x80000000bbbbbbbb, + 0xbbbbbbbb00000001, + 0xcccccccccccccccc, + 0x4444444444444444, + 0x4040404040404040, + 0x4000400040004000, + 0x1000404044ccaaff, + } + for _, p := range patterns { + buf := new(PallocBits) + for i := 0; i < len(buf); i++ { + buf[i] = p + } + b.Run(fmt.Sprintf("Unpacked%02X", p), func(b *testing.B) { + checkPallocSum(b, buf.Summarize(), SummarizeSlow(buf)) for i := 0; i < b.N; i++ { buf.Summarize() }