From: Keith Randall Date: Tue, 7 Jul 2020 03:46:31 +0000 (-0700) Subject: runtime: bit parallel implementation of findBitRange64 X-Git-Tag: go1.16beta1~1304 X-Git-Url: http://www.git.cypherpunks.su/?a=commitdiff_plain;h=8b8f926fc3f0e8f002d0a8e97aab9500e4db83a7;p=gostls13.git runtime: bit parallel implementation of findBitRange64 Use a bit-parallel implementation of findBitRange64. It uses a repeated shift-'N-and technique to erase all the free marks that are too small for the allocation. Also some small improvements to find1. name old time/op new time/op delta FindBitRange64/Pattern00Size2-16 4.19ns ± 0% 2.26ns ± 0% -46.04% (p=0.000 n=10+8) FindBitRange64/Pattern00Size8-16 4.19ns ± 0% 2.12ns ± 0% -49.35% (p=0.000 n=9+10) FindBitRange64/Pattern00Size32-16 4.20ns ± 0% 2.12ns ± 0% -49.49% (p=0.000 n=10+8) FindBitRange64/PatternFFFFFFFFFFFFFFFFSize2-16 2.13ns ± 0% 2.27ns ± 0% +6.28% (p=0.000 n=10+10) FindBitRange64/PatternFFFFFFFFFFFFFFFFSize8-16 2.13ns ± 0% 4.46ns ± 0% +109.39% (p=0.000 n=10+9) FindBitRange64/PatternFFFFFFFFFFFFFFFFSize32-16 2.13ns ± 1% 5.58ns ± 0% +162.37% (p=0.000 n=10+9) FindBitRange64/PatternAASize2-16 22.2ns ± 0% 2.3ns ± 0% -89.82% (p=0.000 n=9+8) FindBitRange64/PatternAASize8-16 22.2ns ± 0% 2.1ns ± 1% -90.41% (p=0.000 n=9+10) FindBitRange64/PatternAASize32-16 22.2ns ± 0% 2.1ns ± 1% -90.43% (p=0.000 n=10+10) FindBitRange64/PatternAAAAAAAAAAAAAAAASize2-16 156ns ± 1% 2ns ± 0% -98.54% (p=0.000 n=10+10) FindBitRange64/PatternAAAAAAAAAAAAAAAASize8-16 155ns ± 1% 2ns ± 0% -98.63% (p=0.000 n=10+8) FindBitRange64/PatternAAAAAAAAAAAAAAAASize32-16 155ns ± 0% 2ns ± 1% -98.63% (p=0.000 n=8+10) FindBitRange64/Pattern80000000AAAAAAAASize2-16 81.2ns ± 0% 2.3ns ± 1% -97.21% (p=0.000 n=10+10) FindBitRange64/Pattern80000000AAAAAAAASize8-16 81.1ns ± 0% 2.1ns ± 0% -97.39% (p=0.000 n=10+9) FindBitRange64/Pattern80000000AAAAAAAASize32-16 81.1ns ± 0% 2.1ns ± 0% -97.38% (p=0.000 n=10+10) FindBitRange64/PatternAAAAAAAA00000001Size2-16 76.8ns ± 1% 2.3ns ± 0% -97.05% (p=0.000 n=10+10) FindBitRange64/PatternAAAAAAAA00000001Size8-16 76.6ns ± 0% 2.1ns ± 0% -97.23% (p=0.000 n=8+10) FindBitRange64/PatternAAAAAAAA00000001Size32-16 76.7ns ± 0% 2.1ns ± 0% -97.23% (p=0.000 n=9+9) FindBitRange64/PatternBBBBBBBBBBBBBBBBSize2-16 2.13ns ± 0% 2.27ns ± 0% +6.57% (p=0.000 n=8+8) FindBitRange64/PatternBBBBBBBBBBBBBBBBSize8-16 76.7ns ± 0% 2.9ns ± 0% -96.20% (p=0.000 n=9+10) FindBitRange64/PatternBBBBBBBBBBBBBBBBSize32-16 76.7ns ± 0% 2.9ns ± 0% -96.20% (p=0.000 n=10+10) FindBitRange64/Pattern80000000BBBBBBBBSize2-16 2.12ns ± 0% 2.27ns ± 1% +6.74% (p=0.000 n=10+10) FindBitRange64/Pattern80000000BBBBBBBBSize8-16 44.8ns ± 0% 2.9ns ± 0% -93.49% (p=0.000 n=9+10) FindBitRange64/Pattern80000000BBBBBBBBSize32-16 44.9ns ± 0% 2.9ns ± 0% -93.49% (p=0.000 n=10+8) FindBitRange64/PatternBBBBBBBB00000001Size2-16 4.20ns ± 1% 2.27ns ± 1% -46.02% (p=0.000 n=10+10) FindBitRange64/PatternBBBBBBBB00000001Size8-16 44.9ns ± 0% 2.9ns ± 1% -93.51% (p=0.000 n=10+9) FindBitRange64/PatternBBBBBBBB00000001Size32-16 44.9ns ± 0% 2.9ns ± 0% -93.51% (p=0.000 n=10+9) FindBitRange64/PatternCCCCCCCCCCCCCCCCSize2-16 4.19ns ± 0% 2.26ns ± 0% -46.10% (p=0.000 n=10+10) FindBitRange64/PatternCCCCCCCCCCCCCCCCSize8-16 76.5ns ± 0% 2.9ns ± 0% -96.19% (p=0.000 n=8+7) FindBitRange64/PatternCCCCCCCCCCCCCCCCSize32-16 76.5ns ± 0% 2.9ns ± 0% -96.19% (p=0.000 n=10+8) FindBitRange64/Pattern4444444444444444Size2-16 76.4ns ± 0% 2.3ns ± 0% -97.04% (p=0.000 n=8+10) FindBitRange64/Pattern4444444444444444Size8-16 76.5ns ± 0% 2.1ns ± 0% -97.23% (p=0.000 n=9+10) FindBitRange64/Pattern4444444444444444Size32-16 76.5ns ± 0% 2.1ns ± 0% -97.23% (p=0.000 n=8+10) FindBitRange64/Pattern4040404040404040Size2-16 40.3ns ± 0% 2.3ns ± 0% -94.38% (p=0.000 n=7+10) FindBitRange64/Pattern4040404040404040Size8-16 40.2ns ± 0% 2.1ns ± 0% -94.75% (p=0.000 n=10+10) FindBitRange64/Pattern4040404040404040Size32-16 40.2ns ± 0% 2.1ns ± 0% -94.76% (p=0.000 n=10+6) FindBitRange64/Pattern4000400040004000Size2-16 22.2ns ± 0% 2.2ns ± 0% -89.86% (p=0.001 n=8+9) FindBitRange64/Pattern4000400040004000Size8-16 22.2ns ± 0% 2.1ns ± 0% -90.52% (p=0.000 n=8+10) FindBitRange64/Pattern4000400040004000Size32-16 22.2ns ± 1% 2.1ns ± 0% -90.50% (p=0.000 n=10+10) The cases that slow down aren't really that slow, and those inputs never actually occur (there's a short circuit before the call to findBitRange64 for that case). Change-Id: I50fae62915098032d8ce7fa57ef29eee9deb01ba Reviewed-on: https://go-review.googlesource.com/c/go/+/241279 Reviewed-by: Michael Knyszek --- diff --git a/src/runtime/mpallocbits.go b/src/runtime/mpallocbits.go index ff79bfbc1a..ff112300c3 100644 --- a/src/runtime/mpallocbits.go +++ b/src/runtime/mpallocbits.go @@ -218,7 +218,7 @@ outer: // find searches for npages contiguous free pages in pallocBits and returns // the index where that run starts, as well as the index of the first free page // it found in the search. searchIdx represents the first known free page and -// where to begin the search from. +// where to begin the next search from. // // If find fails to find any free space, it returns an index of ^uint(0) and // the new searchIdx should be ignored. @@ -239,9 +239,10 @@ func (b *pallocBits) find(npages uintptr, searchIdx uint) (uint, uint) { // // See find for an explanation of the searchIdx parameter. func (b *pallocBits) find1(searchIdx uint) uint { + _ = b[0] // lift nil check out of loop for i := searchIdx / 64; i < uint(len(b)); i++ { x := b[i] - if x == ^uint64(0) { + if ^x == 0 { continue } return i*64 + uint(sys.TrailingZeros64(^x)) @@ -263,18 +264,18 @@ func (b *pallocBits) findSmallN(npages uintptr, searchIdx uint) (uint, uint) { end, newSearchIdx := uint(0), ^uint(0) for i := searchIdx / 64; i < uint(len(b)); i++ { bi := b[i] - if bi == ^uint64(0) { + if ^bi == 0 { end = 0 continue } // First see if we can pack our allocation in the trailing // zeros plus the end of the last 64 bits. - start := uint(sys.TrailingZeros64(bi)) if newSearchIdx == ^uint(0) { // The new searchIdx is going to be at these 64 bits after any // 1s we file, so count trailing 1s. newSearchIdx = i*64 + uint(sys.TrailingZeros64(^bi)) } + start := uint(sys.TrailingZeros64(bi)) if end+start >= uint(npages) { return i*64 - end, newSearchIdx } @@ -369,15 +370,33 @@ func (b *pallocBits) pages64(i uint) uint64 { // findBitRange64 returns the bit index of the first set of // n consecutive 1 bits. If no consecutive set of 1 bits of // size n may be found in c, then it returns an integer >= 64. +// n must be > 0. func findBitRange64(c uint64, n uint) uint { - i := uint(0) - cont := uint(sys.TrailingZeros64(^c)) - for cont < n && i < 64 { - i += cont - i += uint(sys.TrailingZeros64(c >> i)) - cont = uint(sys.TrailingZeros64(^(c >> i))) + // This implementation is based on shrinking the length of + // runs of contiguous 1 bits. We remove the top n-1 1 bits + // from each run of 1s, then look for the first remaining 1 bit. + p := n - 1 // number of 1s we want to remove. + k := uint(1) // current minimum width of runs of 0 in c. + for p > 0 { + if p <= k { + // Shift p 0s down into the top of each run of 1s. + c &= c >> (p & 63) + break + } + // Shift k 0s down into the top of each run of 1s. + c &= c >> (k & 63) + if c == 0 { + return 64 + } + p -= k + // We've just doubled the minimum length of 0-runs. + // This allows us to shift farther in the next iteration. + k *= 2 } - return i + // Find first remaining 1. + // Since we shrunk from the top down, the first 1 is in + // its correct original position. + return uint(sys.TrailingZeros64(c)) } // pallocData encapsulates pallocBits and a bitmap for diff --git a/src/runtime/mpallocbits_test.go b/src/runtime/mpallocbits_test.go index 42268a1698..5095e24220 100644 --- a/src/runtime/mpallocbits_test.go +++ b/src/runtime/mpallocbits_test.go @@ -504,10 +504,9 @@ func TestFindBitRange64(t *testing.T) { t.Errorf("case (%016x, %d): got %d, want %d", x, n, i, result) } } - for i := uint(0); i <= 64; i++ { + for i := uint(1); i <= 64; i++ { check(^uint64(0), i, 0) } - check(0, 0, 0) for i := uint(1); i <= 64; i++ { check(0, i, ^uint(0)) } @@ -520,3 +519,33 @@ func TestFindBitRange64(t *testing.T) { check(0xffff03ff0107ffff, 16, 0) check(0x0fff03ff01079fff, 16, ^uint(0)) } + +func BenchmarkFindBitRange64(b *testing.B) { + patterns := []uint64{ + 0, + ^uint64(0), + 0xaa, + 0xaaaaaaaaaaaaaaaa, + 0x80000000aaaaaaaa, + 0xaaaaaaaa00000001, + 0xbbbbbbbbbbbbbbbb, + 0x80000000bbbbbbbb, + 0xbbbbbbbb00000001, + 0xcccccccccccccccc, + 0x4444444444444444, + 0x4040404040404040, + 0x4000400040004000, + } + sizes := []uint{ + 2, 8, 32, + } + for _, pattern := range patterns { + for _, size := range sizes { + b.Run(fmt.Sprintf("Pattern%02XSize%d", pattern, size), func(b *testing.B) { + for i := 0; i < b.N; i++ { + FindBitRange64(pattern, size) + } + }) + } + } +}