]> Cypherpunks repositories - gostls13.git/commitdiff
runtime: bit parallel implementation of findBitRange64
authorKeith Randall <khr@golang.org>
Tue, 7 Jul 2020 03:46:31 +0000 (20:46 -0700)
committerKeith Randall <khr@golang.org>
Mon, 17 Aug 2020 22:00:17 +0000 (22:00 +0000)
Use a bit-parallel implementation of findBitRange64.
It uses a repeated shift-'N-and technique to erase all the
free marks that are too small for the allocation.

Also some small improvements to find1.

name                                             old time/op  new time/op  delta
FindBitRange64/Pattern00Size2-16                 4.19ns ± 0%  2.26ns ± 0%   -46.04%  (p=0.000 n=10+8)
FindBitRange64/Pattern00Size8-16                 4.19ns ± 0%  2.12ns ± 0%   -49.35%  (p=0.000 n=9+10)
FindBitRange64/Pattern00Size32-16                4.20ns ± 0%  2.12ns ± 0%   -49.49%  (p=0.000 n=10+8)
FindBitRange64/PatternFFFFFFFFFFFFFFFFSize2-16   2.13ns ± 0%  2.27ns ± 0%    +6.28%  (p=0.000 n=10+10)
FindBitRange64/PatternFFFFFFFFFFFFFFFFSize8-16   2.13ns ± 0%  4.46ns ± 0%  +109.39%  (p=0.000 n=10+9)
FindBitRange64/PatternFFFFFFFFFFFFFFFFSize32-16  2.13ns ± 1%  5.58ns ± 0%  +162.37%  (p=0.000 n=10+9)
FindBitRange64/PatternAASize2-16                 22.2ns ± 0%   2.3ns ± 0%   -89.82%  (p=0.000 n=9+8)
FindBitRange64/PatternAASize8-16                 22.2ns ± 0%   2.1ns ± 1%   -90.41%  (p=0.000 n=9+10)
FindBitRange64/PatternAASize32-16                22.2ns ± 0%   2.1ns ± 1%   -90.43%  (p=0.000 n=10+10)
FindBitRange64/PatternAAAAAAAAAAAAAAAASize2-16    156ns ± 1%     2ns ± 0%   -98.54%  (p=0.000 n=10+10)
FindBitRange64/PatternAAAAAAAAAAAAAAAASize8-16    155ns ± 1%     2ns ± 0%   -98.63%  (p=0.000 n=10+8)
FindBitRange64/PatternAAAAAAAAAAAAAAAASize32-16   155ns ± 0%     2ns ± 1%   -98.63%  (p=0.000 n=8+10)
FindBitRange64/Pattern80000000AAAAAAAASize2-16   81.2ns ± 0%   2.3ns ± 1%   -97.21%  (p=0.000 n=10+10)
FindBitRange64/Pattern80000000AAAAAAAASize8-16   81.1ns ± 0%   2.1ns ± 0%   -97.39%  (p=0.000 n=10+9)
FindBitRange64/Pattern80000000AAAAAAAASize32-16  81.1ns ± 0%   2.1ns ± 0%   -97.38%  (p=0.000 n=10+10)
FindBitRange64/PatternAAAAAAAA00000001Size2-16   76.8ns ± 1%   2.3ns ± 0%   -97.05%  (p=0.000 n=10+10)
FindBitRange64/PatternAAAAAAAA00000001Size8-16   76.6ns ± 0%   2.1ns ± 0%   -97.23%  (p=0.000 n=8+10)
FindBitRange64/PatternAAAAAAAA00000001Size32-16  76.7ns ± 0%   2.1ns ± 0%   -97.23%  (p=0.000 n=9+9)
FindBitRange64/PatternBBBBBBBBBBBBBBBBSize2-16   2.13ns ± 0%  2.27ns ± 0%    +6.57%  (p=0.000 n=8+8)
FindBitRange64/PatternBBBBBBBBBBBBBBBBSize8-16   76.7ns ± 0%   2.9ns ± 0%   -96.20%  (p=0.000 n=9+10)
FindBitRange64/PatternBBBBBBBBBBBBBBBBSize32-16  76.7ns ± 0%   2.9ns ± 0%   -96.20%  (p=0.000 n=10+10)
FindBitRange64/Pattern80000000BBBBBBBBSize2-16   2.12ns ± 0%  2.27ns ± 1%    +6.74%  (p=0.000 n=10+10)
FindBitRange64/Pattern80000000BBBBBBBBSize8-16   44.8ns ± 0%   2.9ns ± 0%   -93.49%  (p=0.000 n=9+10)
FindBitRange64/Pattern80000000BBBBBBBBSize32-16  44.9ns ± 0%   2.9ns ± 0%   -93.49%  (p=0.000 n=10+8)
FindBitRange64/PatternBBBBBBBB00000001Size2-16   4.20ns ± 1%  2.27ns ± 1%   -46.02%  (p=0.000 n=10+10)
FindBitRange64/PatternBBBBBBBB00000001Size8-16   44.9ns ± 0%   2.9ns ± 1%   -93.51%  (p=0.000 n=10+9)
FindBitRange64/PatternBBBBBBBB00000001Size32-16  44.9ns ± 0%   2.9ns ± 0%   -93.51%  (p=0.000 n=10+9)
FindBitRange64/PatternCCCCCCCCCCCCCCCCSize2-16   4.19ns ± 0%  2.26ns ± 0%   -46.10%  (p=0.000 n=10+10)
FindBitRange64/PatternCCCCCCCCCCCCCCCCSize8-16   76.5ns ± 0%   2.9ns ± 0%   -96.19%  (p=0.000 n=8+7)
FindBitRange64/PatternCCCCCCCCCCCCCCCCSize32-16  76.5ns ± 0%   2.9ns ± 0%   -96.19%  (p=0.000 n=10+8)
FindBitRange64/Pattern4444444444444444Size2-16   76.4ns ± 0%   2.3ns ± 0%   -97.04%  (p=0.000 n=8+10)
FindBitRange64/Pattern4444444444444444Size8-16   76.5ns ± 0%   2.1ns ± 0%   -97.23%  (p=0.000 n=9+10)
FindBitRange64/Pattern4444444444444444Size32-16  76.5ns ± 0%   2.1ns ± 0%   -97.23%  (p=0.000 n=8+10)
FindBitRange64/Pattern4040404040404040Size2-16   40.3ns ± 0%   2.3ns ± 0%   -94.38%  (p=0.000 n=7+10)
FindBitRange64/Pattern4040404040404040Size8-16   40.2ns ± 0%   2.1ns ± 0%   -94.75%  (p=0.000 n=10+10)
FindBitRange64/Pattern4040404040404040Size32-16  40.2ns ± 0%   2.1ns ± 0%   -94.76%  (p=0.000 n=10+6)
FindBitRange64/Pattern4000400040004000Size2-16   22.2ns ± 0%   2.2ns ± 0%   -89.86%  (p=0.001 n=8+9)
FindBitRange64/Pattern4000400040004000Size8-16   22.2ns ± 0%   2.1ns ± 0%   -90.52%  (p=0.000 n=8+10)
FindBitRange64/Pattern4000400040004000Size32-16  22.2ns ± 1%   2.1ns ± 0%   -90.50%  (p=0.000 n=10+10)

The cases that slow down aren't really that slow, and those inputs
never actually occur (there's a short circuit before the call to
findBitRange64 for that case).

Change-Id: I50fae62915098032d8ce7fa57ef29eee9deb01ba
Reviewed-on: https://go-review.googlesource.com/c/go/+/241279
Reviewed-by: Michael Knyszek <mknyszek@google.com>
src/runtime/mpallocbits.go
src/runtime/mpallocbits_test.go

index ff79bfbc1a7f3f274f523c92183c787015654b00..ff112300c37365ac962bccc560623c5642b0db63 100644 (file)
@@ -218,7 +218,7 @@ outer:
 // find searches for npages contiguous free pages in pallocBits and returns
 // the index where that run starts, as well as the index of the first free page
 // it found in the search. searchIdx represents the first known free page and
-// where to begin the search from.
+// where to begin the next search from.
 //
 // If find fails to find any free space, it returns an index of ^uint(0) and
 // the new searchIdx should be ignored.
@@ -239,9 +239,10 @@ func (b *pallocBits) find(npages uintptr, searchIdx uint) (uint, uint) {
 //
 // See find for an explanation of the searchIdx parameter.
 func (b *pallocBits) find1(searchIdx uint) uint {
+       _ = b[0] // lift nil check out of loop
        for i := searchIdx / 64; i < uint(len(b)); i++ {
                x := b[i]
-               if x == ^uint64(0) {
+               if ^x == 0 {
                        continue
                }
                return i*64 + uint(sys.TrailingZeros64(^x))
@@ -263,18 +264,18 @@ func (b *pallocBits) findSmallN(npages uintptr, searchIdx uint) (uint, uint) {
        end, newSearchIdx := uint(0), ^uint(0)
        for i := searchIdx / 64; i < uint(len(b)); i++ {
                bi := b[i]
-               if bi == ^uint64(0) {
+               if ^bi == 0 {
                        end = 0
                        continue
                }
                // First see if we can pack our allocation in the trailing
                // zeros plus the end of the last 64 bits.
-               start := uint(sys.TrailingZeros64(bi))
                if newSearchIdx == ^uint(0) {
                        // The new searchIdx is going to be at these 64 bits after any
                        // 1s we file, so count trailing 1s.
                        newSearchIdx = i*64 + uint(sys.TrailingZeros64(^bi))
                }
+               start := uint(sys.TrailingZeros64(bi))
                if end+start >= uint(npages) {
                        return i*64 - end, newSearchIdx
                }
@@ -369,15 +370,33 @@ func (b *pallocBits) pages64(i uint) uint64 {
 // findBitRange64 returns the bit index of the first set of
 // n consecutive 1 bits. If no consecutive set of 1 bits of
 // size n may be found in c, then it returns an integer >= 64.
+// n must be > 0.
 func findBitRange64(c uint64, n uint) uint {
-       i := uint(0)
-       cont := uint(sys.TrailingZeros64(^c))
-       for cont < n && i < 64 {
-               i += cont
-               i += uint(sys.TrailingZeros64(c >> i))
-               cont = uint(sys.TrailingZeros64(^(c >> i)))
+       // This implementation is based on shrinking the length of
+       // runs of contiguous 1 bits. We remove the top n-1 1 bits
+       // from each run of 1s, then look for the first remaining 1 bit.
+       p := n - 1   // number of 1s we want to remove.
+       k := uint(1) // current minimum width of runs of 0 in c.
+       for p > 0 {
+               if p <= k {
+                       // Shift p 0s down into the top of each run of 1s.
+                       c &= c >> (p & 63)
+                       break
+               }
+               // Shift k 0s down into the top of each run of 1s.
+               c &= c >> (k & 63)
+               if c == 0 {
+                       return 64
+               }
+               p -= k
+               // We've just doubled the minimum length of 0-runs.
+               // This allows us to shift farther in the next iteration.
+               k *= 2
        }
-       return i
+       // Find first remaining 1.
+       // Since we shrunk from the top down, the first 1 is in
+       // its correct original position.
+       return uint(sys.TrailingZeros64(c))
 }
 
 // pallocData encapsulates pallocBits and a bitmap for
index 42268a169887313fcf5eff27ec3d71df5c2cd0af..5095e24220e80dd19869ed840bec3c1b3c57f5bc 100644 (file)
@@ -504,10 +504,9 @@ func TestFindBitRange64(t *testing.T) {
                        t.Errorf("case (%016x, %d): got %d, want %d", x, n, i, result)
                }
        }
-       for i := uint(0); i <= 64; i++ {
+       for i := uint(1); i <= 64; i++ {
                check(^uint64(0), i, 0)
        }
-       check(0, 0, 0)
        for i := uint(1); i <= 64; i++ {
                check(0, i, ^uint(0))
        }
@@ -520,3 +519,33 @@ func TestFindBitRange64(t *testing.T) {
        check(0xffff03ff0107ffff, 16, 0)
        check(0x0fff03ff01079fff, 16, ^uint(0))
 }
+
+func BenchmarkFindBitRange64(b *testing.B) {
+       patterns := []uint64{
+               0,
+               ^uint64(0),
+               0xaa,
+               0xaaaaaaaaaaaaaaaa,
+               0x80000000aaaaaaaa,
+               0xaaaaaaaa00000001,
+               0xbbbbbbbbbbbbbbbb,
+               0x80000000bbbbbbbb,
+               0xbbbbbbbb00000001,
+               0xcccccccccccccccc,
+               0x4444444444444444,
+               0x4040404040404040,
+               0x4000400040004000,
+       }
+       sizes := []uint{
+               2, 8, 32,
+       }
+       for _, pattern := range patterns {
+               for _, size := range sizes {
+                       b.Run(fmt.Sprintf("Pattern%02XSize%d", pattern, size), func(b *testing.B) {
+                               for i := 0; i < b.N; i++ {
+                                       FindBitRange64(pattern, size)
+                               }
+                       })
+               }
+       }
+}