From a20fd1f6ba668ec0bd8c432d26def2b65cc6609a Mon Sep 17 00:00:00 2001
From: Austin Clements <austin@google.com>
Date: Fri, 29 Apr 2016 14:51:48 -0400
Subject: [PATCH] runtime: reclaim scan/dead bit in first word
MIME-Version: 1.0
Content-Type: text/plain; charset=utf8
Content-Transfer-Encoding: 8bit

With the switch to separate mark bitmaps, the scan/dead bit for the
first word of each object is now unused. Reclaim this bit and use it
as a scan/dead bit, just like words three and on. The second word is
still used for checkmark.

This dramatically simplifies heapBitsSetTypeNoScan and hasPointers,
since they no longer need different cases for 1, 2, and 3+ word
objects. They can instead just manipulate the heap bitmap for the
first word and be done with it.

In order to enable this, we change heapBitsSetType and runGCProg to
always set the scan/dead bit to scan for the first word on every code
path. Since these functions only apply to types that have pointers,
there's no need to do this conditionally: it's *always* necessary to
set the scan bit in the first word.

We also change every place that scans an object and checks if there
are more pointers. Rather than only checking morePointers if the word
is >= 2, we now check morePointers if word != 1 (since that's the
checkmark word).

Looking forward, we should probably reclaim the checkmark bit, too,
but that's going to be quite a bit more work.

Tested by setting doubleCheck in heapBitsSetType and running all.bash
on both linux/amd64 and linux/386, and by running GOGC=10 all.bash.

This particularly improves the FmtFprintf* go1 benchmarks, since they
do a large amount of noscan allocation.

name                      old time/op    new time/op    delta
BinaryTree17-12              2.34s Â± 1%     2.38s Â± 1%  +1.70%  (p=0.000 n=17+19)
Fannkuch11-12                2.09s Â± 0%     2.09s Â± 1%    ~     (p=0.276 n=17+16)
FmtFprintfEmpty-12          44.9ns Â± 2%    44.8ns Â± 2%    ~     (p=0.340 n=19+18)
FmtFprintfString-12          127ns Â± 0%     125ns Â± 0%  -1.57%  (p=0.000 n=16+15)
FmtFprintfInt-12             128ns Â± 0%     122ns Â± 1%  -4.45%  (p=0.000 n=15+20)
FmtFprintfIntInt-12          207ns Â± 1%     193ns Â± 0%  -6.55%  (p=0.000 n=19+14)
FmtFprintfPrefixedInt-12     197ns Â± 1%     191ns Â± 0%  -2.93%  (p=0.000 n=17+18)
FmtFprintfFloat-12           263ns Â± 0%     248ns Â± 1%  -5.88%  (p=0.000 n=15+19)
FmtManyArgs-12               794ns Â± 0%     779ns Â± 1%  -1.90%  (p=0.000 n=18+18)
GobDecode-12                7.14ms Â± 2%    7.11ms Â± 1%    ~     (p=0.072 n=20+20)
GobEncode-12                5.85ms Â± 1%    5.82ms Â± 1%  -0.49%  (p=0.000 n=20+20)
Gzip-12                      218ms Â± 1%     215ms Â± 1%  -1.22%  (p=0.000 n=19+19)
Gunzip-12                   36.8ms Â± 0%    36.7ms Â± 0%  -0.18%  (p=0.006 n=18+20)
HTTPClientServer-12         77.1Âµs Â± 4%    77.1Âµs Â± 3%    ~     (p=0.945 n=19+20)
JSONEncode-12               15.6ms Â± 1%    15.9ms Â± 1%  +1.68%  (p=0.000 n=18+20)
JSONDecode-12               55.2ms Â± 1%    53.6ms Â± 1%  -2.93%  (p=0.000 n=17+19)
Mandelbrot200-12            4.05ms Â± 1%    4.05ms Â± 0%    ~     (p=0.306 n=17+17)
GoParse-12                  3.14ms Â± 1%    3.10ms Â± 1%  -1.31%  (p=0.000 n=19+18)
RegexpMatchEasy0_32-12      69.3ns Â± 1%    70.0ns Â± 0%  +0.89%  (p=0.000 n=19+17)
RegexpMatchEasy0_1K-12       237ns Â± 1%     236ns Â± 0%  -0.62%  (p=0.000 n=19+16)
RegexpMatchEasy1_32-12      69.5ns Â± 1%    70.3ns Â± 1%  +1.14%  (p=0.000 n=18+17)
RegexpMatchEasy1_1K-12       377ns Â± 1%     366ns Â± 1%  -3.03%  (p=0.000 n=15+19)
RegexpMatchMedium_32-12      107ns Â± 1%     107ns Â± 2%    ~     (p=0.318 n=20+19)
RegexpMatchMedium_1K-12     33.8Âµs Â± 3%    33.5Âµs Â± 1%  -1.04%  (p=0.001 n=20+19)
RegexpMatchHard_32-12       1.68Âµs Â± 1%    1.73Âµs Â± 0%  +2.50%  (p=0.000 n=20+18)
RegexpMatchHard_1K-12       50.8Âµs Â± 1%    52.0Âµs Â± 1%  +2.50%  (p=0.000 n=19+18)
Revcomp-12                   381ms Â± 1%     385ms Â± 1%  +1.00%  (p=0.000 n=17+18)
Template-12                 64.9ms Â± 3%    62.6ms Â± 1%  -3.55%  (p=0.000 n=19+18)
TimeParse-12                 324ns Â± 0%     328ns Â± 1%  +1.25%  (p=0.000 n=18+18)
TimeFormat-12                345ns Â± 0%     334ns Â± 0%  -3.31%  (p=0.000 n=15+17)
[Geo mean]                  52.1Âµs         51.5Âµs       -1.00%

Change-Id: I13e74da3193a7f80794c654f944d1f0d60817049
Reviewed-on: https://go-review.googlesource.com/22632
Reviewed-by: Rick Hudson <rlh@golang.org>
Run-TryBot: Austin Clements <austin@google.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
---
 src/reflect/all_test.go    |   3 +
 src/runtime/cgocall.go     |   2 +-
 src/runtime/gcinfo_test.go |   5 +-
 src/runtime/heapdump.go    |   2 +-
 src/runtime/malloc.go      |   2 +-
 src/runtime/mbitmap.go     | 128 ++++++++++++++-----------------------
 src/runtime/mgcmark.go     |   2 +-
 7 files changed, 59 insertions(+), 85 deletions(-)

diff --git a/src/reflect/all_test.go b/src/reflect/all_test.go
index f8ffaae8e1..d4c3e4e588 100644
--- a/src/reflect/all_test.go
+++ b/src/reflect/all_test.go
@@ -5399,6 +5399,9 @@ func verifyGCBitsSlice(t *testing.T, typ Type, cap int, bits []byte) {
 	for len(bits) > 2 && bits[len(bits)-1] == 0 {
 		bits = bits[:len(bits)-1]
 	}
+	if len(bits) == 2 && bits[0] == 0 && bits[1] == 0 {
+		bits = bits[:0]
+	}
 	if !bytes.Equal(heapBits, bits) {
 		t.Errorf("heapBits incorrect for make(%v, 0, %v)\nhave %v\nwant %v", typ, cap, heapBits, bits)
 	}
diff --git a/src/runtime/cgocall.go b/src/runtime/cgocall.go
index 887343edd1..6dceff09ef 100644
--- a/src/runtime/cgocall.go
+++ b/src/runtime/cgocall.go
@@ -559,7 +559,7 @@ func cgoCheckUnknownPointer(p unsafe.Pointer, msg string) (base, i uintptr) {
 		}
 		n := span.elemsize
 		for i = uintptr(0); i < n; i += sys.PtrSize {
-			if i >= 2*sys.PtrSize && !hbits.morePointers() {
+			if i != 1*sys.PtrSize && !hbits.morePointers() {
 				// No more possible pointers.
 				break
 			}
diff --git a/src/runtime/gcinfo_test.go b/src/runtime/gcinfo_test.go
index 9a61b4f2b2..011f005403 100644
--- a/src/runtime/gcinfo_test.go
+++ b/src/runtime/gcinfo_test.go
@@ -66,7 +66,7 @@ func verifyGCInfo(t *testing.T, name string, p interface{}, mask0 []byte) {
 }
 
 func padDead(mask []byte) []byte {
-	// Because the dead bit isn't encoded until the third word,
+	// Because the dead bit isn't encoded in the second word,
 	// and because on 32-bit systems a one-word allocation
 	// uses a two-word block, the pointer info for a one-word
 	// object needs to be expanded to include an extra scalar
@@ -81,6 +81,9 @@ func trimDead(mask []byte) []byte {
 	for len(mask) > 2 && mask[len(mask)-1] == typeScalar {
 		mask = mask[:len(mask)-1]
 	}
+	if len(mask) == 2 && mask[0] == typeScalar && mask[1] == typeScalar {
+		mask = mask[:0]
+	}
 	return mask
 }
 
diff --git a/src/runtime/heapdump.go b/src/runtime/heapdump.go
index 4afe663418..c317b5f969 100644
--- a/src/runtime/heapdump.go
+++ b/src/runtime/heapdump.go
@@ -714,7 +714,7 @@ func makeheapobjbv(p uintptr, size uintptr) bitvector {
 	i := uintptr(0)
 	hbits := heapBitsForAddr(p)
 	for ; i < nptr; i++ {
-		if i >= 2 && !hbits.morePointers() {
+		if i != 1 && !hbits.morePointers() {
 			break // end of object
 		}
 		if hbits.isPointer() {
diff --git a/src/runtime/malloc.go b/src/runtime/malloc.go
index c9cc82192d..bb17919fd0 100644
--- a/src/runtime/malloc.go
+++ b/src/runtime/malloc.go
@@ -700,7 +700,7 @@ func mallocgc(size uintptr, typ *_type, needzero bool) unsafe.Pointer {
 
 	var scanSize uintptr
 	if noscan {
-		heapBitsSetTypeNoScan(uintptr(x), size)
+		heapBitsSetTypeNoScan(uintptr(x))
 	} else {
 		// If allocating a defer+arg block, now that we've picked a malloc size
 		// large enough to hold everything, cut the "asked for" size down to
diff --git a/src/runtime/mbitmap.go b/src/runtime/mbitmap.go
index 0bfb184945..8061e1d138 100644
--- a/src/runtime/mbitmap.go
+++ b/src/runtime/mbitmap.go
@@ -24,13 +24,15 @@
 // In each 2-bit entry, the lower bit holds the same information as in the 1-bit
 // bitmaps: 0 means uninteresting and 1 means live pointer to be visited during GC.
 // The meaning of the high bit depends on the position of the word being described
-// in its allocated object. In the first word, the high bit is unused.
+// in its allocated object. In all words *except* the second word, the
+// high bit indicates that the object is still being described. In
+// these words, if a bit pair with a high bit 0 is encountered, the
+// low bit can also be assumed to be 0, and the object description is
+// over. This 00 is called the ``dead'' encoding: it signals that the
+// rest of the words in the object are uninteresting to the garbage
+// collector.
+//
 // In the second word, the high bit is the GC ``checkmarked'' bit (see below).
-// In the third and later words, the high bit indicates that the object is still
-// being described. In these words, if a bit pair with a high bit 0 is encountered,
-// the low bit can also be assumed to be 0, and the object description is over.
-// This 00 is called the ``dead'' encoding: it signals that the rest of the words
-// in the object are uninteresting to the garbage collector.
 //
 // The 2-bit entries are split when written into the byte, so that the top half
 // of the byte contains 4 high bits and the bottom half contains 4 low (pointer)
@@ -76,7 +78,7 @@ import (
 
 const (
 	bitPointer = 1 << 0
-	bitMarked  = 1 << 4
+	bitMarked  = 1 << 4 // TODO: Rename bitScan.
 
 	heapBitsShift   = 1                     // shift offset between successive bitPointer or bitMarked entries
 	heapBitmapScale = sys.PtrSize * (8 / 2) // number of data bytes described by one heap bitmap byte
@@ -490,7 +492,7 @@ func (h heapBits) bits() uint32 {
 
 // morePointers returns true if this word and all remaining words in this object
 // are scalars.
-// h must not describe the first or second word of the object.
+// h must not describe the second word of the object.
 func (h heapBits) morePointers() bool {
 	return h.bits()&bitMarked != 0
 }
@@ -512,22 +514,7 @@ func (h heapBits) hasPointers(size uintptr) bool {
 	if size == sys.PtrSize { // 1-word objects are always pointers
 		return true
 	}
-	// Otherwise, at least a 2-word object, and at least 2-word aligned,
-	// so h.shift is either 0 or 2, so we know we can get the bits for the
-	// first two words out of *h.bitp.
-	// If either of the first two words is a pointer, not pointer free.
-	b := uint32(*h.bitp >> h.shift)
-	if b&(bitPointer|bitPointer<<heapBitsShift) != 0 {
-		return true
-	}
-	if size == 2*sys.PtrSize {
-		return false
-	}
-	// At least a 4-word object. Check scan bit (aka marked bit) in third word.
-	if h.shift == 0 {
-		return b&(bitMarked<<(2*heapBitsShift)) != 0
-	}
-	return uint32(*subtract1(h.bitp))&bitMarked != 0
+	return (*h.bitp>>h.shift)&bitMarked != 0
 }
 
 // isCheckmarked reports whether the heap bits have the checkmarked bit set.
@@ -720,9 +707,9 @@ func typeBitsBulkBarrier(typ *_type, p, size uintptr) {
 // TODO(rsc): Perhaps introduce a different heapBitsSpan type.
 
 // initSpan initializes the heap bitmap for a span.
-// It clears all mark and checkmark bits.
+// It clears all checkmark bits.
 // If this is a span of pointer-sized objects, it initializes all
-// words to pointer (and there are no dead bits).
+// words to pointer/scan.
 // Otherwise, it initializes all words to scalar/dead.
 func (h heapBits) initSpan(s *mspan) {
 	size, n, total := s.layout()
@@ -745,7 +732,7 @@ func (h heapBits) initSpan(s *mspan) {
 		end := h.bitp
 		bitp := subtractb(end, nbyte-1)
 		for {
-			*bitp = bitPointerAll
+			*bitp = bitPointerAll | bitMarkedAll
 			if bitp == end {
 				break
 			}
@@ -897,6 +884,9 @@ func heapBitsSetType(x, size, dataSize uintptr, typ *_type) {
 			if !h.isPointer() {
 				throw("heapBitsSetType: pointer bit missing")
 			}
+			if !h.morePointers() {
+				throw("heapBitsSetType: scan bit missing")
+			}
 		}
 		return
 	}
@@ -924,17 +914,17 @@ func heapBitsSetType(x, size, dataSize uintptr, typ *_type) {
 				// unused second word.
 				if gcphase == _GCoff {
 					*h.bitp &^= (bitPointer | bitMarked | ((bitPointer | bitMarked) << heapBitsShift)) << h.shift
-					*h.bitp |= bitPointer << h.shift
+					*h.bitp |= (bitPointer | bitMarked) << h.shift
 				} else {
 					atomic.And8(h.bitp, ^uint8((bitPointer|bitMarked|((bitPointer|bitMarked)<<heapBitsShift))<<h.shift))
-					atomic.Or8(h.bitp, bitPointer<<h.shift)
+					atomic.Or8(h.bitp, (bitPointer|bitMarked)<<h.shift)
 				}
 			} else {
 				// 2-element slice of pointer.
 				if gcphase == _GCoff {
-					*h.bitp |= (bitPointer | bitPointer<<heapBitsShift) << h.shift
+					*h.bitp |= (bitPointer | bitMarked | bitPointer<<heapBitsShift) << h.shift
 				} else {
-					atomic.Or8(h.bitp, (bitPointer|bitPointer<<heapBitsShift)<<h.shift)
+					atomic.Or8(h.bitp, (bitPointer|bitMarked|bitPointer<<heapBitsShift)<<h.shift)
 				}
 			}
 			return
@@ -948,7 +938,7 @@ func heapBitsSetType(x, size, dataSize uintptr, typ *_type) {
 			}
 		}
 		b := uint32(*ptrmask)
-		hb := b & 3
+		hb := (b & 3) | bitMarked
 		if gcphase == _GCoff {
 			// bitPointer == 1, bitMarked is 1 << 4, heapBitsShift is 1.
 			// 110011 is shifted h.shift and complemented.
@@ -1143,12 +1133,20 @@ func heapBitsSetType(x, size, dataSize uintptr, typ *_type) {
 	case h.shift == 0:
 		// Ptrmask and heap bitmap are aligned.
 		// Handle first byte of bitmap specially.
-		// The first byte we write out contains the first two words of the object.
-		// In those words, the mark bits are mark and checkmark, respectively,
-		// and must not be set. In all following words, we want to set the mark bit
-		// as a signal that the object continues to the next 2-bit entry in the bitmap.
+		//
+		// The first byte we write out covers the first four
+		// words of the object. The scan/dead bit on the first
+		// word must be set to scan since there are pointers
+		// somewhere in the object. The scan/dead bit on the
+		// second word is the checkmark, so we don't set it.
+		// In all following words, we set the scan/dead
+		// appropriately to indicate that the object contains
+		// to the next 2-bit entry in the bitmap.
+		//
+		// TODO: It doesn't matter if we set the checkmark, so
+		// maybe this case isn't needed any more.
 		hb = b & bitPointerAll
-		hb |= bitMarked<<(2*heapBitsShift) | bitMarked<<(3*heapBitsShift)
+		hb |= bitMarked | bitMarked<<(2*heapBitsShift) | bitMarked<<(3*heapBitsShift)
 		if w += 4; w >= nw {
 			goto Phase3
 		}
@@ -1169,14 +1167,18 @@ func heapBitsSetType(x, size, dataSize uintptr, typ *_type) {
 		// from a given span at a time, we should be able to set
 		// these bits non-atomically. Not worth the risk right now.
 		hb = (b & (bitPointer | bitPointer<<heapBitsShift)) << (2 * heapBitsShift)
+		// This is not noscan, so set the scan bit in the
+		// first word.
+		hb |= bitMarked << (2 * heapBitsShift)
 		b >>= 2
 		nb -= 2
-		// Note: no bitMarker in hb because the first two words don't get markers from us.
+		// Note: no bitMarker for second word because that's
+		// the checkmark.
 		if gcphase == _GCoff {
-			*hbitp &^= uint8((bitPointer | (bitPointer << heapBitsShift)) << (2 * heapBitsShift))
+			*hbitp &^= uint8((bitPointer | bitMarked | (bitPointer << heapBitsShift)) << (2 * heapBitsShift))
 			*hbitp |= uint8(hb)
 		} else {
-			atomic.And8(hbitp, ^(uint8(bitPointer|bitPointer<<heapBitsShift) << (2 * heapBitsShift)))
+			atomic.And8(hbitp, ^(uint8(bitPointer|bitMarked|bitPointer<<heapBitsShift) << (2 * heapBitsShift)))
 			atomic.Or8(hbitp, uint8(hb))
 		}
 		hbitp = subtract1(hbitp)
@@ -1336,7 +1338,7 @@ Phase4:
 				if j < nptr && (*addb(ptrmask, j/8)>>(j%8))&1 != 0 {
 					want |= bitPointer
 				}
-				if i >= 2 {
+				if i != 1 {
 					want |= bitMarked
 				} else {
 					have &^= bitMarked
@@ -1366,39 +1368,11 @@ Phase4:
 	}
 }
 
-// heapBitsSetTypeNoScan marks x as noscan. For objects with 1 or 2
-// words set their bitPointers to off (0).
-// All other objects have the first 3 bitPointers set to
-// off (0) and the scan word in the third word
-// also set to off (0).
-func heapBitsSetTypeNoScan(x, size uintptr) {
+// heapBitsSetTypeNoScan marks x as noscan by setting the first word
+// of x in the heap bitmap to scalar/dead.
+func heapBitsSetTypeNoScan(x uintptr) {
 	h := heapBitsForAddr(uintptr(x))
-	bitp := h.bitp
-
-	if sys.PtrSize == 8 && size == sys.PtrSize {
-		// If this is truely noScan the tinyAlloc logic should have noticed
-		// and combined such objects.
-		throw("noscan object is too small")
-	} else if size%(4*sys.PtrSize) == 0 {
-		*bitp &^= bitPointer | bitPointer<<heapBitsShift | (bitMarked|bitPointer)<<(2*heapBitsShift)
-	} else if size%(4*sys.PtrSize) == 2*sys.PtrSize {
-		if h.shift == 0 {
-			*bitp &^= (bitPointer | bitPointer<<heapBitsShift)
-			if size > 2*sys.PtrSize {
-				*bitp &^= (bitPointer | bitMarked) << (2 * heapBitsShift)
-			}
-		} else if h.shift == 2 {
-			*bitp &^= bitPointer<<(2*heapBitsShift) | bitPointer<<(3*heapBitsShift)
-			if size > 2*sys.PtrSize {
-				bitp = subtract1(bitp)
-				*bitp &^= bitPointer | bitMarked
-			}
-		} else {
-			throw("Type has unrecognized size")
-		}
-	} else {
-		throw("Type has unrecognized size")
-	}
+	*h.bitp &^= (bitPointer | bitMarked) << h.shift
 }
 
 var debugPtrmask struct {
@@ -1804,12 +1778,6 @@ Run:
 			dst = subtract1(dst)
 			bits >>= 4
 		}
-		// Clear the mark bits in the first two entries.
-		// They are the actual mark and checkmark bits,
-		// not non-dead markers. It simplified the code
-		// above to set the marker in every bit written and
-		// then clear these two as a special case at the end.
-		*dstStart &^= bitMarked | bitMarked<<heapBitsShift
 	}
 	return totalBits
 }
@@ -1925,7 +1893,7 @@ func getgcmask(ep interface{}) (mask []byte) {
 			if hbits.isPointer() {
 				mask[i/sys.PtrSize] = 1
 			}
-			if i >= 2*sys.PtrSize && !hbits.morePointers() {
+			if i != 1*sys.PtrSize && !hbits.morePointers() {
 				mask = mask[:i/sys.PtrSize]
 				break
 			}
diff --git a/src/runtime/mgcmark.go b/src/runtime/mgcmark.go
index 8c8ce67fbf..af3205ab23 100644
--- a/src/runtime/mgcmark.go
+++ b/src/runtime/mgcmark.go
@@ -1132,7 +1132,7 @@ func scanobject(b uintptr, gcw *gcWork) {
 		// in the type bit for the one word. The only one-word objects
 		// are pointers, or else they'd be merged with other non-pointer
 		// data into larger allocations.
-		if i >= 2*sys.PtrSize && !hbits.morePointers() {
+		if i != 1*sys.PtrSize && !hbits.morePointers() {
 			break // no more pointers in this object
 		}
 		if !hbits.isPointer() {
-- 
2.52.0