From 0234dfd493b9bb009728d60658aa0ccdd2463c09 Mon Sep 17 00:00:00 2001
From: Russ Cox <rsc@golang.org>
Date: Mon, 4 May 2015 10:19:24 -0400
Subject: [PATCH] runtime: use 2-bit heap bitmap (in place of 4-bit)

Previous CLs changed the representation of the non-heap type bitmaps
to be 1-bit bitmaps (pointer or not). Before this CL, the heap bitmap
stored a 2-bit type for each word and a mark bit and checkmark bit
for the first word of the object. (There used to be additional per-word bits.)

Reduce heap bitmap to 2-bit, with 1 dedicated to pointer or not,
and the other used for mark, checkmark, and "keep scanning forward
to find pointers in this object." See comments for details.

This CL replaces heapBitsSetType with very slow but obviously correct code.
A followup CL will optimize it. (Spoiler: the new code is faster than Go 1.4 was.)

Change-Id: I999577a133f3cfecacebdec9cdc3573c235c7fb9
Reviewed-on: https://go-review.googlesource.com/9703
Reviewed-by: Rick Hudson <rlh@golang.org>
Reviewed-by: Austin Clements <austin@google.com>
---
 src/runtime/gcinfo_test.go |  23 +-
 src/runtime/heapdump.go    |   7 +-
 src/runtime/mbitmap.go     | 489 +++++++++++++++++++------------------
 src/runtime/mgcmark.go     |  34 ++-
 src/runtime/stack1.go      |   6 +
 5 files changed, 293 insertions(+), 266 deletions(-)

diff --git a/src/runtime/gcinfo_test.go b/src/runtime/gcinfo_test.go
index b4ab9134aa..dd5c25e0b1 100644
--- a/src/runtime/gcinfo_test.go
+++ b/src/runtime/gcinfo_test.go
@@ -10,6 +10,12 @@ import (
 	"testing"
 )
 
+const (
+	typeScalar  = 0
+	typePointer = 1
+	typeDead    = 255
+)
+
 // TestGCInfo tests that various objects in heap, data and bss receive correct GC pointer type info.
 func TestGCInfo(t *testing.T) {
 	verifyGCInfo(t, "bss ScalarPtr", &bssScalarPtr, infoScalarPtr)
@@ -37,7 +43,9 @@ func TestGCInfo(t *testing.T) {
 	verifyGCInfo(t, "stack iface", new(Iface), nonStackInfo(infoIface))
 
 	for i := 0; i < 10; i++ {
+		verifyGCInfo(t, "heap PtrSlice", escape(&make([]*byte, 10)[0]), infoPtr10)
 		verifyGCInfo(t, "heap ScalarPtr", escape(new(ScalarPtr)), infoScalarPtr)
+		verifyGCInfo(t, "heap ScalarPtrSlice", escape(&make([]ScalarPtr, 4)[0]), infoScalarPtr4)
 		verifyGCInfo(t, "heap PtrScalar", escape(new(PtrScalar)), infoPtrScalar)
 		verifyGCInfo(t, "heap BigStruct", escape(new(BigStruct)), infoBigStruct())
 		verifyGCInfo(t, "heap string", escape(new(string)), infoString)
@@ -78,18 +86,7 @@ func escape(p interface{}) interface{} {
 	return p
 }
 
-const (
-	typeDead = iota
-	typeScalar
-	typePointer
-)
-
-const (
-	BitsString = iota // unused
-	BitsSlice         // unused
-	BitsIface
-	BitsEface
-)
+var infoPtr10 = []byte{typePointer, typePointer, typePointer, typePointer, typePointer, typePointer, typePointer, typePointer, typePointer, typePointer}
 
 type ScalarPtr struct {
 	q int
@@ -102,6 +99,8 @@ type ScalarPtr struct {
 
 var infoScalarPtr = []byte{typeScalar, typePointer, typeScalar, typePointer, typeScalar, typePointer}
 
+var infoScalarPtr4 = append(append(append(append([]byte(nil), infoScalarPtr...), infoScalarPtr...), infoScalarPtr...), infoScalarPtr...)
+
 type PtrScalar struct {
 	q *int
 	w int
diff --git a/src/runtime/heapdump.go b/src/runtime/heapdump.go
index e18aa79164..0add63acb4 100644
--- a/src/runtime/heapdump.go
+++ b/src/runtime/heapdump.go
@@ -730,14 +730,13 @@ func makeheapobjbv(p uintptr, size uintptr) bitvector {
 	i := uintptr(0)
 	hbits := heapBitsForAddr(p)
 	for ; i < nptr; i++ {
-		bits := hbits.typeBits()
-		if bits == typeDead {
+		if i >= 2 && !hbits.isMarked() {
 			break // end of object
 		}
-		hbits = hbits.next()
-		if bits == typePointer {
+		if hbits.isPointer() {
 			tmpbuf[i/8] |= 1 << (i % 8)
 		}
+		hbits = hbits.next()
 	}
 	return bitvector{int32(i), &tmpbuf[0]}
 }
diff --git a/src/runtime/mbitmap.go b/src/runtime/mbitmap.go
index cfdd259371..dea6879adc 100644
--- a/src/runtime/mbitmap.go
+++ b/src/runtime/mbitmap.go
@@ -6,48 +6,36 @@
 //
 // Stack, data, and bss bitmaps
 //
-// Not handled in this file, but worth mentioning: stack frames and global data
-// in the data and bss sections are described by 1-bit bitmaps in which 0 means
-// scalar or uninitialized or dead and 1 means pointer to visit during GC.
-//
-// Comparing this 1-bit form with the 2-bit form described below, 0 represents
-// both the 2-bit 00 and 01, while 1 represents the 2-bit 10.
-// Therefore conversions between the two (until the 2-bit form is gone)
-// can be done by x>>1 for 2-bit to 1-bit and x+1 for 1-bit to 2-bit.
-//
-// Type bitmaps
-//
-// Types that aren't too large
-// record information about the layout of their memory words using a type bitmap.
-// The bitmap holds two bits for each pointer-sized word. The two-bit values are:
-//
-// 	00 - typeDead: not a pointer, and no pointers in the rest of the object
-//	01 - typeScalar: not a pointer
-//	10 - typePointer: a pointer that GC should trace
-//	11 - unused
-//
-// typeDead only appears in type bitmaps in Go type descriptors
-// and in type bitmaps embedded in the heap bitmap (see below).
+// Stack frames and global variables in the data and bss sections are described
+// by 1-bit bitmaps in which 0 means uninteresting and 1 means live pointer
+// to be visited during GC.
 //
 // Heap bitmap
 //
 // The allocated heap comes from a subset of the memory in the range [start, used),
 // where start == mheap_.arena_start and used == mheap_.arena_used.
-// The heap bitmap comprises 4 bits for each pointer-sized word in that range,
+// The heap bitmap comprises 2 bits for each pointer-sized word in that range,
 // stored in bytes indexed backward in memory from start.
-// That is, the byte at address start-1 holds the 4-bit entries for the two words
-// start, start+ptrSize, the byte at start-2 holds the entries for start+2*ptrSize,
-// start+3*ptrSize, and so on.
-// In the byte holding the entries for addresses p and p+ptrSize, the low 4 bits
-// describe p and the high 4 bits describe p+ptrSize.
+// That is, the byte at address start-1 holds the 2-bit entries for the four words
+// start through start+3*ptrSize, the byte at start-2 holds the entries for
+// start+4*ptrSize through start+7*ptrSize, and so on.
+// In each byte, the low 2 bits describe the first word, the next 2 bits describe
+// the next word, and so on.
 //
-// The 4 bits for each word are:
-//	0001 - not used
-//	0010 - bitMarked: this object has been marked by GC
-//	tt00 - word type bits, as in a type bitmap.
+// In each 2-bit entry, the lower bit holds the same information as in the 1-bit
+// bitmaps: 0 means uninteresting and 1 means live pointer to be visited during GC.
+// The meaning of the high bit depends on the position of the word being described
+// in its allocated object. In the first word, the high bit is the GC ``marked'' bit.
+// In the second word, the high bit is the GC ``checkmarked'' bit (see below).
+// In the third and later words, the high bit indicates that the object is still
+// being described. In these words, if a bit pair with a high bit 0 is encountered,
+// the low bit can also be assumed to be 0, and the object description is over.
+// This 00 is called the ``dead'' encoding: it signals that the rest of the words
+// in the object are uninteresting to the garbage collector.
 //
-// The code makes use of the fact that the zero value for a heap bitmap nibble
-// has no boundary bit set, no marked bit set, and type bits == typeDead.
+// The code makes use of the fact that the zero value for a heap bitmap
+// has no live pointer bit set and is (depending on position), not marked,
+// not checkmarked, and is the dead encoding.
 // These properties must be preserved when modifying the encoding.
 //
 // Checkmarks
@@ -57,44 +45,32 @@
 // collector implementation. As a sanity check, the GC has a 'checkmark'
 // mode that retraverses the object graph with the world stopped, to make
 // sure that everything that should be marked is marked.
-// In checkmark mode, in the heap bitmap, the type bits for the first word
-// of an object are redefined:
-//
-//	00 - typeScalarCheckmarked // typeScalar, checkmarked
-//	01 - typeScalar // typeScalar, not checkmarked
-//	10 - typePointer // typePointer, not checkmarked
-//	11 - typePointerCheckmarked // typePointer, checkmarked
+// In checkmark mode, in the heap bitmap, the high bit of the 2-bit entry
+// for the second word of the object holds the checkmark bit.
+// When not in checkmark mode, this bit is set to 1.
 //
-// That is, typeDead is redefined to be typeScalar + a checkmark, and the
-// previously unused 11 pattern is redefined to be typePointer + a checkmark.
-// To prepare for this mode, we must move any typeDead in the first word of
-// a multiword object to the second word.
+// The smallest possible allocation is 8 bytes. On a 32-bit machine, that
+// means every allocated object has two words, so there is room for the
+// checkmark bit. On a 64-bit machine, however, the 8-byte allocation is
+// just one word, so the second bit pair is not available for encoding the
+// checkmark. However, because non-pointer allocations are combined
+// into larger 16-byte (maxTinySize) allocations, a plain 8-byte allocation
+// must be a pointer, so the type bit in the first word is not actually needed.
+// It is still used in general, except in checkmark the type bit is repurposed
+// as the checkmark bit and then reinitialized (to 1) as the type bit when
+// finished.
 
 package runtime
 
 import "unsafe"
 
 const (
-	typeDead               = 0
-	typeScalarCheckmarked  = 0
-	typeScalar             = 1
-	typePointer            = 2
-	typePointerCheckmarked = 3
-
-	typeBitsWidth = 2 // # of type bits per pointer-sized word
-	typeMask      = 1<<typeBitsWidth - 1
-
-	heapBitsWidth   = 4
-	heapBitmapScale = ptrSize * (8 / heapBitsWidth) // number of data bytes per heap bitmap byte
-	bitMarked       = 2
-	typeShift       = 2
-)
+	bitPointer = 1
+	bitMarked  = 2
 
-// Information from the compiler about the layout of stack frames.
-type bitvector struct {
-	n        int32 // # of bits
-	bytedata *uint8
-}
+	heapBitsWidth   = 2                             // heap bitmap bits to describe one pointer
+	heapBitmapScale = ptrSize * (8 / heapBitsWidth) // number of data bytes described by one heap bitmap byte
+)
 
 // addb returns the byte pointer p+n.
 //go:nowritebarrier
@@ -141,8 +117,9 @@ type heapBits struct {
 // heapBitsForAddr returns the heapBits for the address addr.
 // The caller must have already checked that addr is in the range [mheap_.arena_start, mheap_.arena_used).
 func heapBitsForAddr(addr uintptr) heapBits {
+	// 2 bits per work, 4 pairs per byte, and a mask is hard coded.
 	off := (addr - mheap_.arena_start) / ptrSize
-	return heapBits{(*uint8)(unsafe.Pointer(mheap_.arena_start - off/2 - 1)), uint32(4 * (off & 1))}
+	return heapBits{(*uint8)(unsafe.Pointer(mheap_.arena_start - off/4 - 1)), uint32(2 * (off & 3))}
 }
 
 // heapBitsForSpan returns the heapBits for the span base address base.
@@ -229,20 +206,39 @@ func (h heapBits) prefetch() {
 // That is, if h describes address p, h.next() describes p+ptrSize.
 // Note that next does not modify h. The caller must record the result.
 func (h heapBits) next() heapBits {
-	if h.shift == 0 {
-		return heapBits{h.bitp, 4}
+	if h.shift < 8-heapBitsWidth {
+		return heapBits{h.bitp, h.shift + heapBitsWidth}
 	}
 	return heapBits{subtractb(h.bitp, 1), 0}
 }
 
+// forward returns the heapBits describing n pointer-sized words ahead of h in memory.
+// That is, if h describes address p, h.forward(n) describes p+n*ptrSize.
+// h.forward(1) is equivalent to h.next(), just slower.
+// Note that forward does not modify h. The caller must record the result.
+// bits returns the heap bits for the current word.
+func (h heapBits) forward(n uintptr) heapBits {
+	n += uintptr(h.shift) / heapBitsWidth
+	return heapBits{subtractb(h.bitp, n/4), uint32(n%4) * heapBitsWidth}
+}
+
+// The caller can test isMarked and isPointer by &-ing with bitMarked and bitPointer.
+// The result includes in its higher bits the bits for subsequent words
+// described by the same bitmap byte.
+func (h heapBits) bits() uint32 {
+	return uint32(*h.bitp) >> h.shift
+}
+
 // isMarked reports whether the heap bits have the marked bit set.
+// h must describe the initial word of the object.
 func (h heapBits) isMarked() bool {
 	return *h.bitp&(bitMarked<<h.shift) != 0
 }
 
 // setMarked sets the marked bit in the heap bits, atomically.
+// h must describe the initial word of the object.
 func (h heapBits) setMarked() {
-	// Each byte of GC bitmap holds info for two words.
+	// Each byte of GC bitmap holds info for four words.
 	// Might be racing with other updates, so use atomic update always.
 	// We used to be clever here and use a non-atomic update in certain
 	// cases, but it's not worth the risk.
@@ -250,31 +246,68 @@ func (h heapBits) setMarked() {
 }
 
 // setMarkedNonAtomic sets the marked bit in the heap bits, non-atomically.
+// h must describe the initial word of the object.
 func (h heapBits) setMarkedNonAtomic() {
 	*h.bitp |= bitMarked << h.shift
 }
 
-// typeBits returns the heap bits' type bits.
-func (h heapBits) typeBits() uint8 {
-	return (*h.bitp >> (h.shift + typeShift)) & typeMask
+// isPointer reports whether the heap bits describe a pointer word.
+// h must describe the initial word of the object.
+func (h heapBits) isPointer() bool {
+	return (*h.bitp>>h.shift)&bitPointer != 0
+}
+
+// hasPointers reports whether the given object has any pointers.
+// It must be told how large the object at h is, so that it does not read too
+// far into the bitmap.
+// h must describe the initial word of the object.
+func (h heapBits) hasPointers(size uintptr) bool {
+	if size == ptrSize { // 1-word objects are always pointers
+		return true
+	}
+	// Otherwise, at least a 2-word object, and at least 2-word aligned,
+	// so h.shift is either 0 or 4, so we know we can get the bits for the
+	// first two words out of *h.bitp.
+	// If either of the first two words is a pointer, not pointer free.
+	b := uint32(*h.bitp >> h.shift)
+	if b&(bitPointer|bitPointer<<heapBitsWidth) != 0 {
+		return true
+	}
+	if size == 2*ptrSize {
+		return false
+	}
+	// At least a 4-word object. Check scan bit (aka marked bit) in third word.
+	if h.shift == 0 {
+		return b&(bitMarked<<(2*heapBitsWidth)) != 0
+	}
+	return uint32(*subtractb(h.bitp, 1))&bitMarked != 0
 }
 
 // isCheckmarked reports whether the heap bits have the checkmarked bit set.
-func (h heapBits) isCheckmarked() bool {
-	typ := h.typeBits()
-	return typ == typeScalarCheckmarked || typ == typePointerCheckmarked
+// It must be told how large the object at h is, because the encoding of the
+// checkmark bit varies by size.
+// h must describe the initial word of the object.
+func (h heapBits) isCheckmarked(size uintptr) bool {
+	if size == ptrSize {
+		return (*h.bitp>>h.shift)&bitPointer != 0
+	}
+	// All multiword objects are 2-word aligned,
+	// so we know that the initial word's 2-bit pair
+	// and the second word's 2-bit pair are in the
+	// same heap bitmap byte, *h.bitp.
+	return (*h.bitp>>(heapBitsWidth+h.shift))&bitMarked != 0
 }
 
 // setCheckmarked sets the checkmarked bit.
-func (h heapBits) setCheckmarked() {
-	typ := h.typeBits()
-	if typ == typeScalar {
-		// Clear low type bit to turn 01 into 00.
-		atomicand8(h.bitp, ^((1 << typeShift) << h.shift))
-	} else if typ == typePointer {
-		// Set low type bit to turn 10 into 11.
-		atomicor8(h.bitp, (1<<typeShift)<<h.shift)
+// It must be told how large the object at h is, because the encoding of the
+// checkmark bit varies by size.
+// h must describe the initial word of the object.
+func (h heapBits) setCheckmarked(size uintptr) {
+	if size == ptrSize {
+		atomicor8(h.bitp, bitPointer<<h.shift)
+		return
 	}
+	atomicor8(h.bitp, bitMarked<<(heapBitsWidth+h.shift))
 }
 
 // The methods operating on spans all require that h has been returned
@@ -295,95 +328,43 @@ func (h heapBits) initSpan(size, n, total uintptr) {
 }
 
 // initCheckmarkSpan initializes a span for being checkmarked.
-// This would be a no-op except that we need to rewrite any
-// typeDead bits in the first word of the object into typeScalar
-// followed by a typeDead in the second word of the object.
+// It clears the checkmark bits, which are set to 1 in normal operation.
 func (h heapBits) initCheckmarkSpan(size, n, total uintptr) {
-	if size == ptrSize {
+	// The ptrSize == 8 is a compile-time constant false on 32-bit and eliminates this code entirely.
+	if ptrSize == 8 && size == ptrSize {
+		// Checkmark bit is type bit, bottom bit of every 2-bit entry.
 		// Only possible on 64-bit system, since minimum size is 8.
-		// Must update both top and bottom nibble of each byte.
-		// There is no second word in these objects, so all we have
-		// to do is rewrite typeDead to typeScalar by adding the 1<<typeShift bit.
+		// Must clear type bit (checkmark bit) of every word.
+		// The type bit is the lower of every two-bit pair.
 		bitp := h.bitp
-		for i := uintptr(0); i < n; i += 2 {
-			x := int(*bitp)
-
-			if (x>>typeShift)&typeMask == typeDead {
-				x += (typeScalar - typeDead) << typeShift
-			}
-			if (x>>(4+typeShift))&typeMask == typeDead {
-				x += (typeScalar - typeDead) << (4 + typeShift)
-			}
-			*bitp = uint8(x)
+		for i := uintptr(0); i < n; i += 4 {
+			*bitp &^= bitPointer | bitPointer<<2 | bitPointer<<4 | bitPointer<<6
 			bitp = subtractb(bitp, 1)
 		}
 		return
 	}
-
-	// Update bottom nibble for first word of each object.
-	// If the bottom nibble says typeDead, change to typeScalar
-	// and clear top nibble to mark as typeDead.
-	bitp := h.bitp
-	step := size / heapBitmapScale
 	for i := uintptr(0); i < n; i++ {
-		x := *bitp
-		if (x>>typeShift)&typeMask == typeDead {
-			x += (typeScalar - typeDead) << typeShift
-			x &= 0x0f // clear top nibble to typeDead
-		}
-		bitp = subtractb(bitp, step)
+		*h.bitp &^= bitMarked << (heapBitsWidth + h.shift)
+		h = h.forward(size / ptrSize)
 	}
 }
 
-// clearCheckmarkSpan removes all the checkmarks from a span.
-// If it finds a multiword object starting with typeScalar typeDead,
-// it rewrites the heap bits to the simpler typeDead typeDead.
+// clearCheckmarkSpan undoes all the checkmarking in a span.
+// The actual checkmark bits are ignored, so the only work to do
+// is to fix the pointer bits. (Pointer bits are ignored by scanobject
+// but consulted by typedmemmove.)
 func (h heapBits) clearCheckmarkSpan(size, n, total uintptr) {
-	if size == ptrSize {
+	// The ptrSize == 8 is a compile-time constant false on 32-bit and eliminates this code entirely.
+	if ptrSize == 8 && size == ptrSize {
+		// Checkmark bit is type bit, bottom bit of every 2-bit entry.
 		// Only possible on 64-bit system, since minimum size is 8.
-		// Must update both top and bottom nibble of each byte.
-		// typeScalarCheckmarked can be left as typeDead,
-		// but we want to change typeScalar back to typeDead.
+		// Must clear type bit (checkmark bit) of every word.
+		// The type bit is the lower of every two-bit pair.
 		bitp := h.bitp
-		for i := uintptr(0); i < n; i += 2 {
-			x := int(*bitp)
-			switch typ := (x >> typeShift) & typeMask; typ {
-			case typeScalar:
-				x += (typeDead - typeScalar) << typeShift
-			case typePointerCheckmarked:
-				x += (typePointer - typePointerCheckmarked) << typeShift
-			}
-
-			switch typ := (x >> (4 + typeShift)) & typeMask; typ {
-			case typeScalar:
-				x += (typeDead - typeScalar) << (4 + typeShift)
-			case typePointerCheckmarked:
-				x += (typePointer - typePointerCheckmarked) << (4 + typeShift)
-			}
-
-			*bitp = uint8(x)
+		for i := uintptr(0); i < n; i += 4 {
+			*bitp |= bitPointer | bitPointer<<2 | bitPointer<<4 | bitPointer<<6
 			bitp = subtractb(bitp, 1)
 		}
-		return
-	}
-
-	// Update bottom nibble for first word of each object.
-	// If the bottom nibble says typeScalarCheckmarked and the top is not typeDead,
-	// change to typeScalar. Otherwise leave, since typeScalarCheckmarked == typeDead.
-	// If the bottom nibble says typePointerCheckmarked, change to typePointer.
-	bitp := h.bitp
-	step := size / heapBitmapScale
-	for i := uintptr(0); i < n; i++ {
-		x := int(*bitp)
-		switch typ := (x >> typeShift) & typeMask; {
-		case typ == typeScalarCheckmarked && (x>>(4+typeShift))&typeMask != typeDead:
-			x += (typeScalar - typeScalarCheckmarked) << typeShift
-		case typ == typePointerCheckmarked:
-			x += (typePointer - typePointerCheckmarked) << typeShift
-		}
-
-		*bitp = uint8(x)
-		bitp = subtractb(bitp, step)
 	}
 }
 
@@ -393,44 +374,98 @@ func (h heapBits) clearCheckmarkSpan(size, n, total uintptr) {
 // bits for the first two words (or one for single-word objects) to typeDead
 // and then calls f(p), where p is the object's base address.
 // f is expected to add the object to a free list.
+// For non-free objects, heapBitsSweepSpan turns off the marked bit.
 func heapBitsSweepSpan(base, size, n uintptr, f func(uintptr)) {
 	h := heapBitsForSpan(base)
-	if size == ptrSize {
-		// Only possible on 64-bit system, since minimum size is 8.
-		// Must read and update both top and bottom nibble of each byte.
+	switch {
+	default:
+		throw("heapBitsSweepSpan")
+	case size == ptrSize:
+		// Consider mark bits in all four 2-bit entries of each bitmap byte.
 		bitp := h.bitp
-		for i := uintptr(0); i < n; i += 2 {
-			x := int(*bitp)
+		for i := uintptr(0); i < n; i += 4 {
+			x := uint32(*bitp)
 			if x&bitMarked != 0 {
 				x &^= bitMarked
 			} else {
-				x &^= typeMask << typeShift
+				x &^= bitPointer
 				f(base + i*ptrSize)
 			}
+			if x&(bitMarked<<2) != 0 {
+				x &^= bitMarked << 2
+			} else {
+				x &^= bitPointer << 2
+				f(base + (i+1)*ptrSize)
+			}
 			if x&(bitMarked<<4) != 0 {
 				x &^= bitMarked << 4
 			} else {
-				x &^= typeMask << (4 + typeShift)
-				f(base + (i+1)*ptrSize)
+				x &^= bitPointer << 4
+				f(base + (i+2)*ptrSize)
+			}
+			if x&(bitMarked<<6) != 0 {
+				x &^= bitMarked << 6
+			} else {
+				x &^= bitPointer << 6
+				f(base + (i+3)*ptrSize)
 			}
 			*bitp = uint8(x)
 			bitp = subtractb(bitp, 1)
 		}
-		return
-	}
 
-	bitp := h.bitp
-	step := size / heapBitmapScale
-	for i := uintptr(0); i < n; i++ {
-		x := int(*bitp)
-		if x&bitMarked != 0 {
-			x &^= bitMarked
-		} else {
-			x = 0
-			f(base + i*size)
+	case size%(4*ptrSize) == 0:
+		// Mark bit is in first word of each object.
+		// Each object starts at bit 0 of a heap bitmap byte.
+		bitp := h.bitp
+		step := size / heapBitmapScale
+		for i := uintptr(0); i < n; i++ {
+			x := uint32(*bitp)
+			if x&bitMarked != 0 {
+				x &^= bitMarked
+			} else {
+				x = 0
+				f(base + i*size)
+			}
+			*bitp = uint8(x)
+			bitp = subtractb(bitp, step)
+		}
+
+	case size%(4*ptrSize) == 2*ptrSize:
+		// Mark bit is in first word of each object,
+		// but every other object starts halfway through a heap bitmap byte.
+		// Unroll loop 2x to handle alternating shift count and step size.
+		bitp := h.bitp
+		step := size / heapBitmapScale
+		var i uintptr
+		for i = uintptr(0); i < n; i += 2 {
+			x := uint32(*bitp)
+			if x&bitMarked != 0 {
+				x &^= bitMarked
+			} else {
+				x &^= 0x0f
+				f(base + i*size)
+				if size > 2*ptrSize {
+					x = 0
+				}
+			}
+			*bitp = uint8(x)
+			if i+1 >= n {
+				break
+			}
+			bitp = subtractb(bitp, step)
+			x = uint32(*bitp)
+			if x&(bitMarked<<4) != 0 {
+				x &^= bitMarked << 4
+			} else {
+				x &^= 0xf0
+				f(base + (i+1)*size)
+				if size > 2*ptrSize {
+					*subtractb(bitp, 1) = 0
+				}
+			}
+			*bitp = uint8(x)
+			bitp = subtractb(bitp, step+1)
 		}
-		*bitp = uint8(x)
-		bitp = subtractb(bitp, step)
 	}
 }
 
@@ -456,7 +491,7 @@ func heapBitsSetType(x, size, dataSize uintptr, typ *_type) {
 		// when initializing the span, and then the atomicor8 here
 		// goes away - heapBitsSetType would be a no-op
 		// in that case.
-		atomicor8(h.bitp, typePointer<<(typeShift+h.shift))
+		atomicor8(h.bitp, bitPointer<<h.shift)
 		return
 	}
 	if typ.kind&kindGCProg != 0 {
@@ -489,41 +524,28 @@ func heapBitsSetType(x, size, dataSize uintptr, typ *_type) {
 		ptrmask = (*uint8)(unsafe.Pointer(typ.gc[0])) // pointer to unrolled mask
 	}
 
-	// Copy from 1-bit ptrmask into 4-bit bitmap.
-	elemSize := typ.size
-	var v uint32 // pending byte of 4-bit bitmap; uint32 for better code gen
-	nv := 0      // number of bits added to v
-	for i := uintptr(0); i < dataSize; i += elemSize {
-		// At each word, b holds the pending bits from the 1-bit bitmap,
-		// with a sentinel 1 bit above all the actual bits.
-		// When b == 1, that means it is out of bits and needs to be refreshed.
-		// *(p+1) is the next byte to read.
-		p := ptrmask
-		b := uint32(*p) | 0x100
-		for j := uintptr(0); j < elemSize; j += ptrSize {
-			if b == 1 {
-				p = addb(p, 1)
-				b = uint32(*p) | 0x100
-			}
-			// b&1 is 1 for pointer, 0 for scalar.
-			// We want typePointer (2) or typeScalar (1), so add 1.
-			v |= ((b & 1) + 1) << (uint(nv) + typeShift)
-			b >>= 1
-			if nv += heapBitsWidth; nv == 8 {
-				*h.bitp = uint8(v)
-				h.bitp = subtractb(h.bitp, 1)
-				v = 0
-				nv = 0
-			}
+	// Copy from 1-bit ptrmask into 2-bit bitmap.
+	// If size is a multiple of 4 words, then the bitmap bytes for the object
+	// are not shared with any other object and can be written directly.
+	// On 64-bit systems, many sizes are only 16-byte aligned; half of
+	// those are not multiples of 4 words (for example, 48/8 = 6 words);
+	// those share either the leading byte or the trailing byte of their bitmaps
+	// with another object.
+	nptr := typ.size / ptrSize
+	_ = nptr
+	for i := uintptr(0); i < dataSize/ptrSize; i++ {
+		atomicand8(h.bitp, ^((bitPointer | bitMarked) << h.shift))
+		j := i % nptr
+		if (*addb(ptrmask, j/8)>>(j%8))&1 != 0 {
+			atomicor8(h.bitp, bitPointer<<h.shift)
+		}
+		if i >= 2 {
+			atomicor8(h.bitp, bitMarked<<h.shift)
 		}
+		h = h.next()
 	}
-
-	// Finish final byte of bitmap and mark next word (if any) with typeDead (0)
-	if nv != 0 {
-		*h.bitp = uint8(v)
-		h.bitp = subtractb(h.bitp, 1)
-	} else if dataSize < size {
-		*h.bitp = 0
+	if dataSize < size {
+		atomicand8(h.bitp, ^((bitPointer | bitMarked) << h.shift))
 	}
 }
 
@@ -600,7 +622,7 @@ const (
 // ppos is a pointer to position in mask, in bits.
 // sparse says to generate 4-bits per word mask for heap (1-bit for data/bss otherwise).
 //go:nowritebarrier
-func unrollgcprog1(maskp *byte, prog *byte, ppos *uintptr, inplace, sparse bool) *byte {
+func unrollgcprog1(maskp *byte, prog *byte, ppos *uintptr, inplace bool) *byte {
 	pos := *ppos
 	mask := (*[1 << 30]byte)(unsafe.Pointer(maskp))
 	for {
@@ -616,6 +638,8 @@ func unrollgcprog1(maskp *byte, prog *byte, ppos *uintptr, inplace, sparse bool)
 			for i := 0; i < siz; i++ {
 				v := p[i/8] >> (uint(i) % 8) & 1
 				if inplace {
+					throw("gc inplace")
+					const typeShift = 2
 					// Store directly into GC bitmap.
 					h := heapBitsForAddr(uintptr(unsafe.Pointer(&mask[pos])))
 					if h.shift == 0 {
@@ -624,12 +648,6 @@ func unrollgcprog1(maskp *byte, prog *byte, ppos *uintptr, inplace, sparse bool)
 						*h.bitp |= v << (4 + typeShift)
 					}
 					pos += ptrSize
-				} else if sparse {
-					throw("sparse")
-					// 4-bits per word, type bits in high bits
-					v <<= (pos % 8) + typeShift
-					mask[pos/8] |= v
-					pos += heapBitsWidth
 				} else {
 					// 1 bit per word, for data/bss bitmap
 					mask[pos/8] |= v << (pos % 8)
@@ -647,7 +665,7 @@ func unrollgcprog1(maskp *byte, prog *byte, ppos *uintptr, inplace, sparse bool)
 			prog = (*byte)(add(unsafe.Pointer(prog), ptrSize))
 			var prog1 *byte
 			for i := uintptr(0); i < siz; i++ {
-				prog1 = unrollgcprog1(&mask[0], prog, &pos, inplace, sparse)
+				prog1 = unrollgcprog1(&mask[0], prog, &pos, inplace)
 			}
 			if *prog1 != insArrayEnd {
 				throw("unrollgcprog: array does not end with insArrayEnd")
@@ -667,7 +685,7 @@ func unrollglobgcprog(prog *byte, size uintptr) bitvector {
 	mask := (*[1 << 30]byte)(persistentalloc(masksize+1, 0, &memstats.gc_sys))
 	mask[masksize] = 0xa1
 	pos := uintptr(0)
-	prog = unrollgcprog1(&mask[0], prog, &pos, false, false)
+	prog = unrollgcprog1(&mask[0], prog, &pos, false)
 	if pos != size/ptrSize {
 		print("unrollglobgcprog: bad program size, got ", pos, ", expect ", size/ptrSize, "\n")
 		throw("unrollglobgcprog: bad program size")
@@ -682,17 +700,21 @@ func unrollglobgcprog(prog *byte, size uintptr) bitvector {
 }
 
 func unrollgcproginplace_m(v unsafe.Pointer, typ *_type, size, size0 uintptr) {
+	throw("unrollinplace")
+	// TODO(rsc): Update for 1-bit bitmaps.
 	// TODO(rsc): Explain why these non-atomic updates are okay.
 	pos := uintptr(0)
 	prog := (*byte)(unsafe.Pointer(uintptr(typ.gc[1])))
 	for pos != size0 {
-		unrollgcprog1((*byte)(v), prog, &pos, true, true)
+		unrollgcprog1((*byte)(v), prog, &pos, true)
 	}
 
 	// Mark first word as bitAllocated.
 	// Mark word after last as typeDead.
 	if size0 < size {
 		h := heapBitsForAddr(uintptr(v) + size0)
+		const typeMask = 0
+		const typeShift = 0
 		*h.bitp &^= typeMask << typeShift
 	}
 }
@@ -707,7 +729,7 @@ func unrollgcprog_m(typ *_type) {
 	if *mask == 0 {
 		pos := uintptr(8) // skip the unroll flag
 		prog := (*byte)(unsafe.Pointer(uintptr(typ.gc[1])))
-		prog = unrollgcprog1(mask, prog, &pos, false, false)
+		prog = unrollgcprog1(mask, prog, &pos, false)
 		if *prog != insEnd {
 			throw("unrollgcprog: program does not end with insEnd")
 		}
@@ -737,26 +759,24 @@ func getgcmask(ep interface{}) (mask []byte) {
 	for datap := &firstmoduledata; datap != nil; datap = datap.next {
 		// data
 		if datap.data <= uintptr(p) && uintptr(p) < datap.edata {
+			bitmap := datap.gcdatamask.bytedata
 			n := (*ptrtype)(unsafe.Pointer(t)).elem.size
 			mask = make([]byte, n/ptrSize)
 			for i := uintptr(0); i < n; i += ptrSize {
 				off := (uintptr(p) + i - datap.data) / ptrSize
-				bits := (*addb(datap.gcdatamask.bytedata, off/8) >> (off % 8)) & 1
-				bits += 1 // convert 1-bit to 2-bit
-				mask[i/ptrSize] = bits
+				mask[i/ptrSize] = (*addb(bitmap, off/8) >> (off % 8)) & 1
 			}
 			return
 		}
 
 		// bss
 		if datap.bss <= uintptr(p) && uintptr(p) < datap.ebss {
+			bitmap := datap.gcbssmask.bytedata
 			n := (*ptrtype)(unsafe.Pointer(t)).elem.size
 			mask = make([]byte, n/ptrSize)
 			for i := uintptr(0); i < n; i += ptrSize {
 				off := (uintptr(p) + i - datap.bss) / ptrSize
-				bits := (*addb(datap.gcbssmask.bytedata, off/8) >> (off % 8)) & 1
-				bits += 1 // convert 1-bit to 2-bit
-				mask[i/ptrSize] = bits
+				mask[i/ptrSize] = (*addb(bitmap, off/8) >> (off % 8)) & 1
 			}
 			return
 		}
@@ -768,8 +788,14 @@ func getgcmask(ep interface{}) (mask []byte) {
 	if mlookup(uintptr(p), &base, &n, nil) != 0 {
 		mask = make([]byte, n/ptrSize)
 		for i := uintptr(0); i < n; i += ptrSize {
-			bits := heapBitsForAddr(base + i).typeBits()
-			mask[i/ptrSize] = bits
+			hbits := heapBitsForAddr(base + i)
+			if hbits.isPointer() {
+				mask[i/ptrSize] = 1
+			}
+			if i >= 2*ptrSize && !hbits.isMarked() {
+				mask[i/ptrSize] = 255
+				break
+			}
 		}
 		return
 	}
@@ -801,10 +827,9 @@ func getgcmask(ep interface{}) (mask []byte) {
 		n := (*ptrtype)(unsafe.Pointer(t)).elem.size
 		mask = make([]byte, n/ptrSize)
 		for i := uintptr(0); i < n; i += ptrSize {
+			bitmap := bv.bytedata
 			off := (uintptr(p) + i - frame.varp + size) / ptrSize
-			bits := (*addb(bv.bytedata, off/8) >> (off % 8)) & 1
-			bits += 1 // convert 1-bit to 2-bit
-			mask[i/ptrSize] = bits
+			mask[i/ptrSize] = (*addb(bitmap, off/8) >> (off % 8)) & 1
 		}
 	}
 	return
diff --git a/src/runtime/mgcmark.go b/src/runtime/mgcmark.go
index 9d78ddecae..917299c9df 100644
--- a/src/runtime/mgcmark.go
+++ b/src/runtime/mgcmark.go
@@ -597,20 +597,19 @@ func scanobject(b uintptr, gcw *gcWork) {
 			// Avoid needless hbits.next() on last iteration.
 			hbits = hbits.next()
 		}
-		bits := uintptr(hbits.typeBits())
-		if bits == typeDead {
-			break // no more pointers in this object
-		}
-
-		if bits <= typeScalar { // typeScalar, typeDead, typeScalarMarked
-			continue
-		}
-
-		if bits&typePointer != typePointer {
-			print("gc useCheckmark=", useCheckmark, " b=", hex(b), "\n")
-			throw("unexpected garbage collection bits")
+		// During checkmarking, 1-word objects store the checkmark
+		// in the type bit for the one word. The only one-word objects
+		// are pointers, or else they'd be merged with other non-pointer
+		// data into larger allocations.
+		if n != 1 {
+			b := hbits.bits()
+			if i >= 2*ptrSize && b&bitMarked == 0 {
+				break // no more pointers in this object
+			}
+			if b&bitPointer == 0 {
+				continue // not a pointer
+			}
 		}
-
 		// Work here is duplicated in scanblock.
 		// If you make changes here, make changes there too.
 
@@ -673,11 +672,11 @@ func greyobject(obj, base, off uintptr, hbits heapBits, span *mspan, gcw *gcWork
 
 			throw("checkmark found unmarked object")
 		}
-		if hbits.isCheckmarked() {
+		if hbits.isCheckmarked(span.elemsize) {
 			return
 		}
-		hbits.setCheckmarked()
-		if !hbits.isCheckmarked() {
+		hbits.setCheckmarked(span.elemsize)
+		if !hbits.isCheckmarked(span.elemsize) {
 			throw("setCheckmarked and isCheckmarked disagree")
 		}
 	} else {
@@ -685,12 +684,11 @@ func greyobject(obj, base, off uintptr, hbits heapBits, span *mspan, gcw *gcWork
 		if hbits.isMarked() {
 			return
 		}
-
 		hbits.setMarked()
 
 		// If this is a noscan object, fast-track it to black
 		// instead of greying it.
-		if hbits.typeBits() == typeDead {
+		if !hbits.hasPointers(span.elemsize) {
 			gcw.bytesMarked += uint64(span.elemsize)
 			return
 		}
diff --git a/src/runtime/stack1.go b/src/runtime/stack1.go
index f74694b7e9..d6ddf86dba 100644
--- a/src/runtime/stack1.go
+++ b/src/runtime/stack1.go
@@ -352,6 +352,12 @@ func adjustpointer(adjinfo *adjustinfo, vpp unsafe.Pointer) {
 	}
 }
 
+// Information from the compiler about the layout of stack frames.
+type bitvector struct {
+	n        int32 // # of bits
+	bytedata *uint8
+}
+
 type gobitvector struct {
 	n        uintptr
 	bytedata []uint8
-- 
2.50.0