From 91f863013e6b5ba870f6bfbfda0b735cf54fb3ca Mon Sep 17 00:00:00 2001
From: Michael Anthony Knyszek
Date: Sun, 10 Apr 2022 20:34:17 +0000
Subject: [PATCH] runtime: redesign scavenging algorithm

Currently the runtime's scavenging algorithm involves running from the top of the heap address space to the bottom (or as far as it gets) once per GC cycle. Once it treads some ground, it doesn't tread it again until the next GC cycle.

This works just fine for the background scavenger, for heap-growth scavenging, and for debug.FreeOSMemory. However, it breaks down in the face of a memory limit for small heaps in the tens of MiB. Basically, because the scavenger never retreads old ground, it's completely oblivious to new memory it could scavenge, and that it really *should* scavenge in the face of a memory limit. Also, every time some thread goes to scavenge in the runtime, it reserves what could be a considerable amount of address space, hiding it from other scavengers.

This change modifies and simplifies the implementation overall. It's less code with complexities that are much better encapsulated. The current implementation iterates optimistically over the address space looking for memory to scavenge, keeping track of what it last saw. The new implementation does the same, but instead of directly iterating over pages, it iterates over chunks. It maintains an index of chunks (as a bitmap over the address space) that indicates which chunks may contain scavenge work. The page allocator populates this index, while scavengers consume it and iterate over it optimistically.

This has two key benefits:
1. Scavenging is much simpler: find a candidate chunk, and check it, essentially just using the scavengeOne fast path. There's no need for the complexity of iterating beyond one chunk, because the index is lock-free and already maintains that information.
2. If pages are freed to the page allocator (always guaranteed to be unscavenged), the page allocator immediately notifies all scavengers of the new source of work, avoiding the hiding issues of the old implementation.

One downside of the new implementation, however, is that it's potentially more expensive to find pages to scavenge. In the past, if a single page became free high up in the address space, the runtime's scavengers would ignore it. Now that scavengers no longer ignore it, one or more of them may need to iterate across potentially the whole heap to find the next source of work. For the background scavenger, this just means a potentially less reactive scavenger -- overall it should still use the same amount of CPU. It means worse overheads for memory limit scavenging, but that's not exactly something with a baseline yet.

In practice, this shouldn't be too bad, since the chunk index is extremely compact. For a 48-bit address space, the index is only 8 MiB in size at worst, but even just one physical page in the index is able to support up to 128 GiB heaps, provided they aren't terribly sparse. On 32-bit platforms, the index is only 128 bytes in size.

For #48409.
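To make the size claims above concrete, here is a small standalone sketch (not part of the patch) that redoes the arithmetic with one bit per chunk. The constant names and the 4 KiB physical page size are assumptions for illustration; the 8 KiB runtime page and 512 pages per chunk are the values the quoted numbers imply.

```go
package main

import "fmt"

func main() {
	// Assumed constants mirroring the palloc layout (not imported from the runtime):
	// a chunk covers 512 runtime pages of 8 KiB, i.e. 4 MiB of address space.
	const (
		pageSize         = 8 << 10                       // runtime page size
		pallocChunkPages = 512                           // pages per chunk
		pallocChunkBytes = pallocChunkPages * pageSize   // 4 MiB per chunk
		physPageSize     = 4 << 10                       // assumed physical page size
	)

	// One bit per chunk over a 48-bit address space.
	chunks48 := uint64(1) << 48 / pallocChunkBytes
	fmt.Println("48-bit index size:", chunks48/8>>20, "MiB") // 8 MiB

	// Heap covered by a single physical page of index bits.
	fmt.Println("one index page covers:", uint64(physPageSize)*8*pallocChunkBytes>>30, "GiB") // 128 GiB

	// One bit per chunk over a 32-bit address space.
	chunks32 := uint64(1) << 32 / pallocChunkBytes
	fmt.Println("32-bit index size:", chunks32/8, "bytes") // 128 bytes
}
```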
Change-Id: I72b7e74365046b18c64a6417224c5d85511194fb Reviewed-on: https://go-review.googlesource.com/c/go/+/399474 Reviewed-by: Michael Pratt Run-TryBot: Michael Knyszek TryBot-Result: Gopher Robot --- src/runtime/export_test.go | 47 ++- src/runtime/mgcscavenge.go | 520 +++++++++++++------------------- src/runtime/mgcscavenge_test.go | 146 +++++++++ src/runtime/mgcsweep.go | 14 +- src/runtime/mheap.go | 17 +- src/runtime/mpagealloc.go | 48 +-- src/runtime/mpagealloc_32bit.go | 12 +- src/runtime/mpagealloc_64bit.go | 79 ++++- src/runtime/mranges.go | 64 ++++ 9 files changed, 565 insertions(+), 382 deletions(-) diff --git a/src/runtime/export_test.go b/src/runtime/export_test.go index f1bdf93f46..0d17ddfe30 100644 --- a/src/runtime/export_test.go +++ b/src/runtime/export_test.go @@ -968,7 +968,6 @@ func NewPageAlloc(chunks, scav map[ChunkIdx][]BitRange) *PageAlloc { p.init(new(mutex), testSysStat) lockInit(p.mheapLock, lockRankMheap) p.test = true - for i, init := range chunks { addr := chunkBase(chunkIdx(i)) @@ -1007,6 +1006,18 @@ func NewPageAlloc(chunks, scav map[ChunkIdx][]BitRange) *PageAlloc { } } + // Make sure the scavenge index is updated. + // + // This is an inefficient way to do it, but it's also the simplest way. + minPages := physPageSize / pageSize + if minPages < 1 { + minPages = 1 + } + _, npages := chunk.findScavengeCandidate(pallocChunkPages-1, minPages, minPages) + if npages != 0 { + p.scav.index.mark(addr, addr+pallocChunkBytes) + } + // Update heap metadata for the allocRange calls above. systemstack(func() { lock(p.mheapLock) @@ -1015,12 +1026,6 @@ func NewPageAlloc(chunks, scav map[ChunkIdx][]BitRange) *PageAlloc { }) } - systemstack(func() { - lock(p.mheapLock) - p.scavengeStartGen() - unlock(p.mheapLock) - }) - return (*PageAlloc)(p) } @@ -1035,6 +1040,8 @@ func FreePageAlloc(pp *PageAlloc) { for l := 0; l < summaryLevels; l++ { sysFreeOS(unsafe.Pointer(&p.summary[l][0]), uintptr(cap(p.summary[l]))*pallocSumBytes) } + // Only necessary on 64-bit. This is a global on 32-bit. + sysFreeOS(unsafe.Pointer(&p.scav.index.chunks[0]), uintptr(cap(p.scav.index.chunks))) } else { resSize := uintptr(0) for _, s := range p.summary { @@ -1042,6 +1049,7 @@ func FreePageAlloc(pp *PageAlloc) { } sysFreeOS(unsafe.Pointer(&p.summary[0][0]), alignUp(resSize, physPageSize)) } + // Subtract back out whatever we mapped for the summaries. // sysUsed adds to p.sysStat and memstats.mappedReady no matter what // (and in anger should actually be accounted for), and there's no other @@ -1550,3 +1558,28 @@ func (s *Scavenger) Stop() { s.Wake() <-s.done } + +type ScavengeIndex struct { + i scavengeIndex +} + +func NewScavengeIndex(min, max ChunkIdx) *ScavengeIndex { + s := new(ScavengeIndex) + s.i.chunks = make([]atomic.Uint8, uintptr(1<>10, " KiB work, ", gcController.heapReleased.load()>>10, " KiB total, ", (gcController.heapInUse.load()*100)/heapRetained(), "% util", @@ -691,130 +676,20 @@ func printScavTrace(gen uint32, released uintptr, forced bool) { printunlock() } -// scavengeStartGen starts a new scavenge generation, resetting -// the scavenger's search space to the full in-use address space. -// -// p.mheapLock must be held. -// -// Must run on the system stack because p.mheapLock must be held. 
-// -//go:systemstack -func (p *pageAlloc) scavengeStartGen() { - assertLockHeld(p.mheapLock) - - lock(&p.scav.lock) - if debug.scavtrace > 0 { - printScavTrace(p.scav.gen, atomic.Loaduintptr(&p.scav.released), false) - } - p.inUse.cloneInto(&p.scav.inUse) - - // Pick the new starting address for the scavenger cycle. - var startAddr offAddr - if p.scav.scavLWM.lessThan(p.scav.freeHWM) { - // The "free" high watermark exceeds the "scavenged" low watermark, - // so there are free scavengable pages in parts of the address space - // that the scavenger already searched, the high watermark being the - // highest one. Pick that as our new starting point to ensure we - // see those pages. - startAddr = p.scav.freeHWM - } else { - // The "free" high watermark does not exceed the "scavenged" low - // watermark. This means the allocator didn't free any memory in - // the range we scavenged last cycle, so we might as well continue - // scavenging from where we were. - startAddr = p.scav.scavLWM - } - p.scav.inUse.removeGreaterEqual(startAddr.addr()) - - // reservationBytes may be zero if p.inUse.totalBytes is small, or if - // scavengeReservationShards is large. This case is fine as the scavenger - // will simply be turned off, but it does mean that scavengeReservationShards, - // in concert with pallocChunkBytes, dictates the minimum heap size at which - // the scavenger triggers. In practice this minimum is generally less than an - // arena in size, so virtually every heap has the scavenger on. - p.scav.reservationBytes = alignUp(p.inUse.totalBytes, pallocChunkBytes) / scavengeReservationShards - p.scav.gen++ - atomic.Storeuintptr(&p.scav.released, 0) - p.scav.freeHWM = minOffAddr - p.scav.scavLWM = maxOffAddr - unlock(&p.scav.lock) -} - -// scavengeReserve reserves a contiguous range of the address space -// for scavenging. The maximum amount of space it reserves is proportional -// to the size of the heap. The ranges are reserved from the high addresses -// first. -// -// Returns the reserved range and the scavenge generation number for it. -func (p *pageAlloc) scavengeReserve() (addrRange, uint32) { - lock(&p.scav.lock) - gen := p.scav.gen - - // Start by reserving the minimum. - r := p.scav.inUse.removeLast(p.scav.reservationBytes) - - // Return early if the size is zero; we don't want to use - // the bogus address below. - if r.size() == 0 { - unlock(&p.scav.lock) - return r, gen - } - - // The scavenger requires that base be aligned to a - // palloc chunk because that's the unit of operation for - // the scavenger, so align down, potentially extending - // the range. - newBase := alignDown(r.base.addr(), pallocChunkBytes) - - // Remove from inUse however much extra we just pulled out. - p.scav.inUse.removeGreaterEqual(newBase) - unlock(&p.scav.lock) - - r.base = offAddr{newBase} - return r, gen -} - -// scavengeUnreserve returns an unscavenged portion of a range that was -// previously reserved with scavengeReserve. -func (p *pageAlloc) scavengeUnreserve(r addrRange, gen uint32) { - if r.size() == 0 { - return - } - if r.base.addr()%pallocChunkBytes != 0 { - throw("unreserving unaligned region") - } - lock(&p.scav.lock) - if gen == p.scav.gen { - p.scav.inUse.add(r) - } - unlock(&p.scav.lock) -} - -// scavengeOne walks over address range work until it finds +// scavengeOne walks over the chunk at chunk index ci and searches for // a contiguous run of pages to scavenge. It will try to scavenge // at most max bytes at once, but may scavenge more to avoid // breaking huge pages. 
Once it scavenges some memory it returns // how much it scavenged in bytes. // -// Returns the number of bytes scavenged and the part of work -// which was not yet searched. +// searchIdx is the page index to start searching from in ci. // -// work's base address must be aligned to pallocChunkBytes. +// Returns the number of bytes scavenged. // // Must run on the systemstack because it acquires p.mheapLock. // //go:systemstack -func (p *pageAlloc) scavengeOne(work addrRange, max uintptr) (uintptr, addrRange) { - // Defensively check if we've received an empty address range. - // If so, just return. - if work.size() == 0 { - // Nothing to do. - return 0, work - } - // Check the prerequisites of work. - if work.base.addr()%pallocChunkBytes != 0 { - throw("scavengeOne called with unaligned work region") - } +func (p *pageAlloc) scavengeOne(ci chunkIdx, searchIdx uint, max uintptr) uintptr { // Calculate the maximum number of pages to scavenge. // // This should be alignUp(max, pageSize) / pageSize but max can and will @@ -836,168 +711,61 @@ func (p *pageAlloc) scavengeOne(work addrRange, max uintptr) (uintptr, addrRange minPages = 1 } - // Fast path: check the chunk containing the top-most address in work. - if r, w := p.scavengeOneFast(work, minPages, maxPages); r != 0 { - return r, w - } else { - work = w - } - - // findCandidate finds the next scavenge candidate in work optimistically. - // - // Returns the candidate chunk index and true on success, and false on failure. - // - // The heap need not be locked. - findCandidate := func(work addrRange) (chunkIdx, bool) { - // Iterate over this work's chunks. - for i := chunkIndex(work.limit.addr() - 1); i >= chunkIndex(work.base.addr()); i-- { - // If this chunk is totally in-use or has no unscavenged pages, don't bother - // doing a more sophisticated check. - // - // Note we're accessing the summary and the chunks without a lock, but - // that's fine. We're being optimistic anyway. - - // Check quickly if there are enough free pages at all. - if p.summary[len(p.summary)-1][i].max() < uint(minPages) { - continue - } - - // Run over the chunk looking harder for a candidate. Again, we could - // race with a lot of different pieces of code, but we're just being - // optimistic. Make sure we load the l2 pointer atomically though, to - // avoid races with heap growth. It may or may not be possible to also - // see a nil pointer in this case if we do race with heap growth, but - // just defensively ignore the nils. This operation is optimistic anyway. - l2 := (*[1 << pallocChunksL2Bits]pallocData)(atomic.Loadp(unsafe.Pointer(&p.chunks[i.l1()]))) - if l2 != nil && l2[i.l2()].hasScavengeCandidate(minPages) { - return i, true - } - } - return 0, false - } - - // Slow path: iterate optimistically over the in-use address space - // looking for any free and unscavenged page. If we think we see something, - // lock and verify it! - for work.size() != 0 { - - // Search for the candidate. - candidateChunkIdx, ok := findCandidate(work) - if !ok { - // We didn't find a candidate, so we're done. - work.limit = work.base - break - } - - // Lock, so we can verify what we found. - lock(p.mheapLock) - - // Find, verify, and scavenge if we can. 
- chunk := p.chunkOf(candidateChunkIdx) - base, npages := chunk.findScavengeCandidate(pallocChunkPages-1, minPages, maxPages) - if npages > 0 { - work.limit = offAddr{p.scavengeRangeLocked(candidateChunkIdx, base, npages)} - unlock(p.mheapLock) - return uintptr(npages) * pageSize, work - } - unlock(p.mheapLock) - - // We were fooled, so let's continue from where we left off. - work.limit = offAddr{chunkBase(candidateChunkIdx)} - } - return 0, work -} - -// scavengeOneFast is the fast path for scavengeOne, which just checks the top -// chunk of work for some pages to scavenge. -// -// Must run on the system stack because it acquires the heap lock. -// -//go:systemstack -func (p *pageAlloc) scavengeOneFast(work addrRange, minPages, maxPages uintptr) (uintptr, addrRange) { - maxAddr := work.limit.addr() - 1 - maxChunk := chunkIndex(maxAddr) - lock(p.mheapLock) - if p.summary[len(p.summary)-1][maxChunk].max() >= uint(minPages) { + if p.summary[len(p.summary)-1][ci].max() >= uint(minPages) { // We only bother looking for a candidate if there at least // minPages free pages at all. - base, npages := p.chunkOf(maxChunk).findScavengeCandidate(chunkPageIndex(maxAddr), minPages, maxPages) + base, npages := p.chunkOf(ci).findScavengeCandidate(pallocChunkPages-1, minPages, maxPages) // If we found something, scavenge it and return! if npages != 0 { - work.limit = offAddr{p.scavengeRangeLocked(maxChunk, base, npages)} + // Compute the full address for the start of the range. + addr := chunkBase(ci) + uintptr(base)*pageSize + + // Mark the range we're about to scavenge as allocated, because + // we don't want any allocating goroutines to grab it while + // the scavenging is in progress. + if scav := p.allocRange(addr, uintptr(npages)); scav != 0 { + throw("double scavenge") + } + + // With that done, it's safe to unlock. unlock(p.mheapLock) - return uintptr(npages) * pageSize, work - } - } - unlock(p.mheapLock) - // Update the limit to reflect the fact that we checked maxChunk already. - work.limit = offAddr{chunkBase(maxChunk)} - return 0, work -} + if !p.test { + // Only perform the actual scavenging if we're not in a test. + // It's dangerous to do so otherwise. + sysUnused(unsafe.Pointer(addr), uintptr(npages)*pageSize) + + // Update global accounting only when not in test, otherwise + // the runtime's accounting will be wrong. + nbytes := int64(npages) * pageSize + gcController.heapReleased.add(nbytes) + gcController.heapFree.add(-nbytes) + + stats := memstats.heapStats.acquire() + atomic.Xaddint64(&stats.committed, -nbytes) + atomic.Xaddint64(&stats.released, nbytes) + memstats.heapStats.release() + } -// scavengeRangeLocked scavenges the given region of memory. -// The region of memory is described by its chunk index (ci), -// the starting page index of the region relative to that -// chunk (base), and the length of the region in pages (npages). -// -// Returns the base address of the scavenged region. -// -// p.mheapLock must be held. Unlocks p.mheapLock but reacquires -// it before returning. Must be run on the systemstack as a result. -// -//go:systemstack -func (p *pageAlloc) scavengeRangeLocked(ci chunkIdx, base, npages uint) uintptr { - assertLockHeld(p.mheapLock) + // Relock the heap, because now we need to make these pages + // available allocation. Free them back to the page allocator. + lock(p.mheapLock) + p.free(addr, uintptr(npages), true) - // Compute the full address for the start of the range. - addr := chunkBase(ci) + uintptr(base)*pageSize + // Mark the range as scavenged. 
+ p.chunkOf(ci).scavenged.setRange(base, npages) + unlock(p.mheapLock) - // Mark the range we're about to scavenge as allocated, because - // we don't want any allocating goroutines to grab it while - // the scavenging is in progress. - if scav := p.allocRange(addr, uintptr(npages)); scav != 0 { - throw("double scavenge") + return uintptr(npages) * pageSize + } } - - // With that done, it's safe to unlock. + // Mark this chunk as having no free pages. + p.scav.index.clear(ci) unlock(p.mheapLock) - // Update the scavenge low watermark. - lock(&p.scav.lock) - if oAddr := (offAddr{addr}); oAddr.lessThan(p.scav.scavLWM) { - p.scav.scavLWM = oAddr - } - unlock(&p.scav.lock) - - if !p.test { - // Only perform the actual scavenging if we're not in a test. - // It's dangerous to do so otherwise. - sysUnused(unsafe.Pointer(addr), uintptr(npages)*pageSize) - - // Update global accounting only when not in test, otherwise - // the runtime's accounting will be wrong. - nbytes := int64(npages) * pageSize - gcController.heapReleased.add(nbytes) - gcController.heapFree.add(-nbytes) - - // Update consistent accounting too. - stats := memstats.heapStats.acquire() - atomic.Xaddint64(&stats.committed, -nbytes) - atomic.Xaddint64(&stats.released, nbytes) - memstats.heapStats.release() - } - - // Relock the heap, because now we need to make these pages - // available allocation. Free them back to the page allocator. - lock(p.mheapLock) - p.free(addr, uintptr(npages), true) - - // Mark the range as scavenged. - p.chunkOf(ci).scavenged.setRange(base, npages) - return addr + return 0 } // fillAligned returns x but with all zeroes in m-aligned @@ -1059,38 +827,6 @@ func fillAligned(x uint64, m uint) uint64 { return ^((x - (x >> (m - 1))) | x) } -// hasScavengeCandidate returns true if there's any min-page-aligned groups of -// min pages of free-and-unscavenged memory in the region represented by this -// pallocData. -// -// min must be a non-zero power of 2 <= maxPagesPerPhysPage. -func (m *pallocData) hasScavengeCandidate(min uintptr) bool { - if min&(min-1) != 0 || min == 0 { - print("runtime: min = ", min, "\n") - throw("min must be a non-zero power of 2") - } else if min > maxPagesPerPhysPage { - print("runtime: min = ", min, "\n") - throw("min too large") - } - - // The goal of this search is to see if the chunk contains any free and unscavenged memory. - for i := len(m.scavenged) - 1; i >= 0; i-- { - // 1s are scavenged OR non-free => 0s are unscavenged AND free - // - // TODO(mknyszek): Consider splitting up fillAligned into two - // functions, since here we technically could get by with just - // the first half of its computation. It'll save a few instructions - // but adds some additional code complexity. - x := fillAligned(m.scavenged[i]|m.pallocBits[i], uint(min)) - - // Quickly skip over chunks of non-free or scavenged pages. - if x != ^uint64(0) { - return true - } - } - return false -} - // findScavengeCandidate returns a start index and a size for this pallocData // segment which represents a contiguous region of free and unscavenged memory. // @@ -1210,3 +946,157 @@ func (m *pallocData) findScavengeCandidate(searchIdx uint, min, max uintptr) (ui } return start, size } + +// scavengeIndex is a structure for efficiently managing which pageAlloc chunks have +// memory available to scavenge. +type scavengeIndex struct { + // chunks is a bitmap representing the entire address space. Each bit represents + // a single chunk, and a 1 value indicates the presence of pages available for + // scavenging. 
Updates to the bitmap are serialized by the pageAlloc lock. + // + // The underlying storage of chunks is platform dependent and may not even be + // totally mapped read/write. min and max reflect the extent that is safe to access. + // min is inclusive, max is exclusive. + // + // searchAddr is the maximum address (in the offset address space, so we have a linear + // view of the address space; see mranges.go:offAddr) containing memory available to + // scavenge. It is a hint to the find operation to avoid O(n^2) behavior in repeated lookups. + // + // searchAddr is always inclusive and should be the base address of the highest runtime + // page available for scavenging. + // + // searchAddr is managed by both find and mark. + // + // Normally, find monotonically decreases searchAddr as it finds no more free pages to + // scavenge. However, mark, when marking a new chunk at an index greater than the current + // searchAddr, sets searchAddr to the *negative* index into chunks of that page. The trick here + // is that concurrent calls to find will fail to monotonically decrease searchAddr, and so they + // won't barge over new memory becoming available to scavenge. Furthermore, this ensures + // that some future caller of find *must* observe the new high index. That caller + // (or any other racing with it), then makes searchAddr positive before continuing, bringing + // us back to our monotonically decreasing steady-state. + // + // A pageAlloc lock serializes updates between min, max, and searchAddr, so abs(searchAddr) + // is always guaranteed to be >= min and < max (converted to heap addresses). + // + // TODO(mknyszek): Ideally we would use something bigger than a uint8 for faster + // iteration like uint32, but we lack the bit twiddling intrinsics. We'd need to either + // copy them from math/bits or fix the fact that we can't import math/bits' code from + // the runtime due to compiler instrumentation. + searchAddr atomicOffAddr + chunks []atomic.Uint8 + minHeapIdx atomic.Int32 + min, max atomic.Int32 +} + +// find returns the highest chunk index that may contain pages available to scavenge. +// It also returns an offset to start searching in the highest chunk. +func (s *scavengeIndex) find() (chunkIdx, uint) { + searchAddr, marked := s.searchAddr.Load() + if searchAddr == minOffAddr.addr() { + // We got a cleared search addr. + return 0, 0 + } + + // Starting from searchAddr's chunk, and moving down to minHeapIdx, + // iterate until we find a chunk with pages to scavenge. + min := s.minHeapIdx.Load() + searchChunk := chunkIndex(uintptr(searchAddr)) + start := int32(searchChunk / 8) + for i := start; i >= min; i-- { + // Skip over irrelevant address space. + chunks := s.chunks[i].Load() + if chunks == 0 { + continue + } + // Note that we can't have 8 leading zeroes here because + // we necessarily skipped that case. So, what's left is + // an index. If there are no zeroes, we want the 7th + // index, if 1 zero, the 6th, and so on. + n := 7 - sys.LeadingZeros8(chunks) + ci := chunkIdx(uint(i)*8 + uint(n)) + if searchChunk == ci { + return ci, chunkPageIndex(uintptr(searchAddr)) + } + // Try to reduce searchAddr to newSearchAddr. + newSearchAddr := chunkBase(ci) + pallocChunkBytes - pageSize + if marked { + // Attempt to be the first one to decrease the searchAddr + // after an increase. If we fail, that means there was another + // increase, or somebody else got to it before us. Either way, + // it doesn't matter. 
We may lose some performance having an + // incorrect search address, but it's far more important that + // we don't miss updates. + s.searchAddr.StoreUnmark(searchAddr, newSearchAddr) + } else { + // Decrease searchAddr. + s.searchAddr.StoreMin(newSearchAddr) + } + return ci, pallocChunkPages - 1 + } + // Clear searchAddr, because we've exhausted the heap. + s.searchAddr.Clear() + return 0, 0 +} + +// mark sets the inclusive range of chunks between indices start and end as +// containing pages available to scavenge. +// +// Must be serialized with other mark, markRange, and clear calls. +func (s *scavengeIndex) mark(base, limit uintptr) { + start, end := chunkIndex(base), chunkIndex(limit-pageSize) + if start == end { + // Within a chunk. + mask := uint8(1 << (start % 8)) + s.chunks[start/8].Or(mask) + } else if start/8 == end/8 { + // Within the same byte in the index. + mask := uint8(uint16(1<<(end-start+1))-1) << (start % 8) + s.chunks[start/8].Or(mask) + } else { + // Crosses multiple bytes in the index. + startAligned := chunkIdx(alignUp(uintptr(start), 8)) + endAligned := chunkIdx(alignDown(uintptr(end), 8)) + + // Do the end of the first byte first. + if width := startAligned - start; width > 0 { + mask := uint8(uint16(1< 0 { + mask := uint8(uint16(1<= BaseChunkIdx+6; i-- { + find(i, PallocChunkPages-1) + } + find(0, 0) + }) + t.Run("ThirtyTwoChunks", func(t *testing.T) { + find, mark := setup(t) + mark(PageBase(BaseChunkIdx, 0), PageBase(BaseChunkIdx+32, 0)) + for i := BaseChunkIdx + 31; i >= BaseChunkIdx; i-- { + find(i, PallocChunkPages-1) + } + find(0, 0) + }) + t.Run("ThirtyTwoChunksOffset", func(t *testing.T) { + find, mark := setup(t) + mark(PageBase(BaseChunkIdx+3, 0), PageBase(BaseChunkIdx+35, 0)) + for i := BaseChunkIdx + 34; i >= BaseChunkIdx+3; i-- { + find(i, PallocChunkPages-1) + } + find(0, 0) + }) + t.Run("Mark", func(t *testing.T) { + find, mark := setup(t) + for i := BaseChunkIdx; i < BaseChunkIdx+32; i++ { + mark(PageBase(i, 0), PageBase(i+1, 0)) + } + for i := BaseChunkIdx + 31; i >= BaseChunkIdx; i-- { + find(i, PallocChunkPages-1) + } + find(0, 0) + }) + t.Run("MarkInterleaved", func(t *testing.T) { + find, mark := setup(t) + for i := BaseChunkIdx; i < BaseChunkIdx+32; i++ { + mark(PageBase(i, 0), PageBase(i+1, 0)) + find(i, PallocChunkPages-1) + } + find(0, 0) + }) + t.Run("MarkIdempotentOneChunk", func(t *testing.T) { + find, mark := setup(t) + mark(PageBase(BaseChunkIdx, 0), PageBase(BaseChunkIdx+1, 0)) + mark(PageBase(BaseChunkIdx, 0), PageBase(BaseChunkIdx+1, 0)) + find(BaseChunkIdx, PallocChunkPages-1) + find(0, 0) + }) + t.Run("MarkIdempotentThirtyTwoChunks", func(t *testing.T) { + find, mark := setup(t) + mark(PageBase(BaseChunkIdx, 0), PageBase(BaseChunkIdx+32, 0)) + mark(PageBase(BaseChunkIdx, 0), PageBase(BaseChunkIdx+32, 0)) + for i := BaseChunkIdx + 31; i >= BaseChunkIdx; i-- { + find(i, PallocChunkPages-1) + } + find(0, 0) + }) + t.Run("MarkIdempotentThirtyTwoChunksOffset", func(t *testing.T) { + find, mark := setup(t) + mark(PageBase(BaseChunkIdx+4, 0), PageBase(BaseChunkIdx+31, 0)) + mark(PageBase(BaseChunkIdx+5, 0), PageBase(BaseChunkIdx+36, 0)) + for i := BaseChunkIdx + 35; i >= BaseChunkIdx+4; i-- { + find(i, PallocChunkPages-1) + } + find(0, 0) + }) +} diff --git a/src/runtime/mgcsweep.go b/src/runtime/mgcsweep.go index 0a53cd451b..698b7bff31 100644 --- a/src/runtime/mgcsweep.go +++ b/src/runtime/mgcsweep.go @@ -398,11 +398,15 @@ func sweepone() uintptr { // between sweep done and sweep termination (e.g. 
not enough // allocations to trigger a GC) which would be nice to fill in // with scavenging work. - systemstack(func() { - lock(&mheap_.lock) - mheap_.pages.scavengeStartGen() - unlock(&mheap_.lock) - }) + if debug.scavtrace > 0 { + systemstack(func() { + lock(&mheap_.lock) + released := atomic.Loaduintptr(&mheap_.pages.scav.released) + printScavTrace(released, false) + atomic.Storeuintptr(&mheap_.pages.scav.released, 0) + unlock(&mheap_.lock) + }) + } scavenger.ready() } diff --git a/src/runtime/mheap.go b/src/runtime/mheap.go index ac4f99b57d..ff681a19cd 100644 --- a/src/runtime/mheap.go +++ b/src/runtime/mheap.go @@ -62,7 +62,10 @@ const ( type mheap struct { // lock must only be acquired on the system stack, otherwise a g // could self-deadlock if its stack grows with the lock held. - lock mutex + lock mutex + + _ uint32 // 8-byte align pages so its alignment is consistent with tests. + pages pageAlloc // page allocation data structure sweepgen uint32 // sweep generation, see comment in mspan; written during STW @@ -1548,22 +1551,12 @@ func (h *mheap) scavengeAll() { gp := getg() gp.m.mallocing++ - lock(&h.lock) - // Start a new scavenge generation so we have a chance to walk - // over the whole heap. - h.pages.scavengeStartGen() - unlock(&h.lock) - released := h.pages.scavenge(^uintptr(0)) - lock(&h.pages.scav.lock) - gen := h.pages.scav.gen - unlock(&h.pages.scav.lock) - gp.m.mallocing-- if debug.scavtrace > 0 { - printScavTrace(gen, released, true) + printScavTrace(released, true) } } diff --git a/src/runtime/mpagealloc.go b/src/runtime/mpagealloc.go index 3881974742..c85da15ff2 100644 --- a/src/runtime/mpagealloc.go +++ b/src/runtime/mpagealloc.go @@ -262,45 +262,20 @@ type pageAlloc struct { // All access is protected by the mheapLock. inUse addrRanges + _ uint32 // Align scav so it's easier to reason about alignment within scav. + // scav stores the scavenger state. scav struct { - lock mutex - - // inUse is a slice of ranges of address space which have not - // yet been looked at by the scavenger. - // - // Protected by lock. - inUse addrRanges - - // gen is the scavenge generation number. - // - // Protected by lock. - gen uint32 - - // reservationBytes is how large of a reservation should be made - // in bytes of address space for each scavenge iteration. - // - // Protected by lock. - reservationBytes uintptr + // index is an efficient index of chunks that have pages available to + // scavenge. + index scavengeIndex // released is the amount of memory released this generation. // // Updated atomically. released uintptr - // scavLWM is the lowest (offset) address that the scavenger reached this - // scavenge generation. - // - // Protected by lock. - scavLWM offAddr - - // freeHWM is the highest (offset) address of a page that was freed to - // the page allocator this scavenge generation. - // - // Protected by mheapLock. - freeHWM offAddr - - _ uint32 // Align assistTime for atomics. + _ uint32 // Align assistTime for atomics on 32-bit platforms. // scavengeAssistTime is the time spent scavenging in the last GC cycle. // @@ -348,12 +323,6 @@ func (p *pageAlloc) init(mheapLock *mutex, sysStat *sysMemStat) { // Set the mheapLock. p.mheapLock = mheapLock - - // Initialize p.scav.inUse. - p.scav.inUse.init(sysStat) - - // Initialize scavenge tracking state. - p.scav.scavLWM = maxSearchAddr } // tryChunkOf returns the bitmap data for the given chunk. 
@@ -903,10 +872,7 @@ func (p *pageAlloc) free(base, npages uintptr, scavenged bool) { } limit := base + npages*pageSize - 1 if !scavenged { - // Update the free high watermark for the scavenger. - if offLimit := (offAddr{limit}); p.scav.freeHWM.lessThan(offLimit) { - p.scav.freeHWM = offLimit - } + p.scav.index.mark(base, limit+1) } if npages == 1 { // Fast path: we're clearing a single bit, and we know exactly diff --git a/src/runtime/mpagealloc_32bit.go b/src/runtime/mpagealloc_32bit.go index e072f70cd7..859c61d8a5 100644 --- a/src/runtime/mpagealloc_32bit.go +++ b/src/runtime/mpagealloc_32bit.go @@ -11,7 +11,10 @@ package runtime -import "unsafe" +import ( + "runtime/internal/atomic" + "unsafe" +) const ( // The number of levels in the radix tree. @@ -53,6 +56,10 @@ var levelLogPages = [summaryLevels]uint{ logPallocChunkPages, } +// scavengeIndexArray is the backing store for p.scav.index.chunks. +// On 32-bit platforms, it's small enough to just be a global. +var scavengeIndexArray [((1 << heapAddrBits) / pallocChunkBytes) / 8]atomic.Uint8 + // See mpagealloc_64bit.go for details. func (p *pageAlloc) sysInit() { // Calculate how much memory all our entries will take up. @@ -87,6 +94,9 @@ func (p *pageAlloc) sysInit() { reservation = add(reservation, uintptr(entries)*pallocSumBytes) } + + // Set up the scavenge index. + p.scav.index.chunks = scavengeIndexArray[:] } // See mpagealloc_64bit.go for details. diff --git a/src/runtime/mpagealloc_64bit.go b/src/runtime/mpagealloc_64bit.go index 0b99209d99..bfc3e0ad90 100644 --- a/src/runtime/mpagealloc_64bit.go +++ b/src/runtime/mpagealloc_64bit.go @@ -6,7 +6,10 @@ package runtime -import "unsafe" +import ( + "runtime/internal/atomic" + "unsafe" +) const ( // The number of levels in the radix tree. @@ -83,6 +86,12 @@ func (p *pageAlloc) sysInit() { sl := notInHeapSlice{(*notInHeap)(r), 0, entries} p.summary[l] = *(*[]pallocSum)(unsafe.Pointer(&sl)) } + + // Set up the scavenge index. + nbytes := uintptr(1< haveMax { + needMin = haveMax + } + have := makeAddrRange( + // Avoid a panic from indexing one past the last element. + uintptr(unsafe.Pointer(&s.chunks[0]))+uintptr(haveMin), + uintptr(unsafe.Pointer(&s.chunks[0]))+uintptr(haveMax), + ) + need := makeAddrRange( + // Avoid a panic from indexing one past the last element. + uintptr(unsafe.Pointer(&s.chunks[0]))+uintptr(needMin), + uintptr(unsafe.Pointer(&s.chunks[0]))+uintptr(needMax), + ) + // Subtract any overlap from rounding. We can't re-map memory because + // it'll be zeroed. + need = need.subtract(have) + + // If we've got something to map, map it, and update the slice bounds. + if need.size() != 0 { + sysMap(unsafe.Pointer(need.base.addr()), need.size(), sysStat) + sysUsed(unsafe.Pointer(need.base.addr()), need.size(), need.size()) + // Update the indices only after the new memory is valid. + if haveMin == 0 || needMin < haveMin { + s.min.Store(needMin) + } + if haveMax == 0 || needMax > haveMax { + s.max.Store(needMax) + } + } + // Update minHeapIdx. Note that even if there's no mapping work to do, + // we may still have a new, lower minimum heap address. 
+ minHeapIdx := s.minHeapIdx.Load() + if baseIdx := int32(chunkIndex(base) / 8); minHeapIdx == 0 || baseIdx < minHeapIdx { + s.minHeapIdx.Store(baseIdx) + } + return need.size() } diff --git a/src/runtime/mranges.go b/src/runtime/mranges.go index e0be1e134e..9cf83cc613 100644 --- a/src/runtime/mranges.go +++ b/src/runtime/mranges.go @@ -11,6 +11,7 @@ package runtime import ( "internal/goarch" + "runtime/internal/atomic" "unsafe" ) @@ -141,6 +142,69 @@ func (l offAddr) addr() uintptr { return l.a } +// atomicOffAddr is like offAddr, but operations on it are atomic. +// It also contains operations to be able to store marked addresses +// to ensure that they're not overridden until they've been seen. +type atomicOffAddr struct { + // a contains the offset address, unlike offAddr. + a atomic.Int64 +} + +// Clear attempts to store minOffAddr in atomicOffAddr. It may fail +// if a marked value is placed in the box in the meanwhile. +func (b *atomicOffAddr) Clear() { + for { + old := b.a.Load() + if old < 0 { + return + } + if b.a.CompareAndSwap(old, int64(minOffAddr.addr()-arenaBaseOffset)) { + return + } + } +} + +// StoreMin stores addr if it's less than the current value in the +// offset address space if the current value is not marked. +func (b *atomicOffAddr) StoreMin(addr uintptr) { + new := int64(addr - arenaBaseOffset) + for { + old := b.a.Load() + if old < new { + return + } + if b.a.CompareAndSwap(old, new) { + return + } + } +} + +// StoreUnmark attempts to unmark the value in atomicOffAddr and +// replace it with newAddr. markedAddr must be a marked address +// returned by Load. This function will not store newAddr if the +// box no longer contains markedAddr. +func (b *atomicOffAddr) StoreUnmark(markedAddr, newAddr uintptr) { + b.a.CompareAndSwap(-int64(markedAddr-arenaBaseOffset), int64(newAddr-arenaBaseOffset)) +} + +// StoreMarked stores addr but first converted to the offset address +// space and then negated. +func (b *atomicOffAddr) StoreMarked(addr uintptr) { + b.a.Store(-int64(addr - arenaBaseOffset)) +} + +// Load returns the address in the box as a virtual address. It also +// returns if the value was marked or not. +func (b *atomicOffAddr) Load() (uintptr, bool) { + v := b.a.Load() + wasMarked := false + if v < 0 { + wasMarked = true + v = -v + } + return uintptr(v) + arenaBaseOffset, wasMarked +} + // addrRanges is a data structure holding a collection of ranges of // address space. // -- 2.48.1
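As a rough illustration of the scavengeIndex mark/find protocol described in the comments above, here is a simplified, single-threaded sketch (not runtime code): it shows how a chunk index maps to a bit in the []uint8 bitmap and how find scans from the top down. It uses math/bits for the leading-zeros count, which the runtime itself cannot import (hence the TODO about bit-twiddling intrinsics); the atomics, the searchAddr hint, and the marked (negated) searchAddr handshake that keeps concurrent marks at higher addresses from being lost are all omitted.

```go
package main

import (
	"fmt"
	"math/bits"
)

// index is a toy, single-threaded version of the chunk bitmap: one bit per
// chunk, with 1 meaning "this chunk may have pages to scavenge".
type index struct {
	chunks []uint8
}

// mark sets the bits for chunk indices in the inclusive range [start, end].
// The real mark works on addresses and uses atomic Or on whole bytes.
func (s *index) mark(start, end int) {
	for ci := start; ci <= end; ci++ {
		s.chunks[ci/8] |= 1 << (ci % 8)
	}
}

// find returns the highest marked chunk index, or -1 if none. The real find
// starts from the searchAddr hint instead of the top of the bitmap, and the
// scavenger clears a chunk's bit once it has no more pages to give back.
func (s *index) find() int {
	for i := len(s.chunks) - 1; i >= 0; i-- {
		b := s.chunks[i]
		if b == 0 {
			continue // quickly skip over irrelevant address space
		}
		// With no leading zeroes the highest set bit is bit 7, with one
		// leading zero it's bit 6, and so on.
		n := 7 - bits.LeadingZeros8(b)
		return i*8 + n
	}
	return -1
}

func main() {
	s := &index{chunks: make([]uint8, 16)} // covers 128 chunks
	s.mark(3, 35)                          // pages freed in chunks 3..35
	fmt.Println(s.find())                  // 35: scavenging starts at the highest candidate
}
```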