From: Michael Anthony Knyszek
Date: Fri, 23 Sep 2022 16:32:34 +0000 (+0000)
Subject: runtime: manage huge pages explicitly
X-Git-Tag: go1.21rc1~846
X-Git-Url: http://www.git.cypherpunks.su/?a=commitdiff_plain;h=8fa9e3beee8b0e6baa7333740996181268b60a3a;p=gostls13.git

runtime: manage huge pages explicitly

This change makes it so that on Linux the Go runtime explicitly marks
page heap memory as either available to be backed by hugepages or not,
using heuristics based on density.

The motivation behind this change is twofold:

1. In default Linux configurations, khugepaged can recoalesce hugepages
   even after the scavenger breaks them up, resulting in significant
   overheads for programs with small heaps when those heaps shrink.
2. The Go runtime already has some heuristics about this, but those
   heuristics appear to have bit-rotted and result in haphazard
   hugepage management. Unlucky (but otherwise fairly dense) regions of
   memory end up not backed by huge pages, while sparse regions end up
   accidentally marked MADV_HUGEPAGE and are not later broken up by the
   scavenger, because it already got the memory it needed from denser
   sections (this is more likely to happen with small heaps that go
   idle).

In this change, the runtime uses a new policy:

1. Mark all new memory MADV_HUGEPAGE.
2. Track whether each page chunk (4 MiB) became dense during the GC
   cycle. Mark those chunks MADV_HUGEPAGE and hide them from the
   scavenger.
3. If a chunk is not dense for 1 full GC cycle, make it visible to the
   scavenger.
4. The scavenger marks a chunk MADV_NOHUGEPAGE before it scavenges it.

This policy is intended to back memory that is a good candidate for
huge pages (high occupancy) with huge pages, and to give memory that is
not (low occupancy) to the scavenger. Occupancy is defined not just by
occupancy at any instant of time, but also by occupancy in the near
future. It's generally true that by the end of a GC cycle the heap gets
quite dense (from the perspective of the page allocator).

Because we want scavenging and huge page management to happen together
(the right time to MADV_NOHUGEPAGE is just before scavenging, in order
to break up huge pages and keep them that way), and because the cost of
applying MADV_HUGEPAGE and MADV_NOHUGEPAGE is somewhat high, the
scavenger avoids releasing memory in dense page chunks. All of this
together means the scavenger will now generally release memory on a
~1 GC cycle delay.

Notably, this has implications for scavenging to maintain the memory
limit and for the runtime/debug.FreeOSMemory API. This change makes it
so that in these cases all memory is visible to the scavenger
regardless of sparseness, and the page allocator delays re-marking this
memory with MADV_HUGEPAGE for around 1 GC cycle to mitigate churn.

The end result of this change should be little-to-no performance
difference for dense heaps (MADV_HUGEPAGE works a lot like the default
unmarked state), but it should allow the scavenger to more effectively
take back fragments of huge pages.

The main risk here is churn, because MADV_HUGEPAGE usually forces the
kernel to immediately back memory with a huge page. That's the reason
for the large amount of hysteresis (1 full GC cycle) and why the
definition of high density is 96% occupancy.

Fixes #55328.
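
For illustration, the density policy described above boils down to a few
dozen lines. The sketch below is not the runtime's implementation:
chunkState, observe, dense, and scavengeable are invented names, and the
thresholds are simplified. It only shows how tracking per-chunk occupancy
across two GC generations yields the behavior in points 2-4 (dense chunks
stay hugepage-backed and hidden; chunks that stay sparse for a full cycle
become scavengeable, and the scavenger would apply MADV_NOHUGEPAGE just
before releasing their pages).

	package main

	import "fmt"

	const (
		chunkPages  = 512                   // pages per 4 MiB chunk (8 KiB pages)
		hiOccupancy = chunkPages * 96 / 100 // the ~96% density threshold
	)

	// chunkState is the per-chunk bookkeeping: current occupancy, the
	// occupancy carried over from the previous GC generation, and the
	// generation in which the chunk was last touched.
	type chunkState struct {
		inUse     int
		lastInUse int
		gen       uint32
	}

	// observe records that n pages were allocated (n > 0) or freed (n < 0)
	// in generation g, rolling inUse into lastInUse on a generation change.
	func (c *chunkState) observe(n int, g uint32) {
		if c.gen != g {
			c.lastInUse, c.gen = c.inUse, g
		}
		c.inUse += n
	}

	// dense reports whether the chunk currently qualifies for MADV_HUGEPAGE.
	func (c *chunkState) dense() bool {
		return c.inUse >= hiOccupancy
	}

	// scavengeable reports whether the scavenger may release pages from the
	// chunk in generation g: the chunk must not have been dense in this
	// cycle or the previous one.
	func (c *chunkState) scavengeable(g uint32) bool {
		if c.gen == g {
			return c.inUse < hiOccupancy && c.lastInUse < hiOccupancy
		}
		// No activity since an older generation: inUse is still current.
		return c.inUse < hiOccupancy
	}

	func main() {
		var c chunkState
		c.observe(500, 1)  // chunk becomes dense in generation 1
		c.observe(-450, 2) // most of it is freed in generation 2
		fmt.Println(c.dense())         // false
		fmt.Println(c.scavengeable(2)) // false: it was dense last cycle
		fmt.Println(c.scavengeable(3)) // true: not dense for a full cycle
	}

In the actual change, this state lives in scavChunkData (mgcscavenge.go,
below) and is updated under the pageAlloc lock by the index's alloc and
free hooks.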
Change-Id: I8da7998f1a31b498a9cc9bc662c1ae1a6bf64630 Reviewed-on: https://go-review.googlesource.com/c/go/+/436395 Reviewed-by: Michael Pratt Run-TryBot: Michael Knyszek TryBot-Result: Gopher Robot --- diff --git a/src/runtime/debug/garbage_test.go b/src/runtime/debug/garbage_test.go index 7213bbe641..cd91782d27 100644 --- a/src/runtime/debug/garbage_test.go +++ b/src/runtime/debug/garbage_test.go @@ -146,7 +146,7 @@ func TestFreeOSMemory(t *testing.T) { return } if after.HeapReleased-before.HeapReleased < bigBytes-slack { - t.Fatalf("less than %d released: %d -> %d", bigBytes, before.HeapReleased, after.HeapReleased) + t.Fatalf("less than %d released: %d -> %d", bigBytes-slack, before.HeapReleased, after.HeapReleased) } } diff --git a/src/runtime/export_test.go b/src/runtime/export_test.go index 498c63f5b6..1045d510ef 100644 --- a/src/runtime/export_test.go +++ b/src/runtime/export_test.go @@ -831,7 +831,7 @@ func (p *PageAlloc) Free(base, npages uintptr) { // None of the tests need any higher-level locking, so we just // take the lock internally. lock(pp.mheapLock) - pp.free(base, npages, true) + pp.free(base, npages) unlock(pp.mheapLock) }) } @@ -841,7 +841,7 @@ func (p *PageAlloc) Bounds() (ChunkIdx, ChunkIdx) { func (p *PageAlloc) Scavenge(nbytes uintptr) (r uintptr) { pp := (*pageAlloc)(p) systemstack(func() { - r = pp.scavenge(nbytes, nil) + r = pp.scavenge(nbytes, nil, true) }) return } @@ -995,9 +995,8 @@ func NewPageAlloc(chunks, scav map[ChunkIdx][]BitRange) *PageAlloc { p := new(pageAlloc) // We've got an entry, so initialize the pageAlloc. - p.init(new(mutex), testSysStat) + p.init(new(mutex), testSysStat, true) lockInit(p.mheapLock, lockRankMheap) - p.test = true for i, init := range chunks { addr := chunkBase(chunkIdx(i)) @@ -1009,11 +1008,18 @@ func NewPageAlloc(chunks, scav map[ChunkIdx][]BitRange) *PageAlloc { }) // Initialize the bitmap and update pageAlloc metadata. - chunk := p.chunkOf(chunkIndex(addr)) + ci := chunkIndex(addr) + chunk := p.chunkOf(ci) // Clear all the scavenged bits which grow set. chunk.scavenged.clearRange(0, pallocChunkPages) + // Simulate the allocation and subsequent free of all pages in + // the chunk for the scavenge index. This sets the state equivalent + // with all pages within the index being free. + p.scav.index.alloc(ci, pallocChunkPages) + p.scav.index.free(ci, 0, pallocChunkPages) + // Apply scavenge state if applicable. if scav != nil { if scvg, ok := scav[i]; ok { @@ -1033,19 +1039,10 @@ func NewPageAlloc(chunks, scav map[ChunkIdx][]BitRange) *PageAlloc { // it and it's a no-op anyway. if s.N != 0 { chunk.allocRange(s.I, s.N) - } - } - // Make sure the scavenge index is updated. - // - // This is an inefficient way to do it, but it's also the simplest way. - minPages := physPageSize / pageSize - if minPages < 1 { - minPages = 1 - } - _, npages := chunk.findScavengeCandidate(pallocChunkPages-1, minPages, minPages) - if npages != 0 { - p.scav.index.mark(addr, addr+pallocChunkBytes) + // Make sure the scavenge index is updated. + p.scav.index.alloc(ci, s.N) + } } // Update heap metadata for the allocRange calls above. @@ -1070,8 +1067,6 @@ func FreePageAlloc(pp *PageAlloc) { for l := 0; l < summaryLevels; l++ { sysFreeOS(unsafe.Pointer(&p.summary[l][0]), uintptr(cap(p.summary[l]))*pallocSumBytes) } - // Only necessary on 64-bit. This is a global on 32-bit. 
- sysFreeOS(unsafe.Pointer(&p.scav.index.chunks[0]), uintptr(cap(p.scav.index.chunks))) } else { resSize := uintptr(0) for _, s := range p.summary { @@ -1080,6 +1075,9 @@ func FreePageAlloc(pp *PageAlloc) { sysFreeOS(unsafe.Pointer(&p.summary[0][0]), alignUp(resSize, physPageSize)) } + // Free extra data structures. + sysFreeOS(unsafe.Pointer(&p.scav.index.chunks[0]), uintptr(cap(p.scav.index.chunks))*unsafe.Sizeof(atomicScavChunkData{})) + // Subtract back out whatever we mapped for the summaries. // sysUsed adds to p.sysStat and memstats.mappedReady no matter what // (and in anger should actually be accounted for), and there's no other @@ -1629,23 +1627,96 @@ type ScavengeIndex struct { func NewScavengeIndex(min, max ChunkIdx) *ScavengeIndex { s := new(ScavengeIndex) - s.i.chunks = make([]atomic.Uint8, uintptr(1<= end { return r, 0 @@ -639,17 +665,17 @@ func bgscavenge(c chan int) { // scavenge scavenges nbytes worth of free pages, starting with the // highest address first. Successive calls continue from where it left -// off until the heap is exhausted. Call scavengeStartGen to bring it -// back to the top of the heap. +// off until the heap is exhausted. force makes all memory available to +// scavenge, ignoring huge page heuristics. // // Returns the amount of memory scavenged in bytes. // // scavenge always tries to scavenge nbytes worth of memory, and will // only fail to do so if the heap is exhausted for now. -func (p *pageAlloc) scavenge(nbytes uintptr, shouldStop func() bool) uintptr { +func (p *pageAlloc) scavenge(nbytes uintptr, shouldStop func() bool, force bool) uintptr { released := uintptr(0) for released < nbytes { - ci, pageIdx := p.scav.index.find() + ci, pageIdx := p.scav.index.find(force) if ci == 0 { break } @@ -737,10 +763,14 @@ func (p *pageAlloc) scavengeOne(ci chunkIdx, searchIdx uint, max uintptr) uintpt // Mark the range we're about to scavenge as allocated, because // we don't want any allocating goroutines to grab it while - // the scavenging is in progress. - if scav := p.allocRange(addr, uintptr(npages)); scav != 0 { - throw("double scavenge") - } + // the scavenging is in progress. Be careful here -- just do the + // bare minimum to avoid stepping on our own scavenging stats. + p.chunkOf(ci).allocRange(base, npages) + p.update(addr, uintptr(npages), true, true) + + // Grab whether the chunk is hugepage backed and if it is, + // clear it. We're about to break up this huge page. + shouldNoHugePage := p.scav.index.setNoHugePage(ci) // With that done, it's safe to unlock. unlock(p.mheapLock) @@ -748,13 +778,16 @@ func (p *pageAlloc) scavengeOne(ci chunkIdx, searchIdx uint, max uintptr) uintpt if !p.test { pageTraceScav(getg().m.p.ptr(), 0, addr, uintptr(npages)) - // Only perform the actual scavenging if we're not in a test. + // Only perform sys* operations if we're not in a test. // It's dangerous to do so otherwise. + if shouldNoHugePage { + sysNoHugePage(unsafe.Pointer(chunkBase(ci)), pallocChunkBytes) + } sysUnused(unsafe.Pointer(addr), uintptr(npages)*pageSize) // Update global accounting only when not in test, otherwise // the runtime's accounting will be wrong. - nbytes := int64(npages) * pageSize + nbytes := int64(npages * pageSize) gcController.heapReleased.add(nbytes) gcController.heapFree.add(-nbytes) @@ -767,7 +800,11 @@ func (p *pageAlloc) scavengeOne(ci chunkIdx, searchIdx uint, max uintptr) uintpt // Relock the heap, because now we need to make these pages // available allocation. Free them back to the page allocator. 
lock(p.mheapLock) - p.free(addr, uintptr(npages), true) + if b := (offAddr{addr}); b.lessThan(p.searchAddr) { + p.searchAddr = b + } + p.chunkOf(ci).free(base, npages) + p.update(addr, uintptr(npages), true, false) // Mark the range as scavenged. p.chunkOf(ci).scavenged.setRange(base, npages) @@ -777,7 +814,7 @@ func (p *pageAlloc) scavengeOne(ci chunkIdx, searchIdx uint, max uintptr) uintpt } } // Mark this chunk as having no free pages. - p.scav.index.clear(ci) + p.scav.index.setEmpty(ci) unlock(p.mheapLock) return 0 @@ -965,27 +1002,33 @@ func (m *pallocData) findScavengeCandidate(searchIdx uint, min, max uintptr) (ui // scavengeIndex is a structure for efficiently managing which pageAlloc chunks have // memory available to scavenge. type scavengeIndex struct { - // chunks is a bitmap representing the entire address space. Each bit represents - // a single chunk, and a 1 value indicates the presence of pages available for - // scavenging. Updates to the bitmap are serialized by the pageAlloc lock. + // chunks is a scavChunkData-per-chunk structure that indicates the presence of pages + // available for scavenging. Updates to the index are serialized by the pageAlloc lock. // - // The underlying storage of chunks is platform dependent and may not even be - // totally mapped read/write. min and max reflect the extent that is safe to access. - // min is inclusive, max is exclusive. + // It tracks chunk occupancy and a generation counter per chunk. If a chunk's occupancy + // never exceeds pallocChunkDensePages over the course of a single GC cycle, the chunk + // becomes eligible for scavenging on the next cycle. If a chunk ever hits this density + // threshold it immediately becomes unavailable for scavenging in the current cycle as + // well as the next. // - // searchAddr is the maximum address (in the offset address space, so we have a linear + // For a chunk size of 4 MiB this structure will only use 2 MiB for a 1 TiB contiguous heap. + chunks []atomicScavChunkData + min, max atomic.Uintptr + + // searchAddr* is the maximum address (in the offset address space, so we have a linear // view of the address space; see mranges.go:offAddr) containing memory available to // scavenge. It is a hint to the find operation to avoid O(n^2) behavior in repeated lookups. // - // searchAddr is always inclusive and should be the base address of the highest runtime + // searchAddr* is always inclusive and should be the base address of the highest runtime // page available for scavenging. // - // searchAddr is managed by both find and mark. + // searchAddrForce is managed by find and free. + // searchAddrBg is managed by find and nextGen. // - // Normally, find monotonically decreases searchAddr as it finds no more free pages to + // Normally, find monotonically decreases searchAddr* as it finds no more free pages to // scavenge. However, mark, when marking a new chunk at an index greater than the current // searchAddr, sets searchAddr to the *negative* index into chunks of that page. The trick here - // is that concurrent calls to find will fail to monotonically decrease searchAddr, and so they + // is that concurrent calls to find will fail to monotonically decrease searchAddr*, and so they // won't barge over new memory becoming available to scavenge. Furthermore, this ensures // that some future caller of find *must* observe the new high index. 
That caller // (or any other racing with it), then makes searchAddr positive before continuing, bringing @@ -994,47 +1037,52 @@ type scavengeIndex struct { // A pageAlloc lock serializes updates between min, max, and searchAddr, so abs(searchAddr) // is always guaranteed to be >= min and < max (converted to heap addresses). // - // TODO(mknyszek): Ideally we would use something bigger than a uint8 for faster - // iteration like uint32, but we lack the bit twiddling intrinsics. We'd need to either - // copy them from math/bits or fix the fact that we can't import math/bits' code from - // the runtime due to compiler instrumentation. - searchAddr atomicOffAddr - chunks []atomic.Uint8 - minHeapIdx atomic.Int32 - min, max atomic.Int32 + // searchAddrBg is increased only on each new generation and is mainly used by the + // background scavenger and heap-growth scavenging. searchAddrForce is increased continuously + // as memory gets freed and is mainly used by eager memory reclaim such as debug.FreeOSMemory + // and scavenging to maintain the memory limit. + searchAddrBg atomicOffAddr + searchAddrForce atomicOffAddr + + // freeHWM is the highest address (in offset address space) that was freed + // this generation. + freeHWM offAddr + + // Generation counter. Updated by nextGen at the end of each mark phase. + gen uint32 + + // test indicates whether or not we're in a test. + test bool } // find returns the highest chunk index that may contain pages available to scavenge. // It also returns an offset to start searching in the highest chunk. -func (s *scavengeIndex) find() (chunkIdx, uint) { - searchAddr, marked := s.searchAddr.Load() +func (s *scavengeIndex) find(force bool) (chunkIdx, uint) { + cursor := &s.searchAddrBg + if force { + cursor = &s.searchAddrForce + } + searchAddr, marked := cursor.Load() if searchAddr == minOffAddr.addr() { // We got a cleared search addr. return 0, 0 } - // Starting from searchAddr's chunk, and moving down to minHeapIdx, - // iterate until we find a chunk with pages to scavenge. - min := s.minHeapIdx.Load() - searchChunk := chunkIndex(uintptr(searchAddr)) - start := int32(searchChunk / 8) + // Starting from searchAddr's chunk, iterate until we find a chunk with pages to scavenge. + gen := s.gen + min := chunkIdx(s.min.Load()) + start := chunkIndex(uintptr(searchAddr)) for i := start; i >= min; i-- { - // Skip over irrelevant address space. - chunks := s.chunks[i].Load() - if chunks == 0 { + // Skip over chunks. + if !s.chunks[i].load().shouldScavenge(gen, force) { continue } - // Note that we can't have 8 leading zeroes here because - // we necessarily skipped that case. So, what's left is - // an index. If there are no zeroes, we want the 7th - // index, if 1 zero, the 6th, and so on. - n := 7 - sys.LeadingZeros8(chunks) - ci := chunkIdx(uint(i)*8 + uint(n)) - if searchChunk == ci { - return ci, chunkPageIndex(uintptr(searchAddr)) + // We're still scavenging this chunk. + if i == start { + return i, chunkPageIndex(uintptr(searchAddr)) } // Try to reduce searchAddr to newSearchAddr. - newSearchAddr := chunkBase(ci) + pallocChunkBytes - pageSize + newSearchAddr := chunkBase(i) + pallocChunkBytes - pageSize if marked { // Attempt to be the first one to decrease the searchAddr // after an increase. If we fail, that means there was another @@ -1042,78 +1090,273 @@ func (s *scavengeIndex) find() (chunkIdx, uint) { // it doesn't matter. We may lose some performance having an // incorrect search address, but it's far more important that // we don't miss updates. 
- s.searchAddr.StoreUnmark(searchAddr, newSearchAddr) + cursor.StoreUnmark(searchAddr, newSearchAddr) } else { // Decrease searchAddr. - s.searchAddr.StoreMin(newSearchAddr) + cursor.StoreMin(newSearchAddr) } - return ci, pallocChunkPages - 1 + return i, pallocChunkPages - 1 } // Clear searchAddr, because we've exhausted the heap. - s.searchAddr.Clear() + cursor.Clear() return 0, 0 } -// mark sets the inclusive range of chunks between indices start and end as -// containing pages available to scavenge. +// alloc updates metadata for chunk at index ci with the fact that +// an allocation of npages occurred. // -// Must be serialized with other mark, markRange, and clear calls. -func (s *scavengeIndex) mark(base, limit uintptr) { - start, end := chunkIndex(base), chunkIndex(limit-pageSize) - if start == end { - // Within a chunk. - mask := uint8(1 << (start % 8)) - s.chunks[start/8].Or(mask) - } else if start/8 == end/8 { - // Within the same byte in the index. - mask := uint8(uint16(1<<(end-start+1))-1) << (start % 8) - s.chunks[start/8].Or(mask) - } else { - // Crosses multiple bytes in the index. - startAligned := chunkIdx(alignUp(uintptr(start), 8)) - endAligned := chunkIdx(alignDown(uintptr(end), 8)) - - // Do the end of the first byte first. - if width := startAligned - start; width > 0 { - mask := uint8(uint16(1< 0 { - mask := uint8(uint16(1< scavChunkHiOccPages { + // Mark dense chunks as specifically backed by huge pages. + sc.setHugePage() + if !s.test { + sysHugePage(unsafe.Pointer(chunkBase(ci)), pallocChunkBytes) } } - newSearchAddr := limit - pageSize - searchAddr, _ := s.searchAddr.Load() - // N.B. Because mark is serialized, it's not necessary to do a - // full CAS here. mark only ever increases searchAddr, while + s.chunks[ci].store(sc) +} + +// free updates metadata for chunk at index ci with the fact that +// a free of npages occurred. +// +// free may only run concurrently with find. +func (s *scavengeIndex) free(ci chunkIdx, page, npages uint) { + sc := s.chunks[ci].load() + sc.free(npages, s.gen) + s.chunks[ci].store(sc) + + // Update scavenge search addresses. + addr := chunkBase(ci) + uintptr(page+npages-1)*pageSize + if s.freeHWM.lessThan(offAddr{addr}) { + s.freeHWM = offAddr{addr} + } + // N.B. Because free is serialized, it's not necessary to do a + // full CAS here. free only ever increases searchAddr, while // find only ever decreases it. Since we only ever race with // decreases, even if the value we loaded is stale, the actual // value will never be larger. - if (offAddr{searchAddr}).lessThan(offAddr{newSearchAddr}) { - s.searchAddr.StoreMarked(newSearchAddr) + searchAddr, _ := s.searchAddrForce.Load() + if (offAddr{searchAddr}).lessThan(offAddr{addr}) { + s.searchAddrForce.StoreMarked(addr) + } +} + +// nextGen moves the scavenger forward one generation. Must be called +// once per GC cycle, but may be called more often to force more memory +// to be released. +// +// nextGen may only run concurrently with find. +func (s *scavengeIndex) nextGen() { + s.gen++ + searchAddr, _ := s.searchAddrBg.Load() + if (offAddr{searchAddr}).lessThan(s.freeHWM) { + s.searchAddrBg.StoreMarked(s.freeHWM.addr()) } + s.freeHWM = minOffAddr +} + +// setEmpty marks that the scavenger has finished looking at ci +// for now to prevent the scavenger from getting stuck looking +// at the same chunk. +// +// setEmpty may only run concurrently with find. 
+func (s *scavengeIndex) setEmpty(ci chunkIdx) { + val := s.chunks[ci].load() + val.setEmpty() + s.chunks[ci].store(val) } -// clear sets the chunk at index ci as not containing pages available to scavenge. +// setNoHugePage updates the backed-by-hugepages status of a particular chunk. +// Returns true if the set was successful (not already backed by huge pages). // -// Must be serialized with other mark, markRange, and clear calls. -func (s *scavengeIndex) clear(ci chunkIdx) { - s.chunks[ci/8].And(^uint8(1 << (ci % 8))) +// setNoHugePage may only run concurrently with find. +func (s *scavengeIndex) setNoHugePage(ci chunkIdx) bool { + val := s.chunks[ci].load() + if !val.isHugePage() { + return false + } + val.setNoHugePage() + s.chunks[ci].store(val) + return true +} + +// atomicScavChunkData is an atomic wrapper around a scavChunkData +// that stores it in its packed form. +type atomicScavChunkData struct { + value atomic.Uint64 +} + +// load loads and unpacks a scavChunkData. +func (sc *atomicScavChunkData) load() scavChunkData { + return unpackScavChunkData(sc.value.Load()) +} + +// store packs and writes a new scavChunkData. store must be serialized +// with other calls to store. +func (sc *atomicScavChunkData) store(ssc scavChunkData) { + sc.value.Store(ssc.pack()) +} + +// scavChunkData tracks information about a palloc chunk for +// scavenging. It packs well into 64 bits. +// +// The zero value always represents a valid newly-grown chunk. +type scavChunkData struct { + // inUse indicates how many pages in this chunk are currently + // allocated. + // + // Only the first 10 bits are used. + inUse uint16 + + // lastInUse indicates how many pages in this chunk were allocated + // when we transitioned from gen-1 to gen. + // + // Only the first 10 bits are used. + lastInUse uint16 + + // gen is the generation counter from a scavengeIndex from the + // last time this scavChunkData was updated. + gen uint32 + + // scavChunkFlags represents additional flags + // + // Note: only 6 bits are available. + scavChunkFlags +} + +// unpackScavChunkData unpacks a scavChunkData from a uint64. +func unpackScavChunkData(sc uint64) scavChunkData { + return scavChunkData{ + inUse: uint16(sc), + lastInUse: uint16(sc>>16) & scavChunkInUseMask, + gen: uint32(sc >> 32), + scavChunkFlags: scavChunkFlags(uint8(sc>>(16+logScavChunkInUseMax)) & scavChunkFlagsMask), + } +} + +// pack returns sc packed into a uint64. +func (sc scavChunkData) pack() uint64 { + return uint64(sc.inUse) | + (uint64(sc.lastInUse) << 16) | + (uint64(sc.scavChunkFlags) << (16 + logScavChunkInUseMax)) | + (uint64(sc.gen) << 32) +} + +const ( + // scavChunkHasFree indicates whether the chunk has anything left to + // scavenge. This is the opposite of "empty," used elsewhere in this + // file. The reason we say "HasFree" here is so the zero value is + // correct for a newly-grown chunk. (New memory is scavenged.) + scavChunkHasFree scavChunkFlags = 1 << iota + // scavChunkNoHugePage indicates whether this chunk has been marked + // sysNoHugePage. If not set, it means the chunk is marked sysHugePage. + // The negative here is unfortunate, but necessary to make it so that + // the zero value of scavChunkData accurately represents the state of + // a newly-grown chunk. (New memory is marked as backed by huge pages.) + scavChunkNoHugePage + + // scavChunkMaxFlags is the maximum number of flags we can have, given how + // a scavChunkData is packed into 8 bytes. 
+ scavChunkMaxFlags = 6 + scavChunkFlagsMask = (1 << scavChunkMaxFlags) - 1 + + // logScavChunkInUseMax is the number of bits needed to represent the number + // of pages allocated in a single chunk. This is 1 more than log2 of the + // number of pages in the chunk because we need to represent a fully-allocated + // chunk. + logScavChunkInUseMax = logPallocChunkPages + 1 + scavChunkInUseMask = (1 << logScavChunkInUseMax) - 1 +) + +// scavChunkFlags is a set of bit-flags for the scavenger for each palloc chunk. +type scavChunkFlags uint8 + +// isEmpty returns true if the hasFree flag is unset. +func (sc *scavChunkFlags) isEmpty() bool { + return (*sc)&scavChunkHasFree == 0 +} + +// setEmpty clears the hasFree flag. +func (sc *scavChunkFlags) setEmpty() { + *sc &^= scavChunkHasFree +} + +// setNonEmpty sets the hasFree flag. +func (sc *scavChunkFlags) setNonEmpty() { + *sc |= scavChunkHasFree +} + +// isHugePage returns false if the noHugePage flag is set. +func (sc *scavChunkFlags) isHugePage() bool { + return (*sc)&scavChunkNoHugePage == 0 +} + +// setHugePage clears the noHugePage flag. +func (sc *scavChunkFlags) setHugePage() { + *sc &^= scavChunkNoHugePage +} + +// setNoHugePage sets the noHugePage flag. +func (sc *scavChunkFlags) setNoHugePage() { + *sc |= scavChunkNoHugePage +} + +// shouldScavenge returns true if the corresponding chunk should be interrogated +// by the scavenger. +func (sc scavChunkData) shouldScavenge(currGen uint32, force bool) bool { + if sc.isEmpty() { + // Nothing to scavenge. + return false + } + if force { + // We're forcing the memory to be scavenged. + return true + } + if sc.gen == currGen { + // In the current generation, if either the current or last generation + // is dense, then skip scavenging. Inverting that, we should scavenge + // if both the current and last generation were not dense. + return sc.inUse < scavChunkHiOccPages && sc.lastInUse < scavChunkHiOccPages + } + // If we're one or more generations ahead, we know inUse represents the current + // state of the chunk, since otherwise it would've been updated already. + return sc.inUse < scavChunkHiOccPages +} + +// alloc updates sc given that npages were allocated in the corresponding chunk. +func (sc *scavChunkData) alloc(npages uint, newGen uint32) { + if uint(sc.inUse)+npages > pallocChunkPages { + print("runtime: inUse=", sc.inUse, " npages=", npages, "\n") + throw("too many pages allocated in chunk?") + } + if sc.gen != newGen { + sc.lastInUse = sc.inUse + sc.gen = newGen + } + sc.inUse += uint16(npages) + if sc.inUse == pallocChunkPages { + // There's nothing for the scavenger to take from here. + sc.setEmpty() + } +} + +// free updates sc given that npages was freed in the corresponding chunk. +func (sc *scavChunkData) free(npages uint, newGen uint32) { + if uint(sc.inUse) < npages { + print("runtime: inUse=", sc.inUse, " npages=", npages, "\n") + throw("allocated pages below zero?") + } + if sc.gen != newGen { + sc.lastInUse = sc.inUse + sc.gen = newGen + } + sc.inUse -= uint16(npages) + // The scavenger can no longer be done with this chunk now that + // new memory has been freed into it. 
+ sc.setNonEmpty() } type piController struct { diff --git a/src/runtime/mgcscavenge_test.go b/src/runtime/mgcscavenge_test.go index c436ff060f..d7624d6d72 100644 --- a/src/runtime/mgcscavenge_test.go +++ b/src/runtime/mgcscavenge_test.go @@ -564,149 +564,278 @@ func TestScavenger(t *testing.T) { } func TestScavengeIndex(t *testing.T) { - setup := func(t *testing.T) (func(ChunkIdx, uint), func(uintptr, uintptr)) { + // This test suite tests the scavengeIndex data structure. + + // markFunc is a function that makes the address range [base, limit) + // available for scavenging in a test index. + type markFunc func(base, limit uintptr) + + // findFunc is a function that searches for the next available page + // to scavenge in the index. It asserts that the page is found in + // chunk "ci" at page "offset." + type findFunc func(ci ChunkIdx, offset uint) + + // The structure of the tests below is as follows: + // + // setup creates a fake scavengeIndex that can be mutated and queried by + // the functions it returns. Those functions capture the testing.T that + // setup is called with, so they're bound to the subtest they're created in. + // + // Tests are then organized into test cases which mark some pages as + // scavenge-able then try to find them. Tests expect that the initial + // state of the scavengeIndex has all of the chunks as dense in the last + // generation and empty to the scavenger. + // + // There are a few additional tests that interleave mark and find operations, + // so they're defined separately, but use the same infrastructure. + setup := func(t *testing.T, force bool) (mark markFunc, find findFunc, nextGen func()) { t.Helper() // Pick some reasonable bounds. We don't need a huge range just to test. si := NewScavengeIndex(BaseChunkIdx, BaseChunkIdx+64) - find := func(want ChunkIdx, wantOffset uint) { + + // Initialize all the chunks as dense and empty. + // + // Also, reset search addresses so that we can get page offsets. + si.AllocRange(PageBase(BaseChunkIdx, 0), PageBase(BaseChunkIdx+64, 0)) + si.NextGen() + si.FreeRange(PageBase(BaseChunkIdx, 0), PageBase(BaseChunkIdx+64, 0)) + for ci := BaseChunkIdx; ci < BaseChunkIdx+64; ci++ { + si.SetEmpty(ci) + } + si.ResetSearchAddrs() + + // Create and return test functions. 
+ mark = func(base, limit uintptr) { t.Helper() - got, gotOffset := si.Find() + si.AllocRange(base, limit) + si.FreeRange(base, limit) + } + find = func(want ChunkIdx, wantOffset uint) { + t.Helper() + + got, gotOffset := si.Find(force) if want != got { t.Errorf("find: wanted chunk index %d, got %d", want, got) } - if want != got { + if wantOffset != gotOffset { t.Errorf("find: wanted page offset %d, got %d", wantOffset, gotOffset) } if t.Failed() { t.FailNow() } - si.Clear(got) + si.SetEmpty(got) } - mark := func(base, limit uintptr) { + nextGen = func() { t.Helper() - si.Mark(base, limit) + si.NextGen() } - return find, mark + return } - t.Run("Uninitialized", func(t *testing.T) { - find, _ := setup(t) - find(0, 0) - }) - t.Run("OnePage", func(t *testing.T) { - find, mark := setup(t) - mark(PageBase(BaseChunkIdx, 3), PageBase(BaseChunkIdx, 4)) - find(BaseChunkIdx, 3) - find(0, 0) - }) - t.Run("FirstPage", func(t *testing.T) { - find, mark := setup(t) - mark(PageBase(BaseChunkIdx, 0), PageBase(BaseChunkIdx, 1)) - find(BaseChunkIdx, 0) - find(0, 0) - }) - t.Run("SeveralPages", func(t *testing.T) { - find, mark := setup(t) - mark(PageBase(BaseChunkIdx, 9), PageBase(BaseChunkIdx, 14)) - find(BaseChunkIdx, 13) - find(0, 0) - }) - t.Run("WholeChunk", func(t *testing.T) { - find, mark := setup(t) - mark(PageBase(BaseChunkIdx, 0), PageBase(BaseChunkIdx+1, 0)) - find(BaseChunkIdx, PallocChunkPages-1) - find(0, 0) - }) - t.Run("LastPage", func(t *testing.T) { - find, mark := setup(t) - mark(PageBase(BaseChunkIdx, PallocChunkPages-1), PageBase(BaseChunkIdx+1, 0)) - find(BaseChunkIdx, PallocChunkPages-1) - find(0, 0) - }) - t.Run("TwoChunks", func(t *testing.T) { - find, mark := setup(t) - mark(PageBase(BaseChunkIdx, 128), PageBase(BaseChunkIdx+1, 128)) - find(BaseChunkIdx+1, 127) - find(BaseChunkIdx, PallocChunkPages-1) - find(0, 0) - }) - t.Run("TwoChunksOffset", func(t *testing.T) { - find, mark := setup(t) - mark(PageBase(BaseChunkIdx+7, 128), PageBase(BaseChunkIdx+8, 129)) - find(BaseChunkIdx+8, 128) - find(BaseChunkIdx+7, PallocChunkPages-1) - find(0, 0) - }) - t.Run("SevenChunksOffset", func(t *testing.T) { - find, mark := setup(t) - mark(PageBase(BaseChunkIdx+6, 11), PageBase(BaseChunkIdx+13, 15)) - find(BaseChunkIdx+13, 14) - for i := BaseChunkIdx + 12; i >= BaseChunkIdx+6; i-- { - find(i, PallocChunkPages-1) - } - find(0, 0) - }) - t.Run("ThirtyTwoChunks", func(t *testing.T) { - find, mark := setup(t) - mark(PageBase(BaseChunkIdx, 0), PageBase(BaseChunkIdx+32, 0)) - for i := BaseChunkIdx + 31; i >= BaseChunkIdx; i-- { - find(i, PallocChunkPages-1) - } - find(0, 0) - }) - t.Run("ThirtyTwoChunksOffset", func(t *testing.T) { - find, mark := setup(t) - mark(PageBase(BaseChunkIdx+3, 0), PageBase(BaseChunkIdx+35, 0)) - for i := BaseChunkIdx + 34; i >= BaseChunkIdx+3; i-- { - find(i, PallocChunkPages-1) - } - find(0, 0) - }) - t.Run("Mark", func(t *testing.T) { - find, mark := setup(t) + + // Each of these test cases calls mark and then find once. 
+ type testCase struct { + name string + mark func(markFunc) + find func(findFunc) + } + for _, test := range []testCase{ + { + name: "Uninitialized", + mark: func(_ markFunc) {}, + find: func(_ findFunc) {}, + }, + { + name: "OnePage", + mark: func(mark markFunc) { + mark(PageBase(BaseChunkIdx, 3), PageBase(BaseChunkIdx, 4)) + }, + find: func(find findFunc) { + find(BaseChunkIdx, 3) + }, + }, + { + name: "FirstPage", + mark: func(mark markFunc) { + mark(PageBase(BaseChunkIdx, 0), PageBase(BaseChunkIdx, 1)) + }, + find: func(find findFunc) { + find(BaseChunkIdx, 0) + }, + }, + { + name: "SeveralPages", + mark: func(mark markFunc) { + mark(PageBase(BaseChunkIdx, 9), PageBase(BaseChunkIdx, 14)) + }, + find: func(find findFunc) { + find(BaseChunkIdx, 13) + }, + }, + { + name: "WholeChunk", + mark: func(mark markFunc) { + mark(PageBase(BaseChunkIdx, 0), PageBase(BaseChunkIdx+1, 0)) + }, + find: func(find findFunc) { + find(BaseChunkIdx, PallocChunkPages-1) + }, + }, + { + name: "LastPage", + mark: func(mark markFunc) { + mark(PageBase(BaseChunkIdx, PallocChunkPages-1), PageBase(BaseChunkIdx+1, 0)) + }, + find: func(find findFunc) { + find(BaseChunkIdx, PallocChunkPages-1) + }, + }, + { + name: "TwoChunks", + mark: func(mark markFunc) { + mark(PageBase(BaseChunkIdx, 128), PageBase(BaseChunkIdx+1, 128)) + }, + find: func(find findFunc) { + find(BaseChunkIdx+1, 127) + find(BaseChunkIdx, PallocChunkPages-1) + }, + }, + { + name: "TwoChunksOffset", + mark: func(mark markFunc) { + mark(PageBase(BaseChunkIdx+7, 128), PageBase(BaseChunkIdx+8, 129)) + }, + find: func(find findFunc) { + find(BaseChunkIdx+8, 128) + find(BaseChunkIdx+7, PallocChunkPages-1) + }, + }, + { + name: "SevenChunksOffset", + mark: func(mark markFunc) { + mark(PageBase(BaseChunkIdx+6, 11), PageBase(BaseChunkIdx+13, 15)) + }, + find: func(find findFunc) { + find(BaseChunkIdx+13, 14) + for i := BaseChunkIdx + 12; i >= BaseChunkIdx+6; i-- { + find(i, PallocChunkPages-1) + } + }, + }, + { + name: "ThirtyTwoChunks", + mark: func(mark markFunc) { + mark(PageBase(BaseChunkIdx, 0), PageBase(BaseChunkIdx+32, 0)) + }, + find: func(find findFunc) { + for i := BaseChunkIdx + 31; i >= BaseChunkIdx; i-- { + find(i, PallocChunkPages-1) + } + }, + }, + { + name: "ThirtyTwoChunksOffset", + mark: func(mark markFunc) { + mark(PageBase(BaseChunkIdx+3, 0), PageBase(BaseChunkIdx+35, 0)) + }, + find: func(find findFunc) { + for i := BaseChunkIdx + 34; i >= BaseChunkIdx+3; i-- { + find(i, PallocChunkPages-1) + } + }, + }, + { + name: "Mark", + mark: func(mark markFunc) { + for i := BaseChunkIdx; i < BaseChunkIdx+32; i++ { + mark(PageBase(i, 0), PageBase(i+1, 0)) + } + }, + find: func(find findFunc) { + for i := BaseChunkIdx + 31; i >= BaseChunkIdx; i-- { + find(i, PallocChunkPages-1) + } + }, + }, + { + name: "MarkIdempotentOneChunk", + mark: func(mark markFunc) { + mark(PageBase(BaseChunkIdx, 0), PageBase(BaseChunkIdx+1, 0)) + mark(PageBase(BaseChunkIdx, 0), PageBase(BaseChunkIdx+1, 0)) + }, + find: func(find findFunc) { + find(BaseChunkIdx, PallocChunkPages-1) + }, + }, + { + name: "MarkIdempotentThirtyTwoChunks", + mark: func(mark markFunc) { + mark(PageBase(BaseChunkIdx, 0), PageBase(BaseChunkIdx+32, 0)) + mark(PageBase(BaseChunkIdx, 0), PageBase(BaseChunkIdx+32, 0)) + }, + find: func(find findFunc) { + for i := BaseChunkIdx + 31; i >= BaseChunkIdx; i-- { + find(i, PallocChunkPages-1) + } + }, + }, + { + name: "MarkIdempotentThirtyTwoChunksOffset", + mark: func(mark markFunc) { + mark(PageBase(BaseChunkIdx+4, 0), PageBase(BaseChunkIdx+31, 0)) + 
mark(PageBase(BaseChunkIdx+5, 0), PageBase(BaseChunkIdx+36, 0)) + }, + find: func(find findFunc) { + for i := BaseChunkIdx + 35; i >= BaseChunkIdx+4; i-- { + find(i, PallocChunkPages-1) + } + }, + }, + } { + test := test + t.Run("Bg/"+test.name, func(t *testing.T) { + mark, find, nextGen := setup(t, false) + test.mark(mark) + find(0, 0) // Make sure we find nothing at this point. + nextGen() // Move to the next generation. + test.find(find) // Now we should be able to find things. + find(0, 0) // The test should always fully exhaust the index. + }) + t.Run("Force/"+test.name, func(t *testing.T) { + mark, find, _ := setup(t, true) + test.mark(mark) + test.find(find) // Finding should always work when forced. + find(0, 0) // The test should always fully exhaust the index. + }) + } + t.Run("Bg/MarkInterleaved", func(t *testing.T) { + mark, find, nextGen := setup(t, false) for i := BaseChunkIdx; i < BaseChunkIdx+32; i++ { mark(PageBase(i, 0), PageBase(i+1, 0)) - } - for i := BaseChunkIdx + 31; i >= BaseChunkIdx; i-- { + nextGen() find(i, PallocChunkPages-1) } find(0, 0) }) - t.Run("MarkInterleaved", func(t *testing.T) { - find, mark := setup(t) + t.Run("Force/MarkInterleaved", func(t *testing.T) { + mark, find, _ := setup(t, true) for i := BaseChunkIdx; i < BaseChunkIdx+32; i++ { mark(PageBase(i, 0), PageBase(i+1, 0)) find(i, PallocChunkPages-1) } find(0, 0) }) - t.Run("MarkIdempotentOneChunk", func(t *testing.T) { - find, mark := setup(t) - mark(PageBase(BaseChunkIdx, 0), PageBase(BaseChunkIdx+1, 0)) - mark(PageBase(BaseChunkIdx, 0), PageBase(BaseChunkIdx+1, 0)) - find(BaseChunkIdx, PallocChunkPages-1) - find(0, 0) - }) - t.Run("MarkIdempotentThirtyTwoChunks", func(t *testing.T) { - find, mark := setup(t) - mark(PageBase(BaseChunkIdx, 0), PageBase(BaseChunkIdx+32, 0)) - mark(PageBase(BaseChunkIdx, 0), PageBase(BaseChunkIdx+32, 0)) - for i := BaseChunkIdx + 31; i >= BaseChunkIdx; i-- { - find(i, PallocChunkPages-1) - } - find(0, 0) - }) - t.Run("MarkIdempotentThirtyTwoChunksOffset", func(t *testing.T) { - find, mark := setup(t) - mark(PageBase(BaseChunkIdx+4, 0), PageBase(BaseChunkIdx+31, 0)) - mark(PageBase(BaseChunkIdx+5, 0), PageBase(BaseChunkIdx+36, 0)) - for i := BaseChunkIdx + 35; i >= BaseChunkIdx+4; i-- { - find(i, PallocChunkPages-1) - } - find(0, 0) - }) +} + +func TestScavChunkDataPack(t *testing.T) { + if !CheckPackScavChunkData(1918237402, 512, 512, 0b11) { + t.Error("failed pack/unpack check for scavChunkData 1") + } + if !CheckPackScavChunkData(^uint32(0), 12, 0, 0b00) { + t.Error("failed pack/unpack check for scavChunkData 2") + } } func FuzzPIController(f *testing.F) { diff --git a/src/runtime/mgcsweep.go b/src/runtime/mgcsweep.go index 773e27e646..febe519750 100644 --- a/src/runtime/mgcsweep.go +++ b/src/runtime/mgcsweep.go @@ -260,9 +260,11 @@ func finishsweep_m() { c.fullUnswept(sg).reset() } - // Sweeping is done, so if the scavenger isn't already awake, - // wake it up. There's definitely work for it to do at this - // point. + // Sweeping is done, so there won't be any new memory to + // scavenge for a bit. + // + // If the scavenger isn't already awake, wake it up. There's + // definitely work for it to do at this point. 
scavenger.wake() nextMarkBitArenaEpoch() diff --git a/src/runtime/mheap.go b/src/runtime/mheap.go index 06592fe95b..ee005978fb 100644 --- a/src/runtime/mheap.go +++ b/src/runtime/mheap.go @@ -773,7 +773,7 @@ func (h *mheap) init() { h.central[i].mcentral.init(spanClass(i)) } - h.pages.init(&h.lock, &memstats.gcMiscSys) + h.pages.init(&h.lock, &memstats.gcMiscSys, false) } // reclaim sweeps and reclaims at least npage pages into the heap. @@ -1274,6 +1274,7 @@ HaveSpan: // pages not to get touched until we return. Simultaneously, it's important // to do this before calling sysUsed because that may commit address space. bytesToScavenge := uintptr(0) + forceScavenge := false if limit := gcController.memoryLimit.Load(); !gcCPULimiter.limiting() { // Assist with scavenging to maintain the memory limit by the amount // that we expect to page in. @@ -1282,6 +1283,7 @@ HaveSpan: // someone can set a really big memory limit that isn't maxInt64. if uint64(scav)+inuse > uint64(limit) { bytesToScavenge = uintptr(uint64(scav) + inuse - uint64(limit)) + forceScavenge = true } } if goal := scavenge.gcPercentGoal.Load(); goal != ^uint64(0) && growth > 0 { @@ -1323,7 +1325,7 @@ HaveSpan: // Scavenge, but back out if the limiter turns on. h.pages.scavenge(bytesToScavenge, func() bool { return gcCPULimiter.limiting() - }) + }, forceScavenge) // Finish up accounting. now = nanotime() @@ -1629,7 +1631,7 @@ func (h *mheap) freeSpanLocked(s *mspan, typ spanAllocType) { memstats.heapStats.release() // Mark the space as free. - h.pages.free(s.base(), s.npages, false) + h.pages.free(s.base(), s.npages) // Free the span structure. We no longer have a use for it. s.state.set(mSpanDead) @@ -1639,6 +1641,10 @@ func (h *mheap) freeSpanLocked(s *mspan, typ spanAllocType) { // scavengeAll acquires the heap lock (blocking any additional // manipulation of the page allocator) and iterates over the whole // heap, scavenging every free page available. +// +// Must run on the system stack because it acquires the heap lock. +// +//go:systemstack func (h *mheap) scavengeAll() { // Disallow malloc or panic while holding the heap lock. We do // this here because this is a non-mallocgc entry-point to @@ -1646,7 +1652,8 @@ func (h *mheap) scavengeAll() { gp := getg() gp.m.mallocing++ - released := h.pages.scavenge(^uintptr(0), nil) + // Force scavenge everything. + released := h.pages.scavenge(^uintptr(0), nil, true) gp.m.mallocing-- diff --git a/src/runtime/mpagealloc.go b/src/runtime/mpagealloc.go index 4f35cafc24..da1b14e5a4 100644 --- a/src/runtime/mpagealloc.go +++ b/src/runtime/mpagealloc.go @@ -257,11 +257,9 @@ type pageAlloc struct { // known by the page allocator to be currently in-use (passed // to grow). // - // This field is currently unused on 32-bit architectures but - // is harmless to track. We care much more about having a - // contiguous heap in these cases and take additional measures - // to ensure that, so in nearly all cases this should have just - // 1 element. + // We care much more about having a contiguous heap in these cases + // and take additional measures to ensure that, so in nearly all + // cases this should have just 1 element. // // All access is protected by the mheapLock. 
inUse addrRanges @@ -300,7 +298,7 @@ type pageAlloc struct { test bool } -func (p *pageAlloc) init(mheapLock *mutex, sysStat *sysMemStat) { +func (p *pageAlloc) init(mheapLock *mutex, sysStat *sysMemStat, test bool) { if levelLogPages[0] > logMaxPackedValue { // We can't represent 1< 0 { need = need.subtract(addrRangeToSumAddrRange(l, p.inUse.ranges[inUseIndex-1])) } @@ -188,17 +185,18 @@ func (p *pageAlloc) sysGrow(base, limit uintptr) { } // Update the scavenge index. - p.summaryMappedReady += p.scav.index.grow(base, limit, p.sysStat) + p.summaryMappedReady += p.scav.index.sysGrow(base, limit, p.sysStat) } -// grow increases the index's backing store in response to a heap growth. +// sysGrow increases the index's backing store in response to a heap growth. // // Returns the amount of memory added to sysStat. -func (s *scavengeIndex) grow(base, limit uintptr, sysStat *sysMemStat) uintptr { +func (s *scavengeIndex) sysGrow(base, limit uintptr, sysStat *sysMemStat) uintptr { if base%pallocChunkBytes != 0 || limit%pallocChunkBytes != 0 { print("runtime: base = ", hex(base), ", limit = ", hex(limit), "\n") throw("sysGrow bounds not aligned to pallocChunkBytes") } + scSize := unsafe.Sizeof(atomicScavChunkData{}) // Map and commit the pieces of chunks that we need. // // We always map the full range of the minimum heap address to the @@ -212,24 +210,24 @@ func (s *scavengeIndex) grow(base, limit uintptr, sysStat *sysMemStat) uintptr { // index. haveMin := s.min.Load() haveMax := s.max.Load() - needMin := int32(alignDown(uintptr(chunkIndex(base)/8), physPageSize)) - needMax := int32(alignUp(uintptr((chunkIndex(limit)+7)/8), physPageSize)) + needMin := alignDown(uintptr(chunkIndex(base)), physPageSize/scSize) + needMax := alignUp(uintptr(chunkIndex(limit)), physPageSize/scSize) // Extend the range down to what we have, if there's no overlap. if needMax < haveMin { needMax = haveMin } - if needMin > haveMax { + if haveMax != 0 && needMin > haveMax { needMin = haveMax } have := makeAddrRange( // Avoid a panic from indexing one past the last element. - uintptr(unsafe.Pointer(&s.chunks[0]))+uintptr(haveMin), - uintptr(unsafe.Pointer(&s.chunks[0]))+uintptr(haveMax), + uintptr(unsafe.Pointer(&s.chunks[0]))+haveMin*scSize, + uintptr(unsafe.Pointer(&s.chunks[0]))+haveMax*scSize, ) need := makeAddrRange( // Avoid a panic from indexing one past the last element. - uintptr(unsafe.Pointer(&s.chunks[0]))+uintptr(needMin), - uintptr(unsafe.Pointer(&s.chunks[0]))+uintptr(needMax), + uintptr(unsafe.Pointer(&s.chunks[0]))+needMin*scSize, + uintptr(unsafe.Pointer(&s.chunks[0]))+needMax*scSize, ) // Subtract any overlap from rounding. We can't re-map memory because // it'll be zeroed. @@ -247,11 +245,14 @@ func (s *scavengeIndex) grow(base, limit uintptr, sysStat *sysMemStat) uintptr { s.max.Store(needMax) } } - // Update minHeapIdx. Note that even if there's no mapping work to do, - // we may still have a new, lower minimum heap address. - minHeapIdx := s.minHeapIdx.Load() - if baseIdx := int32(chunkIndex(base) / 8); minHeapIdx == 0 || baseIdx < minHeapIdx { - s.minHeapIdx.Store(baseIdx) - } return need.size() } + +// sysInit initializes the scavengeIndex' chunks array. +func (s *scavengeIndex) sysInit() { + n := uintptr(1<
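
For reference, here is a standalone sketch of the 64-bit packing used by
scavChunkData above, assuming 512 pages per chunk. The constant and field
names are illustrative rather than the runtime's, and the example values
only loosely mirror those exercised by TestScavChunkDataPack.

	package main

	import "fmt"

	const (
		logChunkPages = 9                      // 512 pages per 4 MiB chunk
		logInUseMax   = logChunkPages + 1      // one extra bit to represent a full chunk (512)
		inUseMask     = (1 << logInUseMax) - 1 // low 10 bits
		flagsShift    = 16 + logInUseMax       // flags live just above lastInUse
		flagsMask     = (1 << 6) - 1           // 6 flag bits fit before the generation
	)

	// chunkData mirrors the unpacked form: two 10-bit occupancy counts, a
	// 32-bit generation, and a small set of flags.
	type chunkData struct {
		inUse, lastInUse uint16
		gen              uint32
		flags            uint8
	}

	// pack lays the fields out as inUse [0,16), lastInUse [16,26),
	// flags [26,32), gen [32,64), in the same order the diff uses.
	func pack(c chunkData) uint64 {
		return uint64(c.inUse) |
			uint64(c.lastInUse)<<16 |
			uint64(c.flags)<<flagsShift |
			uint64(c.gen)<<32
	}

	// unpack reverses pack.
	func unpack(v uint64) chunkData {
		return chunkData{
			inUse:     uint16(v) & inUseMask,
			lastInUse: uint16(v>>16) & inUseMask,
			flags:     uint8(v>>flagsShift) & flagsMask,
			gen:       uint32(v >> 32),
		}
	}

	func main() {
		c := chunkData{inUse: 12, lastInUse: 512, gen: 1918237402, flags: 0b11}
		if unpack(pack(c)) != c {
			panic("pack/unpack mismatch")
		}
		fmt.Printf("packed: %#016x\n", pack(c))
	}

Packing the whole record into one uint64 is what lets atomicScavChunkData
expose it through a single atomic.Uint64, so find can load a consistent
snapshot of a chunk's state while alloc, free, and the other serialized
writers update it.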