From: Michael Anthony Knyszek Date: Mon, 4 Oct 2021 20:36:49 +0000 (+0000) Subject: runtime: don't hold the heap lock while scavenging X-Git-Tag: go1.18beta1~507 X-Git-Url: http://www.git.cypherpunks.su/?a=commitdiff_plain;h=4f543b59c5618abccf0e78a17a2aeb173c085a91;p=gostls13.git runtime: don't hold the heap lock while scavenging This change modifies the scavenger to no longer hold the heap lock while actively scavenging pages. To achieve this, the change also: * Reverses the locking behavior of the (*pageAlloc).scavenge API, to only acquire the heap lock when necessary. * Introduces a new lock on the scavenger-related fields in a pageAlloc so that access to those fields doesn't require the heap lock. There are a few places in the scavenge path, notably reservation, that require synchronization. The heap lock is far too heavy-handed for this case. * Changes the scavenger to mark pages that are actively being scavenged as allocated, and "free" them back to the page allocator the usual way. * Lifts the heap-growth scavenging code out of mheap.grow, where the heap lock is held, and into allocSpan, just after the lock is released. Releasing the lock during mheap.grow is not feasible if we want to ensure that allocation always makes progress (post-growth, another allocator could come in and take all that space, forcing the goroutine that just grew the heap to do so again). This change means that the scavenger now must do more work for each scavenge, but it is also now much more scalable. Although in theory it's not great that the scavenger now always takes the locked paths in the page allocator, it takes advantage of some properties of the allocator: * Most of the time, the scavenger will be working with one page at a time. The page allocator's locked path is optimized for this case. * On the allocation path, it doesn't need to do the find operation at all; it can go straight to setting bits for the range and updating the summary structure. Change-Id: Ie941d5e7c05dcc96476795c63fef74bcafc2a0f1 Reviewed-on: https://go-review.googlesource.com/c/go/+/353974 Trust: Michael Knyszek Reviewed-by: Michael Pratt --- diff --git a/src/runtime/export_test.go b/src/runtime/export_test.go index 5149252c83..b2e64f14ad 100644 --- a/src/runtime/export_test.go +++ b/src/runtime/export_test.go @@ -796,21 +796,17 @@ func (p *PageAlloc) Free(base, npages uintptr) { // None of the tests need any higher-level locking, so we just // take the lock internally. lock(pp.mheapLock) - pp.free(base, npages) + pp.free(base, npages, true) unlock(pp.mheapLock) }) } func (p *PageAlloc) Bounds() (ChunkIdx, ChunkIdx) { return ChunkIdx((*pageAlloc)(p).start), ChunkIdx((*pageAlloc)(p).end) } -func (p *PageAlloc) Scavenge(nbytes uintptr, mayUnlock bool) (r uintptr) { +func (p *PageAlloc) Scavenge(nbytes uintptr) (r uintptr) { pp := (*pageAlloc)(p) systemstack(func() { - // None of the tests need any higher-level locking, so we just - // take the lock internally.
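To keep the locking story from the commit message in mind while reading the hunks below, here is a stand-alone sketch of the new discipline. It is not the runtime's code: sync.Mutex stands in for the runtime mutexes, and toyScavenger, reserveWork, scavengeCandidate and unreserveWork are invented stand-ins for the pageAlloc scavenger state, scavengeReserve, scavengeOne and scavengeUnreserve.

    package main

    import "sync"

    // toyScavenger mirrors the new split: per-generation scavenger state is
    // guarded by its own lock (scav.lock in the patch), while the heap lock
    // is only taken briefly, per candidate, inside scavengeCandidate.
    type toyScavenger struct {
        heapLock sync.Mutex // stand-in for mheap_.lock
        scavLock sync.Mutex // stand-in for pageAlloc.scav.lock
        work     []uintptr  // stand-in for the scav.inUse address ranges
        gen      uint32
    }

    // reserveWork grabs a piece of work under the scavenger lock only.
    func (s *toyScavenger) reserveWork() (w uintptr, gen uint32, ok bool) {
        s.scavLock.Lock()
        defer s.scavLock.Unlock()
        if len(s.work) == 0 {
            return 0, s.gen, false
        }
        w = s.work[len(s.work)-1]
        s.work = s.work[:len(s.work)-1]
        return w, s.gen, true
    }

    // scavengeCandidate takes the heap lock only around the candidate itself.
    func (s *toyScavenger) scavengeCandidate(w uintptr) {
        s.heapLock.Lock()
        // verify the candidate and mark its pages allocated
        s.heapLock.Unlock()
        // release the memory to the OS here with no locks held, then retake
        // the heap lock to free the pages back (sketched in more detail
        // alongside scavengeRangeLocked further down).
    }

    // unreserveWork returns leftover work, but only if the generation still matches.
    func (s *toyScavenger) unreserveWork(w uintptr, gen uint32) {
        s.scavLock.Lock()
        if gen == s.gen { // a new generation invalidates stale reservations
            s.work = append(s.work, w)
        }
        s.scavLock.Unlock()
    }

    func main() {
        s := &toyScavenger{work: []uintptr{1, 2, 3}}
        if w, gen, ok := s.reserveWork(); ok {
            s.scavengeCandidate(w)
            s.unreserveWork(w, gen)
        }
    }
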
- lock(pp.mheapLock) - r = pp.scavenge(nbytes, mayUnlock) - unlock(pp.mheapLock) + r = pp.scavenge(nbytes) }) return } diff --git a/src/runtime/lockrank.go b/src/runtime/lockrank.go index 54b0f4ce9c..4a16bc0ddb 100644 --- a/src/runtime/lockrank.go +++ b/src/runtime/lockrank.go @@ -80,6 +80,7 @@ const ( // Memory-related leaf locks lockRankGlobalAlloc + lockRankPageAllocScav // Other leaf locks lockRankGFree @@ -157,7 +158,8 @@ var lockNames = []string{ lockRankMheap: "mheap", lockRankMheapSpecial: "mheapSpecial", - lockRankGlobalAlloc: "globalAlloc.mutex", + lockRankGlobalAlloc: "globalAlloc.mutex", + lockRankPageAllocScav: "pageAlloc.scav.lock", lockRankGFree: "gFree", lockRankHchanLeaf: "hchanLeaf", @@ -223,16 +225,17 @@ var lockPartialOrder [][]lockRank = [][]lockRank{ lockRankRwmutexW: {}, lockRankRwmutexR: {lockRankSysmon, lockRankRwmutexW}, - lockRankSpanSetSpine: {lockRankSysmon, lockRankScavenge, lockRankForcegc, lockRankAssistQueue, lockRankCpuprof, lockRankSweep, lockRankPollDesc, lockRankSched, lockRankAllg, lockRankAllp, lockRankTimers, lockRankItab, lockRankReflectOffs, lockRankHchan, lockRankTraceBuf, lockRankNotifyList, lockRankTraceStrings}, - lockRankGscan: {lockRankSysmon, lockRankScavenge, lockRankForcegc, lockRankSweepWaiters, lockRankAssistQueue, lockRankCpuprof, lockRankSweep, lockRankPollDesc, lockRankSched, lockRankTimers, lockRankItab, lockRankReflectOffs, lockRankHchan, lockRankTraceBuf, lockRankFin, lockRankNotifyList, lockRankTraceStrings, lockRankProf, lockRankGcBitsArenas, lockRankRoot, lockRankTrace, lockRankTraceStackTab, lockRankNetpollInit, lockRankSpanSetSpine}, - lockRankStackpool: {lockRankSysmon, lockRankScavenge, lockRankSweepWaiters, lockRankAssistQueue, lockRankCpuprof, lockRankSweep, lockRankPollDesc, lockRankSched, lockRankTimers, lockRankItab, lockRankReflectOffs, lockRankHchan, lockRankTraceBuf, lockRankFin, lockRankNotifyList, lockRankTraceStrings, lockRankProf, lockRankGcBitsArenas, lockRankRoot, lockRankTrace, lockRankTraceStackTab, lockRankNetpollInit, lockRankRwmutexR, lockRankSpanSetSpine, lockRankGscan}, - lockRankStackLarge: {lockRankSysmon, lockRankAssistQueue, lockRankSched, lockRankItab, lockRankHchan, lockRankProf, lockRankGcBitsArenas, lockRankRoot, lockRankSpanSetSpine, lockRankGscan}, - lockRankDefer: {}, - lockRankSudog: {lockRankHchan, lockRankNotifyList}, - lockRankWbufSpans: {lockRankSysmon, lockRankScavenge, lockRankSweepWaiters, lockRankAssistQueue, lockRankSweep, lockRankPollDesc, lockRankSched, lockRankAllg, lockRankTimers, lockRankItab, lockRankReflectOffs, lockRankHchan, lockRankFin, lockRankNotifyList, lockRankTraceStrings, lockRankMspanSpecial, lockRankProf, lockRankRoot, lockRankGscan, lockRankDefer, lockRankSudog}, - lockRankMheap: {lockRankSysmon, lockRankScavenge, lockRankSweepWaiters, lockRankAssistQueue, lockRankCpuprof, lockRankSweep, lockRankPollDesc, lockRankSched, lockRankAllg, lockRankAllp, lockRankTimers, lockRankItab, lockRankReflectOffs, lockRankHchan, lockRankTraceBuf, lockRankFin, lockRankNotifyList, lockRankTraceStrings, lockRankMspanSpecial, lockRankProf, lockRankGcBitsArenas, lockRankRoot, lockRankSpanSetSpine, lockRankGscan, lockRankStackpool, lockRankStackLarge, lockRankDefer, lockRankSudog, lockRankWbufSpans}, - lockRankMheapSpecial: {lockRankSysmon, lockRankScavenge, lockRankAssistQueue, lockRankCpuprof, lockRankSweep, lockRankPollDesc, lockRankSched, lockRankAllg, lockRankAllp, lockRankTimers, lockRankItab, lockRankReflectOffs, lockRankHchan, lockRankTraceBuf, lockRankNotifyList, 
lockRankTraceStrings}, - lockRankGlobalAlloc: {lockRankProf, lockRankSpanSetSpine, lockRankMheap, lockRankMheapSpecial}, + lockRankSpanSetSpine: {lockRankSysmon, lockRankScavenge, lockRankForcegc, lockRankAssistQueue, lockRankCpuprof, lockRankSweep, lockRankPollDesc, lockRankSched, lockRankAllg, lockRankAllp, lockRankTimers, lockRankItab, lockRankReflectOffs, lockRankHchan, lockRankTraceBuf, lockRankNotifyList, lockRankTraceStrings}, + lockRankGscan: {lockRankSysmon, lockRankScavenge, lockRankForcegc, lockRankSweepWaiters, lockRankAssistQueue, lockRankCpuprof, lockRankSweep, lockRankPollDesc, lockRankSched, lockRankTimers, lockRankItab, lockRankReflectOffs, lockRankHchan, lockRankTraceBuf, lockRankFin, lockRankNotifyList, lockRankTraceStrings, lockRankProf, lockRankGcBitsArenas, lockRankRoot, lockRankTrace, lockRankTraceStackTab, lockRankNetpollInit, lockRankSpanSetSpine}, + lockRankStackpool: {lockRankSysmon, lockRankScavenge, lockRankSweepWaiters, lockRankAssistQueue, lockRankCpuprof, lockRankSweep, lockRankPollDesc, lockRankSched, lockRankTimers, lockRankItab, lockRankReflectOffs, lockRankHchan, lockRankTraceBuf, lockRankFin, lockRankNotifyList, lockRankTraceStrings, lockRankProf, lockRankGcBitsArenas, lockRankRoot, lockRankTrace, lockRankTraceStackTab, lockRankNetpollInit, lockRankRwmutexR, lockRankSpanSetSpine, lockRankGscan}, + lockRankStackLarge: {lockRankSysmon, lockRankAssistQueue, lockRankSched, lockRankItab, lockRankHchan, lockRankProf, lockRankGcBitsArenas, lockRankRoot, lockRankSpanSetSpine, lockRankGscan}, + lockRankDefer: {}, + lockRankSudog: {lockRankHchan, lockRankNotifyList}, + lockRankWbufSpans: {lockRankSysmon, lockRankScavenge, lockRankSweepWaiters, lockRankAssistQueue, lockRankSweep, lockRankPollDesc, lockRankSched, lockRankAllg, lockRankTimers, lockRankItab, lockRankReflectOffs, lockRankHchan, lockRankFin, lockRankNotifyList, lockRankTraceStrings, lockRankMspanSpecial, lockRankProf, lockRankRoot, lockRankGscan, lockRankDefer, lockRankSudog}, + lockRankMheap: {lockRankSysmon, lockRankScavenge, lockRankSweepWaiters, lockRankAssistQueue, lockRankCpuprof, lockRankSweep, lockRankPollDesc, lockRankSched, lockRankAllg, lockRankAllp, lockRankTimers, lockRankItab, lockRankReflectOffs, lockRankHchan, lockRankTraceBuf, lockRankFin, lockRankNotifyList, lockRankTraceStrings, lockRankMspanSpecial, lockRankProf, lockRankGcBitsArenas, lockRankRoot, lockRankSpanSetSpine, lockRankGscan, lockRankStackpool, lockRankStackLarge, lockRankDefer, lockRankSudog, lockRankWbufSpans}, + lockRankMheapSpecial: {lockRankSysmon, lockRankScavenge, lockRankAssistQueue, lockRankCpuprof, lockRankSweep, lockRankPollDesc, lockRankSched, lockRankAllg, lockRankAllp, lockRankTimers, lockRankItab, lockRankReflectOffs, lockRankHchan, lockRankTraceBuf, lockRankNotifyList, lockRankTraceStrings}, + lockRankGlobalAlloc: {lockRankProf, lockRankSpanSetSpine, lockRankMheap, lockRankMheapSpecial}, + lockRankPageAllocScav: {lockRankMheap}, lockRankGFree: {lockRankSched}, lockRankHchanLeaf: {lockRankGscan, lockRankHchanLeaf}, diff --git a/src/runtime/mgcscavenge.go b/src/runtime/mgcscavenge.go index 4edeb8739e..72ec81e5e3 100644 --- a/src/runtime/mgcscavenge.go +++ b/src/runtime/mgcscavenge.go @@ -289,30 +289,17 @@ func bgscavenge(c chan int) { for { released := uintptr(0) - - // Time in scavenging critical section. crit := float64(0) - // Run on the system stack since we grab the heap lock, - // and a stack growth with the heap lock means a deadlock. 
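The new lockRankPageAllocScav entry above lists lockRankMheap as the only lock that may already be held when pageAlloc.scav.lock is acquired, which matches what scavengeStartGen does further down; the patch never takes the two locks in the opposite order. A minimal stand-alone illustration of that ordering, with sync.Mutex standing in for the runtime mutexes:

    package main

    import "sync"

    var (
        heapLock sync.Mutex // stand-in for mheap_.lock (lockRankMheap)
        scavLock sync.Mutex // stand-in for pageAlloc.scav.lock (lockRankPageAllocScav)
    )

    // ok: heap lock first, then the scavenger lock, as scavengeStartGen does.
    func startGenOrder() {
        heapLock.Lock()
        scavLock.Lock()
        // reset per-generation scavenger state here
        scavLock.Unlock()
        heapLock.Unlock()
    }

    // Acquiring heapLock while holding scavLock would invert the new rank,
    // and the patch is careful never to do that.

    func main() { startGenOrder() }
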
- systemstack(func() { - lock(&mheap_.lock) - - // If background scavenging is disabled or if there's no work to do just park. - retained, goal := heapRetained(), atomic.Load64(&mheap_.scavengeGoal) - if retained <= goal { - unlock(&mheap_.lock) - return - } - + // If background scavenging is disabled or if there's no work to do just park. + retained, goal := heapRetained(), atomic.Load64(&mheap_.scavengeGoal) + if retained > goal { // Scavenge one page, and measure the amount of time spent scavenging. start := nanotime() - released = mheap_.pages.scavenge(physPageSize, true) - mheap_.pages.scav.released += released + released = mheap_.pages.scavenge(physPageSize) + atomic.Xadduintptr(&mheap_.pages.scav.released, released) crit = float64(nanotime() - start) - - unlock(&mheap_.lock) - }) + } if released == 0 { lock(&scavenge.lock) @@ -395,16 +382,7 @@ func bgscavenge(c chan int) { // back to the top of the heap. // // Returns the amount of memory scavenged in bytes. -// -// p.mheapLock must be held, but may be temporarily released if -// mayUnlock == true. -// -// Must run on the system stack because p.mheapLock must be held. -// -//go:systemstack -func (p *pageAlloc) scavenge(nbytes uintptr, mayUnlock bool) uintptr { - assertLockHeld(p.mheapLock) - +func (p *pageAlloc) scavenge(nbytes uintptr) uintptr { var ( addrs addrRange gen uint32 @@ -416,9 +394,11 @@ func (p *pageAlloc) scavenge(nbytes uintptr, mayUnlock bool) uintptr { break } } - r, a := p.scavengeOne(addrs, nbytes-released, mayUnlock) - released += r - addrs = a + systemstack(func() { + r, a := p.scavengeOne(addrs, nbytes-released) + released += r + addrs = a + }) } // Only unreserve the space which hasn't been scavenged or searched // to ensure we always make progress. @@ -456,8 +436,9 @@ func printScavTrace(gen uint32, released uintptr, forced bool) { func (p *pageAlloc) scavengeStartGen() { assertLockHeld(p.mheapLock) + lock(&p.scav.lock) if debug.scavtrace > 0 { - printScavTrace(p.scav.gen, p.scav.released, false) + printScavTrace(p.scav.gen, atomic.Loaduintptr(&p.scav.released), false) } p.inUse.cloneInto(&p.scav.inUse) @@ -487,9 +468,10 @@ func (p *pageAlloc) scavengeStartGen() { // arena in size, so virtually every heap has the scavenger on. p.scav.reservationBytes = alignUp(p.inUse.totalBytes, pallocChunkBytes) / scavengeReservationShards p.scav.gen++ - p.scav.released = 0 + atomic.Storeuintptr(&p.scav.released, 0) p.scav.freeHWM = minOffAddr p.scav.scavLWM = maxOffAddr + unlock(&p.scav.lock) } // scavengeReserve reserves a contiguous range of the address space @@ -498,14 +480,9 @@ func (p *pageAlloc) scavengeStartGen() { // first. // // Returns the reserved range and the scavenge generation number for it. -// -// p.mheapLock must be held. -// -// Must run on the system stack because p.mheapLock must be held. -// -//go:systemstack func (p *pageAlloc) scavengeReserve() (addrRange, uint32) { - assertLockHeld(p.mheapLock) + lock(&p.scav.lock) + gen := p.scav.gen // Start by reserving the minimum. r := p.scav.inUse.removeLast(p.scav.reservationBytes) @@ -513,7 +490,8 @@ func (p *pageAlloc) scavengeReserve() (addrRange, uint32) { // Return early if the size is zero; we don't want to use // the bogus address below. if r.size() == 0 { - return r, p.scav.gen + unlock(&p.scav.lock) + return r, gen } // The scavenger requires that base be aligned to a @@ -524,28 +502,26 @@ func (p *pageAlloc) scavengeReserve() (addrRange, uint32) { // Remove from inUse however much extra we just pulled out. 
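With the hunk above, the background scavenger makes its pacing decision without the heap lock: it compares retained memory against the atomically loaded goal and, if over it, scavenges roughly one physical page and accumulates the result in scav.released with an atomic add. A small stand-alone sketch of that pattern; retainedBytes, scavengeGoal and scavengeOnePage are illustrative names, not the runtime's:

    package main

    import (
        "fmt"
        "sync/atomic"
    )

    var (
        retainedBytes uint64 = 64 << 20 // pretend 64 MiB is still retained
        scavengeGoal  uint64 = 32 << 20 // pacing target set by the GC
        releasedBytes uintptr           // per-generation counter, updated atomically
    )

    // scavengeOnePage stands in for mheap_.pages.scavenge(physPageSize).
    func scavengeOnePage() uintptr { return 4096 }

    func bgScavengeStep() {
        retained := atomic.LoadUint64(&retainedBytes)
        goal := atomic.LoadUint64(&scavengeGoal)
        if retained <= goal {
            return // nothing to do; the real scavenger parks here
        }
        released := scavengeOnePage()
        atomic.AddUintptr(&releasedBytes, released) // like atomic.Xadduintptr on scav.released
    }

    func main() {
        bgScavengeStep()
        fmt.Println("released this generation:", atomic.LoadUintptr(&releasedBytes))
    }
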
p.scav.inUse.removeGreaterEqual(newBase) + unlock(&p.scav.lock) + r.base = offAddr{newBase} - return r, p.scav.gen + return r, gen } // scavengeUnreserve returns an unscavenged portion of a range that was // previously reserved with scavengeReserve. -// -// p.mheapLock must be held. -// -// Must run on the system stack because p.mheapLock must be held. -// -//go:systemstack func (p *pageAlloc) scavengeUnreserve(r addrRange, gen uint32) { - assertLockHeld(p.mheapLock) - - if r.size() == 0 || gen != p.scav.gen { + if r.size() == 0 { return } if r.base.addr()%pallocChunkBytes != 0 { throw("unreserving unaligned region") } - p.scav.inUse.add(r) + lock(&p.scav.lock) + if gen == p.scav.gen { + p.scav.inUse.add(r) + } + unlock(&p.scav.lock) } // scavengeOne walks over address range work until it finds @@ -559,15 +535,10 @@ func (p *pageAlloc) scavengeUnreserve(r addrRange, gen uint32) { // // work's base address must be aligned to pallocChunkBytes. // -// p.mheapLock must be held, but may be temporarily released if -// mayUnlock == true. -// -// Must run on the system stack because p.mheapLock must be held. +// Must run on the systemstack because it acquires p.mheapLock. // //go:systemstack -func (p *pageAlloc) scavengeOne(work addrRange, max uintptr, mayUnlock bool) (uintptr, addrRange) { - assertLockHeld(p.mheapLock) - +func (p *pageAlloc) scavengeOne(work addrRange, max uintptr) (uintptr, addrRange) { // Defensively check if we've received an empty address range. // If so, just return. if work.size() == 0 { @@ -599,40 +570,12 @@ func (p *pageAlloc) scavengeOne(work addrRange, max uintptr, mayUnlock bool) (ui minPages = 1 } - // Helpers for locking and unlocking only if mayUnlock == true. - lockHeap := func() { - if mayUnlock { - lock(p.mheapLock) - } - } - unlockHeap := func() { - if mayUnlock { - unlock(p.mheapLock) - } - } - - // Fast path: check the chunk containing the top-most address in work, - // starting at that address's page index in the chunk. - // - // Note that work.end() is exclusive, so get the chunk we care about - // by subtracting 1. - maxAddr := work.limit.addr() - 1 - maxChunk := chunkIndex(maxAddr) - if p.summary[len(p.summary)-1][maxChunk].max() >= uint(minPages) { - // We only bother looking for a candidate if there at least - // minPages free pages at all. - base, npages := p.chunkOf(maxChunk).findScavengeCandidate(chunkPageIndex(maxAddr), minPages, maxPages) - - // If we found something, scavenge it and return! - if npages != 0 { - work.limit = offAddr{p.scavengeRangeLocked(maxChunk, base, npages)} - - assertLockHeld(p.mheapLock) // Must be locked on return. - return uintptr(npages) * pageSize, work - } + // Fast path: check the chunk containing the top-most address in work. + if r, w := p.scavengeOneFast(work, minPages, maxPages); r != 0 { + return r, w + } else { + work = w } - // Update the limit to reflect the fact that we checked maxChunk already. - work.limit = offAddr{chunkBase(maxChunk)} // findCandidate finds the next scavenge candidate in work optimistically. // @@ -671,37 +614,61 @@ func (p *pageAlloc) scavengeOne(work addrRange, max uintptr, mayUnlock bool) (ui // looking for any free and unscavenged page. If we think we see something, // lock and verify it! for work.size() != 0 { - unlockHeap() // Search for the candidate. candidateChunkIdx, ok := findCandidate(work) - - // Lock the heap. We need to do this now if we found a candidate or not. - // If we did, we'll verify it. If not, we need to lock before returning - // anyway. 
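scavengeUnreserve above now re-checks the generation inside the scav.lock critical section: if scavengeStartGen ran in the meantime (bumping scav.gen and rebuilding scav.inUse), the stale leftover range is dropped instead of re-added. A compact stand-alone sketch of that check, with address ranges simplified to plain integers:

    package main

    import "sync"

    type reservations struct {
        mu    sync.Mutex // stand-in for pageAlloc.scav.lock
        gen   uint32
        inUse []uint64 // stand-in for the scav.inUse address ranges
    }

    // unreserve returns leftover work only if no new generation started since
    // the matching reserve; checking gen outside mu would race with startGen.
    func (r *reservations) unreserve(leftover uint64, gen uint32) {
        r.mu.Lock()
        if gen == r.gen && leftover != 0 {
            r.inUse = append(r.inUse, leftover)
        }
        r.mu.Unlock()
    }

    // startGen invalidates all outstanding reservations by bumping gen.
    func (r *reservations) startGen(all []uint64) {
        r.mu.Lock()
        r.gen++
        r.inUse = append(r.inUse[:0], all...)
        r.mu.Unlock()
    }

    func main() {
        r := &reservations{}
        r.startGen([]uint64{0x100000})
        r.unreserve(0x180000, 0) // stale generation: dropped
        r.unreserve(0x180000, 1) // current generation: re-added
    }
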
- lockHeap() - if !ok { // We didn't find a candidate, so we're done. work.limit = work.base break } + // Lock, so we can verify what we found. + lock(p.mheapLock) + // Find, verify, and scavenge if we can. chunk := p.chunkOf(candidateChunkIdx) base, npages := chunk.findScavengeCandidate(pallocChunkPages-1, minPages, maxPages) if npages > 0 { work.limit = offAddr{p.scavengeRangeLocked(candidateChunkIdx, base, npages)} - - assertLockHeld(p.mheapLock) // Must be locked on return. + unlock(p.mheapLock) return uintptr(npages) * pageSize, work } + unlock(p.mheapLock) // We were fooled, so let's continue from where we left off. work.limit = offAddr{chunkBase(candidateChunkIdx)} } + return 0, work +} - assertLockHeld(p.mheapLock) // Must be locked on return. +// scavengeOneFast is the fast path for scavengeOne, which just checks the top +// chunk of work for some pages to scavenge. +// +// Must run on the system stack because it acquires the heap lock. +// +//go:systemstack +func (p *pageAlloc) scavengeOneFast(work addrRange, minPages, maxPages uintptr) (uintptr, addrRange) { + maxAddr := work.limit.addr() - 1 + maxChunk := chunkIndex(maxAddr) + + lock(p.mheapLock) + if p.summary[len(p.summary)-1][maxChunk].max() >= uint(minPages) { + // We only bother looking for a candidate if there at least + // minPages free pages at all. + base, npages := p.chunkOf(maxChunk).findScavengeCandidate(chunkPageIndex(maxAddr), minPages, maxPages) + + // If we found something, scavenge it and return! + if npages != 0 { + work.limit = offAddr{p.scavengeRangeLocked(maxChunk, base, npages)} + unlock(p.mheapLock) + return uintptr(npages) * pageSize, work + } + } + unlock(p.mheapLock) + + // Update the limit to reflect the fact that we checked maxChunk already. + work.limit = offAddr{chunkBase(maxChunk)} return 0, work } @@ -712,38 +679,57 @@ func (p *pageAlloc) scavengeOne(work addrRange, max uintptr, mayUnlock bool) (ui // // Returns the base address of the scavenged region. // -// p.mheapLock must be held. +// p.mheapLock must be held. Unlocks p.mheapLock but reacquires +// it before returning. Must be run on the systemstack as a result. +// +//go:systemstack func (p *pageAlloc) scavengeRangeLocked(ci chunkIdx, base, npages uint) uintptr { assertLockHeld(p.mheapLock) - p.chunkOf(ci).scavenged.setRange(base, npages) - // Compute the full address for the start of the range. addr := chunkBase(ci) + uintptr(base)*pageSize + // Mark the range we're about to scavenge as allocated, because + // we don't want any allocating goroutines to grab it while + // the scavenging is in progress. + if scav := p.allocRange(addr, uintptr(npages)); scav != 0 { + throw("double scavenge") + } + + // With that done, it's safe to unlock. + unlock(p.mheapLock) + // Update the scavenge low watermark. + lock(&p.scav.lock) if oAddr := (offAddr{addr}); oAddr.lessThan(p.scav.scavLWM) { p.scav.scavLWM = oAddr } + unlock(&p.scav.lock) - // Only perform the actual scavenging if we're not in a test. - // It's dangerous to do so otherwise. - if p.test { - return addr - } - sysUnused(unsafe.Pointer(addr), uintptr(npages)*pageSize) + if !p.test { + // Only perform the actual scavenging if we're not in a test. + // It's dangerous to do so otherwise. + sysUnused(unsafe.Pointer(addr), uintptr(npages)*pageSize) - // Update global accounting only when not in test, otherwise - // the runtime's accounting will be wrong. 
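The loop above shows the new shape of scavengeOne: search the bitmaps optimistically with no lock held, then take the heap lock only to re-verify the candidate (it may have been allocated in the meantime) and claim it if it still qualifies. A stand-alone sketch of that optimistic-search/lock-and-verify pattern; findCandidateUnlocked and verifyAndTake are invented names:

    package main

    import (
        "fmt"
        "sync"
    )

    var (
        heapLock sync.Mutex                        // stand-in for mheap_.lock
        free     = map[int]bool{7: true, 9: true} // pages believed free and unscavenged
    )

    // findCandidateUnlocked scans without the lock; its answer may be stale.
    func findCandidateUnlocked() (int, bool) {
        for page := range free {
            return page, true
        }
        return 0, false
    }

    // verifyAndTake re-checks under the heap lock and claims the page.
    func verifyAndTake(page int) bool {
        heapLock.Lock()
        defer heapLock.Unlock()
        if !free[page] { // we were fooled; someone allocated it meanwhile
            return false
        }
        delete(free, page) // mark it allocated so nobody else can grab it
        return true
    }

    func main() {
        for {
            page, ok := findCandidateUnlocked()
            if !ok {
                break
            }
            if verifyAndTake(page) {
                fmt.Println("scavenging page", page) // sysUnused would happen here, unlocked
            }
        }
    }
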
- nbytes := int64(npages) * pageSize - atomic.Xadd64(&memstats.heap_released, nbytes) + // Update global accounting only when not in test, otherwise + // the runtime's accounting will be wrong. + nbytes := int64(npages) * pageSize + atomic.Xadd64(&memstats.heap_released, nbytes) - // Update consistent accounting too. - stats := memstats.heapStats.acquire() - atomic.Xaddint64(&stats.committed, -nbytes) - atomic.Xaddint64(&stats.released, nbytes) - memstats.heapStats.release() + // Update consistent accounting too. + stats := memstats.heapStats.acquire() + atomic.Xaddint64(&stats.committed, -nbytes) + atomic.Xaddint64(&stats.released, nbytes) + memstats.heapStats.release() + } + + // Relock the heap, because now we need to make these pages + // available allocation. Free them back to the page allocator. + lock(p.mheapLock) + p.free(addr, uintptr(npages), true) + // Mark the range as scavenged. + p.chunkOf(ci).scavenged.setRange(base, npages) return addr } diff --git a/src/runtime/mgcscavenge_test.go b/src/runtime/mgcscavenge_test.go index 3b12a2e1e6..b186cad2f4 100644 --- a/src/runtime/mgcscavenge_test.go +++ b/src/runtime/mgcscavenge_test.go @@ -430,12 +430,12 @@ func TestPageAllocScavenge(t *testing.T) { } for name, v := range tests { v := v - runTest := func(t *testing.T, mayUnlock bool) { + t.Run(name, func(t *testing.T) { b := NewPageAlloc(v.beforeAlloc, v.beforeScav) defer FreePageAlloc(b) for iter, h := range v.expect { - if got := b.Scavenge(h.request, mayUnlock); got != h.expect { + if got := b.Scavenge(h.request); got != h.expect { t.Fatalf("bad scavenge #%d: want %d, got %d", iter+1, h.expect, got) } } @@ -443,12 +443,6 @@ func TestPageAllocScavenge(t *testing.T) { defer FreePageAlloc(want) checkPageAlloc(t, want, b) - } - t.Run(name, func(t *testing.T) { - runTest(t, false) - }) - t.Run(name+"MayUnlock", func(t *testing.T) { - runTest(t, true) }) } } diff --git a/src/runtime/mheap.go b/src/runtime/mheap.go index f2f6e7f4cf..ecbd0a3a49 100644 --- a/src/runtime/mheap.go +++ b/src/runtime/mheap.go @@ -80,7 +80,7 @@ type mheap struct { // access (since that may free the backing store). allspans []*mspan // all spans out there - _ uint32 // align uint64 fields on 32-bit for atomics + // _ uint32 // align uint64 fields on 32-bit for atomics // Proportional sweep // @@ -1120,6 +1120,7 @@ func (h *mheap) allocSpan(npages uintptr, typ spanAllocType, spanclass spanClass // Function-global state. gp := getg() base, scav := uintptr(0), uintptr(0) + growth := uintptr(0) // On some platforms we need to provide physical page aligned stack // allocations. Where the page size is less than the physical page @@ -1165,7 +1166,9 @@ func (h *mheap) allocSpan(npages uintptr, typ spanAllocType, spanclass spanClass // Try to acquire a base address. base, scav = h.pages.alloc(npages) if base == 0 { - if !h.grow(npages) { + var ok bool + growth, ok = h.grow(npages) + if !ok { unlock(&h.lock) return nil } @@ -1189,16 +1192,35 @@ func (h *mheap) allocSpan(npages uintptr, typ spanAllocType, spanclass spanClass // Return memory around the aligned allocation. 
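scavengeRangeLocked above is the core of the "mark as allocated, then free back" idea: the range is claimed under the heap lock, the lock is dropped for the expensive sysUnused call, and only then is the heap lock retaken to return the pages and set their scavenged bits. A stand-alone sketch of that claim/unlock/release/relock sequence; claimRange, releaseToOS and freeBack are invented names:

    package main

    import "sync"

    var heapLock sync.Mutex // stand-in for mheap_.lock

    type pages struct{ allocated map[uint64]bool }

    func (p *pages) claimRange(base uint64) { p.allocated[base] = true }  // like allocRange
    func (p *pages) freeBack(base uint64)   { delete(p.allocated, base) } // like free(..., true)
    func releaseToOS(base, npages uint64)   {}                            // like sysUnused

    func scavengeRange(p *pages, base, npages uint64) {
        heapLock.Lock()
        p.claimRange(base) // keep allocating goroutines away while we scavenge
        heapLock.Unlock()

        releaseToOS(base, npages) // the expensive part runs with no locks held

        heapLock.Lock()
        p.freeBack(base) // hand the (now scavenged) pages back to the allocator
        heapLock.Unlock()
    }

    func main() {
        p := &pages{allocated: map[uint64]bool{}}
        scavengeRange(p, 0x200000, 512)
    }
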
spaceBefore := base - allocBase if spaceBefore > 0 { - h.pages.free(allocBase, spaceBefore/pageSize) + h.pages.free(allocBase, spaceBefore/pageSize, false) } spaceAfter := (allocPages-npages)*pageSize - spaceBefore if spaceAfter > 0 { - h.pages.free(base+npages*pageSize, spaceAfter/pageSize) + h.pages.free(base+npages*pageSize, spaceAfter/pageSize, false) } } unlock(&h.lock) + if growth > 0 { + // We just caused a heap growth, so scavenge down what will soon be used. + // By scavenging inline we deal with the failure to allocate out of + // memory fragments by scavenging the memory fragments that are least + // likely to be re-used. + scavengeGoal := atomic.Load64(&h.scavengeGoal) + if retained := heapRetained(); retained+uint64(growth) > scavengeGoal { + // The scavenging algorithm requires the heap lock to be dropped so it + // can acquire it only sparingly. This is a potentially expensive operation + // so it frees up other goroutines to allocate in the meanwhile. In fact, + // they can make use of the growth we just created. + todo := growth + if overage := uintptr(retained + uint64(growth) - scavengeGoal); todo > overage { + todo = overage + } + h.pages.scavenge(todo) + } + } + HaveSpan: // At this point, both s != nil and base != 0, and the heap // lock is no longer held. Initialize the span. @@ -1311,10 +1333,10 @@ HaveSpan: } // Try to add at least npage pages of memory to the heap, -// returning whether it worked. +// returning how much the heap grew by and whether it worked. // // h.lock must be held. -func (h *mheap) grow(npage uintptr) bool { +func (h *mheap) grow(npage uintptr) (uintptr, bool) { assertLockHeld(&h.lock) // We must grow the heap in whole palloc chunks. @@ -1336,7 +1358,7 @@ func (h *mheap) grow(npage uintptr) bool { av, asize := h.sysAlloc(ask) if av == nil { print("runtime: out of memory: cannot allocate ", ask, "-byte block (", memstats.heap_sys, " in use)\n") - return false + return 0, false } if uintptr(av) == h.curArena.end { @@ -1396,20 +1418,7 @@ func (h *mheap) grow(npage uintptr) bool { // space ready for allocation. h.pages.grow(v, nBase-v) totalGrowth += nBase - v - - // We just caused a heap growth, so scavenge down what will soon be used. - // By scavenging inline we deal with the failure to allocate out of - // memory fragments by scavenging the memory fragments that are least - // likely to be re-used. - scavengeGoal := atomic.Load64(&h.scavengeGoal) - if retained := heapRetained(); retained+uint64(totalGrowth) > scavengeGoal { - todo := totalGrowth - if overage := uintptr(retained + uint64(totalGrowth) - scavengeGoal); todo > overage { - todo = overage - } - h.pages.scavenge(todo, false) - } - return true + return totalGrowth, true } // Free the span back into the heap. @@ -1499,7 +1508,7 @@ func (h *mheap) freeSpanLocked(s *mspan, typ spanAllocType) { memstats.heapStats.release() // Mark the space as free. - h.pages.free(s.base(), s.npages) + h.pages.free(s.base(), s.npages, false) // Free the span structure. We no longer have a use for it. s.state.set(mSpanDead) @@ -1515,13 +1524,19 @@ func (h *mheap) scavengeAll() { // the mheap API. gp := getg() gp.m.mallocing++ + lock(&h.lock) // Start a new scavenge generation so we have a chance to walk // over the whole heap. 
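The new post-growth block in allocSpan above scavenges only down to the goal: it releases min(growth, retained+growth-scavengeGoal) bytes. A small worked example of that computation, with made-up numbers:

    package main

    import "fmt"

    func postGrowthScavenge(retained, growth, goal uint64) uint64 {
        if retained+growth <= goal {
            return 0 // still under the goal; nothing to release
        }
        todo := growth
        if overage := retained + growth - goal; todo > overage {
            todo = overage // don't scavenge more than the amount we're over by
        }
        return todo
    }

    func main() {
        // e.g. 96 MiB retained, an 8 MiB growth, and a 100 MiB goal:
        // the overage is 4 MiB, so only 4 MiB of the growth is scavenged.
        fmt.Println(postGrowthScavenge(96<<20, 8<<20, 100<<20)>>20, "MiB")
    }
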
h.pages.scavengeStartGen() - released := h.pages.scavenge(^uintptr(0), false) - gen := h.pages.scav.gen unlock(&h.lock) + + released := h.pages.scavenge(^uintptr(0)) + + lock(&h.pages.scav.lock) + gen := h.pages.scav.gen + unlock(&h.pages.scav.lock) + gp.m.mallocing-- if debug.scavtrace > 0 { diff --git a/src/runtime/mpagealloc.go b/src/runtime/mpagealloc.go index 862882cd82..2725e3b7c7 100644 --- a/src/runtime/mpagealloc.go +++ b/src/runtime/mpagealloc.go @@ -226,6 +226,8 @@ type pageAlloc struct { // are currently available. Otherwise one might iterate over unused // ranges. // + // Protected by mheapLock. + // // TODO(mknyszek): Consider changing the definition of the bitmap // such that 1 means free and 0 means in-use so that summaries and // the bitmaps align better on zero-values. @@ -261,29 +263,41 @@ type pageAlloc struct { inUse addrRanges // scav stores the scavenger state. - // - // All fields are protected by mheapLock. scav struct { + lock mutex + // inUse is a slice of ranges of address space which have not // yet been looked at by the scavenger. + // + // Protected by lock. inUse addrRanges // gen is the scavenge generation number. + // + // Protected by lock. gen uint32 // reservationBytes is how large of a reservation should be made // in bytes of address space for each scavenge iteration. + // + // Protected by lock. reservationBytes uintptr // released is the amount of memory released this generation. + // + // Updated atomically. released uintptr // scavLWM is the lowest (offset) address that the scavenger reached this // scavenge generation. + // + // Protected by lock. scavLWM offAddr // freeHWM is the highest (offset) address of a page that was freed to // the page allocator this scavenge generation. + // + // Protected by mheapLock. freeHWM offAddr } @@ -864,17 +878,19 @@ Found: // Must run on the system stack because p.mheapLock must be held. // //go:systemstack -func (p *pageAlloc) free(base, npages uintptr) { +func (p *pageAlloc) free(base, npages uintptr, scavenged bool) { assertLockHeld(p.mheapLock) // If we're freeing pages below the p.searchAddr, update searchAddr. if b := (offAddr{base}); b.lessThan(p.searchAddr) { p.searchAddr = b } - // Update the free high watermark for the scavenger. limit := base + npages*pageSize - 1 - if offLimit := (offAddr{limit}); p.scav.freeHWM.lessThan(offLimit) { - p.scav.freeHWM = offLimit + if !scavenged { + // Update the free high watermark for the scavenger. + if offLimit := (offAddr{limit}); p.scav.freeHWM.lessThan(offLimit) { + p.scav.freeHWM = offLimit + } } if npages == 1 { // Fast path: we're clearing a single bit, and we know exactly
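The final hunk gives (*pageAlloc).free a scavenged flag: frees coming from the scavenger itself, which is just returning pages it had temporarily marked allocated, do not advance freeHWM, presumably so the scavenger does not mistake its own activity for newly freed work. A compact stand-alone sketch of that distinction; the allocator type and field layout here are invented:

    package main

    import "fmt"

    type allocator struct {
        freeHWM uint64 // highest address freed this generation by ordinary frees
    }

    // free returns [base, base+n) to the allocator. Only ordinary frees, not
    // the scavenger handing back pages it claimed, move the watermark.
    func (a *allocator) free(base, n uint64, scavenged bool) {
        limit := base + n - 1
        if !scavenged && limit > a.freeHWM {
            a.freeHWM = limit
        }
        // ... clear the allocation bits and update summaries here ...
    }

    func main() {
        a := &allocator{}
        a.free(0x1000, 0x1000, false) // normal free: watermark moves
        a.free(0x8000, 0x1000, true)  // scavenger's free-back: watermark unchanged
        fmt.Printf("freeHWM = %#x\n", a.freeHWM)
    }
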