From e48e4b4cbbe270bc43e4209dce10c9225254aa64 Mon Sep 17 00:00:00 2001
From: Michael Anthony Knyszek <mknyszek@google.com>
Date: Mon, 4 Oct 2021 21:44:06 +0000
Subject: [PATCH] runtime: use a controller to control the scavenge rate

Currently the scavenge rate is determined by a bunch of ad-hoc
mechanisms. Just use a controller instead, now that we have one.

To facilitate this, the scavenger now attempts to scavenge for at least
1 ms at a time, because any less and the timer system is too imprecise to
give useful feedback to the controller. Also increase the amount that we
scavenge at once, to try to reduce the overheads involved (at the
expense of a little bit of latency).

This change also modifies the controller to accept an update period,
because it's useful to allow that to be variable.

Change-Id: I8a15b2355d0a7c6cbac68c957082d5819618f7d7
Reviewed-on: https://go-review.googlesource.com/c/go/+/353975
Trust: Michael Knyszek <mknyszek@google.com>
Reviewed-by: Michael Pratt <mpratt@google.com>
---
 src/runtime/mgcpacer.go    |  15 ++--
 src/runtime/mgcscavenge.go | 159 +++++++++++++++++++++----------------
 2 files changed, 96 insertions(+), 78 deletions(-)

diff --git a/src/runtime/mgcpacer.go b/src/runtime/mgcpacer.go
index 230e78b000..5b699cb298 100644
--- a/src/runtime/mgcpacer.go
+++ b/src/runtime/mgcpacer.go
@@ -349,9 +349,6 @@ func (c *gcControllerState) init(gcPercent int32) {
 			kp: 0.9,
 			ti: 4.0,
 
-			// An update is done once per GC cycle.
-			period: 1,
-
 			// Set a high reset time in GC cycles.
 			// This is inversely proportional to the rate at which we
 			// accumulate error from clipping. By making this very high
@@ -677,8 +674,9 @@ func (c *gcControllerState) endCycle(now int64, procs int, userForced bool) floa
 			(float64(scanWork) * (1 - utilization))
 
 		// Update cons/mark controller.
+		// Period for this is 1 GC cycle.
 		oldConsMark := c.consMark
-		c.consMark = c.consMarkController.next(c.consMark, currentConsMark)
+		c.consMark = c.consMarkController.next(c.consMark, currentConsMark, 1.0)
 
 		if debug.gcpacertrace > 0 {
 			printlock()
@@ -1259,10 +1257,7 @@ func readGOGC() int32 {
 type piController struct {
 	kp float64 // Proportional constant.
 	ti float64 // Integral time constant.
-	tt float64 // Reset time in GC cyles.
-
-	// Period in GC cycles between updates.
-	period float64
+	tt float64 // Reset time.
 
 	min, max float64 // Output boundaries.
 
@@ -1271,7 +1266,7 @@ type piController struct {
 	errIntegral float64 // Integral of the error from t=0 to now.
 }
 
-func (c *piController) next(input, setpoint float64) float64 {
+func (c *piController) next(input, setpoint, period float64) float64 {
 	// Compute the raw output value.
 	prop := c.kp * (setpoint - input)
 	rawOutput := prop + c.errIntegral
@@ -1286,7 +1281,7 @@ func (c *piController) next(input, setpoint float64) float64 {
 
 	// Update the controller's state.
 	if c.ti != 0 && c.tt != 0 {
-		c.errIntegral += (c.kp*c.period/c.ti)*(setpoint-input) + (c.period/c.tt)*(output-rawOutput)
+		c.errIntegral += (c.kp*period/c.ti)*(setpoint-input) + (period/c.tt)*(output-rawOutput)
 	}
 	return output
 }
diff --git a/src/runtime/mgcscavenge.go b/src/runtime/mgcscavenge.go
index 72ec81e5e3..a2a88e94d2 100644
--- a/src/runtime/mgcscavenge.go
+++ b/src/runtime/mgcscavenge.go
@@ -270,35 +270,85 @@ func bgscavenge(c chan int) {
 	c <- 1
 	goparkunlock(&scavenge.lock, waitReasonGCScavengeWait, traceEvGoBlock, 1)
 
-	// Exponentially-weighted moving average of the fraction of time this
-	// goroutine spends scavenging (that is, percent of a single CPU).
-	// It represents a measure of scheduling overheads which might extend
-	// the sleep or the critical time beyond what's expected. Assume no
-	// overhead to begin with.
+	// idealFraction is the ideal % of overall application CPU time that we
+	// spend scavenging.
 	//
-	// TODO(mknyszek): Consider making this based on total CPU time of the
-	// application (i.e. scavengePercent * GOMAXPROCS). This isn't really
-	// feasible now because the scavenger acquires the heap lock over the
-	// scavenging operation, which means scavenging effectively blocks
-	// allocators and isn't scalable. However, given a scalable allocator,
-	// it makes sense to also make the scavenger scale with it; if you're
-	// allocating more frequently, then presumably you're also generating
-	// more work for the scavenger.
-	const idealFraction = scavengePercent / 100.0
-	scavengeEWMA := float64(idealFraction)
-
+	// TODO(mknyszek): Currently this is percent of one CPU (hence the division
+	// by gomaxprocs), but ideally this should be 1% of overall CPU time.
+	// Given a scalable memory allocator, it makes sense that the scavenger
+	// should scale with it; if you're allocating more frequently, then presumably
+	// you're also generating more work from the scavenger.
+	idealFraction := float64(scavengePercent) / 100.0 / float64(gomaxprocs)
+
+	// Input: fraction of CPU time used.
+	// Setpoint: idealFraction.
+	// Output: ratio of critical time to sleep time (determines sleep time).
+	//
+	// The output of this controller is somewhat indirect to what we actually
+	// want to achieve: how much time to sleep for. The reason for this definition
+	// is to ensure that the controller's outputs have a direct relationship with
+	// its inputs (as opposed to an inverse relationship), making it somewhat
+	// easier to reason about for tuning purposes.
+	critSleepController := piController{
+		// Tuned loosely via Ziegler-Nichols process.
+		kp: 0.3375,
+		ti: 3.2e6,
+		tt: 1e9, // 1 second reset time.
+
+		// These ranges seem wide, but we want to give the controller plenty of
+		// room to hunt for the optimal value.
+		min: 0.001,  // 1:1000
+		max: 1000.0, // 1000:1
+	}
+	// It doesn't really matter what value we start at, but we can't be zero, because
+	// that'll cause divide-by-zero issues.
+	critSleepRatio := 0.001
 	for {
 		released := uintptr(0)
 		crit := float64(0)
 
-		// If background scavenging is disabled or if there's no work to do just park.
-		retained, goal := heapRetained(), atomic.Load64(&mheap_.scavengeGoal)
-		if retained > goal {
-			// Scavenge one page, and measure the amount of time spent scavenging.
+		// Spend at least 1 ms scavenging, otherwise the corresponding
+		// sleep time to maintain our desired utilization is too low to
+		// be reliable.
+		const minCritTime = 1e6
+		for crit < minCritTime {
+			// If background scavenging is disabled or if there's no work to do just park.
+			retained, goal := heapRetained(), atomic.Load64(&mheap_.scavengeGoal)
+			if retained <= goal {
+				break
+			}
+
+			// scavengeQuantum is the amount of memory we try to scavenge
+			// in one go. A smaller value means the scavenger is more responsive
+			// to the scheduler in case of e.g. preemption. A larger value means
+			// that the overheads of scavenging are better amortized, so better
+			// scavenging throughput.
+			//
+			// The current value is chosen assuming a cost of ~10Âµs/physical page
+			// (this is somewhat pessimistic), which implies a worst-case latency of
+			// about 160Âµs for 4 KiB physical pages. The current value is biased
+			// toward latency over throughput.
+			const scavengeQuantum = 64 << 10
+
+			// Accumulate the amount of time spent scavenging.
 			start := nanotime()
-			released = mheap_.pages.scavenge(physPageSize)
+			released = mheap_.pages.scavenge(scavengeQuantum)
 			atomic.Xadduintptr(&mheap_.pages.scav.released, released)
-			crit = float64(nanotime() - start)
+			end := nanotime()
+
+			// On some platforms we may see end >= start if the time it takes to scavenge
+			// memory is less than the minimum granularity of its clock (e.g. Windows) or
+			// due to clock bugs.
+			//
+			// In this case, just assume scavenging takes 10 Âµs per regular physical page
+			// (determined empirically), and conservatively ignore the impact of huge pages
+			// on timing.
+			const approxCritNSPerPhysicalPage = 10e3
+			if end <= start {
+				crit += approxCritNSPerPhysicalPage * float64(released/physPageSize)
+			} else {
+				crit += float64(end - start)
+			}
 		}
 
 		if released == 0 {
@@ -316,18 +366,13 @@ func bgscavenge(c chan int) {
 			throw("released less than one physical page of memory")
 		}
 
-		// On some platforms we may see crit as zero if the time it takes to scavenge
-		// memory is less than the minimum granularity of its clock (e.g. Windows).
-		// In this case, just assume scavenging takes 10 Âµs per regular physical page
-		// (determined empirically), and conservatively ignore the impact of huge pages
-		// on timing.
-		//
-		// We shouldn't ever see a crit value less than zero unless there's a bug of
-		// some kind, either on our side or in the platform we're running on, but be
-		// defensive in that case as well.
-		const approxCritNSPerPhysicalPage = 10e3
-		if crit <= 0 {
-			crit = approxCritNSPerPhysicalPage * float64(released/physPageSize)
+		if crit < minCritTime {
+			// This means there wasn't enough work to actually fill up minCritTime.
+			// That's fine; we shouldn't try to do anything with this information
+			// because it's going result in a short enough sleep request that things
+			// will get messy. Just assume we did at least this much work.
+			// All this means is that we'll sleep longer than we otherwise would have.
+			crit = minCritTime
 		}
 
 		// Multiply the critical time by 1 + the ratio of the costs of using
@@ -338,41 +383,19 @@ func bgscavenge(c chan int) {
 		// because of the additional overheads of using scavenged memory.
 		crit *= 1 + scavengeCostRatio
 
-		// If we spent more than 10 ms (for example, if the OS scheduled us away, or someone
-		// put their machine to sleep) in the critical section, bound the time we use to
-		// calculate at 10 ms to avoid letting the sleep time get arbitrarily high.
-		const maxCrit = 10e6
-		if crit > maxCrit {
-			crit = maxCrit
-		}
+		// Go to sleep for our current sleepNS.
+		slept := scavengeSleep(int64(crit / critSleepRatio))
 
-		// Compute the amount of time to sleep, assuming we want to use at most
-		// scavengePercent of CPU time. Take into account scheduling overheads
-		// that may extend the length of our sleep by multiplying by how far
-		// off we are from the ideal ratio. For example, if we're sleeping too
-		// much, then scavengeEMWA < idealFraction, so we'll adjust the sleep time
-		// down.
-		adjust := scavengeEWMA / idealFraction
-		sleepTime := int64(adjust * crit / (scavengePercent / 100.0))
-
-		// Go to sleep.
-		slept := scavengeSleep(sleepTime)
-
-		// Compute the new ratio.
-		fraction := crit / (crit + float64(slept))
-
-		// Set a lower bound on the fraction.
-		// Due to OS-related anomalies we may "sleep" for an inordinate amount
-		// of time. Let's avoid letting the ratio get out of hand by bounding
-		// the sleep time we use in our EWMA.
-		const minFraction = 1.0 / 1000.0
-		if fraction < minFraction {
-			fraction = minFraction
-		}
-
-		// Update scavengeEWMA by merging in the new crit/slept ratio.
-		const alpha = 0.5
-		scavengeEWMA = alpha*fraction + (1-alpha)*scavengeEWMA
+		// Calculate the CPU time spent.
+		//
+		// This may be slightly inaccurate with respect to GOMAXPROCS, but we're
+		// recomputing this often enough relative to GOMAXPROCS changes in general
+		// (it only changes when the world is stopped, and not during a GC) that
+		// that small inaccuracy is in the noise.
+		cpuFraction := float64(crit) / ((float64(slept) + crit) * float64(gomaxprocs))
+
+		// Update the critSleepRatio, adjusting until we reach our ideal fraction.
+		critSleepRatio = critSleepController.next(cpuFraction, idealFraction, float64(slept)+crit)
 	}
 }
 
-- 
2.51.0