runtime: make CPU limiter assist time much less error-prone

author Michael Anthony Knyszek <mknyszek@google.com>

Fri, 6 May 2022 20:17:52 +0000 (20:17 +0000)

committer Michael Knyszek <mknyszek@google.com>

Fri, 13 May 2022 16:02:20 +0000 (16:02 +0000)
author Michael Anthony Knyszek <mknyszek@google.com>
Fri, 6 May 2022 20:17:52 +0000 (20:17 +0000)
committer Michael Knyszek <mknyszek@google.com>
Fri, 13 May 2022 16:02:20 +0000 (16:02 +0000)
diff --git a/src/runtime/export_test.go b/src/runtime/export_test.go

index 230ed76c814b9184facbb7c3aca32854fec94849..7196627f81cd5d6d01fe1989eab76e9015532b91 100644 (file)
--- a/src/runtime/export_test.go
+++ b/src/runtime/export_test.go
@@ -1439,16 +1439,20 @@ func (l *GCCPULimiter) NeedUpdate(now int64) bool {
         return l.limiter.needUpdate(now)
  }
  
-func (l *GCCPULimiter) StartGCTransition(enableGC bool, totalAssistTime, now int64) {
-       l.limiter.startGCTransition(enableGC, totalAssistTime, now)
+func (l *GCCPULimiter) StartGCTransition(enableGC bool, now int64) {
+       l.limiter.startGCTransition(enableGC, now)
  }
  
  func (l *GCCPULimiter) FinishGCTransition(now int64) {
         l.limiter.finishGCTransition(now)
  }
  
-func (l *GCCPULimiter) Update(totalAssistTime int64, now int64) {
-       l.limiter.update(totalAssistTime, now)
+func (l *GCCPULimiter) Update(now int64) {
+       l.limiter.update(now)
+}
+
+func (l *GCCPULimiter) AddAssistTime(t int64) {
+       l.limiter.addAssistTime(t)
  }
  
  func (l *GCCPULimiter) ResetCapacity(now int64, nprocs int32) {
diff --git a/src/runtime/mgc.go b/src/runtime/mgc.go

index b0c6b1928e6dd2098cabf5e6c8ce8ba9ab74d44a..34043c64329eab0c25ce4ab7d39bd2a862219654 100644 (file)
--- a/src/runtime/mgc.go
+++ b/src/runtime/mgc.go
@@ -677,7 +677,7 @@ func gcStart(trigger gcTrigger) {
         gcController.startCycle(now, int(gomaxprocs), trigger)
  
         // Notify the CPU limiter that assists may begin.
-       gcCPULimiter.startGCTransition(true, 0, now)
+       gcCPULimiter.startGCTransition(true, now)
  
         // In STW mode, disable scheduling of user Gs. This may also
         // disable scheduling of this goroutine, so it may block as
@@ -891,8 +891,8 @@ top:
         // this before waking blocked assists.
         atomic.Store(&gcBlackenEnabled, 0)
  
-       // Notify the CPU limiter that assists will now cease.
-       gcCPULimiter.startGCTransition(false, gcController.assistTime.Load(), now)
+       // Notify the CPU limiter that GC assists will now cease.
+       gcCPULimiter.startGCTransition(false, now)
  
         // Wake all blocked assists. These will run when we
         // start the world again.
@@ -1017,6 +1017,12 @@ func gcMarkTermination() {
         totalCpu := sched.totaltime + (now-sched.procresizetime)*int64(gomaxprocs)
         memstats.gc_cpu_fraction = float64(work.totaltime) / float64(totalCpu)
  
+       // Reset assist time stat.
+       //
+       // Do this now, instead of at the start of the next GC cycle, because
+       // these two may keep accumulating even if the GC is not active.
+       mheap_.pages.scav.assistTime.Store(0)
+
         // Reset sweep state.
         sweep.nbgsweep = 0
         sweep.npausesweep = 0
diff --git a/src/runtime/mgclimit.go b/src/runtime/mgclimit.go

index 1330ce63c08157504dfb5f1d2282e0c2787b40ad..b930af3340eb95438a339077da1104c84a70a6a1 100644 (file)
--- a/src/runtime/mgclimit.go
+++ b/src/runtime/mgclimit.go
@@ -55,11 +55,10 @@ type gcCPULimiterState struct {
         // the mark and sweep phases.
         transitioning bool
  
-       // lastTotalAssistTime is the last value of a monotonically increasing
-       // count of GC assist time, like gcController.assistTime.
-       lastTotalAssistTime int64
+       _ uint32 // Align assistTimePool and lastUpdate on 32-bit platforms.
  
-       _ uint32 // Align lastUpdate on 32-bit platforms.
+       // assistTimePool is the accumulated assist time since the last update.
+       assistTimePool atomic.Int64
  
         // lastUpdate is the nanotime timestamp of the last time update was called.
         //
@@ -81,15 +80,13 @@ func (l *gcCPULimiterState) limiting() bool {
         return l.enabled.Load()
  }
  
-// startGCTransition notifies the limiter of a GC transition. totalAssistTime
-// is the same as described for update. now must be the start of the STW pause
-// for the GC transition.
+// startGCTransition notifies the limiter of a GC transition.
  //
  // This call takes ownership of the limiter and disables all other means of
  // updating the limiter. Release ownership by calling finishGCTransition.
  //
  // It is safe to call concurrently with other operations.
-func (l *gcCPULimiterState) startGCTransition(enableGC bool, totalAssistTime, now int64) {
+func (l *gcCPULimiterState) startGCTransition(enableGC bool, now int64) {
         if !l.tryLock() {
                 // This must happen during a STW, so we can't fail to acquire the lock.
                 // If we did, something went wrong. Throw.
@@ -99,10 +96,7 @@ func (l *gcCPULimiterState) startGCTransition(enableGC bool, totalAssistTime, no
                 throw("transitioning GC to the same state as before?")
         }
         // Flush whatever was left between the last update and now.
-       l.updateLocked(totalAssistTime, now)
-       if enableGC && totalAssistTime != 0 {
-               throw("assist time must be zero on entry to a GC cycle")
-       }
+       l.updateLocked(now)
         l.gcEnabled = enableGC
         l.transitioning = true
         // N.B. finishGCTransition releases the lock.
@@ -127,8 +121,6 @@ func (l *gcCPULimiterState) finishGCTransition(now int64) {
         }
         l.lastUpdate.Store(now)
         l.transitioning = false
-       // Reset lastTotalAssistTime for the next GC cycle.
-       l.lastTotalAssistTime = 0
         l.unlock()
  }
  
@@ -142,12 +134,17 @@ func (l *gcCPULimiterState) needUpdate(now int64) bool {
         return now-l.lastUpdate.Load() > gcCPULimiterUpdatePeriod
  }
  
-// update updates the bucket given runtime-specific information. totalAssistTime must
-// be a value that increases monotonically throughout the GC cycle, and is reset
-// at the start of a new mark phase. now is the current monotonic time in nanoseconds.
+// addAssistTime notifies the limiter of additional assist time. It will be
+// included in the next update.
+func (l *gcCPULimiterState) addAssistTime(t int64) {
+       l.assistTimePool.Add(t)
+}
+
+// update updates the bucket given runtime-specific information. now is the
+// current monotonic time in nanoseconds.
  //
  // This is safe to call concurrently with other operations, except *GCTransition.
-func (l *gcCPULimiterState) update(totalAssistTime int64, now int64) {
+func (l *gcCPULimiterState) update(now int64) {
         if !l.tryLock() {
                 // We failed to acquire the lock, which means something else is currently
                 // updating. Just drop our update, the next one to update will include
@@ -157,31 +154,32 @@ func (l *gcCPULimiterState) update(totalAssistTime int64, now int64) {
         if l.transitioning {
                 throw("update during transition")
         }
-       l.updateLocked(totalAssistTime, now)
+       l.updateLocked(now)
         l.unlock()
  }
  
  // updatedLocked is the implementation of update. l.lock must be held.
-func (l *gcCPULimiterState) updateLocked(totalAssistTime int64, now int64) {
+func (l *gcCPULimiterState) updateLocked(now int64) {
         lastUpdate := l.lastUpdate.Load()
-       if now < lastUpdate || totalAssistTime < l.lastTotalAssistTime {
+       if now < lastUpdate {
                 // Defensively avoid overflow. This isn't even the latest update anyway.
-               // This might seem like a lot to back out on, but provided that both
-               // totalAssistTime and now are fresh, updaters must've been closely
-               // racing. It's close enough that it doesn't matter, and in the long
-               // term the result is the same.
                 return
         }
         windowTotalTime := (now - lastUpdate) * int64(l.nprocs)
         l.lastUpdate.Store(now)
-       if !l.gcEnabled {
-               l.accumulate(windowTotalTime, 0)
-               return
+
+       // Drain the pool of assist time.
+       assistTime := l.assistTimePool.Load()
+       if assistTime != 0 {
+               l.assistTimePool.Add(-assistTime)
+       }
+
+       // Accumulate.
+       windowGCTime := assistTime
+       if l.gcEnabled {
+               windowGCTime += int64(float64(windowTotalTime) * gcBackgroundUtilization)
         }
-       windowGCTime := totalAssistTime - l.lastTotalAssistTime
-       windowGCTime += int64(float64(windowTotalTime) * gcBackgroundUtilization)
         l.accumulate(windowTotalTime-windowGCTime, windowGCTime)
-       l.lastTotalAssistTime = totalAssistTime
  }
  
  // accumulate adds time to the bucket and signals whether the limiter is enabled.
@@ -249,7 +247,7 @@ func (l *gcCPULimiterState) resetCapacity(now int64, nprocs int32) {
                 throw("failed to acquire lock to reset capacity")
         }
         // Flush the rest of the time for this period.
-       l.updateLocked(0, now)
+       l.updateLocked(now)
         l.nprocs = nprocs
  
         l.bucket.capacity = uint64(nprocs) * capacityPerProc
diff --git a/src/runtime/mgclimit_test.go b/src/runtime/mgclimit_test.go

index b5e2190470cc02305c3bbc6b5010cb496339c854..124da03ef1d9f34f7099a00172f6093704ee364e 100644 (file)
--- a/src/runtime/mgclimit_test.go
+++ b/src/runtime/mgclimit_test.go
@@ -21,12 +21,11 @@ func TestGCCPULimiter(t *testing.T) {
                 return ticks
         }
  
-       // Create mock assist time.
-       assistCPUTime := int64(0)
-       doAssist := func(d time.Duration, frac float64) int64 {
+       // assistTime computes the CPU time for assists using frac of GOMAXPROCS
+       // over the wall-clock duration d.
+       assistTime := func(d time.Duration, frac float64) int64 {
                 t.Helper()
-               assistCPUTime += int64(frac * float64(d) * procs)
-               return assistCPUTime
+               return int64(frac * float64(d) * procs)
         }
  
         l := NewGCCPULimiter(ticks, procs)
@@ -45,9 +44,9 @@ func TestGCCPULimiter(t *testing.T) {
  
                 // Test filling the bucket with just mutator time.
  
-               l.Update(0, advance(10*time.Millisecond))
-               l.Update(0, advance(1*time.Second))
-               l.Update(0, advance(1*time.Hour))
+               l.Update(advance(10 * time.Millisecond))
+               l.Update(advance(1 * time.Second))
+               l.Update(advance(1 * time.Hour))
                 if l.Fill() != 0 {
                         t.Fatalf("expected empty bucket from only accumulating mutator time, got fill of %d cpu-ns", l.Fill())
                 }
@@ -60,14 +59,14 @@ func TestGCCPULimiter(t *testing.T) {
                 if !l.NeedUpdate(advance(GCCPULimiterUpdatePeriod)) {
                         t.Fatal("doesn't need update even though updated 1.5 periods ago")
                 }
-               l.Update(0, advance(0))
+               l.Update(advance(0))
                 if l.NeedUpdate(advance(0)) {
                         t.Fatal("need update even though just updated")
                 }
  
                 // Test transitioning the bucket to enable the GC.
  
-               l.StartGCTransition(true, 0, advance(109*time.Millisecond))
+               l.StartGCTransition(true, advance(109*time.Millisecond))
                 l.FinishGCTransition(advance(2*time.Millisecond + 1*time.Microsecond))
  
                 if expect := uint64((2*time.Millisecond + 1*time.Microsecond) * procs); l.Fill() != expect {
@@ -88,25 +87,27 @@ func TestGCCPULimiter(t *testing.T) {
                 // And here we want n=procs:
                 factor := (1 / (1 - 2*GCBackgroundUtilization))
                 fill := (2*time.Millisecond + 1*time.Microsecond) * procs
-               l.Update(0, advance(time.Duration(factor*float64(fill-procs)/procs)))
+               l.Update(advance(time.Duration(factor * float64(fill-procs) / procs)))
                 if l.Fill() != procs {
                         t.Fatalf("expected fill %d cpu-ns from draining after a GC started, got fill of %d cpu-ns", procs, l.Fill())
                 }
  
                 // Drain to zero for the rest of the test.
-               l.Update(0, advance(2*procs*CapacityPerProc))
+               l.Update(advance(2 * procs * CapacityPerProc))
                 if l.Fill() != 0 {
                         t.Fatalf("expected empty bucket from draining, got fill of %d cpu-ns", l.Fill())
                 }
  
                 // Test filling up the bucket with 50% total GC work (so, not moving the bucket at all).
-               l.Update(doAssist(10*time.Millisecond, 0.5-GCBackgroundUtilization), advance(10*time.Millisecond))
+               l.AddAssistTime(assistTime(10*time.Millisecond, 0.5-GCBackgroundUtilization))
+               l.Update(advance(10 * time.Millisecond))
                 if l.Fill() != 0 {
                         t.Fatalf("expected empty bucket from 50%% GC work, got fill of %d cpu-ns", l.Fill())
                 }
  
                 // Test adding to the bucket overall with 100% GC work.
-               l.Update(doAssist(time.Millisecond, 1.0-GCBackgroundUtilization), advance(time.Millisecond))
+               l.AddAssistTime(assistTime(time.Millisecond, 1.0-GCBackgroundUtilization))
+               l.Update(advance(time.Millisecond))
                 if expect := uint64(procs * time.Millisecond); l.Fill() != expect {
                         t.Errorf("expected %d fill from 100%% GC CPU, got fill of %d cpu-ns", expect, l.Fill())
                 }
@@ -118,7 +119,8 @@ func TestGCCPULimiter(t *testing.T) {
                 }
  
                 // Test filling the bucket exactly full.
-               l.Update(doAssist(CapacityPerProc-time.Millisecond, 1.0-GCBackgroundUtilization), advance(CapacityPerProc-time.Millisecond))
+               l.AddAssistTime(assistTime(CapacityPerProc-time.Millisecond, 1.0-GCBackgroundUtilization))
+               l.Update(advance(CapacityPerProc - time.Millisecond))
                 if l.Fill() != l.Capacity() {
                         t.Errorf("expected bucket filled to capacity %d, got %d", l.Capacity(), l.Fill())
                 }
@@ -134,7 +136,8 @@ func TestGCCPULimiter(t *testing.T) {
  
                 // Test adding with a delta of exactly zero. That is, GC work is exactly 50% of all resources.
                 // Specifically, the limiter should still be on, and no overflow should accumulate.
-               l.Update(doAssist(1*time.Second, 0.5-GCBackgroundUtilization), advance(1*time.Second))
+               l.AddAssistTime(assistTime(1*time.Second, 0.5-GCBackgroundUtilization))
+               l.Update(advance(1 * time.Second))
                 if l.Fill() != l.Capacity() {
                         t.Errorf("expected bucket filled to capacity %d, got %d", l.Capacity(), l.Fill())
                 }
@@ -149,7 +152,8 @@ func TestGCCPULimiter(t *testing.T) {
                 }
  
                 // Drain the bucket by half.
-               l.Update(doAssist(CapacityPerProc, 0), advance(CapacityPerProc))
+               l.AddAssistTime(assistTime(CapacityPerProc, 0))
+               l.Update(advance(CapacityPerProc))
                 if expect := l.Capacity() / 2; l.Fill() != expect {
                         t.Errorf("failed to drain to %d, got fill %d", expect, l.Fill())
                 }
@@ -161,7 +165,8 @@ func TestGCCPULimiter(t *testing.T) {
                 }
  
                 // Test overfilling the bucket.
-               l.Update(doAssist(CapacityPerProc, 1.0-GCBackgroundUtilization), advance(CapacityPerProc))
+               l.AddAssistTime(assistTime(CapacityPerProc, 1.0-GCBackgroundUtilization))
+               l.Update(advance(CapacityPerProc))
                 if l.Fill() != l.Capacity() {
                         t.Errorf("failed to fill to capacity %d, got fill %d", l.Capacity(), l.Fill())
                 }
@@ -176,8 +181,8 @@ func TestGCCPULimiter(t *testing.T) {
                 }
  
                 // Test ending the cycle with some assists left over.
-
-               l.StartGCTransition(false, doAssist(1*time.Millisecond, 1.0-GCBackgroundUtilization), advance(1*time.Millisecond))
+               l.AddAssistTime(assistTime(1*time.Millisecond, 1.0-GCBackgroundUtilization))
+               l.StartGCTransition(false, advance(1*time.Millisecond))
                 if l.Fill() != l.Capacity() {
                         t.Errorf("failed to maintain fill to capacity %d, got fill %d", l.Capacity(), l.Fill())
                 }
@@ -206,9 +211,6 @@ func TestGCCPULimiter(t *testing.T) {
                         t.FailNow()
                 }
  
-               // Reset the mock total assist CPU time, since we just ended the cycle.
-               assistCPUTime = 0
-
                 // Resize procs up and make sure limiting stops.
                 expectFill := l.Capacity()
                 l.ResetCapacity(advance(0), procs+10)
diff --git a/src/runtime/mgcmark.go b/src/runtime/mgcmark.go

index a6dc43d8d3c224f329f03b92c1ba1b337f0e3cfb..45d779054c7db19c59a0e20b6c7427708db7c625 100644 (file)
--- a/src/runtime/mgcmark.go
+++ b/src/runtime/mgcmark.go
@@ -594,9 +594,10 @@ func gcAssistAlloc1(gp *g, scanWork int64) {
         _p_ := gp.m.p.ptr()
         _p_.gcAssistTime += duration
         if _p_.gcAssistTime > gcAssistTimeSlack {
-               assistTime := gcController.assistTime.Add(_p_.gcAssistTime)
+               gcController.assistTime.Add(_p_.gcAssistTime)
+               gcCPULimiter.addAssistTime(_p_.gcAssistTime)
+               gcCPULimiter.update(now)
                 _p_.gcAssistTime = 0
-               gcCPULimiter.update(assistTime+mheap_.pages.scav.assistTime.Load(), now)
         }
  }
  
diff --git a/src/runtime/mgcpacer.go b/src/runtime/mgcpacer.go

index 9e7e9b12aa96568d922dbdb1b28add3eb58a5837..9fbbe83c6b0d23b944f3f414f4966c5d3153a771 100644 (file)
--- a/src/runtime/mgcpacer.go
+++ b/src/runtime/mgcpacer.go
@@ -800,7 +800,7 @@ func (c *gcControllerState) findRunnableGCWorker(_p_ *p, now int64) *g {
         // case the limiter is on but hasn't been checked in a while and
         // so may have left sufficient headroom to turn off again.
         if gcCPULimiter.needUpdate(now) {
-               gcCPULimiter.update(gcController.assistTime.Load(), now)
+               gcCPULimiter.update(now)
         }
  
         if !gcMarkWorkAvailable(_p_) {
diff --git a/src/runtime/mheap.go b/src/runtime/mheap.go

index ff681a19cd2b2f0db01faff92dc8f0d0972759fb..2d4d7e3e973b93623d42b19a024e1a2836fec762 100644 (file)
--- a/src/runtime/mheap.go
+++ b/src/runtime/mheap.go
@@ -1301,8 +1301,9 @@ HaveSpan:
                 start := nanotime()
                 h.pages.scavenge(bytesToScavenge)
                 now := nanotime()
-               assistTime := h.pages.scav.assistTime.Add(now - start)
-               gcCPULimiter.update(gcController.assistTime.Load()+assistTime, now)
+               h.pages.scav.assistTime.Add(now - start)
+               gcCPULimiter.addAssistTime(now - start)
+               gcCPULimiter.update(now)
         }
  
         // Commit and account for any scavenged memory that the span now owns.
author	Michael Anthony Knyszek <mknyszek@google.com>
	Fri, 6 May 2022 20:17:52 +0000 (20:17 +0000)
committer	Michael Knyszek <mknyszek@google.com>
	Fri, 13 May 2022 16:02:20 +0000 (16:02 +0000)
src/runtime/export_test.go		patch \| blob \| history
src/runtime/mgc.go		patch \| blob \| history
src/runtime/mgclimit.go		patch \| blob \| history
src/runtime/mgclimit_test.go		patch \| blob \| history
src/runtime/mgcmark.go		patch \| blob \| history
src/runtime/mgcpacer.go		patch \| blob \| history
src/runtime/mheap.go		patch \| blob \| history