Cypherpunks repositories - gostls13.git/commitdiff
runtime: move GC pause time CPU metrics update into the STW
Author: Michael Anthony Knyszek <mknyszek@google.com>
Fri, 8 Mar 2024 23:06:41 +0000 (23:06 +0000)
Committer: Gopher Robot <gobot@golang.org>
Mon, 8 Apr 2024 21:03:13 +0000 (21:03 +0000)
This change fixes a possible race with updating metrics and reading
them. The update is intended to be protected by the world being stopped,
but here, it clearly isn't.

Fixing this lets us lower the thresholds in the metrics tests by an
order of magnitude, because the only thing we have to worry about now is
floating point error (the tests were previously written assuming the
floating point error was much higher than it actually was; that turns
out not to be the case, and this bug was the problem instead). However,
this still isn't that tight of a bound; we still want to catch any and
all problems of exactness. For this purpose, this CL adds a test to
check the source-of-truth (in uint64 nanoseconds) that ensures the
totals exactly match.

This means we unfortunately have to take another time measurement, but
for now let's prioritize correctness. A few additional nanoseconds of
STW time won't be terribly noticeable.

Fixes #66212.

Change-Id: Id02c66e8a43c13b1f70e9b268b8a84cc72293bfd
Reviewed-on: https://go-review.googlesource.com/c/go/+/570257
Auto-Submit: Michael Knyszek <mknyszek@google.com>
Reviewed-by: Nicolas Hillegeer <aktau@google.com>
LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com>

src/runtime/export_test.go
src/runtime/metrics.go
src/runtime/metrics_test.go
src/runtime/mgc.go
src/runtime/mstats.go

index 4eb187c007cd3da400b61d0d1973351d25cc4b63..1ec45b8cc08a93033649cd46da13b45df014f691 100644 (file)
@@ -429,6 +429,12 @@ func (p *ProfBuf) Close() {
        (*profBuf)(p).close()
 }
 
+type CPUStats = cpuStats
+
+func ReadCPUStats() CPUStats {
+       return work.cpuStats
+}
+
 func ReadMetricsSlow(memStats *MemStats, samplesp unsafe.Pointer, len, cap int) {
        stw := stopTheWorld(stwForTestReadMetricsSlow)
 
index f97a3804ab32201c895ea5d45a180067b6a7b305..417f1071bb7007fa36d1d910b68f03781527c181 100644 (file)
@@ -95,77 +95,77 @@ func initMetrics() {
                        deps: makeStatDepSet(cpuStatsDep),
                        compute: func(in *statAggregate, out *metricValue) {
                                out.kind = metricKindFloat64
-                               out.scalar = float64bits(nsToSec(in.cpuStats.gcAssistTime))
+                               out.scalar = float64bits(nsToSec(in.cpuStats.GCAssistTime))
                        },
                },
                "/cpu/classes/gc/mark/dedicated:cpu-seconds": {
                        deps: makeStatDepSet(cpuStatsDep),
                        compute: func(in *statAggregate, out *metricValue) {
                                out.kind = metricKindFloat64
-                               out.scalar = float64bits(nsToSec(in.cpuStats.gcDedicatedTime))
+                               out.scalar = float64bits(nsToSec(in.cpuStats.GCDedicatedTime))
                        },
                },
                "/cpu/classes/gc/mark/idle:cpu-seconds": {
                        deps: makeStatDepSet(cpuStatsDep),
                        compute: func(in *statAggregate, out *metricValue) {
                                out.kind = metricKindFloat64
-                               out.scalar = float64bits(nsToSec(in.cpuStats.gcIdleTime))
+                               out.scalar = float64bits(nsToSec(in.cpuStats.GCIdleTime))
                        },
                },
                "/cpu/classes/gc/pause:cpu-seconds": {
                        deps: makeStatDepSet(cpuStatsDep),
                        compute: func(in *statAggregate, out *metricValue) {
                                out.kind = metricKindFloat64
-                               out.scalar = float64bits(nsToSec(in.cpuStats.gcPauseTime))
+                               out.scalar = float64bits(nsToSec(in.cpuStats.GCPauseTime))
                        },
                },
                "/cpu/classes/gc/total:cpu-seconds": {
                        deps: makeStatDepSet(cpuStatsDep),
                        compute: func(in *statAggregate, out *metricValue) {
                                out.kind = metricKindFloat64
-                               out.scalar = float64bits(nsToSec(in.cpuStats.gcTotalTime))
+                               out.scalar = float64bits(nsToSec(in.cpuStats.GCTotalTime))
                        },
                },
                "/cpu/classes/idle:cpu-seconds": {
                        deps: makeStatDepSet(cpuStatsDep),
                        compute: func(in *statAggregate, out *metricValue) {
                                out.kind = metricKindFloat64
-                               out.scalar = float64bits(nsToSec(in.cpuStats.idleTime))
+                               out.scalar = float64bits(nsToSec(in.cpuStats.IdleTime))
                        },
                },
                "/cpu/classes/scavenge/assist:cpu-seconds": {
                        deps: makeStatDepSet(cpuStatsDep),
                        compute: func(in *statAggregate, out *metricValue) {
                                out.kind = metricKindFloat64
-                               out.scalar = float64bits(nsToSec(in.cpuStats.scavengeAssistTime))
+                               out.scalar = float64bits(nsToSec(in.cpuStats.ScavengeAssistTime))
                        },
                },
                "/cpu/classes/scavenge/background:cpu-seconds": {
                        deps: makeStatDepSet(cpuStatsDep),
                        compute: func(in *statAggregate, out *metricValue) {
                                out.kind = metricKindFloat64
-                               out.scalar = float64bits(nsToSec(in.cpuStats.scavengeBgTime))
+                               out.scalar = float64bits(nsToSec(in.cpuStats.ScavengeBgTime))
                        },
                },
                "/cpu/classes/scavenge/total:cpu-seconds": {
                        deps: makeStatDepSet(cpuStatsDep),
                        compute: func(in *statAggregate, out *metricValue) {
                                out.kind = metricKindFloat64
-                               out.scalar = float64bits(nsToSec(in.cpuStats.scavengeTotalTime))
+                               out.scalar = float64bits(nsToSec(in.cpuStats.ScavengeTotalTime))
                        },
                },
                "/cpu/classes/total:cpu-seconds": {
                        deps: makeStatDepSet(cpuStatsDep),
                        compute: func(in *statAggregate, out *metricValue) {
                                out.kind = metricKindFloat64
-                               out.scalar = float64bits(nsToSec(in.cpuStats.totalTime))
+                               out.scalar = float64bits(nsToSec(in.cpuStats.TotalTime))
                        },
                },
                "/cpu/classes/user:cpu-seconds": {
                        deps: makeStatDepSet(cpuStatsDep),
                        compute: func(in *statAggregate, out *metricValue) {
                                out.kind = metricKindFloat64
-                               out.scalar = float64bits(nsToSec(in.cpuStats.userTime))
+                               out.scalar = float64bits(nsToSec(in.cpuStats.UserTime))
                        },
                },
                "/gc/cycles/automatic:gc-cycles": {
index d7f41334cd6b625b0ac6a0b48bc15235bd4381c7..1d99b86fe6b7c06ff20acd7b5f140242b193e42e 100644 (file)
@@ -357,11 +357,11 @@ func TestReadMetricsConsistency(t *testing.T) {
                if cpu.idle <= 0 {
                        t.Errorf("found no idle time: %f", cpu.idle)
                }
-               if total := cpu.gcDedicated + cpu.gcAssist + cpu.gcIdle + cpu.gcPause; !withinEpsilon(cpu.gcTotal, total, 0.01) {
-                       t.Errorf("calculated total GC CPU not within 1%% of sampled total: %f vs. %f", total, cpu.gcTotal)
+               if total := cpu.gcDedicated + cpu.gcAssist + cpu.gcIdle + cpu.gcPause; !withinEpsilon(cpu.gcTotal, total, 0.001) {
+                       t.Errorf("calculated total GC CPU time not within %%0.1 of total: %f vs. %f", total, cpu.gcTotal)
                }
-               if total := cpu.scavengeAssist + cpu.scavengeBg; !withinEpsilon(cpu.scavengeTotal, total, 0.01) {
-                       t.Errorf("calculated total scavenge CPU not within 1%% of sampled total: %f vs. %f", total, cpu.scavengeTotal)
+               if total := cpu.scavengeAssist + cpu.scavengeBg; !withinEpsilon(cpu.scavengeTotal, total, 0.001) {
+                       t.Errorf("calculated total scavenge CPU not within %%0.1 of total: %f vs. %f", total, cpu.scavengeTotal)
                }
                if cpu.total <= 0 {
                        t.Errorf("found no total CPU time passed")
@@ -369,8 +369,8 @@ func TestReadMetricsConsistency(t *testing.T) {
                if cpu.user <= 0 {
                        t.Errorf("found no user time passed")
                }
-               if total := cpu.gcTotal + cpu.scavengeTotal + cpu.user + cpu.idle; !withinEpsilon(cpu.total, total, 0.02) {
-                       t.Errorf("calculated total CPU not within 2%% of sampled total: %f vs. %f", total, cpu.total)
+               if total := cpu.gcTotal + cpu.scavengeTotal + cpu.user + cpu.idle; !withinEpsilon(cpu.total, total, 0.001) {
+                       t.Errorf("calculated total CPU not within %%0.1 of total: %f vs. %f", total, cpu.total)
                }
        }
        if totalVirtual.got != totalVirtual.want {
@@ -1290,3 +1290,39 @@ func (w *contentionWorker) run() {
        for w.fn() {
        }
 }
+
+func TestCPUStats(t *testing.T) {
+       // Run a few GC cycles to get some of the stats to be non-zero.
+       runtime.GC()
+       runtime.GC()
+       runtime.GC()
+
+       // Set GOMAXPROCS high then sleep briefly to ensure we generate
+       // some idle time.
+       oldmaxprocs := runtime.GOMAXPROCS(10)
+       time.Sleep(time.Millisecond)
+       runtime.GOMAXPROCS(oldmaxprocs)
+
+       stats := runtime.ReadCPUStats()
+       gcTotal := stats.GCAssistTime + stats.GCDedicatedTime + stats.GCIdleTime + stats.GCPauseTime
+       if gcTotal != stats.GCTotalTime {
+               t.Errorf("manually computed total does not match GCTotalTime: %d cpu-ns vs. %d cpu-ns", gcTotal, stats.GCTotalTime)
+       }
+       scavTotal := stats.ScavengeAssistTime + stats.ScavengeBgTime
+       if scavTotal != stats.ScavengeTotalTime {
+               t.Errorf("manually computed total does not match ScavengeTotalTime: %d cpu-ns vs. %d cpu-ns", scavTotal, stats.ScavengeTotalTime)
+       }
+       total := gcTotal + scavTotal + stats.IdleTime + stats.UserTime
+       if total != stats.TotalTime {
+               t.Errorf("manually computed overall total does not match TotalTime: %d cpu-ns vs. %d cpu-ns", total, stats.TotalTime)
+       }
+       if total == 0 {
+               t.Error("total time is zero")
+       }
+       if gcTotal == 0 {
+               t.Error("GC total time is zero")
+       }
+       if stats.IdleTime == 0 {
+               t.Error("idle time is zero")
+       }
+}
index 1da6114ab89a8af02a1d52d1465b171b779760df..4ea55eceb6f5ada9b7230a4a07ebdab93f4d03b0 100644 (file)
@@ -417,7 +417,10 @@ type workType struct {
        stwprocs, maxprocs                 int32
        tSweepTerm, tMark, tMarkTerm, tEnd int64 // nanotime() of phase start
 
-       pauseNS int64 // total STW time this cycle
+       // pauseNS is the total STW time this cycle, measured as the time between
+       // when stopping began (just before trying to stop Ps) and just after the
+       // world started again.
+       pauseNS int64
 
        // debug.gctrace heap sizes for this cycle.
        heap0, heap1, heap2 uint64
@@ -741,19 +744,19 @@ func gcStart(trigger gcTrigger) {
        // returns, so make sure we're not preemptible.
        mp = acquirem()
 
+       // Update the CPU stats pause time.
+       //
+       // Use maxprocs instead of stwprocs here because the total time
+       // computed in the CPU stats is based on maxprocs, and we want them
+       // to be comparable.
+       work.cpuStats.accumulateGCPauseTime(nanotime()-work.tSweepTerm, work.maxprocs)
+
        // Concurrent mark.
        systemstack(func() {
                now = startTheWorldWithSema(0, stw)
                work.pauseNS += now - stw.start
                work.tMark = now
 
-               // Update the CPU stats pause time.
-               //
-               // Use maxprocs instead of stwprocs here because the total time
-               // computed in the CPU stats is based on maxprocs, and we want them
-               // to be comparable.
-               work.cpuStats.accumulateGCPauseTime(now-work.tSweepTerm, work.maxprocs)
-
                // Release the CPU limiter.
                gcCPULimiter.finishGCTransition(now)
        })
@@ -1033,7 +1036,7 @@ func gcMarkTermination(stw worldStop) {
 
        // Compute overall GC CPU utilization.
        // Omit idle marking time from the overall utilization here since it's "free".
-       memstats.gc_cpu_fraction = float64(work.cpuStats.gcTotalTime-work.cpuStats.gcIdleTime) / float64(work.cpuStats.totalTime)
+       memstats.gc_cpu_fraction = float64(work.cpuStats.GCTotalTime-work.cpuStats.GCIdleTime) / float64(work.cpuStats.TotalTime)
 
        // Reset assist time and background time stats.
        //
index 3d1471c576ff6bc3b645d102afa7524ea98673e2..c10ca402217cfb301b644f1a17b3684bc4a169af 100644 (file)
@@ -905,20 +905,20 @@ type cpuStats struct {
        // they don't accurately compute on-CPU time (so some of the time
        // could be spent scheduled away by the OS).
 
-       gcAssistTime    int64 // GC assists
-       gcDedicatedTime int64 // GC dedicated mark workers + pauses
-       gcIdleTime      int64 // GC idle mark workers
-       gcPauseTime     int64 // GC pauses (all GOMAXPROCS, even if just 1 is running)
-       gcTotalTime     int64
+       GCAssistTime    int64 // GC assists
+       GCDedicatedTime int64 // GC dedicated mark workers + pauses
+       GCIdleTime      int64 // GC idle mark workers
+       GCPauseTime     int64 // GC pauses (all GOMAXPROCS, even if just 1 is running)
+       GCTotalTime     int64
 
-       scavengeAssistTime int64 // background scavenger
-       scavengeBgTime     int64 // scavenge assists
-       scavengeTotalTime  int64
+       ScavengeAssistTime int64 // background scavenger
+       ScavengeBgTime     int64 // scavenge assists
+       ScavengeTotalTime  int64
 
-       idleTime int64 // Time Ps spent in _Pidle.
-       userTime int64 // Time Ps spent in _Prunning or _Psyscall that's not any of the above.
+       IdleTime int64 // Time Ps spent in _Pidle.
+       UserTime int64 // Time Ps spent in _Prunning or _Psyscall that's not any of the above.
 
-       totalTime int64 // GOMAXPROCS * (monotonic wall clock time elapsed)
+       TotalTime int64 // GOMAXPROCS * (monotonic wall clock time elapsed)
 }
 
 // accumulateGCPauseTime add dt*stwProcs to the GC CPU pause time stats. dt should be
@@ -927,8 +927,8 @@ type cpuStats struct {
 // from GOMAXPROCS.
 func (s *cpuStats) accumulateGCPauseTime(dt int64, maxProcs int32) {
        cpu := dt * int64(maxProcs)
-       s.gcPauseTime += cpu
-       s.gcTotalTime += cpu
+       s.GCPauseTime += cpu
+       s.GCTotalTime += cpu
 }
 
 // accumulate takes a cpuStats and adds in the current state of all GC CPU
@@ -961,19 +961,19 @@ func (s *cpuStats) accumulate(now int64, gcMarkPhase bool) {
        scavBgCpu := scavenge.backgroundTime.Load()
 
        // Update cumulative GC CPU stats.
-       s.gcAssistTime += markAssistCpu
-       s.gcDedicatedTime += markDedicatedCpu + markFractionalCpu
-       s.gcIdleTime += markIdleCpu
-       s.gcTotalTime += markAssistCpu + markDedicatedCpu + markFractionalCpu + markIdleCpu
+       s.GCAssistTime += markAssistCpu
+       s.GCDedicatedTime += markDedicatedCpu + markFractionalCpu
+       s.GCIdleTime += markIdleCpu
+       s.GCTotalTime += markAssistCpu + markDedicatedCpu + markFractionalCpu + markIdleCpu
 
        // Update cumulative scavenge CPU stats.
-       s.scavengeAssistTime += scavAssistCpu
-       s.scavengeBgTime += scavBgCpu
-       s.scavengeTotalTime += scavAssistCpu + scavBgCpu
+       s.ScavengeAssistTime += scavAssistCpu
+       s.ScavengeBgTime += scavBgCpu
+       s.ScavengeTotalTime += scavAssistCpu + scavBgCpu
 
        // Update total CPU.
-       s.totalTime = sched.totaltime + (now-sched.procresizetime)*int64(gomaxprocs)
-       s.idleTime += sched.idleTime.Load()
+       s.TotalTime = sched.totaltime + (now-sched.procresizetime)*int64(gomaxprocs)
+       s.IdleTime += sched.idleTime.Load()
 
        // Compute userTime. We compute this indirectly as everything that's not the above.
        //
@@ -983,5 +983,5 @@ func (s *cpuStats) accumulate(now int64, gcMarkPhase bool) {
        // else via sysmon. Meanwhile if we subtract GC time from whatever's left, we get non-GC
        // _Prunning time. Note that this still leaves time spent in sweeping and in the scheduler,
        // but that's fine. The overwhelming majority of this time will be actual user time.
-       s.userTime = s.totalTime - (s.gcTotalTime + s.scavengeTotalTime + s.idleTime)
+       s.UserTime = s.TotalTime - (s.GCTotalTime + s.ScavengeTotalTime + s.IdleTime)
 }