]> Cypherpunks repositories - gostls13.git/commitdiff
runtime,runtime/metrics: add metric to track scheduling latencies
authorMichael Pratt <mpratt@google.com>
Fri, 7 Aug 2020 16:28:35 +0000 (16:28 +0000)
committerMichael Pratt <mpratt@google.com>
Fri, 23 Apr 2021 13:48:10 +0000 (13:48 +0000)
This change adds a metric to track scheduling latencies, defined as the
cumulative amount of time a goroutine spends being runnable before
running again. The metric is an approximations and samples instead of
trying to record every goroutine scheduling latency.

This change was primarily authored by mknyszek@google.com.

Change-Id: Ie0be7e6e7be421572eb2317d3dd8dd6f3d6aa152
Reviewed-on: https://go-review.googlesource.com/c/go/+/308933
Trust: Michael Pratt <mpratt@google.com>
Run-TryBot: Michael Pratt <mpratt@google.com>
TryBot-Result: Go Bot <gobot@golang.org>
Reviewed-by: Michael Knyszek <mknyszek@google.com>
src/runtime/histogram.go
src/runtime/metrics.go
src/runtime/metrics/description.go
src/runtime/metrics/doc.go
src/runtime/proc.go
src/runtime/runtime2.go
src/runtime/sizeof_test.go

index da4910d341cbcf861059f292eb4576a8e762e75a..0cccbcca16252babeb09438e4041cb30fd8b8d8f 100644 (file)
@@ -81,6 +81,10 @@ type timeHistogram struct {
 }
 
 // record adds the given duration to the distribution.
+//
+// Disallow preemptions and stack growths because this function
+// may run in sensitive locations.
+//go:nosplit
 func (h *timeHistogram) record(duration int64) {
        if duration < 0 {
                atomic.Xadd64(&h.underflow, 1)
index e4343f9148384e9afc3f2d4a3366452deffe4644..fe82688aac8a13c5a91a0327baa01ceeb40f2f6e 100644 (file)
@@ -245,6 +245,15 @@ func initMetrics() {
                                out.scalar = uint64(gcount())
                        },
                },
+               "/sched/latencies:seconds": {
+                       compute: func(_ *statAggregate, out *metricValue) {
+                               hist := out.float64HistOrInit(timeHistBuckets)
+                               hist.counts[0] = atomic.Load64(&sched.timeToRun.underflow)
+                               for i := range sched.timeToRun.counts {
+                                       hist.counts[i+1] = atomic.Load64(&sched.timeToRun.counts[i])
+                               }
+                       },
+               },
        }
        metricsInit = true
 }
index 117515610436f2a27145a79f589400d8076e14b3..9aaf45713538f5c447a68df418532a9d6219c9b5 100644 (file)
@@ -176,6 +176,11 @@ var allDesc = []Description{
                Description: "Count of live goroutines.",
                Kind:        KindUint64,
        },
+       {
+               Name:        "/sched/latencies:seconds",
+               Description: "Distribution of the time goroutines have spent in the scheduler in a runnable state before actually running.",
+               Kind:        KindFloat64Histogram,
+       },
 }
 
 // All returns a slice of containing metric descriptions for all supported metrics.
index 7f790afc12eeabcdda78f0d01a0d83003155d18a..7cbc0415dc6e85f505bef09ad5bbc48db5746dfa 100644 (file)
@@ -139,5 +139,9 @@ Below is the full list of supported metrics, ordered lexicographically.
 
        /sched/goroutines:goroutines
                Count of live goroutines.
+
+       /sched/latencies:seconds
+               Distribution of the time goroutines have spent in the scheduler
+               in a runnable state before actually running.
 */
 package metrics
index 37c051634ce733daef31882ec0e512c19059ded2..2f9818d9f1c789221264595bbedee76ad40e9ee5 100644 (file)
@@ -697,6 +697,11 @@ func schedinit() {
        sigsave(&_g_.m.sigmask)
        initSigmask = _g_.m.sigmask
 
+       if offset := unsafe.Offsetof(sched.timeToRun); offset%8 != 0 {
+               println(offset)
+               throw("sched.timeToRun not aligned to 8 bytes")
+       }
+
        goargs()
        goenvs()
        parsedebugvars()
@@ -973,6 +978,37 @@ func casgstatus(gp *g, oldval, newval uint32) {
                        nextYield = nanotime() + yieldDelay/2
                }
        }
+
+       // Handle tracking for scheduling latencies.
+       if oldval == _Grunning {
+               // Track every 8th time a goroutine transitions out of running.
+               if gp.trackingSeq%gTrackingPeriod == 0 {
+                       gp.tracking = true
+               }
+               gp.trackingSeq++
+       }
+       if gp.tracking {
+               now := nanotime()
+               if oldval == _Grunnable {
+                       // We transitioned out of runnable, so measure how much
+                       // time we spent in this state and add it to
+                       // runnableTime.
+                       gp.runnableTime += now - gp.runnableStamp
+                       gp.runnableStamp = 0
+               }
+               if newval == _Grunnable {
+                       // We just transitioned into runnable, so record what
+                       // time that happened.
+                       gp.runnableStamp = now
+               } else if newval == _Grunning {
+                       // We're transitioning into running, so turn off
+                       // tracking and record how much time we spent in
+                       // runnable.
+                       gp.tracking = false
+                       sched.timeToRun.record(gp.runnableTime)
+                       gp.runnableTime = 0
+               }
+       }
 }
 
 // casgstatus(gp, oldstatus, Gcopystack), assuming oldstatus is Gwaiting or Grunnable.
@@ -4286,6 +4322,11 @@ func newproc1(fn *funcval, argp unsafe.Pointer, narg int32, callergp *g, callerp
        if isSystemGoroutine(newg, false) {
                atomic.Xadd(&sched.ngsys, +1)
        }
+       // Track initial transition?
+       newg.trackingSeq = uint8(fastrand())
+       if newg.trackingSeq%gTrackingPeriod == 0 {
+               newg.tracking = true
+       }
        casgstatus(newg, _Gdead, _Grunnable)
 
        if _p_.goidcache == _p_.goidcacheend {
index f84cb51badd8052d8145497eb49ce0c5c14ba9de..7fc71743343746667c2f318cb88753aec010eb98 100644 (file)
@@ -462,6 +462,10 @@ type g struct {
 
        raceignore     int8     // ignore race detection events
        sysblocktraced bool     // StartTrace has emitted EvGoInSyscall about this goroutine
+       tracking       bool     // whether we're tracking this G for sched latency statistics
+       trackingSeq    uint8    // used to decide whether to track this G
+       runnableStamp  int64    // timestamp of when the G last became runnable, only used when tracking
+       runnableTime   int64    // the amount of time spent runnable, cleared when running, only used when tracking
        sysexitticks   int64    // cputicks when syscall has returned (for tracing)
        traceseq       uint64   // trace event sequencer
        tracelastp     puintptr // last P emitted an event for this goroutine
@@ -493,6 +497,10 @@ type g struct {
        gcAssistBytes int64
 }
 
+// gTrackingPeriod is the number of transitions out of _Grunning between
+// latency tracking runs.
+const gTrackingPeriod = 8
+
 const (
        // tlsSlots is the number of pointer-sized slots reserved for TLS on some platforms,
        // like Windows.
@@ -824,6 +832,15 @@ type schedt struct {
        // Acquire and hold this mutex to block sysmon from interacting
        // with the rest of the runtime.
        sysmonlock mutex
+
+       _ uint32 // ensure timeToRun has 8-byte alignment
+
+       // timeToRun is a distribution of scheduling latencies, defined
+       // as the sum of time a G spends in the _Grunnable state before
+       // it transitions to _Grunning.
+       //
+       // timeToRun is protected by sched.lock.
+       timeToRun timeHistogram
 }
 
 // Values for the flags field of a sigTabT.
index 736e848f8cbc52e9a50a919bb3468975d2abe648..bbbd1becf7a070ecb197193921b1081cc7b3259d 100644 (file)
@@ -21,7 +21,7 @@ func TestSizeof(t *testing.T) {
                _32bit uintptr     // size on 32bit platforms
                _64bit uintptr     // size on 64bit platforms
        }{
-               {runtime.G{}, 216, 376},   // g, but exported for testing
+               {runtime.G{}, 236, 392},   // g, but exported for testing
                {runtime.Sudog{}, 56, 88}, // sudog, but exported for testing
        }