runtime: make TestCPUMetricsSleep even more lenient

author Michael Anthony Knyszek <mknyszek@google.com>

Fri, 26 May 2023 17:56:39 +0000 (17:56 +0000)

committer Michael Knyszek <mknyszek@google.com>

Wed, 31 May 2023 15:17:38 +0000 (15:17 +0000)
author Michael Anthony Knyszek <mknyszek@google.com>
Fri, 26 May 2023 17:56:39 +0000 (17:56 +0000)
committer Michael Knyszek <mknyszek@google.com>
Wed, 31 May 2023 15:17:38 +0000 (15:17 +0000)
diff --git a/src/runtime/metrics_test.go b/src/runtime/metrics_test.go

index 83f6ecddf3638b2004cdf4745cf698a9128335f0..c138a2a1ba0d448b6316377fe76b5b27dccb475c 100644 (file)
--- a/src/runtime/metrics_test.go
+++ b/src/runtime/metrics_test.go
@@ -5,7 +5,6 @@
  package runtime_test
  
  import (
-       "internal/testenv"
         "reflect"
         "runtime"
         "runtime/debug"
@@ -661,17 +660,6 @@ func TestCPUMetricsSleep(t *testing.T) {
                 t.Skip("wasip1 currently busy-waits in idle time; test not applicable")
         }
  
-       // Unconditionally skip this test as flaky.
-       //
-       // There's a fundamental issue with this test, which is that there's no
-       // guarantee the application will go idle; background goroutines and
-       // time spent in the scheduler going to sleep can always erode idle time
-       // sufficiently such that minimum idle time (or maximum user time) stays
-       // within some threshold.
-       //
-       // Leave this as skipped while we figure out a better way to check this.
-       testenv.SkipFlaky(t, 60376)
-
         names := []string{
                 "/cpu/classes/idle:cpu-seconds",
  
@@ -695,33 +683,70 @@ func TestCPUMetricsSleep(t *testing.T) {
         }
         m1, m2 := prep(), prep()
  
-       // Read 1.
-       runtime.GC() // Update /cpu/classes metrics.
-       metrics.Read(m1)
+       const (
+               // Expected time spent idle.
+               dur = 100 * time.Millisecond
  
-       // Sleep.
-       const dur = 100 * time.Millisecond
-       time.Sleep(dur)
+               // maxFailures is the number of consecutive failures requires to cause the test to fail.
+               maxFailures = 10
+       )
  
-       // Read 2.
-       runtime.GC() // Update /cpu/classes metrics.
-       metrics.Read(m2)
+       failureIdleTimes := make([]float64, 0, maxFailures)
  
         // If the bug we expect is happening, then the Sleep CPU time will be accounted for
-       // as user time rather than idle time.
+       // as user time rather than idle time. In an ideal world we'd expect the whole application
+       // to go instantly idle the moment this goroutine goes to sleep, and stay asleep for that
+       // duration. However, the Go runtime can easily eat into idle time while this goroutine is
+       // blocked in a sleep. For example, slow platforms might spend more time expected in the
+       // scheduler. Another example is that a Go runtime background goroutine could run while
+       // everything else is idle. Lastly, if a running goroutine is descheduled by the OS, enough
+       // time may pass such that the goroutine is ready to wake, even though the runtime couldn't
+       // observe itself as idle with nanotime.
         //
-       // TODO(mknyszek): This number here is wrong. Background goroutines and just slow
-       // platforms spending a non-trivial amount of time in the scheduler doing things
-       // could easily erode idle time beyond this minimum.
-       minIdleCPUSeconds := dur.Seconds() * float64(runtime.GOMAXPROCS(-1))
+       // To deal with all this, we give a half-proc's worth of leniency.
+       //
+       // We also retry multiple times to deal with the fact that the OS might deschedule us before
+       // we yield and go idle. That has a rare enough chance that retries should resolve it.
+       // If the issue we expect is happening, it should be persistent.
+       minIdleCPUSeconds := dur.Seconds() * (float64(runtime.GOMAXPROCS(-1)) - 0.5)
  
-       if dt := m2[0].Value.Float64() - m1[0].Value.Float64(); dt < minIdleCPUSeconds {
-               for i := range names {
-                       if m1[i].Value.Kind() == metrics.KindBad {
-                               continue
-                       }
-                       t.Logf("%s %0.3f\n", names[i], m2[i].Value.Float64()-m1[i].Value.Float64())
+       // Let's make sure there's no background scavenge work to do.
+       //
+       // The runtime.GC calls below ensure the background sweeper
+       // will not run during the idle period.
+       debug.FreeOSMemory()
+
+       for retries := 0; retries < maxFailures; retries++ {
+               // Read 1.
+               runtime.GC() // Update /cpu/classes metrics.
+               metrics.Read(m1)
+
+               // Sleep.
+               time.Sleep(dur)
+
+               // Read 2.
+               runtime.GC() // Update /cpu/classes metrics.
+               metrics.Read(m2)
+
+               dt := m2[0].Value.Float64() - m1[0].Value.Float64()
+               if dt >= minIdleCPUSeconds {
+                       // All is well. Test passed.
+                       return
+               }
+               failureIdleTimes = append(failureIdleTimes, dt)
+               // Try again.
+       }
+
+       // We couldn't observe the expected idle time even once.
+       for i, dt := range failureIdleTimes {
+               t.Logf("try %2d: idle time = %.5fs\n", i+1, dt)
+       }
+       t.Logf("try %d breakdown:\n", len(failureIdleTimes))
+       for i := range names {
+               if m1[i].Value.Kind() == metrics.KindBad {
+                       continue
                 }
-               t.Errorf(`time.Sleep did not contribute enough to "idle" class: %.5fs < %.5fs`, dt, minIdleCPUSeconds)
+               t.Logf("\t%s %0.3f\n", names[i], m2[i].Value.Float64()-m1[i].Value.Float64())
         }
+       t.Errorf(`time.Sleep did not contribute enough to "idle" class: minimum idle time = %.5fs`, minIdleCPUSeconds)
  }
author	Michael Anthony Knyszek <mknyszek@google.com>
	Fri, 26 May 2023 17:56:39 +0000 (17:56 +0000)
committer	Michael Knyszek <mknyszek@google.com>
	Wed, 31 May 2023 15:17:38 +0000 (15:17 +0000)