runtime: only sleep before stealing work from a running P

author Jamie Liu <jamieliu@google.com>

Wed, 15 Nov 2017 20:47:22 +0000 (12:47 -0800)

committer Austin Clements <austin@google.com>

Tue, 21 Nov 2017 19:31:06 +0000 (19:31 +0000)
author Jamie Liu <jamieliu@google.com>
Wed, 15 Nov 2017 20:47:22 +0000 (12:47 -0800)
committer Austin Clements <austin@google.com>
Tue, 21 Nov 2017 19:31:06 +0000 (19:31 +0000)
diff --git a/src/runtime/proc.go b/src/runtime/proc.go

index 02c092711c9e5577a8683107f6fcc7f6c70f5329..2120d647457f23b28e18afdb65b4a8b2fb17a2db 100644 (file)
--- a/src/runtime/proc.go
+++ b/src/runtime/proc.go
@@ -4773,22 +4773,25 @@ func runqgrab(_p_ *p, batch *[256]guintptr, batchHead uint32, stealRunNextG bool
                         if stealRunNextG {
                                 // Try to steal from _p_.runnext.
                                 if next := _p_.runnext; next != 0 {
-                                       // Sleep to ensure that _p_ isn't about to run the g we
-                                       // are about to steal.
-                                       // The important use case here is when the g running on _p_
-                                       // ready()s another g and then almost immediately blocks.
-                                       // Instead of stealing runnext in this window, back off
-                                       // to give _p_ a chance to schedule runnext. This will avoid
-                                       // thrashing gs between different Ps.
-                                       // A sync chan send/recv takes ~50ns as of time of writing,
-                                       // so 3us gives ~50x overshoot.
-                                       if GOOS != "windows" {
-                                               usleep(3)
-                                       } else {
-                                               // On windows system timer granularity is 1-15ms,
-                                               // which is way too much for this optimization.
-                                               // So just yield.
-                                               osyield()
+                                       if _p_.status == _Prunning {
+                                               // Sleep to ensure that _p_ isn't about to run the g
+                                               // we are about to steal.
+                                               // The important use case here is when the g running
+                                               // on _p_ ready()s another g and then almost
+                                               // immediately blocks. Instead of stealing runnext
+                                               // in this window, back off to give _p_ a chance to
+                                               // schedule runnext. This will avoid thrashing gs
+                                               // between different Ps.
+                                               // A sync chan send/recv takes ~50ns as of time of
+                                               // writing, so 3us gives ~50x overshoot.
+                                               if GOOS != "windows" {
+                                                       usleep(3)
+                                               } else {
+                                                       // On windows system timer granularity is
+                                                       // 1-15ms, which is way too much for this
+                                                       // optimization. So just yield.
+                                                       osyield()
+                                               }
                                         }
                                         if !_p_.runnext.cas(next, 0) {
                                                 continue
diff --git a/src/runtime/proc_test.go b/src/runtime/proc_test.go

index c6ecc2a4722f9bdc9545c12a097646a8ef007866..a0112f2fac3c0540e9ce44ccd05739d367fac94c 100644 (file)
--- a/src/runtime/proc_test.go
+++ b/src/runtime/proc_test.go
@@ -655,6 +655,114 @@ func BenchmarkClosureCall(b *testing.B) {
         _ = sum
  }
  
+func benchmarkWakeupParallel(b *testing.B, spin func(time.Duration)) {
+       if runtime.GOMAXPROCS(0) == 1 {
+               b.Skip("skipping: GOMAXPROCS=1")
+       }
+
+       wakeDelay := 5 * time.Microsecond
+       for _, delay := range []time.Duration{
+               0,
+               1 * time.Microsecond,
+               2 * time.Microsecond,
+               5 * time.Microsecond,
+               10 * time.Microsecond,
+               20 * time.Microsecond,
+               50 * time.Microsecond,
+               100 * time.Microsecond,
+       } {
+               b.Run(delay.String(), func(b *testing.B) {
+                       if b.N == 0 {
+                               return
+                       }
+                       // Start two goroutines, which alternate between being
+                       // sender and receiver in the following protocol:
+                       //
+                       // - The receiver spins for `delay` and then does a
+                       // blocking receive on a channel.
+                       //
+                       // - The sender spins for `delay+wakeDelay` and then
+                       // sends to the same channel. (When the goroutines
+                       // execute in parallel, the extra `wakeDelay` makes it
+                       // more likely that the receiver is already blocked
+                       // by the time the send occurs.)
+                       //
+                       // In each iteration of the benchmark, each goroutine
+                       // acts once as sender and once as receiver, so each
+                       // goroutine spins for delay twice.
+                       //
+                       // BenchmarkWakeupParallel is used to estimate how
+                       // efficiently the scheduler parallelizes goroutines in
+                       // the presence of blocking:
+                       //
+                       // - If both goroutines are executed on the same core,
+                       // an increase in delay by N will increase the time per
+                       // iteration by 4*N, because all 4 delays are
+                       // serialized.
+                       //
+                       // - Otherwise, an increase in delay by N will increase
+                       // the time per iteration by 2*N, and the time per
+                       // iteration is 2 * (runtime overhead + chan
+                       // send/receive pair + delay + wakeDelay). This allows
+                       // the runtime overhead, including the time it takes
+                       // for the unblocked goroutine to be scheduled, to be
+                       // estimated.
+                       ping, pong := make(chan struct{}), make(chan struct{})
+                       start := make(chan struct{})
+                       done := make(chan struct{})
+                       go func() {
+                               <-start
+                               for i := 0; i < b.N; i++ {
+                                       // sender
+                                       spin(delay + wakeDelay)
+                                       ping <- struct{}{}
+                                       // receiver
+                                       spin(delay)
+                                       <-pong
+                               }
+                               done <- struct{}{}
+                       }()
+                       go func() {
+                               for i := 0; i < b.N; i++ {
+                                       // receiver
+                                       spin(delay)
+                                       <-ping
+                                       // sender
+                                       spin(delay + wakeDelay)
+                                       pong <- struct{}{}
+                               }
+                               done <- struct{}{}
+                       }()
+                       b.ResetTimer()
+                       start <- struct{}{}
+                       <-done
+                       <-done
+               })
+       }
+}
+
+func BenchmarkWakeupParallelSpinning(b *testing.B) {
+       benchmarkWakeupParallel(b, func(d time.Duration) {
+               end := time.Now().Add(d)
+               for time.Now().Before(end) {
+                       // do nothing
+               }
+       })
+}
+
+func BenchmarkWakeupParallelSyscall(b *testing.B) {
+       benchmarkWakeupParallel(b, func(d time.Duration) {
+               // Invoke a blocking syscall directly; calling time.Sleep()
+               // would deschedule the goroutine instead.
+               ts := syscall.NsecToTimespec(d.Nanoseconds())
+               for {
+                       if err := syscall.Nanosleep(&ts, &ts); err != syscall.EINTR {
+                               return
+                       }
+               }
+       })
+}
+
  type Matrix [][]float64
  
  func BenchmarkMatmult(b *testing.B) {
author	Jamie Liu <jamieliu@google.com>
	Wed, 15 Nov 2017 20:47:22 +0000 (12:47 -0800)
committer	Austin Clements <austin@google.com>
	Tue, 21 Nov 2017 19:31:06 +0000 (19:31 +0000)
src/runtime/proc.go		patch \| blob \| history
src/runtime/proc_test.go		patch \| blob \| history