]> Cypherpunks repositories - gostls13.git/commitdiff
runtime: only sleep before stealing work from a running P
authorJamie Liu <jamieliu@google.com>
Wed, 15 Nov 2017 20:47:22 +0000 (12:47 -0800)
committerAustin Clements <austin@google.com>
Tue, 21 Nov 2017 19:31:06 +0000 (19:31 +0000)
The sleep in question does not make sense if the stolen-from P cannot
run the stolen G. The usleep(3) has been observed delaying execution of
woken G's by ~60us; skipping it reduces the wakeup-to-execution latency
to ~7us in these cases, improving CPU utilization.

Benchmarks added by this change:

name                             old time/op  new time/op  delta
WakeupParallelSpinning/0s-12     14.4µs ± 1%  14.3µs ± 1%     ~     (p=0.227 n=19+20)
WakeupParallelSpinning/1µs-12    18.3µs ± 0%  18.3µs ± 1%     ~     (p=0.950 n=20+19)
WakeupParallelSpinning/2µs-12    22.3µs ± 1%  22.3µs ± 1%     ~     (p=0.670 n=20+18)
WakeupParallelSpinning/5µs-12    31.7µs ± 0%  31.7µs ± 0%     ~     (p=0.460 n=20+17)
WakeupParallelSpinning/10µs-12   51.8µs ± 0%  51.8µs ± 0%     ~     (p=0.883 n=20+20)
WakeupParallelSpinning/20µs-12   91.9µs ± 0%  91.9µs ± 0%     ~     (p=0.245 n=20+20)
WakeupParallelSpinning/50µs-12    214µs ± 0%   214µs ± 0%     ~     (p=0.509 n=19+20)
WakeupParallelSpinning/100µs-12   335µs ± 0%   335µs ± 0%   -0.05%  (p=0.006 n=17+15)
WakeupParallelSyscall/0s-12       228µs ± 2%   129µs ± 1%  -43.32%  (p=0.000 n=20+19)
WakeupParallelSyscall/1µs-12      232µs ± 1%   131µs ± 1%  -43.60%  (p=0.000 n=19+20)
WakeupParallelSyscall/2µs-12      236µs ± 1%   133µs ± 1%  -43.44%  (p=0.000 n=18+19)
WakeupParallelSyscall/5µs-12      248µs ± 2%   139µs ± 1%  -43.68%  (p=0.000 n=18+19)
WakeupParallelSyscall/10µs-12     263µs ± 3%   150µs ± 2%  -42.97%  (p=0.000 n=18+20)
WakeupParallelSyscall/20µs-12     281µs ± 2%   170µs ± 1%  -39.43%  (p=0.000 n=19+19)
WakeupParallelSyscall/50µs-12     345µs ± 4%   246µs ± 7%  -28.85%  (p=0.000 n=20+20)
WakeupParallelSyscall/100µs-12    460µs ± 5%   350µs ± 4%  -23.85%  (p=0.000 n=20+20)

Benchmarks associated with the change that originally added this sleep
(see https://golang.org/s/go15gomaxprocs):

name        old time/op  new time/op  delta
Chain       19.4µs ± 2%  19.3µs ± 1%    ~     (p=0.101 n=19+20)
ChainBuf    19.5µs ± 2%  19.4µs ± 2%    ~     (p=0.840 n=19+19)
Chain-2     19.9µs ± 1%  19.9µs ± 2%    ~     (p=0.734 n=19+19)
ChainBuf-2  20.0µs ± 2%  20.0µs ± 2%    ~     (p=0.175 n=19+17)
Chain-4     20.3µs ± 1%  20.1µs ± 1%  -0.62%  (p=0.010 n=19+18)
ChainBuf-4  20.3µs ± 1%  20.2µs ± 1%  -0.52%  (p=0.023 n=19+19)
Powser       2.09s ± 1%   2.10s ± 3%    ~     (p=0.908 n=19+19)
Powser-2     2.21s ± 1%   2.20s ± 1%  -0.35%  (p=0.010 n=19+18)
Powser-4     2.31s ± 2%   2.31s ± 2%    ~     (p=0.578 n=18+19)
Sieve        13.6s ± 1%   13.6s ± 1%    ~     (p=0.909 n=17+18)
Sieve-2      8.02s ±52%   7.28s ±15%    ~     (p=0.336 n=20+16)
Sieve-4      4.00s ±35%   3.98s ±26%    ~     (p=0.654 n=20+18)

Change-Id: I58edd8ce01075859d871e2348fc0833e9c01f70f
Reviewed-on: https://go-review.googlesource.com/78538
Reviewed-by: Austin Clements <austin@google.com>
src/runtime/proc.go
src/runtime/proc_test.go

index 02c092711c9e5577a8683107f6fcc7f6c70f5329..2120d647457f23b28e18afdb65b4a8b2fb17a2db 100644 (file)
@@ -4773,22 +4773,25 @@ func runqgrab(_p_ *p, batch *[256]guintptr, batchHead uint32, stealRunNextG bool
                        if stealRunNextG {
                                // Try to steal from _p_.runnext.
                                if next := _p_.runnext; next != 0 {
-                                       // Sleep to ensure that _p_ isn't about to run the g we
-                                       // are about to steal.
-                                       // The important use case here is when the g running on _p_
-                                       // ready()s another g and then almost immediately blocks.
-                                       // Instead of stealing runnext in this window, back off
-                                       // to give _p_ a chance to schedule runnext. This will avoid
-                                       // thrashing gs between different Ps.
-                                       // A sync chan send/recv takes ~50ns as of time of writing,
-                                       // so 3us gives ~50x overshoot.
-                                       if GOOS != "windows" {
-                                               usleep(3)
-                                       } else {
-                                               // On windows system timer granularity is 1-15ms,
-                                               // which is way too much for this optimization.
-                                               // So just yield.
-                                               osyield()
+                                       if _p_.status == _Prunning {
+                                               // Sleep to ensure that _p_ isn't about to run the g
+                                               // we are about to steal.
+                                               // The important use case here is when the g running
+                                               // on _p_ ready()s another g and then almost
+                                               // immediately blocks. Instead of stealing runnext
+                                               // in this window, back off to give _p_ a chance to
+                                               // schedule runnext. This will avoid thrashing gs
+                                               // between different Ps.
+                                               // A sync chan send/recv takes ~50ns as of time of
+                                               // writing, so 3us gives ~50x overshoot.
+                                               if GOOS != "windows" {
+                                                       usleep(3)
+                                               } else {
+                                                       // On windows system timer granularity is
+                                                       // 1-15ms, which is way too much for this
+                                                       // optimization. So just yield.
+                                                       osyield()
+                                               }
                                        }
                                        if !_p_.runnext.cas(next, 0) {
                                                continue
index c6ecc2a4722f9bdc9545c12a097646a8ef007866..a0112f2fac3c0540e9ce44ccd05739d367fac94c 100644 (file)
@@ -655,6 +655,114 @@ func BenchmarkClosureCall(b *testing.B) {
        _ = sum
 }
 
+func benchmarkWakeupParallel(b *testing.B, spin func(time.Duration)) {
+       if runtime.GOMAXPROCS(0) == 1 {
+               b.Skip("skipping: GOMAXPROCS=1")
+       }
+
+       wakeDelay := 5 * time.Microsecond
+       for _, delay := range []time.Duration{
+               0,
+               1 * time.Microsecond,
+               2 * time.Microsecond,
+               5 * time.Microsecond,
+               10 * time.Microsecond,
+               20 * time.Microsecond,
+               50 * time.Microsecond,
+               100 * time.Microsecond,
+       } {
+               b.Run(delay.String(), func(b *testing.B) {
+                       if b.N == 0 {
+                               return
+                       }
+                       // Start two goroutines, which alternate between being
+                       // sender and receiver in the following protocol:
+                       //
+                       // - The receiver spins for `delay` and then does a
+                       // blocking receive on a channel.
+                       //
+                       // - The sender spins for `delay+wakeDelay` and then
+                       // sends to the same channel. (The addition of
+                       // `wakeDelay` improves the probability that the
+                       // receiver will be blocking when the send occurs when
+                       // the goroutines execute in parallel.)
+                       //
+                       // In each iteration of the benchmark, each goroutine
+                       // acts once as sender and once as receiver, so each
+                       // goroutine spins for delay twice.
+                       //
+                       // BenchmarkWakeupParallel is used to estimate how
+                       // efficiently the scheduler parallelizes goroutines in
+                       // the presence of blocking:
+                       //
+                       // - If both goroutines are executed on the same core,
+                       // an increase in delay by N will increase the time per
+                       // iteration by 4*N, because all 4 delays are
+                       // serialized.
+                       //
+                       // - Otherwise, an increase in delay by N will increase
+                       // the time per iteration by 2*N, and the time per
+                       // iteration is 2 * (runtime overhead + chan
+                       // send/receive pair + delay + wakeDelay). This allows
+                       // the runtime overhead, including the time it takes
+                       // for the unblocked goroutine to be scheduled, to be
+                       // estimated.
+                       ping, pong := make(chan struct{}), make(chan struct{})
+                       start := make(chan struct{})
+                       done := make(chan struct{})
+                       go func() {
+                               <-start
+                               for i := 0; i < b.N; i++ {
+                                       // sender
+                                       spin(delay + wakeDelay)
+                                       ping <- struct{}{}
+                                       // receiver
+                                       spin(delay)
+                                       <-pong
+                               }
+                               done <- struct{}{}
+                       }()
+                       go func() {
+                               for i := 0; i < b.N; i++ {
+                                       // receiver
+                                       spin(delay)
+                                       <-ping
+                                       // sender
+                                       spin(delay + wakeDelay)
+                                       pong <- struct{}{}
+                               }
+                               done <- struct{}{}
+                       }()
+                       b.ResetTimer()
+                       start <- struct{}{}
+                       <-done
+                       <-done
+               })
+       }
+}
+
+func BenchmarkWakeupParallelSpinning(b *testing.B) {
+       benchmarkWakeupParallel(b, func(d time.Duration) {
+               end := time.Now().Add(d)
+               for time.Now().Before(end) {
+                       // do nothing
+               }
+       })
+}
+
+func BenchmarkWakeupParallelSyscall(b *testing.B) {
+       benchmarkWakeupParallel(b, func(d time.Duration) {
+               // Invoke a blocking syscall directly; calling time.Sleep()
+               // would deschedule the goroutine instead.
+               ts := syscall.NsecToTimespec(d.Nanoseconds())
+               for {
+                       if err := syscall.Nanosleep(&ts, &ts); err != syscall.EINTR {
+                               return
+                       }
+               }
+       })
+}
+
 type Matrix [][]float64
 
 func BenchmarkMatmult(b *testing.B) {