From: Jamie Liu <jamieliu@google.com>
Date: Wed, 15 Nov 2017 20:47:22 +0000 (-0800)
Subject: runtime: only sleep before stealing work from a running P
X-Git-Tag: go1.10beta1~182
X-Git-Url: http://www.git.cypherpunks.su/?a=commitdiff_plain;h=868c8b374d766c46ebf09f056ff7eff6e2186c75;p=gostls13.git

runtime: only sleep before stealing work from a running P

The sleep in question does not make sense if the stolen-from P cannot
run the stolen G. The usleep(3) has been observed delaying execution of
woken G's by ~60us; skipping it reduces the wakeup-to-execution latency
to ~7us in these cases, improving CPU utilization.

Benchmarks added by this change:

name                             old time/op  new time/op  delta
WakeupParallelSpinning/0s-12     14.4µs ± 1%  14.3µs ± 1%     ~     (p=0.227 n=19+20)
WakeupParallelSpinning/1µs-12    18.3µs ± 0%  18.3µs ± 1%     ~     (p=0.950 n=20+19)
WakeupParallelSpinning/2µs-12    22.3µs ± 1%  22.3µs ± 1%     ~     (p=0.670 n=20+18)
WakeupParallelSpinning/5µs-12    31.7µs ± 0%  31.7µs ± 0%     ~     (p=0.460 n=20+17)
WakeupParallelSpinning/10µs-12   51.8µs ± 0%  51.8µs ± 0%     ~     (p=0.883 n=20+20)
WakeupParallelSpinning/20µs-12   91.9µs ± 0%  91.9µs ± 0%     ~     (p=0.245 n=20+20)
WakeupParallelSpinning/50µs-12    214µs ± 0%   214µs ± 0%     ~     (p=0.509 n=19+20)
WakeupParallelSpinning/100µs-12   335µs ± 0%   335µs ± 0%   -0.05%  (p=0.006 n=17+15)
WakeupParallelSyscall/0s-12       228µs ± 2%   129µs ± 1%  -43.32%  (p=0.000 n=20+19)
WakeupParallelSyscall/1µs-12      232µs ± 1%   131µs ± 1%  -43.60%  (p=0.000 n=19+20)
WakeupParallelSyscall/2µs-12      236µs ± 1%   133µs ± 1%  -43.44%  (p=0.000 n=18+19)
WakeupParallelSyscall/5µs-12      248µs ± 2%   139µs ± 1%  -43.68%  (p=0.000 n=18+19)
WakeupParallelSyscall/10µs-12     263µs ± 3%   150µs ± 2%  -42.97%  (p=0.000 n=18+20)
WakeupParallelSyscall/20µs-12     281µs ± 2%   170µs ± 1%  -39.43%  (p=0.000 n=19+19)
WakeupParallelSyscall/50µs-12     345µs ± 4%   246µs ± 7%  -28.85%  (p=0.000 n=20+20)
WakeupParallelSyscall/100µs-12    460µs ± 5%   350µs ± 4%  -23.85%  (p=0.000 n=20+20)

Benchmarks associated with the change that originally added this sleep
(see https://golang.org/s/go15gomaxprocs):

name        old time/op  new time/op  delta
Chain       19.4µs ± 2%  19.3µs ± 1%    ~     (p=0.101 n=19+20)
ChainBuf    19.5µs ± 2%  19.4µs ± 2%    ~     (p=0.840 n=19+19)
Chain-2     19.9µs ± 1%  19.9µs ± 2%    ~     (p=0.734 n=19+19)
ChainBuf-2  20.0µs ± 2%  20.0µs ± 2%    ~     (p=0.175 n=19+17)
Chain-4     20.3µs ± 1%  20.1µs ± 1%  -0.62%  (p=0.010 n=19+18)
ChainBuf-4  20.3µs ± 1%  20.2µs ± 1%  -0.52%  (p=0.023 n=19+19)
Powser       2.09s ± 1%   2.10s ± 3%    ~     (p=0.908 n=19+19)
Powser-2     2.21s ± 1%   2.20s ± 1%  -0.35%  (p=0.010 n=19+18)
Powser-4     2.31s ± 2%   2.31s ± 2%    ~     (p=0.578 n=18+19)
Sieve        13.6s ± 1%   13.6s ± 1%    ~     (p=0.909 n=17+18)
Sieve-2      8.02s ±52%   7.28s ±15%    ~     (p=0.336 n=20+16)
Sieve-4      4.00s ±35%   3.98s ±26%    ~     (p=0.654 n=20+18)

Change-Id: I58edd8ce01075859d871e2348fc0833e9c01f70f
Reviewed-on: https://go-review.googlesource.com/78538
Reviewed-by: Austin Clements <austin@google.com>
---

diff --git a/src/runtime/proc.go b/src/runtime/proc.go
index 02c092711c..2120d64745 100644
--- a/src/runtime/proc.go
+++ b/src/runtime/proc.go
@@ -4773,22 +4773,25 @@ func runqgrab(_p_ *p, batch *[256]guintptr, batchHead uint32, stealRunNextG bool
 			if stealRunNextG {
 				// Try to steal from _p_.runnext.
 				if next := _p_.runnext; next != 0 {
-					// Sleep to ensure that _p_ isn't about to run the g we
-					// are about to steal.
-					// The important use case here is when the g running on _p_
-					// ready()s another g and then almost immediately blocks.
-					// Instead of stealing runnext in this window, back off
-					// to give _p_ a chance to schedule runnext. This will avoid
-					// thrashing gs between different Ps.
-					// A sync chan send/recv takes ~50ns as of time of writing,
-					// so 3us gives ~50x overshoot.
-					if GOOS != "windows" {
-						usleep(3)
-					} else {
-						// On windows system timer granularity is 1-15ms,
-						// which is way too much for this optimization.
-						// So just yield.
-						osyield()
+					if _p_.status == _Prunning {
+						// Sleep to ensure that _p_ isn't about to run the g
+						// we are about to steal.
+						// The important use case here is when the g running
+						// on _p_ ready()s another g and then almost
+						// immediately blocks. Instead of stealing runnext
+						// in this window, back off to give _p_ a chance to
+						// schedule runnext. This will avoid thrashing gs
+						// between different Ps.
+						// A sync chan send/recv takes ~50ns as of time of
+						// writing, so 3us gives ~50x overshoot.
+						if GOOS != "windows" {
+							usleep(3)
+						} else {
+							// On windows system timer granularity is
+							// 1-15ms, which is way too much for this
+							// optimization. So just yield.
+							osyield()
+						}
 					}
 					if !_p_.runnext.cas(next, 0) {
 						continue
diff --git a/src/runtime/proc_test.go b/src/runtime/proc_test.go
index c6ecc2a472..a0112f2fac 100644
--- a/src/runtime/proc_test.go
+++ b/src/runtime/proc_test.go
@@ -655,6 +655,114 @@ func BenchmarkClosureCall(b *testing.B) {
 	_ = sum
 }
 
+func benchmarkWakeupParallel(b *testing.B, spin func(time.Duration)) {
+	if runtime.GOMAXPROCS(0) == 1 {
+		b.Skip("skipping: GOMAXPROCS=1")
+	}
+
+	wakeDelay := 5 * time.Microsecond
+	for _, delay := range []time.Duration{
+		0,
+		1 * time.Microsecond,
+		2 * time.Microsecond,
+		5 * time.Microsecond,
+		10 * time.Microsecond,
+		20 * time.Microsecond,
+		50 * time.Microsecond,
+		100 * time.Microsecond,
+	} {
+		b.Run(delay.String(), func(b *testing.B) {
+			if b.N == 0 {
+				return
+			}
+			// Start two goroutines, which alternate between being
+			// sender and receiver in the following protocol:
+			//
+			// - The receiver spins for `delay` and then does a
+			// blocking receive on a channel.
+			//
+			// - The sender spins for `delay+wakeDelay` and then
+			// sends to the same channel. (The addition of
+			// `wakeDelay` improves the probability that the
+			// receiver will be blocking when the send occurs when
+			// the goroutines execute in parallel.)
+			//
+			// In each iteration of the benchmark, each goroutine
+			// acts once as sender and once as receiver, so each
+			// goroutine spins for delay twice.
+			//
+			// BenchmarkWakeupParallel is used to estimate how
+			// efficiently the scheduler parallelizes goroutines in
+			// the presence of blocking:
+			//
+			// - If both goroutines are executed on the same core,
+			// an increase in delay by N will increase the time per
+			// iteration by 4*N, because all 4 delays are
+			// serialized.
+			//
+			// - Otherwise, an increase in delay by N will increase
+			// the time per iteration by 2*N, and the time per
+			// iteration is 2 * (runtime overhead + chan
+			// send/receive pair + delay + wakeDelay). This allows
+			// the runtime overhead, including the time it takes
+			// for the unblocked goroutine to be scheduled, to be
+			// estimated.
+			ping, pong := make(chan struct{}), make(chan struct{})
+			start := make(chan struct{})
+			done := make(chan struct{})
+			go func() {
+				<-start
+				for i := 0; i < b.N; i++ {
+					// sender
+					spin(delay + wakeDelay)
+					ping <- struct{}{}
+					// receiver
+					spin(delay)
+					<-pong
+				}
+				done <- struct{}{}
+			}()
+			go func() {
+				for i := 0; i < b.N; i++ {
+					// receiver
+					spin(delay)
+					<-ping
+					// sender
+					spin(delay + wakeDelay)
+					pong <- struct{}{}
+				}
+				done <- struct{}{}
+			}()
+			b.ResetTimer()
+			start <- struct{}{}
+			<-done
+			<-done
+		})
+	}
+}
+
+func BenchmarkWakeupParallelSpinning(b *testing.B) {
+	benchmarkWakeupParallel(b, func(d time.Duration) {
+		end := time.Now().Add(d)
+		for time.Now().Before(end) {
+			// do nothing
+		}
+	})
+}
+
+func BenchmarkWakeupParallelSyscall(b *testing.B) {
+	benchmarkWakeupParallel(b, func(d time.Duration) {
+		// Invoke a blocking syscall directly; calling time.Sleep()
+		// would deschedule the goroutine instead.
+		ts := syscall.NsecToTimespec(d.Nanoseconds())
+		for {
+			if err := syscall.Nanosleep(&ts, &ts); err != syscall.EINTR {
+				return
+			}
+		}
+	})
+}
+
 type Matrix [][]float64
 
 func BenchmarkMatmult(b *testing.B) {