Usleep(100) in runqgrab negatively affects latency and throughput
of parallel applications: we are sleeping instead of doing useful
work. The effect is particularly visible on windows, where the
minimal sleep duration is 1-15ms.

Reduce the sleep from 100us to 3us, and use osyield on windows.
A sync chan send/recv takes ~50ns, so 3us gives us ~60x overshoot.
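
For illustration only (not part of this change), a minimal
standalone program that makes the timer granularity visible; on
windows the observed per-sleep time is the 1-15ms system timer
tick rather than the requested 100us:

	package main

	import (
		"fmt"
		"time"
	)

	func main() {
		// Request many 100us sleeps and report the observed average.
		// On windows each request is rounded up to the system timer
		// tick (1-15ms); on linux the average stays close to 100us.
		const n = 100
		const d = 100 * time.Microsecond
		start := time.Now()
		for i := 0; i < n; i++ {
			time.Sleep(d)
		}
		fmt.Printf("requested %v, observed %v per sleep\n",
			d, time.Since(start)/n)
	}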

benchmark                     old ns/op     new ns/op     delta
BenchmarkChanSync-12          216           217           +0.46%
BenchmarkChanSyncWork-12      27213         25816         -5.13%

CPU consumption goes up from 106% to 108% in the first case,
and from 107% to 125% in the second case.

Test case from #14790 on windows:

benchmark                     old ns/op     new ns/op     delta
BenchmarkDefaultResolution-8  4583372       29720         -99.35%
Benchmark1ms-8                992056        30701         -96.91%

The 99th percentile latency of HTTP request serving improves by up
to 15% (see http://golang.org/cl/20835 for details).

The following benchmarks are from the change that originally added this sleep
(see https://golang.org/s/go15gomaxprocs):

name        old time/op  new time/op  delta
Chain       22.6µs ± 2%  22.7µs ± 6%    ~      (p=0.905 n=9+10)
ChainBuf    22.4µs ± 3%  22.5µs ± 4%    ~      (p=0.780 n=9+10)
Chain-2     23.5µs ± 4%  24.9µs ± 1%  +5.66%   (p=0.000 n=10+9)
ChainBuf-2  23.7µs ± 1%  24.4µs ± 1%  +3.31%   (p=0.000 n=9+10)
Chain-4     24.2µs ± 2%  25.1µs ± 3%  +3.70%   (p=0.000 n=9+10)
ChainBuf-4  24.4µs ± 5%  25.0µs ± 2%  +2.37%   (p=0.023 n=10+10)
Powser       2.37s ± 1%   2.37s ± 1%    ~      (p=0.423 n=8+9)
Powser-2     2.48s ± 2%   2.57s ± 2%  +3.74%   (p=0.000 n=10+9)
Powser-4     2.66s ± 1%   2.75s ± 1%  +3.40%   (p=0.000 n=10+10)
Sieve        13.3s ± 2%   13.3s ± 2%    ~      (p=1.000 n=10+9)
Sieve-2      7.00s ± 2%   7.44s ±16%    ~      (p=0.408 n=8+10)
Sieve-4      4.13s ±21%   3.85s ±22%    ~      (p=0.113 n=9+9)

Fixes #14790
Change-Id: Ie7c6a1c4f9c8eb2f5d65ab127a3845386d6f8b5d
Reviewed-on: https://go-review.googlesource.com/20835
Reviewed-by: Austin Clements <austin@google.com>
})
}
-func BenchmarkChanSync(b *testing.B) {
+func benchmarkChanSync(b *testing.B, work int) {
const CallsPerSched = 1000
procs := 2
N := int32(b.N / CallsPerSched / procs * procs)
for g := 0; g < CallsPerSched; g++ {
if i%2 == 0 {
<-myc
+ localWork(work)
myc <- 0
+ localWork(work)
} else {
myc <- 0
+ localWork(work)
<-myc
+ localWork(work)
}
}
}
}
}
+func BenchmarkChanSync(b *testing.B) {
+ benchmarkChanSync(b, 0)
+}
+
+func BenchmarkChanSyncWork(b *testing.B) {
+ benchmarkChanSync(b, 1000)
+}
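+
+// To reproduce the numbers in the commit message, one can run
+// (invocation assumed; standard go test flags):
+//
+//	go test -run=NONE -bench=ChanSync -cpu=12 runtime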
+
func benchmarkChanProdCons(b *testing.B, chanSize, localWork int) {
const CallsPerSched = 1000
procs := runtime.GOMAXPROCS(-1)
}
wg.Wait()
}
+
+var (
+ alwaysFalse = false
+ workSink = 0
+)
+
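+// localWork spins for w iterations of cheap integer work; foo never
+// changes from 0, and the alwaysFalse/workSink guard keeps the
+// compiler from eliminating the loop as dead code.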
+func localWork(w int) {
+ foo := 0
+ for i := 0; i < w; i++ {
+ foo /= (foo + 1)
+ }
+ if alwaysFalse {
+ workSink += foo
+ }
+}
// Instead of stealing runnext in this window, back off
// to give _p_ a chance to schedule runnext. This will avoid
// thrashing gs between different Ps.
- usleep(100)
+ // A sync chan send/recv takes ~50ns as of time of writing,
+ // so 3us gives us ~60x overshoot.
+ if GOOS != "windows" {
+ usleep(3)
+ } else {
+ // On windows system timer granularity is 1-15ms,
+ // which is way too much for this optimization.
+ // So just yield.
+ osyield()
+ }
if !_p_.runnext.cas(next, 0) {
continue
}