]> Cypherpunks repositories - gostls13.git/commitdiff
runtime: sleep less when we can do work
authorDmitry Vyukov <dvyukov@google.com>
Fri, 18 Mar 2016 10:00:03 +0000 (11:00 +0100)
committerDmitry Vyukov <dvyukov@google.com>
Tue, 5 Apr 2016 15:32:06 +0000 (15:32 +0000)
Usleep(100) in runqgrab negatively affects latency and throughput
of parallel application. We are sleeping instead of doing useful work.
This is effect is particularly visible on windows where minimal
sleep duration is 1-15ms.

Reduce sleep from 100us to 3us and use osyield on windows.
Sync chan send/recv takes ~50ns, so 3us gives us ~50x overshoot.

benchmark                    old ns/op     new ns/op     delta
BenchmarkChanSync-12         216           217           +0.46%
BenchmarkChanSyncWork-12     27213         25816         -5.13%

CPU consumption goes up from 106% to 108% in the first case,
and from 107% to 125% in the second case.

Test case from #14790 on windows:

BenchmarkDefaultResolution-8  4583372   29720    -99.35%
Benchmark1ms-8                992056    30701    -96.91%

99-th latency percentile for HTTP request serving is improved by up to 15%
(see http://golang.org/cl/20835 for details).

The following benchmarks are from the change that originally added this sleep
(see https://golang.org/s/go15gomaxprocs):

name        old time/op  new time/op  delta
Chain       22.6µs ± 2%  22.7µs ± 6%    ~      (p=0.905 n=9+10)
ChainBuf    22.4µs ± 3%  22.5µs ± 4%    ~      (p=0.780 n=9+10)
Chain-2     23.5µs ± 4%  24.9µs ± 1%  +5.66%   (p=0.000 n=10+9)
ChainBuf-2  23.7µs ± 1%  24.4µs ± 1%  +3.31%   (p=0.000 n=9+10)
Chain-4     24.2µs ± 2%  25.1µs ± 3%  +3.70%   (p=0.000 n=9+10)
ChainBuf-4  24.4µs ± 5%  25.0µs ± 2%  +2.37%  (p=0.023 n=10+10)
Powser       2.37s ± 1%   2.37s ± 1%    ~       (p=0.423 n=8+9)
Powser-2     2.48s ± 2%   2.57s ± 2%  +3.74%   (p=0.000 n=10+9)
Powser-4     2.66s ± 1%   2.75s ± 1%  +3.40%  (p=0.000 n=10+10)
Sieve        13.3s ± 2%   13.3s ± 2%    ~      (p=1.000 n=10+9)
Sieve-2      7.00s ± 2%   7.44s ±16%    ~      (p=0.408 n=8+10)
Sieve-4      4.13s ±21%   3.85s ±22%    ~       (p=0.113 n=9+9)

Fixes #14790

Change-Id: Ie7c6a1c4f9c8eb2f5d65ab127a3845386d6f8b5d
Reviewed-on: https://go-review.googlesource.com/20835
Reviewed-by: Austin Clements <austin@google.com>
src/runtime/chan_test.go
src/runtime/proc.go

index 911821bea58d0a6af99f86bfaaf5d0fcab4e3095..219f2b449bf03fa7bd69db94f64c63f428c32726 100644 (file)
@@ -777,7 +777,7 @@ func BenchmarkChanContended(b *testing.B) {
        })
 }
 
-func BenchmarkChanSync(b *testing.B) {
+func benchmarkChanSync(b *testing.B, work int) {
        const CallsPerSched = 1000
        procs := 2
        N := int32(b.N / CallsPerSched / procs * procs)
@@ -793,10 +793,14 @@ func BenchmarkChanSync(b *testing.B) {
                                for g := 0; g < CallsPerSched; g++ {
                                        if i%2 == 0 {
                                                <-myc
+                                               localWork(work)
                                                myc <- 0
+                                               localWork(work)
                                        } else {
                                                myc <- 0
+                                               localWork(work)
                                                <-myc
+                                               localWork(work)
                                        }
                                }
                        }
@@ -808,6 +812,14 @@ func BenchmarkChanSync(b *testing.B) {
        }
 }
 
+func BenchmarkChanSync(b *testing.B) {
+       benchmarkChanSync(b, 0)
+}
+
+func BenchmarkChanSyncWork(b *testing.B) {
+       benchmarkChanSync(b, 1000)
+}
+
 func benchmarkChanProdCons(b *testing.B, chanSize, localWork int) {
        const CallsPerSched = 1000
        procs := runtime.GOMAXPROCS(-1)
@@ -981,3 +993,18 @@ func BenchmarkChanPopular(b *testing.B) {
        }
        wg.Wait()
 }
+
+var (
+       alwaysFalse = false
+       workSink    = 0
+)
+
+func localWork(w int) {
+       foo := 0
+       for i := 0; i < w; i++ {
+               foo /= (foo + 1)
+       }
+       if alwaysFalse {
+               workSink += foo
+       }
+}
index 1e1987ff9e01c397b4c99b477c1ed4fa0b2b15d7..2cc29df43466109dc2ce34a18afafb9c4229ae5a 100644 (file)
@@ -3999,7 +3999,16 @@ func runqgrab(_p_ *p, batch *[256]guintptr, batchHead uint32, stealRunNextG bool
                                        // Instead of stealing runnext in this window, back off
                                        // to give _p_ a chance to schedule runnext. This will avoid
                                        // thrashing gs between different Ps.
-                                       usleep(100)
+                                       // A sync chan send/recv takes ~50ns as of time of writing,
+                                       // so 3us gives ~50x overshoot.
+                                       if GOOS != "windows" {
+                                               usleep(3)
+                                       } else {
+                                               // On windows system timer granularity is 1-15ms,
+                                               // which is way too much for this optimization.
+                                               // So just yield.
+                                               osyield()
+                                       }
                                        if !_p_.runnext.cas(next, 0) {
                                                continue
                                        }