From c4fe503119ee517e202e81913f08b2e9940e1a72 Mon Sep 17 00:00:00 2001 From: Rick Hudson Date: Thu, 7 May 2015 17:19:30 -0400 Subject: [PATCH] runtime: reduce thrashing of gs between ps MIME-Version: 1.0 Content-Type: text/plain; charset=utf8 Content-Transfer-Encoding: 8bit One important use case is a pipeline computation that pass values from one Goroutine to the next and then exits or is placed in a wait state. If GOMAXPROCS > 1 a Goroutine running on P1 will enable another Goroutine and then immediately make P1 available to execute it. We need to prevent other Ps from stealing the G that P1 is about to execute. Otherwise the Gs can thrash between Ps causing unneeded synchronization and slowing down throughput. Fix this by changing the stealing logic so that when a P attempts to steal the only G on some other P's run queue, it will pause momentarily to allow the victim P to schedule the G. As part of optimizing stealing we also use a per P victim queue move stolen gs. This eliminates the zeroing of a stack local victim queue which turned out to be expensive. This CL is a necessary but not sufficient prerequisite to changing the default value of GOMAXPROCS to something > 1 which is another CL/discussion. For highly serialized programs, such as GoroutineRing below this can make a large difference. For larger and more parallel programs such as the x/benchmarks there is no noticeable detriment. ~/work/code/src/rsc.io/benchstat/benchstat old.txt new.txt name old mean new mean delta GoroutineRing 30.2µs × (0.98,1.01) 30.1µs × (0.97,1.04) ~ (p=0.941) GoroutineRing-2 113µs × (0.91,1.07) 30µs × (0.98,1.03) -73.17% (p=0.004) GoroutineRing-4 144µs × (0.98,1.02) 32µs × (0.98,1.01) -77.69% (p=0.000) GoroutineRingBuf 32.7µs × (0.97,1.03) 32.5µs × (0.97,1.02) ~ (p=0.795) GoroutineRingBuf-2 120µs × (0.92,1.08) 33µs × (1.00,1.00) -72.48% (p=0.004) GoroutineRingBuf-4 138µs × (0.92,1.06) 33µs × (1.00,1.00) -76.21% (p=0.003) The bench benchmarks show little impact. old new garbage 7032879 7011696 httpold 25509 25301 splayold 1022073 1019499 jsonold 28230624 28081433 Change-Id: I228c48fed8d85c9bbef16a7edc53ab7898506f50 Reviewed-on: https://go-review.googlesource.com/9872 Reviewed-by: Austin Clements --- src/runtime/proc1.go | 44 +++++++++++++++++++++++++---------------- src/runtime/runtime2.go | 7 ++++--- 2 files changed, 31 insertions(+), 20 deletions(-) diff --git a/src/runtime/proc1.go b/src/runtime/proc1.go index 2fe1551952..8aeacee747 100644 --- a/src/runtime/proc1.go +++ b/src/runtime/proc1.go @@ -1420,7 +1420,7 @@ top: xadd(&sched.nmspinning, 1) } // random steal from other P's - for i := 0; i < int(2*gomaxprocs); i++ { + for i := 0; i < int(4*gomaxprocs); i++ { if sched.gcwaiting != 0 { goto top } @@ -1429,15 +1429,17 @@ top: if _p_ == _g_.m.p.ptr() { gp, _ = runqget(_p_) } else { - gp = runqsteal(_g_.m.p.ptr(), _p_) + stealRunNextG := i > 2*int(gomaxprocs) // first look for ready queues with more than 1 g + gp = runqsteal(_g_.m.p.ptr(), _p_, stealRunNextG) } if gp != nil { return gp, false } } + stop: - // We have nothing to do. If we're in the GC mark phaseand can + // We have nothing to do. If we're in the GC mark phase and can // safely scan and blacken objects, run idle-time marking // rather than give up the P. if _p_ := _g_.m.p.ptr(); gcBlackenEnabled != 0 && _p_.gcBgMarkWorker != nil { @@ -3461,20 +3463,30 @@ func runqget(_p_ *p) (gp *g, inheritTime bool) { // Grabs a batch of goroutines from local runnable queue. // batch array must be of size len(p->runq)/2. Returns number of grabbed goroutines. // Can be executed by any P. -func runqgrab(_p_ *p, batch []*g) uint32 { +func runqgrab(_p_ *p, batch []*g, stealRunNextG bool) uint32 { for { h := atomicload(&_p_.runqhead) // load-acquire, synchronize with other consumers t := atomicload(&_p_.runqtail) // load-acquire, synchronize with the producer n := t - h n = n - n/2 if n == 0 { - // Try to steal from _p_.runnext. - if next := _p_.runnext; next != 0 { - if !_p_.runnext.cas(next, 0) { - continue + if stealRunNextG { + // Try to steal from _p_.runnext. + if next := _p_.runnext; next != 0 { + // Sleep to ensure that _p_ isn't about to run the g we + // are about to steal. + // The important use case here is when the g running on _p_ + // ready()s another g and then almost immediately blocks. + // Instead of stealing runnext in this window, back off + // to give _p_ a chance to schedule runnext. This will avoid + // thrashing gs between different Ps. + usleep(100) + if !_p_.runnext.cas(next, 0) { + continue + } + batch[0] = next.ptr() + return 1 } - batch[0] = next.ptr() - return 1 } return 0 } @@ -3493,15 +3505,13 @@ func runqgrab(_p_ *p, batch []*g) uint32 { // Steal half of elements from local runnable queue of p2 // and put onto local runnable queue of p. // Returns one of the stolen elements (or nil if failed). -func runqsteal(_p_, p2 *p) *g { - var batch [len(_p_.runq) / 2]*g - - n := runqgrab(p2, batch[:]) +func runqsteal(_p_, p2 *p, stealRunNextG bool) *g { + n := runqgrab(p2, _p_.runqvictims[:], stealRunNextG) if n == 0 { return nil } n-- - gp := batch[n] + gp := _p_.runqvictims[n] if n == 0 { return gp } @@ -3511,7 +3521,7 @@ func runqsteal(_p_, p2 *p) *g { throw("runqsteal: runq overflow") } for i := uint32(0); i < n; i++ { - _p_.runq[(t+i)%uint32(len(_p_.runq))] = batch[i] + _p_.runq[(t+i)%uint32(len(_p_.runq))] = _p_.runqvictims[i] } atomicstore(&_p_.runqtail, t+n) // store-release, makes the item available for consumption return gp @@ -3548,7 +3558,7 @@ func testSchedLocalQueueSteal() { gs[j].sig = 0 runqput(p1, &gs[j], false) } - gp := runqsteal(p2, p1) + gp := runqsteal(p2, p1, true) s := 0 if gp != nil { s++ diff --git a/src/runtime/runtime2.go b/src/runtime/runtime2.go index 8dfece5845..ae93bb8dcb 100644 --- a/src/runtime/runtime2.go +++ b/src/runtime/runtime2.go @@ -353,9 +353,10 @@ type p struct { goidcacheend uint64 // Queue of runnable goroutines. Accessed without lock. - runqhead uint32 - runqtail uint32 - runq [256]*g + runqhead uint32 + runqtail uint32 + runq [256]*g + runqvictims [128]*g // Used to stage victims from another p's runq // runnext, if non-nil, is a runnable G that was ready'd by // the current G and should be run next instead of what's in // runq if there's time remaining in the running G's time -- 2.48.1