runtime: improve randomized stealing logic

author Dmitry Vyukov <dvyukov@google.com>

Fri, 18 Mar 2016 11:52:52 +0000 (12:52 +0100)

committer Dmitry Vyukov <dvyukov@google.com>

Fri, 25 Mar 2016 11:00:48 +0000 (11:00 +0000)
author Dmitry Vyukov <dvyukov@google.com>
Fri, 18 Mar 2016 11:52:52 +0000 (12:52 +0100)
committer Dmitry Vyukov <dvyukov@google.com>
Fri, 25 Mar 2016 11:00:48 +0000 (11:00 +0000)
diff --git a/src/runtime/proc.go b/src/runtime/proc.go

index d3867977845ff0604f5f0a0c5396ce82991de2bc..c30ce7a5a34ea1a6c9c95ed383075e44abfdade0 100644 (file)
--- a/src/runtime/proc.go
+++ b/src/runtime/proc.go
@@ -1787,11 +1787,12 @@ func findrunnable() (gp *g, inheritTime bool) {
         // an M.
  
  top:
+       _p_ := _g_.m.p.ptr()
         if sched.gcwaiting != 0 {
                 gcstopm()
                 goto top
         }
-       if _g_.m.p.ptr().runSafePointFn != 0 {
+       if _p_.runSafePointFn != 0 {
                 runSafePointFn()
         }
         if fingwait && fingwake {
@@ -1801,14 +1802,14 @@ top:
         }
  
         // local runq
-       if gp, inheritTime := runqget(_g_.m.p.ptr()); gp != nil {
+       if gp, inheritTime := runqget(_p_); gp != nil {
                 return gp, inheritTime
         }
  
         // global runq
         if sched.runqsize != 0 {
                 lock(&sched.lock)
-               gp := globrunqget(_g_.m.p.ptr(), 0)
+               gp := globrunqget(_p_, 0)
                 unlock(&sched.lock)
                 if gp != nil {
                         return gp, false
@@ -1833,31 +1834,33 @@ top:
                 }
         }
  
+       // Steal work from other P's.
+       procs := uint32(gomaxprocs)
+       if atomic.Load(&sched.npidle) == procs-1 {
+               // Either GOMAXPROCS=1 or everybody, except for us, is idle already.
+               // New work can appear from returning syscall/cgocall, network or timers.
+               // Neither of that submits to local run queues, so no point in stealing.
+               goto stop
+       }
         // If number of spinning M's >= number of busy P's, block.
         // This is necessary to prevent excessive CPU consumption
         // when GOMAXPROCS>>1 but the program parallelism is low.
-       if !_g_.m.spinning && 2*atomic.Load(&sched.nmspinning) >= uint32(gomaxprocs)-atomic.Load(&sched.npidle) { // TODO: fast atomic
+       if !_g_.m.spinning && 2*atomic.Load(&sched.nmspinning) >= procs-atomic.Load(&sched.npidle) { // TODO: fast atomic
                 goto stop
         }
         if !_g_.m.spinning {
                 _g_.m.spinning = true
                 atomic.Xadd(&sched.nmspinning, 1)
         }
-       // random steal from other P's
-       for i := 0; i < int(4*gomaxprocs); i++ {
-               if sched.gcwaiting != 0 {
-                       goto top
-               }
-               _p_ := allp[fastrand1()%uint32(gomaxprocs)]
-               var gp *g
-               if _p_ == _g_.m.p.ptr() {
-                       gp, _ = runqget(_p_)
-               } else {
-                       stealRunNextG := i > 2*int(gomaxprocs) // first look for ready queues with more than 1 g
-                       gp = runqsteal(_g_.m.p.ptr(), _p_, stealRunNextG)
-               }
-               if gp != nil {
-                       return gp, false
+       for i := 0; i < 4; i++ {
+               for enum := stealOrder.start(fastrand1()); !enum.done(); enum.next() {
+                       if sched.gcwaiting != 0 {
+                               goto top
+                       }
+                       stealRunNextG := i > 2 // first look for ready queues with more than 1 g
+                       if gp := runqsteal(_p_, allp[enum.position()], stealRunNextG); gp != nil {
+                               return gp, false
+                       }
                 }
         }
  
@@ -1866,7 +1869,7 @@ stop:
         // We have nothing to do. If we're in the GC mark phase, can
         // safely scan and blacken objects, and have work to do, run
         // idle-time marking rather than give up the P.
-       if _p_ := _g_.m.p.ptr(); gcBlackenEnabled != 0 && _p_.gcBgMarkWorker != 0 && gcMarkWorkAvailable(_p_) {
+       if gcBlackenEnabled != 0 && _p_.gcBgMarkWorker != 0 && gcMarkWorkAvailable(_p_) {
                 _p_.gcMarkWorkerMode = gcMarkWorkerIdleMode
                 gp := _p_.gcBgMarkWorker.ptr()
                 casgstatus(gp, _Gwaiting, _Grunnable)
@@ -1878,16 +1881,18 @@ stop:
  
         // return P and block
         lock(&sched.lock)
-       if sched.gcwaiting != 0 || _g_.m.p.ptr().runSafePointFn != 0 {
+       if sched.gcwaiting != 0 || _p_.runSafePointFn != 0 {
                 unlock(&sched.lock)
                 goto top
         }
         if sched.runqsize != 0 {
-               gp := globrunqget(_g_.m.p.ptr(), 0)
+               gp := globrunqget(_p_, 0)
                 unlock(&sched.lock)
                 return gp, false
         }
-       _p_ := releasep()
+       if releasep() != _p_ {
+               throw("findrunnable: wrong p")
+       }
         pidleput(_p_)
         unlock(&sched.lock)
  
@@ -3265,6 +3270,7 @@ func procresize(nprocs int32) *p {
                         runnablePs = p
                 }
         }
+       stealOrder.reset(uint32(nprocs))
         var int32p *int32 = &gomaxprocs // make compiler check that gomaxprocs is an int32
         atomic.Store((*uint32)(unsafe.Pointer(int32p)), uint32(nprocs))
         return runnablePs
@@ -4121,3 +4127,59 @@ func sync_runtime_canSpin(i int) bool {
  func sync_runtime_doSpin() {
         procyield(active_spin_cnt)
  }
+
+var stealOrder randomOrder
+
+// randomOrder/randomEnum are helper types for randomized work stealing.
+// They allow to enumerate all Ps in different pseudo-random orders without repetitions.
+// The algorithm is based on the fact that if we have X such that X and GOMAXPROCS
+// are coprime, then a sequences of (i + X) % GOMAXPROCS gives the required enumeration.
+type randomOrder struct {
+       count    uint32
+       coprimes []uint32
+}
+
+type randomEnum struct {
+       i     uint32
+       count uint32
+       pos   uint32
+       inc   uint32
+}
+
+func (ord *randomOrder) reset(count uint32) {
+       ord.count = count
+       ord.coprimes = ord.coprimes[:0]
+       for i := uint32(1); i <= count; i++ {
+               if gcd(i, count) == 1 {
+                       ord.coprimes = append(ord.coprimes, i)
+               }
+       }
+}
+
+func (ord *randomOrder) start(i uint32) randomEnum {
+       return randomEnum{
+               count: ord.count,
+               pos:   i % ord.count,
+               inc:   ord.coprimes[i%uint32(len(ord.coprimes))],
+       }
+}
+
+func (enum *randomEnum) done() bool {
+       return enum.i == enum.count
+}
+
+func (enum *randomEnum) next() {
+       enum.i++
+       enum.pos = (enum.pos + enum.inc) % enum.count
+}
+
+func (enum *randomEnum) position() uint32 {
+       return enum.pos
+}
+
+func gcd(a, b uint32) uint32 {
+       for b != 0 {
+               a, b = b, a%b
+       }
+       return a
+}
diff --git a/src/runtime/proc_runtime_test.go b/src/runtime/proc_runtime_test.go

new file mode 100644 (file)

index 0000000..a7bde2c
--- /dev/null
+++ b/src/runtime/proc_runtime_test.go
@@ -0,0 +1,33 @@
+// Copyright 2016 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// Proc unit tests. In runtime package so can use runtime guts.
+
+package runtime
+
+func RunStealOrderTest() {
+       var ord randomOrder
+       for procs := 1; procs <= 64; procs++ {
+               ord.reset(uint32(procs))
+               if procs >= 3 && len(ord.coprimes) < 2 {
+                       panic("too few coprimes")
+               }
+               for co := 0; co < len(ord.coprimes); co++ {
+                       enum := ord.start(uint32(co))
+                       checked := make([]bool, procs)
+                       for p := 0; p < procs; p++ {
+                               x := enum.position()
+                               if checked[x] {
+                                       println("procs:", procs, "inc:", enum.inc)
+                                       panic("duplicate during enumeration")
+                               }
+                               checked[x] = true
+                               enum.next()
+                       }
+                       if !enum.done() {
+                               panic("not done")
+                       }
+               }
+       }
+}
diff --git a/src/runtime/proc_test.go b/src/runtime/proc_test.go

index fd12945be0c35edc5eec1c1d5a0fc36d80a91afe..b1d7f75870e602856b1d4ab48f3acfea97081d2e 100644 (file)
--- a/src/runtime/proc_test.go
+++ b/src/runtime/proc_test.go
@@ -689,3 +689,7 @@ func matmult(done chan<- struct{}, A, B, C Matrix, i0, i1, j0, j1, k0, k1, thres
                 done <- struct{}{}
         }
  }
+
+func TestStealOrder(t *testing.T) {
+       runtime.RunStealOrderTest()
+}
author	Dmitry Vyukov <dvyukov@google.com>
	Fri, 18 Mar 2016 11:52:52 +0000 (12:52 +0100)
committer	Dmitry Vyukov <dvyukov@google.com>
	Fri, 25 Mar 2016 11:00:48 +0000 (11:00 +0000)
src/runtime/proc.go		patch \| blob \| history
src/runtime/proc_runtime_test.go	[new file with mode: 0644]	patch \| blob
src/runtime/proc_test.go		patch \| blob \| history