From fc116b69e2004c159d0f2563c6e91ac75a79f872 Mon Sep 17 00:00:00 2001 From: Michael Pratt Date: Mon, 5 Oct 2020 18:12:35 -0400 Subject: [PATCH] runtime: try to elide timer stealing if P has no timers MIME-Version: 1.0 Content-Type: text/plain; charset=utf8 Content-Transfer-Encoding: 8bit Following golang.org/cl/259578, findrunnable still must touch every other P in checkTimers in order to look for timers to steal. This scales poorly with GOMAXPROCS and potentially performs poorly by pulling remote Ps into cache. Add timerpMask, a bitmask that tracks whether each P may have any timers on its timer heap. Ideally we would update this field on any timer add / remove to always keep it up to date. Unfortunately, updating a shared global structure is antithetical to sharding timers by P, and doing so approximately doubles the cost of addtimer / deltimer in microbenchmarks. Instead we only (potentially) clear the mask when the P goes idle. This covers the best case of avoiding looking at a P _at all_ when it is idle and has no timers. See the comment on updateTimerPMask for more details on the trade-off. Future CLs may be able to expand cases we can avoid looking at the timers. Note that the addition of idlepMask to p.init is a no-op. The zero value of the mask is the correct init value so it is not necessary, but it is included for clarity. Benchmark results from WakeupParallel/syscall/pair/race/1ms (see golang.org/cl/228577). Note that these are on top of golang.org/cl/259578: name old msec new msec delta Perf-task-clock-8 244 ± 4% 246 ± 4% ~ (p=0.841 n=5+5) Perf-task-clock-16 247 ±11% 252 ± 4% ~ (p=1.000 n=5+5) Perf-task-clock-32 270 ± 1% 268 ± 2% ~ (p=0.548 n=5+5) Perf-task-clock-64 302 ± 3% 296 ± 1% ~ (p=0.222 n=5+5) Perf-task-clock-128 358 ± 3% 352 ± 2% ~ (p=0.310 n=5+5) Perf-task-clock-256 483 ± 3% 458 ± 1% -5.16% (p=0.008 n=5+5) Perf-task-clock-512 663 ± 1% 612 ± 4% -7.61% (p=0.008 n=5+5) Perf-task-clock-1024 1.06k ± 1% 0.95k ± 2% -10.24% (p=0.008 n=5+5) Updates #28808 Updates #18237 Change-Id: I4239cd89f21ad16dfbbef58d81981da48acd0605 Reviewed-on: https://go-review.googlesource.com/c/go/+/264477 Run-TryBot: Michael Pratt TryBot-Result: Go Bot Reviewed-by: Michael Knyszek Trust: Michael Pratt --- src/runtime/proc.go | 87 +++++++++++++++++++++++++++++++---------- src/runtime/runtime2.go | 17 ++++++-- 2 files changed, 81 insertions(+), 23 deletions(-) diff --git a/src/runtime/proc.go b/src/runtime/proc.go index b335e1184d..64c891d007 100644 --- a/src/runtime/proc.go +++ b/src/runtime/proc.go @@ -2512,9 +2512,9 @@ top: // the timers for each P more than once with the same value of now // is probably a waste of time. // - // TODO(prattmic): Maintain a global look-aside similar to idlepMask - // to avoid looking at p2 if it can't possibly have timers. - if stealTimersOrRunNextG { + // timerpMask tells us whether the P may have timers at all. If it + // can't, no need to check at all. + if stealTimersOrRunNextG && timerpMask.read(enum.position()) { tnow, w, ran := checkTimers(p2, now) now = tnow if w != 0 && (pollUntil == 0 || w < pollUntil) { @@ -4502,6 +4502,13 @@ func (pp *p) init(id int32) { } } lockInit(&pp.timersLock, lockRankTimers) + + // This P may get timers when it starts running. Set the mask here + // since the P may not go through pidleget (notably P 0 on startup). + timerpMask.set(id) + // Similarly, we may not go through pidleget before this P starts + // running if it is P 0 on startup. + idlepMask.clear(id) } // destroy releases all of the resources associated with pp and @@ -4647,11 +4654,16 @@ func procresize(nprocs int32) *p { if maskWords <= int32(cap(idlepMask)) { idlepMask = idlepMask[:maskWords] + timerpMask = timerpMask[:maskWords] } else { nidlepMask := make([]uint32, maskWords) // No need to copy beyond len, old Ps are irrelevant. copy(nidlepMask, idlepMask) idlepMask = nidlepMask + + ntimerpMask := make([]uint32, maskWords) + copy(ntimerpMask, timerpMask) + timerpMask = ntimerpMask } unlock(&allpLock) } @@ -4712,6 +4724,7 @@ func procresize(nprocs int32) *p { lock(&allpLock) allp = allp[:nprocs] idlepMask = idlepMask[:maskWords] + timerpMask = timerpMask[:maskWords] unlock(&allpLock) } @@ -5408,39 +5421,70 @@ func globrunqget(_p_ *p, max int32) *g { return gp } -// pIdleMask is a bitmap of of Ps in the _Pidle list, one bit per P. -type pIdleMask []uint32 +// pMask is an atomic bitstring with one bit per P. +type pMask []uint32 -// read returns true if P id is in the _Pidle list, and thus cannot have work. -func (p pIdleMask) read(id uint32) bool { +// read returns true if P id's bit is set. +func (p pMask) read(id uint32) bool { word := id / 32 mask := uint32(1) << (id % 32) return (atomic.Load(&p[word]) & mask) != 0 } -// set sets P id as idle in mask. -// -// Must be called only for a P owned by the caller. In order to maintain -// consistency, a P going idle must the idle mask simultaneously with updates -// to the idle P list under the sched.lock, otherwise a racing pidleget may -// clear the mask before pidleput sets the mask, corrupting the bitmap. -// -// N.B., procresize takes ownership of all Ps in stopTheWorldWithSema. -func (p pIdleMask) set(id int32) { +// set sets P id's bit. +func (p pMask) set(id int32) { word := id / 32 mask := uint32(1) << (id % 32) atomic.Or(&p[word], mask) } -// clear sets P id as non-idle in mask. -// -// See comment on set. -func (p pIdleMask) clear(id int32) { +// clear clears P id's bit. +func (p pMask) clear(id int32) { word := id / 32 mask := uint32(1) << (id % 32) atomic.And(&p[word], ^mask) } +// updateTimerPMask clears pp's timer mask if it has no timers on its heap. +// +// Ideally, the timer mask would be kept immediately consistent on any timer +// operations. Unfortunately, updating a shared global data structure in the +// timer hot path adds too much overhead in applications frequently switching +// between no timers and some timers. +// +// As a compromise, the timer mask is updated only on pidleget / pidleput. A +// running P (returned by pidleget) may add a timer at any time, so its mask +// must be set. An idle P (passed to pidleput) cannot add new timers while +// idle, so if it has no timers at that time, its mask may be cleared. +// +// Thus, we get the following effects on timer-stealing in findrunnable: +// +// * Idle Ps with no timers when they go idle are never checked in findrunnable +// (for work- or timer-stealing; this is the ideal case). +// * Running Ps must always be checked. +// * Idle Ps whose timers are stolen must continue to be checked until they run +// again, even after timer expiration. +// +// When the P starts running again, the mask should be set, as a timer may be +// added at any time. +// +// TODO(prattmic): Additional targeted updates may improve the above cases. +// e.g., updating the mask when stealing a timer. +func updateTimerPMask(pp *p) { + if atomic.Load(&pp.numTimers) > 0 { + return + } + + // Looks like there are no timers, however another P may transiently + // decrement numTimers when handling a timerModified timer in + // checkTimers. We must take timersLock to serialize with these changes. + lock(&pp.timersLock) + if atomic.Load(&pp.numTimers) == 0 { + timerpMask.clear(pp.id) + } + unlock(&pp.timersLock) +} + // pidleput puts p to on the _Pidle list. // // This releases ownership of p. Once sched.lock is released it is no longer @@ -5456,6 +5500,7 @@ func pidleput(_p_ *p) { if !runqempty(_p_) { throw("pidleput: P has non-empty run queue") } + updateTimerPMask(_p_) // clear if there are no timers. idlepMask.set(_p_.id) _p_.link = sched.pidle sched.pidle.set(_p_) @@ -5473,6 +5518,8 @@ func pidleget() *p { _p_ := sched.pidle.ptr() if _p_ != nil { + // Timer may get added at any time now. + timerpMask.set(_p_.id) idlepMask.clear(_p_.id) sched.pidle = _p_.link atomic.Xadd(&sched.npidle, -1) // TODO: fast atomic diff --git a/src/runtime/runtime2.go b/src/runtime/runtime2.go index a2e4411c7d..2dbc0efca3 100644 --- a/src/runtime/runtime2.go +++ b/src/runtime/runtime2.go @@ -1052,15 +1052,26 @@ var ( sched schedt newprocs int32 - // allpLock protects P-less reads and size changes of allp and - // idlepMask, and all writes to allp. + // allpLock protects P-less reads and size changes of allp, idlepMask, + // and timerpMask, and all writes to allp. allpLock mutex // len(allp) == gomaxprocs; may change at safe points, otherwise // immutable. allp []*p // Bitmask of Ps in _Pidle list, one bit per P. Reads and writes must // be atomic. Length may change at safe points. - idlepMask pIdleMask + // + // Each P must update only its own bit. In order to maintain + // consistency, a P going idle must the idle mask simultaneously with + // updates to the idle P list under the sched.lock, otherwise a racing + // pidleget may clear the mask before pidleput sets the mask, + // corrupting the bitmap. + // + // N.B., procresize takes ownership of all Ps in stopTheWorldWithSema. + idlepMask pMask + // Bitmask of Ps that may have a timer, one bit per P. Reads and writes + // must be atomic. Length may change at safe points. + timerpMask pMask // Information about what cpu features are available. // Packages outside the runtime should not use these -- 2.48.1