From: Rhys Hiltner
Date: Wed, 29 May 2024 16:36:36 +0000 (+0000)
Subject: Revert "runtime: split mutex profile clocks"
X-Git-Tag: go1.23rc1~112
X-Git-Url: http://www.git.cypherpunks.su/?a=commitdiff_plain;h=1be701a344d1f1819dc08d78259684de1da6f923;p=gostls13.git

Revert "runtime: split mutex profile clocks"

This reverts commit 8ab131fb1256a4a795c610e145c022e22e2d1567 (CL 586796).

Reason for revert: This is part of a patch series that changed the
handling of contended lock2/unlock2 calls, reducing the maximum
throughput of contended runtime.mutex values, and causing a performance
regression on applications where that is (or became) the bottleneck.

Updates #66999
Updates #67585

Change-Id: I54711691e86e072081482102019d168292b5150a
Reviewed-on: https://go-review.googlesource.com/c/go/+/589095
Reviewed-by: Michael Pratt
Auto-Submit: Rhys Hiltner
LUCI-TryBot-Result: Go LUCI
Reviewed-by: Than McIntosh
---

diff --git a/src/runtime/lock_futex.go b/src/runtime/lock_futex.go
index ef9a800b56..9be231f2ea 100644
--- a/src/runtime/lock_futex.go
+++ b/src/runtime/lock_futex.go
@@ -80,14 +80,7 @@ func lock2(l *mutex) {
 		gp.stackguard0, gp.throwsplit = stackPreempt, true
 	}
 
-	var startNanos int64
-	const sampleRate = gTrackingPeriod
-	sample := cheaprandn(sampleRate) == 0
-	if sample {
-		startNanos = nanotime()
-	}
-	gp.m.mWaitList.acquireTicks = cputicks()
-
+	gp.m.mWaitList.acquireTimes = timePair{nanotime: nanotime(), cputicks: cputicks()}
 	// On uniprocessors, no point spinning.
 	// On multiprocessors, spin for ACTIVE_SPIN attempts.
 	spin := 0
@@ -119,7 +112,7 @@ Loop:
 
 			if v == old || atomic.Casuintptr(&l.key, old, v) {
 				gp.m.mWaitList.clearLinks()
-				gp.m.mWaitList.acquireTicks = 0
+				gp.m.mWaitList.acquireTimes = timePair{}
 				break
 			}
 			v = atomic.Loaduintptr(&l.key)
@@ -127,11 +120,6 @@ Loop:
 		if gp == gp.m.curg {
 			gp.stackguard0, gp.throwsplit = stackguard0, throwsplit
 		}
-
-		if sample {
-			endNanos := nanotime()
-			gp.m.mLockProfile.waitTime.Add((endNanos - startNanos) * sampleRate)
-		}
 		return
 	}
 	i = 0
@@ -173,8 +161,7 @@ func unlock(l *mutex) {
 }
 
 func unlock2(l *mutex) {
-	var claimed bool
-	var cycles int64
+	now, dt := timePair{nanotime: nanotime(), cputicks: cputicks()}, timePair{}
 	for {
 		v := atomic.Loaduintptr(&l.key)
 		if v == mutex_locked {
@@ -184,11 +171,10 @@
 		} else if v&mutex_locked == 0 {
 			throw("unlock of unlocked lock")
 		} else {
-			if !claimed {
-				claimed = true
-				nowTicks := cputicks()
+			if now != (timePair{}) {
 				head := muintptr(v &^ (mutex_sleeping | mutex_locked))
-				cycles = claimMutexWaitTime(nowTicks, head)
+				dt = claimMutexWaitTime(now, head)
+				now = timePair{}
 			}
 
 			// Other M's are waiting for the lock.
@@ -200,7 +186,7 @@
 	}
 
 	gp := getg()
-	gp.m.mLockProfile.recordUnlock(cycles)
+	gp.m.mLockProfile.recordUnlock(dt)
 	gp.m.locks--
 	if gp.m.locks < 0 {
 		throw("runtime·unlock: lock count")
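
For context on the lock2/unlock2 hunks above: the removed lines are CL
586796's sampled wall-clock accounting, which timed roughly one in every
gTrackingPeriod contended acquisitions and scaled the observed wait by that
rate to feed waitTime (which backs the /sync/mutex/wait/total:seconds
metric); the restored lines capture both the nanotime and cputicks clocks up
front on every contended acquisition. The following is a standalone sketch
contrasting the two strategies, not runtime code: time.Now, rand.N,
sampleRate, and waitTotal are portable stand-ins for the runtime-internal
nanotime, cheaprandn, gTrackingPeriod, and mLockProfile.waitTime.

    package main

    import (
    	"fmt"
    	"math/rand/v2"
    	"sync/atomic"
    	"time"
    )

    const sampleRate = 8 // illustrative stand-in for gTrackingPeriod

    var waitTotal atomic.Int64 // analogous to mLockProfile.waitTime

    // sampledWait mirrors the removed code: time roughly one in every
    // sampleRate waits, then multiply the observed duration by sampleRate
    // so waitTotal remains an estimate of the true total.
    func sampledWait(wait func()) {
    	if rand.N(sampleRate) == 0 {
    		start := time.Now()
    		wait()
    		waitTotal.Add(int64(time.Since(start)) * sampleRate)
    		return
    	}
    	wait()
    }

    // measuredWait mirrors the restored approach: every wait is timed, so
    // no scaling is needed and short waits are never missed by sampling.
    func measuredWait(wait func()) {
    	start := time.Now()
    	wait()
    	waitTotal.Add(int64(time.Since(start)))
    }

    func main() {
    	for i := 0; i < 100; i++ {
    		sampledWait(func() { time.Sleep(time.Millisecond) })
    	}
    	fmt.Println("estimated total wait:", time.Duration(waitTotal.Load()))
    }
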
diff --git a/src/runtime/lock_sema.go b/src/runtime/lock_sema.go
index 6c941c6514..0d7bd5b9c9 100644
--- a/src/runtime/lock_sema.go
+++ b/src/runtime/lock_sema.go
@@ -60,14 +60,7 @@ func lock2(l *mutex) {
 		gp.stackguard0, gp.throwsplit = stackPreempt, true
 	}
 
-	var startNanos int64
-	const sampleRate = gTrackingPeriod
-	sample := cheaprandn(sampleRate) == 0
-	if sample {
-		startNanos = nanotime()
-	}
-	gp.m.mWaitList.acquireTicks = cputicks()
-
+	gp.m.mWaitList.acquireTimes = timePair{nanotime: nanotime(), cputicks: cputicks()}
 	// On uniprocessors, no point spinning.
 	// On multiprocessors, spin for ACTIVE_SPIN attempts.
 	spin := 0
@@ -95,7 +88,7 @@ Loop:
 
 			if v == old || atomic.Casuintptr(&l.key, old, v) {
 				gp.m.mWaitList.clearLinks()
-				gp.m.mWaitList.acquireTicks = 0
+				gp.m.mWaitList.acquireTimes = timePair{}
 				break
 			}
 			v = atomic.Loaduintptr(&l.key)
@@ -103,11 +96,6 @@ Loop:
 		if gp == gp.m.curg {
 			gp.stackguard0, gp.throwsplit = stackguard0, throwsplit
 		}
-
-		if sample {
-			endNanos := nanotime()
-			gp.m.mLockProfile.waitTime.Add((endNanos - startNanos) * sampleRate)
-		}
 		return
 	}
 	i = 0
@@ -157,8 +145,7 @@ func unlock(l *mutex) {
 //
 //go:nowritebarrier
 func unlock2(l *mutex) {
-	var claimed bool
-	var cycles int64
+	now, dt := timePair{nanotime: nanotime(), cputicks: cputicks()}, timePair{}
 	gp := getg()
 	var mp *m
 	for {
@@ -168,11 +155,9 @@
 				break
 			}
 		} else {
-			if !claimed {
-				claimed = true
-				nowTicks := cputicks()
-				head := muintptr(v &^ locked)
-				cycles = claimMutexWaitTime(nowTicks, head)
+			if now != (timePair{}) {
+				dt = claimMutexWaitTime(now, muintptr(v&^locked))
+				now = timePair{}
 			}
 
 			// Other M's are waiting for the lock.
@@ -186,7 +171,7 @@
 		}
 	}
 
-	gp.m.mLockProfile.recordUnlock(cycles)
+	gp.m.mLockProfile.recordUnlock(dt)
 	gp.m.locks--
 	if gp.m.locks < 0 {
 		throw("runtime·unlock: lock count")
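
Note how both restored unlock2 variants above read the clocks once, before
the release loop, and then use the zero timePair as an "already claimed"
sentinel: the loop may retry its CAS, but only the first pass through the
contended branch claims the waiters' accumulated delay. A minimal standalone
sketch of that claim-once pattern follows; claimWait is a made-up stand-in
for the runtime's claimMutexWaitTime.

    package main

    import "fmt"

    type timePair struct {
    	nanotime int64
    	cputicks int64
    }

    // claimWait pretends the waiters have accumulated 100ns/1000 ticks of
    // delay; the real claimMutexWaitTime also advances the list's stamps.
    func claimWait(now timePair) timePair {
    	return timePair{nanotime: 100, cputicks: 1000}
    }

    func main() {
    	now := timePair{nanotime: 5000, cputicks: 50000}
    	var dt timePair
    	for attempt := 1; attempt <= 3; attempt++ { // a retried CAS loop
    		if now != (timePair{}) { // true only on the first pass
    			dt = claimWait(now)
    			now = timePair{} // zero value marks the delay as claimed
    		}
    	}
    	fmt.Println(dt) // {100 1000}: the delay was claimed exactly once
    }
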
diff --git a/src/runtime/mprof.go b/src/runtime/mprof.go
index 1d44164b84..fd0a018724 100644
--- a/src/runtime/mprof.go
+++ b/src/runtime/mprof.go
@@ -590,7 +590,7 @@ func saveblockevent(cycles, rate int64, skip int, which bucketType) {
 // mutex.
 //
 // Having found the head and tail nodes and a correct waiters count, the
-// unlocking M can read and update those two nodes' acquireTicks field and thus
+// unlocking M can read and update those two nodes' acquireTimes fields and thus
 // take responsibility for (an estimate of) the entire list's delay since the
 // last unlock call.
 //
@@ -603,16 +603,21 @@ func saveblockevent(cycles, rate int64, skip int, which bucketType) {
 // runtime controls the order of thread wakeups (it's a LIFO stack), but with
 // lock_futex.go the OS can wake an arbitrary thread.
 type mWaitList struct {
-	acquireTicks int64    // start of current wait (set by us, updated by others during unlock)
+	acquireTimes timePair // start of current wait (set by us, updated by others during unlock)
 	next         muintptr // next m waiting for lock (set by us, cleared by another during unlock)
 	prev         muintptr // previous m waiting for lock (an amortized hint, set by another during unlock)
 	tail         muintptr // final m waiting for lock (an amortized hint, set by others during unlock)
 	waiters      int32    // length of waiting m list (an amortized hint, set by another during unlock)
 }
 
+type timePair struct {
+	nanotime int64
+	cputicks int64
+}
+
 // clearLinks resets the fields related to the M's position in the list of Ms
-// waiting for a mutex. It leaves acquireTicks intact, since this M may still be
-// waiting and may have had its acquireTicks updated by an unlock2 call.
+// waiting for a mutex. It leaves acquireTimes intact, since this M may still be
+// waiting and may have had its acquireTimes updated by an unlock2 call.
 //
 // In lock_sema.go, the previous owner of the mutex dequeues an M and then wakes
 // it; with semaphore-based sleep, it's important that each M receives only one
@@ -731,8 +736,8 @@ func removeMutexWaitList(head muintptr, mp *m) muintptr {
 	hp := head.ptr()
 	tail := hp.mWaitList.tail
 	waiters := hp.mWaitList.waiters
-	headTicks := hp.mWaitList.acquireTicks
-	tailTicks := hp.mWaitList.tail.ptr().mWaitList.acquireTicks
+	headTimes := hp.mWaitList.acquireTimes
+	tailTimes := hp.mWaitList.tail.ptr().mWaitList.acquireTimes
 
 	mp.mWaitList.tail = 0
 
@@ -768,40 +773,42 @@
 		hp.mWaitList.prev = 0
 		hp.mWaitList.tail = tail
 		hp.mWaitList.waiters = waiters - 1
-		hp.mWaitList.acquireTicks = headTicks
+		hp.mWaitList.acquireTimes = headTimes
 	}
 	if tp := tail.ptr(); tp != nil {
-		tp.mWaitList.acquireTicks = tailTicks
+		tp.mWaitList.acquireTimes = tailTimes
 	}
 	return head
 }
 
-// claimMutexWaitTime advances the acquireTicks of the list of waiting Ms at
+// claimMutexWaitTime advances the acquireTimes of the list of waiting Ms at
 // head to now, returning an estimate of the total wait time claimed by that
 // action.
-func claimMutexWaitTime(nowTicks int64, head muintptr) int64 {
+func claimMutexWaitTime(now timePair, head muintptr) timePair {
 	fixMutexWaitList(head)
 	hp := head.ptr()
 	if hp == nil {
-		return 0
+		return timePair{}
 	}
 	tp := hp.mWaitList.tail.ptr()
 	waiters := hp.mWaitList.waiters
-	headTicks := hp.mWaitList.acquireTicks
-	tailTicks := tp.mWaitList.acquireTicks
+	headTimes := hp.mWaitList.acquireTimes
+	tailTimes := tp.mWaitList.acquireTimes
 
-	var cycles int64
-	cycles = nowTicks - headTicks
+	var dt timePair
+	dt.nanotime = now.nanotime - headTimes.nanotime
+	dt.cputicks = now.cputicks - headTimes.cputicks
 	if waiters > 1 {
-		cycles = int64(waiters) * (cycles + nowTicks - tailTicks) / 2
+		dt.nanotime = int64(waiters) * (dt.nanotime + now.nanotime - tailTimes.nanotime) / 2
+		dt.cputicks = int64(waiters) * (dt.cputicks + now.cputicks - tailTimes.cputicks) / 2
 	}
 
 	// When removeMutexWaitList removes a head or tail node, it's responsible
 	// for applying these changes to the new head or tail.
-	hp.mWaitList.acquireTicks = nowTicks
-	tp.mWaitList.acquireTicks = nowTicks
+	hp.mWaitList.acquireTimes = now
+	tp.mWaitList.acquireTimes = now
 
-	return cycles
+	return dt
 }
 
 // mLockProfile is part of the M struct to hold information relating to mutex
@@ -832,21 +839,26 @@ type mLockProfile struct {
 // From unlock2, we might not be holding a p in this code.
 //
 //go:nowritebarrierrec
-func (prof *mLockProfile) recordUnlock(cycles int64) {
-	if cycles != 0 {
+func (prof *mLockProfile) recordUnlock(dt timePair) {
+	if dt != (timePair{}) {
 		// We could make a point of clearing out the local storage right before
 		// this, to have a slightly better chance of being able to see the call
 		// stack if the program has several (nested) contended locks. If apps
 		// are seeing a lot of _LostContendedRuntimeLock samples, maybe that'll
 		// be a worthwhile change.
-		prof.proposeUnlock(cycles)
+		prof.proposeUnlock(dt)
 	}
 	if getg().m.locks == 1 && prof.cycles != 0 {
 		prof.store()
 	}
}
 
-func (prof *mLockProfile) proposeUnlock(cycles int64) {
+func (prof *mLockProfile) proposeUnlock(dt timePair) {
+	if nanos := dt.nanotime; nanos > 0 {
+		prof.waitTime.Add(nanos)
+	}
+
+	cycles := dt.cputicks
 	if cycles <= 0 {
 		return
 	}
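
The arithmetic in claimMutexWaitTime above estimates the whole wait list's
delay from just two timestamps: with a single waiter, the claimed wait is now
minus the head's stamp; with more waiters, the average of the head and tail
waits is scaled by the waiter count, on the assumption that the waiters'
start times are spread roughly evenly between those two. A worked standalone
example of that formula for one clock, with made-up numbers (the runtime
applies the same formula to both the nanotime and cputicks halves of each
timePair):

    package main

    import "fmt"

    // estimate mirrors the shape of the hunk above: with one waiter the
    // delay is now-head; with more, the average of the head and tail waits
    // is multiplied by the waiter count.
    func estimate(now, head, tail, waiters int64) int64 {
    	dt := now - head
    	if waiters > 1 {
    		dt = waiters * (dt + now - tail) / 2
    	}
    	return dt
    }

    func main() {
    	// Five waiters; the head-stamped M started waiting at t=0 (here
    	// assumed to be the oldest), the tail-stamped M at t=80, and the
    	// unlock happens at t=100:
    	// 5 * ((100-0) + (100-80)) / 2 = 300 units of claimed wait.
    	fmt.Println(estimate(100, 0, 80, 5))
    }
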