From: Rhys Hiltner
Date: Tue, 14 May 2024 19:32:14 +0000 (-0700)
Subject: runtime: profile mutex contention during unlock
X-Git-Tag: go1.23rc1~249
X-Git-Url: http://www.git.cypherpunks.su/?a=commitdiff_plain;h=ba1c5b2c4573e10f3b5f0e0f25a27f17fba67eb0;p=gostls13.git

runtime: profile mutex contention during unlock

When an M's use of a lock causes delays in other Ms, capture the stack
of the unlock call that caused the delay. This makes the meaning of the
mutex profile for runtime-internal mutexes match the behavior for
sync.Mutex: the profile points to the end of the critical section that
is responsible for delaying other work.

Fixes #66999

Change-Id: I4abc4a1df00a48765d29c07776481a1cbd539ff8
Reviewed-on: https://go-review.googlesource.com/c/go/+/585638
Auto-Submit: Rhys Hiltner
LUCI-TryBot-Result: Go LUCI
Reviewed-by: Carlos Amedee
Reviewed-by: Michael Pratt
---

diff --git a/src/runtime/lock_futex.go b/src/runtime/lock_futex.go
index 5c7c3a85fb..9be231f2ea 100644
--- a/src/runtime/lock_futex.go
+++ b/src/runtime/lock_futex.go
@@ -72,9 +72,6 @@ func lock2(l *mutex) {
         return
     }
 
-    timer := &lockTimer{lock: l}
-    timer.begin()
-
     // If a goroutine's stack needed to grow during a lock2 call, the M could
     // end up with two active lock2 calls (one each on curg and g0). If both are
     // contended, the call on g0 will corrupt mWaitList. Disable stack growth.
@@ -83,6 +80,7 @@ func lock2(l *mutex) {
         gp.stackguard0, gp.throwsplit = stackPreempt, true
     }
 
+    gp.m.mWaitList.acquireTimes = timePair{nanotime: nanotime(), cputicks: cputicks()}
     // On uniprocessors, no point spinning.
     // On multiprocessors, spin for ACTIVE_SPIN attempts.
     spin := 0
@@ -114,6 +112,7 @@ Loop:
             if v == old || atomic.Casuintptr(&l.key, old, v) {
                 gp.m.mWaitList.clearLinks()
+                gp.m.mWaitList.acquireTimes = timePair{}
                 break
             }
             v = atomic.Loaduintptr(&l.key)
@@ -121,7 +120,6 @@ Loop:
             if gp == gp.m.curg {
                 gp.stackguard0, gp.throwsplit = stackguard0, throwsplit
             }
-            timer.end()
             return
         }
         i = 0
@@ -163,6 +161,7 @@ func unlock(l *mutex) {
 }
 
 func unlock2(l *mutex) {
+    now, dt := timePair{nanotime: nanotime(), cputicks: cputicks()}, timePair{}
     for {
         v := atomic.Loaduintptr(&l.key)
         if v == mutex_locked {
@@ -172,6 +171,12 @@
         } else if v&mutex_locked == 0 {
             throw("unlock of unlocked lock")
         } else {
+            if now != (timePair{}) {
+                head := muintptr(v &^ (mutex_sleeping | mutex_locked))
+                dt = claimMutexWaitTime(now, head)
+                now = timePair{}
+            }
+
             // Other M's are waiting for the lock.
             if atomic.Casuintptr(&l.key, v, v&^mutex_locked) {
                 futexwakeup(key32(&l.key), 1)
@@ -181,7 +186,7 @@
     }
 
     gp := getg()
-    gp.m.mLockProfile.recordUnlock(l)
+    gp.m.mLockProfile.recordUnlock(dt)
     gp.m.locks--
     if gp.m.locks < 0 {
         throw("runtime·unlock: lock count")
diff --git a/src/runtime/lock_sema.go b/src/runtime/lock_sema.go
index 907f1c2a0d..0d7bd5b9c9 100644
--- a/src/runtime/lock_sema.go
+++ b/src/runtime/lock_sema.go
@@ -52,9 +52,6 @@ func lock2(l *mutex) {
     }
     semacreate(gp.m)
 
-    timer := &lockTimer{lock: l}
-    timer.begin()
-
     // If a goroutine's stack needed to grow during a lock2 call, the M could
     // end up with two active lock2 calls (one each on curg and g0). If both are
     // contended, the call on g0 will corrupt mWaitList. Disable stack growth.
@@ -63,6 +60,7 @@ func lock2(l *mutex) {
         gp.stackguard0, gp.throwsplit = stackPreempt, true
     }
 
+    gp.m.mWaitList.acquireTimes = timePair{nanotime: nanotime(), cputicks: cputicks()}
     // On uniprocessors, no point spinning.
     // On multiprocessors, spin for ACTIVE_SPIN attempts.
     spin := 0
@@ -90,6 +88,7 @@ Loop:
             if v == old || atomic.Casuintptr(&l.key, old, v) {
                 gp.m.mWaitList.clearLinks()
+                gp.m.mWaitList.acquireTimes = timePair{}
                 break
             }
             v = atomic.Loaduintptr(&l.key)
@@ -97,7 +96,6 @@ Loop:
             if gp == gp.m.curg {
                 gp.stackguard0, gp.throwsplit = stackguard0, throwsplit
             }
-            timer.end()
             return
         }
         i = 0
@@ -147,6 +145,7 @@ func unlock(l *mutex) {
 //
 //go:nowritebarrier
 func unlock2(l *mutex) {
+    now, dt := timePair{nanotime: nanotime(), cputicks: cputicks()}, timePair{}
     gp := getg()
     var mp *m
     for {
@@ -156,6 +155,11 @@ func unlock2(l *mutex) {
                 break
             }
         } else {
+            if now != (timePair{}) {
+                dt = claimMutexWaitTime(now, muintptr(v&^locked))
+                now = timePair{}
+            }
+
             // Other M's are waiting for the lock.
             // Dequeue an M.
             mp = muintptr(v &^ locked).ptr()
@@ -166,7 +170,8 @@ func unlock2(l *mutex) {
             }
         }
     }
-    gp.m.mLockProfile.recordUnlock(l)
+
+    gp.m.mLockProfile.recordUnlock(dt)
     gp.m.locks--
     if gp.m.locks < 0 {
         throw("runtime·unlock: lock count")
diff --git a/src/runtime/mprof.go b/src/runtime/mprof.go
index 6260780850..93d49275c9 100644
--- a/src/runtime/mprof.go
+++ b/src/runtime/mprof.go
@@ -781,99 +781,34 @@ func removeMutexWaitList(head muintptr, mp *m) muintptr {
     return head
 }
 
-// lockTimer assists with profiling contention on runtime-internal locks.
-//
-// There are several steps between the time that an M experiences contention and
-// when that contention may be added to the profile. This comes from our
-// constraints: We need to keep the critical section of each lock small,
-// especially when those locks are contended. The reporting code cannot acquire
-// new locks until the M has released all other locks, which means no memory
-// allocations and encourages use of (temporary) M-local storage.
-//
-// The M will have space for storing one call stack that caused contention, and
-// for the magnitude of that contention. It will also have space to store the
-// magnitude of additional contention the M caused, since it only has space to
-// remember one call stack and might encounter several contention events before
-// it releases all of its locks and is thus able to transfer the local buffer
-// into the profile.
-//
-// The M will collect the call stack when it unlocks the contended lock. That
-// minimizes the impact on the critical section of the contended lock, and
-// matches the mutex profile's behavior for contention in sync.Mutex: measured
-// at the Unlock method.
-//
-// The profile for contention on sync.Mutex blames the caller of Unlock for the
-// amount of contention experienced by the callers of Lock which had to wait.
-// When there are several critical sections, this allows identifying which of
-// them is responsible.
-//
-// Matching that behavior for runtime-internal locks will require identifying
-// which Ms are blocked on the mutex. The semaphore-based implementation is
-// ready to allow that, but the futex-based implementation will require a bit
-// more work. Until then, we report contention on runtime-internal locks with a
-// call stack taken from the unlock call (like the rest of the user-space
-// "mutex" profile), but assign it a duration value based on how long the
-// previous lock call took (like the user-space "block" profile).
-//
-// Thus, reporting the call stacks of runtime-internal lock contention is
-// guarded by GODEBUG for now. Set GODEBUG=runtimecontentionstacks=1 to enable.
-//
-// TODO(rhysh): plumb through the delay duration, remove GODEBUG, update comment
-//
-// The M will track this by storing a pointer to the lock; lock/unlock pairs for
-// runtime-internal locks are always on the same M.
-//
-// Together, that demands several steps for recording contention. First, when
-// finally acquiring a contended lock, the M decides whether it should plan to
-// profile that event by storing a pointer to the lock in its "to be profiled
-// upon unlock" field. If that field is already set, it uses the relative
-// magnitudes to weight a random choice between itself and the other lock, with
-// the loser's time being added to the "additional contention" field. Otherwise
-// if the M's call stack buffer is occupied, it does the comparison against that
-// sample's magnitude.
-//
-// Second, having unlocked a mutex the M checks to see if it should capture the
-// call stack into its local buffer. Finally, when the M unlocks its last mutex,
-// it transfers the local buffer into the profile. As part of that step, it also
-// transfers any "additional contention" time to the profile. Any lock
-// contention that it experiences while adding samples to the profile will be
-// recorded later as "additional contention" and not include a call stack, to
-// avoid an echo.
-type lockTimer struct {
-    lock      *mutex
-    timeRate  int64
-    timeStart int64
-    tickStart int64
-}
-
-func (lt *lockTimer) begin() {
-    rate := int64(atomic.Load64(&mutexprofilerate))
-
-    lt.timeRate = gTrackingPeriod
-    if rate != 0 && rate < lt.timeRate {
-        lt.timeRate = rate
-    }
-    if int64(cheaprand())%lt.timeRate == 0 {
-        lt.timeStart = nanotime()
+// claimMutexWaitTime advances the acquireTimes of the list of waiting Ms at
+// head to now, returning an estimate of the total wait time claimed by that
+// action.
+func claimMutexWaitTime(now timePair, head muintptr) timePair {
+    fixMutexWaitList(head)
+    hp := head.ptr()
+    if hp == nil {
+        return timePair{}
     }
+    tp := hp.mWaitList.tail.ptr()
+    waiters := hp.mWaitList.waiters
+    headTimes := hp.mWaitList.acquireTimes
+    tailTimes := tp.mWaitList.acquireTimes
 
-    if rate > 0 && int64(cheaprand())%rate == 0 {
-        lt.tickStart = cputicks()
+    var dt timePair
+    dt.nanotime = now.nanotime - headTimes.nanotime
+    dt.cputicks = now.cputicks - headTimes.cputicks
+    if waiters > 1 {
+        dt.nanotime = int64(waiters) * (dt.nanotime + now.nanotime - tailTimes.nanotime) / 2
+        dt.cputicks = int64(waiters) * (dt.cputicks + now.cputicks - tailTimes.cputicks) / 2
     }
-}
 
-func (lt *lockTimer) end() {
-    gp := getg()
+    // When removeMutexWaitList removes a head or tail node, it's responsible
+    // for applying these changes to the new head or tail.
+    hp.mWaitList.acquireTimes = now
+    tp.mWaitList.acquireTimes = now
 
-    if lt.timeStart != 0 {
-        nowTime := nanotime()
-        gp.m.mLockProfile.waitTime.Add((nowTime - lt.timeStart) * lt.timeRate)
-    }
-
-    if lt.tickStart != 0 {
-        nowTick := cputicks()
-        gp.m.mLockProfile.recordLock(nowTick-lt.tickStart, lt.lock)
-    }
+    return dt
 }
 
 // mLockProfile is part of the M struct to hold information relating to mutex
@@ -890,18 +825,49 @@ func (lt *lockTimer) end() {
 // not include a call stack, to avoid an echo.
 type mLockProfile struct {
     waitTime   atomic.Int64 // total nanoseconds spent waiting in runtime.lockWithRank
-    stack      []uintptr    // stack that experienced contention in runtime.lockWithRank
-    pending    uintptr      // *mutex that experienced contention (to be traceback-ed)
-    cycles     int64        // cycles attributable to "pending" (if set), otherwise to "stack"
+    stack      []uintptr    // unlock stack that caused delay in other Ms' runtime.lockWithRank
+    cycles     int64        // cycles attributable to "stack"
     cyclesLost int64        // contention for which we weren't able to record a call stack
     disabled   bool         // attribute all time to "lost"
 }
 
-func (prof *mLockProfile) recordLock(cycles int64, l *mutex) {
+// recordUnlock considers the current unlock call (which caused a total of dt
+// delay in other Ms) for later inclusion in the mutex contention profile. If
+// this M holds no other locks, it transfers the buffered contention record to
+// the mutex contention profile.
+//
+// From unlock2, we might not be holding a p in this code.
+//
+//go:nowritebarrierrec
+func (prof *mLockProfile) recordUnlock(dt timePair) {
+    if dt != (timePair{}) {
+        // We could make a point of clearing out the local storage right before
+        // this, to have a slightly better chance of being able to see the call
+        // stack if the program has several (nested) contended locks. If apps
+        // are seeing a lot of _LostContendedRuntimeLock samples, maybe that'll
+        // be a worthwhile change.
+        prof.proposeUnlock(dt)
+    }
+    if getg().m.locks == 1 && prof.cycles != 0 {
+        prof.store()
+    }
+}
+
+func (prof *mLockProfile) proposeUnlock(dt timePair) {
+    if nanos := dt.nanotime; nanos > 0 {
+        prof.waitTime.Add(nanos)
+    }
+
+    cycles := dt.cputicks
     if cycles <= 0 {
         return
     }
+
+    rate := int64(atomic.Load64(&mutexprofilerate))
+    if rate <= 0 || int64(cheaprand())%rate != 0 {
+        return
+    }
+
     if prof.disabled {
         // We're experiencing contention while attempting to report contention.
         // Make a note of its magnitude, but don't allow it to be the sole cause
@@ -910,13 +876,6 @@ func (prof *mLockProfile) recordLock(cycles int64, l *mutex) {
         return
     }
 
-    if uintptr(unsafe.Pointer(l)) == prof.pending {
-        // Optimization: we'd already planned to profile this same lock (though
-        // possibly from a different unlock site).
-        prof.cycles += cycles
-        return
-    }
-
     if prev := prof.cycles; prev > 0 {
         // We can only store one call stack for runtime-internal lock contention
         // on this M, and we've already got one. Decide which should stay, and
         // add the other to the report for runtime._LostContendedRuntimeLock.
@@ -930,24 +889,8 @@ func (prof *mLockProfile) recordLock(cycles int64, l *mutex) {
             prof.cyclesLost += prev
         }
     }
-    // Saving the *mutex as a uintptr is safe because:
-    // - lockrank_on.go does this too, which gives it regular exercise
-    // - the lock would only move if it's stack allocated, which means it
-    //   cannot experience multi-M contention
-    prof.pending = uintptr(unsafe.Pointer(l))
     prof.cycles = cycles
-}
-
-// From unlock2, we might not be holding a p in this code.
-//
-//go:nowritebarrierrec
-func (prof *mLockProfile) recordUnlock(l *mutex) {
-    if uintptr(unsafe.Pointer(l)) == prof.pending {
-        prof.captureStack()
-    }
-    if gp := getg(); gp.m.locks == 1 && gp.m.mLockProfile.cycles != 0 {
-        prof.store()
-    }
+    prof.captureStack()
 }
 
 func (prof *mLockProfile) captureStack() {
@@ -957,7 +900,7 @@ func (prof *mLockProfile) captureStack() {
         return
     }
 
-    skip := 3 // runtime.(*mLockProfile).recordUnlock runtime.unlock2 runtime.unlockWithRank
+    skip := 4 // runtime.(*mLockProfile).proposeUnlock runtime.(*mLockProfile).recordUnlock runtime.unlock2 runtime.unlockWithRank
     if staticLockRanking {
         // When static lock ranking is enabled, we'll always be on the system
         // stack at this point. There will be a runtime.unlockWithRank.func1
@@ -970,7 +913,6 @@ func (prof *mLockProfile) captureStack() {
         // "runtime.unlock".
         skip += 1 // runtime.unlockWithRank.func1
     }
-    prof.pending = 0
 
     prof.stack[0] = logicalStackSentinel
     if debug.runtimeContentionStacks.Load() == 0 {
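
For readers who want to see the effect described in the commit message, here is a
minimal sketch (not part of this CL) of reading the "mutex" profile that these
samples feed into. runtime.SetMutexProfileFraction and pprof.Lookup("mutex") are
standard library calls; note that, at this point in the series, call stacks for
runtime-internal locks are still gated behind GODEBUG=runtimecontentionstacks=1
(see the debug.runtimeContentionStacks check retained in captureStack above).

    // Sketch: enable mutex profiling and dump the profile. With this patch,
    // samples for runtime-internal locks point at the unlock call stack and
    // are weighted by the delay imposed on the waiting Ms.
    package main

    import (
        "os"
        "runtime"
        "runtime/pprof"
    )

    func main() {
        runtime.SetMutexProfileFraction(1) // sample every contention event

        // ... run a workload that contends on locks here ...

        // debug=1 writes a human-readable listing of the "mutex" profile.
        pprof.Lookup("mutex").WriteTo(os.Stdout, 1)
    }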
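
As a worked illustration of the estimate in claimMutexWaitTime: the function only
samples the head and tail of the wait list, so with N waiters it charges N times
the average of the oldest and newest waits, which is exact when arrival times are
evenly spaced and an approximation otherwise. The helper below is a made-up
restatement of that arithmetic with plain integers, not a runtime function.

    // Illustration only: the claimMutexWaitTime formula with plain integers.
    package main

    import "fmt"

    // estimateTotalWait mirrors the estimate: one waiter is charged its full
    // wait; N waiters are charged N * (oldest + newest) / 2.
    func estimateTotalWait(waiters, headWaitNs, tailWaitNs int64) int64 {
        if waiters <= 1 {
            return headWaitNs
        }
        return waiters * (headWaitNs + tailWaitNs) / 2
    }

    func main() {
        // Hypothetical numbers: 3 Ms are waiting; the oldest has waited 900ns
        // and the newest 100ns, so this unlock is charged 3*(900+100)/2 = 1500ns.
        fmt.Println(estimateTotalWait(3, 900, 100)) // 1500
    }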