From 5dead59add1f11e98de155860aa86175e893ac01 Mon Sep 17 00:00:00 2001 From: Rhys Hiltner Date: Wed, 29 May 2024 16:41:51 +0000 Subject: [PATCH] Revert "runtime: profile mutex contention during unlock" This reverts commit ba1c5b2c4573e10f3b5f0e0f25a27f17fba67eb0 (CL 585638). Reason for revert: This is part of a patch series that changed the handling of contended lock2/unlock2 calls, reducing the maximum throughput of contended runtime.mutex values, and causing a performance regression on applications where that is (or became) the bottleneck. Updates #66999 Updates #67585 Change-Id: Ibeec5d8deb17e87966cf352fefc7efe2267839d6 Reviewed-on: https://go-review.googlesource.com/c/go/+/589115 Auto-Submit: Rhys Hiltner Reviewed-by: Michael Pratt Reviewed-by: Than McIntosh LUCI-TryBot-Result: Go LUCI --- src/runtime/lock_futex.go | 15 ++-- src/runtime/lock_sema.go | 15 ++-- src/runtime/mprof.go | 178 +++++++++++++++++++++++++------------- 3 files changed, 128 insertions(+), 80 deletions(-) diff --git a/src/runtime/lock_futex.go b/src/runtime/lock_futex.go index 9be231f2ea..5c7c3a85fb 100644 --- a/src/runtime/lock_futex.go +++ b/src/runtime/lock_futex.go @@ -72,6 +72,9 @@ func lock2(l *mutex) { return } + timer := &lockTimer{lock: l} + timer.begin() + // If a goroutine's stack needed to grow during a lock2 call, the M could // end up with two active lock2 calls (one each on curg and g0). If both are // contended, the call on g0 will corrupt mWaitList. Disable stack growth. @@ -80,7 +83,6 @@ func lock2(l *mutex) { gp.stackguard0, gp.throwsplit = stackPreempt, true } - gp.m.mWaitList.acquireTimes = timePair{nanotime: nanotime(), cputicks: cputicks()} // On uniprocessors, no point spinning. // On multiprocessors, spin for ACTIVE_SPIN attempts. spin := 0 @@ -112,7 +114,6 @@ Loop: if v == old || atomic.Casuintptr(&l.key, old, v) { gp.m.mWaitList.clearLinks() - gp.m.mWaitList.acquireTimes = timePair{} break } v = atomic.Loaduintptr(&l.key) @@ -120,6 +121,7 @@ Loop: if gp == gp.m.curg { gp.stackguard0, gp.throwsplit = stackguard0, throwsplit } + timer.end() return } i = 0 @@ -161,7 +163,6 @@ func unlock(l *mutex) { } func unlock2(l *mutex) { - now, dt := timePair{nanotime: nanotime(), cputicks: cputicks()}, timePair{} for { v := atomic.Loaduintptr(&l.key) if v == mutex_locked { @@ -171,12 +172,6 @@ func unlock2(l *mutex) { } else if v&mutex_locked == 0 { throw("unlock of unlocked lock") } else { - if now != (timePair{}) { - head := muintptr(v &^ (mutex_sleeping | mutex_locked)) - dt = claimMutexWaitTime(now, head) - now = timePair{} - } - // Other M's are waiting for the lock. if atomic.Casuintptr(&l.key, v, v&^mutex_locked) { futexwakeup(key32(&l.key), 1) @@ -186,7 +181,7 @@ func unlock2(l *mutex) { } gp := getg() - gp.m.mLockProfile.recordUnlock(dt) + gp.m.mLockProfile.recordUnlock(l) gp.m.locks-- if gp.m.locks < 0 { throw("runtime·unlock: lock count") diff --git a/src/runtime/lock_sema.go b/src/runtime/lock_sema.go index 0d7bd5b9c9..907f1c2a0d 100644 --- a/src/runtime/lock_sema.go +++ b/src/runtime/lock_sema.go @@ -52,6 +52,9 @@ func lock2(l *mutex) { } semacreate(gp.m) + timer := &lockTimer{lock: l} + timer.begin() + // If a goroutine's stack needed to grow during a lock2 call, the M could // end up with two active lock2 calls (one each on curg and g0). If both are // contended, the call on g0 will corrupt mWaitList. Disable stack growth. 
@@ -60,7 +63,6 @@ func lock2(l *mutex) { gp.stackguard0, gp.throwsplit = stackPreempt, true } - gp.m.mWaitList.acquireTimes = timePair{nanotime: nanotime(), cputicks: cputicks()} // On uniprocessor's, no point spinning. // On multiprocessors, spin for ACTIVE_SPIN attempts. spin := 0 @@ -88,7 +90,6 @@ Loop: if v == old || atomic.Casuintptr(&l.key, old, v) { gp.m.mWaitList.clearLinks() - gp.m.mWaitList.acquireTimes = timePair{} break } v = atomic.Loaduintptr(&l.key) @@ -96,6 +97,7 @@ Loop: if gp == gp.m.curg { gp.stackguard0, gp.throwsplit = stackguard0, throwsplit } + timer.end() return } i = 0 @@ -145,7 +147,6 @@ func unlock(l *mutex) { // //go:nowritebarrier func unlock2(l *mutex) { - now, dt := timePair{nanotime: nanotime(), cputicks: cputicks()}, timePair{} gp := getg() var mp *m for { @@ -155,11 +156,6 @@ func unlock2(l *mutex) { break } } else { - if now != (timePair{}) { - dt = claimMutexWaitTime(now, muintptr(v&^locked)) - now = timePair{} - } - // Other M's are waiting for the lock. // Dequeue an M. mp = muintptr(v &^ locked).ptr() @@ -170,8 +166,7 @@ func unlock2(l *mutex) { } } } - - gp.m.mLockProfile.recordUnlock(dt) + gp.m.mLockProfile.recordUnlock(l) gp.m.locks-- if gp.m.locks < 0 { throw("runtime·unlock: lock count") diff --git a/src/runtime/mprof.go b/src/runtime/mprof.go index 93d49275c9..6260780850 100644 --- a/src/runtime/mprof.go +++ b/src/runtime/mprof.go @@ -781,34 +781,99 @@ func removeMutexWaitList(head muintptr, mp *m) muintptr { return head } -// claimMutexWaitTime advances the acquireTimes of the list of waiting Ms at -// head to now, returning an estimate of the total wait time claimed by that -// action. -func claimMutexWaitTime(now timePair, head muintptr) timePair { - fixMutexWaitList(head) - hp := head.ptr() - if hp == nil { - return timePair{} +// lockTimer assists with profiling contention on runtime-internal locks. +// +// There are several steps between the time that an M experiences contention and +// when that contention may be added to the profile. This comes from our +// constraints: We need to keep the critical section of each lock small, +// especially when those locks are contended. The reporting code cannot acquire +// new locks until the M has released all other locks, which means no memory +// allocations and encourages use of (temporary) M-local storage. +// +// The M will have space for storing one call stack that caused contention, and +// for the magnitude of that contention. It will also have space to store the +// magnitude of additional contention the M caused, since it only has space to +// remember one call stack and might encounter several contention events before +// it releases all of its locks and is thus able to transfer the local buffer +// into the profile. +// +// The M will collect the call stack when it unlocks the contended lock. That +// minimizes the impact on the critical section of the contended lock, and +// matches the mutex profile's behavior for contention in sync.Mutex: measured +// at the Unlock method. +// +// The profile for contention on sync.Mutex blames the caller of Unlock for the +// amount of contention experienced by the callers of Lock which had to wait. +// When there are several critical sections, this allows identifying which of +// them is responsible. +// +// Matching that behavior for runtime-internal locks will require identifying +// which Ms are blocked on the mutex. The semaphore-based implementation is +// ready to allow that, but the futex-based implementation will require a bit +// more work. 
Until then, we report contention on runtime-internal locks with a +// call stack taken from the unlock call (like the rest of the user-space +// "mutex" profile), but assign it a duration value based on how long the +// previous lock call took (like the user-space "block" profile). +// +// Thus, reporting the call stacks of runtime-internal lock contention is +// guarded by GODEBUG for now. Set GODEBUG=runtimecontentionstacks=1 to enable. +// +// TODO(rhysh): plumb through the delay duration, remove GODEBUG, update comment +// +// The M will track this by storing a pointer to the lock; lock/unlock pairs for +// runtime-internal locks are always on the same M. +// +// Together, that demands several steps for recording contention. First, when +// finally acquiring a contended lock, the M decides whether it should plan to +// profile that event by storing a pointer to the lock in its "to be profiled +// upon unlock" field. If that field is already set, it uses the relative +// magnitudes to weight a random choice between itself and the other lock, with +// the loser's time being added to the "additional contention" field. Otherwise +// if the M's call stack buffer is occupied, it does the comparison against that +// sample's magnitude. +// +// Second, having unlocked a mutex the M checks to see if it should capture the +// call stack into its local buffer. Finally, when the M unlocks its last mutex, +// it transfers the local buffer into the profile. As part of that step, it also +// transfers any "additional contention" time to the profile. Any lock +// contention that it experiences while adding samples to the profile will be +// recorded later as "additional contention" and not include a call stack, to +// avoid an echo. +type lockTimer struct { + lock *mutex + timeRate int64 + timeStart int64 + tickStart int64 +} + +func (lt *lockTimer) begin() { + rate := int64(atomic.Load64(&mutexprofilerate)) + + lt.timeRate = gTrackingPeriod + if rate != 0 && rate < lt.timeRate { + lt.timeRate = rate + } + if int64(cheaprand())%lt.timeRate == 0 { + lt.timeStart = nanotime() } - tp := hp.mWaitList.tail.ptr() - waiters := hp.mWaitList.waiters - headTimes := hp.mWaitList.acquireTimes - tailTimes := tp.mWaitList.acquireTimes - var dt timePair - dt.nanotime = now.nanotime - headTimes.nanotime - dt.cputicks = now.cputicks - headTimes.cputicks - if waiters > 1 { - dt.nanotime = int64(waiters) * (dt.nanotime + now.nanotime - tailTimes.nanotime) / 2 - dt.cputicks = int64(waiters) * (dt.cputicks + now.cputicks - tailTimes.cputicks) / 2 + if rate > 0 && int64(cheaprand())%rate == 0 { + lt.tickStart = cputicks() } +} - // When removeMutexWaitList removes a head or tail node, it's responsible - // for applying these changes to the new head or tail. - hp.mWaitList.acquireTimes = now - tp.mWaitList.acquireTimes = now +func (lt *lockTimer) end() { + gp := getg() - return dt + if lt.timeStart != 0 { + nowTime := nanotime() + gp.m.mLockProfile.waitTime.Add((nowTime - lt.timeStart) * lt.timeRate) + } + + if lt.tickStart != 0 { + nowTick := cputicks() + gp.m.mLockProfile.recordLock(nowTick-lt.tickStart, lt.lock) + } } // mLockProfile is part of the M struct to hold information relating to mutex @@ -825,49 +890,18 @@ func claimMutexWaitTime(now timePair, head muintptr) timePair { // not include a call stack, to avoid an echo. 
type mLockProfile struct { waitTime atomic.Int64 // total nanoseconds spent waiting in runtime.lockWithRank - stack []uintptr // unlock stack that caused delay in other Ms' runtime.lockWithRank - cycles int64 // cycles attributable to "stack" + stack []uintptr // stack that experienced contention in runtime.lockWithRank + pending uintptr // *mutex that experienced contention (to be traceback-ed) + cycles int64 // cycles attributable to "pending" (if set), otherwise to "stack" cyclesLost int64 // contention for which we weren't able to record a call stack disabled bool // attribute all time to "lost" } -// recordUnlock considers the current unlock call (which caused a total of dt -// delay in other Ms) for later inclusion in the mutex contention profile. If -// this M holds no other locks, it transfers the buffered contention record to -// the mutex contention profile. -// -// From unlock2, we might not be holding a p in this code. -// -//go:nowritebarrierrec -func (prof *mLockProfile) recordUnlock(dt timePair) { - if dt != (timePair{}) { - // We could make a point of clearing out the local storage right before - // this, to have a slightly better chance of being able to see the call - // stack if the program has several (nested) contended locks. If apps - // are seeing a lot of _LostContendedRuntimeLock samples, maybe that'll - // be a worthwhile change. - prof.proposeUnlock(dt) - } - if getg().m.locks == 1 && prof.cycles != 0 { - prof.store() - } -} - -func (prof *mLockProfile) proposeUnlock(dt timePair) { - if nanos := dt.nanotime; nanos > 0 { - prof.waitTime.Add(nanos) - } - - cycles := dt.cputicks +func (prof *mLockProfile) recordLock(cycles int64, l *mutex) { if cycles <= 0 { return } - rate := int64(atomic.Load64(&mutexprofilerate)) - if rate <= 0 || int64(cheaprand())%rate != 0 { - return - } - if prof.disabled { // We're experiencing contention while attempting to report contention. // Make a note of its magnitude, but don't allow it to be the sole cause @@ -876,6 +910,13 @@ func (prof *mLockProfile) proposeUnlock(dt timePair) { return } + if uintptr(unsafe.Pointer(l)) == prof.pending { + // Optimization: we'd already planned to profile this same lock (though + // possibly from a different unlock site). + prof.cycles += cycles + return + } + if prev := prof.cycles; prev > 0 { // We can only store one call stack for runtime-internal lock contention // on this M, and we've already got one. Decide which should stay, and @@ -889,8 +930,24 @@ func (prof *mLockProfile) proposeUnlock(dt timePair) { prof.cyclesLost += prev } } + // Saving the *mutex as a uintptr is safe because: + // - lockrank_on.go does this too, which gives it regular exercise + // - the lock would only move if it's stack allocated, which means it + // cannot experience multi-M contention + prof.pending = uintptr(unsafe.Pointer(l)) prof.cycles = cycles - prof.captureStack() +} + +// From unlock2, we might not be holding a p in this code. 
+// +//go:nowritebarrierrec +func (prof *mLockProfile) recordUnlock(l *mutex) { + if uintptr(unsafe.Pointer(l)) == prof.pending { + prof.captureStack() + } + if gp := getg(); gp.m.locks == 1 && gp.m.mLockProfile.cycles != 0 { + prof.store() + } } func (prof *mLockProfile) captureStack() { @@ -900,7 +957,7 @@ func (prof *mLockProfile) captureStack() { return } - skip := 4 // runtime.(*mLockProfile).proposeUnlock runtime.(*mLockProfile).recordUnlock runtime.unlock2 runtime.unlockWithRank + skip := 3 // runtime.(*mLockProfile).recordUnlock runtime.unlock2 runtime.unlockWithRank if staticLockRanking { // When static lock ranking is enabled, we'll always be on the system // stack at this point. There will be a runtime.unlockWithRank.func1 @@ -913,6 +970,7 @@ func (prof *mLockProfile) captureStack() { // "runtime.unlock". skip += 1 // runtime.unlockWithRank.func1 } + prof.pending = 0 prof.stack[0] = logicalStackSentinel if debug.runtimeContentionStacks.Load() == 0 { -- 2.48.1
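The wait-time estimate removed above (claimMutexWaitTime) records arrival times only on the head and tail of the wait list, and approximates the summed delay of all waiters as waiters * (headWait + tailWait) / 2, a trapezoid over roughly evenly spread arrivals. The following standalone sketch shows that arithmetic outside the runtime; estimateTotalWait and its parameters are illustrative names, not runtime API.

// Sketch of the averaging performed by the reverted claimMutexWaitTime. Only
// the head (oldest) and tail (newest) arrival times are known, so the summed
// delay of all waiters is approximated as
//     total ≈ waiters * (headWait + tailWait) / 2.
package main

import (
	"fmt"
	"time"
)

func estimateTotalWait(now, headArrived, tailArrived time.Time, waiters int64) time.Duration {
	headWait := now.Sub(headArrived) // longest individual wait
	tailWait := now.Sub(tailArrived) // shortest individual wait
	if waiters <= 1 {
		return headWait
	}
	// Assume the remaining arrivals are spread roughly evenly between head and tail.
	return time.Duration(waiters) * (headWait + tailWait) / 2
}

func main() {
	now := time.Now()
	head := now.Add(-90 * time.Millisecond) // oldest waiter arrived 90ms ago
	tail := now.Add(-10 * time.Millisecond) // newest waiter arrived 10ms ago
	fmt.Println(estimateTotalWait(now, head, tail, 4)) // 4 * (90ms+10ms)/2 = 200ms
}

With four waiters spread between 90ms and 10ms of waiting, the estimate is 200ms, which matches the exact sum when arrivals really are evenly spaced.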
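The restored lockTimer (begin/end above) does not time every contended lock2 call; it selects roughly one in timeRate events and multiplies the measured duration by timeRate, so the total added to mLockProfile.waitTime remains an unbiased estimate. Below is a hedged standalone sketch of that sampling scheme, with math/rand and time.Now standing in for the runtime's cheaprand and nanotime; sampledTimer and totalWaitNanos are illustrative names.

// Sketch of the sampling in lockTimer.begin/end: time ~1 in `rate` events,
// then scale the measured duration by `rate` so the accumulated total is an
// unbiased estimate of the true total wait.
package main

import (
	"fmt"
	"math/rand"
	"sync/atomic"
	"time"
)

type sampledTimer struct {
	rate  int64
	start time.Time // zero unless this event was selected for sampling
}

var totalWaitNanos atomic.Int64 // plays the role of mLockProfile.waitTime

func (t *sampledTimer) begin() {
	if rand.Int63()%t.rate == 0 { // select ~1/rate of events
		t.start = time.Now()
	}
}

func (t *sampledTimer) end() {
	if !t.start.IsZero() {
		// Scale up by the sampling rate to keep the expected total unbiased.
		totalWaitNanos.Add(int64(time.Since(t.start)) * t.rate)
	}
}

func main() {
	for i := 0; i < 1000; i++ {
		t := &sampledTimer{rate: 8}
		t.begin()
		time.Sleep(100 * time.Microsecond) // stand-in for a contended lock2
		t.end()
	}
	// In expectation this matches what timing every iteration would report.
	fmt.Println("estimated total wait:", time.Duration(totalWaitNanos.Load()))
}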
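recordLock above keeps at most one pending contention event per M. The hunk ends just before the choice between a previously buffered sample and the new one, but the surrounding comments state the outcome: one of the two is kept and the other's cycles are added to cyclesLost (surfacing as _LostContendedRuntimeLock samples). One way to make that choice, randomly but weighted toward the larger magnitude, is sketched below; this is illustrative and not necessarily the runtime's exact tie-breaking.

// Sketch of keeping a single contention sample per M, as recordLock does. When
// a second contended event arrives before the first is flushed, keep one and
// fold the other's magnitude into `lost` (the analogue of cyclesLost).
package main

import (
	"fmt"
	"math/rand"
)

type oneSlotSample struct {
	cycles int64 // magnitude of the buffered sample
	lost   int64 // contention we gave up attributing to a call stack
}

func (s *oneSlotSample) offer(cycles int64) {
	if cycles <= 0 {
		return
	}
	if prev := s.cycles; prev > 0 {
		// Draw a random score in [0, magnitude); larger events tend to win.
		if rand.Int63n(prev) > rand.Int63n(cycles) {
			s.lost += cycles // keep the buffered sample, count the new one as lost
			return
		}
		s.lost += prev // replace the buffered sample, count the old one as lost
	}
	s.cycles = cycles
}

func main() {
	s := &oneSlotSample{}
	for _, c := range []int64{100, 900, 50} {
		s.offer(c)
	}
	// kept+lost always equals the full 1050 cycles of contention offered.
	fmt.Printf("kept=%d lost=%d\n", s.cycles, s.lost)
}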
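captureStack above records the unlock call stack with skip := 3 (plus one more frame under static lock ranking) so the profiler's own frames do not appear in the profile. The same skip mechanics can be seen with the public runtime.Callers API; in this sketch the value 2 drops runtime.Callers itself and the grabStack helper, and the right skip always depends on how many wrappers sit between the event and the capture.

// Sketch of skip-count mechanics, in the spirit of captureStack's skip := 3,
// shown with the public runtime.Callers API rather than runtime internals.
package main

import (
	"fmt"
	"runtime"
)

func grabStack() []uintptr {
	pcs := make([]uintptr, 32)
	n := runtime.Callers(2, pcs) // 0 = Callers itself, 1 = grabStack, 2 = its caller
	return pcs[:n]
}

func work() []uintptr { return grabStack() }

func main() {
	frames := runtime.CallersFrames(work())
	for {
		f, more := frames.Next()
		fmt.Println(f.Function) // main.work, main.main, runtime.main, ...
		if !more {
			break
		}
	}
}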
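Finally, the mutexprofilerate read in lockTimer.begin is the value set by the public runtime.SetMutexProfileFraction, and the contention recorded here is reported through the same "mutex" profile as sync.Mutex contention (with runtime-internal call stacks still guarded by GODEBUG=runtimecontentionstacks, per the comment above). A minimal example of enabling and dumping that profile from user code:

// Enable mutex profiling and dump the "mutex" profile in its debug text form.
// SetMutexProfileFraction(n) samples roughly 1 in n contention events.
package main

import (
	"os"
	"runtime"
	"runtime/pprof"
	"sync"
	"time"
)

func main() {
	runtime.SetMutexProfileFraction(1) // record every contention event
	defer runtime.SetMutexProfileFraction(0)

	// Generate a little sync.Mutex contention so the profile is non-empty.
	var mu sync.Mutex
	var wg sync.WaitGroup
	for i := 0; i < 4; i++ {
		wg.Add(1)
		go func() {
			defer wg.Done()
			for j := 0; j < 200; j++ {
				mu.Lock()
				time.Sleep(50 * time.Microsecond)
				mu.Unlock()
			}
		}()
	}
	wg.Wait()

	pprof.Lookup("mutex").WriteTo(os.Stdout, 1)
}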