runtime: profile contended lock calls
author    Rhys Hiltner <rhys@justin.tv>
          Tue, 12 Sep 2023 22:44:48 +0000 (15:44 -0700)
committer Gopher Robot <gobot@golang.org>
          Fri, 17 Nov 2023 23:04:58 +0000 (23:04 +0000)
Add runtime-internal locks to the mutex contention profile.

Store up to one call stack responsible for lock contention on the M,
until it's safe to contribute its value to the mprof table. Try to use
that limited local storage space for a relatively large source of
contention, and attribute any contention in stacks we're not able to
store to a sentinel _LostContendedLock function.

Avoid ballooning lock contention while manipulating the mprof table by
attributing to that sentinel function any lock contention experienced
while reporting lock contention.

Guard collecting real call stacks with GODEBUG=profileruntimelocks=1,
since the available data has mixed semantics; we can easily capture an
M's own wait time, but we'd prefer for the profile entry of each
critical section to describe how long it made the other Ms wait. It's
too late in the Go 1.22 cycle to make the required changes to
futex-based locks. When not enabled, attribute the time to the sentinel
function instead.

Fixes #57071

Change-Id: I3eee0ccbfc20f333b56f20d8725dfd7f3a526b41
Reviewed-on: https://go-review.googlesource.com/c/go/+/528657
LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com>
Reviewed-by: Michael Pratt <mpratt@google.com>
Auto-Submit: Rhys Hiltner <rhys@justin.tv>
Reviewed-by: Than McIntosh <thanm@google.com>
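
A minimal sketch of how an application might opt in and capture the resulting
profile, assuming the process is started with GODEBUG=profileruntimelocks=1 in
its environment; the profile fraction of 1 and the mutex.pprof output path are
arbitrary choices for illustration:

    // Sketch: enable mutex profiling and write out the "mutex" profile, which,
    // when the process was started with GODEBUG=profileruntimelocks=1, may now
    // include call stacks for contention on runtime-internal locks.
    package main

    import (
        "os"
        "runtime"
        "runtime/pprof"
    )

    func main() {
        runtime.SetMutexProfileFraction(1) // sample every contention event

        // ... run the workload of interest ...

        f, err := os.Create("mutex.pprof") // arbitrary output path
        if err != nil {
            panic(err)
        }
        defer f.Close()
        if err := pprof.Lookup("mutex").WriteTo(f, 0); err != nil {
            panic(err)
        }
    }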
src/runtime/debug.go
src/runtime/export_test.go
src/runtime/extern.go
src/runtime/lock_futex.go
src/runtime/lock_sema.go
src/runtime/metrics.go
src/runtime/metrics_test.go
src/runtime/mprof.go
src/runtime/proc.go
src/runtime/runtime1.go
src/runtime/runtime2.go

src/runtime/debug.go
index 0e61692f3d1003163c30f7c7578c88817deabd19..3233ce8ee73718c1123594b28dfc71f255a9ee0b 100644 (file)
@@ -52,6 +52,17 @@ func NumCgoCall() int64 {
        return n
 }
 
+func totalMutexWaitTimeNanos() int64 {
+       total := sched.totalMutexWaitTime.Load()
+
+       total += sched.totalRuntimeLockWaitTime.Load()
+       for mp := (*m)(atomic.Loadp(unsafe.Pointer(&allm))); mp != nil; mp = mp.alllink {
+               total += mp.mLockProfile.waitTime.Load()
+       }
+
+       return total
+}
+
 // NumGoroutine returns the number of goroutines that currently exist.
 func NumGoroutine() int {
        return int(gcount())
src/runtime/export_test.go
index d2f35639568c89a4553e9957988238e107020473..5f0e24d4f13e237986942e22af6fb2135e3c558d 100644 (file)
@@ -1340,6 +1340,27 @@ func PageCachePagesLeaked() (leaked uintptr) {
        return
 }
 
+type Mutex = mutex
+
+var Lock = lock
+var Unlock = unlock
+
+func MutexContended(l *mutex) bool {
+       switch atomic.Loaduintptr(&l.key) {
+       case 0: // unlocked
+               return false
+       case 1: // locked
+               return false
+       default: // an M is sleeping
+               return true
+       }
+}
+
+func SemRootLock(addr *uint32) *mutex {
+       root := semtable.rootFor(addr)
+       return &root.lock
+}
+
 var Semacquire = semacquire
 var Semrelease1 = semrelease1
 
src/runtime/extern.go
index f8db296e6ba67f73290b0cd69be39f68247f4d86..d199720b9b05cf5c7305524308b2e82b919e862e 100644 (file)
@@ -145,6 +145,14 @@ It is a comma-separated list of name=val pairs setting these named variables:
        risk in that scenario. Currently not supported on Windows, plan9 or js/wasm. Setting this
        option for some applications can produce large traces, so use with care.
 
+       profileruntimelocks: setting profileruntimelocks=1 includes call stacks related to
+       contention on runtime-internal locks in the "mutex" profile, subject to the
+       MutexProfileFraction setting. The call stacks will correspond to the unlock call that
+       released the lock. But instead of the value corresponding to the amount of contention that
+       call stack caused, it corresponds to the amount of time the caller of unlock had to wait
+       in its original call to lock. A future release is expected to align those and remove this
+       setting.
+
        invalidptr: invalidptr=1 (the default) causes the garbage collector and stack
        copier to crash the program if an invalid pointer value (for example, 1)
        is found in a pointer-typed location. Setting invalidptr=0 disables this check.
src/runtime/lock_futex.go
index cc7d465ef1fc18c44ea56f40f7585bf39b673a2f..b4f57d5259d83fa6cfd13320ea034bca44d7fc20 100644 (file)
@@ -71,6 +71,8 @@ func lock2(l *mutex) {
        // its wakeup call.
        wait := v
 
+       timer := &lockTimer{lock: l}
+       timer.begin()
        // On uniprocessors, no point spinning.
        // On multiprocessors, spin for ACTIVE_SPIN attempts.
        spin := 0
@@ -82,6 +84,7 @@ func lock2(l *mutex) {
                for i := 0; i < spin; i++ {
                        for l.key == mutex_unlocked {
                                if atomic.Cas(key32(&l.key), mutex_unlocked, wait) {
+                                       timer.end()
                                        return
                                }
                        }
@@ -92,6 +95,7 @@ func lock2(l *mutex) {
                for i := 0; i < passive_spin; i++ {
                        for l.key == mutex_unlocked {
                                if atomic.Cas(key32(&l.key), mutex_unlocked, wait) {
+                                       timer.end()
                                        return
                                }
                        }
@@ -101,6 +105,7 @@ func lock2(l *mutex) {
                // Sleep.
                v = atomic.Xchg(key32(&l.key), mutex_sleeping)
                if v == mutex_unlocked {
+                       timer.end()
                        return
                }
                wait = mutex_sleeping
@@ -122,6 +127,7 @@ func unlock2(l *mutex) {
        }
 
        gp := getg()
+       gp.m.mLockProfile.recordUnlock(l)
        gp.m.locks--
        if gp.m.locks < 0 {
                throw("runtimeĀ·unlock: lock count")
src/runtime/lock_sema.go
index 9afba08b0b243bcd597a8d215824b89054f949fb..84cd344db843f43c96f0f866ee3084d3c70038ab 100644 (file)
@@ -48,6 +48,8 @@ func lock2(l *mutex) {
        }
        semacreate(gp.m)
 
+       timer := &lockTimer{lock: l}
+       timer.begin()
        // On uniprocessors, no point spinning.
        // On multiprocessors, spin for ACTIVE_SPIN attempts.
        spin := 0
@@ -60,6 +62,7 @@ Loop:
                if v&locked == 0 {
                        // Unlocked. Try to lock.
                        if atomic.Casuintptr(&l.key, v, v|locked) {
+                               timer.end()
                                return
                        }
                        i = 0
@@ -119,6 +122,7 @@ func unlock2(l *mutex) {
                        }
                }
        }
+       gp.m.mLockProfile.recordUnlock(l)
        gp.m.locks--
        if gp.m.locks < 0 {
                throw("runtimeĀ·unlock: lock count")
src/runtime/metrics.go
index 4bd167135db8f87c77cb8c758d737ca426b92bc8..f97a3804ab32201c895ea5d45a180067b6a7b305 100644 (file)
@@ -470,7 +470,7 @@ func initMetrics() {
                "/sync/mutex/wait/total:seconds": {
                        compute: func(_ *statAggregate, out *metricValue) {
                                out.kind = metricKindFloat64
-                               out.scalar = float64bits(nsToSec(sched.totalMutexWaitTime.Load()))
+                               out.scalar = float64bits(nsToSec(totalMutexWaitTimeNanos()))
                        },
                },
        }
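
For reference, a small sketch of reading that metric through the public
runtime/metrics API; after this change its value also covers time spent waiting
on contended runtime-internal locks (via totalMutexWaitTimeNanos above):

    // Sketch: sample /sync/mutex/wait/total:seconds via runtime/metrics.
    package main

    import (
        "fmt"
        "runtime/metrics"
    )

    func main() {
        samples := []metrics.Sample{{Name: "/sync/mutex/wait/total:seconds"}}
        metrics.Read(samples)
        if samples[0].Value.Kind() == metrics.KindFloat64 {
            fmt.Printf("total mutex wait: %fs\n", samples[0].Value.Float64())
        }
    }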
src/runtime/metrics_test.go
index 1e82897381c43c86f72675ccdaa1682ae495dee9..56f3340b1686ec5a9ce6c57eaa5012846dd69ddd 100644 (file)
@@ -6,15 +6,21 @@ package runtime_test
 
 import (
        "bytes"
+       "fmt"
+       "internal/goexperiment"
+       "internal/profile"
        "os"
        "reflect"
        "runtime"
        "runtime/debug"
        "runtime/metrics"
+       "runtime/pprof"
        "runtime/trace"
+       "slices"
        "sort"
        "strings"
        "sync"
+       "sync/atomic"
        "testing"
        "time"
        "unsafe"
@@ -939,3 +945,305 @@ func TestSchedPauseMetrics(t *testing.T) {
                })
        }
 }
+
+func TestRuntimeLockMetricsAndProfile(t *testing.T) {
+       old := runtime.SetMutexProfileFraction(0) // enabled during sub-tests
+       defer runtime.SetMutexProfileFraction(old)
+       if old != 0 {
+               t.Fatalf("need MutexProfileRate 0, got %d", old)
+       }
+
+       {
+               before := os.Getenv("GODEBUG")
+               for _, s := range strings.Split(before, ",") {
+                       if strings.HasPrefix(s, "profileruntimelocks=") {
+                               t.Logf("GODEBUG includes explicit setting %q", s)
+                       }
+               }
+               defer func() { os.Setenv("GODEBUG", before) }()
+               os.Setenv("GODEBUG", fmt.Sprintf("%s,profileruntimelocks=1", before))
+       }
+
+       loadProfile := func(t *testing.T) *profile.Profile {
+               var w bytes.Buffer
+               pprof.Lookup("mutex").WriteTo(&w, 0)
+               p, err := profile.Parse(&w)
+               if err != nil {
+                       t.Fatalf("failed to parse profile: %v", err)
+               }
+               if err := p.CheckValid(); err != nil {
+                       t.Fatalf("invalid profile: %v", err)
+               }
+               return p
+       }
+
+       measureDelta := func(t *testing.T, fn func()) (metricGrowth, profileGrowth float64, p *profile.Profile) {
+               beforeProfile := loadProfile(t)
+               beforeMetrics := []metrics.Sample{{Name: "/sync/mutex/wait/total:seconds"}}
+               metrics.Read(beforeMetrics)
+
+               fn()
+
+               afterProfile := loadProfile(t)
+               afterMetrics := []metrics.Sample{{Name: "/sync/mutex/wait/total:seconds"}}
+               metrics.Read(afterMetrics)
+
+               sumSamples := func(p *profile.Profile, i int) int64 {
+                       var sum int64
+                       for _, s := range p.Sample {
+                               sum += s.Value[i]
+                       }
+                       return sum
+               }
+
+               metricGrowth = afterMetrics[0].Value.Float64() - beforeMetrics[0].Value.Float64()
+               profileGrowth = float64(sumSamples(afterProfile, 1)-sumSamples(beforeProfile, 1)) * time.Nanosecond.Seconds()
+
+               // The internal/profile package does not support compaction; this delta
+               // profile will include separate positive and negative entries.
+               p = afterProfile.Copy()
+               if len(beforeProfile.Sample) > 0 {
+                       err := p.Merge(beforeProfile, -1)
+                       if err != nil {
+                               t.Fatalf("Merge profiles: %v", err)
+                       }
+               }
+
+               return metricGrowth, profileGrowth, p
+       }
+
+       testcase := func(stk []string, workers int, fn func() bool) func(t *testing.T) (metricGrowth, profileGrowth float64, n, value int64) {
+               return func(t *testing.T) (metricGrowth, profileGrowth float64, n, value int64) {
+                       metricGrowth, profileGrowth, p := measureDelta(t, func() {
+                               var started, stopped sync.WaitGroup
+                               started.Add(workers)
+                               stopped.Add(workers)
+                               for i := 0; i < workers; i++ {
+                                       w := &contentionWorker{
+                                               before: func() {
+                                                       started.Done()
+                                                       started.Wait()
+                                               },
+                                               after: func() {
+                                                       stopped.Done()
+                                               },
+                                               fn: fn,
+                                       }
+                                       go w.run()
+                               }
+                               stopped.Wait()
+                       })
+
+                       if profileGrowth == 0 {
+                               t.Errorf("no increase in mutex profile")
+                       }
+                       if metricGrowth == 0 {
+                               t.Errorf("no increase in /sync/mutex/wait/total:seconds metric")
+                       }
+                       // This comparison is possible because the time measurements in support of
+                       // runtime/pprof and runtime/metrics for runtime-internal locks are so close
+                       // together. It doesn't work as well for user-space contention, where the
+                       // involved goroutines are not _Grunnable the whole time and so need to pass
+                       // through the scheduler.
+                       t.Logf("lock contention growth in runtime/pprof's view  (%fs)", profileGrowth)
+                       t.Logf("lock contention growth in runtime/metrics' view (%fs)", metricGrowth)
+
+                       if goexperiment.StaticLockRanking {
+                               if !slices.ContainsFunc(stk, func(s string) bool {
+                                       return s == "runtime.systemstack" || s == "runtime.mcall" || s == "runtime.mstart"
+                               }) {
+                                       // stk is a call stack that is still on the user stack when
+                                       // it calls runtime.unlock. Add the extra function that
+                                       // we'll see, when the static lock ranking implementation of
+                                       // runtime.unlockWithRank switches to the system stack.
+                                       stk = append([]string{"runtime.unlockWithRank"}, stk...)
+                               }
+                       }
+
+                       var stks [][]string
+                       for _, s := range p.Sample {
+                               var have []string
+                               for _, loc := range s.Location {
+                                       for _, line := range loc.Line {
+                                               have = append(have, line.Function.Name)
+                                       }
+                               }
+                               stks = append(stks, have)
+                               if slices.Equal(have, stk) {
+                                       n += s.Value[0]
+                                       value += s.Value[1]
+                               }
+                       }
+                       t.Logf("stack %v has samples totaling n=%d value=%d", stk, n, value)
+                       if n == 0 && value == 0 {
+                               t.Logf("profile:\n%s", p)
+                               for _, have := range stks {
+                                       t.Logf("have stack %v", have)
+                               }
+                               t.Errorf("want stack %v", stk)
+                       }
+
+                       return metricGrowth, profileGrowth, n, value
+               }
+       }
+
+       name := t.Name()
+
+       t.Run("runtime.lock", func(t *testing.T) {
+               mus := make([]runtime.Mutex, 100)
+               var needContention atomic.Int64
+               delay := 10 * time.Microsecond
+               delayMicros := delay.Microseconds()
+
+               // The goroutine that acquires the lock will only proceed when it
+               // detects that its partner is contended for the lock. That will lead to
+               // live-lock if anything (such as a STW) prevents the partner goroutine
+               // from running. Allowing the contention workers to pause and restart
+               // (to allow a STW to proceed) makes it harder to confirm that we're
+               // counting the correct number of contention events, since some locks
+               // will end up contended twice. Instead, disable the GC.
+               defer debug.SetGCPercent(debug.SetGCPercent(-1))
+
+               const workers = 2
+               if runtime.GOMAXPROCS(0) < workers {
+                       t.Skipf("contention on runtime-internal locks requires GOMAXPROCS >= %d", workers)
+               }
+
+               fn := func() bool {
+                       n := int(needContention.Load())
+                       if n < 0 {
+                               return false
+                       }
+                       mu := &mus[n]
+
+                       runtime.Lock(mu)
+                       for int(needContention.Load()) == n {
+                               if runtime.MutexContended(mu) {
+                                       // make them wait a little while
+                                       for start := runtime.Nanotime(); (runtime.Nanotime()-start)/1000 < delayMicros; {
+                                               runtime.Usleep(uint32(delayMicros))
+                                       }
+                                       break
+                               }
+                       }
+                       runtime.Unlock(mu)
+                       needContention.Store(int64(n - 1))
+
+                       return true
+               }
+
+               stk := []string{
+                       "runtime.unlock",
+                       "runtime_test." + name + ".func5.1",
+                       "runtime_test.(*contentionWorker).run",
+               }
+
+               t.Run("sample-1", func(t *testing.T) {
+                       old := runtime.SetMutexProfileFraction(1)
+                       defer runtime.SetMutexProfileFraction(old)
+
+                       needContention.Store(int64(len(mus) - 1))
+                       metricGrowth, profileGrowth, n, _ := testcase(stk, workers, fn)(t)
+
+                       if have, want := metricGrowth, delay.Seconds()*float64(len(mus)); have < want {
+                               // The test imposes a delay with usleep, verified with calls to
+                               // nanotime. Compare against the runtime/metrics package's view
+                               // (based on nanotime) rather than runtime/pprof's view (based
+                               // on cputicks).
+                               t.Errorf("runtime/metrics reported less than the known minimum contention duration (%fs < %fs)", have, want)
+                       }
+                       if have, want := n, int64(len(mus)); have != want {
+                               t.Errorf("mutex profile reported contention count different from the known true count (%d != %d)", have, want)
+                       }
+
+                       const slop = 1.5 // account for nanotime vs cputicks
+                       if profileGrowth > slop*metricGrowth || metricGrowth > slop*profileGrowth {
+                               t.Errorf("views differ by more than %fx", slop)
+                       }
+               })
+
+               t.Run("sample-2", func(t *testing.T) {
+                       old := runtime.SetMutexProfileFraction(2)
+                       defer runtime.SetMutexProfileFraction(old)
+
+                       needContention.Store(int64(len(mus) - 1))
+                       metricGrowth, profileGrowth, n, _ := testcase(stk, workers, fn)(t)
+
+                       // With 100 trials and profile fraction of 2, we expect to capture
+                       // 50 samples. Allow the test to pass if we get at least 20 samples;
+                       // the CDF of the binomial distribution says there's less than a
+                       // 1e-9 chance of that, which is an acceptably low flakiness rate.
+                       const samplingSlop = 2.5
+
+                       if have, want := metricGrowth, delay.Seconds()*float64(len(mus)); samplingSlop*have < want {
+                               // The test imposes a delay with usleep, verified with calls to
+                               // nanotime. Compare against the runtime/metrics package's view
+                               // (based on nanotime) rather than runtime/pprof's view (based
+                               // on cputicks).
+                               t.Errorf("runtime/metrics reported less than the known minimum contention duration (%f * %fs < %fs)", samplingSlop, have, want)
+                       }
+                       if have, want := n, int64(len(mus)); float64(have) > float64(want)*samplingSlop || float64(want) > float64(have)*samplingSlop {
+                               t.Errorf("mutex profile reported contention count too different from the expected count (%d far from %d)", have, want)
+                       }
+
+                       const timerSlop = 1.5 * samplingSlop // account for nanotime vs cputicks, plus the two views' independent sampling
+                       if profileGrowth > timerSlop*metricGrowth || metricGrowth > timerSlop*profileGrowth {
+                               t.Errorf("views differ by more than %fx", timerSlop)
+                       }
+               })
+       })
+
+       t.Run("runtime.semrelease", func(t *testing.T) {
+               old := runtime.SetMutexProfileFraction(1)
+               defer runtime.SetMutexProfileFraction(old)
+
+               const workers = 3
+               if runtime.GOMAXPROCS(0) < workers {
+                       t.Skipf("creating and observing contention on runtime-internal semaphores requires GOMAXPROCS >= %d", workers)
+               }
+
+               var sem uint32 = 1
+               var sawContention atomic.Int32
+               var need int32 = 1000 // counteract low timer resolution by requiring more samples
+               fn := func() bool {
+                       if sawContention.Load() >= need {
+                               return false
+                       }
+                       runtime.Semacquire(&sem)
+                       runtime.Semrelease1(&sem, false, 0)
+                       if runtime.MutexContended(runtime.SemRootLock(&sem)) {
+                               sawContention.Add(1)
+                       }
+                       return true
+               }
+
+               stk := []string{
+                       "runtime.unlock",
+                       "runtime.semrelease1",
+                       "runtime_test.TestRuntimeLockMetricsAndProfile.func6.1",
+                       "runtime_test.(*contentionWorker).run",
+               }
+
+               // Verify that we get the call stack we expect, with anything more than zero
+               // nanoseconds / zero samples. The duration of each contention event is
+               // too small relative to the expected overhead for us to verify its
+               // value more directly. Leave that to the explicit lock/unlock test.
+
+               testcase(stk, workers, fn)(t)
+       })
+}
+
+// contentionWorker provides cleaner call stacks for lock contention profile tests
+type contentionWorker struct {
+       before func()
+       fn     func() bool
+       after  func()
+}
+
+func (w *contentionWorker) run() {
+       defer w.after()
+       w.before()
+
+       for w.fn() {
+       }
+}
src/runtime/mprof.go
index af461eef1a3e86e324ac3226cbb8405ef1142dc2..b1930b30202c007a25cf0e2b71f1d3be80f15efd 100644 (file)
@@ -234,6 +234,10 @@ func newBucket(typ bucketType, nstk int) *bucket {
 // stk returns the slice in b holding the stack.
 func (b *bucket) stk() []uintptr {
        stk := (*[maxStack]uintptr)(add(unsafe.Pointer(b), unsafe.Sizeof(*b)))
+       if b.nstk > maxStack {
+               // prove that slicing works; otherwise a failure requires a P
+               throw("bad profile stack count")
+       }
        return stk[:b.nstk:b.nstk]
 }
 
@@ -509,7 +513,237 @@ func saveblockevent(cycles, rate int64, skip int, which bucketType) {
        } else {
                nstk = gcallers(gp.m.curg, skip, stk[:])
        }
-       b := stkbucket(which, 0, stk[:nstk], true)
+
+       saveBlockEventStack(cycles, rate, stk[:nstk], which)
+}
+
+// lockTimer assists with profiling contention on runtime-internal locks.
+//
+// There are several steps between the time that an M experiences contention and
+// when that contention may be added to the profile. This comes from our
+// constraints: We need to keep the critical section of each lock small,
+// especially when those locks are contended. The reporting code cannot acquire
+// new locks until the M has released all other locks, which means no memory
+// allocations and encourages use of (temporary) M-local storage.
+//
+// The M will have space for storing one call stack that caused contention, and
+// for the magnitude of that contention. It will also have space to store the
+// magnitude of additional contention the M caused, since it only has space to
+// remember one call stack and might encounter several contention events before
+// it releases all of its locks and is thus able to transfer the local buffer
+// into the profile.
+//
+// The M will collect the call stack when it unlocks the contended lock. That
+// minimizes the impact on the critical section of the contended lock, and
+// matches the mutex profile's behavior for contention in sync.Mutex: measured
+// at the Unlock method.
+//
+// The profile for contention on sync.Mutex blames the caller of Unlock for the
+// amount of contention experienced by the callers of Lock which had to wait.
+// When there are several critical sections, this allows identifying which of
+// them is responsible.
+//
+// Matching that behavior for runtime-internal locks will require identifying
+// which Ms are blocked on the mutex. The semaphore-based implementation is
+// ready to allow that, but the futex-based implementation will require a bit
+// more work. Until then, we report contention on runtime-internal locks with a
+// call stack taken from the unlock call (like the rest of the user-space
+// "mutex" profile), but assign it a duration value based on how long the
+// previous lock call took (like the user-space "block" profile).
+//
+// Thus, reporting the call stacks of runtime-internal lock contention is
+// guarded by GODEBUG for now. Set GODEBUG=profileruntimelocks=1 to enable.
+//
+// TODO(rhysh): plumb through the delay duration, remove GODEBUG, update comment
+//
+// The M will track this by storing a pointer to the lock; lock/unlock pairs for
+// runtime-internal locks are always on the same M.
+//
+// Together, that demands several steps for recording contention. First, when
+// finally acquiring a contended lock, the M decides whether it should plan to
+// profile that event by storing a pointer to the lock in its "to be profiled
+// upon unlock" field. If that field is already set, it uses the relative
+// magnitudes to weight a random choice between itself and the other lock, with
+// the loser's time being added to the "additional contention" field. Otherwise
+// if the M's call stack buffer is occupied, it does the comparison against that
+// sample's magnitude.
+//
+// Second, having unlocked a mutex the M checks to see if it should capture the
+// call stack into its local buffer. Finally, when the M unlocks its last mutex,
+// it transfers the local buffer into the profile. As part of that step, it also
+// transfers any "additional contention" time to the profile. Any lock
+// contention that it experiences while adding samples to the profile will be
+// recorded later as "additional contention" and not include a call stack, to
+// avoid an echo.
+type lockTimer struct {
+       lock      *mutex
+       timeRate  int64
+       timeStart int64
+       tickStart int64
+}
+
+func (lt *lockTimer) begin() {
+       rate := int64(atomic.Load64(&mutexprofilerate))
+
+       lt.timeRate = gTrackingPeriod
+       if rate != 0 && rate < lt.timeRate {
+               lt.timeRate = rate
+       }
+       if int64(fastrand())%lt.timeRate == 0 {
+               lt.timeStart = nanotime()
+       }
+
+       if rate > 0 && int64(fastrand())%rate == 0 {
+               lt.tickStart = cputicks()
+       }
+}
+
+func (lt *lockTimer) end() {
+       gp := getg()
+
+       if lt.timeStart != 0 {
+               nowTime := nanotime()
+               gp.m.mLockProfile.waitTime.Add((nowTime - lt.timeStart) * lt.timeRate)
+       }
+
+       if lt.tickStart != 0 {
+               nowTick := cputicks()
+               gp.m.mLockProfile.recordLock(nowTick-lt.tickStart, lt.lock)
+       }
+}
+
+type mLockProfile struct {
+       waitTime   atomic.Int64      // total nanoseconds spent waiting in runtime.lockWithRank
+       stack      [maxStack]uintptr // stack that experienced contention in runtime.lockWithRank
+       pending    uintptr           // *mutex that experienced contention (to be traceback-ed)
+       cycles     int64             // cycles attributable to "pending" (if set), otherwise to "stack"
+       cyclesLost int64             // contention for which we weren't able to record a call stack
+       disabled   bool              // attribute all time to "lost"
+}
+
+func (prof *mLockProfile) recordLock(cycles int64, l *mutex) {
+       if cycles <= 0 {
+               return
+       }
+
+       if prof.disabled {
+               // We're experiencing contention while attempting to report contention.
+               // Make a note of its magnitude, but don't allow it to be the sole cause
+               // of another contention report.
+               prof.cyclesLost += cycles
+               return
+       }
+
+       if uintptr(unsafe.Pointer(l)) == prof.pending {
+               // Optimization: we'd already planned to profile this same lock (though
+               // possibly from a different unlock site).
+               prof.cycles += cycles
+               return
+       }
+
+       if prev := prof.cycles; prev > 0 {
+               // We can only store one call stack for runtime-internal lock contention
+               // on this M, and we've already got one. Decide which should stay, and
+               // add the other to the report for runtime._LostContendedLock.
+               prevScore := fastrand64() % uint64(prev)
+               thisScore := fastrand64() % uint64(cycles)
+               if prevScore > thisScore {
+                       prof.cyclesLost += cycles
+                       return
+               } else {
+                       prof.cyclesLost += prev
+               }
+       }
+       // Saving the *mutex as a uintptr is safe because:
+       //  - lockrank_on.go does this too, which gives it regular exercise
+       //  - the lock would only move if it's stack allocated, which means it
+       //      cannot experience multi-M contention
+       prof.pending = uintptr(unsafe.Pointer(l))
+       prof.cycles = cycles
+}
+
+// From unlock2, we might not be holding a p in this code.
+//
+//go:nowritebarrierrec
+func (prof *mLockProfile) recordUnlock(l *mutex) {
+       if uintptr(unsafe.Pointer(l)) == prof.pending {
+               prof.captureStack()
+       }
+       if gp := getg(); gp.m.locks == 1 && gp.m.mLockProfile.cycles != 0 {
+               prof.store()
+       }
+}
+
+func (prof *mLockProfile) captureStack() {
+       skip := 3 // runtime.(*mLockProfile).recordUnlock runtime.unlock2 runtime.unlockWithRank
+       if staticLockRanking {
+               // When static lock ranking is enabled, we'll always be on the system
+               // stack at this point. There will be a runtime.unlockWithRank.func1
+               // frame, and if the call to runtime.unlock took place on a user stack
+               // then there'll also be a runtime.systemstack frame. To keep stack
+               // traces somewhat consistent whether or not static lock ranking is
+               // enabled, we'd like to skip those. But it's hard to tell how long
+               // we've been on the system stack so accept an extra frame in that case,
+               // with a leaf of "runtime.unlockWithRank runtime.unlock" instead of
+               // "runtime.unlock".
+               skip += 1 // runtime.unlockWithRank.func1
+       }
+       prof.pending = 0
+
+       if debug.profileruntimelocks.Load() == 0 {
+               prof.stack[0] = abi.FuncPCABIInternal(_LostContendedLock) + sys.PCQuantum
+               prof.stack[1] = 0
+               return
+       }
+
+       var nstk int
+       gp := getg()
+       sp := getcallersp()
+       pc := getcallerpc()
+       systemstack(func() {
+               var u unwinder
+               u.initAt(pc, sp, 0, gp, unwindSilentErrors|unwindJumpStack)
+               nstk = tracebackPCs(&u, skip, prof.stack[:])
+       })
+       if nstk < len(prof.stack) {
+               prof.stack[nstk] = 0
+       }
+}
+
+func (prof *mLockProfile) store() {
+       // Report any contention we experience within this function as "lost"; it's
+       // important that the act of reporting a contention event not lead to a
+       // reportable contention event. This also means we can use prof.stack
+       // without copying, since it won't change during this function.
+       mp := acquirem()
+       prof.disabled = true
+
+       nstk := maxStack
+       for i := 0; i < nstk; i++ {
+               if pc := prof.stack[i]; pc == 0 {
+                       nstk = i
+                       break
+               }
+       }
+
+       cycles, lost := prof.cycles, prof.cyclesLost
+       prof.cycles, prof.cyclesLost = 0, 0
+
+       rate := int64(atomic.Load64(&mutexprofilerate))
+       saveBlockEventStack(cycles, rate, prof.stack[:nstk], mutexProfile)
+       if lost > 0 {
+               lostStk := [...]uintptr{
+                       abi.FuncPCABIInternal(_LostContendedLock) + sys.PCQuantum,
+               }
+               saveBlockEventStack(lost, rate, lostStk[:], mutexProfile)
+       }
+
+       prof.disabled = false
+       releasem(mp)
+}
+
+func saveBlockEventStack(cycles, rate int64, stk []uintptr, which bucketType) {
+       b := stkbucket(which, 0, stk, true)
        bp := b.bp()
 
        lock(&profBlockLock)
src/runtime/proc.go
index 824b0fa0091c58b7faaedc81590bd3519f4205e6..6b13d3b2f6e16ff77855625374e06c16aa3cd27f 100644 (file)
@@ -919,7 +919,7 @@ func mcommoninit(mp *m, id int64) {
        // when it is just in a register or thread-local storage.
        mp.alllink = allm
 
-       // NumCgoCall() iterates over allm w/o schedlock,
+       // NumCgoCall() and others iterate over allm w/o schedlock,
        // so we need to publish it safely.
        atomicstorep(unsafe.Pointer(&allm), unsafe.Pointer(mp))
        unlock(&sched.lock)
@@ -1852,6 +1852,7 @@ found:
        unlock(&sched.lock)
 
        atomic.Xadd64(&ncgocall, int64(mp.ncgocall))
+       sched.totalRuntimeLockWaitTime.Add(mp.mLockProfile.waitTime.Load())
 
        // Release the P.
        handoffp(releasep())
@@ -5270,6 +5271,7 @@ func _ExternalCode()              { _ExternalCode() }
 func _LostExternalCode()          { _LostExternalCode() }
 func _GC()                        { _GC() }
 func _LostSIGPROFDuringAtomic64() { _LostSIGPROFDuringAtomic64() }
+func _LostContendedLock()         { _LostContendedLock() }
 func _VDSO()                      { _VDSO() }
 
 // Called if we receive a SIGPROF signal.
src/runtime/runtime1.go
index 489dcdd79ca4bf7eb8636296645b5eecb5e9e5bc..877d94eef28d85a76f72bf1fb5813799a1669b24 100644 (file)
@@ -307,26 +307,27 @@ type dbgVar struct {
 // existing int var for that value, which may
 // already have an initial value.
 var debug struct {
-       cgocheck           int32
-       clobberfree        int32
-       dontfreezetheworld int32
-       efence             int32
-       gccheckmark        int32
-       gcpacertrace       int32
-       gcshrinkstackoff   int32
-       gcstoptheworld     int32
-       gctrace            int32
-       invalidptr         int32
-       madvdontneed       int32 // for Linux; issue 28466
-       scavtrace          int32
-       scheddetail        int32
-       schedtrace         int32
-       tracebackancestors int32
-       asyncpreemptoff    int32
-       harddecommit       int32
-       adaptivestackstart int32
-       tracefpunwindoff   int32
-       traceadvanceperiod int32
+       cgocheck            int32
+       clobberfree         int32
+       dontfreezetheworld  int32
+       efence              int32
+       gccheckmark         int32
+       gcpacertrace        int32
+       gcshrinkstackoff    int32
+       gcstoptheworld      int32
+       gctrace             int32
+       invalidptr          int32
+       madvdontneed        int32 // for Linux; issue 28466
+       profileruntimelocks atomic.Int32
+       scavtrace           int32
+       scheddetail         int32
+       schedtrace          int32
+       tracebackancestors  int32
+       asyncpreemptoff     int32
+       harddecommit        int32
+       adaptivestackstart  int32
+       tracefpunwindoff    int32
+       traceadvanceperiod  int32
 
        // debug.malloc is used as a combined debug check
        // in the malloc function and should be set
@@ -352,6 +353,7 @@ var dbgvars = []*dbgVar{
        {name: "gctrace", value: &debug.gctrace},
        {name: "invalidptr", value: &debug.invalidptr},
        {name: "madvdontneed", value: &debug.madvdontneed},
+       {name: "profileruntimelocks", atomic: &debug.profileruntimelocks},
        {name: "sbrk", value: &debug.sbrk},
        {name: "scavtrace", value: &debug.scavtrace},
        {name: "scheddetail", value: &debug.scheddetail},
src/runtime/runtime2.go
index e64c3c569544ca10e66483cbf620ead443b002d9..6bdd66766db761c342269d062b5d847e5de32914 100644 (file)
@@ -593,6 +593,8 @@ type m struct {
        lockedInt     uint32      // tracking for internal lockOSThread
        nextwaitm     muintptr    // next m waiting for lock
 
+       mLockProfile mLockProfile // fields relating to runtime.lock contention
+
        // wait* are used to carry arguments from gopark into park_m, because
        // there's no stack to put them on. That is their sole purpose.
        waitunlockf          func(*g, unsafe.Pointer) bool
@@ -900,6 +902,12 @@ type schedt struct {
        // stwTotalTimeOther covers the others.
        stwTotalTimeGC    timeHistogram
        stwTotalTimeOther timeHistogram
+
+       // totalRuntimeLockWaitTime (plus the value of mLockProfile.waitTime on
+       // each M in allm) is the sum of time goroutines have spent in _Grunnable
+       // and with an M, but waiting for locks within the runtime. This field
+       // stores the value for Ms that have exited.
+       totalRuntimeLockWaitTime atomic.Int64
 }
 
 // Values for the flags field of a sigTabT.