runtime: profile contended lock calls
author Rhys Hiltner <rhys@justin.tv>
Tue, 21 Nov 2023 16:03:54 +0000 (16:03 +0000)
committer Gopher Robot <gobot@golang.org>
Tue, 21 Nov 2023 21:02:20 +0000 (21:02 +0000)
Add runtime-internal locks to the mutex contention profile.

Store up to one call stack responsible for lock contention on the M,
until it's safe to contribute its value to the mprof table. Try to use
that limited local storage space for a relatively large source of
contention, and attribute any contention in stacks we're not able to
store to a sentinel _LostContendedLock function.

Avoid ballooning lock contention while manipulating the mprof table by
attributing to that sentinel function any lock contention experienced
while reporting lock contention.

Guard collecting real call stacks with GODEBUG=profileruntimelocks=1,
since the available data has mixed semantics; we can easily capture an
M's own wait time, but we'd prefer for the profile entry of each
critical section to describe how long it made the other Ms wait. It's
too late in the Go 1.22 cycle to make the required changes to
futex-based locks. When not enabled, attribute the time to the sentinel
function instead.
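
Not part of this CL, but as a rough sketch of how the pieces fit together from
user code: enable mutex profiling, run a workload, and dump the "mutex" profile
while the process runs with GODEBUG=profileruntimelocks=1. The workload below
is only illustrative; whether it actually contends runtime-internal locks
depends on the machine and scheduler.

        package main

        import (
                "os"
                "runtime"
                "runtime/pprof"
                "sync"
        )

        func main() {
                // Sample every contention event. With profileruntimelocks=1,
                // contention on runtime-internal locks gets real call stacks in
                // the "mutex" profile; without it, that time is attributed to
                // the sentinel runtime._LostContendedLock.
                runtime.SetMutexProfileFraction(1)

                // An allocation-heavy workload spread across goroutines.
                var wg sync.WaitGroup
                for i := 0; i < 8; i++ {
                        wg.Add(1)
                        go func() {
                                defer wg.Done()
                                buf := make([][]byte, 0, 64)
                                for j := 0; j < 200_000; j++ {
                                        buf = append(buf, make([]byte, 4096))
                                        if len(buf) == cap(buf) {
                                                buf = buf[:0]
                                        }
                                }
                        }()
                }
                wg.Wait()

                pprof.Lookup("mutex").WriteTo(os.Stdout, 1)
        }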

Fixes #57071

This is a roll-forward of https://go.dev/cl/528657, which was reverted
in https://go.dev/cl/543660

Reason for revert: de-flakes tests (reduces dependence on fine-grained
timers, correctly identifies contention on big-endian futex locks,
attempts to measure contention in the semaphore implementation but only
uses that secondary measurement to finish the test early, skips tests on
single-processor systems)

Change-Id: I31389f24283d85e46ad9ba8d4f514cb9add8dfb0
Reviewed-on: https://go-review.googlesource.com/c/go/+/544195
Reviewed-by: Michael Pratt <mpratt@google.com>
LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com>
TryBot-Result: Gopher Robot <gobot@golang.org>
Reviewed-by: Than McIntosh <thanm@google.com>
Auto-Submit: Rhys Hiltner <rhys@justin.tv>
Run-TryBot: Rhys Hiltner <rhys@justin.tv>

13 files changed:
src/runtime/debug.go
src/runtime/export_test.go
src/runtime/extern.go
src/runtime/lock_futex.go
src/runtime/lock_js.go
src/runtime/lock_sema.go
src/runtime/lock_wasip1.go
src/runtime/metrics.go
src/runtime/metrics_test.go
src/runtime/mprof.go
src/runtime/proc.go
src/runtime/runtime1.go
src/runtime/runtime2.go

index 0e61692f3d1003163c30f7c7578c88817deabd19..3233ce8ee73718c1123594b28dfc71f255a9ee0b 100644 (file)
@@ -52,6 +52,17 @@ func NumCgoCall() int64 {
        return n
 }
 
+func totalMutexWaitTimeNanos() int64 {
+       total := sched.totalMutexWaitTime.Load()
+
+       total += sched.totalRuntimeLockWaitTime.Load()
+       for mp := (*m)(atomic.Loadp(unsafe.Pointer(&allm))); mp != nil; mp = mp.alllink {
+               total += mp.mLockProfile.waitTime.Load()
+       }
+
+       return total
+}
+
 // NumGoroutine returns the number of goroutines that currently exist.
 func NumGoroutine() int {
        return int(gcount())
index 2b34929ba051c53c40550dc573cbcb63dffaed15..d071035e6225ccf718571a6d3da7c5f2926341f7 100644 (file)
@@ -1340,6 +1340,18 @@ func PageCachePagesLeaked() (leaked uintptr) {
        return
 }
 
+type Mutex = mutex
+
+var Lock = lock
+var Unlock = unlock
+
+var MutexContended = mutexContended
+
+func SemRootLock(addr *uint32) *mutex {
+       root := semtable.rootFor(addr)
+       return &root.lock
+}
+
 var Semacquire = semacquire
 var Semrelease1 = semrelease1
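
These exports give the runtime's own tests direct access to runtime-internal
mutexes. A minimal sketch of how they are used (it only compiles inside
package runtime_test, and the function name here is illustrative):

        func sketchLockExports(t *testing.T) {
                var mu runtime.Mutex // alias for the runtime-internal mutex type
                runtime.Lock(&mu)
                // MutexContended reports whether another M is currently waiting
                // for mu; with no other locker around, it stays false.
                if runtime.MutexContended(&mu) {
                        t.Error("unexpected contention on an uncontended mutex")
                }
                runtime.Unlock(&mu)
        }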
 
index f8db296e6ba67f73290b0cd69be39f68247f4d86..d199720b9b05cf5c7305524308b2e82b919e862e 100644 (file)
@@ -145,6 +145,14 @@ It is a comma-separated list of name=val pairs setting these named variables:
        risk in that scenario. Currently not supported on Windows, plan9 or js/wasm. Setting this
        option for some applications can produce large traces, so use with care.
 
+       profileruntimelocks: setting profileruntimelocks=1 includes call stacks related to
+       contention on runtime-internal locks in the "mutex" profile, subject to the
+       MutexProfileFraction setting. The call stacks will correspond to the unlock call that
+       released the lock. But instead of the value corresponding to the amount of contention that
+       call stack caused, it corresponds to the amount of time the caller of unlock had to wait
+       in its original call to lock. A future release is expected to align those and remove this
+       setting.
+
        invalidptr: invalidptr=1 (the default) causes the garbage collector and stack
        copier to crash the program if an invalid pointer value (for example, 1)
        is found in a pointer-typed location. Setting invalidptr=0 disables this check.
index cc7d465ef1fc18c44ea56f40f7585bf39b673a2f..867e2b34d0662279311ac1407f27bfda3b6a7b37 100644 (file)
@@ -44,6 +44,10 @@ func key32(p *uintptr) *uint32 {
        return (*uint32)(unsafe.Pointer(p))
 }
 
+func mutexContended(l *mutex) bool {
+       return atomic.Load(key32(&l.key)) > mutex_locked
+}
+
 func lock(l *mutex) {
        lockWithRank(l, getLockRank(l))
 }
@@ -71,6 +75,8 @@ func lock2(l *mutex) {
        // its wakeup call.
        wait := v
 
+       timer := &lockTimer{lock: l}
+       timer.begin()
        // On uniprocessors, no point spinning.
        // On multiprocessors, spin for ACTIVE_SPIN attempts.
        spin := 0
@@ -82,6 +88,7 @@ func lock2(l *mutex) {
                for i := 0; i < spin; i++ {
                        for l.key == mutex_unlocked {
                                if atomic.Cas(key32(&l.key), mutex_unlocked, wait) {
+                                       timer.end()
                                        return
                                }
                        }
@@ -92,6 +99,7 @@ func lock2(l *mutex) {
                for i := 0; i < passive_spin; i++ {
                        for l.key == mutex_unlocked {
                                if atomic.Cas(key32(&l.key), mutex_unlocked, wait) {
+                                       timer.end()
                                        return
                                }
                        }
@@ -101,6 +109,7 @@ func lock2(l *mutex) {
                // Sleep.
                v = atomic.Xchg(key32(&l.key), mutex_sleeping)
                if v == mutex_unlocked {
+                       timer.end()
                        return
                }
                wait = mutex_sleeping
@@ -122,6 +131,7 @@ func unlock2(l *mutex) {
        }
 
        gp := getg()
+       gp.m.mLockProfile.recordUnlock(l)
        gp.m.locks--
        if gp.m.locks < 0 {
                throw("runtimeĀ·unlock: lock count")
index 91ad7be317398f3bb26a7e3e26513d4df5799caa..b6ee5ec7afe2697b75c8dae63b3035bb4b4c867a 100644 (file)
@@ -23,6 +23,10 @@ const (
        passive_spin    = 1
 )
 
+func mutexContended(l *mutex) bool {
+       return false
+}
+
 func lock(l *mutex) {
        lockWithRank(l, getLockRank(l))
 }
index 9afba08b0b243bcd597a8d215824b89054f949fb..073e7d410e4c74bceb6a0d1ca1e4cc4078df127e 100644 (file)
@@ -31,6 +31,10 @@ const (
        passive_spin    = 1
 )
 
+func mutexContended(l *mutex) bool {
+       return atomic.Loaduintptr(&l.key) > locked
+}
+
 func lock(l *mutex) {
        lockWithRank(l, getLockRank(l))
 }
@@ -48,6 +52,8 @@ func lock2(l *mutex) {
        }
        semacreate(gp.m)
 
+       timer := &lockTimer{lock: l}
+       timer.begin()
        // On uniprocessors, no point spinning.
        // On multiprocessors, spin for ACTIVE_SPIN attempts.
        spin := 0
@@ -60,6 +66,7 @@ Loop:
                if v&locked == 0 {
                        // Unlocked. Try to lock.
                        if atomic.Casuintptr(&l.key, v, v|locked) {
+                               timer.end()
                                return
                        }
                        i = 0
@@ -119,6 +126,7 @@ func unlock2(l *mutex) {
                        }
                }
        }
+       gp.m.mLockProfile.recordUnlock(l)
        gp.m.locks--
        if gp.m.locks < 0 {
                throw("runtimeĀ·unlock: lock count")
index c4fc59f6cc222ca54a8ee970d250d716dfdfe02d..acfc62acb48e90d58ec7accbeea0cd41b184bfa1 100644 (file)
@@ -19,6 +19,10 @@ const (
        active_spin_cnt = 30
 )
 
+func mutexContended(l *mutex) bool {
+       return false
+}
+
 func lock(l *mutex) {
        lockWithRank(l, getLockRank(l))
 }
index 4bd167135db8f87c77cb8c758d737ca426b92bc8..f97a3804ab32201c895ea5d45a180067b6a7b305 100644 (file)
@@ -470,7 +470,7 @@ func initMetrics() {
                "/sync/mutex/wait/total:seconds": {
                        compute: func(_ *statAggregate, out *metricValue) {
                                out.kind = metricKindFloat64
-                               out.scalar = float64bits(nsToSec(sched.totalMutexWaitTime.Load()))
+                               out.scalar = float64bits(nsToSec(totalMutexWaitTimeNanos()))
                        },
                },
        }
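
The /sync/mutex/wait/total:seconds metric that now folds in runtime-internal
lock wait time is read through the public runtime/metrics API; a minimal
sketch:

        package main

        import (
                "fmt"
                "runtime/metrics"
        )

        func main() {
                // This total now also covers runtime-internal locks (via
                // totalMutexWaitTimeNanos above), not just sync.Mutex and
                // sync.RWMutex wait time.
                s := []metrics.Sample{{Name: "/sync/mutex/wait/total:seconds"}}
                metrics.Read(s)
                fmt.Printf("total mutex wait: %fs\n", s[0].Value.Float64())
        }
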
index acaa5ca375f114f4f8eed4be5c0f3d2aea62299a..d28a589adc2e1ce627b3fd2ad0598a25c93e55e6 100644 (file)
@@ -6,15 +6,21 @@ package runtime_test
 
 import (
        "bytes"
+       "fmt"
+       "internal/goexperiment"
+       "internal/profile"
        "os"
        "reflect"
        "runtime"
        "runtime/debug"
        "runtime/metrics"
+       "runtime/pprof"
        "runtime/trace"
+       "slices"
        "sort"
        "strings"
        "sync"
+       "sync/atomic"
        "testing"
        "time"
        "unsafe"
@@ -939,3 +945,319 @@ func TestSchedPauseMetrics(t *testing.T) {
                })
        }
 }
+
+func TestRuntimeLockMetricsAndProfile(t *testing.T) {
+       old := runtime.SetMutexProfileFraction(0) // enabled during sub-tests
+       defer runtime.SetMutexProfileFraction(old)
+       if old != 0 {
+               t.Fatalf("need MutexProfileRate 0, got %d", old)
+       }
+
+       {
+               before := os.Getenv("GODEBUG")
+               for _, s := range strings.Split(before, ",") {
+                       if strings.HasPrefix(s, "profileruntimelocks=") {
+                               t.Logf("GODEBUG includes explicit setting %q", s)
+                       }
+               }
+               defer func() { os.Setenv("GODEBUG", before) }()
+               os.Setenv("GODEBUG", fmt.Sprintf("%s,profileruntimelocks=1", before))
+       }
+
+       t.Logf("NumCPU %d", runtime.NumCPU())
+       t.Logf("GOMAXPROCS %d", runtime.GOMAXPROCS(0))
+       if minCPU := 2; runtime.NumCPU() < minCPU {
+               t.Skipf("creating and observing contention on runtime-internal locks requires NumCPU >= %d", minCPU)
+       }
+
+       loadProfile := func(t *testing.T) *profile.Profile {
+               var w bytes.Buffer
+               pprof.Lookup("mutex").WriteTo(&w, 0)
+               p, err := profile.Parse(&w)
+               if err != nil {
+                       t.Fatalf("failed to parse profile: %v", err)
+               }
+               if err := p.CheckValid(); err != nil {
+                       t.Fatalf("invalid profile: %v", err)
+               }
+               return p
+       }
+
+       measureDelta := func(t *testing.T, fn func()) (metricGrowth, profileGrowth float64, p *profile.Profile) {
+               beforeProfile := loadProfile(t)
+               beforeMetrics := []metrics.Sample{{Name: "/sync/mutex/wait/total:seconds"}}
+               metrics.Read(beforeMetrics)
+
+               fn()
+
+               afterProfile := loadProfile(t)
+               afterMetrics := []metrics.Sample{{Name: "/sync/mutex/wait/total:seconds"}}
+               metrics.Read(afterMetrics)
+
+               sumSamples := func(p *profile.Profile, i int) int64 {
+                       var sum int64
+                       for _, s := range p.Sample {
+                               sum += s.Value[i]
+                       }
+                       return sum
+               }
+
+               metricGrowth = afterMetrics[0].Value.Float64() - beforeMetrics[0].Value.Float64()
+               profileGrowth = float64(sumSamples(afterProfile, 1)-sumSamples(beforeProfile, 1)) * time.Nanosecond.Seconds()
+
+               // The internal/profile package does not support compaction; this delta
+               // profile will include separate positive and negative entries.
+               p = afterProfile.Copy()
+               if len(beforeProfile.Sample) > 0 {
+                       err := p.Merge(beforeProfile, -1)
+                       if err != nil {
+                               t.Fatalf("Merge profiles: %v", err)
+                       }
+               }
+
+               return metricGrowth, profileGrowth, p
+       }
+
+       testcase := func(strictTiming bool, stk []string, workers int, fn func() bool) func(t *testing.T) (metricGrowth, profileGrowth float64, n, value int64) {
+               return func(t *testing.T) (metricGrowth, profileGrowth float64, n, value int64) {
+                       metricGrowth, profileGrowth, p := measureDelta(t, func() {
+                               var started, stopped sync.WaitGroup
+                               started.Add(workers)
+                               stopped.Add(workers)
+                               for i := 0; i < workers; i++ {
+                                       w := &contentionWorker{
+                                               before: func() {
+                                                       started.Done()
+                                                       started.Wait()
+                                               },
+                                               after: func() {
+                                                       stopped.Done()
+                                               },
+                                               fn: fn,
+                                       }
+                                       go w.run()
+                               }
+                               stopped.Wait()
+                       })
+
+                       if profileGrowth == 0 {
+                               t.Errorf("no increase in mutex profile")
+                       }
+                       if metricGrowth == 0 && strictTiming {
+                               // If the critical section is very short, systems with low timer
+                               // resolution may be unable to measure it via nanotime.
+                               t.Errorf("no increase in /sync/mutex/wait/total:seconds metric")
+                       }
+                       // This comparison is possible because the time measurements in support of
+                       // runtime/pprof and runtime/metrics for runtime-internal locks are so close
+                       // together. It doesn't work as well for user-space contention, where the
+                       // involved goroutines are not _Grunnable the whole time and so need to pass
+                       // through the scheduler.
+                       t.Logf("lock contention growth in runtime/pprof's view  (%fs)", profileGrowth)
+                       t.Logf("lock contention growth in runtime/metrics' view (%fs)", metricGrowth)
+
+                       if goexperiment.StaticLockRanking {
+                               if !slices.ContainsFunc(stk, func(s string) bool {
+                                       return s == "runtime.systemstack" || s == "runtime.mcall" || s == "runtime.mstart"
+                               }) {
+                                       // stk is a call stack that is still on the user stack when
+                                       // it calls runtime.unlock. Add the extra function that
+                                       // we'll see, when the static lock ranking implementation of
+                                       // runtime.unlockWithRank switches to the system stack.
+                                       stk = append([]string{"runtime.unlockWithRank"}, stk...)
+                               }
+                       }
+
+                       var stks [][]string
+                       for _, s := range p.Sample {
+                               var have []string
+                               for _, loc := range s.Location {
+                                       for _, line := range loc.Line {
+                                               have = append(have, line.Function.Name)
+                                       }
+                               }
+                               stks = append(stks, have)
+                               if slices.Equal(have, stk) {
+                                       n += s.Value[0]
+                                       value += s.Value[1]
+                               }
+                       }
+                       t.Logf("stack %v has samples totaling n=%d value=%d", stk, n, value)
+                       if n == 0 && value == 0 {
+                               t.Logf("profile:\n%s", p)
+                               for _, have := range stks {
+                                       t.Logf("have stack %v", have)
+                               }
+                               t.Errorf("want stack %v", stk)
+                       }
+
+                       return metricGrowth, profileGrowth, n, value
+               }
+       }
+
+       name := t.Name()
+
+       t.Run("runtime.lock", func(t *testing.T) {
+               mus := make([]runtime.Mutex, 100)
+               var needContention atomic.Int64
+               delay := 100 * time.Microsecond // large relative to system noise, for comparison between clocks
+               delayMicros := delay.Microseconds()
+
+               // The goroutine that acquires the lock will only proceed when it
+               // detects that its partner is contended for the lock. That will lead to
+               // live-lock if anything (such as a STW) prevents the partner goroutine
+               // from running. Allowing the contention workers to pause and restart
+               // (to allow a STW to proceed) makes it harder to confirm that we're
+               // counting the correct number of contention events, since some locks
+               // will end up contended twice. Instead, disable the GC.
+               defer debug.SetGCPercent(debug.SetGCPercent(-1))
+
+               const workers = 2
+               if runtime.GOMAXPROCS(0) < workers {
+                       t.Skipf("contention on runtime-internal locks requires GOMAXPROCS >= %d", workers)
+               }
+
+               fn := func() bool {
+                       n := int(needContention.Load())
+                       if n < 0 {
+                               return false
+                       }
+                       mu := &mus[n]
+
+                       runtime.Lock(mu)
+                       for int(needContention.Load()) == n {
+                               if runtime.MutexContended(mu) {
+                                       // make them wait a little while
+                                       for start := runtime.Nanotime(); (runtime.Nanotime()-start)/1000 < delayMicros; {
+                                               runtime.Usleep(uint32(delayMicros))
+                                       }
+                                       break
+                               }
+                       }
+                       runtime.Unlock(mu)
+                       needContention.Store(int64(n - 1))
+
+                       return true
+               }
+
+               stk := []string{
+                       "runtime.unlock",
+                       "runtime_test." + name + ".func5.1",
+                       "runtime_test.(*contentionWorker).run",
+               }
+
+               t.Run("sample-1", func(t *testing.T) {
+                       old := runtime.SetMutexProfileFraction(1)
+                       defer runtime.SetMutexProfileFraction(old)
+
+                       needContention.Store(int64(len(mus) - 1))
+                       metricGrowth, profileGrowth, n, _ := testcase(true, stk, workers, fn)(t)
+
+                       if have, want := metricGrowth, delay.Seconds()*float64(len(mus)); have < want {
+                               // The test imposes a delay with usleep, verified with calls to
+                               // nanotime. Compare against the runtime/metrics package's view
+                               // (based on nanotime) rather than runtime/pprof's view (based
+                               // on cputicks).
+                               t.Errorf("runtime/metrics reported less than the known minimum contention duration (%fs < %fs)", have, want)
+                       }
+                       if have, want := n, int64(len(mus)); have != want {
+                               t.Errorf("mutex profile reported contention count different from the known true count (%d != %d)", have, want)
+                       }
+
+                       const slop = 1.5 // account for nanotime vs cputicks
+                       if profileGrowth > slop*metricGrowth || metricGrowth > slop*profileGrowth {
+                               t.Errorf("views differ by more than %fx", slop)
+                       }
+               })
+
+               t.Run("sample-2", func(t *testing.T) {
+                       old := runtime.SetMutexProfileFraction(2)
+                       defer runtime.SetMutexProfileFraction(old)
+
+                       needContention.Store(int64(len(mus) - 1))
+                       metricGrowth, profileGrowth, n, _ := testcase(true, stk, workers, fn)(t)
+
+                       // With 100 trials and profile fraction of 2, we expect to capture
+                       // 50 samples. Allow the test to pass if we get at least 20 samples;
+                       // the CDF of the binomial distribution says there's less than a
+                       // 1e-9 chance of that, which is an acceptably low flakiness rate.
+                       const samplingSlop = 2.5
+
+                       if have, want := metricGrowth, delay.Seconds()*float64(len(mus)); samplingSlop*have < want {
+                               // The test imposes a delay with usleep, verified with calls to
+                               // nanotime. Compare against the runtime/metrics package's view
+                               // (based on nanotime) rather than runtime/pprof's view (based
+                               // on cputicks).
+                               t.Errorf("runtime/metrics reported less than the known minimum contention duration (%f * %fs < %fs)", samplingSlop, have, want)
+                       }
+                       if have, want := n, int64(len(mus)); float64(have) > float64(want)*samplingSlop || float64(want) > float64(have)*samplingSlop {
+                               t.Errorf("mutex profile reported contention count too different from the expected count (%d far from %d)", have, want)
+                       }
+
+                       const timerSlop = 1.5 * samplingSlop // account for nanotime vs cputicks, plus the two views' independent sampling
+                       if profileGrowth > timerSlop*metricGrowth || metricGrowth > timerSlop*profileGrowth {
+                               t.Errorf("views differ by more than %fx", timerSlop)
+                       }
+               })
+       })
+
+       t.Run("runtime.semrelease", func(t *testing.T) {
+               old := runtime.SetMutexProfileFraction(1)
+               defer runtime.SetMutexProfileFraction(old)
+
+               const workers = 3
+               if runtime.GOMAXPROCS(0) < workers {
+                       t.Skipf("creating and observing contention on runtime-internal semaphores requires GOMAXPROCS >= %d", workers)
+               }
+
+               var sem uint32 = 1
+               var tries atomic.Int32
+               tries.Store(10_000_000) // prefer controlled failure to timeout
+               var sawContention atomic.Int32
+               var need int32 = 1
+               fn := func() bool {
+                       if sawContention.Load() >= need {
+                               return false
+                       }
+                       if tries.Add(-1) < 0 {
+                               return false
+                       }
+
+                       runtime.Semacquire(&sem)
+                       runtime.Semrelease1(&sem, false, 0)
+                       if runtime.MutexContended(runtime.SemRootLock(&sem)) {
+                               sawContention.Add(1)
+                       }
+                       return true
+               }
+
+               stk := []string{
+                       "runtime.unlock",
+                       "runtime.semrelease1",
+                       "runtime_test.TestRuntimeLockMetricsAndProfile.func6.1",
+                       "runtime_test.(*contentionWorker).run",
+               }
+
+               // Verify that we get call stack we expect, with anything more than zero
+               // cycles / zero samples. The duration of each contention event is too
+               // small relative to the expected overhead for us to verify its value
+               // more directly. Leave that to the explicit lock/unlock test.
+
+               testcase(false, stk, workers, fn)(t)
+       })
+}
+
+// contentionWorker provides cleaner call stacks for lock contention profile tests
+type contentionWorker struct {
+       before func()
+       fn     func() bool
+       after  func()
+}
+
+func (w *contentionWorker) run() {
+       defer w.after()
+       w.before()
+
+       for w.fn() {
+       }
+}
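
A usage note rather than part of the change: the new test sets the GODEBUG
value itself and skips on single-CPU machines, so it can be exercised directly
with, for example:

        go test -run TestRuntimeLockMetricsAndProfile -v runtime
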
index af461eef1a3e86e324ac3226cbb8405ef1142dc2..b1930b30202c007a25cf0e2b71f1d3be80f15efd 100644 (file)
@@ -234,6 +234,10 @@ func newBucket(typ bucketType, nstk int) *bucket {
 // stk returns the slice in b holding the stack.
 func (b *bucket) stk() []uintptr {
        stk := (*[maxStack]uintptr)(add(unsafe.Pointer(b), unsafe.Sizeof(*b)))
+       if b.nstk > maxStack {
+               // prove that slicing works; otherwise a failure requires a P
+               throw("bad profile stack count")
+       }
        return stk[:b.nstk:b.nstk]
 }
 
@@ -509,7 +513,237 @@ func saveblockevent(cycles, rate int64, skip int, which bucketType) {
        } else {
                nstk = gcallers(gp.m.curg, skip, stk[:])
        }
-       b := stkbucket(which, 0, stk[:nstk], true)
+
+       saveBlockEventStack(cycles, rate, stk[:nstk], which)
+}
+
+// lockTimer assists with profiling contention on runtime-internal locks.
+//
+// There are several steps between the time that an M experiences contention and
+// when that contention may be added to the profile. This comes from our
+// constraints: We need to keep the critical section of each lock small,
+// especially when those locks are contended. The reporting code cannot acquire
+// new locks until the M has released all other locks, which means no memory
+// allocations and encourages use of (temporary) M-local storage.
+//
+// The M will have space for storing one call stack that caused contention, and
+// for the magnitude of that contention. It will also have space to store the
+// magnitude of additional contention the M caused, since it only has space to
+// remember one call stack and might encounter several contention events before
+// it releases all of its locks and is thus able to transfer the local buffer
+// into the profile.
+//
+// The M will collect the call stack when it unlocks the contended lock. That
+// minimizes the impact on the critical section of the contended lock, and
+// matches the mutex profile's behavior for contention in sync.Mutex: measured
+// at the Unlock method.
+//
+// The profile for contention on sync.Mutex blames the caller of Unlock for the
+// amount of contention experienced by the callers of Lock which had to wait.
+// When there are several critical sections, this allows identifying which of
+// them is responsible.
+//
+// Matching that behavior for runtime-internal locks will require identifying
+// which Ms are blocked on the mutex. The semaphore-based implementation is
+// ready to allow that, but the futex-based implementation will require a bit
+// more work. Until then, we report contention on runtime-internal locks with a
+// call stack taken from the unlock call (like the rest of the user-space
+// "mutex" profile), but assign it a duration value based on how long the
+// previous lock call took (like the user-space "block" profile).
+//
+// Thus, reporting the call stacks of runtime-internal lock contention is
+// guarded by GODEBUG for now. Set GODEBUG=profileruntimelocks=1 to enable.
+//
+// TODO(rhysh): plumb through the delay duration, remove GODEBUG, update comment
+//
+// The M will track this by storing a pointer to the lock; lock/unlock pairs for
+// runtime-internal locks are always on the same M.
+//
+// Together, that demands several steps for recording contention. First, when
+// finally acquiring a contended lock, the M decides whether it should plan to
+// profile that event by storing a pointer to the lock in its "to be profiled
+// upon unlock" field. If that field is already set, it uses the relative
+// magnitudes to weight a random choice between itself and the other lock, with
+// the loser's time being added to the "additional contention" field. Otherwise
+// if the M's call stack buffer is occupied, it does the comparison against that
+// sample's magnitude.
+//
+// Second, having unlocked a mutex the M checks to see if it should capture the
+// call stack into its local buffer. Finally, when the M unlocks its last mutex,
+// it transfers the local buffer into the profile. As part of that step, it also
+// transfers any "additional contention" time to the profile. Any lock
+// contention that it experiences while adding samples to the profile will be
+// recorded later as "additional contention" and not include a call stack, to
+// avoid an echo.
+type lockTimer struct {
+       lock      *mutex
+       timeRate  int64
+       timeStart int64
+       tickStart int64
+}
+
+func (lt *lockTimer) begin() {
+       rate := int64(atomic.Load64(&mutexprofilerate))
+
+       lt.timeRate = gTrackingPeriod
+       if rate != 0 && rate < lt.timeRate {
+               lt.timeRate = rate
+       }
+       if int64(fastrand())%lt.timeRate == 0 {
+               lt.timeStart = nanotime()
+       }
+
+       if rate > 0 && int64(fastrand())%rate == 0 {
+               lt.tickStart = cputicks()
+       }
+}
+
+func (lt *lockTimer) end() {
+       gp := getg()
+
+       if lt.timeStart != 0 {
+               nowTime := nanotime()
+               gp.m.mLockProfile.waitTime.Add((nowTime - lt.timeStart) * lt.timeRate)
+       }
+
+       if lt.tickStart != 0 {
+               nowTick := cputicks()
+               gp.m.mLockProfile.recordLock(nowTick-lt.tickStart, lt.lock)
+       }
+}
+
+type mLockProfile struct {
+       waitTime   atomic.Int64      // total nanoseconds spent waiting in runtime.lockWithRank
+       stack      [maxStack]uintptr // stack that experienced contention in runtime.lockWithRank
+       pending    uintptr           // *mutex that experienced contention (to be traceback-ed)
+       cycles     int64             // cycles attributable to "pending" (if set), otherwise to "stack"
+       cyclesLost int64             // contention for which we weren't able to record a call stack
+       disabled   bool              // attribute all time to "lost"
+}
+
+func (prof *mLockProfile) recordLock(cycles int64, l *mutex) {
+       if cycles <= 0 {
+               return
+       }
+
+       if prof.disabled {
+               // We're experiencing contention while attempting to report contention.
+               // Make a note of its magnitude, but don't allow it to be the sole cause
+               // of another contention report.
+               prof.cyclesLost += cycles
+               return
+       }
+
+       if uintptr(unsafe.Pointer(l)) == prof.pending {
+               // Optimization: we'd already planned to profile this same lock (though
+               // possibly from a different unlock site).
+               prof.cycles += cycles
+               return
+       }
+
+       if prev := prof.cycles; prev > 0 {
+               // We can only store one call stack for runtime-internal lock contention
+               // on this M, and we've already got one. Decide which should stay, and
+               // add the other to the report for runtime._LostContendedLock.
+               prevScore := fastrand64() % uint64(prev)
+               thisScore := fastrand64() % uint64(cycles)
+               if prevScore > thisScore {
+                       prof.cyclesLost += cycles
+                       return
+               } else {
+                       prof.cyclesLost += prev
+               }
+       }
+       // Saving the *mutex as a uintptr is safe because:
+       //  - lockrank_on.go does this too, which gives it regular exercise
+       //  - the lock would only move if it's stack allocated, which means it
+       //      cannot experience multi-M contention
+       prof.pending = uintptr(unsafe.Pointer(l))
+       prof.cycles = cycles
+}
+
+// From unlock2, we might not be holding a p in this code.
+//
+//go:nowritebarrierrec
+func (prof *mLockProfile) recordUnlock(l *mutex) {
+       if uintptr(unsafe.Pointer(l)) == prof.pending {
+               prof.captureStack()
+       }
+       if gp := getg(); gp.m.locks == 1 && gp.m.mLockProfile.cycles != 0 {
+               prof.store()
+       }
+}
+
+func (prof *mLockProfile) captureStack() {
+       skip := 3 // runtime.(*mLockProfile).recordUnlock runtime.unlock2 runtime.unlockWithRank
+       if staticLockRanking {
+               // When static lock ranking is enabled, we'll always be on the system
+               // stack at this point. There will be a runtime.unlockWithRank.func1
+               // frame, and if the call to runtime.unlock took place on a user stack
+               // then there'll also be a runtime.systemstack frame. To keep stack
+               // traces somewhat consistent whether or not static lock ranking is
+               // enabled, we'd like to skip those. But it's hard to tell how long
+               // we've been on the system stack so accept an extra frame in that case,
+               // with a leaf of "runtime.unlockWithRank runtime.unlock" instead of
+               // "runtime.unlock".
+               skip += 1 // runtime.unlockWithRank.func1
+       }
+       prof.pending = 0
+
+       if debug.profileruntimelocks.Load() == 0 {
+               prof.stack[0] = abi.FuncPCABIInternal(_LostContendedLock) + sys.PCQuantum
+               prof.stack[1] = 0
+               return
+       }
+
+       var nstk int
+       gp := getg()
+       sp := getcallersp()
+       pc := getcallerpc()
+       systemstack(func() {
+               var u unwinder
+               u.initAt(pc, sp, 0, gp, unwindSilentErrors|unwindJumpStack)
+               nstk = tracebackPCs(&u, skip, prof.stack[:])
+       })
+       if nstk < len(prof.stack) {
+               prof.stack[nstk] = 0
+       }
+}
+
+func (prof *mLockProfile) store() {
+       // Report any contention we experience within this function as "lost"; it's
+       // important that the act of reporting a contention event not lead to a
+       // reportable contention event. This also means we can use prof.stack
+       // without copying, since it won't change during this function.
+       mp := acquirem()
+       prof.disabled = true
+
+       nstk := maxStack
+       for i := 0; i < nstk; i++ {
+               if pc := prof.stack[i]; pc == 0 {
+                       nstk = i
+                       break
+               }
+       }
+
+       cycles, lost := prof.cycles, prof.cyclesLost
+       prof.cycles, prof.cyclesLost = 0, 0
+
+       rate := int64(atomic.Load64(&mutexprofilerate))
+       saveBlockEventStack(cycles, rate, prof.stack[:nstk], mutexProfile)
+       if lost > 0 {
+               lostStk := [...]uintptr{
+                       abi.FuncPCABIInternal(_LostContendedLock) + sys.PCQuantum,
+               }
+               saveBlockEventStack(lost, rate, lostStk[:], mutexProfile)
+       }
+
+       prof.disabled = false
+       releasem(mp)
+}
+
+func saveBlockEventStack(cycles, rate int64, stk []uintptr, which bucketType) {
+       b := stkbucket(which, 0, stk, true)
        bp := b.bp()
 
        lock(&profBlockLock)
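
One detail from recordLock above that may be easier to see in isolation: when
the M's single sample slot is already occupied, the choice of which sample to
keep is randomized, weighted by the two magnitudes. A standalone sketch of
that decision rule (the names here are illustrative, not part of the CL; the
runtime uses fastrand64 rather than math/rand):

        package main

        import (
                "fmt"
                "math/rand"
        )

        // chooseSample mirrors the slot-replacement rule in
        // (*mLockProfile).recordLock: keep either the previously stored
        // contention sample or the new one, biased toward the larger
        // magnitude. The loser's cycles are added to the "lost" total and
        // reported without a call stack. Both inputs are assumed positive.
        func chooseSample(prevCycles, newCycles int64, rnd func() uint64) (keepNew bool) {
                prevScore := rnd() % uint64(prevCycles)
                newScore := rnd() % uint64(newCycles)
                return newScore >= prevScore
        }

        func main() {
                // With a new sample 10x larger than the stored one, the new
                // sample should win the slot most of the time.
                wins := 0
                for i := 0; i < 10_000; i++ {
                        if chooseSample(100, 1_000, rand.Uint64) {
                                wins++
                        }
                }
                fmt.Printf("new sample kept %d of 10000 times\n", wins)
        }
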
index edf692fd1a13dc934a6da251fb6b10e45fb0cfee..fd05687301e7309c9b3f8eb968850e64ac9a0a9b 100644 (file)
@@ -922,7 +922,7 @@ func mcommoninit(mp *m, id int64) {
        // when it is just in a register or thread-local storage.
        mp.alllink = allm
 
-       // NumCgoCall() iterates over allm w/o schedlock,
+       // NumCgoCall() and others iterate over allm w/o schedlock,
        // so we need to publish it safely.
        atomicstorep(unsafe.Pointer(&allm), unsafe.Pointer(mp))
        unlock(&sched.lock)
@@ -1855,6 +1855,7 @@ found:
        unlock(&sched.lock)
 
        atomic.Xadd64(&ncgocall, int64(mp.ncgocall))
+       sched.totalRuntimeLockWaitTime.Add(mp.mLockProfile.waitTime.Load())
 
        // Release the P.
        handoffp(releasep())
@@ -5273,6 +5274,7 @@ func _ExternalCode()              { _ExternalCode() }
 func _LostExternalCode()          { _LostExternalCode() }
 func _GC()                        { _GC() }
 func _LostSIGPROFDuringAtomic64() { _LostSIGPROFDuringAtomic64() }
+func _LostContendedLock()         { _LostContendedLock() }
 func _VDSO()                      { _VDSO() }
 
 // Called if we receive a SIGPROF signal.
index 489dcdd79ca4bf7eb8636296645b5eecb5e9e5bc..877d94eef28d85a76f72bf1fb5813799a1669b24 100644 (file)
@@ -307,26 +307,27 @@ type dbgVar struct {
 // existing int var for that value, which may
 // already have an initial value.
 var debug struct {
-       cgocheck           int32
-       clobberfree        int32
-       dontfreezetheworld int32
-       efence             int32
-       gccheckmark        int32
-       gcpacertrace       int32
-       gcshrinkstackoff   int32
-       gcstoptheworld     int32
-       gctrace            int32
-       invalidptr         int32
-       madvdontneed       int32 // for Linux; issue 28466
-       scavtrace          int32
-       scheddetail        int32
-       schedtrace         int32
-       tracebackancestors int32
-       asyncpreemptoff    int32
-       harddecommit       int32
-       adaptivestackstart int32
-       tracefpunwindoff   int32
-       traceadvanceperiod int32
+       cgocheck            int32
+       clobberfree         int32
+       dontfreezetheworld  int32
+       efence              int32
+       gccheckmark         int32
+       gcpacertrace        int32
+       gcshrinkstackoff    int32
+       gcstoptheworld      int32
+       gctrace             int32
+       invalidptr          int32
+       madvdontneed        int32 // for Linux; issue 28466
+       profileruntimelocks atomic.Int32
+       scavtrace           int32
+       scheddetail         int32
+       schedtrace          int32
+       tracebackancestors  int32
+       asyncpreemptoff     int32
+       harddecommit        int32
+       adaptivestackstart  int32
+       tracefpunwindoff    int32
+       traceadvanceperiod  int32
 
        // debug.malloc is used as a combined debug check
        // in the malloc function and should be set
@@ -352,6 +353,7 @@ var dbgvars = []*dbgVar{
        {name: "gctrace", value: &debug.gctrace},
        {name: "invalidptr", value: &debug.invalidptr},
        {name: "madvdontneed", value: &debug.madvdontneed},
+       {name: "profileruntimelocks", atomic: &debug.profileruntimelocks},
        {name: "sbrk", value: &debug.sbrk},
        {name: "scavtrace", value: &debug.scavtrace},
        {name: "scheddetail", value: &debug.scheddetail},
index fb283a3ab427b9faf80f6407982824c5d065b710..6d40c7c4666d52bc145f9cf3c27f132cf94a6087 100644 (file)
@@ -593,6 +593,8 @@ type m struct {
        lockedInt     uint32      // tracking for internal lockOSThread
        nextwaitm     muintptr    // next m waiting for lock
 
+       mLockProfile mLockProfile // fields relating to runtime.lock contention
+
        // wait* are used to carry arguments from gopark into park_m, because
        // there's no stack to put them on. That is their sole purpose.
        waitunlockf          func(*g, unsafe.Pointer) bool
@@ -900,6 +902,12 @@ type schedt struct {
        // stwTotalTimeOther covers the others.
        stwTotalTimeGC    timeHistogram
        stwTotalTimeOther timeHistogram
+
+       // totalRuntimeLockWaitTime (plus the value of mLockProfile.waitTime on each M in
+       // allm) is the sum of time goroutines have spent in _Grunnable and with an
+       // M, but waiting for locks within the runtime. This field stores the value
+       // for Ms that have exited.
+       totalRuntimeLockWaitTime atomic.Int64
 }
 
 // Values for the flags field of a sigTabT.