runtime: profile contended lock calls
author Rhys Hiltner <rhys@justin.tv>
Tue, 21 Nov 2023 16:03:54 +0000 (16:03 +0000)
committer Gopher Robot <gobot@golang.org>
Tue, 21 Nov 2023 21:02:20 +0000 (21:02 +0000)
Add runtime-internal locks to the mutex contention profile.

Store up to one call stack responsible for lock contention on the M,
until it's safe to contribute its value to the mprof table. Try to use
that limited local storage space for a relatively large source of
contention, and attribute any contention in stacks we're not able to
store to a sentinel _LostContendedLock function.

Avoid ballooning lock contention while manipulating the mprof table by
attributing to that sentinel function any lock contention experienced
while reporting lock contention.

Guard collecting real call stacks with GODEBUG=profileruntimelocks=1,
since the available data has mixed semantics; we can easily capture an
M's own wait time, but we'd prefer for the profile entry of each
critical section to describe how long it made the other Ms wait. It's
too late in the Go 1.22 cycle to make the required changes to
futex-based locks. When not enabled, attribute the time to the sentinel
function instead.
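
Not part of this CL, but as a rough sketch of how the pieces fit together from
user code: enable mutex profiling, run a workload, and dump the "mutex" profile
while the process runs with GODEBUG=profileruntimelocks=1. The workload below
is only illustrative; whether it actually contends runtime-internal locks
depends on the machine and scheduler.

        package main

        import (
                "os"
                "runtime"
                "runtime/pprof"
                "sync"
        )

        func main() {
                // Sample every contention event. With profileruntimelocks=1,
                // contention on runtime-internal locks gets real call stacks in
                // the "mutex" profile; without it, that time is attributed to
                // the sentinel runtime._LostContendedLock.
                runtime.SetMutexProfileFraction(1)

                // An allocation-heavy workload spread across goroutines.
                var wg sync.WaitGroup
                for i := 0; i < 8; i++ {
                        wg.Add(1)
                        go func() {
                                defer wg.Done()
                                buf := make([][]byte, 0, 64)
                                for j := 0; j < 200_000; j++ {
                                        buf = append(buf, make([]byte, 4096))
                                        if len(buf) == cap(buf) {
                                                buf = buf[:0]
                                        }
                                }
                        }()
                }
                wg.Wait()

                pprof.Lookup("mutex").WriteTo(os.Stdout, 1)
        }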

Fixes #57071

This is a roll-forward of https://go.dev/cl/528657, which was reverted
in https://go.dev/cl/543660

Reason for revert: de-flakes tests (reduces dependence on fine-grained
timers, correctly identifies contention on big-endian futex locks,
attempts to measure contention in the semaphore implementation but only
uses that secondary measurement to finish the test early, skips tests on
single-processor systems)

Change-Id: I31389f24283d85e46ad9ba8d4f514cb9add8dfb0
Reviewed-on: https://go-review.googlesource.com/c/go/+/544195
Reviewed-by: Michael Pratt <mpratt@google.com>
LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com>
TryBot-Result: Gopher Robot <gobot@golang.org>
Reviewed-by: Than McIntosh <thanm@google.com>
Auto-Submit: Rhys Hiltner <rhys@justin.tv>
Run-TryBot: Rhys Hiltner <rhys@justin.tv>

13 files changed:
src/runtime/debug.go
src/runtime/export_test.go
src/runtime/extern.go
src/runtime/lock_futex.go
src/runtime/lock_js.go
src/runtime/lock_sema.go
src/runtime/lock_wasip1.go
src/runtime/metrics.go
src/runtime/metrics_test.go
src/runtime/mprof.go
src/runtime/proc.go
src/runtime/runtime1.go
src/runtime/runtime2.go

index 0e61692f3d1003163c30f7c7578c88817deabd19..3233ce8ee73718c1123594b28dfc71f255a9ee0b 100644 (file)
@@ -52,6 +52,17 @@ func NumCgoCall() int64 {
        return n
 }
 
+func totalMutexWaitTimeNanos() int64 {
+       total := sched.totalMutexWaitTime.Load()
+
+       total += sched.totalRuntimeLockWaitTime.Load()
+       for mp := (*m)(atomic.Loadp(unsafe.Pointer(&allm))); mp != nil; mp = mp.alllink {
+               total += mp.mLockProfile.waitTime.Load()
+       }
+
+       return total
+}
+
 // NumGoroutine returns the number of goroutines that currently exist.
 func NumGoroutine() int {
        return int(gcount())
index 2b34929ba051c53c40550dc573cbcb63dffaed15..d071035e6225ccf718571a6d3da7c5f2926341f7 100644 (file)
@@ -1340,6 +1340,18 @@ func PageCachePagesLeaked() (leaked uintptr) {
        return
 }
 
+type Mutex = mutex
+
+var Lock = lock
+var Unlock = unlock
+
+var MutexContended = mutexContended
+
+func SemRootLock(addr *uint32) *mutex {
+       root := semtable.rootFor(addr)
+       return &root.lock
+}
+
 var Semacquire = semacquire
 var Semrelease1 = semrelease1
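
These exports give the runtime's own tests direct access to runtime-internal
mutexes. A minimal sketch of how they are used (it only compiles inside
package runtime_test, and the function name here is illustrative):

        func sketchLockExports(t *testing.T) {
                var mu runtime.Mutex // alias for the runtime-internal mutex type
                runtime.Lock(&mu)
                // MutexContended reports whether another M is currently waiting
                // for mu; with no other locker around, it stays false.
                if runtime.MutexContended(&mu) {
                        t.Error("unexpected contention on an uncontended mutex")
                }
                runtime.Unlock(&mu)
        }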
 
index f8db296e6ba67f73290b0cd69be39f68247f4d86..d199720b9b05cf5c7305524308b2e82b919e862e 100644 (file)
@@ -145,6 +145,14 @@ It is a comma-separated list of name=val pairs setting these named variables:
        risk in that scenario. Currently not supported on Windows, plan9 or js/wasm. Setting this
        option for some applications can produce large traces, so use with care.
 
+       profileruntimelocks: setting profileruntimelocks=1 includes call stacks related to
+       contention on runtime-internal locks in the "mutex" profile, subject to the
+       MutexProfileFraction setting. The call stacks will correspond to the unlock call that
+       released the lock. But instead of the value corresponding to the amount of contention that
+       call stack caused, it corresponds to the amount of time the caller of unlock had to wait
+       in its original call to lock. A future release is expected to align those and remove this
+       setting.
+
        invalidptr: invalidptr=1 (the default) causes the garbage collector and stack
        copier to crash the program if an invalid pointer value (for example, 1)
        is found in a pointer-typed location. Setting invalidptr=0 disables this check.
index cc7d465ef1fc18c44ea56f40f7585bf39b673a2f..867e2b34d0662279311ac1407f27bfda3b6a7b37 100644 (file)
@@ -44,6 +44,10 @@ func key32(p *uintptr) *uint32 {
        return (*uint32)(unsafe.Pointer(p))
 }
 
+func mutexContended(l *mutex) bool {
+       return atomic.Load(key32(&l.key)) > mutex_locked
+}
+
 func lock(l *mutex) {
        lockWithRank(l, getLockRank(l))
 }
@@ -71,6 +75,8 @@ func lock2(l *mutex) {
        // its wakeup call.
        wait := v
 
+       timer := &lockTimer{lock: l}
+       timer.begin()
        // On uniprocessors, no point spinning.
        // On multiprocessors, spin for ACTIVE_SPIN attempts.
        spin := 0
@@ -82,6 +88,7 @@ func lock2(l *mutex) {
                for i := 0; i < spin; i++ {
                        for l.key == mutex_unlocked {
                                if atomic.Cas(key32(&l.key), mutex_unlocked, wait) {
+                                       timer.end()
                                        return
                                }
                        }
@@ -92,6 +99,7 @@ func lock2(l *mutex) {
                for i := 0; i < passive_spin; i++ {
                        for l.key == mutex_unlocked {
                                if atomic.Cas(key32(&l.key), mutex_unlocked, wait) {
+                                       timer.end()
                                        return
                                }
                        }
@@ -101,6 +109,7 @@ func lock2(l *mutex) {
                // Sleep.
                v = atomic.Xchg(key32(&l.key), mutex_sleeping)
                if v == mutex_unlocked {
+                       timer.end()
                        return
                }
                wait = mutex_sleeping
@@ -122,6 +131,7 @@ func unlock2(l *mutex) {
        }
 
        gp := getg()
+       gp.m.mLockProfile.recordUnlock(l)
        gp.m.locks--
        if gp.m.locks < 0 {
                throw("runtimeĀ·unlock: lock count")
index 91ad7be317398f3bb26a7e3e26513d4df5799caa..b6ee5ec7afe2697b75c8dae63b3035bb4b4c867a 100644 (file)
@@ -23,6 +23,10 @@ const (
        passive_spin    = 1
 )
 
+func mutexContended(l *mutex) bool {
+       return false
+}
+
 func lock(l *mutex) {
        lockWithRank(l, getLockRank(l))
 }
index 9afba08b0b243bcd597a8d215824b89054f949fb..073e7d410e4c74bceb6a0d1ca1e4cc4078df127e 100644 (file)
@@ -31,6 +31,10 @@ const (
        passive_spin    = 1
 )
 
+func mutexContended(l *mutex) bool {
+       return atomic.Loaduintptr(&l.key) > locked
+}
+
 func lock(l *mutex) {
        lockWithRank(l, getLockRank(l))
 }
@@ -48,6 +52,8 @@ func lock2(l *mutex) {
        }
        semacreate(gp.m)
 
+       timer := &lockTimer{lock: l}
+       timer.begin()
        // On uniprocessors, no point spinning.
        // On multiprocessors, spin for ACTIVE_SPIN attempts.
        spin := 0
@@ -60,6 +66,7 @@ Loop:
                if v&locked == 0 {
                        // Unlocked. Try to lock.
                        if atomic.Casuintptr(&l.key, v, v|locked) {
+                               timer.end()
                                return
                        }
                        i = 0
@@ -119,6 +126,7 @@ func unlock2(l *mutex) {
                        }
                }
        }
+       gp.m.mLockProfile.recordUnlock(l)
        gp.m.locks--
        if gp.m.locks < 0 {
                throw("runtimeĀ·unlock: lock count")
index c4fc59f6cc222ca54a8ee970d250d716dfdfe02d..acfc62acb48e90d58ec7accbeea0cd41b184bfa1 100644 (file)
@@ -19,6 +19,10 @@ const (
        active_spin_cnt = 30
 )
 
+func mutexContended(l *mutex) bool {
+       return false
+}
+
 func lock(l *mutex) {
        lockWithRank(l, getLockRank(l))
 }
index 4bd167135db8f87c77cb8c758d737ca426b92bc8..f97a3804ab32201c895ea5d45a180067b6a7b305 100644 (file)
@@ -470,7 +470,7 @@ func initMetrics() {
                "/sync/mutex/wait/total:seconds": {
                        compute: func(_ *statAggregate, out *metricValue) {
                                out.kind = metricKindFloat64
-                               out.scalar = float64bits(nsToSec(sched.totalMutexWaitTime.Load()))
+                               out.scalar = float64bits(nsToSec(totalMutexWaitTimeNanos()))
                        },
                },
        }
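
The /sync/mutex/wait/total:seconds metric that now folds in runtime-internal
lock wait time is read through the public runtime/metrics API; a minimal
sketch:

        package main

        import (
                "fmt"
                "runtime/metrics"
        )

        func main() {
                // This total now also covers runtime-internal locks (via
                // totalMutexWaitTimeNanos above), not just sync.Mutex and
                // sync.RWMutex wait time.
                s := []metrics.Sample{{Name: "/sync/mutex/wait/total:seconds"}}
                metrics.Read(s)
                fmt.Printf("total mutex wait: %fs\n", s[0].Value.Float64())
        }
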
index acaa5ca375f114f4f8eed4be5c0f3d2aea62299a..d28a589adc2e1ce627b3fd2ad0598a25c93e55e6 100644 (file)
@@ -6,15 +6,21 @@ package runtime_test
 
 import (
        "bytes"
+       "fmt"
+       "internal/goexperiment"
+       "internal/profile"
        "os"
        "reflect"
        "runtime"
        "runtime/debug"
        "runtime/metrics"
+       "runtime/pprof"
        "runtime/trace"
+       "slices"
        "sort"
        "strings"
        "sync"
+       "sync/atomic"
        "testing"
        "time"
        "unsafe"
@@ -939,3 +945,319 @@ func TestSchedPauseMetrics(t *testing.T) {
                })
        }
 }
+
+func TestRuntimeLockMetricsAndProfile(t *testing.T) {
+       old := runtime.SetMutexProfileFraction(0) // enabled during sub-tests
+       defer runtime.SetMutexProfileFraction(old)
+       if old != 0 {
+               t.Fatalf("need MutexProfileRate 0, got %d", old)
+       }
+
+       {
+               before := os.Getenv("GODEBUG")
+               for _, s := range strings.Split(before, ",") {
+                       if strings.HasPrefix(s, "profileruntimelocks=") {
+                               t.Logf("GODEBUG includes explicit setting %q", s)
+                       }
+               }
+               defer func() { os.Setenv("GODEBUG", before) }()
+               os.Setenv("GODEBUG", fmt.Sprintf("%s,profileruntimelocks=1", before))
+       }
+
+       t.Logf("NumCPU %d", runtime.NumCPU())
+       t.Logf("GOMAXPROCS %d", runtime.GOMAXPROCS(0))
+       if minCPU := 2; runtime.NumCPU() < minCPU {
+               t.Skipf("creating and observing contention on runtime-internal locks requires NumCPU >= %d", minCPU)
+       }
+
+       loadProfile := func(t *testing.T) *profile.Profile {
+               var w bytes.Buffer
+               pprof.Lookup("mutex").WriteTo(&w, 0)
+               p, err := profile.Parse(&w)
+               if err != nil {
+                       t.Fatalf("failed to parse profile: %v", err)
+               }
+               if err := p.CheckValid(); err != nil {
+                       t.Fatalf("invalid profile: %v", err)
+               }
+               return p
+       }
+
+       measureDelta := func(t *testing.T, fn func()) (metricGrowth, profileGrowth float64, p *profile.Profile) {
+               beforeProfile := loadProfile(t)
+               beforeMetrics := []metrics.Sample{{Name: "/sync/mutex/wait/total:seconds"}}
+               metrics.Read(beforeMetrics)
+
+               fn()
+
+               afterProfile := loadProfile(t)
+               afterMetrics := []metrics.Sample{{Name: "/sync/mutex/wait/total:seconds"}}
+               metrics.Read(afterMetrics)
+
+               sumSamples := func(p *profile.Profile, i int) int64 {
+                       var sum int64
+                       for _, s := range p.Sample {
+                               sum += s.Value[i]
+                       }
+                       return sum
+               }
+
+               metricGrowth = afterMetrics[0].Value.Float64() - beforeMetrics[0].Value.Float64()
+               profileGrowth = float64(sumSamples(afterProfile, 1)-sumSamples(beforeProfile, 1)) * time.Nanosecond.Seconds()
+
+               // The internal/profile package does not support compaction; this delta
+               // profile will include separate positive and negative entries.
+               p = afterProfile.Copy()
+               if len(beforeProfile.Sample) > 0 {
+                       err := p.Merge(beforeProfile, -1)
+                       if err != nil {
+                               t.Fatalf("Merge profiles: %v", err)
+                       }
+               }
+
+               return metricGrowth, profileGrowth, p
+       }
+
+       testcase := func(strictTiming bool, stk []string, workers int, fn func() bool) func(t *testing.T) (metricGrowth, profileGrowth float64, n, value int64) {
+               return func(t *testing.T) (metricGrowth, profileGrowth float64, n, value int64) {
+                       metricGrowth, profileGrowth, p := measureDelta(t, func() {
+                               var started, stopped sync.WaitGroup
+                               started.Add(workers)
+                               stopped.Add(workers)
+                               for i := 0; i < workers; i++ {
+                                       w := &contentionWorker{
+                                               before: func() {
+                                                       started.Done()
+                                                       started.Wait()
+                                               },
+                                               after: func() {
+                                                       stopped.Done()
+                                               },
+                                               fn: fn,
+                                       }
+                                       go w.run()
+                               }
+                               stopped.Wait()
+                       })
+
+                       if profileGrowth == 0 {
+                               t.Errorf("no increase in mutex profile")
+                       }
+                       if metricGrowth == 0 && strictTiming {
+                               // If the critical section is very short, systems with low timer
+                               // resolution may be unable to measure it via nanotime.
+                               t.Errorf("no increase in /sync/mutex/wait/total:seconds metric")
+                       }
+                       // This comparison is possible because the time measurements in support of
+                       // runtime/pprof and runtime/metrics for runtime-internal locks are so close
+                       // together. It doesn't work as well for user-space contention, where the
+                       // involved goroutines are not _Grunnable the whole time and so need to pass
+                       // through the scheduler.
+                       t.Logf("lock contention growth in runtime/pprof's view  (%fs)", profileGrowth)
+                       t.Logf("lock contention growth in runtime/metrics' view (%fs)", metricGrowth)
+
+                       if goexperiment.StaticLockRanking {
+                               if !slices.ContainsFunc(stk, func(s string) bool {
+                                       return s == "runtime.systemstack" || s == "runtime.mcall" || s == "runtime.mstart"
+                               }) {
+                                       // stk is a call stack that is still on the user stack when
+                                       // it calls runtime.unlock. Add the extra function that
+                                       // we'll see, when the static lock ranking implementation of
+                                       // runtime.unlockWithRank switches to the system stack.
+                                       stk = append([]string{"runtime.unlockWithRank"}, stk...)
+                               }
+                       }
+
+                       var stks [][]string
+                       for _, s := range p.Sample {
+                               var have []string
+                               for _, loc := range s.Location {
+                                       for _, line := range loc.Line {
+                                               have = append(have, line.Function.Name)
+                                       }
+                               }
+                               stks = append(stks, have)
+                               if slices.Equal(have, stk) {
+                                       n += s.Value[0]
+                                       value += s.Value[1]
+                               }
+                       }
+                       t.Logf("stack %v has samples totaling n=%d value=%d", stk, n, value)
+                       if n == 0 && value == 0 {
+                               t.Logf("profile:\n%s", p)
+                               for _, have := range stks {
+                                       t.Logf("have stack %v", have)
+                               }
+                               t.Errorf("want stack %v", stk)
+                       }
+
+                       return metricGrowth, profileGrowth, n, value
+               }
+       }
+
+       name := t.Name()
+
+       t.Run("runtime.lock", func(t *testing.T) {
+               mus := make([]runtime.Mutex, 100)
+               var needContention atomic.Int64
+               delay := 100 * time.Microsecond // large relative to system noise, for comparison between clocks
+               delayMicros := delay.Microseconds()
+
+               // The goroutine that acquires the lock will only proceed when it
+               // detects that its partner is contended for the lock. That will lead to
+               // live-lock if anything (such as a STW) prevents the partner goroutine
+               // from running. Allowing the contention workers to pause and restart
+               // (to allow a STW to proceed) makes it harder to confirm that we're
+               // counting the correct number of contention events, since some locks
+               // will end up contended twice. Instead, disable the GC.
+               defer debug.SetGCPercent(debug.SetGCPercent(-1))
+
+               const workers = 2
+               if runtime.GOMAXPROCS(0) < workers {
+                       t.Skipf("contention on runtime-internal locks requires GOMAXPROCS >= %d", workers)
+               }
+
+               fn := func() bool {
+                       n := int(needContention.Load())
+                       if n < 0 {
+                               return false
+                       }
+                       mu := &mus[n]
+
+                       runtime.Lock(mu)
+                       for int(needContention.Load()) == n {
+                               if runtime.MutexContended(mu) {
+                                       // make them wait a little while
+                                       for start := runtime.Nanotime(); (runtime.Nanotime()-start)/1000 < delayMicros; {
+                                               runtime.Usleep(uint32(delayMicros))
+                                       }
+                                       break
+                               }
+                       }
+                       runtime.Unlock(mu)
+                       needContention.Store(int64(n - 1))
+
+                       return true
+               }
+
+               stk := []string{
+                       "runtime.unlock",
+                       "runtime_test." + name + ".func5.1",
+                       "runtime_test.(*contentionWorker).run",
+               }
+
+               t.Run("sample-1", func(t *testing.T) {
+                       old := runtime.SetMutexProfileFraction(1)
+                       defer runtime.SetMutexProfileFraction(old)
+
+                       needContention.Store(int64(len(mus) - 1))
+                       metricGrowth, profileGrowth, n, _ := testcase(true, stk, workers, fn)(t)
+
+                       if have, want := metricGrowth, delay.Seconds()*float64(len(mus)); have < want {
+                               // The test imposes a delay with usleep, verified with calls to
+                               // nanotime. Compare against the runtime/metrics package's view
+                               // (based on nanotime) rather than runtime/pprof's view (based
+                               // on cputicks).
+                               t.Errorf("runtime/metrics reported less than the known minimum contention duration (%fs < %fs)", have, want)
+                       }
+                       if have, want := n, int64(len(mus)); have != want {
+                               t.Errorf("mutex profile reported contention count different from the known true count (%d != %d)", have, want)
+                       }
+
+                       const slop = 1.5 // account for nanotime vs cputicks
+                       if profileGrowth > slop*metricGrowth || metricGrowth > slop*profileGrowth {
+                               t.Errorf("views differ by more than %fx", slop)
+                       }
+               })
+
+               t.Run("sample-2", func(t *testing.T) {
+                       old := runtime.SetMutexProfileFraction(2)
+                       defer runtime.SetMutexProfileFraction(old)
+
+                       needContention.Store(int64(len(mus) - 1))
+                       metricGrowth, profileGrowth, n, _ := testcase(true, stk, workers, fn)(t)
+
+                       // With 100 trials and profile fraction of 2, we expect to capture
+                       // 50 samples. Allow the test to pass if we get at least 20 samples;
+                       // the CDF of the binomial distribution says there's less than a
+                       // 1e-9 chance of that, which is an acceptably low flakiness rate.
+                       const samplingSlop = 2.5
+
+                       if have, want := metricGrowth, delay.Seconds()*float64(len(mus)); samplingSlop*have < want {
+                               // The test imposes a delay with usleep, verified with calls to
+                               // nanotime. Compare against the runtime/metrics package's view
+                               // (based on nanotime) rather than runtime/pprof's view (based
+                               // on cputicks).
+                               t.Errorf("runtime/metrics reported less than the known minimum contention duration (%f * %fs < %fs)", samplingSlop, have, want)
+                       }
+                       if have, want := n, int64(len(mus)); float64(have) > float64(want)*samplingSlop || float64(want) > float64(have)*samplingSlop {
+                               t.Errorf("mutex profile reported contention count too different from the expected count (%d far from %d)", have, want)
+                       }
+
+                       const timerSlop = 1.5 * samplingSlop // account for nanotime vs cputicks, plus the two views' independent sampling
+                       if profileGrowth > timerSlop*metricGrowth || metricGrowth > timerSlop*profileGrowth {
+                               t.Errorf("views differ by more than %fx", timerSlop)
+                       }
+               })
+       })
+
+       t.Run("runtime.semrelease", func(t *testing.T) {
+               old := runtime.SetMutexProfileFraction(1)
+               defer runtime.SetMutexProfileFraction(old)
+
+               const workers = 3
+               if runtime.GOMAXPROCS(0) < workers {
+                       t.Skipf("creating and observing contention on runtime-internal semaphores requires GOMAXPROCS >= %d", workers)
+               }
+
+               var sem uint32 = 1
+               var tries atomic.Int32
+               tries.Store(10_000_000) // prefer controlled failure to timeout
+               var sawContention atomic.Int32
+               var need int32 = 1
+               fn := func() bool {
+                       if sawContention.Load() >= need {
+                               return false
+                       }
+                       if tries.Add(-1) < 0 {
+                               return false
+                       }
+
+                       runtime.Semacquire(&sem)
+                       runtime.Semrelease1(&sem, false, 0)
+                       if runtime.MutexContended(runtime.SemRootLock(&sem)) {
+                               sawContention.Add(1)
+                       }
+                       return true
+               }
+
+               stk := []string{
+                       "runtime.unlock",
+                       "runtime.semrelease1",
+                       "runtime_test.TestRuntimeLockMetricsAndProfile.func6.1",
+                       "runtime_test.(*contentionWorker).run",
+               }
+
+               // Verify that we get call stack we expect, with anything more than zero
+               // cycles / zero samples. The duration of each contention event is too
+               // small relative to the expected overhead for us to verify its value
+               // more directly. Leave that to the explicit lock/unlock test.
+
+               testcase(false, stk, workers, fn)(t)
+       })
+}
+
+// contentionWorker provides cleaner call stacks for lock contention profile tests
+type contentionWorker struct {
+       before func()
+       fn     func() bool
+       after  func()
+}
+
+func (w *contentionWorker) run() {
+       defer w.after()
+       w.before()
+
+       for w.fn() {
+       }
+}
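
A usage note rather than part of the change: the new test sets the GODEBUG
value itself and skips on single-CPU machines, so it can be exercised directly
with, for example:

        go test -run TestRuntimeLockMetricsAndProfile -v runtime
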
index af461eef1a3e86e324ac3226cbb8405ef1142dc2..b1930b30202c007a25cf0e2b71f1d3be80f15efd 100644 (file)
@@ -234,6 +234,10 @@ func newBucket(typ bucketType, nstk int) *bucket {
 // stk returns the slice in b holding the stack.
 func (b *bucket) stk() []uintptr {
        stk := (*[maxStack]uintptr)(add(unsafe.Pointer(b), unsafe.Sizeof(*b)))
+       if b.nstk > maxStack {
+               // prove that slicing works; otherwise a failure requires a P
+               throw("bad profile stack count")
+       }
        return stk[:b.nstk:b.nstk]
 }
 
@@ -509,7 +513,237 @@ func saveblockevent(cycles, rate int64, skip int, which bucketType) {
        } else {
                nstk = gcallers(gp.m.curg, skip, stk[:])
        }
-       b := stkbucket(which, 0, stk[:nstk], true)
+
+       saveBlockEventStack(cycles, rate, stk[:nstk], which)
+}
+
+// lockTimer assists with profiling contention on runtime-internal locks.
+//
+// There are several steps between the time that an M experiences contention and
+// when that contention may be added to the profile. This comes from our
+// constraints: We need to keep the critical section of each lock small,
+// especially when those locks are contended. The reporting code cannot acquire
+// new locks until the M has released all other locks, which means no memory
+// allocations and encourages use of (temporary) M-local storage.
+//
+// The M will have space for storing one call stack that caused contention, and
+// for the magnitude of that contention. It will also have space to store the
+// magnitude of additional contention the M caused, since it only has space to
+// remember one call stack and might encounter several contention events before
+// it releases all of its locks and is thus able to transfer the local buffer
+// into the profile.
+//
+// The M will collect the call stack when it unlocks the contended lock. That
+// minimizes the impact on the critical section of the contended lock, and
+// matches the mutex profile's behavior for contention in sync.Mutex: measured
+// at the Unlock method.
+//
+// The profile for contention on sync.Mutex blames the caller of Unlock for the
+// amount of contention experienced by the callers of Lock which had to wait.
+// When there are several critical sections, this allows identifying which of
+// them is responsible.
+//
+// Matching that behavior for runtime-internal locks will require identifying
+// which Ms are blocked on the mutex. The semaphore-based implementation is
+// ready to allow that, but the futex-based implementation will require a bit
+// more work. Until then, we report contention on runtime-internal locks with a
+// call stack taken from the unlock call (like the rest of the user-space
+// "mutex" profile), but assign it a duration value based on how long the
+// previous lock call took (like the user-space "block" profile).
+//
+// Thus, reporting the call stacks of runtime-internal lock contention is
+// guarded by GODEBUG for now. Set GODEBUG=profileruntimelocks=1 to enable.
+//
+// TODO(rhysh): plumb through the delay duration, remove GODEBUG, update comment
+//
+// The M will track this by storing a pointer to the lock; lock/unlock pairs for
+// runtime-internal locks are always on the same M.
+//
+// Together, that demands several steps for recording contention. First, when
+// finally acquiring a contended lock, the M decides whether it should plan to
+// profile that event by storing a pointer to the lock in its "to be profiled
+// upon unlock" field. If that field is already set, it uses the relative
+// magnitudes to weight a random choice between itself and the other lock, with
+// the loser's time being added to the "additional contention" field. Otherwise
+// if the M's call stack buffer is occupied, it does the comparison against that
+// sample's magnitude.
+//
+// Second, having unlocked a mutex the M checks to see if it should capture the
+// call stack into its local buffer. Finally, when the M unlocks its last mutex,
+// it transfers the local buffer into the profile. As part of that step, it also
+// transfers any "additional contention" time to the profile. Any lock
+// contention that it experiences while adding samples to the profile will be
+// recorded later as "additional contention" and not include a call stack, to
+// avoid an echo.
+type lockTimer struct {
+       lock      *mutex
+       timeRate  int64
+       timeStart int64
+       tickStart int64
+}
+
+func (lt *lockTimer) begin() {
+       rate := int64(atomic.Load64(&mutexprofilerate))
+
+       lt.timeRate = gTrackingPeriod
+       if rate != 0 && rate < lt.timeRate {
+               lt.timeRate = rate
+       }
+       if int64(fastrand())%lt.timeRate == 0 {
+               lt.timeStart = nanotime()
+       }
+
+       if rate > 0 && int64(fastrand())%rate == 0 {
+               lt.tickStart = cputicks()
+       }
+}
+
+func (lt *lockTimer) end() {
+       gp := getg()
+
+       if lt.timeStart != 0 {
+               nowTime := nanotime()
+               gp.m.mLockProfile.waitTime.Add((nowTime - lt.timeStart) * lt.timeRate)
+       }
+
+       if lt.tickStart != 0 {
+               nowTick := cputicks()
+               gp.m.mLockProfile.recordLock(nowTick-lt.tickStart, lt.lock)
+       }
+}
+
+type mLockProfile struct {
+       waitTime   atomic.Int64      // total nanoseconds spent waiting in runtime.lockWithRank
+       stack      [maxStack]uintptr // stack that experienced contention in runtime.lockWithRank
+       pending    uintptr           // *mutex that experienced contention (to be traceback-ed)
+       cycles     int64             // cycles attributable to "pending" (if set), otherwise to "stack"
+       cyclesLost int64             // contention for which we weren't able to record a call stack
+       disabled   bool              // attribute all time to "lost"
+}
+
+func (prof *mLockProfile) recordLock(cycles int64, l *mutex) {
+       if cycles <= 0 {
+               return
+       }
+
+       if prof.disabled {
+               // We're experiencing contention while attempting to report contention.
+               // Make a note of its magnitude, but don't allow it to be the sole cause
+               // of another contention report.
+               prof.cyclesLost += cycles
+               return
+       }
+
+       if uintptr(unsafe.Pointer(l)) == prof.pending {
+               // Optimization: we'd already planned to profile this same lock (though
+               // possibly from a different unlock site).
+               prof.cycles += cycles
+               return
+       }
+
+       if prev := prof.cycles; prev > 0 {
+               // We can only store one call stack for runtime-internal lock contention
+               // on this M, and we've already got one. Decide which should stay, and
+               // add the other to the report for runtime._LostContendedLock.
+               prevScore := fastrand64() % uint64(prev)
+               thisScore := fastrand64() % uint64(cycles)
+               if prevScore > thisScore {
+                       prof.cyclesLost += cycles
+                       return
+               } else {
+                       prof.cyclesLost += prev
+               }
+       }
+       // Saving the *mutex as a uintptr is safe because:
+       //  - lockrank_on.go does this too, which gives it regular exercise
+       //  - the lock would only move if it's stack allocated, which means it
+       //      cannot experience multi-M contention
+       prof.pending = uintptr(unsafe.Pointer(l))
+       prof.cycles = cycles
+}
+
+// From unlock2, we might not be holding a p in this code.
+//
+//go:nowritebarrierrec
+func (prof *mLockProfile) recordUnlock(l *mutex) {
+       if uintptr(unsafe.Pointer(l)) == prof.pending {
+               prof.captureStack()
+       }
+       if gp := getg(); gp.m.locks == 1 && gp.m.mLockProfile.cycles != 0 {
+               prof.store()
+       }
+}
+
+func (prof *mLockProfile) captureStack() {
+       skip := 3 // runtime.(*mLockProfile).recordUnlock runtime.unlock2 runtime.unlockWithRank
+       if staticLockRanking {
+               // When static lock ranking is enabled, we'll always be on the system
+               // stack at this point. There will be a runtime.unlockWithRank.func1
+               // frame, and if the call to runtime.unlock took place on a user stack
+               // then there'll also be a runtime.systemstack frame. To keep stack
+               // traces somewhat consistent whether or not static lock ranking is
+               // enabled, we'd like to skip those. But it's hard to tell how long
+               // we've been on the system stack so accept an extra frame in that case,
+               // with a leaf of "runtime.unlockWithRank runtime.unlock" instead of
+               // "runtime.unlock".
+               skip += 1 // runtime.unlockWithRank.func1
+       }
+       prof.pending = 0
+
+       if debug.profileruntimelocks.Load() == 0 {
+               prof.stack[0] = abi.FuncPCABIInternal(_LostContendedLock) + sys.PCQuantum
+               prof.stack[1] = 0
+               return
+       }
+
+       var nstk int
+       gp := getg()
+       sp := getcallersp()
+       pc := getcallerpc()
+       systemstack(func() {
+               var u unwinder
+               u.initAt(pc, sp, 0, gp, unwindSilentErrors|unwindJumpStack)
+               nstk = tracebackPCs(&u, skip, prof.stack[:])
+       })
+       if nstk < len(prof.stack) {
+               prof.stack[nstk] = 0
+       }
+}
+
+func (prof *mLockProfile) store() {
+       // Report any contention we experience within this function as "lost"; it's
+       // important that the act of reporting a contention event not lead to a
+       // reportable contention event. This also means we can use prof.stack
+       // without copying, since it won't change during this function.
+       mp := acquirem()
+       prof.disabled = true
+
+       nstk := maxStack
+       for i := 0; i < nstk; i++ {
+               if pc := prof.stack[i]; pc == 0 {
+                       nstk = i
+                       break
+               }
+       }
+
+       cycles, lost := prof.cycles, prof.cyclesLost
+       prof.cycles, prof.cyclesLost = 0, 0
+
+       rate := int64(atomic.Load64(&mutexprofilerate))
+       saveBlockEventStack(cycles, rate, prof.stack[:nstk], mutexProfile)
+       if lost > 0 {
+               lostStk := [...]uintptr{
+                       abi.FuncPCABIInternal(_LostContendedLock) + sys.PCQuantum,
+               }
+               saveBlockEventStack(lost, rate, lostStk[:], mutexProfile)
+       }
+
+       prof.disabled = false
+       releasem(mp)
+}
+
+func saveBlockEventStack(cycles, rate int64, stk []uintptr, which bucketType) {
+       b := stkbucket(which, 0, stk, true)
        bp := b.bp()
 
        lock(&profBlockLock)
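
One detail from recordLock above that may be easier to see in isolation: when
the M's single sample slot is already occupied, the choice of which sample to
keep is randomized, weighted by the two magnitudes. A standalone sketch of
that decision rule (the names here are illustrative, not part of the CL; the
runtime uses fastrand64 rather than math/rand):

        package main

        import (
                "fmt"
                "math/rand"
        )

        // chooseSample mirrors the slot-replacement rule in
        // (*mLockProfile).recordLock: keep either the previously stored
        // contention sample or the new one, biased toward the larger
        // magnitude. The loser's cycles are added to the "lost" total and
        // reported without a call stack. Both inputs are assumed positive.
        func chooseSample(prevCycles, newCycles int64, rnd func() uint64) (keepNew bool) {
                prevScore := rnd() % uint64(prevCycles)
                newScore := rnd() % uint64(newCycles)
                return newScore >= prevScore
        }

        func main() {
                // With a new sample 10x larger than the stored one, the new
                // sample should win the slot most of the time.
                wins := 0
                for i := 0; i < 10_000; i++ {
                        if chooseSample(100, 1_000, rand.Uint64) {
                                wins++
                        }
                }
                fmt.Printf("new sample kept %d of 10000 times\n", wins)
        }
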
index edf692fd1a13dc934a6da251fb6b10e45fb0cfee..fd05687301e7309c9b3f8eb968850e64ac9a0a9b 100644 (file)
@@ -922,7 +922,7 @@ func mcommoninit(mp *m, id int64) {
        // when it is just in a register or thread-local storage.
        mp.alllink = allm
 
-       // NumCgoCall() iterates over allm w/o schedlock,
+       // NumCgoCall() and others iterate over allm w/o schedlock,
        // so we need to publish it safely.
        atomicstorep(unsafe.Pointer(&allm), unsafe.Pointer(mp))
        unlock(&sched.lock)
@@ -1855,6 +1855,7 @@ found:
        unlock(&sched.lock)
 
        atomic.Xadd64(&ncgocall, int64(mp.ncgocall))
+       sched.totalRuntimeLockWaitTime.Add(mp.mLockProfile.waitTime.Load())
 
        // Release the P.
        handoffp(releasep())
@@ -5273,6 +5274,7 @@ func _ExternalCode()              { _ExternalCode() }
 func _LostExternalCode()          { _LostExternalCode() }
 func _GC()                        { _GC() }
 func _LostSIGPROFDuringAtomic64() { _LostSIGPROFDuringAtomic64() }
+func _LostContendedLock()         { _LostContendedLock() }
 func _VDSO()                      { _VDSO() }
 
 // Called if we receive a SIGPROF signal.
index 489dcdd79ca4bf7eb8636296645b5eecb5e9e5bc..877d94eef28d85a76f72bf1fb5813799a1669b24 100644 (file)
@@ -307,26 +307,27 @@ type dbgVar struct {
 // existing int var for that value, which may
 // already have an initial value.
 var debug struct {
-       cgocheck           int32
-       clobberfree        int32
-       dontfreezetheworld int32
-       efence             int32
-       gccheckmark        int32
-       gcpacertrace       int32
-       gcshrinkstackoff   int32
-       gcstoptheworld     int32
-       gctrace            int32
-       invalidptr         int32
-       madvdontneed       int32 // for Linux; issue 28466
-       scavtrace          int32
-       scheddetail        int32
-       schedtrace         int32
-       tracebackancestors int32
-       asyncpreemptoff    int32
-       harddecommit       int32
-       adaptivestackstart int32
-       tracefpunwindoff   int32
-       traceadvanceperiod int32
+       cgocheck            int32
+       clobberfree         int32
+       dontfreezetheworld  int32
+       efence              int32
+       gccheckmark         int32
+       gcpacertrace        int32
+       gcshrinkstackoff    int32
+       gcstoptheworld      int32
+       gctrace             int32
+       invalidptr          int32
+       madvdontneed        int32 // for Linux; issue 28466
+       profileruntimelocks atomic.Int32
+       scavtrace           int32
+       scheddetail         int32
+       schedtrace          int32
+       tracebackancestors  int32
+       asyncpreemptoff     int32
+       harddecommit        int32
+       adaptivestackstart  int32
+       tracefpunwindoff    int32
+       traceadvanceperiod  int32
 
        // debug.malloc is used as a combined debug check
        // in the malloc function and should be set
@@ -352,6 +353,7 @@ var dbgvars = []*dbgVar{
        {name: "gctrace", value: &debug.gctrace},
        {name: "invalidptr", value: &debug.invalidptr},
        {name: "madvdontneed", value: &debug.madvdontneed},
+       {name: "profileruntimelocks", atomic: &debug.profileruntimelocks},
        {name: "sbrk", value: &debug.sbrk},
        {name: "scavtrace", value: &debug.scavtrace},
        {name: "scheddetail", value: &debug.scheddetail},
index fb283a3ab427b9faf80f6407982824c5d065b710..6d40c7c4666d52bc145f9cf3c27f132cf94a6087 100644 (file)
@@ -593,6 +593,8 @@ type m struct {
        lockedInt     uint32      // tracking for internal lockOSThread
        nextwaitm     muintptr    // next m waiting for lock
 
+       mLockProfile mLockProfile // fields relating to runtime.lock contention
+
        // wait* are used to carry arguments from gopark into park_m, because
        // there's no stack to put them on. That is their sole purpose.
        waitunlockf          func(*g, unsafe.Pointer) bool
@@ -900,6 +902,12 @@ type schedt struct {
        // stwTotalTimeOther covers the others.
        stwTotalTimeGC    timeHistogram
        stwTotalTimeOther timeHistogram
+
+       // totalRuntimeLockWaitTime (plus the value of mLockProfile.waitTime on each M in
+       // allm) is the sum of time goroutines have spent in _Grunnable and with an
+       // M, but waiting for locks within the runtime. This field stores the value
+       // for Ms that have exited.
+       totalRuntimeLockWaitTime atomic.Int64
 }
 
 // Values for the flags field of a sigTabT.