return metricGrowth, profileGrowth, p
}
- testcase := func(strictTiming bool, acceptStacks [][]string, workers int, fn func() bool) func(t *testing.T) (metricGrowth, profileGrowth float64, n, value int64, explain func()) {
- return func(t *testing.T) (metricGrowth, profileGrowth float64, n, value int64, explain func()) {
- metricGrowth, profileGrowth, p := measureDelta(t, func() {
+ testcase := func(strictTiming bool, acceptStacks [][]string, workers int, fn func() bool) func(t *testing.T) (metricGrowth float64, profileGrowth []int64, n, value int64, explain func()) {
+ return func(t *testing.T) (metricGrowth float64, profileGrowth []int64, n, value int64, explain func()) {
+ metricGrowth, totalProfileGrowth, p := measureDelta(t, func() {
var started, stopped sync.WaitGroup
started.Add(workers)
stopped.Add(workers)
stopped.Wait()
})
- if profileGrowth == 0 {
+ if totalProfileGrowth == 0 {
t.Errorf("no increase in mutex profile")
}
if metricGrowth == 0 && strictTiming {
// together. It doesn't work as well for user-space contention, where the
// involved goroutines are not _Grunnable the whole time and so need to pass
// through the scheduler.
- t.Logf("lock contention growth in runtime/pprof's view (%fs)", profileGrowth)
+ t.Logf("lock contention growth in runtime/pprof's view (%fs)", totalProfileGrowth)
t.Logf("lock contention growth in runtime/metrics' view (%fs)", metricGrowth)
acceptStacks = append([][]string(nil), acceptStacks...)
}
var stks [][]string
- values := make([][2]int64, len(acceptStacks))
+ values := make([][2]int64, len(acceptStacks)+1)
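+ // The final element of values is a catch-all bucket: it accumulates
+ // samples whose stacks match none of acceptStacks.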
for _, s := range p.Sample {
var have []string
for _, loc := range s.Location {
for _, line := range loc.Line {
have = append(have, line.Function.Name)
}
}
stks = append(stks, have)
+ found := false
for i, stk := range acceptStacks {
if slices.Equal(have, stk) {
values[i][0] += s.Value[0]
values[i][1] += s.Value[1]
+ found = true
+ break
}
}
+ if !found {
+ values[len(values)-1][0] += s.Value[0]
+ values[len(values)-1][1] += s.Value[1]
+ }
}
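+ // Report the profile's growth per accepted stack, with the catch-all
+ // bucket's total in the final element.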
+ profileGrowth = make([]int64, len(acceptStacks)+1)
+ profileGrowth[len(profileGrowth)-1] = values[len(values)-1][1]
for i, stk := range acceptStacks {
n += values[i][0]
value += values[i][1]
+ profileGrowth[i] = values[i][1]
t.Logf("stack %v has samples totaling n=%d value=%d", stk, values[i][0], values[i][1])
}
if n == 0 && value == 0 {
name := t.Name()
t.Run("runtime.lock", func(t *testing.T) {
- mus := make([]runtime.Mutex, 200)
- var needContention atomic.Int64
- delay := 100 * time.Microsecond // large relative to system noise, for comparison between clocks
- delayMicros := delay.Microseconds()
-
// The goroutine that acquires the lock will only proceed when it
// detects that its partner is contended for the lock. That will lead to
// live-lock if anything (such as a STW) prevents the partner goroutine
// from running. Allowing the workers to stop and restart (so a STW could
// proceed) means the lock will end up contended twice. Instead, disable the GC.
defer debug.SetGCPercent(debug.SetGCPercent(-1))
- const workers = 2
- if runtime.GOMAXPROCS(0) < workers {
- t.Skipf("contention on runtime-internal locks requires GOMAXPROCS >= %d", workers)
+ mus := make([]runtime.Mutex, 200)
+ var needContention atomic.Int64
+
+ baseDelay := 100 * time.Microsecond // large relative to system noise, for comparison between clocks
+ fastDelayMicros := baseDelay.Microseconds()
+ slowDelayMicros := baseDelay.Microseconds() * 4
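+ // The slow critical section is 4x the fast one, so the profile should
+ // blame the two roles for clearly different amounts of delay.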
+
+ const (
+ fastRole = iota
+ slowRole
+ workerCount
+ )
+ if runtime.GOMAXPROCS(0) < workerCount {
+ t.Skipf("contention on runtime-internal locks requires GOMAXPROCS >= %d", workerCount)
}
+ minTicks := make([][]int64, workerCount) // lower bound, known-contended time, measured by cputicks
+ maxTicks := make([][]int64, workerCount) // upper bound, total lock() duration, measured by cputicks
+ for i := range minTicks {
+ minTicks[i] = make([]int64, len(mus))
+ maxTicks[i] = make([]int64, len(mus))
+ }
+ var id atomic.Int32
fn := func() bool {
n := int(needContention.Load())
if n < 0 {
return false
}
mu := &mus[n]
- runtime.Lock(mu)
- for int(needContention.Load()) == n {
- if runtime.MutexContended(mu) {
- // make them wait a little while
- for start := runtime.Nanotime(); (runtime.Nanotime()-start)/1000 < delayMicros; {
- runtime.Usleep(uint32(delayMicros))
+ // Each worker has a role: either the fast or the slow critical section.
+ // Rotate the role assignments as we step through the mutex slice so
+ // we don't end up with one M always claiming the same kind of work.
+ id := int(id.Add(1))
+ role := (id + n) % workerCount
+
+ marker, delayMicros := fastMarkerFrame, fastDelayMicros
+ if role == slowRole {
+ marker, delayMicros = slowMarkerFrame, slowDelayMicros
+ }
+
+ // Each lock is used by two different critical sections, one fast
+ // and one slow, identified in profiles by their different "marker"
+ // functions. We expect the profile to blame each for the amount of
+ // delay it inflicts on other users of the lock. We run one worker
+ // of each kind, so any contention in one would be due to the other.
+ //
+ // We measure how long our runtime.lock call takes, which sets an
+ // upper bound on how much blame to expect for the other worker type
+ // in the profile. And if we acquire the lock first, we wait for the
+ // other worker to announce its contention. We measure the
+ // known-contended time, to use as a lower bound on how much blame
+ // we expect of ourselves in the profile. Then we stall for a little
+ // while (different amounts for "fast" versus "slow") before
+ // unlocking the mutex.
+
+ marker(func() {
+ t0 := runtime.Cputicks()
+ runtime.Lock(mu)
+ maxTicks[role][n] = runtime.Cputicks() - t0
+ minTicks[role][n] = 0
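+ // The lower bound stays zero if we never observe the partner's
+ // contention before the countdown advances.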
+ for int(needContention.Load()) == n {
+ if runtime.MutexContended(mu) {
+ t1 := runtime.Cputicks()
+ // make them wait a little while
+ for start := runtime.Nanotime(); (runtime.Nanotime()-start)/1000 < delayMicros; {
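+ // Sleep in small increments so this loop's nanotime bound keeps the
+ // total delay close to delayMicros.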
+ runtime.Usleep(uint32(1 + delayMicros/8))
+ }
+ minTicks[role][n] = runtime.Cputicks() - t1
+ break
}
- break
}
- }
- runtime.Unlock(mu)
- needContention.Store(int64(n - 1))
+ runtime.Unlock(mu)
+ needContention.Store(int64(n - 1))
+ })
return true
}
- stks := [][]string{{
- "runtime.unlock",
- "runtime_test." + name + ".func5.1",
- "runtime_test.(*contentionWorker).run",
- }}
+ stks := make([][]string, 2)
+ for i := range stks {
+ marker := "runtime_test.fastMarkerFrame"
+ if i == slowRole {
+ marker = "runtime_test.slowMarkerFrame"
+ }
+
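+ // The closure passed to the marker appears as ".func5.1.1", nested
+ // inside the worker fn's ".func5.1", with the marker frame between them.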
+ stks[i] = []string{
+ "runtime.unlock",
+ "runtime_test." + name + ".func5.1.1",
+ marker,
+ "runtime_test." + name + ".func5.1",
+ "runtime_test.(*contentionWorker).run",
+ }
+ }
t.Run("sample-1", func(t *testing.T) {
old := runtime.SetMutexProfileFraction(1)
defer runtime.SetMutexProfileFraction(old)
needContention.Store(int64(len(mus) - 1))
- metricGrowth, profileGrowth, n, _, explain := testcase(true, stks, workers, fn)(t)
+ metricGrowth, profileGrowth, n, _, explain := testcase(true, stks, workerCount, fn)(t)
defer func() {
if t.Failed() {
explain()
}
}()
// gTrackingPeriod, so we don't have a hard lower bound here.
testenv.SkipFlaky(t, 64253)
- if have, want := metricGrowth, delay.Seconds()*float64(len(mus)); have < want {
+ if have, want := metricGrowth, baseDelay.Seconds()*float64(len(mus)); have < want {
// The test imposes a delay with usleep, verified with calls to
// nanotime. Compare against the runtime/metrics package's view
// (based on nanotime) rather than runtime/pprof's view (based
// on cputicks).
t.Errorf("runtime/metrics reported less than the known minimum contention duration (%fs < %fs)", have, want)
}
if have, want := n, int64(len(mus)); have != want {
t.Errorf("mutex profile reported contention count different from the known true count (%d != %d)", have, want)
}
+ var slowMinTicks, fastMinTicks int64
+ for role, ticks := range minTicks {
+ for _, delta := range ticks {
+ if role == slowRole {
+ slowMinTicks += delta
+ } else {
+ fastMinTicks += delta
+ }
+ }
+ }
+ var slowMaxTicks, fastMaxTicks int64
+ for role, ticks := range maxTicks {
+ for _, delta := range ticks {
+ if role == slowRole {
+ slowMaxTicks += delta
+ } else {
+ fastMaxTicks += delta
+ }
+ }
+ }
+
+ cpuGHz := float64(runtime.CyclesPerSecond()) / 1e9
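+ // Dividing cputicks deltas by cpuGHz converts them to nanoseconds,
+ // matching the units of the profile's contention durations.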
+ for _, set := range []struct {
+ name string
+ profTime int64
+ minTime int64
+ maxTime int64
+ }{
+ {
+ name: "slow",
+ profTime: profileGrowth[slowRole],
+ minTime: int64(float64(slowMinTicks) / cpuGHz),
+ maxTime: int64(float64(fastMaxTicks) / cpuGHz),
+ },
+ {
+ name: "fast",
+ profTime: profileGrowth[fastRole],
+ minTime: int64(float64(fastMinTicks) / cpuGHz),
+ maxTime: int64(float64(slowMaxTicks) / cpuGHz),
+ },
+ } {
+ t.Logf("profile's view of delays due to %q critical section: %dns", set.name, set.profTime)
+ t.Logf("test's view of known-contended time within %q critical section: %dns", set.name, set.minTime)
+ t.Logf("test's view of lock duration before critical sections other than %q: %dns", set.name, set.maxTime)
+
+ if set.profTime < set.minTime {
+ t.Errorf("profile undercounted %q critical section", set.name)
+ }
+ if set.profTime > set.maxTime {
+ t.Errorf("profile overcounted %q critical section", set.name)
+ }
+ }
+
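+ // Sum the per-stack growth to recover the total contention in seconds,
+ // for comparison against the runtime/metrics view.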
+ var totalProfileGrowth float64
+ for _, growth := range profileGrowth {
+ totalProfileGrowth += float64(growth) * time.Nanosecond.Seconds()
+ }
+
const slop = 1.5 // account for nanotime vs cputicks
t.Run("compare timers", func(t *testing.T) {
testenv.SkipFlaky(t, 64253)
- if profileGrowth > slop*metricGrowth || metricGrowth > slop*profileGrowth {
+ if totalProfileGrowth > slop*metricGrowth || metricGrowth > slop*totalProfileGrowth {
t.Errorf("views differ by more than %fx", slop)
}
})
defer runtime.SetMutexProfileFraction(old)
needContention.Store(int64(len(mus) - 1))
- metricGrowth, profileGrowth, n, _, explain := testcase(true, stks, workers, fn)(t)
+ metricGrowth, profileGrowth, n, _, explain := testcase(true, stks, workerCount, fn)(t)
defer func() {
if t.Failed() {
explain()
}
}()
// 1e-9 chance of that, which is an acceptably low flakiness rate.
const samplingSlop = 2.5
- if have, want := metricGrowth, delay.Seconds()*float64(len(mus)); samplingSlop*have < want {
+ if have, want := metricGrowth, baseDelay.Seconds()*float64(len(mus)); samplingSlop*have < want {
// The test imposes a delay with usleep, verified with calls to
// nanotime. Compare against the runtime/metrics package's view
// (based on nanotime) rather than runtime/pprof's view (based
// on cputicks).
t.Errorf("runtime/metrics reported much less than the expected contention duration (%fs << %fs)", have, want)
}
if have, want := n, int64(len(mus)); float64(have)*samplingSlop < float64(want) || float64(want)*samplingSlop < float64(have) {
t.Errorf("mutex profile reported contention count too different from the expected count (%d far from %d)", have, want)
}
+ var totalProfileGrowth float64
+ for _, growth := range profileGrowth {
+ totalProfileGrowth += float64(growth) * time.Nanosecond.Seconds()
+ }
+
const timerSlop = 1.5 * samplingSlop // account for nanotime vs cputicks, plus the two views' independent sampling
- if profileGrowth > timerSlop*metricGrowth || metricGrowth > timerSlop*profileGrowth {
+ if totalProfileGrowth > timerSlop*metricGrowth || metricGrowth > timerSlop*totalProfileGrowth {
t.Errorf("views differ by more than %fx", timerSlop)
}
})
})
}
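+
+// slowMarkerFrame and fastMarkerFrame exist only to contribute
+// distinguishable frames to the profiled call stacks, so the test can
+// attribute contention to the slow or fast critical section.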
+func slowMarkerFrame(fn func()) { fn() }
+func fastMarkerFrame(fn func()) { fn() }
+
// contentionWorker provides cleaner call stacks for lock contention profile tests
type contentionWorker struct {
before func()