gcCPULimiter.update(now)
}
- ok, now := c.assignWaitingGCWorker(pp, now)
- if !ok {
- return nil, now
+ // If a worker wasn't already assigned by procresize, assign one now.
+ if pp.nextGCMarkWorker == nil {
+ ok, now := c.assignWaitingGCWorker(pp, now)
+ if !ok {
+ return nil, now
+ }
}
node := pp.nextGCMarkWorker
gp, inheritTime, tryWakeP := findRunnable() // blocks until work is available
+ // May be on a new P.
+ pp = mp.p.ptr()
+
// findRunnable may have collected an allp snapshot. The snapshot is
// only required within findRunnable. Clear it to allow the GC to
// collect the slice.
mp.clearAllpSnapshot()
+ // If the P was assigned a next GC mark worker but findRunnable
+ // selected something else, release the worker so another P may run it.
+ //
+ // N.B. If this occurs because a higher-priority goroutine was selected
+ // (trace reader), then tryWakeP is set, which will wake another P to
+ // run the worker. If this occurs because the GC is no longer active,
+ // there is no need to wakep.
+ gcController.releaseNextGCMarkWorker(pp)
+
if debug.dontfreezetheworld > 0 && freezing.Load() {
// See comment in freezetheworld. We don't want to perturb
// scheduler state, so we didn't gcstopm in findRunnable, but
unlock(&allpLock)
}
+ // Assign Ms to Ps with runnable goroutines.
var runnablePs *p
var runnablePsNeedM *p
+ var idlePs *p
for i := nprocs - 1; i >= 0; i-- {
pp := allp[i]
if gp.m.p.ptr() == pp {
continue
}
pp.status = _Pidle
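+ // Don't pidleput idle Ps yet; collect them locally so we can
+ // assign GC mark workers to them below before publishing them
+ // as idle.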
if runqempty(pp) {
- pidleput(pp, now)
+ pp.link.set(idlePs)
+ idlePs = pp
continue
}
pp.link.set(runnablePs)
runnablePs = pp
}
+ // Assign Ms to remaining runnable Ps without usable oldm. See comment
+ // above.
for runnablePsNeedM != nil {
pp := runnablePsNeedM
runnablePsNeedM = pp.link.ptr()
mp := mget()
pp.m.set(mp)
pp.link.set(runnablePs)
runnablePs = pp
}
+ // Now that we've assigned Ms to Ps with runnable goroutines, assign GC
+ // mark workers to remaining idle Ps, if needed.
+ //
+ // By assigning GC workers to Ps here, we slightly speed up starting
+ // the world, as we will start enough Ps to run all of the user
+ // goroutines and GC mark workers all at once, rather than using a
+ // sequence of wakep calls as each P's findRunnable realizes it needs
+ // to run a mark worker instead of a user goroutine.
+ //
+ // By assigning GC workers to Ps only _after_ previously-running Ps are
+ // assigned Ms, we ensure that goroutines previously running on a P
+ // continue to run on the same P, with GC mark workers preferring
+ // previously-idle Ps. This helps prevent goroutines from shuffling
+ // around too much across STW.
+ //
+ // N.B., if there aren't enough Ps left in idlePs for all of the GC
+ // mark workers, then findRunnable will still choose to run mark
+ // workers on Ps assigned above.
+ //
+ // N.B., we do this during any STW in the mark phase, not just the
+ // sweep termination STW that starts the mark phase. gcBgMarkWorker
+ // always preempts by removing itself from the P, so even unrelated
+ // STWs during the mark require that Ps reselect mark workers upon
+ // restart.
+ if gcBlackenEnabled != 0 {
+ for idlePs != nil {
+ pp := idlePs
+
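+ // Try to claim a waiting GC mark worker for this idle P.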
+ ok, _ := gcController.assignWaitingGCWorker(pp, now)
+ if !ok {
+ // No more mark workers needed.
+ break
+ }
+
+ // Got a worker, P is now runnable.
+ //
+ // mget may return nil if there aren't enough Ms, in
+ // which case startTheWorldWithSema will start one.
+ //
+ // N.B. findRunnableGCWorker will make the worker G
+ // itself runnable.
+ idlePs = pp.link.ptr()
+ mp := mget()
+ pp.m.set(mp)
+ pp.link.set(runnablePs)
+ runnablePs = pp
+ }
+ }
+
+ // Finally, any remaining Ps are truly idle.
+ for idlePs != nil {
+ pp := idlePs
+ idlePs = pp.link.ptr()
+ pidleput(pp, now)
+ }
+
stealOrder.reset(uint32(nprocs))
var int32p *int32 = &gomaxprocs // make compiler check that gomaxprocs is an int32
atomic.Store((*uint32)(unsafe.Pointer(int32p)), uint32(nprocs))
print("releasep: m=", gp.m, " m->p=", gp.m.p.ptr(), " p->m=", hex(pp.m), " p->status=", pp.status, "\n")
throw("releasep: invalid p state")
}
+
+ // P must clear nextGCMarkWorker if it stops.
+ gcController.releaseNextGCMarkWorker(pp)
+
gp.m.p = 0
pp.m = 0
pp.status = _Pidle
var errors int
for i := range runs {
- err := runTestTracesSTW(t, i)
+ err := runTestTracesSTW(t, i, "TraceSTW", "stop-the-world (read mem stats)")
if err != nil {
t.Logf("Run %d failed: %v", i, err)
errors++
}
}
-func runTestTracesSTW(t *testing.T, run int) (err error) {
+// TestTraceGCSTW verifies that goroutines continue running on the same M and P
+// after a GC STW.
+func TestTraceGCSTW(t *testing.T) {
+ // Very similar to TestTraceSTW, but using a STW that starts the GC.
+ // When the GC starts, the background GC mark workers start running,
+ // which provide an additional source of disturbance to the scheduler.
+ //
+ // procresize assigns GC workers to previously-idle Ps to avoid
+ // changing what the previously-running Ps are doing.
+
+ if testing.Short() {
+ t.Skip("skipping in -short mode")
+ }
+
+ if runtime.NumCPU() < 8 {
+ t.Skip("This test sets GOMAXPROCS=8 and wants to avoid thread descheduling as much as possible. Skip on machines with less than 8 CPUs")
+ }
+
+ const runs = 50
+
+ var errors int
+ for i := range runs {
+ err := runTestTracesSTW(t, i, "TraceGCSTW", "stop-the-world (GC sweep termination)")
+ if err != nil {
+ t.Logf("Run %d failed: %v", i, err)
+ errors++
+ }
+ }
+
+ pct := float64(errors) / float64(runs)
+ t.Logf("Errors: %d/%d = %f%%", errors, runs, 100*pct)
+ if pct > 0.25 {
+ t.Errorf("Error rate too high")
+ }
+}
+
+func runTestTracesSTW(t *testing.T, run int, name, stwType string) (err error) {
t.Logf("Run %d", run)
// By default, TSAN sleeps for 1s at exit to allow background
// threads to finish. That adds up to too
// much, since we are running 50 iterations, so disable the sleep.
//
// Outside of race mode, GORACE does nothing.
- buf := []byte(runTestProg(t, "testprog", "TraceSTW", "GORACE=atexit_sleep_ms=0"))
+ buf := []byte(runTestProg(t, "testprog", name, "GORACE=atexit_sleep_ms=0"))
// We locally "fail" the run (return an error) if the trace exhibits
// unwanted scheduling. i.e., the target goroutines did not remain on
// occur, such as a trace parse error.
defer func() {
if err != nil || t.Failed() {
- testtrace.Dump(t, fmt.Sprintf("TestTraceSTW-run%d", run), []byte(buf), false)
+ testtrace.Dump(t, fmt.Sprintf("Test%s-run%d", name, run), []byte(buf), false)
}
}()
break findEnd
case trace.EventRangeBegin:
r := ev.Range()
- if r.Name == "stop-the-world (read mem stats)" {
+ if r.Name == stwType {
// Note when we see the STW begin. This is not
// load bearing; its purpose is simply to fail
- // the test if we manage to remove the STW from
- // ReadMemStat, so we remember to change this
- // test to add some new source of STW.
+ // the test if we accidentally remove the STW.
stwSeen = true
}
}
import (
"context"
"log"
+ "math/rand/v2"
"os"
"runtime"
"runtime/debug"
+ "runtime/metrics"
"runtime/trace"
"sync/atomic"
)
func init() {
register("TraceSTW", TraceSTW)
+ register("TraceGCSTW", TraceGCSTW)
}
// The parent writes to ping and waits for the children to write back
// https://go.dev/issue/65694). Alternatively, we could just ignore the
// trace if the GC runs.
runtime.GOMAXPROCS(4)
- debug.SetGCPercent(0)
+ debug.SetGCPercent(-1)
if err := trace.Start(os.Stdout); err != nil {
log.Fatalf("failed to start tracing: %v", err)
stop.Store(true)
}
+// Variant of TraceSTW for GC STWs. We want the GC mark workers to start on
+// previously-idle Ps, rather than bumping the current P.
+func TraceGCSTW() {
+ ctx := context.Background()
+
+ // The idea here is to have 2 target goroutines that are constantly
+ // running. When the world restarts after STW, we expect these
+ // goroutines to continue execution on the same M and P.
+ //
+ // Set GOMAXPROCS=8 to make room for the 2 target goroutines, 1 parent,
+ // 2 dedicated workers, and a bit of slack.
+ //
+ // Disable the GC initially so we can be sure it only triggers once we
+ // are ready.
+ runtime.GOMAXPROCS(8)
+ debug.SetGCPercent(-1)
+
+ if err := trace.Start(os.Stdout); err != nil {
+ log.Fatalf("failed to start tracing: %v", err)
+ }
+ defer trace.Stop()
+
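+ // Start the target goroutines that must keep running across the STW.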
+ for i := range 2 {
+ go traceSTWTarget(i)
+ }
+
+ // Wait for children to start running.
+ ping.Store(1)
+ for pong[0].Load() != 1 {
+ }
+ for pong[1].Load() != 1 {
+ }
+
+ trace.Log(ctx, "TraceSTW", "start")
+
+ // Trigger the GC, and with it a sweep termination STW.
+ triggerGC()
+
+ // Make sure to run long enough for the children to schedule again
+ // after STW. This is included for good measure, but the goroutines
+ // really ought to have already scheduled since the entire GC
+ // completed.
+ ping.Store(2)
+ for pong[0].Load() != 2 {
+ }
+ for pong[1].Load() != 2 {
+ }
+
+ trace.Log(ctx, "TraceSTW", "end")
+
+ stop.Store(true)
+}
+
+func triggerGC() {
+ // Allocate a bunch to trigger the GC rather than using runtime.GC. The
+ // latter blocks until the GC is complete, which is convenient, but
+ // messes with scheduling as it gives this P a chance to steal the
+ // other goroutines before their Ps get up and running again.
+
+ // Bring heap size up prior to enabling the GC to ensure that there is
+ // a decent amount of work in case the GC triggers immediately upon
+ // re-enabling.
+ for range 1000 {
+ alloc()
+ }
+
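+ // Snapshot the completed GC cycle count so we can tell when the
+ // next cycle finishes.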
+ sample := make([]metrics.Sample, 1)
+ sample[0].Name = "/gc/cycles/total:gc-cycles"
+ metrics.Read(sample)
+
+ start := sample[0].Value.Uint64()
+
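+ // Re-enable the GC. Given the heap grown above, a cycle should
+ // trigger soon.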
+ debug.SetGCPercent(100)
+
+ // Keep allocating until the GC is complete. We really only need to
+ // continue until the mark workers are scheduled, but there isn't a
+ // good way to measure that.
+ for {
+ metrics.Read(sample)
+ if sample[0].Value.Uint64() != start {
+ return
+ }
+
+ alloc()
+ }
+}
+
+// Allocate a tree data structure to generate plenty of scan work for the GC.
+
+type node struct {
+ children []*node
+}
+
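+ // gcSink is the root of the tree. Keeping it in a global keeps the
+ // whole tree reachable, so the GC must scan it.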
+var gcSink node
+
+func alloc() {
+ // 10% chance of adding a node at each layer.
+
+ curr := &gcSink
+ for {
+ if len(curr.children) == 0 || rand.Float32() < 0.1 {
+ curr.children = append(curr.children, new(node))
+ return
+ }
+
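+ // Otherwise, descend into a random child and try one layer deeper.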
+ i := rand.IntN(len(curr.children))
+ curr = curr.children[i]
+ }
+}
+
// Manually insert a morestack call. Leaf functions can omit morestack, but
// non-leaf functions should include them.