runtime: select GC mark workers during start-the-world

author Michael Pratt <mpratt@google.com>

Mon, 17 Nov 2025 21:08:21 +0000 (16:08 -0500)

committer Gopher Robot <gobot@golang.org>

Thu, 20 Nov 2025 16:10:14 +0000 (08:10 -0800)
author Michael Pratt <mpratt@google.com>
Mon, 17 Nov 2025 21:08:21 +0000 (16:08 -0500)
committer Gopher Robot <gobot@golang.org>
Thu, 20 Nov 2025 16:10:14 +0000 (08:10 -0800)
diff --git a/src/runtime/mgcpacer.go b/src/runtime/mgcpacer.go

index f78fb6f636707bebbbe3743e43c8ff172162bd24..388cce83cd2388a7f2a54e515218e275cf635578 100644 (file)
--- a/src/runtime/mgcpacer.go
+++ b/src/runtime/mgcpacer.go
@@ -870,9 +870,12 @@ func (c *gcControllerState) findRunnableGCWorker(pp *p, now int64) (*g, int64) {
                 gcCPULimiter.update(now)
         }
  
-       ok, now := c.assignWaitingGCWorker(pp, now)
-       if !ok {
-               return nil, now
+       // If a worker wasn't already assigned by procresize, assign one now.
+       if pp.nextGCMarkWorker == nil {
+               ok, now := c.assignWaitingGCWorker(pp, now)
+               if !ok {
+                       return nil, now
+               }
         }
  
         node := pp.nextGCMarkWorker
diff --git a/src/runtime/proc.go b/src/runtime/proc.go

index a378b8c39d376b6689dc1d7057207db4466c1be2..62e79e74e28b0ddd8fe2c1dd67cf8fd89112d67e 100644 (file)
--- a/src/runtime/proc.go
+++ b/src/runtime/proc.go
@@ -4135,11 +4135,23 @@ top:
  
         gp, inheritTime, tryWakeP := findRunnable() // blocks until work is available
  
+       // May be on a new P.
+       pp = mp.p.ptr()
+
         // findRunnable may have collected an allp snapshot. The snapshot is
         // only required within findRunnable. Clear it to all GC to collect the
         // slice.
         mp.clearAllpSnapshot()
  
+       // If the P was assigned a next GC mark worker but findRunnable
+       // selected anything else, release the worker so another P may run it.
+       //
+       // N.B. If this occurs because a higher-priority goroutine was selected
+       // (trace reader), then tryWakeP is set, which will wake another P to
+       // run the worker. If this occurs because the GC is no longer active,
+       // there is no need to wakep.
+       gcController.releaseNextGCMarkWorker(pp)
+
         if debug.dontfreezetheworld > 0 && freezing.Load() {
                 // See comment in freezetheworld. We don't want to perturb
                 // scheduler state, so we didn't gcstopm in findRunnable, but
@@ -6036,8 +6048,10 @@ func procresize(nprocs int32) *p {
                 unlock(&allpLock)
         }
  
+       // Assign Ms to Ps with runnable goroutines.
         var runnablePs *p
         var runnablePsNeedM *p
+       var idlePs *p
         for i := nprocs - 1; i >= 0; i-- {
                 pp := allp[i]
                 if gp.m.p.ptr() == pp {
@@ -6045,7 +6059,8 @@ func procresize(nprocs int32) *p {
                 }
                 pp.status = _Pidle
                 if runqempty(pp) {
-                       pidleput(pp, now)
+                       pp.link.set(idlePs)
+                       idlePs = pp
                         continue
                 }
  
@@ -6071,6 +6086,8 @@ func procresize(nprocs int32) *p {
                 pp.link.set(runnablePs)
                 runnablePs = pp
         }
+       // Assign Ms to remaining runnable Ps without usable oldm. See comment
+       // above.
         for runnablePsNeedM != nil {
                 pp := runnablePsNeedM
                 runnablePsNeedM = pp.link.ptr()
@@ -6081,6 +6098,62 @@ func procresize(nprocs int32) *p {
                 runnablePs = pp
         }
  
+       // Now that we've assigned Ms to Ps with runnable goroutines, assign GC
+       // mark workers to remaining idle Ps, if needed.
+       //
+       // By assigning GC workers to Ps here, we slightly speed up starting
+       // the world, as we will start enough Ps to run all of the user
+       // goroutines and GC mark workers all at once, rather than using a
+       // sequence of wakep calls as each P's findRunnable realizes it needs
+       // to run a mark worker instead of a user goroutine.
+       //
+       // By assigning GC workers to Ps only _after_ previously-running Ps are
+       // assigned Ms, we ensure that goroutines previously running on a P
+       // continue to run on the same P, with GC mark workers preferring
+       // previously-idle Ps. This helps prevent goroutines from shuffling
+       // around too much across STW.
+       //
+       // N.B., if there aren't enough Ps left in idlePs for all of the GC
+       // mark workers, then findRunnable will still choose to run mark
+       // workers on Ps assigned above.
+       //
+       // N.B., we do this during any STW in the mark phase, not just the
+       // sweep termination STW that starts the mark phase. gcBgMarkWorker
+       // always preempts by removing itself from the P, so even unrelated
+       // STWs during the mark require that Ps reselect mark workers upon
+       // restart.
+       if gcBlackenEnabled != 0 {
+               for idlePs != nil {
+                       pp := idlePs
+
+                       ok, _ := gcController.assignWaitingGCWorker(pp, now)
+                       if !ok {
+                               // No more mark workers needed.
+                               break
+                       }
+
+                       // Got a worker, P is now runnable.
+                       //
+                       // mget may return nil if there aren't enough Ms, in
+                       // which case startTheWorldWithSema will start one.
+                       //
+                       // N.B. findRunnableGCWorker will make the worker G
+                       // itself runnable.
+                       idlePs = pp.link.ptr()
+                       mp := mget()
+                       pp.m.set(mp)
+                       pp.link.set(runnablePs)
+                       runnablePs = pp
+               }
+       }
+
+       // Finally, any remaining Ps are truly idle.
+       for idlePs != nil {
+               pp := idlePs
+               idlePs = pp.link.ptr()
+               pidleput(pp, now)
+       }
+
         stealOrder.reset(uint32(nprocs))
         var int32p *int32 = &gomaxprocs // make compiler check that gomaxprocs is an int32
         atomic.Store((*uint32)(unsafe.Pointer(int32p)), uint32(nprocs))
@@ -6183,6 +6256,10 @@ func releasepNoTrace() *p {
                 print("releasep: m=", gp.m, " m->p=", gp.m.p.ptr(), " p->m=", hex(pp.m), " p->status=", pp.status, "\n")
                 throw("releasep: invalid p state")
         }
+
+       // P must clear if nextGCMarkWorker if it stops.
+       gcController.releaseNextGCMarkWorker(pp)
+
         gp.m.p = 0
         pp.m = 0
         pp.status = _Pidle
diff --git a/src/runtime/proc_test.go b/src/runtime/proc_test.go

index b3084f4895fe4fb18c3d614675b3e2f9be9af652..35a1aeab1f660a6bf8b425c845c8fd149bc50086 100644 (file)
--- a/src/runtime/proc_test.go
+++ b/src/runtime/proc_test.go
@@ -1221,7 +1221,7 @@ func TestTraceSTW(t *testing.T) {
  
         var errors int
         for i := range runs {
-               err := runTestTracesSTW(t, i)
+               err := runTestTracesSTW(t, i, "TraceSTW", "stop-the-world (read mem stats)")
                 if err != nil {
                         t.Logf("Run %d failed: %v", i, err)
                         errors++
@@ -1235,7 +1235,43 @@ func TestTraceSTW(t *testing.T) {
         }
  }
  
-func runTestTracesSTW(t *testing.T, run int) (err error) {
+// TestTraceGCSTW verifies that goroutines continue running on the same M and P
+// after a GC STW.
+func TestTraceGCSTW(t *testing.T) {
+       // Very similar to TestTraceSTW, but using a STW that starts the GC.
+       // When the GC starts, the background GC mark workers start running,
+       // which provide an additional source of disturbance to the scheduler.
+       //
+       // procresize assigns GC workers to previously-idle Ps to avoid
+       // changing what the previously-running Ps are doing.
+
+       if testing.Short() {
+               t.Skip("skipping in -short mode")
+       }
+
+       if runtime.NumCPU() < 8 {
+               t.Skip("This test sets GOMAXPROCS=8 and wants to avoid thread descheduling as much as possible. Skip on machines with less than 8 CPUs")
+       }
+
+       const runs = 50
+
+       var errors int
+       for i := range runs {
+               err := runTestTracesSTW(t, i, "TraceGCSTW", "stop-the-world (GC sweep termination)")
+               if err != nil {
+                       t.Logf("Run %d failed: %v", i, err)
+                       errors++
+               }
+       }
+
+       pct := float64(errors)/float64(runs)
+       t.Logf("Errors: %d/%d = %f%%", errors, runs, 100*pct)
+       if pct > 0.25 {
+               t.Errorf("Error rate too high")
+       }
+}
+
+func runTestTracesSTW(t *testing.T, run int, name, stwType string) (err error) {
         t.Logf("Run %d", run)
  
         // By default, TSAN sleeps for 1s at exit to allow background
@@ -1243,7 +1279,7 @@ func runTestTracesSTW(t *testing.T, run int) (err error) {
         // much, since we are running 50 iterations, so disable the sleep.
         //
         // Outside of race mode, GORACE does nothing.
-       buf := []byte(runTestProg(t, "testprog", "TraceSTW", "GORACE=atexit_sleep_ms=0"))
+       buf := []byte(runTestProg(t, "testprog", name, "GORACE=atexit_sleep_ms=0"))
  
         // We locally "fail" the run (return an error) if the trace exhibits
         // unwanted scheduling. i.e., the target goroutines did not remain on
@@ -1253,7 +1289,7 @@ func runTestTracesSTW(t *testing.T, run int) (err error) {
         // occur, such as a trace parse error.
         defer func() {
                 if err != nil || t.Failed() {
-                       testtrace.Dump(t, fmt.Sprintf("TestTraceSTW-run%d", run), []byte(buf), false)
+                       testtrace.Dump(t, fmt.Sprintf("Test%s-run%d", name, run), []byte(buf), false)
                 }
         }()
  
@@ -1509,12 +1545,10 @@ findEnd:
                         break findEnd
                 case trace.EventRangeBegin:
                         r := ev.Range()
-                       if r.Name == "stop-the-world (read mem stats)" {
+                       if r.Name == stwType {
                                 // Note when we see the STW begin. This is not
                                 // load bearing; it's purpose is simply to fail
-                               // the test if we manage to remove the STW from
-                               // ReadMemStat, so we remember to change this
-                               // test to add some new source of STW.
+                               // the test if we accidentally remove the STW.
                                 stwSeen = true
                         }
                 }
diff --git a/src/runtime/testdata/testprog/stw_trace.go b/src/runtime/testdata/testprog/stw_trace.go

index 0fed55b875721ab944f000811dc94b893a61db88..0fa15da09e2f9c2b6112c0ac1dc98dd7d752d08a 100644 (file)
--- a/src/runtime/testdata/testprog/stw_trace.go
+++ b/src/runtime/testdata/testprog/stw_trace.go
@@ -7,15 +7,18 @@ package main
  import (
         "context"
         "log"
+       "math/rand/v2"
         "os"
         "runtime"
         "runtime/debug"
+       "runtime/metrics"
         "runtime/trace"
         "sync/atomic"
  )
  
  func init() {
         register("TraceSTW", TraceSTW)
+       register("TraceGCSTW", TraceGCSTW)
  }
  
  // The parent writes to ping and waits for the children to write back
@@ -53,7 +56,7 @@ func TraceSTW() {
         // https://go.dev/issue/65694). Alternatively, we could just ignore the
         // trace if the GC runs.
         runtime.GOMAXPROCS(4)
-       debug.SetGCPercent(0)
+       debug.SetGCPercent(-1)
  
         if err := trace.Start(os.Stdout); err != nil {
                 log.Fatalf("failed to start tracing: %v", err)
@@ -86,6 +89,112 @@ func TraceSTW() {
         stop.Store(true)
  }
  
+// Variant of TraceSTW for GC STWs. We want the GC mark workers to start on
+// previously-idle Ps, rather than bumping the current P.
+func TraceGCSTW() {
+       ctx := context.Background()
+
+       // The idea here is to have 2 target goroutines that are constantly
+       // running. When the world restarts after STW, we expect these
+       // goroutines to continue execution on the same M and P.
+       //
+       // Set GOMAXPROCS=8 to make room for the 2 target goroutines, 1 parent,
+       // 2 dedicated workers, and a bit of slack.
+       //
+       // Disable the GC initially so we can be sure it only triggers once we
+       // are ready.
+       runtime.GOMAXPROCS(8)
+       debug.SetGCPercent(-1)
+
+       if err := trace.Start(os.Stdout); err != nil {
+               log.Fatalf("failed to start tracing: %v", err)
+       }
+       defer trace.Stop()
+
+       for i := range 2 {
+               go traceSTWTarget(i)
+       }
+
+       // Wait for children to start running.
+       ping.Store(1)
+       for pong[0].Load() != 1 {}
+       for pong[1].Load() != 1 {}
+
+       trace.Log(ctx, "TraceSTW", "start")
+
+       // STW
+       triggerGC()
+
+       // Make sure to run long enough for the children to schedule again
+       // after STW. This is included for good measure, but the goroutines
+       // really ought to have already scheduled since the entire GC
+       // completed.
+       ping.Store(2)
+       for pong[0].Load() != 2 {}
+       for pong[1].Load() != 2 {}
+
+       trace.Log(ctx, "TraceSTW", "end")
+
+       stop.Store(true)
+}
+
+func triggerGC() {
+       // Allocate a bunch to trigger the GC rather than using runtime.GC. The
+       // latter blocks until the GC is complete, which is convenient, but
+       // messes with scheduling as it gives this P a chance to steal the
+       // other goroutines before their Ps get up and running again.
+
+       // Bring heap size up prior to enabling the GC to ensure that there is
+       // a decent amount of work in case the GC triggers immediately upon
+       // re-enabling.
+       for range 1000 {
+               alloc()
+       }
+
+       sample := make([]metrics.Sample, 1)
+       sample[0].Name = "/gc/cycles/total:gc-cycles"
+       metrics.Read(sample)
+
+       start := sample[0].Value.Uint64()
+
+       debug.SetGCPercent(100)
+
+       // Keep allocating until the GC is complete. We really only need to
+       // continue until the mark workers are scheduled, but there isn't a
+       // good way to measure that.
+       for {
+               metrics.Read(sample)
+               if sample[0].Value.Uint64() != start {
+                       return
+               }
+
+               alloc()
+       }
+}
+
+// Allocate a tree data structure to generate plenty of scan work for the GC.
+
+type node struct {
+       children []*node
+}
+
+var gcSink node
+
+func alloc() {
+       // 10% chance of adding a node a each layer.
+
+       curr := &gcSink
+       for {
+               if len(curr.children) == 0 || rand.Float32() < 0.1 {
+                       curr.children = append(curr.children, new(node))
+                       return
+               }
+
+               i := rand.IntN(len(curr.children))
+               curr = curr.children[i]
+       }
+}
+
  // Manually insert a morestack call. Leaf functions can omit morestack, but
  // non-leaf functions should include them.
author	Michael Pratt <mpratt@google.com>
	Mon, 17 Nov 2025 21:08:21 +0000 (16:08 -0500)
committer	Gopher Robot <gobot@golang.org>
	Thu, 20 Nov 2025 16:10:14 +0000 (08:10 -0800)
src/runtime/mgcpacer.go		patch \| blob \| history
src/runtime/proc.go		patch \| blob \| history
src/runtime/proc_test.go		patch \| blob \| history
src/runtime/testdata/testprog/stw_trace.go		patch \| blob \| history