From bca3e98b8aa4f754ea3604b62ef42f64dec88800 Mon Sep 17 00:00:00 2001 From: Jon San Miguel Date: Wed, 13 Aug 2025 13:06:31 -0700 Subject: [PATCH] cmd/go: test barrier actions Add a barrier action between test run action and it's dependencies. Run will depend on this barrier action, and the barrier action will depend on: 1. The run action's dependencies 2. The previous barrier action This will force internal/work to schedule test run actions in-order, preventing potential goroutine starvation from the channel locking mechanism created to force test run actions to start in-order. Fixes #73106 #61233 Change-Id: I72e9f752f7521093f3c875eef7f2f29b2393fce9 Reviewed-on: https://go-review.googlesource.com/c/go/+/668035 LUCI-TryBot-Result: Go LUCI Reviewed-by: Michael Matloob Reviewed-by: David Chase Reviewed-by: Michael Matloob --- src/cmd/go/internal/test/test.go | 37 +++++++++++++++++++++++++---- src/cmd/go/internal/work/action.go | 2 +- src/cmd/go/internal/work/buildid.go | 21 +++++++++++++++- src/cmd/go/internal/work/cover.go | 5 ++-- src/cmd/go/internal/work/exec.go | 16 ++++++++++++- 5 files changed, 71 insertions(+), 10 deletions(-) diff --git a/src/cmd/go/internal/test/test.go b/src/cmd/go/internal/test/test.go index 6c4a6a574d..17348a70f7 100644 --- a/src/cmd/go/internal/test/test.go +++ b/src/cmd/go/internal/test/test.go @@ -1044,11 +1044,36 @@ func runTest(ctx context.Context, cmd *base.Command, args []string) { prints = append(prints, printTest) } - // Order runs for coordinating start JSON prints. + // Order runs for coordinating start JSON prints via two mechanisms: + // 1. Channel locking forces runTest actions to start in-order. + // 2. Barrier tasks force runTest actions to be scheduled in-order. + // We need both for performant behavior, as channel locking without the barrier tasks starves the worker pool, + // and barrier tasks without channel locking doesn't guarantee start in-order behavior alone. + var prevBarrier *work.Action ch := make(chan struct{}) close(ch) for _, a := range runs { if r, ok := a.Actor.(*runTestActor); ok { + // Inject a barrier task between the run action and its dependencies. + // This barrier task wil also depend on the previous barrier task. + // This prevents the run task from being scheduled until all previous run dependencies have finished. + // The build graph will be augmented to look roughly like this: + // build("a") build("b") build("c") + // | | | + // barrier("a.test") -> barrier("b.test") -> barrier("c.test") + // | | | + // run("a.test") run("b.test") run("c.test") + + barrier := &work.Action{ + Mode: "test barrier", + Deps: slices.Clip(a.Deps), + } + if prevBarrier != nil { + barrier.Deps = append(barrier.Deps, prevBarrier) + } + a.Deps = []*work.Action{barrier} + prevBarrier = barrier + r.prev = ch ch = make(chan struct{}) r.next = ch @@ -1400,6 +1425,8 @@ func (lockedStdout) Write(b []byte) (int, error) { func (r *runTestActor) Act(b *work.Builder, ctx context.Context, a *work.Action) error { sh := b.Shell(a) + barrierAction := a.Deps[0] + buildAction := barrierAction.Deps[0] // Wait for previous test to get started and print its first json line. select { @@ -1530,7 +1557,7 @@ func (r *runTestActor) Act(b *work.Builder, ctx context.Context, a *work.Action) // we have different link inputs but the same final binary, // we still reuse the cached test result. // c.saveOutput will store the result under both IDs. - r.c.tryCacheWithID(b, a, a.Deps[0].BuildContentID()) + r.c.tryCacheWithID(b, a, buildAction.BuildContentID()) } if r.c.buf != nil { if stdout != &buf { @@ -1581,7 +1608,7 @@ func (r *runTestActor) Act(b *work.Builder, ctx context.Context, a *work.Action) // fresh copies of tools to test as part of the testing. addToEnv = "GOCOVERDIR=" + gcd } - args := str.StringList(execCmd, a.Deps[0].BuiltTarget(), testlogArg, panicArg, fuzzArg, coverdirArg, testArgs) + args := str.StringList(execCmd, buildAction.BuiltTarget(), testlogArg, panicArg, fuzzArg, coverdirArg, testArgs) if testCoverProfile != "" { // Write coverage to temporary profile, for merging later. @@ -1741,8 +1768,8 @@ func (r *runTestActor) Act(b *work.Builder, ctx context.Context, a *work.Action) // tryCache is called just before the link attempt, // to see if the test result is cached and therefore the link is unneeded. // It reports whether the result can be satisfied from cache. -func (c *runCache) tryCache(b *work.Builder, a *work.Action) bool { - return c.tryCacheWithID(b, a, a.Deps[0].BuildActionID()) +func (c *runCache) tryCache(b *work.Builder, a *work.Action, linkAction *work.Action) bool { + return c.tryCacheWithID(b, a, linkAction.BuildActionID()) } func (c *runCache) tryCacheWithID(b *work.Builder, a *work.Action, id string) bool { diff --git a/src/cmd/go/internal/work/action.go b/src/cmd/go/internal/work/action.go index ecc3337131..cb92a31645 100644 --- a/src/cmd/go/internal/work/action.go +++ b/src/cmd/go/internal/work/action.go @@ -92,7 +92,7 @@ type Action struct { buggyInstall bool // is this a buggy install (see -linkshared)? - TryCache func(*Builder, *Action) bool // callback for cache bypass + TryCache func(*Builder, *Action, *Action) bool // callback for cache bypass CacheExecutable bool // Whether to cache executables produced by link steps diff --git a/src/cmd/go/internal/work/buildid.go b/src/cmd/go/internal/work/buildid.go index c272131c77..88c24b11ac 100644 --- a/src/cmd/go/internal/work/buildid.go +++ b/src/cmd/go/internal/work/buildid.go @@ -401,6 +401,25 @@ var ( stdlibRecompiledIncOnce = sync.OnceFunc(stdlibRecompiled.Inc) ) +// testRunAction returns the run action for a test given the link action +// for the test binary, if the only (non-test-barrier) action that depend +// on the link action is the run action. +func testRunAction(a *Action) *Action { + if len(a.triggers) != 1 || a.triggers[0].Mode != "test barrier" { + return nil + } + var runAction *Action + for _, t := range a.triggers[0].triggers { + if t.Mode == "test run" { + if runAction != nil { + return nil + } + runAction = t + } + } + return runAction +} + // useCache tries to satisfy the action a, which has action ID actionHash, // by using a cached result from an earlier build. // If useCache decides that the cache can be used, it sets a.buildID @@ -526,7 +545,7 @@ func (b *Builder) useCache(a *Action, actionHash cache.ActionID, target string, // then to avoid the link step, report the link as up-to-date. // We avoid the nested build ID problem in the previous special case // by recording the test results in the cache under the action ID half. - if len(a.triggers) == 1 && a.triggers[0].TryCache != nil && a.triggers[0].TryCache(b, a.triggers[0]) { + if ra := testRunAction(a); ra != nil && ra.TryCache != nil && ra.TryCache(b, ra, a) { // Best effort attempt to display output from the compile and link steps. // If it doesn't work, it doesn't work: reusing the test result is more // important than reprinting diagnostic information. diff --git a/src/cmd/go/internal/work/cover.go b/src/cmd/go/internal/work/cover.go index 62fcdb3fda..fc96f67d6e 100644 --- a/src/cmd/go/internal/work/cover.go +++ b/src/cmd/go/internal/work/cover.go @@ -36,8 +36,9 @@ func (b *Builder) CovData(a *Action, cmdargs ...any) ([]byte, error) { // but will be empty; in this case the return is an empty string. func BuildActionCoverMetaFile(runAct *Action) (string, error) { p := runAct.Package - for i := range runAct.Deps { - pred := runAct.Deps[i] + barrierAct := runAct.Deps[0] + for i := range barrierAct.Deps { + pred := barrierAct.Deps[i] if pred.Mode != "build" || pred.Package == nil { continue } diff --git a/src/cmd/go/internal/work/exec.go b/src/cmd/go/internal/work/exec.go index 63fd13f754..9959dc804d 100644 --- a/src/cmd/go/internal/work/exec.go +++ b/src/cmd/go/internal/work/exec.go @@ -183,7 +183,21 @@ func (b *Builder) Do(ctx context.Context, root *Action) { for _, a0 := range a.triggers { if a.Failed != nil { - a0.Failed = a.Failed + if a0.Mode == "test barrier" { + // If this action was triggered by a test, there + // will be a test barrier action in between the test + // and the true trigger. But there will be other + // triggers that are other barriers that are waiting + // for this one. Propagate the failure to the true + // trigger, but not to the other barriers. + for _, bt := range a0.triggers { + if bt.Mode != "test barrier" { + bt.Failed = a.Failed + } + } + } else { + a0.Failed = a.Failed + } } if a0.pending--; a0.pending == 0 { b.ready.push(a0) -- 2.51.0