From: Lasse Folger Date: Fri, 24 Nov 2023 15:30:07 +0000 (+0000) Subject: Revert "testing: simplify concurrency and cleanup logic" X-Git-Tag: go1.22rc1~152 X-Git-Url: http://www.git.cypherpunks.su/?a=commitdiff_plain;h=6317fb41e920cc593620d4b9a55376472fc02c07;p=gostls13.git Revert "testing: simplify concurrency and cleanup logic" reverts CL 506755 Reason for revert: This leads to deadlocks of tests in Google internal testing For #64402 Change-Id: I78329fc9dcc2377e7e880b264ac1d18d577ef99a Reviewed-on: https://go-review.googlesource.com/c/go/+/544895 Auto-Submit: Bryan Mills Auto-Submit: Lasse Folger Reviewed-by: Bryan Mills TryBot-Result: Gopher Robot Run-TryBot: Bryan Mills --- diff --git a/src/cmd/cgo/internal/test/callback_windows.go b/src/cmd/cgo/internal/test/callback_windows.go index b4e0867e77..95e97c9af9 100644 --- a/src/cmd/cgo/internal/test/callback_windows.go +++ b/src/cmd/cgo/internal/test/callback_windows.go @@ -29,7 +29,7 @@ USHORT backtrace(ULONG FramesToCapture, PVOID *BackTrace) { } ControlPc = context.Rip; - // Check if we left the user range. + // Check if we left the user range. if (ControlPc < 0x10000) { break; } @@ -89,7 +89,7 @@ func testCallbackCallersSEH(t *testing.T) { "test.testCallbackCallersSEH", "test.TestCallbackCallersSEH", "testing.tRunner", - "testing.(*T).Run.gowrap3", + "testing.(*T).Run.gowrap1", "runtime.goexit", } pc := make([]uintptr, 100) diff --git a/src/testing/benchmark.go b/src/testing/benchmark.go index da20c1c0c8..9491213ef1 100644 --- a/src/testing/benchmark.go +++ b/src/testing/benchmark.go @@ -176,10 +176,6 @@ func (b *B) ReportAllocs() { // runN runs a single benchmark for the specified number of iterations. func (b *B) runN(n int) { - if b.done { - panic("testing: internal error: runN when benchmark is already done") - } - benchmarkLock.Lock() defer benchmarkLock.Unlock() defer func() { @@ -200,46 +196,46 @@ func (b *B) runN(n int) { b.previousDuration = b.duration } -// run1 runs the first iteration of benchFunc. -// -// If no more iterations of this benchmark should be done, run1 sets b.done. -func (b *B) run1() { +// run1 runs the first iteration of benchFunc. It reports whether more +// iterations of this benchmarks should be run. +func (b *B) run1() bool { if ctx := b.context; ctx != nil { // Extend maxLen, if needed. if n := len(b.name) + ctx.extLen + 1; n > ctx.maxLen { ctx.maxLen = n + 8 // Add additional slack to avoid too many jumps in size. } } - - runOneDone := make(chan struct{}) go func() { // Signal that we're done whether we return normally // or by FailNow's runtime.Goexit. - defer close(runOneDone) + defer func() { + b.signal <- true + }() + b.runN(1) }() - <-runOneDone - + <-b.signal if b.failed { fmt.Fprintf(b.w, "%s--- FAIL: %s\n%s", b.chatty.prefix(), b.name, b.output) - b.done = true - close(b.doneOrParallel) - return + return false } // Only print the output if we know we are not going to proceed. // Otherwise it is printed in processBench. 
- if b.hasSub.Load() || b.skipped { + b.mu.RLock() + finished := b.finished + b.mu.RUnlock() + if b.hasSub.Load() || finished { tag := "BENCH" if b.skipped { tag = "SKIP" } - if b.chatty != nil && (len(b.output) > 0 || b.skipped) { + if b.chatty != nil && (len(b.output) > 0 || finished) { b.trimOutput() fmt.Fprintf(b.w, "%s--- %s: %s\n%s", b.chatty.prefix(), tag, b.name, b.output) } - b.done = true - close(b.doneOrParallel) + return false } + return true } var labelsOnce sync.Once @@ -266,10 +262,9 @@ func (b *B) run() { } } -// doBench calls b.launch in a separate goroutine and waits for it to complete. func (b *B) doBench() BenchmarkResult { go b.launch() - <-b.doneOrParallel + <-b.signal return b.result } @@ -281,10 +276,7 @@ func (b *B) launch() { // Signal that we're done whether we return normally // or by FailNow's runtime.Goexit. defer func() { - b.result = BenchmarkResult{b.N, b.duration, b.bytes, b.netAllocs, b.netBytes, b.extra} - b.setRanLeaf() - b.done = true - close(b.doneOrParallel) + b.signal <- true }() // Run the benchmark for at least the specified amount of time. @@ -324,6 +316,7 @@ func (b *B) launch() { b.runN(int(n)) } } + b.result = BenchmarkResult{b.N, b.duration, b.bytes, b.netAllocs, b.netBytes, b.extra} } // Elapsed returns the measured elapsed time of the benchmark. @@ -557,7 +550,6 @@ func runBenchmarks(importPath string, matchString func(pat, str string) (bool, e main.chatty = newChattyPrinter(main.w) } main.runN(1) - main.done = true return !main.failed } @@ -576,21 +568,18 @@ func (ctx *benchContext) processBench(b *B) { if i > 0 || j > 0 { b = &B{ common: common{ - doneOrParallel: make(chan struct{}), - name: b.name, - w: b.w, - chatty: b.chatty, - bench: true, + signal: make(chan bool), + name: b.name, + w: b.w, + chatty: b.chatty, + bench: true, }, benchFunc: b.benchFunc, benchTime: b.benchTime, } b.run1() } - var r BenchmarkResult - if !b.done { - r = b.doBench() - } + r := b.doBench() if b.failed { // The output could be very long here, but probably isn't. // We print it all, regardless, because we don't want to trim the reason @@ -633,13 +622,6 @@ var hideStdoutForTesting = false // A subbenchmark is like any other benchmark. A benchmark that calls Run at // least once will not be measured itself and will be called once with N=1. func (b *B) Run(name string, f func(b *B)) bool { - if b.previousN > 0 { - // If the benchmark calls Run we will only call it once with N=1. - // If it doesn't call Run the first time we try it, it must not - // call run on subsequent invocations either. - panic("testing: unexpected call to B.Run during iteration") - } - // Since b has subbenchmarks, we will no longer run it as a benchmark itself. // Release the lock and acquire it on exit to ensure locks stay paired. 
b.hasSub.Store(true) @@ -657,14 +639,14 @@ func (b *B) Run(name string, f func(b *B)) bool { n := runtime.Callers(2, pc[:]) sub := &B{ common: common{ - doneOrParallel: make(chan struct{}), - name: benchName, - parent: &b.common, - level: b.level + 1, - creator: pc[:n], - w: b.w, - chatty: b.chatty, - bench: true, + signal: make(chan bool), + name: benchName, + parent: &b.common, + level: b.level + 1, + creator: pc[:n], + w: b.w, + chatty: b.chatty, + bench: true, }, importPath: b.importPath, benchFunc: f, @@ -697,8 +679,7 @@ func (b *B) Run(name string, f func(b *B)) bool { } } - sub.run1() - if !sub.done { + if sub.run1() { sub.run() } b.add(sub.result) @@ -842,14 +823,13 @@ func (b *B) SetParallelism(p int) { func Benchmark(f func(b *B)) BenchmarkResult { b := &B{ common: common{ - doneOrParallel: make(chan struct{}), - w: discard{}, + signal: make(chan bool), + w: discard{}, }, benchFunc: f, benchTime: benchTime, } - b.run1() - if !b.done { + if b.run1() { b.run() } return b.result diff --git a/src/testing/fuzz.go b/src/testing/fuzz.go index 5a3117b1e0..d50ea793e0 100644 --- a/src/testing/fuzz.go +++ b/src/testing/fuzz.go @@ -70,6 +70,10 @@ type F struct { fuzzContext *fuzzContext testContext *testContext + // inFuzzFn is true when the fuzz function is running. Most F methods cannot + // be called when inFuzzFn is true. + inFuzzFn bool + // corpus is a set of seed corpus entries, added with F.Add and loaded // from testdata. corpus []corpusEntry @@ -296,13 +300,13 @@ func (f *F) Fuzz(ff any) { n := runtime.Callers(2, pc[:]) t := &T{ common: common{ - runParallel: make(chan struct{}), - doneOrParallel: make(chan struct{}), - name: testName, - parent: &f.common, - level: f.level + 1, - creator: pc[:n], - chatty: f.chatty, + barrier: make(chan bool), + signal: make(chan bool), + name: testName, + parent: &f.common, + level: f.level + 1, + creator: pc[:n], + chatty: f.chatty, }, context: f.testContext, } @@ -314,7 +318,7 @@ func (f *F) Fuzz(ff any) { if t.chatty != nil { t.chatty.Updatef(t.name, "=== RUN %s\n", t.name) } - f.inFuzzFn = true + f.common.inFuzzFn, f.inFuzzFn = true, true go tRunner(t, func(t *T) { args := []reflect.Value{reflect.ValueOf(t)} for _, v := range e.Values { @@ -330,11 +334,11 @@ func (f *F) Fuzz(ff any) { } fn.Call(args) }) - <-t.doneOrParallel + <-t.signal if t.chatty != nil && t.chatty.json { t.chatty.Updatef(t.parent.name, "=== NAME %s\n", t.parent.name) } - f.inFuzzFn = false + f.common.inFuzzFn, f.inFuzzFn = false, false return !t.Failed() } @@ -506,12 +510,12 @@ func runFuzzTests(deps testDeps, fuzzTests []InternalFuzzTarget, deadline time.T } f := &F{ common: common{ - doneOrParallel: make(chan struct{}), - runParallel: make(chan struct{}), - name: testName, - parent: &root, - level: root.level + 1, - chatty: root.chatty, + signal: make(chan bool), + barrier: make(chan bool), + name: testName, + parent: &root, + level: root.level + 1, + chatty: root.chatty, }, testContext: tctx, fuzzContext: fctx, @@ -521,12 +525,12 @@ func runFuzzTests(deps testDeps, fuzzTests []InternalFuzzTarget, deadline time.T f.chatty.Updatef(f.name, "=== RUN %s\n", f.name) } go fRunner(f, ft.Fn) - <-f.doneOrParallel + <-f.signal if f.chatty != nil && f.chatty.json { f.chatty.Updatef(f.parent.name, "=== NAME %s\n", f.parent.name) } ok = ok && !f.Failed() - ran = ran || f.ranAnyLeaf + ran = ran || f.ran } if !ran { // There were no tests to run on this iteration. 
@@ -588,12 +592,12 @@ func runFuzzing(deps testDeps, fuzzTests []InternalFuzzTarget) (ok bool) { f := &F{ common: common{ - doneOrParallel: make(chan struct{}), - runParallel: nil, // T.Parallel has no effect when fuzzing. - name: testName, - parent: &root, - level: root.level + 1, - chatty: root.chatty, + signal: make(chan bool), + barrier: nil, // T.Parallel has no effect when fuzzing. + name: testName, + parent: &root, + level: root.level + 1, + chatty: root.chatty, }, fuzzContext: fctx, testContext: tctx, @@ -603,7 +607,7 @@ func runFuzzing(deps testDeps, fuzzTests []InternalFuzzTarget) (ok bool) { f.chatty.Updatef(f.name, "=== RUN %s\n", f.name) } go fRunner(f, fuzzTest.Fn) - <-f.doneOrParallel + <-f.signal if f.chatty != nil { f.chatty.Updatef(f.parent.name, "=== NAME %s\n", f.parent.name) } @@ -621,12 +625,6 @@ func runFuzzing(deps testDeps, fuzzTests []InternalFuzzTarget) (ok bool) { // simplifications are made. We also require that F.Fuzz, F.Skip, or F.Fail is // called. func fRunner(f *F, fn func(*F)) { - // TODO(bcmills): This function has a lot of code and structure in common with - // tRunner. At some point it would probably be good to factor out the common - // parts to make the differences easier to spot. - - returned := false - // When this goroutine is done, either because runtime.Goexit was called, a // panic started, or fn returned normally, record the duration and send // t.signal, indicating the fuzz test is done. @@ -638,93 +636,96 @@ func fRunner(f *F, fn func(*F)) { // Unfortunately, recovering here adds stack frames, but the location of // the original panic should still be // clear. - - panicVal := recover() - if panicVal == nil && !f.skipped && !f.failed { - if !returned { - panicVal = errNilPanicOrGoexit - } else if !f.fuzzCalled { + f.checkRaces() + if f.Failed() { + numFailed.Add(1) + } + err := recover() + if err == nil { + f.mu.RLock() + fuzzNotCalled := !f.fuzzCalled && !f.skipped && !f.failed + if !f.finished && !f.skipped && !f.failed { + err = errNilPanicOrGoexit + } + f.mu.RUnlock() + if fuzzNotCalled && err == nil { f.Error("returned without calling F.Fuzz, F.Fail, or F.Skip") } } - if panicVal != nil { - // Mark the test as failed so that Cleanup functions can see its correct status. - f.Fail() - } else if f.runParallel != nil { - // Unblock inputs that called T.Parallel while running the seed corpus. - // This only affects fuzz tests run as normal tests. - // While fuzzing, T.Parallel has no effect, so f.parallelSubtests is empty - // and this is a no-op. - - // Check for races before starting parallel subtests, so that if a - // parallel subtest *also* triggers a data race we will report the two - // races to the two tests and not attribute all of them to the subtest. - f.checkRaces() - - close(f.runParallel) - f.parallelSubtests.Wait() - } - // Use a deferred call to ensure that we report that the test is - // complete even if a cleanup function calls t.FailNow. See issue 41355. + // complete even if a cleanup function calls F.FailNow. See issue 41355. + didPanic := false defer func() { - cleanupPanic := recover() - if panicVal == nil { - panicVal = cleanupPanic + if !didPanic { + // Only report that the test is complete if it doesn't panic, + // as otherwise the test binary can exit before the panic is + // reported to the user. See issue 41479. + f.signal <- true } + }() - // Only report that the test is complete if it doesn't panic, - // as otherwise the test binary can exit before the panic is - // reported to the user. See issue 41479. 
- if panicVal != nil { - // Flush the output log up to the root before dying. - for root := &f.common; root.parent != nil; root = root.parent { - root.mu.Lock() - root.duration += time.Since(root.start) - d := root.duration - root.mu.Unlock() - root.flushToParent(root.name, "--- FAIL: %s (%s)\n", root.name, fmtDuration(d)) - - // Since the parent will never finish running, do its cleanup now. - // Run the cleanup in a fresh goroutine in case it calls runtime.Goexit, - // which we cannot recover. - cleanupDone := make(chan struct{}) - go func() { - defer close(cleanupDone) - if r := root.parent.runCleanup(recoverAndReturnPanic); r != nil { - fmt.Fprintf(root.parent.w, "cleanup panicked with %v", r) - } - }() - <-cleanupDone - } - panic(panicVal) + // If we recovered a panic or inappropriate runtime.Goexit, fail the test, + // flush the output log up to the root, then panic. + doPanic := func(err any) { + f.Fail() + if r := f.runCleanup(recoverAndReturnPanic); r != nil { + f.Logf("cleanup panicked with %v", r) + } + for root := &f.common; root.parent != nil; root = root.parent { + root.mu.Lock() + root.duration += time.Since(root.start) + d := root.duration + root.mu.Unlock() + root.flushToParent(root.name, "--- FAIL: %s (%s)\n", root.name, fmtDuration(d)) } + didPanic = true + panic(err) + } + if err != nil { + doPanic(err) + } - f.checkRaces() - f.duration += time.Since(f.start) - f.report() + // No panic or inappropriate Goexit. + f.duration += time.Since(f.start) - // Do not lock f.done to allow race detector to detect race in case - // the user does not appropriately synchronize a goroutine. - f.done = true - if f.parent != nil && !f.hasSub.Load() { - f.setRanLeaf() + if len(f.sub) > 0 { + // Unblock inputs that called T.Parallel while running the seed corpus. + // This only affects fuzz tests run as normal tests. + // While fuzzing, T.Parallel has no effect, so f.sub is empty, and this + // branch is not taken. f.barrier is nil in that case. + f.testContext.release() + close(f.barrier) + // Wait for the subtests to complete. + for _, sub := range f.sub { + <-sub.signal } + cleanupStart := time.Now() + err := f.runCleanup(recoverAndReturnPanic) + f.duration += time.Since(cleanupStart) + if err != nil { + doPanic(err) + } + } - running.Delete(f.name) - close(f.doneOrParallel) - }() - - f.runCleanup(normalPanic) + // Report after all subtests have finished. + f.report() + f.done = true + f.setRan() + }() + defer func() { + if len(f.sub) == 0 { + f.runCleanup(normalPanic) + } }() - // Run the actual fuzz function. f.start = time.Now() f.resetRaces() fn(f) // Code beyond this point will not be executed when FailNow or SkipNow // is invoked. 
- returned = true + f.mu.Lock() + f.finished = true + f.mu.Unlock() } diff --git a/src/testing/panic_test.go b/src/testing/panic_test.go index 82e7087025..6307b84a7a 100644 --- a/src/testing/panic_test.go +++ b/src/testing/panic_test.go @@ -211,68 +211,57 @@ func TestPanicHelper(t *testing.T) { } func TestMorePanic(t *testing.T) { - subprocess := false - if os.Getenv("GO_WANT_HELPER_PROCESS") == "1" { - subprocess = true - } else { - testenv.MustHaveExec(t) - t.Parallel() - } + testenv.MustHaveExec(t) testCases := []struct { - issue int desc string - f func(*testing.T) + flags []string want string }{ { - issue: 48502, - desc: "runtime.Goexit in t.Cleanup after panic", - f: func(t *testing.T) { - t.Cleanup(func() { - t.Log("Goexiting in cleanup") - runtime.Goexit() - }) - t.Parallel() - panic("die") - }, - want: `panic: die [recovered] - panic: die`, + desc: "Issue 48502: call runtime.Goexit in t.Cleanup after panic", + flags: []string{"-test.run=^TestGoexitInCleanupAfterPanicHelper$"}, + want: `panic: die + panic: test executed panic(nil) or runtime.Goexit`, }, { - issue: 48515, - desc: "t.Run in t.Cleanup should trigger panic", - f: func(t *testing.T) { - t.Cleanup(func() { - t.Run("in-cleanup", func(t *testing.T) { - t.Log("must not be executed") - }) - }) - }, - want: `panic: testing: t.Run called during t.Cleanup`, + desc: "Issue 48515: call t.Run in t.Cleanup should trigger panic", + flags: []string{"-test.run=^TestCallRunInCleanupHelper$"}, + want: `panic: testing: t.Run called during t.Cleanup`, }, } for _, tc := range testCases { - tc := tc - t.Run(fmt.Sprintf("issue%v", tc.issue), func(t *testing.T) { - if subprocess { - tc.f(t) - return - } + cmd := exec.Command(os.Args[0], tc.flags...) + cmd.Env = append(os.Environ(), "GO_WANT_HELPER_PROCESS=1") + b, _ := cmd.CombinedOutput() + got := string(b) + want := tc.want + re := makeRegexp(want) + if ok, err := regexp.MatchString(re, got); !ok || err != nil { + t.Errorf("output:\ngot:\n%s\nwant:\n%s", got, want) + } + } +} - t.Parallel() - cmd := testenv.Command(t, os.Args[0], "-test.run="+t.Name()) - cmd.Env = append(cmd.Environ(), "GO_WANT_HELPER_PROCESS=1") - b, _ := cmd.CombinedOutput() - got := string(b) - t.Logf("%v:\n%s", tc.desc, got) +func TestCallRunInCleanupHelper(t *testing.T) { + if os.Getenv("GO_WANT_HELPER_PROCESS") != "1" { + return + } - want := tc.want - re := makeRegexp(want) - if ok, err := regexp.MatchString(re, got); !ok || err != nil { - t.Errorf("wanted:\n%s", want) - } + t.Cleanup(func() { + t.Run("in-cleanup", func(t *testing.T) { + t.Log("must not be executed") }) + }) +} + +func TestGoexitInCleanupAfterPanicHelper(t *testing.T) { + if os.Getenv("GO_WANT_HELPER_PROCESS") != "1" { + return } + + t.Cleanup(func() { runtime.Goexit() }) + t.Parallel() + panic("die") } diff --git a/src/testing/sub_test.go b/src/testing/sub_test.go index 7cbbd2e1cf..55b14c3795 100644 --- a/src/testing/sub_test.go +++ b/src/testing/sub_test.go @@ -21,6 +21,102 @@ func init() { benchTime.d = 100 * time.Millisecond } +func TestTestContext(t *T) { + const ( + add1 = 0 + done = 1 + ) + // After each of the calls are applied to the context, the + type call struct { + typ int // run or done + // result from applying the call + running int + waiting int + started bool + } + testCases := []struct { + max int + run []call + }{{ + max: 1, + run: []call{ + {typ: add1, running: 1, waiting: 0, started: true}, + {typ: done, running: 0, waiting: 0, started: false}, + }, + }, { + max: 1, + run: []call{ + {typ: add1, running: 1, waiting: 0, started: 
true}, + {typ: add1, running: 1, waiting: 1, started: false}, + {typ: done, running: 1, waiting: 0, started: true}, + {typ: done, running: 0, waiting: 0, started: false}, + {typ: add1, running: 1, waiting: 0, started: true}, + }, + }, { + max: 3, + run: []call{ + {typ: add1, running: 1, waiting: 0, started: true}, + {typ: add1, running: 2, waiting: 0, started: true}, + {typ: add1, running: 3, waiting: 0, started: true}, + {typ: add1, running: 3, waiting: 1, started: false}, + {typ: add1, running: 3, waiting: 2, started: false}, + {typ: add1, running: 3, waiting: 3, started: false}, + {typ: done, running: 3, waiting: 2, started: true}, + {typ: add1, running: 3, waiting: 3, started: false}, + {typ: done, running: 3, waiting: 2, started: true}, + {typ: done, running: 3, waiting: 1, started: true}, + {typ: done, running: 3, waiting: 0, started: true}, + {typ: done, running: 2, waiting: 0, started: false}, + {typ: done, running: 1, waiting: 0, started: false}, + {typ: done, running: 0, waiting: 0, started: false}, + }, + }} + for i, tc := range testCases { + ctx := &testContext{ + startParallel: make(chan bool), + maxParallel: tc.max, + } + for j, call := range tc.run { + doCall := func(f func()) chan bool { + done := make(chan bool) + go func() { + f() + done <- true + }() + return done + } + started := false + switch call.typ { + case add1: + signal := doCall(ctx.waitParallel) + select { + case <-signal: + started = true + case ctx.startParallel <- true: + <-signal + } + case done: + signal := doCall(ctx.release) + select { + case <-signal: + case <-ctx.startParallel: + started = true + <-signal + } + } + if started != call.started { + t.Errorf("%d:%d:started: got %v; want %v", i, j, started, call.started) + } + if ctx.running != call.running { + t.Errorf("%d:%d:running: got %v; want %v", i, j, ctx.running, call.running) + } + if ctx.numWaiting != call.waiting { + t.Errorf("%d:%d:waiting: got %v; want %v", i, j, ctx.numWaiting, call.waiting) + } + } + } +} + func TestTRun(t *T) { realTest := t testCases := []struct { @@ -415,10 +511,10 @@ func TestTRun(t *T) { buf := &strings.Builder{} root := &T{ common: common{ - doneOrParallel: make(chan struct{}), - runParallel: make(chan struct{}), - name: "", - w: buf, + signal: make(chan bool), + barrier: make(chan bool), + name: "", + w: buf, }, context: ctx, } @@ -427,6 +523,7 @@ func TestTRun(t *T) { root.chatty.json = tc.json } ok := root.Run(tc.desc, tc.f) + ctx.release() if ok != tc.ok { t.Errorf("%s:ok: got %v; want %v", tc.desc, ok, tc.ok) @@ -434,8 +531,8 @@ func TestTRun(t *T) { if ok != !root.Failed() { t.Errorf("%s:root failed: got %v; want %v", tc.desc, !ok, root.Failed()) } - if n := ctx.running(); n != 0 { - t.Errorf("%s:running non-zero: got %d", tc.desc, n) + if ctx.running != 0 || ctx.numWaiting != 0 { + t.Errorf("%s:running and waiting non-zero: got %d and %d", tc.desc, ctx.running, ctx.numWaiting) } got := strings.TrimSpace(buf.String()) want := strings.TrimSpace(tc.output) @@ -604,9 +701,9 @@ func TestBRun(t *T) { // the benchtime and catch the failure result of the subbenchmark. root := &B{ common: common{ - doneOrParallel: make(chan struct{}), - name: "root", - w: buf, + signal: make(chan bool), + name: "root", + w: buf, }, benchFunc: func(b *B) { ok = b.Run("test", tc.f) }, // Use Run to catch failure. benchTime: durationOrCountFlag{d: 1 * time.Microsecond}, @@ -740,8 +837,8 @@ func TestLogAfterComplete(t *T) { common: common{ // Use a buffered channel so that tRunner can write // to it although nothing is reading from it. 
- doneOrParallel: make(chan struct{}), - w: &buf, + signal: make(chan bool, 1), + w: &buf, }, context: ctx, } @@ -813,22 +910,18 @@ func TestCleanup(t *T) { func TestConcurrentCleanup(t *T) { cleanups := 0 t.Run("test", func(t *T) { - var wg sync.WaitGroup - wg.Add(2) + done := make(chan struct{}) for i := 0; i < 2; i++ { i := i go func() { t.Cleanup(func() { - // Although the calls to Cleanup are concurrent, the functions passed - // to Cleanup should be called sequentially, in some nondeterministic - // order based on when the Cleanup calls happened to be scheduled. - // So these assignments to the cleanups variable should not race. cleanups |= 1 << i }) - wg.Done() + done <- struct{}{} }() } - wg.Wait() + <-done + <-done }) if cleanups != 1|2 { t.Errorf("unexpected cleanup; got %d want 3", cleanups) diff --git a/src/testing/testing.go b/src/testing/testing.go index 96f71c89b9..ed8b3630f1 100644 --- a/src/testing/testing.go +++ b/src/testing/testing.go @@ -480,7 +480,7 @@ var ( cpuList []int testlogFile *os.File - anyFailed atomic.Bool + numFailed atomic.Uint32 // number of test failures running sync.Map // map[string]time.Time of running, unpaused tests ) @@ -593,40 +593,38 @@ const maxStackLen = 50 // captures common methods such as Errorf. type common struct { mu sync.RWMutex // guards this group of fields - output []byte // output generated by test or benchmark - w io.Writer // output to which flushToParent should write - ranAnyLeaf bool // Test or benchmark ran to completion without calling Run itself, or had a subtest that did so. + output []byte // Output generated by test or benchmark. + w io.Writer // For flushToParent. + ran bool // Test or benchmark (or one of its subtests) was executed. failed bool // Test or benchmark has failed. skipped bool // Test or benchmark has been skipped. - goexiting bool // Test function is attempting to abort by goexit (implies failed || skipped). done bool // Test is finished and all subtests have completed. helperPCs map[uintptr]struct{} // functions to be skipped when writing file/line info helperNames map[string]struct{} // helperPCs converted to function names cleanups []func() // optional functions to be called at the end of the test - cleanupName string // name of the cleanup function currently running - cleanupPc []uintptr // stack trace at the point where Cleanup was called - - chatty *chattyPrinter // copy of chattyPrinter, if the chatty flag is set - bench bool // Current test is a benchmark. - runner string // function name of tRunner running the test - isParallel bool // Test is running in parallel. - inFuzzFn bool // Fuzz target, if this is one, is running. - inCleanup bool // Cleanup callbacks, if any, are running. - - parent *common - level int // nesting depth of test or benchmark - creator []uintptr // if level > 0, the stack trace at the point where the parent called t.Run - name string // name of test or benchmark - start time.Time // time test or benchmark started or resumed - duration time.Duration // time in the test function, excluding time blocked in t.Parallel - runParallel chan struct{} // Closed when parallel subtests may start. Nil when T.Parallel is not present (B) or not usable (when fuzzing). - doneOrParallel chan struct{} // Closed when the test is either blocked in Parallel or done running. - - hasSub atomic.Bool // Test or benchmark has subtests or sub-benchmarks. 
- parallelSubtests sync.WaitGroup - runMu sync.Mutex // Held during calls to Run to prevent the total number of active subtests from exceeding the parallelism limit. - - lastRaceErrors atomic.Int64 // max value of race.Errors seen during the test or its subtests + cleanupName string // Name of the cleanup function. + cleanupPc []uintptr // The stack trace at the point where Cleanup was called. + finished bool // Test function has completed. + inFuzzFn bool // Whether the fuzz target, if this is one, is running. + + chatty *chattyPrinter // A copy of chattyPrinter, if the chatty flag is set. + bench bool // Whether the current test is a benchmark. + hasSub atomic.Bool // whether there are sub-benchmarks. + cleanupStarted atomic.Bool // Registered cleanup callbacks have started to execute + runner string // Function name of tRunner running the test. + isParallel bool // Whether the test is parallel. + + parent *common + level int // Nesting depth of test or benchmark. + creator []uintptr // If level > 0, the stack trace at the point where the parent called t.Run. + name string // Name of test or benchmark. + start time.Time // Time test or benchmark started + duration time.Duration + barrier chan bool // To signal parallel subtests they may start. Nil when T.Parallel is not present (B) or not usable (when fuzzing). + signal chan bool // To signal a test is done. + sub []*T // Queue of subtests to be run in parallel. + + lastRaceErrors atomic.Int64 // Max value of race.Errors seen during the test or its subtests. raceErrorLogged atomic.Bool tempDirMu sync.Mutex @@ -933,13 +931,13 @@ func (c *common) Name() string { return c.name } -func (c *common) setRanLeaf() { +func (c *common) setRan() { if c.parent != nil { - c.parent.setRanLeaf() + c.parent.setRan() } c.mu.Lock() defer c.mu.Unlock() - c.ranAnyLeaf = true + c.ran = true } // Fail marks the function as having failed but continues execution. @@ -954,7 +952,6 @@ func (c *common) Fail() { panic("Fail in goroutine after " + c.name + " has completed") } c.failed = true - anyFailed.Store(true) } // Failed reports whether the function has failed. @@ -1003,7 +1000,7 @@ func (c *common) FailNow() { // a top-of-stack deferred function now, we know that the send // only happens after any other stacked defers have completed. c.mu.Lock() - c.goexiting = true + c.finished = true c.mu.Unlock() runtime.Goexit() } @@ -1118,7 +1115,7 @@ func (c *common) SkipNow() { c.checkFuzzFn("SkipNow") c.mu.Lock() c.skipped = true - c.goexiting = true + c.finished = true c.mu.Unlock() runtime.Goexit() } @@ -1321,8 +1318,8 @@ const ( // If ph is recoverAndReturnPanic, it will catch panics, and return the // recovered value if any. func (c *common) runCleanup(ph panicHandling) (panicVal any) { - c.inCleanup = true - defer func() { c.inCleanup = false }() + c.cleanupStarted.Store(true) + defer c.cleanupStarted.Store(false) if ph == recoverAndReturnPanic { defer func() { @@ -1449,7 +1446,8 @@ func (t *T) Parallel() { if t.isEnvSet { panic("testing: t.Parallel called after t.Setenv; cannot set environment variables in parallel tests") } - if t.parent.runParallel == nil { + t.isParallel = true + if t.parent.barrier == nil { // T.Parallel has no effect when fuzzing. // Multiple processes may run in parallel, but only one input can run at a // time per process so we can attribute crashes to specific inputs. @@ -1462,7 +1460,7 @@ func (t *T) Parallel() { t.duration += time.Since(t.start) // Add to the list of tests to be released by the parent. 
- t.parent.parallelSubtests.Add(1) + t.parent.sub = append(t.parent.sub, t) // Report any races during execution of this test up to this point. // @@ -1481,19 +1479,9 @@ func (t *T) Parallel() { } running.Delete(t.name) - t.isParallel = true - - // Release the parent test to run. We can't just use parallelSem tokens for - // this because some callers (notably TestParallelSub) expect to be able to - // call Run from multiple goroutines and have those calls succeed. - // - // Instead, we close a channel to unblock the parent's call to Run, allowing - // it to resume. Then, we wait for it to finish and unblock its parallel - // subtests, and acquire a parallel run token so that we don't run too many of - // the subtests together all at once. - close(t.doneOrParallel) - <-t.parent.runParallel - t.context.acquireParallel() + t.signal <- true // Release calling test. + <-t.parent.barrier // Wait for the parent test to complete. + t.context.waitParallel() if t.chatty != nil { t.chatty.Updatef(t.name, "=== CONT %s\n", t.name) @@ -1550,13 +1538,19 @@ var errNilPanicOrGoexit = errors.New("test executed panic(nil) or runtime.Goexit func tRunner(t *T, fn func(t *T)) { t.runner = callerName(0) - returned := false // When this goroutine is done, either because fn(t) // returned normally or because a test failure triggered // a call to runtime.Goexit, record the duration and send // a signal saying that the test is done. defer func() { + t.checkRaces() + + // TODO(#61034): This is the wrong place for this check. + if t.Failed() { + numFailed.Add(1) + } + // Check if the test panicked or Goexited inappropriately. // // If this happens in a normal test, print output but continue panicking. @@ -1565,120 +1559,132 @@ func tRunner(t *T, fn func(t *T)) { // If this happens while fuzzing, recover from the panic and treat it like a // normal failure. It's important that the process keeps running in order to // find short inputs that cause panics. - panicVal := recover() - if !returned && !t.goexiting && panicVal == nil { - panicVal = errNilPanicOrGoexit + err := recover() + signal := true + + t.mu.RLock() + finished := t.finished + t.mu.RUnlock() + if !finished && err == nil { + err = errNilPanicOrGoexit for p := t.parent; p != nil; p = p.parent { p.mu.RLock() - pGoexiting := p.goexiting + finished = p.finished p.mu.RUnlock() - if pGoexiting { - t.Errorf("%v: subtest may have called FailNow on a parent test", panicVal) - panicVal = nil + if finished { + if !t.isParallel { + t.Errorf("%v: subtest may have called FailNow on a parent test", err) + err = nil + } + signal = false break } } } - if panicVal != nil && t.context.isFuzzing { + if err != nil && t.context.isFuzzing { prefix := "panic: " - if panicVal == errNilPanicOrGoexit { + if err == errNilPanicOrGoexit { prefix = "" } - t.Errorf("%s%s\n%s\n", prefix, panicVal, string(debug.Stack())) - panicVal = nil - } - - if panicVal != nil { - // Mark the test as failed so that Cleanup functions can see its correct status. - t.Fail() - } else if t.runParallel != nil { - // Run parallel subtests. - - // Check for races before starting parallel subtests, so that if a - // parallel subtest *also* triggers a data race we will report the two - // races to the two tests and not attribute all of them to the subtest. - t.checkRaces() - - if t.isParallel { - // Release our own parallel token first, so that it is available for - // subtests to acquire. 
- t.context.releaseParallel() - } - close(t.runParallel) - t.parallelSubtests.Wait() - if t.isParallel { - // Re-acquire a parallel token to limit concurrent cleanup. - t.context.acquireParallel() - } + t.Errorf("%s%s\n%s\n", prefix, err, string(debug.Stack())) + t.mu.Lock() + t.finished = true + t.mu.Unlock() + err = nil } // Use a deferred call to ensure that we report that the test is // complete even if a cleanup function calls t.FailNow. See issue 41355. + didPanic := false defer func() { - cleanupPanic := recover() - if panicVal == nil { - panicVal = cleanupPanic - } - // Only report that the test is complete if it doesn't panic, // as otherwise the test binary can exit before the panic is // reported to the user. See issue 41479. - if panicVal != nil { - // Flush the output log up to the root before dying. - for root := &t.common; root.parent != nil; root = root.parent { - root.mu.Lock() - root.duration += time.Since(root.start) - d := root.duration - root.mu.Unlock() - root.flushToParent(root.name, "--- FAIL: %s (%s)\n", root.name, fmtDuration(d)) - - // Since the parent will never finish running, do its cleanup now. - // Run the cleanup in a fresh goroutine in case it calls runtime.Goexit, - // which we cannot recover. - cleanupDone := make(chan struct{}) - go func() { - defer close(cleanupDone) - if r := root.parent.runCleanup(recoverAndReturnPanic); r != nil { - fmt.Fprintf(root.parent.w, "cleanup panicked with %v", r) - } - }() - <-cleanupDone - } - panic(panicVal) + if didPanic { + return + } + if err != nil { + panic(err) } + running.Delete(t.name) + t.signal <- signal + }() - t.checkRaces() - t.duration += time.Since(t.start) - t.report() - - // Do not lock t.done to allow race detector to detect race in case - // the user does not appropriately synchronize a goroutine. - t.done = true - if t.parent != nil && !t.hasSub.Load() { - t.setRanLeaf() + doPanic := func(err any) { + t.Fail() + if r := t.runCleanup(recoverAndReturnPanic); r != nil { + t.Logf("cleanup panicked with %v", r) } + // Flush the output log up to the root before dying. + for root := &t.common; root.parent != nil; root = root.parent { + root.mu.Lock() + root.duration += time.Since(root.start) + d := root.duration + root.mu.Unlock() + root.flushToParent(root.name, "--- FAIL: %s (%s)\n", root.name, fmtDuration(d)) + if r := root.parent.runCleanup(recoverAndReturnPanic); r != nil { + fmt.Fprintf(root.parent.w, "cleanup panicked with %v", r) + } + } + didPanic = true + panic(err) + } + if err != nil { + doPanic(err) + } - running.Delete(t.name) - if t.isParallel { - t.context.releaseParallel() - t.parent.parallelSubtests.Done() - } else { - close(t.doneOrParallel) + t.duration += time.Since(t.start) + + if len(t.sub) > 0 { + // Run parallel subtests. + // Decrease the running count for this test. + t.context.release() + // Release the parallel subtests. + close(t.barrier) + // Wait for subtests to complete. + for _, sub := range t.sub { + <-sub.signal } - }() + cleanupStart := time.Now() + err := t.runCleanup(recoverAndReturnPanic) + t.duration += time.Since(cleanupStart) + if err != nil { + doPanic(err) + } + t.checkRaces() + if !t.isParallel { + // Reacquire the count for sequential tests. See comment in Run. + t.context.waitParallel() + } + } else if t.isParallel { + // Only release the count for this test if it was run as a parallel + // test. See comment in Run method. + t.context.release() + } + t.report() // Report after all subtests have finished. 
- t.runCleanup(normalPanic) + // Do not lock t.done to allow race detector to detect race in case + // the user does not appropriately synchronize a goroutine. + t.done = true + if t.parent != nil && !t.hasSub.Load() { + t.setRan() + } + }() + defer func() { + if len(t.sub) == 0 { + t.runCleanup(normalPanic) + } }() - // Run the actual test function. t.start = time.Now() t.resetRaces() fn(t) - // Code beyond this point will not be executed when FailNow or SkipNow - // is invoked. - returned = true + // code beyond here will not be executed when FailNow is invoked + t.mu.Lock() + t.finished = true + t.mu.Unlock() } // Run runs f as a subtest of t called name. It runs f in a separate goroutine @@ -1688,7 +1694,7 @@ func tRunner(t *T, fn func(t *T)) { // Run may be called simultaneously from multiple goroutines, but all such calls // must return before the outer test function for t returns. func (t *T) Run(name string, f func(t *T)) bool { - if t.inCleanup { + if t.cleanupStarted.Load() { panic("testing: t.Run called during t.Cleanup") } @@ -1702,56 +1708,40 @@ func (t *T) Run(name string, f func(t *T)) bool { // continue walking the stack into the parent test. var pc [maxStackLen]uintptr n := runtime.Callers(2, pc[:]) - sub := &T{ + t = &T{ common: common{ - runParallel: make(chan struct{}), - doneOrParallel: make(chan struct{}), - name: testName, - parent: &t.common, - level: t.level + 1, - creator: pc[:n], - chatty: t.chatty, + barrier: make(chan bool), + signal: make(chan bool, 1), + name: testName, + parent: &t.common, + level: t.level + 1, + creator: pc[:n], + chatty: t.chatty, }, context: t.context, } - sub.w = indenter{&sub.common} - - // Ensure that only one non-parallel subtest runs at a time so that we don't - // exceed the -parallel setting due to concurrent calls. - // (Run may be called concurrently even if the test is not marked parallel — - // see TestParallelSub.) - t.runMu.Lock() - defer t.runMu.Unlock() - - if t.isParallel { - // Release our parallelism token for the subtest to use - // for its own parallel subtests. - t.context.releaseParallel() - defer t.context.acquireParallel() - } + t.w = indenter{&t.common} - if sub.chatty != nil { - sub.chatty.Updatef(sub.name, "=== RUN %s\n", sub.name) + if t.chatty != nil { + t.chatty.Updatef(t.name, "=== RUN %s\n", t.name) } - running.Store(sub.name, time.Now()) + running.Store(t.name, time.Now()) // Instead of reducing the running count of this test before calling the // tRunner and increasing it afterwards, we rely on tRunner keeping the // count correct. This ensures that a sequence of sequential tests runs // without being preempted, even when their parent is a parallel test. This // may especially reduce surprises if *parallel == 1. - go tRunner(sub, f) - <-sub.doneOrParallel - if t.goexiting { - // The parent test (t) thinks it called runtime.Goexit, but here we are - // still running. It is likely that this subtest called FailNow or SkipNow - // on the t instead of sub, so propagate the Goexit to the parent goroutine. + go tRunner(t, f) + if !<-t.signal { + // At this point, it is likely that FailNow was called on one of the + // parent tests by one of the subtests. Continue aborting up the chain. 
runtime.Goexit() } if t.chatty != nil && t.chatty.json { - t.chatty.Updatef(t.name, "=== NAME %s\n", t.name) + t.chatty.Updatef(t.parent.name, "=== NAME %s\n", t.parent.name) } - return !sub.failed + return !t.failed } // Deadline reports the time at which the test binary will have @@ -1775,43 +1765,53 @@ type testContext struct { // does not match). isFuzzing bool - // parallelSem is a counting semaphore to limit concurrency of Parallel tests. - // It has a capacity equal to the parallel flag. - // Send a token to acquire; receive to release. - // Non-parallel tests do not require a token. - parallelSem chan token -} + mu sync.Mutex -// A token is a semaphore token. -type token struct{} + // Channel used to signal tests that are ready to be run in parallel. + startParallel chan bool -// newTestContext returns a new testContext with the given parallelism and matcher. -func newTestContext(maxParallel int, m *matcher) *testContext { - tc := &testContext{ - match: m, - parallelSem: make(chan token, maxParallel), - } - return tc + // running is the number of tests currently running in parallel. + // This does not include tests that are waiting for subtests to complete. + running int + + // numWaiting is the number tests waiting to be run in parallel. + numWaiting int + + // maxParallel is a copy of the parallel flag. + maxParallel int } -// acquireParallel blocks until it can obtain a semaphore token for running a -// parallel test. -func (c *testContext) acquireParallel() { - c.parallelSem <- token{} +func newTestContext(maxParallel int, m *matcher) *testContext { + return &testContext{ + match: m, + startParallel: make(chan bool), + maxParallel: maxParallel, + running: 1, // Set the count to 1 for the main (sequential) test. + } } -// releaseParallel returns a semaphore token obtained by acquireParallel. -func (c *testContext) releaseParallel() { - select { - case <-c.parallelSem: - default: - panic("testing: internal error: no parallel token to release") +func (c *testContext) waitParallel() { + c.mu.Lock() + if c.running < c.maxParallel { + c.running++ + c.mu.Unlock() + return } + c.numWaiting++ + c.mu.Unlock() + <-c.startParallel } -// running returns the number of semaphore tokens outstanding. -func (c *testContext) running() int { - return len(c.parallelSem) +func (c *testContext) release() { + c.mu.Lock() + if c.numWaiting == 0 { + c.running-- + c.mu.Unlock() + return + } + c.numWaiting-- + c.mu.Unlock() + c.startParallel <- true // Pick a waiting test to be run. } // No one should be using func Main anymore. @@ -2132,9 +2132,9 @@ func runTests(matchString func(pat, str string) (bool, error), tests []InternalT ctx.deadline = deadline t := &T{ common: common{ - doneOrParallel: make(chan struct{}), - runParallel: make(chan struct{}), - w: os.Stdout, + signal: make(chan bool, 1), + barrier: make(chan bool), + w: os.Stdout, }, context: ctx, } @@ -2147,12 +2147,12 @@ func runTests(matchString func(pat, str string) (bool, error), tests []InternalT } }) select { - case <-t.doneOrParallel: + case <-t.signal: default: - panic("internal error: tRunner exited without closing t.doneOrParallel") + panic("internal error: tRunner exited without sending on t.signal") } ok = ok && !t.Failed() - ran = ran || t.ranAnyLeaf + ran = ran || t.ran } } return ran, ok @@ -2390,5 +2390,5 @@ func parseCpuList() { } func shouldFailFast() bool { - return *failFast && anyFailed.Load() + return *failFast && numFailed.Load() > 0 }
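
Among the things this revert restores is testContext's mutex-plus-channel throttle (waitParallel/release in testing.go above), in place of the buffered-channel semaphore introduced by CL 506755. Below is a minimal standalone sketch of that hand-off pattern, not part of the patch itself; the names limiter, wait, and release are illustrative only and do not come from the testing package, and the real testContext additionally starts with running == 1 to account for the main sequential goroutine.

package main

import (
	"fmt"
	"sync"
	"time"
)

// limiter caps the number of goroutines running at once, mirroring the
// restored testContext fields startParallel, running, numWaiting, maxParallel.
type limiter struct {
	mu            sync.Mutex
	startParallel chan bool // unbuffered: a send hands a slot to exactly one waiter
	running       int       // goroutines currently holding a slot
	numWaiting    int       // goroutines blocked in wait()
	max           int
}

func newLimiter(max int) *limiter {
	return &limiter{startParallel: make(chan bool), max: max}
}

// wait blocks until a slot is available, like testContext.waitParallel.
func (l *limiter) wait() {
	l.mu.Lock()
	if l.running < l.max {
		l.running++
		l.mu.Unlock()
		return
	}
	l.numWaiting++
	l.mu.Unlock()
	<-l.startParallel // wait for a finishing goroutine to hand over its slot
}

// release frees a slot, like testContext.release. If another goroutine is
// waiting, the slot is handed over directly and running stays unchanged.
func (l *limiter) release() {
	l.mu.Lock()
	if l.numWaiting == 0 {
		l.running--
		l.mu.Unlock()
		return
	}
	l.numWaiting--
	l.mu.Unlock()
	l.startParallel <- true // pick one waiting goroutine to run
}

func main() {
	l := newLimiter(2)
	var wg sync.WaitGroup
	for i := 0; i < 5; i++ {
		wg.Add(1)
		go func(i int) {
			defer wg.Done()
			l.wait()
			defer l.release()
			fmt.Println("running", i)
			time.Sleep(10 * time.Millisecond)
		}(i)
	}
	wg.Wait()
}

The direct hand-off in release keeps running constant while a waiter takes over the freed slot; that accounting is exactly what the restored TestTestContext in sub_test.go checks, asserting the expected running and numWaiting values after every waitParallel/release step.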