From: Michael Anthony Knyszek <mknyszek@google.com>
Date: Thu, 23 May 2024 20:26:02 +0000 (+0000)
Subject: iter: deflake TestPull by letting exiting goroutines finish
X-Git-Tag: go1.23rc1~137
X-Git-Url: http://www.git.cypherpunks.su/?a=commitdiff_plain;h=54efef99b2b9432c2eb6cd63a287a13d43b9bb7b;p=gostls13.git

iter: deflake TestPull by letting exiting goroutines finish

Currently TestPull is flaky because goroutines spawned to run subtests
exit asynchronously when they finish and TestPull has explicit checks
for the number of existing goroutines.

This is pretty much only a problem between subtests executing, because
within each subtest the coroutine goroutine spawned for iter.Pull always
exits fully synchronously before the final `next` or `stop` returns.

So, we can resolve the problem by ensuring the first goroutine count the
test takes likely doesn't contain any exiting goroutines. The trick is
to set GOMAXPROCS=1 and spin in runtime.Gosched until the number of
goroutines stabilizes to some reasonable degree (we pick 100 consecutive
iterations; there are only a handful of possible goroutines that can
run, so this is giving that handful around 20 chances to actually run to
completion).

When running TestPull under stress2, this issue is easily reproducible
before this CL. After this CL, it no longer reproduces under these
conditions.

Fixes #66017.

Change-Id: I4bf0a9771f7364df7dd58f8aeb3ae26742d5746f
Reviewed-on: https://go-review.googlesource.com/c/go/+/587917
Reviewed-by: David Chase <drchase@google.com>
LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com>
---

diff --git a/src/iter/pull_test.go b/src/iter/pull_test.go
index 21db1029af..c39574b959 100644
--- a/src/iter/pull_test.go
+++ b/src/iter/pull_test.go
@@ -34,7 +34,7 @@ func squares(n int) Seq2[int, int64] {
 func TestPull(t *testing.T) {
 	for end := 0; end <= 3; end++ {
 		t.Run(fmt.Sprint(end), func(t *testing.T) {
-			ng := runtime.NumGoroutine()
+			ng := stableNumGoroutine()
 			wantNG := func(want int) {
 				if xg := runtime.NumGoroutine() - ng; xg != want {
 					t.Helper()
@@ -76,7 +76,7 @@ func TestPull(t *testing.T) {
 func TestPull2(t *testing.T) {
 	for end := 0; end <= 3; end++ {
 		t.Run(fmt.Sprint(end), func(t *testing.T) {
-			ng := runtime.NumGoroutine()
+			ng := stableNumGoroutine()
 			wantNG := func(want int) {
 				if xg := runtime.NumGoroutine() - ng; xg != want {
 					t.Helper()
@@ -115,6 +115,39 @@ func TestPull2(t *testing.T) {
 	}
 }
 
+// stableNumGoroutine is like NumGoroutine but tries to ensure stability of
+// the value by letting any exiting goroutines finish exiting.
+func stableNumGoroutine() int {
+	// The idea behind stablizing the value of NumGoroutine is to
+	// see the same value enough times in a row in between calls to
+	// runtime.Gosched. With GOMAXPROCS=1, we're trying to make sure
+	// that other goroutines run, so that they reach a stable point.
+	// It's not guaranteed, because it is still possible for a goroutine
+	// to Gosched back into itself, so we require NumGoroutine to be
+	// the same 100 times in a row. This should be more than enough to
+	// ensure all goroutines get a chance to run to completion (or to
+	// some block point) for a small group of test goroutines.
+	defer runtime.GOMAXPROCS(runtime.GOMAXPROCS(1))
+
+	c := 0
+	ng := runtime.NumGoroutine()
+	for i := 0; i < 1000; i++ {
+		nng := runtime.NumGoroutine()
+		if nng == ng {
+			c++
+		} else {
+			c = 0
+			ng = nng
+		}
+		if c >= 100 {
+			// The same value 100 times in a row is good enough.
+			return ng
+		}
+		runtime.Gosched()
+	}
+	panic("failed to stabilize NumGoroutine after 1000 iterations")
+}
+
 func TestPullDoubleNext(t *testing.T) {
 	next, _ := Pull(doDoubleNext())
 	nextSlot = next