From 1dafccaaf324547c92fbba40e343d9a142f106b4 Mon Sep 17 00:00:00 2001 From: Michael Anthony Knyszek Date: Wed, 26 Feb 2025 23:11:29 +0000 Subject: [PATCH] runtime: increase timeout in TestSpuriousWakeupsNeverHangSemasleep This change tries increasing the timeout in TestSpuriousWakeupsNeverHangSemasleep. I'm not entirely sure of the mechanism, but GODEBUG=gcstoptheworld=2 and GODEBUG=gccheckmark=1 can cause this test to fail at it's regular timeout. It does not seem to indicate a deadlock, because bumping the timeout 10x make the problem go away. I suspect the problem is due to the long STW times these two modes can induce, plus the fact this test runs in parallel with others. Let's just bump the timeout. The test is fundamentally sound, and it's unclear to me how else to test for a deadlock here. Fixes #71691. Fixes #71548. Change-Id: I649531eeec8a8408ba90823ce5223f3a17863124 Reviewed-on: https://go-review.googlesource.com/c/go/+/652756 Auto-Submit: Michael Knyszek Reviewed-by: Michael Pratt LUCI-TryBot-Result: Go LUCI Reviewed-by: Cherry Mui --- src/runtime/semasleep_test.go | 61 +++++++++++++++++++++-------------- 1 file changed, 36 insertions(+), 25 deletions(-) diff --git a/src/runtime/semasleep_test.go b/src/runtime/semasleep_test.go index c891bc7e0c..4f2eea9d92 100644 --- a/src/runtime/semasleep_test.go +++ b/src/runtime/semasleep_test.go @@ -72,14 +72,6 @@ func TestSpuriousWakeupsNeverHangSemasleep(t *testing.T) { close(doneCh) }() - // Wait for an arbitrary timeout longer than one second. The subprocess itself - // attempts to sleep for one second, but if the machine running the test is - // heavily loaded that subprocess may not schedule very quickly even if the - // bug remains fixed. (This is fine, because if the bug really is unfixed we - // can keep the process hung indefinitely, as long as we signal it often - // enough.) - timeout := 10 * time.Second - // The subprocess begins sleeping for 1s after it writes to stdout, so measure // the timeout from here (not from when we started creating the process). // That should reduce noise from process startup overhead. @@ -102,31 +94,50 @@ func TestSpuriousWakeupsNeverHangSemasleep(t *testing.T) { t.Fatalf("The program stopped too quickly.") } } + signaled := 0 for { select { - case now := <-ticker.C: - if now.Sub(ready) > timeout { - // If we got paused for a long time, for example if GODEBUG=gcstoptheworld=2, - // it could be that the subprocess did actually finish and not deadlock, we - // just got stuck as runnable or our wakeup was delayed. Double-check that - // we don't have anything from doneCh before declaring failure. - select { - case err := <-doneCh: - checkDoneErr(err) - return - default: - } - t.Error("Program failed to return on time and has to be killed, issue #27520 still exists") - // Send SIGQUIT to get a goroutine dump. - // Stop sending SIGIO so that the program can clean up and actually terminate. - cmd.Process.Signal(syscall.SIGQUIT) + // Wait for an arbitrary timeout longer than one second (we use the test binary's + // timeout). The subprocess itself attempts to sleep for one second, but if the + // machine running the test is heavily loaded that subprocess may not schedule + // very quickly even if the bug remains fixed. (This is fine, because if the bug + // really is unfixed we can keep the process hung indefinitely, as long as we + // signal it often enough.) + case <-t.Context().Done(): + // If we got paused for a long time for any reason (we're running in parallel + // with other tests after all) it could be that the subprocess did actually + // finish and not deadlock, we just got stuck as runnable or our wakeup was + // delayed. Double-check that we don't have anything from doneCh before + // declaring failure. + select { + case err := <-doneCh: + checkDoneErr(err) return + default: } + // Send SIGQUIT to get a goroutine dump. + // Stop sending SIGIO so that the program can clean up and actually terminate. + cmd.Process.Signal(syscall.SIGQUIT) + + // For failure, we require that we have waited at least 2 seconds. Otherwise + // if another test just ran for a long time and we just happened to push the + // overall binary to timeout, we will report a failure here that will just + // muddy the waters for whoever is trying to investigate the timeout. + const minTime = 2 * time.Second + if dur := time.Now().Sub(ready); dur > minTime { + t.Logf("signaled program %d times", signaled) + t.Logf("time waited: %v", dur) + t.Error("program failed to return on time and has to be killed, issue #27520 still exists") + } else { + t.Skipf("test context deadline ended, but test ran for %s<%s, cutting it short", dur, minTime) + } + return + case <-ticker.C: // Send the pesky signal that toggles spinning // indefinitely if #27520 is not fixed. cmd.Process.Signal(syscall.SIGIO) - + signaled++ case err := <-doneCh: checkDoneErr(err) return -- 2.50.0