From 0c64be46ac292ea5b387c6cda49b07f4a366c404 Mon Sep 17 00:00:00 2001 From: Michael Pratt Date: Mon, 22 Jan 2024 17:50:44 -0500 Subject: [PATCH] runtime: disable use of runnext on wasm When readying a goroutine, the scheduler typically places the readied goroutine in pp.runnext, which will typically be the next goroutine to run in the schedule. In order to prevent a set of ping-pong goroutines from simply switching back and forth via runnext and starving the rest of the run queue, a goroutine scheduled via runnext shares a time slice (pp.schedtick) with the previous goroutine. sysmon detects "long-running goroutines", which really means Ps using the same pp.schedtick for too long, and preempts them to allow the rest of the run queue to run. Thus this avoids starvation via runnext. However, wasm has no threads, and thus no sysmon. Without sysmon to preempt, the possibility for starvation returns. Avoid this by disabling runnext entirely on wasm. This means that readied goroutines always go on the end of the run queue and thus cannot starve via runnext. Note that this CL doesn't do anything about single long-running goroutines. Without sysmon to preempt them, a single goroutine that fails to yield will starve the run queue indefinitely. For #65178. Cq-Include-Trybots: luci.golang.try:gotip-js-wasm,gotip-wasip1-wasm_wasmtime,gotip-wasip1-wasm_wazero Change-Id: I7dffe1e72c6586474186b72f8068cff77b661eae Reviewed-on: https://go-review.googlesource.com/c/go/+/557437 LUCI-TryBot-Result: Go LUCI Reviewed-by: Mauri de Souza Meneguzzo Reviewed-by: Michael Knyszek Reviewed-by: Bryan Mills --- src/net/net_fake.go | 17 ----------------- src/runtime/proc.go | 23 +++++++++++++++++++++-- src/time/sleep_test.go | 5 ----- 3 files changed, 21 insertions(+), 24 deletions(-) diff --git a/src/net/net_fake.go b/src/net/net_fake.go index 2d1e137b6d..f7eb28e01a 100644 --- a/src/net/net_fake.go +++ b/src/net/net_fake.go @@ -14,7 +14,6 @@ import ( "errors" "io" "os" - "runtime" "sync" "sync/atomic" "syscall" @@ -517,14 +516,6 @@ func (pq *packetQueue) send(dt *deadlineTimer, b []byte, from sockaddr, block bo full = pq.full } - // Before we check dt.expired, yield to other goroutines. - // This may help to prevent starvation of the goroutine that runs the - // deadlineTimer's time.After callback. - // - // TODO(#65178): Remove this when the runtime scheduler no longer starves - // runnable goroutines. - runtime.Gosched() - select { case <-dt.expired: return 0, os.ErrDeadlineExceeded @@ -576,14 +567,6 @@ func (pq *packetQueue) recvfrom(dt *deadlineTimer, b []byte, wholePacket bool, c empty = pq.empty } - // Before we check dt.expired, yield to other goroutines. - // This may help to prevent starvation of the goroutine that runs the - // deadlineTimer's time.After callback. - // - // TODO(#65178): Remove this when the runtime scheduler no longer starves - // runnable goroutines. - runtime.Gosched() - select { case <-dt.expired: return 0, nil, os.ErrDeadlineExceeded diff --git a/src/runtime/proc.go b/src/runtime/proc.go index c1b3ce20d5..410bd01169 100644 --- a/src/runtime/proc.go +++ b/src/runtime/proc.go @@ -167,7 +167,7 @@ func main() { // Allow newproc to start new Ms. mainStarted = true - if GOARCH != "wasm" { // no threads on wasm yet, so no sysmon + if haveSysmon { systemstack(func() { newm(sysmon, nil, -1) }) @@ -5933,6 +5933,11 @@ var forcegcperiod int64 = 2 * 60 * 1e9 // golang.org/issue/42515 is needed on NetBSD. var needSysmonWorkaround bool = false +// haveSysmon indicates whether there is sysmon thread support. +// +// No threads on wasm yet, so no sysmon. +const haveSysmon = GOARCH != "wasm" + // Always runs without a P, so write barriers are not allowed. // //go:nowritebarrierrec @@ -6113,7 +6118,10 @@ func retake(now int64) uint32 { s := pp.status sysretake := false if s == _Prunning || s == _Psyscall { - // Preempt G if it's running for too long. + // Preempt G if it's running on the same schedtick for + // too long. This could be from a single long-running + // goroutine or a sequence of goroutines run via + // runnext, which share a single schedtick time slice. t := int64(pp.schedtick) if int64(pd.schedtick) != t { pd.schedtick = uint32(t) @@ -6624,6 +6632,17 @@ const randomizeScheduler = raceenabled // If the run queue is full, runnext puts g on the global queue. // Executed only by the owner P. func runqput(pp *p, gp *g, next bool) { + if !haveSysmon && next { + // A runnext goroutine shares the same time slice as the + // current goroutine (inheritTime from runqget). To prevent a + // ping-pong pair of goroutines from starving all others, we + // depend on sysmon to preempt "long-running goroutines". That + // is, any set of goroutines sharing the same time slice. + // + // If there is no sysmon, we must avoid runnext entirely or + // risk starvation. + next = false + } if randomizeScheduler && next && randn(2) == 0 { next = false } diff --git a/src/time/sleep_test.go b/src/time/sleep_test.go index b25606dfed..36e203e65e 100644 --- a/src/time/sleep_test.go +++ b/src/time/sleep_test.go @@ -96,11 +96,6 @@ func TestAfterFuncStarvation(t *testing.T) { // the AfterFunc goroutine instead of the runnable channel goroutine. // However, in https://go.dev/issue/65178 this was observed to live-lock // on wasip1/wasm and js/wasm after <10000 runs. - - if runtime.GOARCH == "wasm" { - testenv.SkipFlaky(t, 65178) - } - defer runtime.GOMAXPROCS(runtime.GOMAXPROCS(1)) var ( -- 2.48.1