From 3db4f8146c0a7e41a234042eb2057fbfddc17e5f Mon Sep 17 00:00:00 2001
From: Lucien Coffe
Date: Fri, 21 Apr 2023 13:44:35 +0200
Subject: [PATCH] [release-branch.go1.20] runtime: resolve checkdead panic by
 refining `startm` lock handling in caller context

This change addresses a `checkdead` panic caused by a race condition
between `sysmon->startm` and `checkdead` callers, due to prematurely
releasing the scheduler lock. The solution involves allowing a `startm`
caller to acquire the scheduler lock and call `startm` in this context.
A new `lockheld` bool argument is added to `startm`, which manages all
lock and unlock calls within the function. The `startIdle` function
variable in `injectglist` is updated to call `startm` with the lock
held, ensuring proper lock handling in this specific case. This refined
lock handling resolves the observed race condition issue.

For #59600.
Fixes #60760.

Change-Id: I11663a15536c10c773fc2fde291d959099aa71be
Reviewed-on: https://go-review.googlesource.com/c/go/+/487316
Reviewed-by: Dmitri Shuralyov
Reviewed-by: Michael Pratt
TryBot-Result: Gopher Robot
Run-TryBot: Michael Pratt
(cherry picked from commit ff059add10d71fe13239cf893c0cca113de1fc21)
Reviewed-on: https://go-review.googlesource.com/c/go/+/504395
Reviewed-by: Lucien Coffe
Auto-Submit: Dmitri Shuralyov
Run-TryBot: Dmitri Shuralyov
---
 src/runtime/proc.go | 45 +++++++++++++++++++++++++++++++--------------
 1 file changed, 31 insertions(+), 14 deletions(-)

diff --git a/src/runtime/proc.go b/src/runtime/proc.go
index 7cd6898a93..bf2a33ea92 100644
--- a/src/runtime/proc.go
+++ b/src/runtime/proc.go
@@ -2351,10 +2351,15 @@ func mspinning() {
 // Callers passing a non-nil P must call from a non-preemptible context. See
 // comment on acquirem below.
 //
+// Argument lockheld indicates whether the caller already acquired the
+// scheduler lock. Callers holding the lock when making the call must pass
+// true. The lock might be temporarily dropped, but will be reacquired before
+// returning.
+//
 // Must not have write barriers because this may be called without a P.
 //
 //go:nowritebarrierrec
-func startm(pp *p, spinning bool) {
+func startm(pp *p, spinning, lockheld bool) {
 	// Disable preemption.
 	//
 	// Every owned P must have an owner that will eventually stop it in the
@@ -2372,7 +2377,9 @@ func startm(pp *p, spinning bool) {
 	// startm. Callers passing a nil P may be preemptible, so we must
 	// disable preemption before acquiring a P from pidleget below.
 	mp := acquirem()
-	lock(&sched.lock)
+	if !lockheld {
+		lock(&sched.lock)
+	}
 	if pp == nil {
 		if spinning {
 			// TODO(prattmic): All remaining calls to this function
@@ -2382,7 +2389,9 @@
 		}
 		pp, _ = pidleget(0)
 		if pp == nil {
-			unlock(&sched.lock)
+			if !lockheld {
+				unlock(&sched.lock)
+			}
 			releasem(mp)
 			return
 		}
@@ -2396,6 +2405,8 @@
 		// could find no idle P while checkdead finds a runnable G but
 		// no running M's because this new M hasn't started yet, thus
 		// throwing in an apparent deadlock.
+		// This apparent deadlock is possible when startm is called
+		// from sysmon, which doesn't count as a running M.
 		//
 		// Avoid this situation by pre-allocating the ID for the new M,
 		// thus marking it as 'running' before we drop sched.lock. This
@@ -2410,12 +2421,18 @@
 			fn = mspinning
 		}
 		newm(fn, pp, id)
+
+		if lockheld {
+			lock(&sched.lock)
+		}
 		// Ownership transfer of pp committed by start in newm.
 		// Preemption is now safe.
 		releasem(mp)
 		return
 	}
-	unlock(&sched.lock)
+	if !lockheld {
+		unlock(&sched.lock)
+	}
 	if nmp.spinning {
 		throw("startm: m is spinning")
 	}
@@ -2444,24 +2461,24 @@ func handoffp(pp *p) {
 
 	// if it has local work, start it straight away
 	if !runqempty(pp) || sched.runqsize != 0 {
-		startm(pp, false)
+		startm(pp, false, false)
 		return
 	}
 	// if there's trace work to do, start it straight away
 	if (trace.enabled || trace.shutdown) && traceReaderAvailable() != nil {
-		startm(pp, false)
+		startm(pp, false, false)
 		return
 	}
 	// if it has GC work, start it straight away
 	if gcBlackenEnabled != 0 && gcMarkWorkAvailable(pp) {
-		startm(pp, false)
+		startm(pp, false, false)
 		return
 	}
 	// no local work, check that there are no spinning/idle M's,
 	// otherwise our help is not required
 	if sched.nmspinning.Load()+sched.npidle.Load() == 0 && sched.nmspinning.CompareAndSwap(0, 1) { // TODO: fast atomic
 		sched.needspinning.Store(0)
-		startm(pp, true)
+		startm(pp, true, false)
 		return
 	}
 	lock(&sched.lock)
@@ -2483,14 +2500,14 @@
 	}
 	if sched.runqsize != 0 {
 		unlock(&sched.lock)
-		startm(pp, false)
+		startm(pp, false, false)
 		return
 	}
 	// If this is the last running P and nobody is polling network,
 	// need to wakeup another M to poll network.
 	if sched.npidle.Load() == gomaxprocs-1 && sched.lastpoll.Load() != 0 {
 		unlock(&sched.lock)
-		startm(pp, false)
+		startm(pp, false, false)
 		return
 	}
 
@@ -2539,7 +2556,7 @@
 	// see at least one running M (ours).
 	unlock(&sched.lock)
 
-	startm(pp, true)
+	startm(pp, true, false)
 
 	releasem(mp)
 }
@@ -3292,8 +3309,8 @@ func injectglist(glist *gList) {
 			break
 		}
 
+		startm(pp, false, true)
 		unlock(&sched.lock)
-		startm(pp, false)
 		releasem(mp)
 	}
 }
@@ -5391,7 +5408,7 @@
 			// See issue 42515 and
 			// https://gnats.netbsd.org/cgi-bin/query-pr-single.pl?number=50094.
 			if next := timeSleepUntil(); next < now {
-				startm(nil, false)
+				startm(nil, false, false)
 			}
 		}
 		if scavenger.sysmonWake.Load() != 0 {
@@ -5663,7 +5680,7 @@ func schedEnableUser(enable bool) {
 		globrunqputbatch(&sched.disable.runnable, n)
 		unlock(&sched.lock)
 		for ; n != 0 && sched.npidle.Load() != 0; n-- {
-			startm(nil, false)
+			startm(nil, false, false)
 		}
 	} else {
 		unlock(&sched.lock)
-- 
2.48.1
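
For readers unfamiliar with the pattern, here is a minimal sketch of the locking
contract the new lockheld argument establishes. It is an analogy only, not
runtime code: sync.Mutex stands in for sched.lock, and the hypothetical
startWorker stands in for startm.

package main

import (
	"fmt"
	"sync"
)

// schedLock stands in for the runtime's sched.lock in this sketch.
var schedLock sync.Mutex

// startWorker mirrors the startm calling contract: it normally acquires and
// releases schedLock itself, but a caller that already holds the lock passes
// lockheld=true, and the function then returns with the lock still held.
// (The real startm may drop and reacquire the lock internally in that case.)
func startWorker(name string, lockheld bool) {
	if !lockheld {
		schedLock.Lock()
		defer schedLock.Unlock()
	}
	// ... work that must be done while schedLock is held ...
	fmt.Println("started", name)
}

func main() {
	// Caller that does not hold the lock (like sysmon or handoffp):
	// startWorker manages locking itself.
	startWorker("worker-1", false)

	// Caller that already holds the lock (like the startIdle closure in
	// injectglist after this change): pass true so startWorker does not
	// re-acquire or prematurely release it.
	schedLock.Lock()
	startWorker("worker-2", true)
	schedLock.Unlock()
}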