From a42e337edee375a91f304bb890e7ec24594058b9 Mon Sep 17 00:00:00 2001 From: Dmitrii Martynov Date: Wed, 2 Apr 2025 13:58:18 +0300 Subject: [PATCH] runtime: explicitly exclude a potential deadlock in the scheduler The following sequence in the scheduler may potentially lead to deadlock: - globrunqget() -> runqput() -> runqputslow() -> globrunqputbatch() However, according to the current logic of the scheduler it is not possible to face the deadlock. The patch explicitly excludes the deadlock, even though it is impossible situation at the moment. Additionally, the "runq" and "globrunq" APIs were partially refactored, which allowed to minimize the usage of these APIs by each other. This will prevent situations described in the CL. Change-Id: I7318f935d285b95522998e0903eaa6193af2ba48 Reviewed-on: https://go-review.googlesource.com/c/go/+/662216 Reviewed-by: Keith Randall Reviewed-by: Keith Randall LUCI-TryBot-Result: Go LUCI Reviewed-by: Dmitri Shuralyov --- src/runtime/proc.go | 77 +++++++++++++++++++++++++++++++-------------- 1 file changed, 53 insertions(+), 24 deletions(-) diff --git a/src/runtime/proc.go b/src/runtime/proc.go index cee91b6ce8..16339decbd 100644 --- a/src/runtime/proc.go +++ b/src/runtime/proc.go @@ -3345,7 +3345,7 @@ top: // by constantly respawning each other. if pp.schedtick%61 == 0 && sched.runqsize > 0 { lock(&sched.lock) - gp := globrunqget(pp, 1) + gp := globrunqget() unlock(&sched.lock) if gp != nil { return gp, false, false @@ -3370,9 +3370,12 @@ top: // global runq if sched.runqsize != 0 { lock(&sched.lock) - gp := globrunqget(pp, 0) + gp, q, qsize := globrunqgetbatch(int32(len(pp.runq)) / 2) unlock(&sched.lock) if gp != nil { + if runqputbatch(pp, &q, qsize); !q.empty() { + throw("Couldn't put Gs into empty local runq") + } return gp, false, false } } @@ -3483,8 +3486,14 @@ top: goto top } if sched.runqsize != 0 { - gp := globrunqget(pp, 0) + gp, q, qsize := globrunqgetbatch(int32(len(pp.runq)) / 2) unlock(&sched.lock) + if gp == nil { + throw("global runq empty with non-zero runqsize") + } + if runqputbatch(pp, &q, qsize); !q.empty() { + throw("Couldn't put Gs into empty local runq") + } return gp, false, false } if !mp.spinning && sched.needspinning.Load() == 1 { @@ -3557,11 +3566,14 @@ top: if sched.runqsize != 0 { pp, _ := pidlegetSpinning(0) if pp != nil { - gp := globrunqget(pp, 0) + gp, q, qsize := globrunqgetbatch(int32(len(pp.runq)) / 2) + unlock(&sched.lock) if gp == nil { throw("global runq empty with non-zero runqsize") } - unlock(&sched.lock) + if runqputbatch(pp, &q, qsize); !q.empty() { + throw("Couldn't put Gs into empty local runq") + } acquirep(pp) mp.becomeSpinning() return gp, false, false @@ -3990,7 +4002,12 @@ func injectglist(glist *gList) { } if !q.empty() { - runqputbatch(pp, &q, qsize) + qsize = int(runqputbatch(pp, &q, int32(qsize))) + if !q.empty() { + lock(&sched.lock) + globrunqputbatch(&q, int32(qsize)) + unlock(&sched.lock) + } } // Some P's might have become idle after we loaded `sched.npidle` @@ -6603,35 +6620,48 @@ func globrunqputbatch(batch *gQueue, n int32) { *batch = gQueue{} } -// Try get a batch of G's from the global runnable queue. +// Try get a single G from the global runnable queue. // sched.lock must be held. -func globrunqget(pp *p, max int32) *g { +func globrunqget() *g { assertLockHeld(&sched.lock) if sched.runqsize == 0 { return nil } + sched.runqsize-- + + return sched.runq.pop() +} + +// Try get a batch of G's from the global runnable queue. +// sched.lock must be held. +func globrunqgetbatch(max int32) (gp *g, q gQueue, qsize int32) { + assertLockHeld(&sched.lock) + + if sched.runqsize == 0 { + return + } + n := sched.runqsize/gomaxprocs + 1 if n > sched.runqsize { n = sched.runqsize } - if max > 0 && n > max { + if n > max { n = max } - if n > int32(len(pp.runq))/2 { - n = int32(len(pp.runq)) / 2 - } sched.runqsize -= n - gp := sched.runq.pop() + gp = sched.runq.pop() n-- + + qsize = n for ; n > 0; n-- { gp1 := sched.runq.pop() - runqput(pp, gp1, false) + q.pushBack(gp1) } - return gp + return } // pMask is an atomic bitstring with one bit per P. @@ -6860,10 +6890,12 @@ func runqputslow(pp *p, gp *g, h, t uint32) bool { } // runqputbatch tries to put all the G's on q on the local runnable queue. -// If the queue is full, they are put on the global queue; in that case -// this will temporarily acquire the scheduler lock. +// If the local runq is full the updated size of the input queue will be returned. // Executed only by the owner P. -func runqputbatch(pp *p, q *gQueue, qsize int) { +func runqputbatch(pp *p, q *gQueue, qsize int32) int32 { + if qsize == 0 { + return 0 + } h := atomic.LoadAcq(&pp.runqhead) t := pp.runqtail n := uint32(0) @@ -6873,7 +6905,7 @@ func runqputbatch(pp *p, q *gQueue, qsize int) { t++ n++ } - qsize -= int(n) + qsize -= int32(n) if randomizeScheduler { off := func(o uint32) uint32 { @@ -6886,11 +6918,8 @@ func runqputbatch(pp *p, q *gQueue, qsize int) { } atomic.StoreRel(&pp.runqtail, t) - if !q.empty() { - lock(&sched.lock) - globrunqputbatch(q, int32(qsize)) - unlock(&sched.lock) - } + + return qsize } // Get g from local runnable queue. -- 2.51.0