From 82d14d77daab2ee61bba1af3c291cba47010657a Mon Sep 17 00:00:00 2001
From: Austin Clements
Date: Mon, 19 Oct 2015 13:46:32 -0400
Subject: [PATCH] runtime: perform concurrent scan in GC workers
MIME-Version: 1.0
Content-Type: text/plain; charset=utf8
Content-Transfer-Encoding: 8bit

Currently the concurrent root scan is performed in its entirety by the
GC coordinator before entering concurrent mark (which enables GC
workers). This scan is done sequentially, which can prolong the scan
phase, delay the mark phase, and means that the scan phase does not
obey the 25% CPU goal. Furthermore, there's no need to complete the
root scan before starting marking (in fact, we already allow GC
assists to happen during the scan phase), so this acts as an
unnecessary barrier between root scanning and marking.

This change shifts the root scan work out of the GC coordinator and
into the GC workers. The coordinator simply sets up the scan state and
enqueues the right number of root scan jobs. The GC workers then drain
the root scan jobs prior to draining heap scan jobs. This parallelizes
the root scan process, makes it obey the 25% CPU goal, and effectively
eliminates root scanning as an isolated phase, allowing the system to
smoothly transition from root scanning to heap marking. This also
eliminates a major non-STW responsibility of the GC coordinator, which
will make it easier to switch to a decentralized state machine.
Finally, it puts us in a good position to perform root scanning in
assists as well, which will help satisfy assists at the beginning of
the GC cycle.

This is mostly straightforward. One tricky aspect is that we have to
deal with preemption deadlock: two non-preemptible goroutines may try
to preempt each other to perform a stack scan. Given the context where
this happens, the only instance of this is two background workers
trying to scan each other. We avoid this by simply not scanning the
stacks of background workers during the concurrent phase; this is safe
because we'll scan them during mark termination (and their stacks are
*very* small and should not contain any new pointers).

This change also switches the root marking during mark termination to
use the same gcDrain-based code path as concurrent mark. This
shouldn't affect performance because STW root marking was already
parallel and tasks switched to heap marking immediately when no more
root marking tasks were available. However, it simplifies the code and
unifies these code paths.

This has negligible effect on the go1 benchmarks. It slightly slows
down the garbage benchmark, possibly by making GC run slightly more
frequently.
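The hand-off described above is carried by two counters added to the
work struct in the diff below: the coordinator publishes the number of
root jobs in work.markrootJobs while the world is stopped, and each
worker claims jobs by atomically advancing work.markrootNext until it
passes that count. The following is a minimal, self-contained sketch of
that claiming discipline in ordinary Go, with sync/atomic standing in
for the runtime-internal xadd; the job count and four-worker pool are
made up for illustration and are not taken from the patch.

    package main

    import (
    	"fmt"
    	"sync"
    	"sync/atomic"
    )

    // markrootNext and markrootJobs model the two fields this patch
    // adds to the work struct. The coordinator stores the job count
    // once, before workers start; workers only ever advance
    // markrootNext.
    var (
    	markrootNext uint32
    	markrootJobs uint32 = 8
    )

    // drainRoots mirrors the loop added to gcDrain: an atomic
    // increment hands each root index to exactly one worker, with no
    // lock and no central dispatcher. When the cursor passes the job
    // count, the worker falls through to heap marking (elided here).
    func drainRoots(scan func(job uint32)) {
    	for {
    		job := atomic.AddUint32(&markrootNext, 1) - 1
    		if job >= markrootJobs {
    			return
    		}
    		scan(job)
    	}
    }

    func main() {
    	var wg sync.WaitGroup
    	for w := 0; w < 4; w++ { // hypothetical pool of 4 workers
    		wg.Add(1)
    		go func(w int) {
    			defer wg.Done()
    			drainRoots(func(job uint32) {
    				fmt.Printf("worker %d scanned root %d\n", w, job)
    			})
    		}(w)
    	}
    	wg.Wait()
    }

Because the increment is atomic, each job index is handed to exactly
one worker, so no lock or dispatcher is needed, and a worker moves on
to heap marking the moment the root jobs run out, which is why root
scanning stops being an isolated phase.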
name                      old time/op    new time/op    delta
XBenchGarbage-12            5.10ms ± 1%    5.24ms ± 1%  +2.87%  (p=0.000 n=18+18)

name                      old time/op    new time/op    delta
BinaryTree17-12              3.25s ± 3%     3.20s ± 5%  -1.57%  (p=0.013 n=20+20)
Fannkuch11-12                2.45s ± 1%     2.46s ± 1%  +0.38%  (p=0.019 n=20+18)
FmtFprintfEmpty-12          49.7ns ± 3%    49.9ns ± 4%    ~     (p=0.851 n=19+20)
FmtFprintfString-12          170ns ± 2%     170ns ± 1%    ~     (p=0.775 n=20+19)
FmtFprintfInt-12             161ns ± 1%     160ns ± 1%  -0.78%  (p=0.000 n=19+18)
FmtFprintfIntInt-12          267ns ± 1%     270ns ± 1%  +1.04%  (p=0.000 n=19+19)
FmtFprintfPrefixedInt-12     238ns ± 2%     238ns ± 1%    ~     (p=0.133 n=18+19)
FmtFprintfFloat-12           311ns ± 1%     310ns ± 2%  -0.35%  (p=0.023 n=20+19)
FmtManyArgs-12              1.08µs ± 1%    1.06µs ± 1%  -2.31%  (p=0.000 n=20+20)
GobDecode-12                8.65ms ± 1%    8.63ms ± 1%    ~     (p=0.377 n=18+20)
GobEncode-12                6.49ms ± 1%    6.52ms ± 1%  +0.37%  (p=0.015 n=20+20)
Gzip-12                      319ms ± 3%     318ms ± 1%    ~     (p=0.975 n=19+17)
Gunzip-12                   41.9ms ± 1%    42.1ms ± 2%  +0.65%  (p=0.004 n=19+20)
HTTPClientServer-12         61.7µs ± 1%    62.6µs ± 1%  +1.40%  (p=0.000 n=18+20)
JSONEncode-12               16.8ms ± 1%    16.9ms ± 1%    ~     (p=0.239 n=20+18)
JSONDecode-12               58.4ms ± 1%    60.7ms ± 1%  +3.85%  (p=0.000 n=19+20)
Mandelbrot200-12            3.86ms ± 0%    3.86ms ± 1%    ~     (p=0.092 n=18+19)
GoParse-12                  3.75ms ± 2%    3.75ms ± 2%    ~     (p=0.708 n=19+20)
RegexpMatchEasy0_32-12       100ns ± 1%     100ns ± 2%  +0.60%  (p=0.010 n=17+20)
RegexpMatchEasy0_1K-12       341ns ± 1%     342ns ± 2%    ~     (p=0.203 n=20+19)
RegexpMatchEasy1_32-12      82.5ns ± 2%    83.2ns ± 2%  +0.83%  (p=0.007 n=19+19)
RegexpMatchEasy1_1K-12       495ns ± 1%     495ns ± 2%    ~     (p=0.970 n=19+18)
RegexpMatchMedium_32-12      130ns ± 2%     130ns ± 2%  +0.59%  (p=0.039 n=19+20)
RegexpMatchMedium_1K-12     39.2µs ± 1%    39.3µs ± 1%    ~     (p=0.214 n=18+18)
RegexpMatchHard_32-12       2.03µs ± 2%    2.02µs ± 1%    ~     (p=0.166 n=18+19)
RegexpMatchHard_1K-12       61.0µs ± 1%    60.9µs ± 1%    ~     (p=0.169 n=20+18)
Revcomp-12                   533ms ± 1%     535ms ± 1%    ~     (p=0.071 n=19+17)
Template-12                 68.1ms ± 2%    73.0ms ± 1%  +7.26%  (p=0.000 n=19+20)
TimeParse-12                 355ns ± 2%     356ns ± 2%    ~     (p=0.530 n=19+20)
TimeFormat-12                357ns ± 2%     347ns ± 1%  -2.59%  (p=0.000 n=20+19)
[Geo mean]                  62.1µs         62.3µs       +0.31%

name                      old speed      new speed      delta
GobDecode-12              88.7MB/s ± 1%  88.9MB/s ± 1%    ~     (p=0.377 n=18+20)
GobEncode-12               118MB/s ± 1%   118MB/s ± 1%  -0.37%  (p=0.015 n=20+20)
Gzip-12                   60.9MB/s ± 3%  60.9MB/s ± 1%    ~     (p=0.944 n=19+17)
Gunzip-12                  464MB/s ± 1%   461MB/s ± 2%  -0.64%  (p=0.004 n=19+20)
JSONEncode-12              115MB/s ± 1%   115MB/s ± 1%    ~     (p=0.236 n=20+18)
JSONDecode-12             33.2MB/s ± 1%  32.0MB/s ± 1%  -3.71%  (p=0.000 n=19+20)
GoParse-12                15.5MB/s ± 2%  15.5MB/s ± 2%    ~     (p=0.702 n=19+20)
RegexpMatchEasy0_32-12     320MB/s ± 1%   318MB/s ± 2%    ~     (p=0.094 n=18+20)
RegexpMatchEasy0_1K-12    3.00GB/s ± 1%  2.99GB/s ± 1%    ~     (p=0.194 n=20+19)
RegexpMatchEasy1_32-12     388MB/s ± 2%   385MB/s ± 2%  -0.83%  (p=0.008 n=19+19)
RegexpMatchEasy1_1K-12    2.07GB/s ± 1%  2.07GB/s ± 1%    ~     (p=0.964 n=19+18)
RegexpMatchMedium_32-12   7.68MB/s ± 1%  7.64MB/s ± 2%  -0.57%  (p=0.020 n=19+20)
RegexpMatchMedium_1K-12   26.1MB/s ± 1%  26.1MB/s ± 1%    ~     (p=0.211 n=18+18)
RegexpMatchHard_32-12     15.8MB/s ± 1%  15.8MB/s ± 1%    ~     (p=0.180 n=18+19)
RegexpMatchHard_1K-12     16.8MB/s ± 1%  16.8MB/s ± 2%    ~     (p=0.236 n=20+19)
Revcomp-12                 477MB/s ± 1%   475MB/s ± 1%    ~     (p=0.071 n=19+17)
Template-12               28.5MB/s ± 2%  26.6MB/s ± 1%  -6.77%  (p=0.000 n=19+20)
[Geo mean]                 100MB/s        99.0MB/s     -0.82%

Change-Id: I875bf6ceb306d1ee2f470cabf88aa6ede27c47a0
Reviewed-on: https://go-review.googlesource.com/16059
Reviewed-by: Rick Hudson
Run-TryBot: Austin Clements
TryBot-Result: Gobot Gobot
---
 src/runtime/mgc.go     |  63 +++++++++++++----------
 src/runtime/mgcmark.go | 113 ++++++++++++++++++++++++++---------------
 src/runtime/mgcwork.go |   2 +-
 3 files changed, 109 insertions(+), 69 deletions(-)

diff --git a/src/runtime/mgc.go b/src/runtime/mgc.go
index de054dd340..56dcd91739 100644
--- a/src/runtime/mgc.go
+++ b/src/runtime/mgc.go
@@ -228,15 +228,14 @@ var gcBlackenPromptly bool
 
 const (
 	_GCoff             = iota // GC not running; sweeping in background, write barrier disabled
-	_GCscan                   // GC collecting roots into workbufs, write barrier ENABLED
-	_GCmark                   // GC marking from workbufs, write barrier ENABLED
+	_GCmark                   // GC marking roots and workbufs, write barrier ENABLED
 	_GCmarktermination        // GC mark termination: allocate black, P's help GC, write barrier ENABLED
 )
 
 //go:nosplit
 func setGCPhase(x uint32) {
 	atomicstore(&gcphase, x)
-	writeBarrierEnabled = gcphase == _GCmark || gcphase == _GCmarktermination || gcphase == _GCscan
+	writeBarrierEnabled = gcphase == _GCmark || gcphase == _GCmarktermination
 }
 
 // gcMarkWorkerMode represents the mode that a concurrent mark worker
@@ -786,9 +785,13 @@ func (s *bgMarkSignal) clear() {
 }
 
 var work struct {
-	full  uint64                // lock-free list of full blocks workbuf
-	empty uint64                // lock-free list of empty blocks workbuf
-	pad0  [_CacheLineSize]uint8 // prevents false-sharing between full/empty and nproc/nwait
+	full  uint64                // lock-free list of full blocks workbuf
+	empty uint64                // lock-free list of empty blocks workbuf
+	pad0  [_CacheLineSize]uint8 // prevents false-sharing between full/empty and nproc/nwait
+
+	markrootNext uint32 // next markroot job
+	markrootJobs uint32 // number of markroot jobs
+
 	nproc  uint32
 	tstart int64
 	nwait  uint32
@@ -937,7 +940,7 @@ func backgroundgc() {
 func gc(mode gcMode) {
 	// Timing/utilization tracking
 	var stwprocs, maxprocs int32
-	var tSweepTerm, tScan, tMark, tMarkTerm int64
+	var tSweepTerm, tMark, tMarkTerm int64
 
 	// debug.gctrace variables
 	var heap0, heap1, heap2, heapGoal uint64
@@ -990,7 +993,8 @@ func gc(mode gcMode) {
 		heapGoal = gcController.heapGoal
 
 		systemstack(func() {
-			// Enter scan phase and enable write barriers.
+			// Enter concurrent mark phase and enable
+			// write barriers.
 			//
 			// Because the world is stopped, all Ps will
 			// observe that write barriers are enabled by
@@ -1014,13 +1018,14 @@ func gc(mode gcMode) {
 			// allocations are blocked until assists can
 			// happen, we want enable assists as early as
 			// possible.
-			setGCPhase(_GCscan)
+			setGCPhase(_GCmark)
 
 			// markrootSpans uses work.spans, so make sure
 			// it is up to date.
 			gcCopySpans()
 
 			gcBgMarkPrepare() // Must happen before assist enable.
+			gcMarkRootPrepare()
 
 			// At this point all Ps have enabled the write
 			// barrier, thus maintaining the no white to
@@ -1029,26 +1034,22 @@ func gc(mode gcMode) {
 			// mutators.
 			atomicstore(&gcBlackenEnabled, 1)
 
-			// Concurrent scan.
+			// Concurrent mark.
 			startTheWorldWithSema()
 			now = nanotime()
 			pauseNS += now - pauseStart
-			tScan = now
 			gcController.assistStartTime = now
-			gcscan_m()
-
-			// Enter mark phase.
-			setGCPhase(_GCmark)
 		})
 
-		// Concurrent mark.
-		tMark = nanotime()
+		tMark = now
 
 		// Enable background mark workers and wait for
 		// background mark completion.
-		gcController.bgMarkStartTime = nanotime()
+		gcController.bgMarkStartTime = now
 		work.bgMark1.clear()
 		work.bgMark1.wait()
 
+		gcMarkRootCheck()
+
 		// The global work list is empty, but there can still be
 		// work sitting in the per-P work caches and there can be more
 		// objects reachable from global roots since they don't have write
@@ -1095,7 +1096,7 @@ func gc(mode gcMode) {
 		gcController.endCycle()
 	} else {
 		t := nanotime()
-		tScan, tMark, tMarkTerm = t, t, t
+		tMark, tMarkTerm = t, t
 		heapGoal = heap0
 	}
 
@@ -1189,13 +1190,12 @@ func gc(mode gcMode) {
 	memstats.pause_total_ns += uint64(pauseNS)
 
 	// Update work.totaltime.
-	sweepTermCpu := int64(stwprocs) * (tScan - tSweepTerm)
-	scanCpu := tMark - tScan
+	sweepTermCpu := int64(stwprocs) * (tMark - tSweepTerm)
 	// We report idle marking time below, but omit it from the
 	// overall utilization here since it's "free".
 	markCpu := gcController.assistTime + gcController.dedicatedMarkTime + gcController.fractionalMarkTime
 	markTermCpu := int64(stwprocs) * (now - tMarkTerm)
-	cycleCpu := sweepTermCpu + scanCpu + markCpu + markTermCpu
+	cycleCpu := sweepTermCpu + markCpu + markTermCpu
 	work.totaltime += cycleCpu
 
 	// Compute overall GC CPU utilization.
@@ -1218,6 +1218,12 @@ func gc(mode gcMode) {
 		tInstallWB := tMark
 		installWBCpu := int64(0)
 
+		// Scan phase is no longer used.
+		tScan := tInstallWB
+		scanCpu := int64(0)
+
+		// TODO: Clean up the gctrace format.
+
 		var sbuf [24]byte
 		printlock()
 		print("gc ", memstats.numgc,
@@ -1423,6 +1429,9 @@ func gcMarkWorkAvailable(p *p) bool {
 	if atomicload64(&work.full) != 0 {
 		return true // global work available
 	}
+	if work.markrootNext < work.markrootJobs {
+		return true // root scan work available
+	}
 	return false
 }
 
@@ -1458,7 +1467,7 @@ func gcMark(start_time int64) {
 	gcFlushGCWork()
 
 	// Queue root marking jobs.
-	nRoots := gcMarkRootPrepare()
+	gcMarkRootPrepare()
 
 	work.nwait = 0
 	work.ndone = 0
@@ -1468,19 +1477,18 @@ func gcMark(start_time int64) {
 		traceGCScanStart()
 	}
 
-	parforsetup(work.markfor, work.nproc, uint32(nRoots), false, markroot)
 	if work.nproc > 1 {
 		noteclear(&work.alldone)
 		helpgc(int32(work.nproc))
 	}
 
 	gchelperstart()
-	parfordo(work.markfor)
 
 	var gcw gcWork
 	gcDrain(&gcw, gcDrainBlock)
 	gcw.dispose()
+	gcMarkRootCheck()
 
 	if work.full != 0 {
 		throw("work.full != 0")
 	}
@@ -1727,9 +1735,8 @@ func gchelper() {
 		traceGCScanStart()
 	}
 
-	// parallel mark for over GC roots
-	parfordo(work.markfor)
-	if gcphase != _GCscan {
+	// Parallel mark over GC roots and heap
+	if gcphase == _GCmarktermination {
 		var gcw gcWork
 		gcDrain(&gcw, gcDrainBlock) // blocks in getfull
 		gcw.dispose()
diff --git a/src/runtime/mgcmark.go b/src/runtime/mgcmark.go
index 04267dbdb0..7603085fa8 100644
--- a/src/runtime/mgcmark.go
+++ b/src/runtime/mgcmark.go
@@ -22,13 +22,13 @@ const (
 	rootBlockSpans = 8 * 1024 // 64MB worth of spans
 )
 
-// gcMarkRootPrepare initializes scanning-related state and returns
-// the number of roots.
+// gcMarkRootPrepare queues root scanning jobs (stacks, globals, and
+// some miscellany) and initializes scanning-related state.
 //
 // The caller must have call gcCopySpans().
 //
 //go:nowritebarrier
-func gcMarkRootPrepare() int {
+func gcMarkRootPrepare() {
 	// Compute how many data and BSS root blocks there are.
 	nBlocks := func(bytes uintptr) int {
 		return int((bytes + rootBlockBytes - 1) / rootBlockBytes)
 	}
@@ -63,34 +63,17 @@ func gcMarkRootPrepare() int {
 	// allglen isn't changing, so we'll scan all Gs.
 	work.nStackRoots = int(atomicloaduintptr(&allglen))
 
-	return fixedRootCount + work.nDataRoots + work.nBSSRoots + work.nSpanRoots + work.nStackRoots
+	work.markrootNext = 0
+	work.markrootJobs = uint32(fixedRootCount + work.nDataRoots + work.nBSSRoots + work.nSpanRoots + work.nStackRoots)
 }
 
-// Scan all of the stacks, greying (or graying if in America) the referents
-// but not blackening them since the mark write barrier isn't installed.
-//go:nowritebarrier
-func gcscan_m() {
-	_g_ := getg()
-
-	// Grab the g that called us and potentially allow rescheduling.
-	// This allows it to be scanned like other goroutines.
-	mastergp := _g_.m.curg
-	casgstatus(mastergp, _Grunning, _Gwaiting)
-	mastergp.waitreason = "garbage collection scan"
-
-	// Span sweeping has been done by finishsweep_m.
-	// Long term we will want to make this goroutine runnable
-	// by placing it onto a scanenqueue state and then calling
-	// runtime·restartg(mastergp) to make it Grunnable.
-	// At the bottom we will want to return this p back to the scheduler.
-
-	nroots := gcMarkRootPrepare()
-
-	work.ndone = 0
-	useOneP := uint32(1) // For now do not do this in parallel.
-	// ackgcphase is not needed since we are not scanning running goroutines.
-	parforsetup(work.markfor, useOneP, uint32(nroots), false, markroot)
-	parfordo(work.markfor)
+// gcMarkRootCheck checks that all roots have been scanned. It is
+// purely for debugging.
+func gcMarkRootCheck() {
+	if work.markrootNext < work.markrootJobs {
+		print(work.markrootNext, " of ", work.markrootJobs, " markroot jobs done\n")
+		throw("left over markroot jobs")
+	}
 
 	lock(&allglock)
 	// Check that gc work is done.
@@ -101,14 +84,15 @@ func gcscan_m() {
 		}
 	}
 	unlock(&allglock)
-
-	casgstatus(mastergp, _Gwaiting, _Grunning)
-	// Let the g that called us continue to run.
 }
 
 // ptrmask for an allocation containing a single pointer.
 var oneptrmask = [...]uint8{1}
 
+// markroot scans the i'th root.
+//
+// Preemption must be disabled (because this uses a gcWork).
+//
 //go:nowritebarrier
 func markroot(desc *parfor, i uint32) {
 	// TODO: Consider using getg().m.p.ptr().gcw.
@@ -137,7 +121,7 @@ func markroot(desc *parfor, i uint32) {
 		}
 
 	case i == fixedRootFlushCaches:
-		if gcphase != _GCscan { // Do not flush mcaches during GCscan phase.
+		if gcphase == _GCmarktermination { // Do not flush mcaches during concurrent phase.
 			flushallmcaches()
 		}
@@ -167,7 +151,43 @@ func markroot(desc *parfor, i uint32) {
 			shrinkstack(gp)
 		}
 
-		scang(gp)
+		if gcphase != _GCmarktermination && gp.startpc == gcBgMarkWorkerPC {
+			// GC background workers may be
+			// non-preemptible, so we may deadlock if we
+			// try to scan them during a concurrent phase.
+			// They also have tiny stacks, so just ignore
+			// them until mark termination.
+			gp.gcscandone = true
+			break
+		}
+
+		// scang must be done on the system stack in case
+		// we're trying to scan our own stack.
+		systemstack(func() {
+			// If this is a self-scan, put the user G in
+			// _Gwaiting to prevent self-deadlock. It may
+			// already be in _Gwaiting if this is mark
+			// termination.
+			userG := getg().m.curg
+			selfScan := gp == userG && readgstatus(userG) == _Grunning
+			if selfScan {
+				casgstatus(userG, _Grunning, _Gwaiting)
+				userG.waitreason = "garbage collection scan"
+			}
+
+			// TODO: scang blocks until gp's stack has
+			// been scanned, which may take a while for
+			// running goroutines. Consider doing this in
+			// two phases where the first is non-blocking:
+			// we scan the stacks we can and ask running
+			// goroutines to scan themselves; and the
+			// second blocks.
+			scang(gp)
+
+			if selfScan {
+				casgstatus(userG, _Gwaiting, _Grunning)
+			}
+		})
 	}
 
 	gcw.dispose()
@@ -481,7 +501,7 @@ func scanstack(gp *g) {
 		sp = gp.sched.sp
 	}
 	switch gcphase {
-	case _GCscan:
+	case _GCmark:
 		// Install stack barriers during stack scan.
 		barrierOffset = uintptr(firstStackBarrierOffset)
 		nextBarrier = sp + barrierOffset
@@ -505,7 +525,7 @@ func scanstack(gp *g) {
 		} else {
 			// Only re-scan up to the lowest un-hit
 			// barrier. Any frames above this have not
-			// executed since the _GCscan scan of gp and
+			// executed since the concurrent scan of gp and
 			// any writes through up-pointers to above
 			// this barrier had write barriers.
 			nextBarrier = gp.stkbar[gp.stkbarPos].savedLRPtr
@@ -530,7 +550,7 @@ func scanstack(gp *g) {
 			// We skip installing a barrier on bottom-most
 			// frame because on LR machines this LR is not
 			// on the stack.
-			if gcphase == _GCscan && n != 0 {
+			if gcphase == _GCmark && n != 0 {
 				if gcInstallStackBarrier(gp, frame) {
 					barrierOffset *= 2
 					nextBarrier = sp + barrierOffset
@@ -640,8 +660,8 @@ const (
 	gcDrainBlock gcDrainFlags = 0
 )
 
-// gcDrain scans objects in work buffers, blackening grey objects
-// until all work buffers have been drained.
+// gcDrain scans roots and objects in work buffers, blackening grey
+// objects until all roots and work buffers have been drained.
 //
 // If flags&gcDrainUntilPreempt != 0, gcDrain also returns if
 // g.preempt is set. Otherwise, this will block until all dedicated
@@ -656,12 +676,25 @@ func gcDrain(gcw *gcWork, flags gcDrainFlags) {
 		throw("gcDrain phase incorrect")
 	}
 
+	gp := getg()
 	blocking := flags&gcDrainUntilPreempt == 0
 	flushBgCredit := flags&gcDrainFlushBgCredit != 0
 
+	// Drain root marking jobs.
+	if work.markrootNext < work.markrootJobs {
+		for blocking || !gp.preempt {
+			job := xadd(&work.markrootNext, +1) - 1
+			if job >= work.markrootJobs {
+				break
+			}
+			// TODO: Pass in gcw.
+			markroot(nil, job)
+		}
+	}
+
 	initScanWork := gcw.scanWork
 
-	gp := getg()
+	// Drain heap marking jobs.
 	for blocking || !gp.preempt {
 		// If another proc wants a pointer, give it some.
 		if work.nwait > 0 && work.full == 0 {
diff --git a/src/runtime/mgcwork.go b/src/runtime/mgcwork.go
index 4d305e25df..41edb48954 100644
--- a/src/runtime/mgcwork.go
+++ b/src/runtime/mgcwork.go
@@ -371,7 +371,7 @@ func getfull(entry int) *workbuf {
 			throw("work.nwait > work.nproc")
 		}
 	}
-	if work.nwait == work.nproc {
+	if work.nwait == work.nproc && work.markrootNext >= work.markrootJobs {
 		return nil
 	}
 	_g_ := getg()
-- 
2.48.1
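A note on the one-line mgcwork.go change above, which is easy to
overlook but load-bearing: once root jobs are drained by workers rather
than by the coordinator, "all workers are idle" no longer implies
"marking is done", because an unclaimed root job can still generate
heap work. Below is a toy model of the strengthened termination test;
the workState type and the values in main are invented for
illustration, and only the shape of the condition mirrors getfull.

    package main

    import (
    	"fmt"
    	"sync/atomic"
    )

    // workState is a stand-in for the runtime's work struct; the
    // field names mirror it, but this type is invented here.
    type workState struct {
    	markrootNext uint32 // next root job to hand out
    	markrootJobs uint32 // total root jobs queued this cycle
    	nwait        uint32 // workers currently waiting for work
    	nproc        uint32 // total workers
    }

    // markDone models the patched condition in getfull: marking can
    // only be complete if every worker is idle AND every root job has
    // been claimed, since an unclaimed root job may still produce
    // heap work.
    func markDone(w *workState) bool {
    	return atomic.LoadUint32(&w.nwait) == w.nproc &&
    		atomic.LoadUint32(&w.markrootNext) >= w.markrootJobs
    }

    func main() {
    	w := &workState{markrootJobs: 4, nproc: 3, nwait: 3}
    	fmt.Println(markDone(w)) // false: workers idle, roots unclaimed

    	atomic.AddUint32(&w.markrootNext, 4) // all root jobs claimed
    	fmt.Println(markDone(w))             // true: safe to terminate
    }

In the real runtime these loads race with workers going idle and
claiming jobs, so getfull evaluates the condition inside its wait loop
rather than trusting a single observation.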