From 90a19961f2b885ee253b2d51820977dc9c635f0c Mon Sep 17 00:00:00 2001
From: Rick Hudson
Date: Mon, 1 Jun 2015 18:16:03 -0400
Subject: [PATCH] runtime: reduce latency by aggressively ending mark phase

Some latency regressions have crept into our system over the past few
weeks. This CL fixes those by having the mark phase more aggressively
blacken objects so that the mark termination phase, a STW phase, has
less work to do.

Three approaches are taken when the mark phase believes it has no more
work to do, i.e. all the work buffers are empty. If things have gone
well, the mark phase is correct and there is in fact little or no work.
In that case the following items will take very little time. If the
mark phase is wrong, this CL will ferret that work out and give the
mark phase a chance to deal with it concurrently before mark
termination begins.

When the mark phase first appears to be out of work, it does three
things:
1) It switches from allocating white to allocating black to reduce the
   number of unmarked objects reachable only from stacks.
2) It flushes and disables per-P GC work caches so all work must be in
   globally visible work buffers.
3) It rescans the global roots---the BSS and data segments---so there
   are fewer objects to blacken during mark termination. We do not
   rescan stacks at this point, though that could be done in a later
   CL.
After these steps, it again drains the global work buffers.

On a lightly loaded machine the garbage benchmark shows the number of
GC cycles with latency > 10 ms reduced from 83 out of 4083 cycles down
to 2 out of 3995 cycles. Maximum latency was reduced from 60+ ms down
to 20 ms.

Change-Id: I152285b48a7e56c5083a02e8e4485dd39c990492
Reviewed-on: https://go-review.googlesource.com/10590
Reviewed-by: Austin Clements
---
 src/runtime/malloc.go  |   2 +-
 src/runtime/mgc.go     | 163 ++++++++++++++++++++++++++++++-----------
 src/runtime/mgcmark.go |  56 ++++++++++----
 src/runtime/mgcwork.go |  20 ++++-
 4 files changed, 177 insertions(+), 64 deletions(-)

diff --git a/src/runtime/malloc.go b/src/runtime/malloc.go
index 25371ab776..5872a3752e 100644
--- a/src/runtime/malloc.go
+++ b/src/runtime/malloc.go
@@ -663,7 +663,7 @@ func mallocgc(size uintptr, typ *_type, flags uint32) unsafe.Pointer {
 	// All slots hold nil so no scanning is needed.
 	// This may be racing with GC so do it atomically if there can be
 	// a race marking the bit.
-	if gcphase == _GCmarktermination {
+	if gcphase == _GCmarktermination || gcBlackenPromptly {
 		systemstack(func() {
 			gcmarknewobject_m(uintptr(x), size)
 		})
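The malloc.go change above is the allocate-black half of the approach: a
new object is marked at allocation time not only during STW mark
termination but also once the concurrent mark phase is winding down. A
minimal standalone sketch of that decision, using simplified stand-in
names (gcPhase, shouldAllocateBlack) rather than the runtime's real
internals:

package main

import "fmt"

type gcPhase int

const (
	gcOff gcPhase = iota
	gcMark
	gcMarkTermination
)

// shouldAllocateBlack mirrors the condition added to mallocgc: mark a
// new object immediately if the world is stopped for mark termination,
// or if the mark phase is ending and wants no new white objects.
func shouldAllocateBlack(phase gcPhase, blackenPromptly bool) bool {
	return phase == gcMarkTermination || blackenPromptly
}

func main() {
	fmt.Println(shouldAllocateBlack(gcMark, false)) // false: allocate white; may die before being marked
	fmt.Println(shouldAllocateBlack(gcMark, true))  // true: mark phase is ending; allocate black
}

Allocating white lets short-lived objects be collected in the current
cycle; allocating black trades that away to shrink the STW mark
termination, which is why the patch only flips to black near the end of
the mark phase.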
diff --git a/src/runtime/mgc.go b/src/runtime/mgc.go
index 5103739497..6289cb57a2 100644
--- a/src/runtime/mgc.go
+++ b/src/runtime/mgc.go
@@ -225,6 +225,21 @@ var writeBarrierEnabled bool // compiler emits references to this in write barri
 // gcphase == _GCmark.
 var gcBlackenEnabled uint32
 
+// gcBlackenPromptly indicates that optimizations that may
+// hide work from the global work queue should be disabled.
+//
+// If gcBlackenPromptly is true, per-P gcWork caches should
+// be flushed immediately and new objects should be allocated black.
+//
+// There is a tension between allocating objects white and
+// allocating them black. If white and the objects die before being
+// marked they can be collected during this GC cycle. On the other
+// hand allocating them black will reduce _GCmarktermination latency
+// since more work is done in the mark phase. This tension is resolved
+// by allocating white until the mark phase is approaching its end and
+// then allocating black for the remainder of the mark phase.
+var gcBlackenPromptly bool
+
 const (
 	_GCoff             = iota // GC not running, write barrier disabled
 	_GCstw                    // unused state
@@ -547,7 +562,7 @@ func (c *gcControllerState) findRunnableGCWorker(_p_ *p) *g {
 	if _p_.gcBgMarkWorker == nil {
 		throw("gcControllerState.findRunnable: no background mark worker")
 	}
-	if work.bgMarkDone != 0 {
+	if work.bgMark1.done != 0 && work.bgMark2.done != 0 {
 		// Background mark is done. Don't schedule background
 		// mark worker any more. (This is not just an
 		// optimization. Without this we can spin scheduling
@@ -667,6 +682,51 @@ func shouldtriggergc() bool {
 	return memstats.heap_live >= memstats.next_gc && atomicloaduint(&bggc.working) == 0
 }
 
+// bgMarkSignal synchronizes the GC coordinator and background mark workers.
+type bgMarkSignal struct {
+	// Workers race to cas to 1. Winner signals coordinator.
+	done uint32
+	// Coordinator to wake up.
+	lock mutex
+	g    *g
+	wake bool
+}
+
+func (s *bgMarkSignal) wait() {
+	lock(&s.lock)
+	if s.wake {
+		// Wakeup already happened
+		unlock(&s.lock)
+	} else {
+		s.g = getg()
+		goparkunlock(&s.lock, "mark wait (idle)", traceEvGoBlock, 1)
+	}
+	s.wake = false
+	s.g = nil
+}
+
+// complete signals the completion of this phase of marking. This can
+// be called multiple times during a cycle; only the first call has
+// any effect.
+func (s *bgMarkSignal) complete() {
+	if cas(&s.done, 0, 1) {
+		// This is the first worker to reach this completion point.
+		// Signal the main GC goroutine.
+		lock(&s.lock)
+		if s.g == nil {
+			// It hasn't parked yet.
+			s.wake = true
+		} else {
+			ready(s.g, 0)
+		}
+		unlock(&s.lock)
+	}
+}
+
+func (s *bgMarkSignal) clear() {
+	s.done = 0
+}
+
 var work struct {
 	full    uint64 // lock-free list of full blocks workbuf
 	empty   uint64 // lock-free list of empty blocks workbuf
@@ -681,13 +741,11 @@ var work struct {
 	bgMarkReady note   // signal background mark worker has started
-	bgMarkDone  uint32 // cas to 1 when at a background mark completion point
-
-	// Background mark completion signaling
-	bgMarkWake struct {
-		lock mutex
-		g    *g
-		wake bool
-	}
+
+	// Coordination for the 2 parts of the mark phase.
+	bgMark1 bgMarkSignal
+	bgMark2 bgMarkSignal
 
 	// Copy of mheap.allspans for marker or sweeper.
 	spans []*mspan
@@ -903,16 +961,31 @@ func gc(mode int) {
 	}
 
 	// Wait for background mark completion.
-	lock(&work.bgMarkWake.lock)
-	if work.bgMarkWake.wake {
-		// Wakeup already happened
-		unlock(&work.bgMarkWake.lock)
+	work.bgMark1.wait()
+
+	// The global work list is empty, but there can still be work
+	// sitting in the per-P work caches and there can be more
+	// objects reachable from global roots since they don't have write
+	// barriers. Rescan some roots and flush work caches.
+	systemstack(func() {
+		// rescan global data and bss.
+		markroot(nil, _RootData)
+		markroot(nil, _RootBss)
+		forEachP(func(_p_ *p) {
+			_p_.gcw.dispose()
+		})
+	})
+
+	if atomicload64(&work.full) != 0 || atomicload64(&work.partial) != 0 {
+		if work.bgMark2.done != 0 {
+			throw("work.bgMark2.done != 0")
+		}
+		gcBlackenPromptly = true
+		// Wait for this more aggressive background mark to complete.
+		work.bgMark2.wait()
 	} else {
-		work.bgMarkWake.g = getg()
-		goparkunlock(&work.bgMarkWake.lock, "mark wait (idle)", traceEvGoBlock, 1)
+		work.bgMark2.done = 1
 	}
-	work.bgMarkWake.wake = false
-	work.bgMarkWake.g = nil
 
 	// Begin mark termination.
 	if debug.gctrace > 0 {
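bgMarkSignal is built on runtime-internal primitives (mutex,
goparkunlock, ready) that are unavailable outside the runtime. Its
first-completer-wins semantics, including a completion that may arrive
before the coordinator has blocked, can be modeled in ordinary Go with
an atomic flag and a buffered channel. This is an illustrative
analogue, not the runtime's code:

package main

import (
	"fmt"
	"sync"
	"sync/atomic"
)

// markSignal is a portable analogue of bgMarkSignal: many workers may
// call complete, but only the first wakes the coordinator; wait
// returns once any worker has completed.
type markSignal struct {
	done uint32        // workers race to CAS this to 1
	ch   chan struct{} // buffered so complete never blocks
}

func newMarkSignal() *markSignal {
	return &markSignal{ch: make(chan struct{}, 1)}
}

func (s *markSignal) complete() {
	if atomic.CompareAndSwapUint32(&s.done, 0, 1) {
		s.ch <- struct{}{} // first completer wins; signal the coordinator
	}
}

func (s *markSignal) wait() { <-s.ch }

func main() {
	s := newMarkSignal()
	var wg sync.WaitGroup
	for i := 0; i < 4; i++ {
		wg.Add(1)
		go func() { defer wg.Done(); s.complete() }() // all race; exactly one signals
	}
	s.wait()
	wg.Wait()
	fmt.Println("mark phase complete")
}

The buffered channel plays the role of the wake flag in the runtime's
version: a completion recorded before the coordinator calls wait is
not lost.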
@@ -945,6 +1018,7 @@ func gc(mode int) {
 	// World is stopped.
 	// Start marktermination which includes enabling the write barrier.
 	atomicstore(&gcBlackenEnabled, 0)
+	gcBlackenPromptly = false
 	setGCPhase(_GCmarktermination)
 
 	if debug.gctrace > 0 {
@@ -1119,10 +1193,9 @@ func gcBgMarkPrepare() {
 	work.nproc = ^uint32(0)
 	work.nwait = ^uint32(0)
 
-	// Background GC and assists race to set this to 1 on
-	// completion so that this only gets one "done" signal.
-	work.bgMarkDone = 0
-
+	// Reset background mark completion points.
+	work.bgMark1.clear()
+	work.bgMark2.clear()
 	gcController.bgMarkStartTime = nanotime()
 }
 
@@ -1169,7 +1242,11 @@ func gcBgMarkWorker(p *p) {
 
 		startTime := nanotime()
 
-		xadd(&work.nwait, -1)
+		decnwait := xadd(&work.nwait, -1)
+		if decnwait == work.nproc {
+			println("runtime: work.nwait=", decnwait, "work.nproc=", work.nproc)
+			throw("work.nwait was > work.nproc")
+		}
 
 		done := false
 		switch p.gcMarkWorkerMode {
@@ -1185,21 +1262,37 @@ func gcBgMarkWorker(p *p) {
 			gcDrainUntilPreempt(&p.gcw, gcBgCreditSlack)
 			// Was this the last worker and did we run out
 			// of work?
-			done = xadd(&work.nwait, +1) == work.nproc && work.full == 0 && work.partial == 0
+			incnwait := xadd(&work.nwait, +1)
+			if incnwait > work.nproc {
+				println("runtime: p.gcMarkWorkerMode=", p.gcMarkWorkerMode,
+					"work.nwait=", incnwait, "work.nproc=", work.nproc)
+				throw("work.nwait > work.nproc")
+			}
+			done = incnwait == work.nproc && work.full == 0 && work.partial == 0
 		}
-		// We're not in mark termination, so there's no need
-		// to dispose p.gcw.
+
+		// If we are near the end of the mark phase dispose of p.gcw.
+		if gcBlackenPromptly {
+			p.gcw.dispose()
+		}
 
 		// If this worker reached a background mark completion
 		// point, signal the main GC goroutine.
 		if done {
-			gcBgMarkDone()
+			if gcBlackenPromptly {
+				if work.bgMark1.done == 0 {
+					throw("completing mark 2, but bgMark1.done == 0")
+				}
+				work.bgMark2.complete()
+			} else {
+				work.bgMark1.complete()
+			}
 		}
 
 		duration := nanotime() - startTime
 		switch p.gcMarkWorkerMode {
 		case gcMarkWorkerDedicatedMode:
 			xaddint64(&gcController.dedicatedMarkTime, duration)
+			xaddint64(&gcController.dedicatedMarkWorkersNeeded, 1)
 		case gcMarkWorkerFractionalMode:
 			xaddint64(&gcController.fractionalMarkTime, duration)
 			xaddint64(&gcController.fractionalMarkWorkersNeeded, 1)
@@ -1209,26 +1302,8 @@ func gcBgMarkWorker(p *p) {
 	}
 }
 
-// gcBgMarkDone signals the completion of background marking. This can
-// be called multiple times during a cycle; only the first call has
-// any effect.
-func gcBgMarkDone() {
-	if cas(&work.bgMarkDone, 0, 1) {
-		// This is the first worker to reach completion.
-		// Signal the main GC goroutine.
-		lock(&work.bgMarkWake.lock)
-		if work.bgMarkWake.g == nil {
-			// It hasn't parked yet.
-			work.bgMarkWake.wake = true
-		} else {
-			ready(work.bgMarkWake.g, 0)
-		}
-		unlock(&work.bgMarkWake.lock)
-	}
-}
-
-// gcMarkWorkAvailable determines if mark work is readily available.
-// It is used by the scheduler to decide if this p run a mark work.
+// gcMarkWorkAvailable returns true if executing a mark worker
+// on p is potentially useful.
 func gcMarkWorkAvailable(p *p) bool {
 	if !p.gcw.empty() {
 		return true
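The decrement/increment pairs around draining implement the worker
done-detection protocol: a worker leaves the idle count while it
drains, rejoins it when it stops, and the worker whose increment
brings the idle count back to nproc while the global buffers are empty
declares the completion point. The following self-contained sketch
models that check; nproc, nwait, and queue are stand-ins for
work.nproc, work.nwait, and the global workbuf lists, not the real
runtime structures:

package main

import (
	"fmt"
	"sync"
	"sync/atomic"
)

const nproc = 4

var (
	nwait int32 = nproc // all workers start idle
	queue       = make(chan int, 64)
	done  uint32
)

func worker(wg *sync.WaitGroup) {
	defer wg.Done()
	atomic.AddInt32(&nwait, -1) // going active
	for {
		select {
		case w := <-queue:
			_ = w // "scan" the work item
		default:
			// No work visible: go idle. If we are the last active
			// worker and the queue is still empty, declare completion.
			if atomic.AddInt32(&nwait, 1) == nproc && len(queue) == 0 {
				atomic.StoreUint32(&done, 1)
			}
			return
		}
	}
}

func main() {
	for i := 0; i < 16; i++ {
		queue <- i
	}
	var wg sync.WaitGroup
	for i := 0; i < nproc; i++ {
		wg.Add(1)
		go worker(&wg)
	}
	wg.Wait()
	fmt.Println("done =", atomic.LoadUint32(&done))
}

As in the patch, this check is optimistic: work can still hide in
per-worker caches, which is exactly why the coordinator double-checks
with a rescan, a cache flush, and a second mark pass.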
diff --git a/src/runtime/mgcmark.go b/src/runtime/mgcmark.go
index b2fbc97615..57dc2560dd 100644
--- a/src/runtime/mgcmark.go
+++ b/src/runtime/mgcmark.go
@@ -31,9 +31,10 @@ func gcscan_m() {
 	work.nwait = 0
 	work.ndone = 0
-	work.nproc = 1 // For now do not do this in parallel.
+	work.nproc = 1
+	useOneP := uint32(1) // For now do not do this in parallel.
 	// ackgcphase is not needed since we are not scanning running goroutines.
-	parforsetup(work.markfor, work.nproc, uint32(_RootCount+local_allglen), false, markroot)
+	parforsetup(work.markfor, useOneP, uint32(_RootCount+local_allglen), false, markroot)
 	parfordo(work.markfor)
 
 	lock(&allglock)
@@ -193,12 +194,24 @@ func gcAssistAlloc(size uintptr, allowAssist bool) {
 
 	// Perform assist work
 	systemstack(func() {
+		if atomicload(&gcBlackenEnabled) == 0 {
+			// The gcBlackenEnabled check in malloc races with the
+			// store that clears it, but an atomic check in every malloc
+			// would be a performance hit.
+			// Instead we recheck it here on the non-preemptible system
+			// stack to determine if we should perform an assist.
+			return
+		}
 		// Track time spent in this assist. Since we're on the
 		// system stack, this is non-preemptible, so we can
 		// just measure start and end time.
 		startTime := nanotime()
 
-		xadd(&work.nwait, -1)
+		decnwait := xadd(&work.nwait, -1)
+		if decnwait == work.nproc {
+			println("runtime: work.nwait=", decnwait, "work.nproc=", work.nproc)
+			throw("work.nwait > work.nproc")
+		}
 
 		// drain own cached work first in the hopes that it
 		// will be more cache friendly.
@@ -207,16 +220,33 @@ func gcAssistAlloc(size uintptr, allowAssist bool) {
 		gcDrainN(gcw, scanWork)
 		// Record that we did this much scan work.
 		gp.gcscanwork += gcw.scanWork - startScanWork
-		// No need to dispose since we're not in mark termination.
-
+		// If we are near the end of the mark phase
+		// dispose of the gcw.
+		if gcBlackenPromptly {
+			gcw.dispose()
+		}
 		// If this is the last worker and we ran out of work,
 		// signal a completion point.
-		if xadd(&work.nwait, +1) == work.nproc && work.full == 0 && work.partial == 0 {
+		incnwait := xadd(&work.nwait, +1)
+		if incnwait > work.nproc {
+			println("runtime: work.nwait=", incnwait,
+				"work.nproc=", work.nproc,
+				"gcBlackenPromptly=", gcBlackenPromptly)
+			throw("work.nwait > work.nproc")
+		}
+
+		if incnwait == work.nproc && work.full == 0 && work.partial == 0 {
 			// This has reached a background completion
 			// point.
-			gcBgMarkDone()
+			if gcBlackenPromptly {
+				if work.bgMark1.done == 0 {
+					throw("completing mark 2, but bgMark1.done == 0")
+				}
+				work.bgMark2.complete()
+			} else {
+				work.bgMark1.complete()
+			}
 		}
-
 		duration := nanotime() - startTime
 		_p_ := gp.m.p.ptr()
 		_p_.gcAssistTime += duration
@@ -795,7 +825,7 @@ func shade(b uintptr) {
 	if obj, hbits, span := heapBitsForObject(b); obj != 0 {
 		gcw := &getg().m.p.ptr().gcw
 		greyobject(obj, 0, 0, hbits, span, gcw)
-		if gcphase == _GCmarktermination {
+		if gcphase == _GCmarktermination || gcBlackenPromptly {
			// Ps aren't allowed to cache work during mark
			// termination.
			gcw.dispose()
@@ -885,16 +915,12 @@ func gcDumpObject(label string, obj, off uintptr) {
 	}
 }
 
-// When in GCmarkterminate phase we allocate black.
+// If gcBlackenPromptly is true we are in the second mark phase so we allocate black.
 //go:nowritebarrier
 func gcmarknewobject_m(obj, size uintptr) {
-	if gcphase != _GCmarktermination {
-		throw("marking new object while not in mark termination phase")
-	}
-
-	if useCheckmark { // The world should be stopped so this should not happen.
+	if useCheckmark && !gcBlackenPromptly { // The world should be stopped so this should not happen.
 		throw("gcmarknewobject called while doing checkmark")
 	}
-
 	heapBitsForAddr(obj).setMarked()
 	xadd64(&work.bytesMarked, int64(size))
 }
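Both the background workers and the mutator assists dispose their
per-P gcWork caches once gcBlackenPromptly is set. The reason is
visibility: the completion check only looks at the globally visible
work lists, so anything left in a per-P cache could make the mark
phase look finished while work remains. A sketch of that flush, where
gcWorkCache and globalWork are hypothetical stand-ins for the
runtime's gcWork and the work.full/work.partial lists:

package main

import "fmt"

var globalWork []uintptr

// gcWorkCache models a per-P work cache: cheap to push to, but its
// contents are invisible to the global "is there work?" check until
// they are disposed to the global list.
type gcWorkCache struct{ buf []uintptr }

func (w *gcWorkCache) put(p uintptr) { w.buf = append(w.buf, p) }

// dispose flushes the cache to the globally visible work list. Near
// the end of the mark phase every producer must dispose, otherwise
// "work.full == 0 && work.partial == 0" could hold while work still
// hides in per-P caches.
func (w *gcWorkCache) dispose() {
	globalWork = append(globalWork, w.buf...)
	w.buf = w.buf[:0]
}

func main() {
	var w gcWorkCache
	w.put(0x100)
	w.put(0x200)
	fmt.Println("global work before dispose:", len(globalWork)) // 0: hidden
	w.dispose()
	fmt.Println("global work after dispose:", len(globalWork)) // 2: visible
}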
diff --git a/src/runtime/mgcwork.go b/src/runtime/mgcwork.go
index 226c65635f..4a1455c860 100644
--- a/src/runtime/mgcwork.go
+++ b/src/runtime/mgcwork.go
@@ -240,7 +240,7 @@ func (b *workbuf) logput(entry int) {
 		return
 	}
 	if !b.inuse {
-		println("runtime:logput fails log entry=", entry,
+		println("runtime: logput fails log entry=", entry,
 			"b.log[0]=", b.log[0], "b.log[1]=", b.log[1],
 			"b.log[2]=", b.log[2], "b.log[3]=", b.log[3])
 		throw("logput: put not legal")
@@ -388,10 +388,18 @@ func getfull(entry int) *workbuf {
 		return b
 	}
 
-	xadd(&work.nwait, +1)
+	incnwait := xadd(&work.nwait, +1)
+	if incnwait > work.nproc {
+		println("runtime: work.nwait=", incnwait, "work.nproc=", work.nproc)
+		throw("work.nwait > work.nproc")
+	}
 	for i := 0; ; i++ {
 		if work.full != 0 || work.partial != 0 {
-			xadd(&work.nwait, -1)
+			decnwait := xadd(&work.nwait, -1)
+			if decnwait == work.nproc {
+				println("runtime: work.nwait=", decnwait, "work.nproc=", work.nproc)
+				throw("work.nwait > work.nproc")
+			}
 			b = (*workbuf)(lfstackpop(&work.full))
 			if b == nil {
 				b = (*workbuf)(lfstackpop(&work.partial))
@@ -401,7 +409,11 @@ func getfull(entry int) *workbuf {
 			b.checknonempty()
 			return b
 		}
-		xadd(&work.nwait, +1)
+		incnwait := xadd(&work.nwait, +1)
+		if incnwait > work.nproc {
+			println("runtime: work.nwait=", incnwait, "work.nproc=", work.nproc)
+			throw("work.nwait > work.nproc")
+		}
 		if work.nwait == work.nproc {
 			return nil
-- 
2.48.1
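Taken together, the coordinator's end-of-mark logic in gc() reduces to
the shape below. This is an illustrative sketch only; rescanGlobals,
flushWorkCaches, and globalWorkLeft are hypothetical stand-ins for the
markroot, forEachP, and work-list machinery used in the patch:

package main

import "fmt"

var gcBlackenPromptly bool

func rescanGlobals()       {}              // redo the data and BSS roots, as gc() does via markroot
func flushWorkCaches()     {}              // expose anything hiding in per-P caches
func globalWorkLeft() bool { return true } // pretend the rescan turned up work

func main() {
	// Mark 1 has signaled completion: the global work buffers looked empty.
	rescanGlobals()
	flushWorkCaches()

	if globalWorkLeft() {
		// The optimism was misplaced: deal with the stragglers
		// concurrently (mark 2) rather than during STW mark termination.
		gcBlackenPromptly = true
		fmt.Println("entering mark 2: allocating black, caching disabled")
	} else {
		fmt.Println("no hidden work: mark termination will be short")
	}
}

This structure is what buys the latency win reported in the commit
message: any work the optimistic first pass missed is retired
concurrently instead of inside the stop-the-world window.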