runtime: reduce latency by aggressively ending mark phase

author Rick Hudson <rlh@golang.org>

Mon, 1 Jun 2015 22:16:03 +0000 (18:16 -0400)

committer Rick Hudson <rlh@golang.org>

Thu, 18 Jun 2015 21:38:46 +0000 (21:38 +0000)
author Rick Hudson <rlh@golang.org>
Mon, 1 Jun 2015 22:16:03 +0000 (18:16 -0400)
committer Rick Hudson <rlh@golang.org>
Thu, 18 Jun 2015 21:38:46 +0000 (21:38 +0000)
diff --git a/src/runtime/malloc.go b/src/runtime/malloc.go

index 25371ab776e5fb6af74e4b870d08a0419bb679e8..5872a3752e43b95366ac87159034954239d0835c 100644 (file)
--- a/src/runtime/malloc.go
+++ b/src/runtime/malloc.go
@@ -663,7 +663,7 @@ func mallocgc(size uintptr, typ *_type, flags uint32) unsafe.Pointer {
         // All slots hold nil so no scanning is needed.
         // This may be racing with GC so do it atomically if there can be
         // a race marking the bit.
-       if gcphase == _GCmarktermination {
+       if gcphase == _GCmarktermination || gcBlackenPromptly {
                 systemstack(func() {
                         gcmarknewobject_m(uintptr(x), size)
                 })
diff --git a/src/runtime/mgc.go b/src/runtime/mgc.go

index 5103739497a3f41d46418712069c08de6239d400..6289cb57a2ffd8e53d913ca495fe39de0bccacac 100644 (file)
--- a/src/runtime/mgc.go
+++ b/src/runtime/mgc.go
@@ -225,6 +225,21 @@ var writeBarrierEnabled bool // compiler emits references to this in write barri
  // gcphase == _GCmark.
  var gcBlackenEnabled uint32
  
+// gcBlackenPromptly indicates that optimizations that may
+// hide work from the global work queue should be disabled.
+//
+// If gcBlackenPromptly is true, per-P gcWork caches should
+// be flushed immediately and new objects should be allocated black.
+//
+// There is a tension between allocating objects white and
+// allocating them black. If white and the objects die before being
+// marked they can be collected during this GC cycle. On the other
+// hand allocating them black will reduce _GCmarktermination latency
+// since more work is done in the mark phase. This tension is resolved
+// by allocating white until the mark phase is approaching its end and
+// then allocating black for the remainder of the mark phase.
+var gcBlackenPromptly bool
+
  const (
         _GCoff             = iota // GC not running, write barrier disabled
         _GCstw                    // unused state
@@ -547,7 +562,7 @@ func (c *gcControllerState) findRunnableGCWorker(_p_ *p) *g {
         if _p_.gcBgMarkWorker == nil {
                 throw("gcControllerState.findRunnable: no background mark worker")
         }
-       if work.bgMarkDone != 0 {
+       if work.bgMark1.done != 0 && work.bgMark2.done != 0 {
                 // Background mark is done. Don't schedule background
                 // mark worker any more. (This is not just an
                 // optimization. Without this we can spin scheduling
@@ -667,6 +682,51 @@ func shouldtriggergc() bool {
         return memstats.heap_live >= memstats.next_gc && atomicloaduint(&bggc.working) == 0
  }
  
+// bgMarkSignal synchronizes the GC coordinator and background mark workers.
+type bgMarkSignal struct {
+       // Workers race to cas to 1. Winner signals coordinator.
+       done uint32
+       // Coordinator to wake up.
+       lock mutex
+       g    *g
+       wake bool
+}
+
+func (s *bgMarkSignal) wait() {
+       lock(&s.lock)
+       if s.wake {
+               // Wakeup already happened
+               unlock(&s.lock)
+       } else {
+               s.g = getg()
+               goparkunlock(&s.lock, "mark wait (idle)", traceEvGoBlock, 1)
+       }
+       s.wake = false
+       s.g = nil
+}
+
+// complete signals the completion of this phase of marking. This can
+// be called multiple times during a cycle; only the first call has
+// any effect.
+func (s *bgMarkSignal) complete() {
+       if cas(&s.done, 0, 1) {
+               // This is the first worker to reach this completion point.
+               // Signal the main GC goroutine.
+               lock(&s.lock)
+               if s.g == nil {
+                       // It hasn't parked yet.
+                       s.wake = true
+               } else {
+                       ready(s.g, 0)
+               }
+               unlock(&s.lock)
+       }
+}
+
+func (s *bgMarkSignal) clear() {
+       s.done = 0
+}
+
  var work struct {
         full    uint64                // lock-free list of full blocks workbuf
         empty   uint64                // lock-free list of empty blocks workbuf
@@ -681,13 +741,11 @@ var work struct {
  
         bgMarkReady note   // signal background mark worker has started
         bgMarkDone  uint32 // cas to 1 when at a background mark completion point
-
         // Background mark completion signaling
-       bgMarkWake struct {
-               lock mutex
-               g    *g
-               wake bool
-       }
+
+       // Coordination for the 2 parts of the mark phase.
+       bgMark1 bgMarkSignal
+       bgMark2 bgMarkSignal
  
         // Copy of mheap.allspans for marker or sweeper.
         spans []*mspan
@@ -903,16 +961,31 @@ func gc(mode int) {
                 }
  
                 // Wait for background mark completion.
-               lock(&work.bgMarkWake.lock)
-               if work.bgMarkWake.wake {
-                       // Wakeup already happened
-                       unlock(&work.bgMarkWake.lock)
+               work.bgMark1.wait()
+
+               // The global work list is empty, but there can still be work
+               // sitting in the per-P work caches and there can be more
+               // objects reachable from global roots since they don't have write
+               // barriers. Rescan some roots and flush work caches.
+               systemstack(func() {
+                       // rescan global data and bss.
+                       markroot(nil, _RootData)
+                       markroot(nil, _RootBss)
+                       forEachP(func(_p_ *p) {
+                               _p_.gcw.dispose()
+                       })
+               })
+
+               if atomicload64(&work.full) != 0 || atomicload64(&work.partial) != 0 {
+                       if work.bgMark2.done != 0 {
+                               throw("work.bgMark2.done != 0")
+                       }
+                       gcBlackenPromptly = true
+                       // Wait for this more aggressive background mark to complete.
+                       work.bgMark2.wait()
                 } else {
-                       work.bgMarkWake.g = getg()
-                       goparkunlock(&work.bgMarkWake.lock, "mark wait (idle)", traceEvGoBlock, 1)
+                       work.bgMark2.done = 1
                 }
-               work.bgMarkWake.wake = false
-               work.bgMarkWake.g = nil
  
                 // Begin mark termination.
                 if debug.gctrace > 0 {
@@ -945,6 +1018,7 @@ func gc(mode int) {
         // World is stopped.
         // Start marktermination which includes enabling the write barrier.
         atomicstore(&gcBlackenEnabled, 0)
+       gcBlackenPromptly = false
         setGCPhase(_GCmarktermination)
  
         if debug.gctrace > 0 {
@@ -1119,10 +1193,9 @@ func gcBgMarkPrepare() {
         work.nproc = ^uint32(0)
         work.nwait = ^uint32(0)
  
-       // Background GC and assists race to set this to 1 on
-       // completion so that this only gets one "done" signal.
-       work.bgMarkDone = 0
-
+       // Reset background mark completion points.
+       work.bgMark1.clear()
+       work.bgMark2.clear()
         gcController.bgMarkStartTime = nanotime()
  }
  
@@ -1169,7 +1242,11 @@ func gcBgMarkWorker(p *p) {
  
                 startTime := nanotime()
  
-               xadd(&work.nwait, -1)
+               decnwait := xadd(&work.nwait, -1)
+               if decnwait == work.nproc {
+                       println("runtime: work.nwait=", decnwait, "work.nproc=", work.nproc)
+                       throw("work.nwait was > work.nproc")
+               }
  
                 done := false
                 switch p.gcMarkWorkerMode {
@@ -1185,21 +1262,37 @@ func gcBgMarkWorker(p *p) {
                         gcDrainUntilPreempt(&p.gcw, gcBgCreditSlack)
                         // Was this the last worker and did we run out
                         // of work?
-                       done = xadd(&work.nwait, +1) == work.nproc && work.full == 0 && work.partial == 0
+                       incnwait := xadd(&work.nwait, +1)
+                       if incnwait > work.nproc {
+                               println("runtime: p.gcMarkWorkerMode=", p.gcMarkWorkerMode,
+                                       "work.nwait=", incnwait, "work.nproc=", work.nproc)
+                               throw("work.nwait > work.nproc")
+                       }
+                       done = incnwait == work.nproc && work.full == 0 && work.partial == 0
+               }
+               // If we are near the end of the mark phase dispose of p.gcw.
+               if gcBlackenPromptly {
+                       p.gcw.dispose()
                 }
-               // We're not in mark termination, so there's no need
-               // to dispose p.gcw.
  
                 // If this worker reached a background mark completion
                 // point, signal the main GC goroutine.
                 if done {
-                       gcBgMarkDone()
+                       if gcBlackenPromptly {
+                               if work.bgMark1.done == 0 {
+                                       throw("completing mark 2, but bgMark1.done == 0")
+                               }
+                               work.bgMark2.complete()
+                       } else {
+                               work.bgMark1.complete()
+                       }
                 }
  
                 duration := nanotime() - startTime
                 switch p.gcMarkWorkerMode {
                 case gcMarkWorkerDedicatedMode:
                         xaddint64(&gcController.dedicatedMarkTime, duration)
+                       xaddint64(&gcController.dedicatedMarkWorkersNeeded, 1)
                 case gcMarkWorkerFractionalMode:
                         xaddint64(&gcController.fractionalMarkTime, duration)
                         xaddint64(&gcController.fractionalMarkWorkersNeeded, 1)
@@ -1209,26 +1302,8 @@ func gcBgMarkWorker(p *p) {
         }
  }
  
-// gcBgMarkDone signals the completion of background marking. This can
-// be called multiple times during a cycle; only the first call has
-// any effect.
-func gcBgMarkDone() {
-       if cas(&work.bgMarkDone, 0, 1) {
-               // This is the first worker to reach completion.
-               // Signal the main GC goroutine.
-               lock(&work.bgMarkWake.lock)
-               if work.bgMarkWake.g == nil {
-                       // It hasn't parked yet.
-                       work.bgMarkWake.wake = true
-               } else {
-                       ready(work.bgMarkWake.g, 0)
-               }
-               unlock(&work.bgMarkWake.lock)
-       }
-}
-
-// gcMarkWorkAvailable determines if mark work is readily available.
-// It is used by the scheduler to decide if this p run a mark work.
+// gcMarkWorkAvailable returns true if executing a mark worker
+// on p is potentially useful.
  func gcMarkWorkAvailable(p *p) bool {
         if !p.gcw.empty() {
                 return true
diff --git a/src/runtime/mgcmark.go b/src/runtime/mgcmark.go

index b2fbc976159604c26d80444589f81abbf652e171..57dc2560ddace912e5883a59093ef7a207cfe13a 100644 (file)
--- a/src/runtime/mgcmark.go
+++ b/src/runtime/mgcmark.go
@@ -31,9 +31,10 @@ func gcscan_m() {
  
         work.nwait = 0
         work.ndone = 0
-       work.nproc = 1 // For now do not do this in parallel.
+       work.nproc = 1
+       useOneP := uint32(1) // For now do not do this in parallel.
         //      ackgcphase is not needed since we are not scanning running goroutines.
-       parforsetup(work.markfor, work.nproc, uint32(_RootCount+local_allglen), false, markroot)
+       parforsetup(work.markfor, useOneP, uint32(_RootCount+local_allglen), false, markroot)
         parfordo(work.markfor)
  
         lock(&allglock)
@@ -193,12 +194,24 @@ func gcAssistAlloc(size uintptr, allowAssist bool) {
  
         // Perform assist work
         systemstack(func() {
+               if atomicload(&gcBlackenEnabled) == 0 {
+                       // The gcBlackenEnabled check in malloc races with the
+                       // store that clears it but an atomic check in every malloc
+                       // would be a performance hit.
+                       // Instead we recheck it here on the non-preemptable system
+                       // stack to determine if we should preform an assist.
+                       return
+               }
                 // Track time spent in this assist. Since we're on the
                 // system stack, this is non-preemptible, so we can
                 // just measure start and end time.
                 startTime := nanotime()
  
-               xadd(&work.nwait, -1)
+               decnwait := xadd(&work.nwait, -1)
+               if decnwait == work.nproc {
+                       println("runtime: work.nwait =", decnwait, "work.nproc=", work.nproc)
+                       throw("nwait > work.nprocs")
+               }
  
                 // drain own cached work first in the hopes that it
                 // will be more cache friendly.
@@ -207,16 +220,33 @@ func gcAssistAlloc(size uintptr, allowAssist bool) {
                 gcDrainN(gcw, scanWork)
                 // Record that we did this much scan work.
                 gp.gcscanwork += gcw.scanWork - startScanWork
-               // No need to dispose since we're not in mark termination.
-
+               // If we are near the end of the mark phase
+               // dispose of the gcw.
+               if gcBlackenPromptly {
+                       gcw.dispose()
+               }
                 // If this is the last worker and we ran out of work,
                 // signal a completion point.
-               if xadd(&work.nwait, +1) == work.nproc && work.full == 0 && work.partial == 0 {
+               incnwait := xadd(&work.nwait, +1)
+               if incnwait > work.nproc {
+                       println("runtime: work.nwait=", incnwait,
+                               "work.nproc=", work.nproc,
+                               "gcBlackenPromptly=", gcBlackenPromptly)
+                       throw("work.nwait > work.nproc")
+               }
+
+               if incnwait == work.nproc && work.full == 0 && work.partial == 0 {
                         // This has reached a background completion
                         // point.
-                       gcBgMarkDone()
+                       if gcBlackenPromptly {
+                               if work.bgMark1.done == 0 {
+                                       throw("completing mark 2, but bgMark1.done == 0")
+                               }
+                               work.bgMark2.complete()
+                       } else {
+                               work.bgMark1.complete()
+                       }
                 }
-
                 duration := nanotime() - startTime
                 _p_ := gp.m.p.ptr()
                 _p_.gcAssistTime += duration
@@ -795,7 +825,7 @@ func shade(b uintptr) {
         if obj, hbits, span := heapBitsForObject(b); obj != 0 {
                 gcw := &getg().m.p.ptr().gcw
                 greyobject(obj, 0, 0, hbits, span, gcw)
-               if gcphase == _GCmarktermination {
+               if gcphase == _GCmarktermination || gcBlackenPromptly {
                         // Ps aren't allowed to cache work during mark
                         // termination.
                         gcw.dispose()
@@ -885,16 +915,12 @@ func gcDumpObject(label string, obj, off uintptr) {
         }
  }
  
-// When in GCmarkterminate phase we allocate black.
+// If gcBlackenPromptly is true we are in the second mark phase phase so we allocate black.
  //go:nowritebarrier
  func gcmarknewobject_m(obj, size uintptr) {
-       if gcphase != _GCmarktermination {
-               throw("marking new object while not in mark termination phase")
-       }
-       if useCheckmark { // The world should be stopped so this should not happen.
+       if useCheckmark && !gcBlackenPromptly { // The world should be stopped so this should not happen.
                 throw("gcmarknewobject called while doing checkmark")
         }
-
         heapBitsForAddr(obj).setMarked()
         xadd64(&work.bytesMarked, int64(size))
  }
diff --git a/src/runtime/mgcwork.go b/src/runtime/mgcwork.go

index 226c65635f1dd3c3995111d169022da0bd487f51..4a1455c860d95a1c2c4f39dc4ef375878dd7f242 100644 (file)
--- a/src/runtime/mgcwork.go
+++ b/src/runtime/mgcwork.go
@@ -240,7 +240,7 @@ func (b *workbuf) logput(entry int) {
                 return
         }
         if !b.inuse {
-               println("runtime:logput fails log entry=", entry,
+               println("runtime: logput fails log entry=", entry,
                         "b.log[0]=", b.log[0], "b.log[1]=", b.log[1],
                         "b.log[2]=", b.log[2], "b.log[3]=", b.log[3])
                 throw("logput: put not legal")
@@ -388,10 +388,18 @@ func getfull(entry int) *workbuf {
                 return b
         }
  
-       xadd(&work.nwait, +1)
+       incnwait := xadd(&work.nwait, +1)
+       if incnwait > work.nproc {
+               println("runtime: work.nwait=", incnwait, "work.nproc=", work.nproc)
+               throw("work.nwait > work.nproc")
+       }
         for i := 0; ; i++ {
                 if work.full != 0 || work.partial != 0 {
-                       xadd(&work.nwait, -1)
+                       decnwait := xadd(&work.nwait, -1)
+                       if decnwait == work.nproc {
+                               println("runtime: work.nwait=", decnwait, "work.nproc=", work.nproc)
+                               throw("work.nwait > work.nproc")
+                       }
                         b = (*workbuf)(lfstackpop(&work.full))
                         if b == nil {
                                 b = (*workbuf)(lfstackpop(&work.partial))
@@ -401,7 +409,11 @@ func getfull(entry int) *workbuf {
                                 b.checknonempty()
                                 return b
                         }
-                       xadd(&work.nwait, +1)
+                       incnwait := xadd(&work.nwait, +1)
+                       if incnwait > work.nproc {
+                               println("runtime: work.nwait=", incnwait, "work.nproc=", work.nproc)
+                               throw("work.nwait > work.nproc")
+                       }
                 }
                 if work.nwait == work.nproc {
                         return nil
author	Rick Hudson <rlh@golang.org>
	Mon, 1 Jun 2015 22:16:03 +0000 (18:16 -0400)
committer	Rick Hudson <rlh@golang.org>
	Thu, 18 Jun 2015 21:38:46 +0000 (21:38 +0000)
src/runtime/malloc.go		patch \| blob \| history
src/runtime/mgc.go		patch \| blob \| history
src/runtime/mgcmark.go		patch \| blob \| history
src/runtime/mgcwork.go		patch \| blob \| history