]> Cypherpunks repositories - gostls13.git/commitdiff
runtime: check more work flushing races
authorAustin Clements <austin@google.com>
Mon, 26 Nov 2018 19:41:23 +0000 (14:41 -0500)
committerAustin Clements <austin@google.com>
Thu, 29 Nov 2018 22:08:05 +0000 (22:08 +0000)
This adds several new checks to help debug #27993. It adds a mechanism
for freezing write barriers and gcWork puts during the mark completion
algorithm. This way, if we do detect mark completion, we can catch any
puts that happened during the completion algorithm. Based on build
dashboard failures, this seems to be the window of time when these are
happening.

This also double-checks that all work buffers are empty immediately
upon entering mark termination (much earlier than the current check).
This is unlikely to trigger based on the current failures, but is a
good safety net.

Change-Id: I03f56c48c4322069e28c50fbc3c15b2fee2130c2
Reviewed-on: https://go-review.googlesource.com/c/151797
Run-TryBot: Austin Clements <austin@google.com>
Reviewed-by: Michael Knyszek <mknyszek@google.com>
src/runtime/mgc.go
src/runtime/mgcwork.go
src/runtime/mwbbuf.go

index db589c3f8fae6ad30aaa3267450719073da7b8ab..7747e5409c9aef7c69b270353d9970026283b0b0 100644 (file)
@@ -1363,6 +1363,19 @@ func gcStart(trigger gcTrigger) {
 // This is protected by markDoneSema.
 var gcMarkDoneFlushed uint32
 
+// debugCachedWork enables extra checks for debugging premature mark
+// termination.
+//
+// For debugging issue #27993.
+const debugCachedWork = true
+
+// gcWorkPauseGen is for debugging the mark completion algorithm.
+// gcWork put operations spin while gcWork.pauseGen == gcWorkPauseGen.
+// Only used if debugCachedWork is true.
+//
+// For debugging issue #27993.
+var gcWorkPauseGen uint32 = 1
+
 // gcMarkDone transitions the GC from mark to mark termination if all
 // reachable objects have been marked (that is, there are no grey
 // objects and can be no more in the future). Otherwise, it flushes
@@ -1408,6 +1421,14 @@ top:
                        // Flush the write barrier buffer, since this may add
                        // work to the gcWork.
                        wbBufFlush1(_p_)
+                       // For debugging, shrink the write barrier
+                       // buffer so it flushes immediately.
+                       // wbBuf.reset will keep it at this size as
+                       // long as throwOnGCWork is set.
+                       if debugCachedWork {
+                               b := &_p_.wbBuf
+                               b.end = uintptr(unsafe.Pointer(&b.buf[wbBufEntryPointers]))
+                       }
                        // Flush the gcWork, since this may create global work
                        // and set the flushedWork flag.
                        //
@@ -1418,11 +1439,23 @@ top:
                        if _p_.gcw.flushedWork {
                                atomic.Xadd(&gcMarkDoneFlushed, 1)
                                _p_.gcw.flushedWork = false
+                       } else if debugCachedWork {
+                               // For debugging, freeze the gcWork
+                               // until we know whether we've reached
+                               // completion or not. If we think
+                               // we've reached completion, but
+                               // there's a paused gcWork, then
+                               // that's a bug.
+                               _p_.gcw.pauseGen = gcWorkPauseGen
                        }
                })
        })
 
        if gcMarkDoneFlushed != 0 {
+               if debugCachedWork {
+                       // Release paused gcWorks.
+                       atomic.Xadd(&gcWorkPauseGen, 1)
+               }
                // More grey objects were discovered since the
                // previous termination check, so there may be more
                // work to do. Keep going. It's possible the
@@ -1431,7 +1464,12 @@ top:
                goto top
        }
 
-       throwOnGCWork = true
+       if debugCachedWork {
+               throwOnGCWork = true
+               // Release paused gcWorks. If there are any, they
+               // should now observe throwOnGCWork and panic.
+               atomic.Xadd(&gcWorkPauseGen, 1)
+       }
 
        // There was no global work, no local work, and no Ps
        // communicated work since we took markDoneSema. Therefore
@@ -1449,6 +1487,30 @@ top:
        // below. The important thing is that the wb remains active until
        // all marking is complete. This includes writes made by the GC.
 
+       if debugCachedWork {
+               // For debugging, double check that no work was added after we
+               // went around above and disable write barrier buffering.
+               for _, p := range allp {
+                       gcw := &p.gcw
+                       if !gcw.empty() {
+                               printlock()
+                               print("runtime: P ", p.id, " flushedWork ", gcw.flushedWork)
+                               if gcw.wbuf1 == nil {
+                                       print(" wbuf1=<nil>")
+                               } else {
+                                       print(" wbuf1.n=", gcw.wbuf1.nobj)
+                               }
+                               if gcw.wbuf2 == nil {
+                                       print(" wbuf2=<nil>")
+                               } else {
+                                       print(" wbuf2.n=", gcw.wbuf2.nobj)
+                               }
+                               print("\n")
+                               throw("throwOnGCWork")
+                       }
+               }
+       }
+
        // Disable assists and background workers. We must do
        // this before waking blocked assists.
        atomic.Store(&gcBlackenEnabled, 0)
index da2129ee508dd19c29cfe6cfa09b1a94ffa490f2..8a77ff55e4d1d9f021ba80ecf31171352557f6b1 100644 (file)
@@ -93,6 +93,10 @@ type gcWork struct {
        // termination check. Specifically, this indicates that this
        // gcWork may have communicated work to another gcWork.
        flushedWork bool
+
+       // pauseGen causes put operations to spin while pauseGen ==
+       // gcWorkPauseGen if debugCachedWork is true.
+       pauseGen uint32
 }
 
 // Most of the methods of gcWork are go:nowritebarrierrec because the
@@ -111,13 +115,21 @@ func (w *gcWork) init() {
        w.wbuf2 = wbuf2
 }
 
+func (w *gcWork) checkPut() {
+       if debugCachedWork {
+               for atomic.Load(&gcWorkPauseGen) == w.pauseGen {
+               }
+               if throwOnGCWork {
+                       throw("throwOnGCWork")
+               }
+       }
+}
+
 // put enqueues a pointer for the garbage collector to trace.
 // obj must point to the beginning of a heap object or an oblet.
 //go:nowritebarrierrec
 func (w *gcWork) put(obj uintptr) {
-       if throwOnGCWork {
-               throw("throwOnGCWork")
-       }
+       w.checkPut()
 
        flushed := false
        wbuf := w.wbuf1
@@ -153,9 +165,7 @@ func (w *gcWork) put(obj uintptr) {
 // otherwise it returns false and the caller needs to call put.
 //go:nowritebarrierrec
 func (w *gcWork) putFast(obj uintptr) bool {
-       if throwOnGCWork {
-               throw("throwOnGCWork")
-       }
+       w.checkPut()
 
        wbuf := w.wbuf1
        if wbuf == nil {
@@ -178,9 +188,7 @@ func (w *gcWork) putBatch(obj []uintptr) {
                return
        }
 
-       if throwOnGCWork {
-               throw("throwOnGCWork")
-       }
+       w.checkPut()
 
        flushed := false
        wbuf := w.wbuf1
@@ -303,16 +311,12 @@ func (w *gcWork) balance() {
                return
        }
        if wbuf := w.wbuf2; wbuf.nobj != 0 {
-               if throwOnGCWork {
-                       throw("throwOnGCWork")
-               }
+               w.checkPut()
                putfull(wbuf)
                w.flushedWork = true
                w.wbuf2 = getempty()
        } else if wbuf := w.wbuf1; wbuf.nobj > 4 {
-               if throwOnGCWork {
-                       throw("throwOnGCWork")
-               }
+               w.checkPut()
                w.wbuf1 = handoff(wbuf)
                w.flushedWork = true // handoff did putfull
        } else {
index c91cea254e9dacefeb1bb194941c899c0400c251..a698493a0a2c5e4a75db21784f77a2977d32f5ab 100644 (file)
@@ -79,7 +79,7 @@ const (
 func (b *wbBuf) reset() {
        start := uintptr(unsafe.Pointer(&b.buf[0]))
        b.next = start
-       if writeBarrier.cgo {
+       if writeBarrier.cgo || (debugCachedWork && throwOnGCWork) {
                // Effectively disable the buffer by forcing a flush
                // on every barrier.
                b.end = uintptr(unsafe.Pointer(&b.buf[wbBufEntryPointers]))