runtime: only acquire trace.lock on the system stack

author Austin Clements <austin@google.com>

Thu, 21 Jul 2022 19:43:11 +0000 (15:43 -0400)

committer Austin Clements <austin@google.com>

Thu, 11 Aug 2022 20:16:35 +0000 (20:16 +0000)
author Austin Clements <austin@google.com>
Thu, 21 Jul 2022 19:43:11 +0000 (15:43 -0400)
committer Austin Clements <austin@google.com>
Thu, 11 Aug 2022 20:16:35 +0000 (20:16 +0000)
diff --git a/src/runtime/trace.go b/src/runtime/trace.go

index 0f661493ce61333e6436d83b5f0a622de317bf19..56fd1ba37b6361c99f6d67dbe7665ec242480c3c 100644 (file)
--- a/src/runtime/trace.go
+++ b/src/runtime/trace.go
@@ -109,6 +109,8 @@ const (
  
  // trace is global tracing context.
  var trace struct {
+       // trace.lock must only be acquired on the system stack where
+       // stack splits cannot happen while it is held.
         lock          mutex       // protects the following members
         lockOwner     *g          // to avoid deadlocks during recursive lock locks
         enabled       bool        // when set runtime traces events
@@ -385,31 +387,33 @@ func StopTrace() {
                 raceacquire(unsafe.Pointer(&trace.shutdownSema))
         }
  
-       // The lock protects us from races with StartTrace/StopTrace because they do stop-the-world.
-       lock(&trace.lock)
-       for _, p := range allp[:cap(allp)] {
-               if p.tracebuf != 0 {
-                       throw("trace: non-empty trace buffer in proc")
+       systemstack(func() {
+               // The lock protects us from races with StartTrace/StopTrace because they do stop-the-world.
+               lock(&trace.lock)
+               for _, p := range allp[:cap(allp)] {
+                       if p.tracebuf != 0 {
+                               throw("trace: non-empty trace buffer in proc")
+                       }
                 }
-       }
-       if trace.buf != 0 {
-               throw("trace: non-empty global trace buffer")
-       }
-       if trace.fullHead != 0 || trace.fullTail != 0 {
-               throw("trace: non-empty full trace buffer")
-       }
-       if trace.reading != 0 || trace.reader.Load() != nil {
-               throw("trace: reading after shutdown")
-       }
-       for trace.empty != 0 {
-               buf := trace.empty
-               trace.empty = buf.ptr().link
-               sysFree(unsafe.Pointer(buf), unsafe.Sizeof(*buf.ptr()), &memstats.other_sys)
-       }
-       trace.strings = nil
-       trace.shutdown = false
-       trace.cpuLogRead = nil
-       unlock(&trace.lock)
+               if trace.buf != 0 {
+                       throw("trace: non-empty global trace buffer")
+               }
+               if trace.fullHead != 0 || trace.fullTail != 0 {
+                       throw("trace: non-empty full trace buffer")
+               }
+               if trace.reading != 0 || trace.reader.Load() != nil {
+                       throw("trace: reading after shutdown")
+               }
+               for trace.empty != 0 {
+                       buf := trace.empty
+                       trace.empty = buf.ptr().link
+                       sysFree(unsafe.Pointer(buf), unsafe.Sizeof(*buf.ptr()), &memstats.other_sys)
+               }
+               trace.strings = nil
+               trace.shutdown = false
+               trace.cpuLogRead = nil
+               unlock(&trace.lock)
+       })
  }
  
  // ReadTrace returns the next chunk of binary tracing data, blocking until data
@@ -419,6 +423,54 @@ func StopTrace() {
  // ReadTrace must be called from one goroutine at a time.
  func ReadTrace() []byte {
  top:
+       var buf []byte
+       var park bool
+       systemstack(func() {
+               buf, park = readTrace0()
+       })
+       if park {
+               gopark(func(gp *g, _ unsafe.Pointer) bool {
+                       if !trace.reader.CompareAndSwapNoWB(nil, gp) {
+                               // We're racing with another reader.
+                               // Wake up and handle this case.
+                               return false
+                       }
+
+                       if g2 := traceReader(); gp == g2 {
+                               // New data arrived between unlocking
+                               // and the CAS and we won the wake-up
+                               // race, so wake up directly.
+                               return false
+                       } else if g2 != nil {
+                               printlock()
+                               println("runtime: got trace reader", g2, g2.goid)
+                               throw("unexpected trace reader")
+                       }
+
+                       return true
+               }, nil, waitReasonTraceReaderBlocked, traceEvGoBlock, 2)
+               goto top
+       }
+
+       return buf
+}
+
+// readTrace0 is ReadTrace's continuation on g0. This must run on the
+// system stack because it acquires trace.lock.
+//
+//go:systemstack
+func readTrace0() (buf []byte, park bool) {
+       if raceenabled {
+               // g0 doesn't have a race context. Borrow the user G's.
+               if getg().racectx != 0 {
+                       throw("expected racectx == 0")
+               }
+               getg().racectx = getg().m.curg.racectx
+               // (This defer should get open-coded, which is safe on
+               // the system stack.)
+               defer func() { getg().racectx = 0 }()
+       }
+
         // This function may need to lock trace.lock recursively
         // (goparkunlock -> traceGoPark -> traceEvent -> traceFlush).
         // To allow this we use trace.lockOwner.
@@ -426,7 +478,7 @@ top:
         // allocation can call heap allocate, which will try to emit a trace
         // event while holding heap lock.
         lock(&trace.lock)
-       trace.lockOwner = getg()
+       trace.lockOwner = getg().m.curg
  
         if trace.reader.Load() != nil {
                 // More than one goroutine reads trace. This is bad.
@@ -435,7 +487,7 @@ top:
                 trace.lockOwner = nil
                 unlock(&trace.lock)
                 println("runtime: ReadTrace called from multiple goroutines simultaneously")
-               return nil
+               return nil, false
         }
         // Recycle the old buffer.
         if buf := trace.reading; buf != 0 {
@@ -448,7 +500,7 @@ top:
                 trace.headerWritten = true
                 trace.lockOwner = nil
                 unlock(&trace.lock)
-               return []byte("go 1.19 trace\x00\x00\x00")
+               return []byte("go 1.19 trace\x00\x00\x00"), false
         }
         // Optimistically look for CPU profile samples. This may write new stack
         // records, and may write new tracing buffers.
@@ -460,30 +512,10 @@ top:
                 // We don't simply use a note because the scheduler
                 // executes this goroutine directly when it wakes up
                 // (also a note would consume an M).
+               trace.lockOwner = nil
                 unlock(&trace.lock)
-               gopark(func(gp *g, _ unsafe.Pointer) bool {
-                       if !trace.reader.CompareAndSwapNoWB(nil, gp) {
-                               // We're racing with another reader.
-                               // Wake up and handle this case.
-                               return false
-                       }
-
-                       if g2 := traceReader(); gp == g2 {
-                               // New data arrived between unlocking
-                               // and the CAS and we won the wake-up
-                               // race, so wake up directly.
-                               return false
-                       } else if g2 != nil {
-                               printlock()
-                               println("runtime: got trace reader", g2, g2.goid)
-                               throw("unexpected trace reader")
-                       }
-
-                       return true
-               }, nil, waitReasonTraceReaderBlocked, traceEvGoBlock, 2)
-               goto top
+               return nil, true
         }
-
  newFull:
         assertLockHeld(&trace.lock)
         // Write a buffer.
@@ -492,7 +524,7 @@ newFull:
                 trace.reading = buf
                 trace.lockOwner = nil
                 unlock(&trace.lock)
-               return buf.ptr().arr[:buf.ptr().pos]
+               return buf.ptr().arr[:buf.ptr().pos], false
         }
  
         // Write footer with timer frequency.
@@ -534,17 +566,21 @@ newFull:
                 }
                 // trace.enabled is already reset, so can call traceable functions.
                 semrelease(&trace.shutdownSema)
-               return nil
+               return nil, false
         }
         // Also bad, but see the comment above.
         trace.lockOwner = nil
         unlock(&trace.lock)
         println("runtime: spurious wakeup of trace reader")
-       return nil
+       return nil, false
  }
  
  // traceReader returns the trace reader that should be woken up, if any.
  // Callers should first check that trace.enabled or trace.shutdown is set.
+//
+// This must run on the system stack because it acquires trace.lock.
+//
+//go:systemstack
  func traceReader() *g {
         // Optimistic check first
         if traceReaderAvailable() == nil {
@@ -571,6 +607,10 @@ func traceReaderAvailable() *g {
  }
  
  // traceProcFree frees trace buffer associated with pp.
+//
+// This must run on the system stack because it acquires trace.lock.
+//
+//go:systemstack
  func traceProcFree(pp *p) {
         buf := pp.tracebuf
         pp.tracebuf = 0
@@ -661,7 +701,9 @@ func traceEventLocked(extraBytes int, mp *m, pid int32, bufp *traceBufPtr, ev by
         // TODO: test on non-zero extraBytes param.
         maxSize := 2 + 5*traceBytesPerNumber + extraBytes // event type, length, sequence, timestamp, stack id and two add params
         if buf == nil || len(buf.arr)-buf.pos < maxSize {
-               buf = traceFlush(traceBufPtrOf(buf), pid).ptr()
+               systemstack(func() {
+                       buf = traceFlush(traceBufPtrOf(buf), pid).ptr()
+               })
                 bufp.set(buf)
         }
  
@@ -802,7 +844,9 @@ func traceReadCPU() {
  
                         buf := bufp.ptr()
                         if buf == nil {
-                               *bufp = traceFlush(*bufp, 0)
+                               systemstack(func() {
+                                       *bufp = traceFlush(*bufp, 0)
+                               })
                                 buf = bufp.ptr()
                         }
                         for i := range stk {
@@ -856,6 +900,10 @@ func traceReleaseBuffer(pid int32) {
  }
  
  // traceFlush puts buf onto stack of full buffers and returns an empty buffer.
+//
+// This must run on the system stack because it acquires trace.lock.
+//
+//go:systemstack
  func traceFlush(buf traceBufPtr, pid int32) traceBufPtr {
         owner := trace.lockOwner
         dolock := owner == nil || owner != getg().m.curg
@@ -933,8 +981,10 @@ func traceString(bufp *traceBufPtr, pid int32, s string) (uint64, *traceBufPtr)
         buf := bufp.ptr()
         size := 1 + 2*traceBytesPerNumber + len(s)
         if buf == nil || len(buf.arr)-buf.pos < size {
-               buf = traceFlush(traceBufPtrOf(buf), pid).ptr()
-               bufp.set(buf)
+               systemstack(func() {
+                       buf = traceFlush(traceBufPtrOf(buf), pid).ptr()
+                       bufp.set(buf)
+               })
         }
         buf.byte(traceEvString)
         buf.varint(id)
@@ -1088,6 +1138,10 @@ func traceFrames(bufp traceBufPtr, pcs []uintptr) ([]traceFrame, traceBufPtr) {
  
  // dump writes all previously cached stacks to trace buffers,
  // releases all memory and resets state.
+//
+// This must run on the system stack because it calls traceFlush.
+//
+//go:systemstack
  func (tab *traceStackTable) dump(bufp traceBufPtr) traceBufPtr {
         for i, _ := range tab.tab {
                 stk := tab.tab[i].ptr()
author	Austin Clements <austin@google.com>
	Thu, 21 Jul 2022 19:43:11 +0000 (15:43 -0400)
committer	Austin Clements <austin@google.com>
	Thu, 11 Aug 2022 20:16:35 +0000 (20:16 +0000)