runtime: document tracer invariants explicitly

author Michael Anthony Knyszek <mknyszek@google.com>

Wed, 1 Oct 2025 20:50:57 +0000 (20:50 +0000)

committer Gopher Robot <gobot@golang.org>

Thu, 30 Oct 2025 17:45:52 +0000 (10:45 -0700)
author Michael Anthony Knyszek <mknyszek@google.com>
Wed, 1 Oct 2025 20:50:57 +0000 (20:50 +0000)
committer Gopher Robot <gobot@golang.org>
Thu, 30 Oct 2025 17:45:52 +0000 (10:45 -0700)
diff --git a/src/runtime/HACKING.md b/src/runtime/HACKING.md

index c53779a5882f4ce98dd7c06ac5f08ff1b8e4cf62..89727ee80f1b4a06b85ba682aba5bacb63e111bf 100644 (file)
--- a/src/runtime/HACKING.md
+++ b/src/runtime/HACKING.md
@@ -518,9 +518,8 @@ The parser for the execution trace format lives in the `internal/trace` package.
  If you plan on adding new trace events, consider starting with a [trace
  experiment](../internal/trace/tracev2/EXPERIMENTS.md).
  
-If you plan to add new trace instrumentation to the runtime, wrap whatever operation
-you're tracing in `traceAcquire` and `traceRelease` fully. These functions mark a
-critical section that appears atomic to the execution tracer (but nothing else).
+If you plan to add new trace instrumentation to the runtime, read the comment
+at the top of [trace.go](./trace.go), especially the invariants.
  
  debuglog
  ========
diff --git a/src/runtime/proc.go b/src/runtime/proc.go

index 081b9a28258c5516043c5ea4669baa0c771081b8..2ec2f8cfca1f5cf503667d81492ea6262aa66288 100644 (file)
--- a/src/runtime/proc.go
+++ b/src/runtime/proc.go
@@ -2170,8 +2170,6 @@ func forEachPInternal(fn func(*p)) {
         // Force Ps currently in a system call into _Pidle and hand them
         // off to induce safe point function execution.
         for _, p2 := range allp {
-               // We need to be fine-grained about tracing here, since handoffp
-               // might call into the tracer, and the tracer is non-reentrant.
                 if atomic.Load(&p2.runSafePointFn) != 1 {
                         // Already ran it.
                         continue
@@ -4344,21 +4342,26 @@ func preemptPark(gp *g) {
         casGToPreemptScan(gp, _Grunning, _Gscan|_Gpreempted)
         dropg()
  
-       // Be careful about how we trace this next event. The ordering
-       // is subtle.
+       // Be careful about ownership as we trace this next event.
         //
-       // The moment we CAS into _Gpreempted, suspendG could CAS to
-       // _Gwaiting, do its work, and ready the goroutine. All of
+       // According to the tracer invariants (trace.go) it's unsafe
+       // for us to emit an event for a goroutine we do not own.
+       // The moment we CAS into _Gpreempted, suspendG could CAS the
+       // goroutine to _Gwaiting, effectively taking ownership. All of
         // this could happen before we even get the chance to emit
         // an event. The end result is that the events could appear
         // out of order, and the tracer generally assumes the scheduler
         // takes care of the ordering between GoPark and GoUnpark.
         //
         // The answer here is simple: emit the event while we still hold
-       // the _Gscan bit on the goroutine. We still need to traceAcquire
-       // and traceRelease across the CAS because the tracer could be
-       // what's calling suspendG in the first place, and we want the
-       // CAS and event emission to appear atomic to the tracer.
+       // the _Gscan bit on the goroutine, since the _Gscan bit means
+       // ownership over transitions.
+       //
+       // We still need to traceAcquire and traceRelease across the CAS
+       // because the tracer could be what's calling suspendG in the first
+       // place. This also upholds the tracer invariant that we must hold
+       // traceAcquire/traceRelease across the transition. However, we
+       // specifically *only* emit the event while we still have ownership.
         trace := traceAcquire()
         if trace.ok() {
                 trace.GoPark(traceBlockPreempted, 0)
@@ -4598,7 +4601,8 @@ func reentersyscall(pc, sp, bp uintptr) {
         if trace.ok() {
                 // Emit a trace event. Notably, actually emitting the event must happen before
                 // the casgstatus because it mutates the P, but the traceLocker must be held
-               // across the casgstatus so the transition is atomic with respect to the event.
+               // across the casgstatus since we're transitioning out of _Grunning
+               // (see trace.go invariants).
                 systemstack(func() {
                         trace.GoSysCall()
                 })
@@ -5249,7 +5253,8 @@ func newproc1(fn *funcval, callergp *g, callerpc uintptr, parked bool, waitreaso
         }
         gcController.addScannableStack(pp, int64(newg.stack.hi-newg.stack.lo))
  
-       // Get a goid and switch to runnable. Make all this atomic to the tracer.
+       // Get a goid and switch to runnable. This needs to happen under traceAcquire
+       // since it's a goroutine transition. See tracer invariants in trace.go.
         trace := traceAcquire()
         var status uint32 = _Grunnable
         if parked {
diff --git a/src/runtime/trace.go b/src/runtime/trace.go

index 2c712469ea66bc0ea1c0567eeb3a2209fcb7958e..7130e2c13627632fe788b73709737bd45af27042 100644 (file)
--- a/src/runtime/trace.go
+++ b/src/runtime/trace.go
@@ -2,20 +2,182 @@
  // Use of this source code is governed by a BSD-style
  // license that can be found in the LICENSE file.
  
-// Go execution tracer.
+// # Go execution tracer
+//
  // The tracer captures a wide range of execution events like goroutine
  // creation/blocking/unblocking, syscall enter/exit/block, GC-related events,
  // changes of heap size, processor start/stop, etc and writes them to a buffer
  // in a compact form. A precise nanosecond-precision timestamp and a stack
  // trace is captured for most events.
  //
-// Tracer invariants (to keep the synchronization making sense):
-// - An m that has a trace buffer must be on either the allm or sched.freem lists.
-// - Any trace buffer mutation must either be happening in traceAdvance or between
-//   a traceAcquire and a subsequent traceRelease.
-// - traceAdvance cannot return until the previous generation's buffers are all flushed.
+// ## Design
+//
+// The basic idea behind the execution tracer is to have per-M buffers that
+// trace data may be written into. Each M maintains a seqlock indicating whether
+// its trace buffer is currently in use.
+//
+// Tracing is initiated by StartTrace, and proceeds in "generations," with each
+// generation being marked by a call to traceAdvance, to advance to the next
+// generation. Generations are a global synchronization point for trace data,
+// and we proceed to a new generation by moving forward trace.gen. Each M reads
+// trace.gen under its own seqlock to determine which generation it is writing
+// trace data for. To this end, each M has 2 slots for buffers: one slot for the
+// previous generation, one slot for the current one. It uses tl.gen to select
+// which buffer slot to write to. Simultaneously, traceAdvance uses the seqlock
+// to determine whether every thread is guaranteed to observe an updated
+// trace.gen. Once it is sure, it may then flush any buffers that are left over
+// from the previous generation safely, since it knows the Ms will not mutate
+// them.
+//
+// Flushed buffers are processed by the ReadTrace function, which is called by
+// the trace reader goroutine. The first goroutine to call ReadTrace is designated
+// as the trace reader goroutine until tracing completes. (There may only be one at
+// a time.)
+//
+// Once all buffers are flushed, any extra post-processing complete, and flushed
+// buffers are processed by the trace reader goroutine, the trace emits an
+// EndOfGeneration event to mark the global synchronization point in the trace.
+//
+// All other trace features, including CPU profile samples, stack information,
+// string tables, etc., revolve around this generation system, and typically
+// appear in pairs: one for the previous generation, and one for the current one.
+// Like the per-M buffers, which of the two is written to is selected using trace.gen,
+// and anything managed this way must similarly be mutated only in traceAdvance or
+// under the M's seqlock.
+//
+// Trace events themselves are simple. They consist of a single byte for the event type,
+// followed by zero or more LEB128-encoded unsigned varints. They are decoded using
+// a pre-determined table for each trace version: internal/trace/tracev2.specs.
+//
+// To avoid relying on timestamps for correctness and validation, each G and P has
+// sequence counters that are written into trace events to encode a partial order.
+// The sequence counters reset on each generation. Ms do not need sequence counters
+// because they are the source of truth for execution: trace events, and even whole
+// buffers, are guaranteed to appear in order in the trace data stream, simply because
+// that's the order the thread emitted them in.
+//
+// See traceruntime.go for the API the tracer exposes to the runtime for emitting events.
+//
+// In each generation, we ensure that we enumerate all goroutines, such that each
+// generation's data is fully self-contained. This makes features like the flight
+// recorder easy to implement. To this end, we guarantee that every live goroutine is
+// listed at least once by emitting a status event for the goroutine, indicating its
+// starting state. These status events are emitted based on context, generally based
+// on the event that's about to be emitted.
+//
+// The traceEventWriter type encapsulates these details, and is the backbone of
+// the API exposed in traceruntime.go, though there are deviations where necessary.
+//
+// This is the overall design, but as always, there are many details. Beyond this,
+// look to the invariants and select corner cases below and the code itself for the
+// source of truth.
+//
+// See https://go.dev/issue/60773 for a link to a more complete design with rationale,
+// though parts of it are out-of-date.
+//
+// ## Invariants
+//
+// 1. An m that has a trace buffer MUST be on either the allm or sched.freem lists.
+//
+// Otherwise, traceAdvance might miss an M with a buffer that needs to be flushed.
+//
+// 2. Trace buffers MUST only be mutated in traceAdvance or under a traceAcquire/traceRelease.
+//
+// Otherwise, traceAdvance may race with Ms writing trace data when trying to flush buffers.
+//
+// 3. traceAdvance MUST NOT return until all of the current generation's buffers are flushed.
+//
+// Otherwise, callers cannot rely on all the data they need being available (for example, for
+// the flight recorder).
  //
-// See https://go.dev/issue/60773 for a link to the full design.
+// 4. P and goroutine state transition events MUST be emitted by an M that owns its ability
+//    to transition.
+//
+// What this means is that the M must either be the owner of the P, the owner of the goroutine,
+// or owner of a non-running goroutine's _Gscan bit. There are a lot of bad things that can
+// happen if this invariant isn't maintained, mostly around generating inconsistencies in the
+// trace due to racy emission of events.
+//
+// 5. Acquisition of a P (pidleget or takeP/gcstopP) MUST NOT be performed under a traceAcquire/traceRelease pair.
+//
+// Notably, it's important that traceAcquire/traceRelease not cover a state in which the
+// goroutine or P is not yet owned. For example, if traceAcquire is held across both wirep and
+// pidleget, then we could end up emitting an event in the wrong generation. Suppose T1
+// traceAcquires in generation 1, a generation transition happens, T2 emits a ProcStop and
+// executes pidleput in generation 2, and finally T1 calls pidleget and emits ProcStart.
+// The ProcStart must follow the ProcStop in the trace to make any sense, but ProcStop was
+// emitted in a later generation.
+//
+// 6. Goroutine state transitions, with the exception of transitions into _Grunning, MUST be
+//    performed under the traceAcquire/traceRelease pair where the event is emitted.
+//
+// Otherwise, traceAdvance may observe a goroutine state that is inconsistent with the
+// events being emitted. traceAdvance inspects all goroutines' states in order to emit
+// a status event for any goroutine that did not have an event emitted for it already.
+// If the generation then advances in between that observation and the event being emitted,
+// then the trace will contain a status that doesn't line up with the event. For example,
+// if the event is emitted after the state transition _Gwaiting -> _Grunnable, then
+// traceAdvance may observe the goroutine in _Grunnable, emit a status event, advance the
+// generation, and the following generation contains a GoUnblock event. The trace parser
+// will get confused because it sees that goroutine in _Grunnable in the previous generation
+// trying to be transitioned from _Gwaiting into _Grunnable in the following one. Something
+// similar happens if the trace event is emitted before the state transition, so that does
+// not help either.
+//
+// Transitions to _Grunning do not have the same problem because traceAdvance is unable to
+// observe running goroutines directly. It must stop them, or wait for them to emit an event.
+// Note that it cannot even stop them with asynchronous preemption in any "bad" window between
+// the state transition to _Grunning and the event emission because async preemption cannot
+// stop goroutines in the runtime.
+//
+// 7. Goroutine state transitions into _Grunning MUST emit an event for the transition after
+//    the state transition.
+//
+// This follows from invariants (4), (5), and the explanation of (6).
+// The relevant part of the previous invariant is that in order for the tracer to be unable to
+// stop a goroutine, it must be in _Grunning and in the runtime. So to close any windows between
+// event emission and the state transition, the event emission must happen *after* the transition
+// to _Grunning.
+//
+// ## Select corner cases
+//
+// ### CGO calls / system calls
+//
+// CGO calls and system calls are mostly straightforward, except for P stealing. For historical
+// reasons, this introduces a new trace-level P state called ProcSyscall which used to model
+// _Psyscall (now _Psyscall_unused). This state is used to indicate in the trace that a P
+// is eligible for stealing as part of the parser's ordering logic.
+//
+// Another quirk of this corner case is the ProcSyscallAbandoned trace-level P state, which
+// is used only in status events to indicate a relaxation of verification requirements. It
+// means that if the execution trace parser can't find the corresponding thread that the P
+// was stolen from in the state it expects it to be in, it should accept the trace anyway. This is also
+// historical. When _Psyscall still existed, one would steal and then ProcSteal, and there
+// was no ordering between the ProcSteal and the subsequent GoSyscallEndBlocked. One clearly
+// happened before the other, but since P stealing was a single atomic, there was no way
+// to enforce the order. The GoSyscallEndBlocked thread could move on and end up in any
+// state, and the GoSyscallEndBlocked could be in a completely different generation to the
+// ProcSteal. Today this is no longer possible as the ProcSteal is always ordered before
+// the GoSyscallEndBlocked event in the runtime.
+//
+// Both ProcSyscall and ProcSyscallAbandoned are likely no longer necessary.
+//
+// ### CGO callbacks
+//
+// When a C thread calls into Go, the execution tracer models that as the creation of a new
+// goroutine. When the thread exits back into C, that is modeled as the destruction of that
+// goroutine. These are the GoCreateSyscall and GoDestroySyscall events, which represent the
+// creation and destruction of a goroutine with its starting and ending states being _Gsyscall.
+//
+// This model is simple to reason about but contradicts the runtime implementation, which
+// doesn't do this directly for performance reasons. The runtime implementation instead caches
+// a G on the M created for the C thread. On Linux this M is then cached in the thread's TLS,
+// and on other systems, the M is put on a global list on exit from Go. We need to do some
+// extra work to make sure that this is modeled correctly in the tracer. For example,
+// a C thread exiting Go may leave a P hanging off of its M (whether that M is kept in TLS
+// or placed back on a list). In order to correctly model goroutine creation and destruction,
+// we must behave as if the P was at some point stolen by the runtime, if the C thread
+// reenters Go with the same M (and thus, same P) once more.
  
  package runtime
  
@@ -192,38 +354,14 @@ func StartTrace() error {
  
         // Stop the world.
         //
-       // The purpose of stopping the world is to make sure that no goroutine is in a
-       // context where it could emit an event by bringing all goroutines to a safe point
-       // with no opportunity to transition.
-       //
-       // The exception to this rule are goroutines that are concurrently exiting a syscall.
-       // Those will all be forced into the syscalling slow path, and we'll just make sure
-       // that we don't observe any goroutines in that critical section before starting
-       // the world again.
-       //
-       // A good follow-up question to this is why stopping the world is necessary at all
-       // given that we have traceAcquire and traceRelease. Unfortunately, those only help
-       // us when tracing is already active (for performance, so when tracing is off the
-       // tracing seqlock is left untouched). The main issue here is subtle: we're going to
-       // want to obtain a correct starting status for each goroutine, but there are windows
-       // of time in which we could read and emit an incorrect status. Specifically:
-       //
-       //      trace := traceAcquire()
-       //  // <----> problem window
-       //      casgstatus(gp, _Gwaiting, _Grunnable)
-       //      if trace.ok() {
-       //              trace.GoUnpark(gp, 2)
-       //              traceRelease(trace)
-       //      }
-       //
-       // More precisely, if we readgstatus for a gp while another goroutine is in the problem
-       // window and that goroutine didn't observe that tracing had begun, then we might write
-       // a GoStatus(GoWaiting) event for that goroutine, but it won't trace an event marking
-       // the transition from GoWaiting to GoRunnable. The trace will then be broken, because
-       // future events will be emitted assuming the tracer sees GoRunnable.
+       // What we need to successfully begin tracing is to make sure that the next time
+       // *any goroutine* hits a traceAcquire, it sees that the trace is enabled.
         //
-       // In short, what we really need here is to make sure that the next time *any goroutine*
-       // hits a traceAcquire, it sees that the trace is enabled.
+       // Stopping the world gets us most of the way there, since it makes sure that goroutines
+       // stop executing. There is however one exception: goroutines without Ps concurrently
+       // exiting a syscall. We handle this by making sure that, after we update trace.gen,
+       // there isn't a single goroutine calling traceAcquire on the syscall slow path by checking
+       // trace.exitingSyscall. See the comment on the check below for more details.
         //
         // Note also that stopping the world is necessary to make sure sweep-related events are
         // coherent. Since the world is stopped and sweeps are non-preemptible, we can never start
diff --git a/src/runtime/traceruntime.go b/src/runtime/traceruntime.go

index d3f82f6fceeaf4b8654ab5a0372b1994d90a8140..ad91d9c836c6ec4ac95568212e1f9d5880d924b7 100644 (file)
--- a/src/runtime/traceruntime.go
+++ b/src/runtime/traceruntime.go
@@ -541,10 +541,10 @@ func (tl traceLocker) ProcSteal(pp *p) {
         pp.trace.mSyscallID = -1
  
         // Emit the status of the P we're stealing. We may be just about to do this when creating the event
-       // writer but it's not guaranteed, even if inSyscall is true. Although it might seem like from a
-       // syscall context we're always stealing a P for ourselves, we may have not wired it up yet (so
+       // writer but it's not guaranteed, even if we're stealing from a syscall. Although it might seem like
+       // from a syscall context we're always stealing a P for ourselves, we may have not wired it up yet (so
         // it wouldn't be visible to eventWriter) or we may not even intend to wire it up to ourselves
-       // at all (e.g. entersyscall_gcwait).
+       // at all and plan to hand it back to the runtime.
         if !pp.trace.statusWasTraced(tl.gen) && pp.trace.acquireStatus(tl.gen) {
                 // Careful: don't use the event writer. We never want status or in-progress events
                 // to trigger more in-progress events.
author	Michael Anthony Knyszek <mknyszek@google.com>
	Wed, 1 Oct 2025 20:50:57 +0000 (20:50 +0000)
committer	Gopher Robot <gobot@golang.org>
	Thu, 30 Oct 2025 17:45:52 +0000 (10:45 -0700)
src/runtime/HACKING.md		patch \| blob \| history
src/runtime/proc.go		patch \| blob \| history
src/runtime/trace.go		patch \| blob \| history
src/runtime/traceruntime.go		patch \| blob \| history