sched.stopwait = gomaxprocs
sched.gcwaiting.Store(true)
preemptall()
- // stop current P
+
+ // Stop current P.
gp.m.p.ptr().status = _Pgcstop // Pgcstop is only diagnostic.
gp.m.p.ptr().gcStopTime = start
sched.stopwait--
- // try to retake all P's in Psyscall status
- trace = traceAcquire()
+
+ // Try to retake all Ps in syscalls.
for _, pp := range allp {
- s := pp.status
- if s == _Psyscall && atomic.Cas(&pp.status, s, _Pgcstop) {
- if trace.ok() {
- trace.ProcSteal(pp, false)
- }
- sched.nGsyscallNoP.Add(1)
- pp.syscalltick++
- pp.gcStopTime = nanotime()
- sched.stopwait--
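+ // Prevent the thread from leaving its syscall, move its P
+ // to _Pgcstop, then let the thread continue.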
+ if thread, ok := setBlockOnExitSyscall(pp); ok {
+ thread.gcstopP()
+ thread.resume()
}
}
- if trace.ok() {
- traceRelease(trace)
- }
- // stop idle P's
+ // Stop idle Ps.
now := nanotime()
for {
pp, _ := pidleget(now)
wait := sched.stopwait > 0
unlock(&sched.lock)
- // wait for remaining P's to stop voluntarily
+ // Wait for remaining Ps to stop voluntarily.
if wait {
for {
// wait for 100us, then try to re-preempt in case of any races
}
preemptall()
- // Any P entering _Pidle or _Psyscall from now on will observe
+ // Any P entering _Pidle or a system call from now on will observe
// p.runSafePointFn == 1 and will call runSafePointFn when
- // changing its status to _Pidle/_Psyscall.
+// changing its status to _Pidle or when entering the syscall.
// Run safe point function for all idle Ps. sched.pidle will
// not change because we hold sched.lock.
// Run fn for the current P.
fn(pp)
- // Force Ps currently in _Psyscall into _Pidle and hand them
+ // Force Ps currently in a system call into _Pidle and hand them
// off to induce safe point function execution.
for _, p2 := range allp {
- s := p2.status
-
// We need to be fine-grained about tracing here, since handoffp
// might call into the tracer, and the tracer is non-reentrant.
- trace := traceAcquire()
- if s == _Psyscall && p2.runSafePointFn == 1 && atomic.Cas(&p2.status, s, _Pidle) {
- if trace.ok() {
- // It's important that we traceRelease before we call handoffp, which may also traceAcquire.
- trace.ProcSteal(p2, false)
- traceRelease(trace)
- }
- sched.nGsyscallNoP.Add(1)
- p2.syscalltick++
+ if atomic.Load(&p2.runSafePointFn) != 1 {
+ // Already ran it.
+ continue
+ }
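+ // Prevent the thread from leaving its syscall, take its P,
+ // and hand the P off to induce the safe point function to run.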
+ if thread, ok := setBlockOnExitSyscall(p2); ok {
+ thread.takeP()
+ thread.resume()
handoffp(p2)
- } else if trace.ok() {
- traceRelease(trace)
}
}
// }
//
// runSafePointFn must be checked on any transition in to _Pidle or
-// _Psyscall to avoid a race where forEachP sees that the P is running
-// just before the P goes into _Pidle/_Psyscall and neither forEachP
-// nor the P run the safe-point function.
+// when entering a system call to avoid a race where forEachP sees
+// that the P is running just before the P goes idle or enters a
+// system call and neither forEachP nor the P runs the safe-point function.
func runSafePointFn() {
p := getg().m.p.ptr()
// Resolve the race between forEachP running the safe-point
// So that the destructor would invoke dropm while the non-Go thread is exiting.
// This is much faster since it avoids expensive signal-related syscalls.
//
-// This always runs without a P, so //go:nowritebarrierrec is required.
+// This may run without a P, so //go:nowritebarrierrec is required.
//
// This may run with a different stack than was recorded in g0 (there is no
// call to callbackUpdateSystemStack prior to dropm), so this must be
//
//go:nosplit
func reentersyscall(pc, sp, bp uintptr) {
- trace := traceAcquire()
gp := getg()
// Disable preemption because during this function g is in Gsyscall status,
gp.stackguard0 = stackPreempt
gp.throwsplit = true
+ // Copy the syscalltick over so we can identify if the P got stolen later.
+ gp.m.syscalltick = gp.m.p.ptr().syscalltick
+
+ pp := gp.m.p.ptr()
+ if pp.runSafePointFn != 0 {
+ // runSafePointFn may stack split if run on this stack
+ systemstack(runSafePointFn)
+ }
+ gp.m.oldp.set(pp)
+
// Leave SP around for GC and traceback.
save(pc, sp, bp)
gp.syscallsp = sp
gp.syscallpc = pc
gp.syscallbp = bp
- casgstatus(gp, _Grunning, _Gsyscall)
- if staticLockRanking {
- // When doing static lock ranking casgstatus can call
- // systemstack which clobbers g.sched.
- save(pc, sp, bp)
- }
+
+ // Double-check sp and bp.
if gp.syscallsp < gp.stack.lo || gp.stack.hi < gp.syscallsp {
systemstack(func() {
print("entersyscall inconsistent sp ", hex(gp.syscallsp), " [", hex(gp.stack.lo), ",", hex(gp.stack.hi), "]\n")
throw("entersyscall")
})
}
-
+ trace := traceAcquire()
if trace.ok() {
+ // Emit a trace event. Notably, actually emitting the event must happen before
+ // the casgstatus because it mutates the P, but the traceLocker must be held
+ // across the casgstatus so the transition is atomic with respect to the event.
systemstack(func() {
trace.GoSysCall()
- traceRelease(trace)
})
- // systemstack itself clobbers g.sched.{pc,sp} and we might
- // need them later when the G is genuinely blocked in a
- // syscall
+ // systemstack clobbered gp.sched, so restore it.
save(pc, sp, bp)
}
-
- if sched.sysmonwait.Load() {
- systemstack(entersyscall_sysmon)
+ if sched.gcwaiting.Load() {
+ // Optimization: If there's a pending STW, do the equivalent of
+ // entersyscallblock here at the last minute and immediately give
+ // away our P.
+ systemstack(func() {
+ entersyscallHandleGCWait(trace)
+ })
+ // systemstack clobbered gp.sched, so restore it.
save(pc, sp, bp)
}
-
- if gp.m.p.ptr().runSafePointFn != 0 {
- // runSafePointFn may stack split if run on this stack
- systemstack(runSafePointFn)
+ // As soon as we switch to _Gsyscall, we are in danger of losing our P.
+ // We must not touch it after this point.
+ casgstatus(gp, _Grunning, _Gsyscall)
+ if staticLockRanking {
+ // casgstatus clobbers gp.sched via systemstack under staticLockRanking. Restore it.
save(pc, sp, bp)
}
-
- gp.m.syscalltick = gp.m.p.ptr().syscalltick
- pp := gp.m.p.ptr()
- pp.m = 0
- gp.m.oldp.set(pp)
- gp.m.p = 0
- atomic.Store(&pp.status, _Psyscall)
- if sched.gcwaiting.Load() {
- systemstack(entersyscall_gcwait)
+ if trace.ok() {
+ // N.B. We don't need to go on the systemstack because traceRelease is very
+ // carefully recursively nosplit. This also means we don't need to worry
+ // about clobbering gp.sched.
+ traceRelease(trace)
+ }
+ if sched.sysmonwait.Load() {
+ systemstack(entersyscallWakeSysmon)
+ // systemstack clobbered gp.sched, so restore it.
save(pc, sp, bp)
}
-
gp.m.locks--
}
reentersyscall(sys.GetCallerPC(), sys.GetCallerSP(), fp)
}
-func entersyscall_sysmon() {
+func entersyscallWakeSysmon() {
lock(&sched.lock)
if sched.sysmonwait.Load() {
sched.sysmonwait.Store(false)
unlock(&sched.lock)
}
-func entersyscall_gcwait() {
+func entersyscallHandleGCWait(trace traceLocker) {
gp := getg()
- pp := gp.m.oldp.ptr()
lock(&sched.lock)
- trace := traceAcquire()
- if sched.stopwait > 0 && atomic.Cas(&pp.status, _Psyscall, _Pgcstop) {
+ if sched.stopwait > 0 {
+ // Set our P to _Pgcstop so the STW can take it.
+ pp := gp.m.p.ptr()
+ pp.m = 0
+ gp.m.p = 0
+ atomic.Store(&pp.status, _Pgcstop)
+
if trace.ok() {
- // This is a steal in the new tracer. While it's very likely
- // that we were the ones to put this P into _Psyscall, between
- // then and now it's totally possible it had been stolen and
- // then put back into _Psyscall for us to acquire here. In such
- // case ProcStop would be incorrect.
- //
- // TODO(mknyszek): Consider emitting a ProcStop instead when
- // gp.m.syscalltick == pp.syscalltick, since then we know we never
- // lost the P.
- trace.ProcSteal(pp, true)
- traceRelease(trace)
+ trace.ProcStop(pp)
}
sched.nGsyscallNoP.Add(1)
pp.gcStopTime = nanotime()
if sched.stopwait--; sched.stopwait == 0 {
notewakeup(&sched.stopnote)
}
- } else if trace.ok() {
- traceRelease(trace)
}
unlock(&sched.lock)
}
throw("entersyscallblock")
})
}
+
+ // Once we switch to _Gsyscall, we can't safely touch
+ // our P anymore, so we need to hand it off beforehand.
+ // The tracer also needs to see the syscall before the P
+ // handoff, so the order here must be (1) trace,
+ // (2) handoff, (3) _Gsyscall switch.
+ trace := traceAcquire()
+ systemstack(func() {
+ if trace.ok() {
+ trace.GoSysCall()
+ }
+ handoffp(releasep())
+ })
+ // Caution: we're in a small window where we are in _Grunning without a P.
casgstatus(gp, _Grunning, _Gsyscall)
if gp.syscallsp < gp.stack.lo || gp.stack.hi < gp.syscallsp {
systemstack(func() {
throw("entersyscallblock")
})
}
-
- systemstack(entersyscallblock_handoff)
+ if trace.ok() {
+ systemstack(func() {
+ traceRelease(trace)
+ })
+ }
// Resave for traceback during blocked call.
save(sys.GetCallerPC(), sys.GetCallerSP(), getcallerfp())
gp.m.locks--
}
-func entersyscallblock_handoff() {
- trace := traceAcquire()
- if trace.ok() {
- trace.GoSysCall()
- traceRelease(trace)
- }
- handoffp(releasep())
-}
-
// The goroutine g exited its system call.
// Arrange for it to run on a cpu again.
// This is called only from the go syscall library, not
if sys.GetCallerSP() > gp.syscallsp {
throw("exitsyscall: syscall frame is no longer valid")
}
-
gp.waitsince = 0
+
+ if sched.stopwait == freezeStopWait {
+ // Wedge ourselves if there's an outstanding freezetheworld.
+ // If we transition to running, we might end up with our traceback
+ // being taken twice.
+ systemstack(func() {
+ lock(&deadlock)
+ lock(&deadlock)
+ })
+ }
+
+ // Optimistically assume we're going to keep running, and switch to running.
+ // Before this point, our P wiring is not ours. Once we get past this point,
+ // we can access our P if we have it, otherwise we lost it.
+ //
+ // N.B. Because we're transitioning to _Grunning here, traceAcquire doesn't
+ // need to be held ahead of time. We're effectively atomic with respect to
+ // the tracer because we're non-preemptible and in the runtime. It can't stop
+ // us to read a bad status.
+ casgstatus(gp, _Gsyscall, _Grunning)
+
+ // Caution: we're in a window where we may be in _Grunning without a P.
+ // Either we will grab a P or call exitsyscall0, where we'll switch to
+ // _Grunnable.
+
+ // Grab and clear our old P.
oldp := gp.m.oldp.ptr()
- gp.m.oldp = 0
- if exitsyscallfast(oldp) {
- // When exitsyscallfast returns success, we have a P so can now use
- // write barriers
+ gp.m.oldp.set(nil)
+
+ // Check if we still have a P, and if not, try to acquire an idle P.
+ pp := gp.m.p.ptr()
+ if pp != nil {
+ // Fast path: we still have our P. Just emit a syscall exit event.
+ if trace := traceAcquire(); trace.ok() {
+ systemstack(func() {
+ // In truth we never lost the P, but syscalltick is used to
+ // indicate whether the P should be treated as lost anyway,
+ // for example when syscalltick is trashed by dropm.
+ //
+ // TODO(mknyszek): Consider a more explicit mechanism for this.
+ // Then syscalltick doesn't need to be trashed, and can be used
+ // exclusively by sysmon for deciding when it's time to retake.
+ if pp.syscalltick == gp.m.syscalltick {
+ trace.GoSysExit(false)
+ } else {
+ // Since we need to pretend we lost the P, but nobody ever
+ // took it, we need a ProcSteal event to model the loss.
+ // Then, continue with everything else we'd do if we lost
+ // the P.
+ trace.ProcSteal(pp)
+ trace.ProcStart()
+ trace.GoSysExit(true)
+ trace.GoStart()
+ }
+ traceRelease(trace)
+ })
+ }
+ } else {
+ // Slow path: we lost our P. Try to get another one.
+ systemstack(func() {
+ // Try to get some other P.
+ if pp := exitsyscallTryGetP(oldp); pp != nil {
+ // Install the P.
+ acquirepNoTrace(pp)
+
+ // We're going to start running again, so emit all the relevant events.
+ if trace := traceAcquire(); trace.ok() {
+ trace.ProcStart()
+ trace.GoSysExit(true)
+ trace.GoStart()
+ traceRelease(trace)
+ }
+ }
+ })
+ pp = gp.m.p.ptr()
+ }
+
+ // If we have a P, clean up and exit.
+ if pp != nil {
if goroutineProfile.active {
// Make sure that gp has had its stack written out to the goroutine
// profile, exactly as it was when the goroutine profiler first
tryRecordGoroutineProfileWB(gp)
})
}
- trace := traceAcquire()
- if trace.ok() {
- lostP := oldp != gp.m.p.ptr() || gp.m.syscalltick != gp.m.p.ptr().syscalltick
- systemstack(func() {
- // Write out syscall exit eagerly.
- //
- // It's important that we write this *after* we know whether we
- // lost our P or not (determined by exitsyscallfast).
- trace.GoSysExit(lostP)
- if lostP {
- // We lost the P at some point, even though we got it back here.
- // Trace that we're starting again, because there was a tracev2.GoSysBlock
- // call somewhere in exitsyscallfast (indicating that this goroutine
- // had blocked) and we're about to start running again.
- trace.GoStart()
- }
- })
- }
- // There's a cpu for us, so we can run.
- gp.m.p.ptr().syscalltick++
- // We need to cas the status and scan before resuming...
- casgstatus(gp, _Gsyscall, _Grunning)
- if trace.ok() {
- traceRelease(trace)
- }
+
+ // Increment the syscalltick for P, since we're exiting a syscall.
+ pp.syscalltick++
// Garbage collector isn't running (since we are),
// so okay to clear syscallsp.
gp.syscallsp = 0
gp.m.locks--
if gp.preempt {
- // restore the preemption request in case we've cleared it in newstack
+ // Restore the preemption request in case we cleared it in newstack.
gp.stackguard0 = stackPreempt
} else {
- // otherwise restore the real stackGuard, we've spoiled it in entersyscall/entersyscallblock
+ // Otherwise restore the real stackGuard, we clobbered it in entersyscall/entersyscallblock.
gp.stackguard0 = gp.stack.lo + stackGuard
}
gp.throwsplit = false
// Scheduling of this goroutine is disabled.
Gosched()
}
-
return
}
-
+ // Slowest path: We couldn't get a P, so call into the scheduler.
gp.m.locks--
// Call the scheduler.
- mcall(exitsyscall0)
+ mcall(exitsyscallNoP)
// Scheduler returned, so we're allowed to run now.
// Delete the syscallsp information that we left for
gp.throwsplit = false
}
-//go:nosplit
-func exitsyscallfast(oldp *p) bool {
- // Freezetheworld sets stopwait but does not retake P's.
- if sched.stopwait == freezeStopWait {
- return false
- }
-
- // Try to re-acquire the last P.
- trace := traceAcquire()
- if oldp != nil && oldp.status == _Psyscall && atomic.Cas(&oldp.status, _Psyscall, _Pidle) {
- // There's a cpu for us, so we can run.
- wirep(oldp)
- exitsyscallfast_reacquired(trace)
- if trace.ok() {
- traceRelease(trace)
+// exitsyscallTryGetP is exitsyscall's attempt to get any P when it's
+// missing one. It returns the acquired P, or nil if none was available.
+//
+// Must execute on the systemstack because exitsyscall is nosplit.
+//
+//go:systemstack
+func exitsyscallTryGetP(oldp *p) *p {
+ // Try to steal our old P back.
+ if oldp != nil {
+ if thread, ok := setBlockOnExitSyscall(oldp); ok {
+ thread.takeP()
+ thread.resume()
+ sched.nGsyscallNoP.Add(-1) // takeP adds 1.
+ return oldp
}
- return true
- }
- if trace.ok() {
- traceRelease(trace)
}
- // Try to get any other idle P.
+ // Try to get an idle P.
if sched.pidle != 0 {
- var ok bool
- systemstack(func() {
- ok = exitsyscallfast_pidle()
- })
- if ok {
- return true
+ lock(&sched.lock)
+ pp, _ := pidleget(0)
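+ // If we got a P, we're about to run again, so wake sysmon if it was sleeping.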
+ if pp != nil && sched.sysmonwait.Load() {
+ sched.sysmonwait.Store(false)
+ notewakeup(&sched.sysmonnote)
}
- }
- return false
-}
-
-// exitsyscallfast_reacquired is the exitsyscall path on which this G
-// has successfully reacquired the P it was running on before the
-// syscall.
-//
-//go:nosplit
-func exitsyscallfast_reacquired(trace traceLocker) {
- gp := getg()
- if gp.m.syscalltick != gp.m.p.ptr().syscalltick {
- if trace.ok() {
- // The p was retaken and then enter into syscall again (since gp.m.syscalltick has changed).
- // tracev2.GoSysBlock for this syscall was already emitted,
- // but here we effectively retake the p from the new syscall running on the same p.
- systemstack(func() {
- // We're stealing the P. It's treated
- // as if it temporarily stopped running. Then, start running.
- trace.ProcSteal(gp.m.p.ptr(), true)
- trace.ProcStart()
- })
+ unlock(&sched.lock)
+ if pp != nil {
+ sched.nGsyscallNoP.Add(-1)
+ return pp
}
- gp.m.p.ptr().syscalltick++
- }
-}
-
-func exitsyscallfast_pidle() bool {
- lock(&sched.lock)
- pp, _ := pidleget(0)
- if pp != nil && sched.sysmonwait.Load() {
- sched.sysmonwait.Store(false)
- notewakeup(&sched.sysmonnote)
- }
- unlock(&sched.lock)
- if pp != nil {
- sched.nGsyscallNoP.Add(-1)
- acquirep(pp)
- return true
}
- return false
+ return nil
}
// exitsyscall slow path on g0.
// Called via mcall, so gp is the calling g from this M.
//
//go:nowritebarrierrec
-func exitsyscall0(gp *g) {
- var trace traceLocker
+func exitsyscallNoP(gp *g) {
traceExitingSyscall()
- trace = traceAcquire()
- casgstatus(gp, _Gsyscall, _Grunnable)
+ trace := traceAcquire()
+ casgstatus(gp, _Grunning, _Grunnable)
traceExitedSyscall()
if trace.ok() {
// Write out syscall exit eagerly.
//
//go:yeswritebarrierrec
func acquirep(pp *p) {
+ // Do the work.
+ acquirepNoTrace(pp)
+
+ // Emit the event.
+ trace := traceAcquire()
+ if trace.ok() {
+ trace.ProcStart()
+ traceRelease(trace)
+ }
+}
+
+// acquirepNoTrace is acquirep without emitting the trace event.
+//
+//go:yeswritebarrierrec
+func acquirepNoTrace(pp *p) {
// Do the part that isn't allowed to have write barriers.
wirep(pp)
// Perform deferred mcache flush before this P can allocate
// from a potentially stale mcache.
pp.mcache.prepareForSweep()
-
- trace := traceAcquire()
- if trace.ok() {
- trace.ProcStart()
- traceRelease(trace)
- }
}
// wirep is the first step of acquirep, which actually associates the
// temporarily drop the allpLock. Hence, we need to re-fetch
// allp each time around the loop.
for i := 0; i < len(allp); i++ {
+ // Quickly filter out non-running Ps. Running Ps are either
+ // in a syscall or are actually executing. Idle Ps don't
+ // need to be retaken.
+ //
+ // This is best-effort, so it's OK that it's racy. Our target
+ // is to retake Ps that have been running or in a syscall for
+ // a long time (milliseconds), so the state has plenty of time
+ // to stabilize.
pp := allp[i]
- if pp == nil {
- // This can happen if procresize has grown
+ if pp == nil || atomic.Load(&pp.status) != _Prunning {
+ // pp can be nil if procresize has grown
// allp but not yet created new Ps.
continue
}
pd := &pp.sysmontick
- s := pp.status
sysretake := false
- if s == _Prunning || s == _Psyscall {
- // Preempt G if it's running on the same schedtick for
- // too long. This could be from a single long-running
- // goroutine or a sequence of goroutines run via
- // runnext, which share a single schedtick time slice.
- t := int64(pp.schedtick)
- if int64(pd.schedtick) != t {
- pd.schedtick = uint32(t)
- pd.schedwhen = now
- } else if pd.schedwhen+forcePreemptNS <= now {
- preemptone(pp)
- // In case of syscall, preemptone() doesn't
- // work, because there is no M wired to P.
- sysretake = true
- }
+
+ // Preempt G if it's running on the same schedtick for
+ // too long. This could be from a single long-running
+ // goroutine or a sequence of goroutines run via
+ // runnext, which share a single schedtick time slice.
+ schedt := int64(pp.schedtick)
+ if int64(pd.schedtick) != schedt {
+ pd.schedtick = uint32(schedt)
+ pd.schedwhen = now
+ } else if pd.schedwhen+forcePreemptNS <= now {
+ preemptone(pp)
+ // If pp is in a syscall, preemptone doesn't work.
+ // Neither the goroutine nor the thread can respond to a
+ // preemption request because they're not running Go code,
+ // so we need to take the P ourselves.
+ sysretake = true
+ }
+
+ // Drop allpLock so we can take sched.lock.
+ unlock(&allpLock)
+
+ // Need to decrement number of idle locked M's (pretending that
+ // one more is running) before we take the P and resume.
+ // Otherwise the M from which we retake can exit the syscall,
+ // increment nmidle and report deadlock.
+ //
+ // Can't call incidlelocked once we setBlockOnExitSyscall, due
+ // to a lock ordering violation between sched.lock and _Gscan.
+ incidlelocked(-1)
+
+ // Try to prevent the P from continuing in the syscall, if it's in one at all.
+ thread, ok := setBlockOnExitSyscall(pp)
+ if !ok {
+ // Not in a syscall, or something changed out from under us.
+ goto done
}
- if s == _Psyscall {
- // Retake P from syscall if it's there for more than 1 sysmon tick (at least 20us).
- t := int64(pp.syscalltick)
- if !sysretake && int64(pd.syscalltick) != t {
- pd.syscalltick = uint32(t)
- pd.syscallwhen = now
- continue
- }
- // On the one hand we don't want to retake Ps if there is no other work to do,
- // but on the other hand we want to retake them eventually
- // because they can prevent the sysmon thread from deep sleep.
- if runqempty(pp) && sched.nmspinning.Load()+sched.npidle.Load() > 0 && pd.syscallwhen+10*1000*1000 > now {
- continue
- }
- // Drop allpLock so we can take sched.lock.
- unlock(&allpLock)
- // Need to decrement number of idle locked M's
- // (pretending that one more is running) before the CAS.
- // Otherwise the M from which we retake can exit the syscall,
- // increment nmidle and report deadlock.
- incidlelocked(-1)
- trace := traceAcquire()
- if atomic.Cas(&pp.status, s, _Pidle) {
- if trace.ok() {
- trace.ProcSteal(pp, false)
- traceRelease(trace)
- }
- sched.nGsyscallNoP.Add(1)
- n++
- pp.syscalltick++
- handoffp(pp)
- } else if trace.ok() {
- traceRelease(trace)
- }
- incidlelocked(1)
- lock(&allpLock)
+
+ // Retake the P if it's there for more than 1 sysmon tick (at least 20us).
+ if syst := int64(pp.syscalltick); !sysretake && int64(pd.syscalltick) != syst {
+ pd.syscalltick = uint32(syst)
+ pd.syscallwhen = now
+ thread.resume()
+ goto done
+ }
+
+ // On the one hand we don't want to retake Ps if there is no other work to do,
+ // but on the other hand we want to retake them eventually
+ // because they can prevent the sysmon thread from deep sleep.
+ if runqempty(pp) && sched.nmspinning.Load()+sched.npidle.Load() > 0 && pd.syscallwhen+10*1000*1000 > now {
+ thread.resume()
+ goto done
}
+
+ // Take the P. Note: because we have the scan bit, the goroutine
+ // is at worst stuck spinning in exitsyscall.
+ thread.takeP()
+ thread.resume()
+ n++
+
+ // Handoff the P for some other thread to run it.
+ handoffp(pp)
+
+ // The P has been handed off to another thread, so the risk of a
+ // false deadlock report while we hold onto it is gone.
+ done:
+ incidlelocked(1)
+ lock(&allpLock)
}
unlock(&allpLock)
return uint32(n)
}
+// syscallingThread represents a thread in a system call that has been
+// temporarily prevented from advancing out of it.
+type syscallingThread struct {
+ gp *g // goroutine blocked in the syscall
+ mp *m // M running gp
+ pp *p // P wired to mp when the block was installed
+ status uint32 // gp's status (sans _Gscan); restored by resume
+}
+
+// setBlockOnExitSyscall prevents pp's thread from advancing out of
+// exitsyscall. On success, it returns the g/m/p state of the thread
+// and true. At that point, the caller owns the g/m/p links referenced,
+// the goroutine is in _Gsyscall (or _Gdeadextra) and is prevented from
+// transitioning out of it. On failure, it returns false, and none of
+// these guarantees are made.
+//
+// Callers must call resume on the returned thread state once they're
+// done with it, otherwise the thread will remain blocked forever.
+//
+// This function races with state changes on pp, and thus may fail
+// if pp is not in a system call, or exits a system call concurrently
+// with this function. However, this function is safe to call without
+// any additional synchronization.
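+//
+// Typical usage, mirroring the call sites in this change (sketch):
+//
+//	if thread, ok := setBlockOnExitSyscall(pp); ok {
+//		thread.takeP()  // or thread.gcstopP() during STW, with sched.lock held
+//		thread.resume() // always resume, or the thread stays blocked
+//	}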
+func setBlockOnExitSyscall(pp *p) (syscallingThread, bool) {
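+ // Racy fast-path check: a P whose thread is in a syscall stays
+ // in _Prunning, so any other status means there's nothing to block.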
+ if pp.status != _Prunning {
+ return syscallingThread{}, false
+ }
+ // Be very careful here, these reads are intentionally racy.
+ // Once we notice the G is in _Gsyscall, acquire its scan bit,
+ // and validate that it's still connected to the *same* M and P,
+ // we can actually get to work. Holding the scan bit will prevent
+ // the G from exiting the syscall.
+ //
+ // Our goal here is to interrupt long syscalls. If it turns out
+ // that we're wrong and the G switched to another syscall while
+ // we were trying to do this, that's completely fine. It's
+ // probably making more frequent syscalls and the typical
+ // preemption paths should be effective.
+ mp := pp.m.ptr()
+ if mp == nil {
+ // Nothing to do.
+ return syscallingThread{}, false
+ }
+ gp := mp.curg
+ if gp == nil {
+ // Nothing to do.
+ return syscallingThread{}, false
+ }
+ status := readgstatus(gp) &^ _Gscan
+
+ // A goroutine is considered in a syscall, and may have a corresponding
+ // P, if it's in _Gsyscall *or* _Gdeadextra. In the latter case, it's an
+ // extra M goroutine.
+ if status != _Gsyscall && status != _Gdeadextra {
+ // Not in a syscall, nothing to do.
+ return syscallingThread{}, false
+ }
+ if !castogscanstatus(gp, status, status|_Gscan) {
+ // Not in _Gsyscall or _Gdeadextra anymore. Nothing to do.
+ return syscallingThread{}, false
+ }
+ if gp.m != mp || gp.m.p.ptr() != pp {
+ // This is not what we originally observed. Nothing to do.
+ casfrom_Gscanstatus(gp, status|_Gscan, status)
+ return syscallingThread{}, false
+ }
+ return syscallingThread{gp, mp, pp, status}, true
+}
+
+// gcstopP unwires the P attached to the syscalling thread
+// and moves it into the _Pgcstop state.
+//
+// The caller must be stopping the world.
+func (s syscallingThread) gcstopP() {
+ assertLockHeld(&sched.lock)
+
+ s.releaseP(_Pgcstop)
+ s.pp.gcStopTime = nanotime()
+ sched.stopwait--
+}
+
+// takeP unwires the P attached to the syscalling thread
+// and moves it into the _Pidle state.
+func (s syscallingThread) takeP() {
+ s.releaseP(_Pidle)
+}
+
+// releaseP unwires the P from the syscalling thread, moving
+// it to the provided state. Callers should prefer to use
+// takeP and gcstopP.
+func (s syscallingThread) releaseP(state uint32) {
+ if state != _Pidle && state != _Pgcstop {
+ throw("attempted to release P into a bad state")
+ }
+ trace := traceAcquire()
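+ // Unwire the P from the blocked thread.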
+ s.pp.m = 0
+ s.mp.p = 0
+ atomic.Store(&s.pp.status, state)
+ if trace.ok() {
+ trace.ProcSteal(s.pp)
+ traceRelease(trace)
+ }
+ sched.nGsyscallNoP.Add(1)
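+ // Bump syscalltick so the old owner can tell it lost the P (see exitsyscall).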
+ s.pp.syscalltick++
+}
+
+// resume allows a syscalling thread to advance beyond exitsyscall.
+func (s syscallingThread) resume() {
+ casfrom_Gscanstatus(s.gp, s.status|_Gscan, s.status)
+}
+
// Tell all goroutines that they have been preempted and they should stop.
// This function is purely best-effort. It can fail to inform a goroutine if a
// processor just started running it.
if gp == nil || gp == mp.g0 {
return false
}
+ if readgstatus(gp)&^_Gscan == _Gsyscall {
+ // Don't bother trying to preempt a goroutine in a syscall.
+ return false
+ }
gp.preempt = true