From: Michael Anthony Knyszek <mknyszek@google.com>
Date: Tue, 23 Dec 2025 17:16:17 +0000 (+0000)
Subject: runtime: fix nGsyscallNoP accounting
X-Git-Tag: go1.26rc2~7^2~40
X-Git-Url: http://www.git.cypherpunks.su/?a=commitdiff_plain;h=81283ad33985958c63e43ada6c168da49e62de0f;p=gostls13.git

runtime: fix nGsyscallNoP accounting

CL 726964 has two bugs.

One is fairly obvious. Where there was previous a decrement of
nGsyscallNoP in exitsyscallTryGetP, it added a call to addGSyscallNoP.
Oops.

The other is more subtle. In needm we set isExtraInC to false very
early. This will cause exitsyscall (via cgocallbackg) to decrement
nGsyscallNoP when the thread never had a corresponding increment. (It
could not have, otherwise it would not have called needm, on Linux
anyway.) The fix is simple: increment nGsyscallNoP. CL 726964 actually
removed this increment erroneously. I'm pretty sure I removed it because
the first bug was the real issue, and removing this increment "fixed it"
in the context of the test. I was right that this case was subtle, but
wrong about how.

Fixes #76435.

Change-Id: I1ff1dfbf43bd4cf536b0965da370fee58e3f8753
Reviewed-on: https://go-review.googlesource.com/c/go/+/732320
Reviewed-by: Cherry Mui <cherryyz@google.com>
Auto-Submit: Michael Knyszek <mknyszek@google.com>
LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com>
---

diff --git a/src/runtime/metrics_cgo_test.go b/src/runtime/metrics_cgo_test.go
index 6cc9d23195..ef1e3dd71d 100644
--- a/src/runtime/metrics_cgo_test.go
+++ b/src/runtime/metrics_cgo_test.go
@@ -12,7 +12,7 @@ import (
 	"testing"
 )
 
-func TestNotInGoMetricCallback(t *testing.T) {
+func TestNotInGoMetric(t *testing.T) {
 	switch runtime.GOOS {
 	case "windows", "plan9":
 		t.Skip("unsupported on Windows and Plan9")
@@ -22,11 +22,22 @@ func TestNotInGoMetricCallback(t *testing.T) {
 		}
 	}
 
-	// This test is run in a subprocess to prevent other tests from polluting the metrics
-	// and because we need to make some cgo callbacks.
-	output := runTestProg(t, "testprogcgo", "NotInGoMetricCallback")
-	want := "OK\n"
-	if output != want {
-		t.Fatalf("output:\n%s\n\nwanted:\n%s", output, want)
+	run := func(t *testing.T, name string) {
+		// This test is run in a subprocess to prevent other tests from polluting the metrics
+		// and because we need to make some cgo callbacks.
+		output := runTestProg(t, "testprogcgo", name)
+		want := "OK\n"
+		if output != want {
+			t.Fatalf("output:\n%s\n\nwanted:\n%s", output, want)
+		}
 	}
+	t.Run("CgoCall", func(t *testing.T) {
+		run(t, "NotInGoMetricCgoCall")
+	})
+	t.Run("CgoCallback", func(t *testing.T) {
+		run(t, "NotInGoMetricCgoCallback")
+	})
+	t.Run("CgoCallAndCallback", func(t *testing.T) {
+		run(t, "NotInGoMetricCgoCallAndCallback")
+	})
 }
diff --git a/src/runtime/proc.go b/src/runtime/proc.go
index 5ea96f03f5..005c875cbf 100644
--- a/src/runtime/proc.go
+++ b/src/runtime/proc.go
@@ -2455,8 +2455,16 @@ func needm(signal bool) {
 	// mp.curg is now a real goroutine.
 	casgstatus(mp.curg, _Gdeadextra, _Gsyscall)
 	sched.ngsys.Add(-1)
-	// N.B. We do not update nGsyscallNoP, because isExtraInC threads are not
-	// counted as real goroutines while they're in C.
+
+	// This is technically inaccurate, but we set isExtraInC to false above,
+	// and so we need to update addGSyscallNoP to keep the two pieces of state
+	// consistent (it's only updated when isExtraInC is false). More specifically,
+	// When we get to cgocallbackg and exitsyscall, we'll be looking for a P, and
+	// since isExtraInC is false, we will decrement this metric.
+	//
+	// The inaccuracy is thankfully transient: only until this thread can get a P.
+	// We're going into Go anyway, so it's okay to pretend we're a real goroutine now.
+	addGSyscallNoP(mp)
 
 	if !signal {
 		if trace.ok() {
@@ -5027,7 +5035,7 @@ func exitsyscallTryGetP(oldp *p) *p {
 	if oldp != nil {
 		if thread, ok := setBlockOnExitSyscall(oldp); ok {
 			thread.takeP()
-			addGSyscallNoP(thread.mp) // takeP does the opposite, but this is a net zero change.
+			decGSyscallNoP(getg().m) // We got a P for ourselves.
 			thread.resume()
 			return oldp
 		}
diff --git a/src/runtime/testdata/testprogcgo/notingo.go b/src/runtime/testdata/testprogcgo/notingo.go
index 5af4c00e1f..a385ae24d6 100644
--- a/src/runtime/testdata/testprogcgo/notingo.go
+++ b/src/runtime/testdata/testprogcgo/notingo.go
@@ -12,6 +12,7 @@ package main
 #include <pthread.h>
 
 extern void Ready();
+extern void BlockForeverInGo();
 
 static _Atomic int spinning;
 static _Atomic int released;
@@ -40,6 +41,21 @@ static void Release() {
 	atomic_store(&spinning, 0);
 	atomic_store(&released, 1);
 }
+
+static void* enterGoThenWait(void* arg __attribute__ ((unused))) {
+	BlockForeverInGo();
+	return NULL;
+}
+
+static void WaitInGoInNewCThread() {
+	pthread_t tid;
+	pthread_create(&tid, NULL, enterGoThenWait, NULL);
+}
+
+static void SpinForever() {
+	atomic_fetch_add(&spinning, 1);
+	while(1) {};
+}
 */
 import "C"
 
@@ -47,15 +63,62 @@ import (
 	"os"
 	"runtime"
 	"runtime/metrics"
+	"sync/atomic"
 )
 
 func init() {
-	register("NotInGoMetricCallback", NotInGoMetricCallback)
+	register("NotInGoMetricCgoCall", NotInGoMetricCgoCall)
+	register("NotInGoMetricCgoCallback", NotInGoMetricCgoCallback)
+	register("NotInGoMetricCgoCallAndCallback", NotInGoMetricCgoCallAndCallback)
 }
 
-func NotInGoMetricCallback() {
+// NotInGoMetric just double-checks that N goroutines in cgo count as the metric reading N.
+func NotInGoMetricCgoCall() {
 	const N = 10
+
+	// Spin up the same number of goroutines that will all wait in a cgo call.
+	for range N {
+		go func() {
+			C.SpinForever()
+		}()
+	}
+
+	// Make sure we're all blocked and spinning.
+	for C.Spinning() < N {
+	}
+
+	// Read not-in-go before taking the Ps back.
 	s := []metrics.Sample{{Name: "/sched/goroutines/not-in-go:goroutines"}}
+	failed := false
+	metrics.Read(s)
+	if n := s[0].Value.Uint64(); n != N {
+		println("pre-STW: expected", N, "not-in-go goroutines, found", n)
+	}
+
+	// Do something that stops the world to take all the Ps back.
+	//
+	// This will force a re-accounting of some of the goroutines and
+	// re-checking not-in-go will help catch bugs.
+	runtime.ReadMemStats(&m)
+
+	// Read not-in-go.
+	metrics.Read(s)
+	if n := s[0].Value.Uint64(); n != N {
+		println("post-STW: expected", N, "not-in-go goroutines, found", n)
+	}
+
+	// Fail if we get a bad reading.
+	if failed {
+		os.Exit(2)
+	}
+	println("OK")
+}
+
+// NotInGoMetricCgoCallback tests that threads that called into Go, then returned
+// to C with *no* Go on the stack, are *not* counted as not-in-go in the
+// runtime/metrics package.
+func NotInGoMetricCgoCallback() {
+	const N = 10
 
 	// Create N new C threads that have called into Go at least once.
 	for range N {
@@ -90,6 +153,7 @@ func NotInGoMetricCallback() {
 	}
 
 	// Read not-in-go.
+	s := []metrics.Sample{{Name: "/sched/goroutines/not-in-go:goroutines"}}
 	metrics.Read(s)
 	if n := s[0].Value.Uint64(); n != 0 {
 		println("expected 0 not-in-go goroutines, found", n)
@@ -105,3 +169,69 @@ var readyCh = make(chan bool)
 func Ready() {
 	readyCh <- true
 }
+
+// NotInGoMetricCgoCallAndCallback tests that threads that called into Go are not
+// keeping the count of not-in-go threads negative. Specifically, needm sets
+// isExtraInC to false, breaking some of the invariants behind the not-in-go
+// runtime/metrics metric, causing the underlying count to break if we don't
+// account for this. In go.dev/cl/726964 this amounts to nGsyscallNoP being negative.
+// Unfortunately the runtime/metrics package masks a negative nGsyscallNoP because
+// it can transiently go negative due to a race. Therefore, this test checks
+// the condition by making sure not-in-go is positive when we expect it to be.
+// That is, threads in a cgo callback are *not* cancelling out threads in a
+// regular cgo call.
+func NotInGoMetricCgoCallAndCallback() {
+	const N = 10
+
+	// Spin up some threads that will do a cgo callback and just wait in Go.
+	// These threads are the ones we're worried about having the incorrect
+	// accounting that skews the count later.
+	for range N {
+		C.WaitInGoInNewCThread()
+	}
+
+	// Spin up the same number of goroutines that will all wait in a cgo call.
+	for range N {
+		go func() {
+			C.SpinForever()
+		}()
+	}
+
+	// Make sure we're all blocked and spinning.
+	for C.Spinning() < N || blockedForever.Load() < N {
+	}
+
+	// Read not-in-go before taking the Ps back.
+	s := []metrics.Sample{{Name: "/sched/goroutines/not-in-go:goroutines"}}
+	failed := false
+	metrics.Read(s)
+	if n := s[0].Value.Uint64(); n != N {
+		println("pre-STW: expected", N, "not-in-go goroutines, found", n)
+	}
+
+	// Do something that stops the world to take all the Ps back.
+	//
+	// This will force a re-accounting of some of the goroutines and
+	// re-checking not-in-go will help catch bugs.
+	runtime.ReadMemStats(&m)
+
+	// Read not-in-go.
+	metrics.Read(s)
+	if n := s[0].Value.Uint64(); n != N {
+		println("post-STW: expected", N, "not-in-go goroutines, found", n)
+	}
+
+	// Fail if we get a bad reading.
+	if failed {
+		os.Exit(2)
+	}
+	println("OK")
+}
+
+var blockedForever atomic.Uint32
+
+//export BlockForeverInGo
+func BlockForeverInGo() {
+	blockedForever.Add(1)
+	select {}
+}