Cypherpunks repositories - gostls13.git/commitdiff
runtime,runtime/pprof: clean up goroutine leak profile writing
author    Vlad Saioc <vsaioc@uber.com>
          Thu, 6 Nov 2025 09:53:28 +0000 (09:53 +0000)
committer Michael Knyszek <mknyszek@google.com>
          Wed, 12 Nov 2025 16:08:34 +0000 (08:08 -0800)
Cleaned up goroutine leak profile extraction:
- removed the acquisition of the goroutineProfile semaphore
- inlined the call to saveg when recording stacks instead of using
  doRecordGoroutineProfile, which had side effects on
  goroutineProfile fields.

Added regression tests for the goroutine leak profiling frontend,
covering the binary and debug=1 profile formats.
Added stress tests for concurrent goroutine and goroutine leak profile
requests.
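
For reference, a hedged usage sketch of the profiling frontend these tests
exercise. The profile name "goroutineleak" is an assumption inferred from the
"goroutineleak profile: total" header parsed in pprof_test.go below; it is not
spelled out elsewhere in this change.

	// Sketch: dump the goroutine leak profile in the debug=1 text format.
	// "goroutineleak" is an assumed registration name (see note above).
	package main

	import (
		"os"
		"runtime/pprof"
	)

	func main() {
		if p := pprof.Lookup("goroutineleak"); p != nil {
			// debug=1 yields the human-readable output the tests parse.
			p.WriteTo(os.Stdout, 1)
		}
	}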

Change-Id: I55c1bcef11e9a7fb7699b4c5a2353e594d3e7173
GitHub-Last-Rev: 5e9eb3b1d80c4d2d9b668a01f6b39a7b42f7bb45
GitHub-Pull-Request: golang/go#76045
Reviewed-on: https://go-review.googlesource.com/c/go/+/714580
Reviewed-by: Michael Pratt <mpratt@google.com>
Reviewed-by: Michael Knyszek <mknyszek@google.com>
LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com>
Reviewed-by: Junyang Shao <shaojunyang@google.com>
src/runtime/mprof.go
src/runtime/pprof/pprof.go
src/runtime/pprof/pprof_test.go

index d745d5f5b9588576abb1025282d3270ca49e5acc..f1703a7ebab80210a47e8da22fcda53dc28b3ae5 100644 (file)
@@ -1261,7 +1261,7 @@ func goroutineProfileWithLabels(p []profilerecord.StackRecord, labels []unsafe.P
 
 //go:linkname pprof_goroutineLeakProfileWithLabels
 func pprof_goroutineLeakProfileWithLabels(p []profilerecord.StackRecord, labels []unsafe.Pointer) (n int, ok bool) {
-       return goroutineLeakProfileWithLabelsConcurrent(p, labels)
+       return goroutineLeakProfileWithLabels(p, labels)
 }
 
 // labels may be nil. If labels is non-nil, it must have the same length as p.
@@ -1323,30 +1323,26 @@ func goroutineLeakProfileWithLabelsConcurrent(p []profilerecord.StackRecord, lab
                return work.goroutineLeak.count, false
        }
 
-       // Use the same semaphore as goroutineProfileWithLabelsConcurrent,
-       // because ultimately we still use goroutine profiles.
-       semacquire(&goroutineProfile.sema)
-
-       // Unlike in goroutineProfileWithLabelsConcurrent, we don't need to
-       // save the current goroutine stack, because it is obviously not leaked.
-
        pcbuf := makeProfStack() // see saveg() for explanation
 
        // Prepare a profile large enough to store all leaked goroutines.
        n = work.goroutineLeak.count
 
        if n > len(p) {
-               // There's not enough space in p to store the whole profile, so (per the
-               // contract of runtime.GoroutineProfile) we're not allowed to write to p
-               // at all and must return n, false.
-               semrelease(&goroutineProfile.sema)
+               // There's not enough space in p to store the whole profile, so
+               // we're not allowed to write to p at all and must return n, false.
                return n, false
        }
 
        // Visit each leaked goroutine and try to record its stack.
+       var offset int
        forEachGRace(func(gp1 *g) {
-               if readgstatus(gp1) == _Gleaked {
-                       doRecordGoroutineProfile(gp1, pcbuf)
+               if readgstatus(gp1)&^_Gscan == _Gleaked {
+                       systemstack(func() { saveg(^uintptr(0), ^uintptr(0), gp1, &p[offset], pcbuf) })
+                       if labels != nil {
+                               labels[offset] = gp1.labels
+                       }
+                       offset++
                }
        })
 
@@ -1354,7 +1350,6 @@ func goroutineLeakProfileWithLabelsConcurrent(p []profilerecord.StackRecord, lab
                raceacquire(unsafe.Pointer(&labelSync))
        }
 
-       semrelease(&goroutineProfile.sema)
        return n, true
 }
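
The "return n, false" path above follows the same contract as
runtime.GoroutineProfile: if the slice is too small, nothing is written and
the caller is expected to grow the buffer and retry. A minimal caller-side
sketch of that contract, using the exported runtime.GoroutineProfile as a
stand-in for the internal leak variant:

	package main

	import "runtime"

	// snapshot grows its buffer until the whole profile fits, per the
	// n/ok contract referenced in the comment above.
	func snapshot() []runtime.StackRecord {
		p := make([]runtime.StackRecord, 32)
		for {
			n, ok := runtime.GoroutineProfile(p)
			if ok {
				return p[:n]
			}
			// Too small: nothing was written to p; add headroom and retry.
			p = make([]runtime.StackRecord, n+n/2)
		}
	}

	func main() {
		_ = snapshot()
	}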
 
index b524e992b8b2090e04d80f38941870954e2e8745..78d00af6cac2fd649e0ae24a524c8bf9db3bd0cd 100644 (file)
@@ -228,6 +228,31 @@ var mutexProfile = &Profile{
        write: writeMutex,
 }
 
+// goroutineLeakProfileLock ensures that the goroutine leak profile writer observes the
+// leaked goroutines discovered during the goroutine leak detection GC cycle
+// that was triggered by the profile request.
+// This prevents a race condition between the garbage collector and the profile writer
+// when multiple profile requests are issued concurrently: the status of leaked goroutines
+// is reset to _Gwaiting at the beginning of a leak detection cycle, which may lead the
+// profile writer of another concurrent request to produce an incomplete profile.
+//
+// Example trace:
+//
+//     G1                    | GC                          | G2
+//     ----------------------+-----------------------------+---------------------
+//     Request profile       | .                           | .
+//     .                     | .                           | Request profile
+//     .                     | [G1] Resets leaked g status | .
+//     .                     | [G1] Leaks detected         | .
+//     .                     | <New cycle>                 | .
+//     .                     | [G2] Resets leaked g status | .
+//     Write profile         | .                           | .
+//     .                     | [G2] Leaks detected         | .
+//     .                     | .                           | Write profile
+//     ----------------------+-----------------------------+---------------------
+//     Incomplete profile    |+++++++++++++++++++++++++++++| Complete profile
+var goroutineLeakProfileLock sync.Mutex
+
 func lockProfiles() {
        profiles.mu.Lock()
        if profiles.m == nil {
@@ -763,6 +788,15 @@ func writeGoroutine(w io.Writer, debug int) error {
 // writeGoroutineLeak first invokes a GC cycle that performs goroutine leak detection.
 // It then writes the goroutine profile, filtering for leaked goroutines.
 func writeGoroutineLeak(w io.Writer, debug int) error {
+       // Acquire the goroutine leak detection lock and release
+       // it after the goroutine leak profile is written.
+       //
+       // While the critical section is long, this is needed to prevent
+       // a race condition between the garbage collector and the goroutine
+       // leak profile writer when multiple profile requests are issued concurrently.
+       goroutineLeakProfileLock.Lock()
+       defer goroutineLeakProfileLock.Unlock()
+
        // Run the GC with leak detection first so that leaked goroutines
        // may transition to the leaked state.
        runtime_goroutineLeakGC()
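
The lock added above follows a simple pattern: keep the leak-detection GC
cycle and the subsequent profile write inside one critical section, so a
concurrent request cannot reset goroutine statuses in between (see the trace
table in the comment). A minimal sketch of that pattern; detectLeaks and
writeLeakProfile are hypothetical stand-ins, not runtime APIs:

	package main

	import (
		"io"
		"sync"
	)

	// Hypothetical stand-ins for the runtime hooks used by writeGoroutineLeak.
	var (
		detectLeaks      = func() {}                            // would run the leak-detection GC cycle
		writeLeakProfile = func(io.Writer) error { return nil } // would write goroutines still marked leaked
	)

	var leakProfileMu sync.Mutex

	// leakProfileRequest serializes "detect, then write" so the leaks observed
	// by the write phase are the ones marked by this request's GC cycle.
	func leakProfileRequest(w io.Writer) error {
		leakProfileMu.Lock()
		defer leakProfileMu.Unlock()
		detectLeaks()
		return writeLeakProfile(w)
	}

	func main() {
		_ = leakProfileRequest(io.Discard)
	}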
index adbd1932a43210b7f8dae1d52aa8b3afd29f5815..c1d4f06595f224b571f9e28710cf05f1fcf07f7a 100644 (file)
@@ -774,7 +774,11 @@ func TestMorestack(t *testing.T) {
                for {
                        go func() {
                                growstack1()
-                               c <- true
+                               // NOTE(vsaioc): This goroutine may leak without this select.
+                               select {
+                               case c <- true:
+                               case <-time.After(duration):
+                               }
                        }()
                        select {
                        case <-t:
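
The TestMorestack fix in the hunk above shows a general way to keep a
result-delivering goroutine from leaking when its receiver may have moved on:
time out the send instead of blocking forever. A small standalone sketch of
that pattern (names here are illustrative, not taken from the test):

	package main

	import "time"

	// deliver tries to hand off a result but gives up after timeout, so the
	// sending goroutine cannot leak if nobody is receiving anymore.
	func deliver(c chan<- bool, timeout time.Duration) {
		select {
		case c <- true:
			// Result received.
		case <-time.After(timeout):
			// No receiver in time; return instead of blocking forever.
		}
	}

	func main() {
		c := make(chan bool)
		go deliver(c, 10*time.Millisecond)
		// Intentionally never receive from c; deliver still returns.
		time.Sleep(50 * time.Millisecond)
	}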
@@ -1565,6 +1569,210 @@ func containsCountsLabels(prof *profile.Profile, countLabels map[int64]map[strin
        return true
 }
 
+func goroutineLeakExample() {
+       <-make(chan struct{})
+       panic("unreachable")
+}
+
+func TestGoroutineLeakProfileConcurrency(t *testing.T) {
+       const leakCount = 3
+
+       testenv.MustHaveParallelism(t)
+       regexLeakCount := regexp.MustCompile("goroutineleak profile: total ")
+       whiteSpace := regexp.MustCompile("\\s+")
+
+       // Regular goroutine profile. Used to check that there is no interference between
+       // the two profile types.
+       goroutineProf := Lookup("goroutine")
+       goroutineLeakProf := goroutineLeakProfile
+
+       // Check that a profile with debug information contains the expected leaked goroutine.
+       includesLeak := func(t *testing.T, name, s string) {
+               if !strings.Contains(s, "runtime/pprof.goroutineLeakExample") {
+                       t.Errorf("%s profile does not contain expected leaked goroutine (runtime/pprof.goroutineLeakExample): %s", name, s)
+               }
+       }
+
+       checkFrame := func(i int, j int, locations []*profile.Location, expectedFunctionName string) {
+               if len(locations) <= i {
+                       t.Errorf("leaked goroutine stack locations out of range at %d of %d", i+1, len(locations))
+                       return
+               }
+               location := locations[i]
+               if len(location.Line) <= j {
+                       t.Errorf("leaked goroutine stack location lines out of range at %d of %d", j+1, len(location.Line))
+                       return
+               }
+               if location.Line[j].Function.Name != expectedFunctionName {
+                       t.Errorf("leaked goroutine stack expected %s as the location[%d].Line[%d] but found %s (%s:%d)", expectedFunctionName, i, j, location.Line[j].Function.Name, location.Line[j].Function.Filename, location.Line[j].Line)
+               }
+       }
+
+       // We use this helper to count the total number of leaked goroutines in the profile.
+       //
+       // NOTE(vsaioc): This value should match for the number of leaks produced in this test,
+       // but other tests could also leak goroutines, in which case we would have a mismatch
+       // when bulk-running tests.
+       //
+       // The two mismatching outcomes are therefore:
+       //      - More leaks than expected, which is a correctness issue with other tests.
+       //              In this case, this test effectively checks other tests wrt
+       //              goroutine leaks during bulk executions (e.g., running all.bash).
+       //
+       //      - Fewer leaks than expected; this is an unfortunate symptom of scheduling
+       //              non-determinism, which may occur once in a blue moon. We make
+       //              a best-effort attempt to allow the expected leaks to occur, by yielding
+       //              the main thread, but it is never a guarantee.
+       countLeaks := func(t *testing.T, number int, s string) {
+               // Strip the profile header
+               parts := regexLeakCount.Split(s, -1)
+               if len(parts) < 2 {
+                       t.Fatalf("goroutineleak profile does not contain 'goroutineleak profile: total ': %s\nparts: %v", s, parts)
+                       return
+               }
+
+               parts = whiteSpace.Split(parts[1], -1)
+
+               count, err := strconv.ParseInt(parts[0], 10, 64)
+               if err != nil {
+                       t.Fatalf("goroutineleak profile count is not a number: %s\nerror: %v", s, err)
+               }
+
+               // Check that the total number of leaked goroutines is exactly the expected number.
+               if count != int64(number) {
+                       t.Errorf("goroutineleak profile does not contain exactly %d leaked goroutines: %d", number, count)
+               }
+       }
+
+       checkLeakStack := func(t *testing.T) func(pc uintptr, locations []*profile.Location, _ map[string][]string) {
+               return func(pc uintptr, locations []*profile.Location, _ map[string][]string) {
+                       if pc != leakCount {
+                               t.Errorf("expected %d leaked goroutines with specific stack configurations, but found %d", leakCount, pc)
+                               return
+                       }
+                       switch len(locations) {
+                       case 4:
+                               // We expect a receive operation. This is the typical stack.
+                               checkFrame(0, 0, locations, "runtime.gopark")
+                               checkFrame(1, 0, locations, "runtime.chanrecv")
+                               checkFrame(2, 0, locations, "runtime.chanrecv1")
+                               switch len(locations[3].Line) {
+                               case 2:
+                                       // Running `go func() { goroutineLeakExample() }()` will produce a stack with 2 lines.
+                                       // The anonymous function will have the call to goroutineLeakExample inlined.
+                                       checkFrame(3, 1, locations, "runtime/pprof.TestGoroutineLeakProfileConcurrency.func5")
+                                       fallthrough
+                               case 1:
+                                       // Running `go goroutineLeakExample()` will produce a stack with 1 line.
+                                       checkFrame(3, 0, locations, "runtime/pprof.goroutineLeakExample")
+                               default:
+                                       t.Errorf("leaked goroutine stack location expected 1 or 2 lines in the 4th location but found %d", len(locations[3].Line))
+                                       return
+                               }
+                       default:
+                               message := fmt.Sprintf("leaked goroutine stack expected 4 or 5 locations but found %d", len(locations))
+                               for _, location := range locations {
+                                       for _, line := range location.Line {
+                                               message += fmt.Sprintf("\n%s:%d", line.Function.Name, line.Line)
+                                       }
+                               }
+                               t.Errorf("%s", message)
+                       }
+               }
+       }
+       // Leak some goroutines that will feature in the goroutine leak profile
+       for i := 0; i < leakCount; i++ {
+               go goroutineLeakExample()
+               go func() {
+                       // Leak another goroutine that will feature a slightly different stack.
+                       // This includes the frame runtime/pprof.TestGoroutineLeakProfileConcurrency.func1.
+                       goroutineLeakExample()
+                       panic("unreachable")
+               }()
+               // Yield several times to allow the goroutines to leak.
+               runtime.Gosched()
+               runtime.Gosched()
+       }
+
+       // Give all goroutines a chance to leak.
+       time.Sleep(time.Second)
+
+       t.Run("profile contains leak", func(t *testing.T) {
+               var w strings.Builder
+               goroutineLeakProf.WriteTo(&w, 0)
+               parseProfile(t, []byte(w.String()), checkLeakStack(t))
+       })
+
+       t.Run("leak persists between sequential profiling runs", func(t *testing.T) {
+               for i := 0; i < 2; i++ {
+                       var w strings.Builder
+                       goroutineLeakProf.WriteTo(&w, 0)
+                       parseProfile(t, []byte(w.String()), checkLeakStack(t))
+               }
+       })
+
+       // Concurrent calls to the goroutine leak profiler should not trigger data races
+       // or corruption.
+       t.Run("overlapping profile requests", func(t *testing.T) {
+               ctx := context.Background()
+               ctx, cancel := context.WithTimeout(ctx, time.Second)
+               defer cancel()
+
+               var wg sync.WaitGroup
+               for i := 0; i < 2; i++ {
+                       wg.Add(1)
+                       Do(ctx, Labels("i", fmt.Sprint(i)), func(context.Context) {
+                               go func() {
+                                       defer wg.Done()
+                                       for ctx.Err() == nil {
+                                               var w strings.Builder
+                                               goroutineLeakProf.WriteTo(&w, 1)
+                                               countLeaks(t, 2*leakCount, w.String())
+                                               includesLeak(t, "goroutineleak", w.String())
+                                       }
+                               }()
+                       })
+               }
+               wg.Wait()
+       })
+
+       // Concurrent calls to the goroutine leak profiler should not trigger data races
+       // or corruption, or interfere with regular goroutine profiles.
+       t.Run("overlapping goroutine and goroutine leak profile requests", func(t *testing.T) {
+               ctx := context.Background()
+               ctx, cancel := context.WithTimeout(ctx, time.Second)
+               defer cancel()
+
+               var wg sync.WaitGroup
+               for i := 0; i < 2; i++ {
+                       wg.Add(2)
+                       Do(ctx, Labels("i", fmt.Sprint(i)), func(context.Context) {
+                               go func() {
+                                       defer wg.Done()
+                                       for ctx.Err() == nil {
+                                               var w strings.Builder
+                                               goroutineLeakProf.WriteTo(&w, 1)
+                                               countLeaks(t, 2*leakCount, w.String())
+                                               includesLeak(t, "goroutineleak", w.String())
+                                       }
+                               }()
+                               go func() {
+                                       defer wg.Done()
+                                       for ctx.Err() == nil {
+                                               var w strings.Builder
+                                               goroutineProf.WriteTo(&w, 1)
+                                               // The regular goroutine profile should see the leaked
+                                               // goroutines. We simply check that the goroutine leak
+                                               // profile does not corrupt the goroutine profile state.
+                                               includesLeak(t, "goroutine", w.String())
+                                       }
+                               }()
+                       })
+               }
+               wg.Wait()
+       })
+}
+
 func TestGoroutineProfileConcurrency(t *testing.T) {
        testenv.MustHaveParallelism(t)