]> Cypherpunks repositories - gostls13.git/commitdiff
runtime/pprof: avoid crash due to truncated stack traces
authorHana Kim <hyangah@gmail.com>
Mon, 18 Nov 2019 17:44:33 +0000 (12:44 -0500)
committerHyang-Ah Hana Kim <hyangah@gmail.com>
Fri, 22 Nov 2019 14:32:02 +0000 (14:32 +0000)
The profile proto message builder maintains a location entry cache
that maps a location (possibly involving multiple user frames
that represent inlined function calls) to the location id. We have
been using the first pc of the inlined call sequence as the key of
the cached location entry assuming that, for a given pc, the sequence
of frames representing the inlined call stack is deterministic and
stable. Then, when analyzing the new stack trace, we expected the
exact number of pcs to be present in the captured stack trace upon
the cache hit.

This assumption does not hold, however, in the presence of the stack
trace truncation in the runtime during profiling, and also with the
potential bugs in runtime.

A better fix is to use all the pcs of the inlined call sequece as
the key instead of the first pc. But that is a bigger code change.
This CL avoids the crash assuming the trace was truncated.

Fixes #35538

Change-Id: I8c6bae98bc8b178ee51523c7316f56b1cce6df16
Reviewed-on: https://go-review.googlesource.com/c/go/+/207609
Run-TryBot: Hyang-Ah Hana Kim <hyangah@gmail.com>
Reviewed-by: Keith Randall <khr@golang.org>
src/runtime/pprof/pprof_test.go
src/runtime/pprof/proto.go

index 64e03aeccfb963e3e46b231223d8869a0549e518..20b44e1e013a920771f734df3f7db14c112c1a19 100644 (file)
@@ -1183,6 +1183,32 @@ func TestTryAdd(t *testing.T) {
                        {Value: []int64{30, 30 * period}, Location: []*profile.Location{{ID: 1}, {ID: 1}}},
                        {Value: []int64{40, 40 * period}, Location: []*profile.Location{{ID: 1}}},
                },
+       }, {
+               name: "truncated_stack_trace_later",
+               input: []uint64{
+                       3, 0, 500, // hz = 500. Must match the period.
+                       5, 0, 50, inlinedCalleePtr, inlinedCallerPtr,
+                       4, 0, 60, inlinedCalleePtr,
+               },
+               wantLocs: [][]string{{"runtime/pprof.inlinedCallee", "runtime/pprof.inlinedCaller"}},
+               wantSamples: []*profile.Sample{
+                       {Value: []int64{50, 50 * period}, Location: []*profile.Location{{ID: 1}}},
+                       {Value: []int64{60, 60 * period}, Location: []*profile.Location{{ID: 1}}},
+               },
+       }, {
+               name: "truncated_stack_trace_first",
+               input: []uint64{
+                       3, 0, 500, // hz = 500. Must match the period.
+                       4, 0, 70, inlinedCalleePtr,
+                       5, 0, 80, inlinedCalleePtr, inlinedCallerPtr,
+               },
+               wantLocs: [][]string{ // the inline info is screwed up, but better than a crash.
+                       {"runtime/pprof.inlinedCallee"},
+                       {"runtime/pprof.inlinedCaller"}},
+               wantSamples: []*profile.Sample{
+                       {Value: []int64{70, 70 * period}, Location: []*profile.Location{{ID: 1}}},
+                       {Value: []int64{80, 80 * period}, Location: []*profile.Location{{ID: 1}, {ID: 2}}},
+               },
        }}
 
        for _, tc := range testCases {
index a42cd80c153b2d62ec23feec4ce081e86b46d685..8a30c7151d270903607e77a34834daec6cc9c0d7 100644 (file)
@@ -394,7 +394,23 @@ func (b *profileBuilder) appendLocsForStack(locs []uint64, stk []uintptr) (newLo
 
                        // then, record the cached location.
                        locs = append(locs, l.id)
-                       stk = stk[len(l.pcs):] // skip the matching pcs.
+
+                       // The stk may be truncated due to the stack depth limit
+                       // (e.g. See maxStack and maxCPUProfStack in runtime) or
+                       // bugs in runtime. Avoid the crash in either case.
+                       // TODO(hyangah): The correct fix may require using the exact
+                       // pcs as the key for b.locs cache management instead of just
+                       // relying on the very first pc. We are late in the go1.14 dev
+                       // cycle, so this is a workaround with little code change.
+                       if len(l.pcs) > len(stk) {
+                               stk = nil
+                               // TODO(hyangah): would be nice if we can enable
+                               // debug print out on demand and report the problematic
+                               // cached location entry and stack traces. Do we already
+                               // have such facility to utilize (e.g. GODEBUG)?
+                       } else {
+                               stk = stk[len(l.pcs):] // skip the matching pcs.
+                       }
                        continue
                }