// Must not call other functions nor access heap/globals in the loop,
// otherwise, under the race detector, the samples will be in the race runtime.
func cpuHog1(x int) int {
+ return cpuHog0(x, 1e5)
+}
+
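+// cpuHog0 is the shared busy-loop body; its callers (such as cpuHog1 and
+// inlinedCallee) control the iteration count n.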
+func cpuHog0(x, n int) int {
foo := x
- for i := 0; i < 1e5; i++ {
+ for i := 0; i < n; i++ {
if foo > 0 {
foo *= foo
} else {
}
func TestCPUProfileInlining(t *testing.T) {
- testCPUProfile(t, stackContains, []string{"runtime/pprof.inlinedCallee", "runtime/pprof.inlinedCaller"}, avoidFunctions(), func(dur time.Duration) {
+ p := testCPUProfile(t, stackContains, []string{"runtime/pprof.inlinedCallee", "runtime/pprof.inlinedCaller"}, avoidFunctions(), func(dur time.Duration) {
cpuHogger(inlinedCaller, &salt1, dur)
})
+
+ // Check that the inlined function locations are encoded correctly.
+ // inlinedCallee and inlinedCaller should appear in one Location.
+ for _, loc := range p.Location {
+ hasInlinedCallerAfterInlinedCallee, hasInlinedCallee := false, false
+ for _, line := range loc.Line {
+ if line.Function.Name == "runtime/pprof.inlinedCallee" {
+ hasInlinedCallee = true
+ }
+ if hasInlinedCallee && line.Function.Name == "runtime/pprof.inlinedCaller" {
+ hasInlinedCallerAfterInlinedCallee = true
+ }
+ }
+ if hasInlinedCallee != hasInlinedCallerAfterInlinedCallee {
+ t.Fatalf("want inlinedCallee followed by inlinedCaller, got separate Location entries:\n%v", p)
+ }
+ }
}
func inlinedCaller(x int) int {
- x = inlinedCallee(x)
+ x = inlinedCallee(x, 1e5)
return x
}
-func inlinedCallee(x int) int {
- // We could just use cpuHog1, but for loops prevent inlining
- // right now. :(
- foo := x
- i := 0
-loop:
- if foo > 0 {
- foo *= foo
- } else {
- foo *= foo + 1
+func inlinedCallee(x, n int) int {
+ return cpuHog0(x, n)
+}
+
+func TestCPUProfileRecursion(t *testing.T) {
+ p := testCPUProfile(t, stackContains, []string{"runtime/pprof.inlinedCallee", "runtime/pprof.recursionCallee", "runtime/pprof.recursionCaller"}, avoidFunctions(), func(dur time.Duration) {
+ cpuHogger(recursionCaller, &salt1, dur)
+ })
+
+ // Check that the Location encoding was not confused by recursive calls.
+ for i, loc := range p.Location {
+ recursionFunc := 0
+ for _, line := range loc.Line {
+ if name := line.Function.Name; name == "runtime/pprof.recursionCaller" || name == "runtime/pprof.recursionCallee" {
+ recursionFunc++
+ }
+ }
+ if recursionFunc > 1 {
+ t.Fatalf("want at most one recursionCaller or recursionCallee in one Location, got a violating Location (index: %d):\n%v", i, p)
+ }
}
- if i++; i < 1e5 {
- goto loop
+}
+
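+// recursionCaller and recursionCallee produce stacks that mix recursive calls
+// with the inlinable inlinedCallee, letting TestCPUProfileRecursion check that
+// recursion is not mistaken for inlining when Locations are encoded.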
+func recursionCaller(x int) int {
+ y := recursionCallee(3, x)
+ return y
+}
+
+func recursionCallee(n, x int) int {
+ if n == 0 {
+ return 1
}
- return foo
+ y := inlinedCallee(x, 1e4)
+ return y * recursionCallee(n-1, x)
}
-func parseProfile(t *testing.T, valBytes []byte, f func(uintptr, []*profile.Location, map[string][]string)) {
+func parseProfile(t *testing.T, valBytes []byte, f func(uintptr, []*profile.Location, map[string][]string)) *profile.Profile {
p, err := profile.Parse(bytes.NewReader(valBytes))
if err != nil {
t.Fatal(err)
count := uintptr(sample.Value[0])
f(count, sample.Location, sample.Label)
}
+ return p
}
// testCPUProfile runs f under the CPU profiler, checking for some conditions specified by need,
-// as interpreted by matches.
-func testCPUProfile(t *testing.T, matches matchFunc, need []string, avoid []string, f func(dur time.Duration)) {
+// as interpreted by matches, and returns the parsed profile.
+func testCPUProfile(t *testing.T, matches matchFunc, need []string, avoid []string, f func(dur time.Duration)) *profile.Profile {
switch runtime.GOOS {
case "darwin":
switch runtime.GOARCH {
f(duration)
StopCPUProfile()
- if profileOk(t, matches, need, avoid, prof, duration) {
- return
+ if p, ok := profileOk(t, matches, need, avoid, prof, duration); ok {
+ return p
}
duration *= 2
t.Skip("ignore the failure in QEMU; see golang.org/issue/9605")
}
t.FailNow()
+ return nil
}
func contains(slice []string, s string) bool {
type matchFunc func(spec string, count uintptr, stk []*profile.Location, labels map[string][]string) bool
-func profileOk(t *testing.T, matches matchFunc, need []string, avoid []string, prof bytes.Buffer, duration time.Duration) (ok bool) {
+func profileOk(t *testing.T, matches matchFunc, need []string, avoid []string, prof bytes.Buffer, duration time.Duration) (_ *profile.Profile, ok bool) {
ok = true
// Check that profile is well formed, contains 'need', and does not contain
avoidSamples := make([]uintptr, len(avoid))
var samples uintptr
var buf bytes.Buffer
- parseProfile(t, prof.Bytes(), func(count uintptr, stk []*profile.Location, labels map[string][]string) {
+ p := parseProfile(t, prof.Bytes(), func(count uintptr, stk []*profile.Location, labels map[string][]string) {
fmt.Fprintf(&buf, "%d:", count)
fprintStack(&buf, stk)
samples += count
// not enough samples due to coarse timer
// resolution. Let it go.
t.Log("too few samples on Windows (golang.org/issue/10842)")
- return false
+ return p, false
}
// Check that we got a reasonable number of samples.
}
if len(need) == 0 {
- return ok
+ return p, ok
}
var total uintptr
ok = false
}
}
- return ok
+ return p, ok
}
// Fork can hang if preempted with signals frequently enough (see issue 5517).
pb protobuf
strings []string
stringMap map[string]int
- locs map[uintptr]int
- funcs map[string]int // Package path-qualified function name to Function.ID
+ locs map[uintptr]locInfo // location info for the Location whose pc sequence starts with the given PC.
+ funcs map[string]int // Package path-qualified function name to Function.ID
mem []memMap
}
b.pb.endMessage(tag, start)
}
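+// allFrames expands a single return PC into its runtime.Frames (one frame,
+// plus additional frames for any calls inlined at that PC) and returns a
+// symbolizeFlag recording whether symbolization was attempted and whether it failed.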
+func allFrames(addr uintptr) ([]runtime.Frame, symbolizeFlag) {
+ // Expand this one address using CallersFrames so we can cache
+ // each expansion. In general, CallersFrames takes a whole
+ // stack, but in this case we know there will be no skips in
+ // the stack and we have return PCs anyway.
+ frames := runtime.CallersFrames([]uintptr{addr})
+ frame, more := frames.Next()
+ if frame.Function == "runtime.goexit" {
+ // Short-circuit if we see runtime.goexit so the loop
+ // below doesn't allocate a useless empty location.
+ return nil, 0
+ }
+
+ symbolizeResult := lookupTried
+ if frame.PC == 0 || frame.Function == "" || frame.File == "" || frame.Line == 0 {
+ symbolizeResult |= lookupFailed
+ }
+
+ if frame.PC == 0 {
+ // If we failed to resolve the frame, at least make up
+ // a reasonable call PC. This mostly happens in tests.
+ frame.PC = addr - 1
+ }
+ ret := []runtime.Frame{frame}
+ for frame.Function != "runtime.goexit" && more {
+ frame, more = frames.Next()
+ ret = append(ret, frame)
+ }
+ return ret, symbolizeResult
+}
+
// locForPC returns the location ID for addr.
// addr must be a return PC or 1 + the PC of an inline marker. This returns the location of the corresponding call.
// It may emit to b.pb, so there must be no message encoding in progress.
func (b *profileBuilder) locForPC(addr uintptr) uint64 {
- id := uint64(b.locs[addr])
- if id != 0 {
- return id
+ if loc, ok := b.locs[addr]; ok {
+ return loc.id
}
// Expand this one address using CallersFrames so we can cache
}
newFuncs := make([]newFunc, 0, 8)
- id = uint64(len(b.locs)) + 1
- b.locs[addr] = int(id)
+ id := uint64(len(b.locs)) + 1
+ b.locs[addr] = locInfo{id: id, pcs: []uintptr{addr}}
start := b.pb.startMessage()
b.pb.uint64Opt(tagLocation_ID, id)
b.pb.uint64Opt(tagLocation_Address, uint64(frame.PC))
return id
}
+type locInfo struct {
+ // location id assigned by the profileBuilder
+ id uint64
+
+ // sequence of PCs, including the fake PCs returned by the traceback
+ // to represent inlined functions
+ // https://github.com/golang/go/blob/d6f2f833c93a41ec1c68e49804b8387a06b131c5/src/runtime/traceback.go#L347-L368
+ pcs []uintptr
+}
+
// newProfileBuilder returns a new profileBuilder.
// CPU profiling data obtained from the runtime can be added
// by calling b.addCPUData, and then the eventual profile
start: time.Now(),
strings: []string{""},
stringMap: map[string]int{"": 0},
- locs: map[uintptr]int{},
+ locs: map[uintptr]locInfo{},
funcs: map[string]int{},
}
b.readMapping()
}
values := []int64{0, 0}
+
+ deck := &pcDeck{}
var locs []uint64
+
for e := b.m.all; e != nil; e = e.nextAll {
values[0] = e.count
values[1] = e.count * b.period
}
}
+ deck.reset()
locs = locs[:0]
- for i, addr := range e.stk {
- // Addresses from stack traces point to the
- // next instruction after each call, except
- // for the leaf, which points to where the
- // signal occurred. locForPC expects return
- // PCs, so increment the leaf address to look
- // like a return PC.
- if i == 0 {
- addr++
+
+ // Addresses from stack traces point to the next instruction after each call,
+ // except for the leaf, which points to where the signal occurred.
+ // deck.tryAdd and emitLocation expect return PCs, so increment the leaf
+ // address to look like a return PC.
+ e.stk[0]++
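+
+ // Walk the stack from leaf to root: reuse cached Locations where possible;
+ // otherwise accumulate PCs in the deck until a pc that cannot be an inline
+ // continuation forces a flush via emitLocation.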
+ for stk := e.stk; len(stk) > 0; {
+ addr := stk[0]
+ if l, ok := b.locs[addr]; ok {
+ // first record the location if there is any pending accumulated info.
+ if id := b.emitLocation(deck); id > 0 {
+ locs = append(locs, id)
+ }
+
+ // then, record the cached location.
+ locs = append(locs, l.id)
+ stk = stk[len(l.pcs):] // skip the matching pcs.
+ continue
}
- l := b.locForPC(addr)
- if l == 0 { // runtime.goexit
+
+ frames, symbolizeResult := allFrames(addr)
+ if len(frames) == 0 { // runtime.goexit.
+ if id := b.emitLocation(deck); id > 0 {
+ locs = append(locs, id)
+ }
+ stk = stk[1:]
continue
}
- locs = append(locs, l)
+
+ if added := deck.tryAdd(addr, frames, symbolizeResult); added {
+ stk = stk[1:]
+ continue
+ }
+ // tryAdd failed because this addr is not inlined with the
+ // existing PCs in the deck. Flush the deck and retry
+ // handling this pc.
+ if id := b.emitLocation(deck); id > 0 {
+ locs = append(locs, id)
+ }
+
+ // check cache again - previous emitLocation added a new entry
+ if l, ok := b.locs[addr]; ok {
+ locs = append(locs, l.id)
+ stk = stk[len(l.pcs):] // skip the matching pcs.
+ } else {
+ deck.tryAdd(addr, frames, symbolizeResult) // must succeed.
+ stk = stk[1:]
+ }
+ }
+ if id := b.emitLocation(deck); id > 0 { // emit remaining location.
+ locs = append(locs, id)
}
+ e.stk[0]-- // undo the leaf adjustment made before the loop.
+
b.pbSample(values, locs, labels)
}
b.zw.Close()
}
+// pcDeck is a helper to detect a sequence of inlined functions from
+// a stack trace returned by the runtime.
+//
+// The stack traces returned by runtime's traceback functions are fully
+// expanded (at least for Go functions) and include the fake pcs representing
+// inlined functions. The profile proto expects the inlined functions to be
+// encoded in one Location message.
+// https://github.com/google/pprof/blob/5e965273ee43930341d897407202dd5e10e952cb/proto/profile.proto#L177-L184
+//
+// Runtime does not directly expose whether a frame is for an inlined function
+// and looking up debug info is not ideal, so we use a heuristic to filter
+// the fake pcs and restore the inlined and entry functions. Inlined functions
+// have the following properties:
+//   Frame's Func is nil (note: also true for non-Go functions), and
+//   Frame's Entry matches its entry function frame's Entry (note: this could also be true for recursive calls and non-Go functions), and
+//   Frame's Name does not match its entry function frame's name.
+//
+// While reading and processing the pcs in a stack trace one by one (from leaf to root),
+// we use pcDeck to temporarily hold the observed pcs and their expanded frames
+// until we observe the entry function frame.
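+//
+// For example, if f is inlined into g, the expanded stack contains a fake pc
+// for f's frame followed by a pc in g. f's frame has a nil Func and shares
+// g's Entry, so tryAdd keeps accumulating, and the two frames are later
+// emitted together as a single Location with two Line entries.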
+type pcDeck struct {
+ pcs []uintptr
+ frames []runtime.Frame
+ symbolizeResult symbolizeFlag
+}
+
+func (d *pcDeck) reset() {
+ d.pcs = d.pcs[:0]
+ d.frames = d.frames[:0]
+ d.symbolizeResult = 0
+}
+
+// tryAdd tries to add the pc and Frames expanded from it (most likely one,
+// since the stack trace is already fully expanded) and the symbolizeResult
+// to the deck. If it fails, the caller needs to flush the deck and retry.
+func (d *pcDeck) tryAdd(pc uintptr, frames []runtime.Frame, symbolizeResult symbolizeFlag) (success bool) {
+ if existing := len(d.pcs); existing > 0 {
+ // 'frames' are all expanded from one 'pc' and represent all inlined functions
+ // so we check only the first one.
+ newFrame := frames[0]
+ last := d.frames[existing-1]
+ if last.Func != nil && newFrame.Func != nil { // Can't be an inlined frame.
+ return false
+ }
+
+ if last.Entry == 0 || newFrame.Entry == 0 { // Possibly not a Go function. Don't try to merge.
+ return false
+ }
+
+ if last.Entry != newFrame.Entry { // newFrame is for a different function.
+ return false
+ }
+ if last.Function == newFrame.Function { // maybe recursion.
+ return false
+ }
+ }
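+ // The pc either starts an empty deck or looks like an inline continuation
+ // of the frames already in it; absorb it.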
+ d.pcs = append(d.pcs, pc)
+ d.frames = append(d.frames, frames...)
+ d.symbolizeResult |= symbolizeResult
+ return true
+}
+
+// emitLocation emits the new location and function information recorded in the deck
+// and returns the location ID encoded in the profile protobuf.
+// It emits to b.pb, so there must be no message encoding in progress.
+// It resets the deck.
+func (b *profileBuilder) emitLocation(deck *pcDeck) uint64 {
+ defer deck.reset()
+
+ if len(deck.pcs) == 0 {
+ return 0
+ }
+
+ addr := deck.pcs[0]
+ firstFrame := deck.frames[0]
+
+ // We can't write out functions while in the middle of the
+ // Location message, so record new functions we encounter and
+ // write them out after the Location.
+ type newFunc struct {
+ id uint64
+ name, file string
+ }
+ newFuncs := make([]newFunc, 0, 8)
+
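+ // Cache the new Location under its first PC so that later samples can reuse
+ // the id and skip the entire run of PCs it covers.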
+ id := uint64(len(b.locs)) + 1
+ b.locs[addr] = locInfo{id: id, pcs: append([]uintptr{}, deck.pcs...)}
+
+ start := b.pb.startMessage()
+ b.pb.uint64Opt(tagLocation_ID, id)
+ b.pb.uint64Opt(tagLocation_Address, uint64(firstFrame.PC))
+ for _, frame := range deck.frames {
+ // Write out each line in frame expansion.
+ funcID := uint64(b.funcs[frame.Function])
+ if funcID == 0 {
+ funcID = uint64(len(b.funcs)) + 1
+ b.funcs[frame.Function] = int(funcID)
+ newFuncs = append(newFuncs, newFunc{funcID, frame.Function, frame.File})
+ }
+ b.pbLine(tagLocation_Line, funcID, int64(frame.Line))
+ }
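+ // Associate the Location with the memory mapping that contains addr (or the
+ // fake mapping) and record the symbolization outcome on that mapping.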
+ for i := range b.mem {
+ if b.mem[i].start <= addr && addr < b.mem[i].end || b.mem[i].fake {
+ b.pb.uint64Opt(tagLocation_MappingID, uint64(i+1))
+
+ m := b.mem[i]
+ m.funcs |= deck.symbolizeResult
+ b.mem[i] = m
+ break
+ }
+ }
+ b.pb.endMessage(tagProfile_Location, start)
+
+ // Write out functions we found during frame expansion.
+ for _, fn := range newFuncs {
+ start := b.pb.startMessage()
+ b.pb.uint64Opt(tagFunction_ID, fn.id)
+ b.pb.int64Opt(tagFunction_Name, b.stringIndex(fn.name))
+ b.pb.int64Opt(tagFunction_SystemName, b.stringIndex(fn.name))
+ b.pb.int64Opt(tagFunction_Filename, b.stringIndex(fn.file))
+ b.pb.endMessage(tagProfile_Function, start)
+ }
+
+ b.flush()
+ return id
+}
+
// readMapping reads /proc/self/maps and writes mappings to b.pb.
// It saves the address ranges of the mappings in b.mem for use
// when emitting locations.