runtime: for deep stacks, print both the top 50 and bottom 50 frames

author Austin Clements <austin@google.com>

Mon, 13 Mar 2023 18:02:16 +0000 (14:02 -0400)

committer Austin Clements <austin@google.com>

Tue, 21 Mar 2023 19:14:14 +0000 (19:14 +0000)
author Austin Clements <austin@google.com>
Mon, 13 Mar 2023 18:02:16 +0000 (14:02 -0400)
committer Austin Clements <austin@google.com>
Tue, 21 Mar 2023 19:14:14 +0000 (19:14 +0000)
diff --git a/doc/go1.21.html b/doc/go1.21.html

index 0af9d373afa0ddc0b5ebb2555273cc406babfc97..38678a93c266fc5b8d2922a6c9e61c748afe2180 100644 (file)
--- a/doc/go1.21.html
+++ b/doc/go1.21.html
@@ -49,6 +49,14 @@ Do not send CLs removing the interior tags from such phrases.
    TODO: complete this section, or delete if not needed
  </p>
  
+<p><!-- https://go.dev/issue/7181 -->
+  When printing very deep stacks, the runtime now prints the first 50
+  (innermost) frames followed by the bottom 50 (outermost) frames,
+  rather than just printing the first 100 frames. This makes it easier
+  to see how deeply recursive stacks started, and is especially
+  valuable for debugging stack overflows.
+</p>
+
  <h2 id="compiler">Compiler</h2>
  
  <p>
diff --git a/src/runtime/export_test.go b/src/runtime/export_test.go

index 31d32eabebd15eb9b30d21342613f97338c05e3f..367362b6721779c6dd642206185a61bc8dc5b333 100644 (file)
--- a/src/runtime/export_test.go
+++ b/src/runtime/export_test.go
@@ -47,6 +47,9 @@ var NetpollGenericInit = netpollGenericInit
  var Memmove = memmove
  var MemclrNoHeapPointers = memclrNoHeapPointers
  
+const TracebackInnerFrames = tracebackInnerFrames
+const TracebackOuterFrames = tracebackOuterFrames
+
  var LockPartialOrder = lockPartialOrder
  
  type LockRank lockRank
diff --git a/src/runtime/proc.go b/src/runtime/proc.go

index 533cf20838034fc2230323eeabf5aed7e99a0113..76ed8966dc8b101d0fb488fd225cdf0f5eb499e5 100644 (file)
--- a/src/runtime/proc.go
+++ b/src/runtime/proc.go
@@ -4356,7 +4356,7 @@ func saveAncestors(callergp *g) *[]ancestorInfo {
         ancestors := make([]ancestorInfo, n)
         copy(ancestors[1:], callerAncestors)
  
-       var pcs [_TracebackMaxFrames]uintptr
+       var pcs [tracebackInnerFrames]uintptr
         npcs := gcallers(callergp, 0, pcs[:])
         ipcs := make([]uintptr, npcs)
         copy(ipcs, pcs[:])
diff --git a/src/runtime/runtime2.go b/src/runtime/runtime2.go

index 2e98f895a37b40e353f4ef3a18a0f617818dafa2..a9706a642e802f0aa32e5d196c7cc033a8f6638b 100644 (file)
--- a/src/runtime/runtime2.go
+++ b/src/runtime/runtime2.go
@@ -1049,9 +1049,6 @@ type ancestorInfo struct {
         gopc uintptr   // pc of go statement that created this goroutine
  }
  
-// The maximum number of frames we print for a traceback
-const _TracebackMaxFrames = 100
-
  // A waitReason explains why a goroutine has been stopped.
  // See gopark. Do not re-use waitReasons, add new ones.
  type waitReason uint8
diff --git a/src/runtime/traceback.go b/src/runtime/traceback.go

index 9dfa97f6581d84fd31402cbe18c03eac6f2911db..eb5e67eb810c32b99888229193bc30cc3fa5a145 100644 (file)
--- a/src/runtime/traceback.go
+++ b/src/runtime/traceback.go
@@ -21,6 +21,17 @@ import (
  
  const usesLR = sys.MinFrameSize > 0
  
+const (
+       // tracebackInnerFrames is the number of innermost frames to print in a
+       // stack trace. The total maximum frames is tracebackInnerFrames +
+       // tracebackOuterFrames.
+       tracebackInnerFrames = 50
+
+       // tracebackOuterFrames is the number of outermost frames to print in a
+       // stack trace.
+       tracebackOuterFrames = 50
+)
+
  // unwindFlags control the behavior of various unwinders.
  type unwindFlags uint8
  
@@ -812,18 +823,80 @@ func traceback1(pc, sp, lr uintptr, gp *g, flags unwindFlags) {
                 flags &^= unwindTrap
         }
  
-       // Print traceback. By default, omits runtime frames.
-       // If that means we print nothing at all, repeat forcing all frames printed.
+       // Print traceback.
+       //
+       // We print the first tracebackInnerFrames frames, and the last
+       // tracebackOuterFrames frames. There are many possible approaches to this.
+       // There are various complications to this:
+       //
+       // - We'd prefer to walk the stack once because in really bad situations
+       //   traceback may crash (and we want as much output as possible) or the stack
+       //   may be changing.
+       //
+       // - Each physical frame can represent several logical frames, so we might
+       //   have to pause in the middle of a physical frame and pick up in the middle
+       //   of a physical frame.
+       //
+       // - The cgo symbolizer can expand a cgo PC to more than one logical frame,
+       //   and involves juggling state on the C side that we don't manage. Since its
+       //   expansion state is managed on the C side, we can't capture the expansion
+       //   state part way through, and because the output strings are managed on the
+       //   C side, we can't capture the output. Thus, our only choice is to replay a
+       //   whole expansion, potentially discarding some of it.
+       //
+       // Rejected approaches:
+       //
+       // - Do two passes where the first pass just counts and the second pass does
+       //   all the printing. This is undesireable if the stack is corrupted or changing
+       //   because we won't see a partial stack if we panic.
+       //
+       // - Keep a ring buffer of the last N logical frames and use this to print
+       //   the bottom frames once we reach the end of the stack. This works, but
+       //   requires keeping a surprising amount of state on the stack, and we have
+       //   to run the cgo symbolizer twice—once to count frames, and a second to
+       //   print them—since we can't retain the strings it returns.
+       //
+       // Instead, we print the outer frames, and if we reach that limit, we clone
+       // the unwinder, count the remaining frames, and then skip forward and
+       // finish printing from the clone. This makes two passes over the outer part
+       // of the stack, but the single pass over the inner part ensures that's
+       // printed immediately and not revisited. It keeps minimal state on the
+       // stack. And through a combination of skip counts and limits, we can do all
+       // of the steps we need with a single traceback printer implementation.
+       //
+       // We could be more lax about exactly how many frames we print, for example
+       // always stopping and resuming on physical frame boundaries, or at least
+       // cgo expansion boundaries. It's not clear that's much simpler.
         flags |= unwindPrintErrors
         var u unwinder
-       u.initAt(pc, sp, lr, gp, flags)
-       n := traceback2(&u, false)
-       if n == 0 {
+       tracebackWithRuntime := func(showRuntime bool) int {
+               const maxInt int = 0x7fffffff
                 u.initAt(pc, sp, lr, gp, flags)
-               n = traceback2(&u, true)
+               n, lastN := traceback2(&u, showRuntime, 0, tracebackInnerFrames)
+               if n < tracebackInnerFrames {
+                       // We printed the whole stack.
+                       return n
+               }
+               // Clone the unwinder and figure out how many frames are left. This
+               // count will include any logical frames already printed for u's current
+               // physical frame.
+               u2 := u
+               remaining, _ := traceback2(&u, showRuntime, maxInt, 0)
+               elide := remaining - lastN - tracebackOuterFrames
+               if elide > 0 {
+                       print("...", elide, " frames elided...\n")
+                       traceback2(&u2, showRuntime, lastN+elide, tracebackOuterFrames)
+               } else if elide <= 0 {
+                       // There are tracebackOuterFrames or fewer frames left to print.
+                       // Just print the rest of the stack.
+                       traceback2(&u2, showRuntime, lastN, tracebackOuterFrames)
+               }
+               return n
         }
-       if n == _TracebackMaxFrames {
-               print("...additional frames elided...\n")
+       // By default, omits runtime frames. If that means we print nothing at all,
+       // repeat forcing all frames printed.
+       if tracebackWithRuntime(false) == 0 {
+               tracebackWithRuntime(true)
         }
         printcreatedby(gp)
  
@@ -835,15 +908,38 @@ func traceback1(pc, sp, lr uintptr, gp *g, flags unwindFlags) {
         }
  }
  
-func traceback2(u *unwinder, showRuntime bool) int {
+// traceback2 prints a stack trace starting at u. It skips the first "skip"
+// logical frames, after which it prints at most "max" logical frames. It
+// returns n, which is the number of logical frames skipped and printed, and
+// lastN, which is the number of logical frames skipped or printed just in the
+// phyiscal frame that u references.
+func traceback2(u *unwinder, showRuntime bool, skip, max int) (n, lastN int) {
+       // commitFrame commits to a logical frame and returns whether this frame
+       // should be printed and whether iteration should stop.
+       commitFrame := func() (pr, stop bool) {
+               if skip == 0 && max == 0 {
+                       // Stop
+                       return false, true
+               }
+               n++
+               lastN++
+               if skip > 0 {
+                       // Skip
+                       skip--
+                       return false, false
+               }
+               // Print
+               max--
+               return true, false
+       }
+
         gp := u.g.ptr()
         level, _, _ := gotraceback()
-       n := 0
-       const max = _TracebackMaxFrames
         var cgoBuf [32]uintptr
-       for ; n < max && u.valid(); u.next() {
+       for ; u.valid(); u.next() {
+               lastN = 0
                 f := u.frame.fn
-               for iu, uf := newInlineUnwinder(f, u.symPC(), noEscapePtr(&u.cache)); n < max && uf.valid(); uf = iu.next(uf) {
+               for iu, uf := newInlineUnwinder(f, u.symPC(), noEscapePtr(&u.cache)); uf.valid(); uf = iu.next(uf) {
                         sf := iu.srcFunc(uf)
                         callee := u.calleeFuncID
                         u.calleeFuncID = sf.funcID
@@ -851,6 +947,12 @@ func traceback2(u *unwinder, showRuntime bool) int {
                                 continue
                         }
  
+                       if pr, stop := commitFrame(); stop {
+                               return
+                       } else if !pr {
+                               continue
+                       }
+
                         name := sf.name()
                         file, line := iu.fileLine(uf)
                         if name == "runtime.gopanic" {
@@ -878,34 +980,39 @@ func traceback2(u *unwinder, showRuntime bool) int {
                                 }
                         }
                         print("\n")
-                       n++
                 }
  
                 // Print cgo frames.
                 if cgoN := u.cgoCallers(cgoBuf[:]); cgoN > 0 {
                         var arg cgoSymbolizerArg
                         anySymbolized := false
+                       stop := false
                         for _, pc := range cgoBuf[:cgoN] {
-                               if n >= max {
-                                       break
-                               }
                                 if cgoSymbolizer == nil {
-                                       print("non-Go function at pc=", hex(pc), "\n")
+                                       if pr, stop := commitFrame(); stop {
+                                               break
+                                       } else if pr {
+                                               print("non-Go function at pc=", hex(pc), "\n")
+                                       }
                                 } else {
-                                       c := printOneCgoTraceback(pc, max-n, &arg)
-                                       n += c - 1 // +1 a few lines down
+                                       stop = printOneCgoTraceback(pc, commitFrame, &arg)
                                         anySymbolized = true
+                                       if stop {
+                                               break
+                                       }
                                 }
-                               n++
                         }
                         if anySymbolized {
                                 // Free symbolization state.
                                 arg.pc = 0
                                 callCgoSymbolizer(&arg)
                         }
+                       if stop {
+                               return
+                       }
                 }
         }
-       return n
+       return n, 0
  }
  
  // printAncestorTraceback prints the traceback of the given ancestor.
@@ -918,7 +1025,7 @@ func printAncestorTraceback(ancestor ancestorInfo) {
                         printAncestorTracebackFuncInfo(f, pc)
                 }
         }
-       if len(ancestor.pcs) == _TracebackMaxFrames {
+       if len(ancestor.pcs) == tracebackInnerFrames {
                 print("...additional frames elided...\n")
         }
         // Show what created goroutine, except main goroutine (goid 1).
@@ -1405,12 +1512,13 @@ func printCgoTraceback(callers *cgoCallers) {
                 return
         }
  
+       commitFrame := func() (pr, stop bool) { return true, false }
         var arg cgoSymbolizerArg
         for _, c := range callers {
                 if c == 0 {
                         break
                 }
-               printOneCgoTraceback(c, 0x7fffffff, &arg)
+               printOneCgoTraceback(c, commitFrame, &arg)
         }
         arg.pc = 0
         callCgoSymbolizer(&arg)
@@ -1418,11 +1526,16 @@ func printCgoTraceback(callers *cgoCallers) {
  
  // printOneCgoTraceback prints the traceback of a single cgo caller.
  // This can print more than one line because of inlining.
-// Returns the number of frames printed.
-func printOneCgoTraceback(pc uintptr, max int, arg *cgoSymbolizerArg) int {
-       c := 0
+// It returns the "stop" result of commitFrame.
+func printOneCgoTraceback(pc uintptr, commitFrame func() (pr, stop bool), arg *cgoSymbolizerArg) bool {
         arg.pc = pc
-       for c <= max {
+       for {
+               if pr, stop := commitFrame(); stop {
+                       return true
+               } else if !pr {
+                       continue
+               }
+
                 callCgoSymbolizer(arg)
                 if arg.funcName != nil {
                         // Note that we don't print any argument
@@ -1437,12 +1550,10 @@ func printOneCgoTraceback(pc uintptr, max int, arg *cgoSymbolizerArg) int {
                         print(gostringnocopy(arg.file), ":", arg.lineno, " ")
                 }
                 print("pc=", hex(pc), "\n")
-               c++
                 if arg.more == 0 {
-                       break
+                       return false
                 }
         }
-       return c
  }
  
  // callCgoSymbolizer calls the cgoSymbolizer function.
diff --git a/src/runtime/traceback_test.go b/src/runtime/traceback_test.go

index a47ddb61c53f72cc80a2dda255d0bf8daf5ff737..4dd1d4bae9fcc08b982b2f856341a7b334136bb1 100644 (file)
--- a/src/runtime/traceback_test.go
+++ b/src/runtime/traceback_test.go
@@ -9,8 +9,10 @@ import (
         "fmt"
         "internal/abi"
         "internal/testenv"
+       "regexp"
         "runtime"
         "runtime/debug"
+       "strconv"
         "strings"
         "sync"
         "testing"
@@ -150,6 +152,113 @@ func ttiExcluded3() *ttiResult {
  
  var testTracebackArgsBuf [1000]byte
  
+func TestTracebackElision(t *testing.T) {
+       // Test printing exactly the maximum number of frames to make sure we don't
+       // print any "elided" message, eliding exactly 1 so we have to pick back up
+       // in the paused physical frame, and eliding 10 so we have to advance the
+       // physical frame forward.
+       for _, elided := range []int{0, 1, 10} {
+               t.Run(fmt.Sprintf("elided=%d", elided), func(t *testing.T) {
+                       n := elided + runtime.TracebackInnerFrames + runtime.TracebackOuterFrames
+
+                       // Start a new goroutine so we have control over the whole stack.
+                       stackChan := make(chan string)
+                       go tteStack(n, stackChan)
+                       stack := <-stackChan
+                       tb := parseTraceback1(t, stack)
+
+                       // Check the traceback.
+                       i := 0
+                       for i < n {
+                               if len(tb.frames) == 0 {
+                                       t.Errorf("traceback ended early")
+                                       break
+                               }
+                               fr := tb.frames[0]
+                               if i == runtime.TracebackInnerFrames && elided > 0 {
+                                       // This should be an "elided" frame.
+                                       if fr.elided != elided {
+                                               t.Errorf("want %d frames elided", elided)
+                                               break
+                                       }
+                                       i += fr.elided
+                               } else {
+                                       want := fmt.Sprintf("runtime_test.tte%d", (i+1)%5)
+                                       if i == 0 {
+                                               want = "runtime/debug.Stack"
+                                       } else if i == n-1 {
+                                               want = "runtime_test.tteStack"
+                                       }
+                                       if fr.funcName != want {
+                                               t.Errorf("want %s, got %s", want, fr.funcName)
+                                               break
+                                       }
+                                       i++
+                               }
+                               tb.frames = tb.frames[1:]
+                       }
+                       if !t.Failed() && len(tb.frames) > 0 {
+                               t.Errorf("got %d more frames than expected", len(tb.frames))
+                       }
+                       if t.Failed() {
+                               t.Logf("traceback diverged at frame %d", i)
+                               off := len(stack)
+                               if len(tb.frames) > 0 {
+                                       off = tb.frames[0].off
+                               }
+                               t.Logf("traceback before error:\n%s", stack[:off])
+                               t.Logf("traceback after error:\n%s", stack[off:])
+                       }
+               })
+       }
+}
+
+// tteStack creates a stack of n logical frames and sends the traceback to
+// stack. It cycles through 5 logical frames per physical frame to make it
+// unlikely that any part of the traceback will end on a physical boundary.
+func tteStack(n int, stack chan<- string) {
+       n-- // Account for this frame
+       // This is basically a Duff's device for starting the inline stack in the
+       // right place so we wind up at tteN when n%5=N.
+       switch n % 5 {
+       case 0:
+               stack <- tte0(n)
+       case 1:
+               stack <- tte1(n)
+       case 2:
+               stack <- tte2(n)
+       case 3:
+               stack <- tte3(n)
+       case 4:
+               stack <- tte4(n)
+       default:
+               panic("unreachable")
+       }
+}
+func tte0(n int) string {
+       return tte4(n - 1)
+}
+func tte1(n int) string {
+       return tte0(n - 1)
+}
+func tte2(n int) string {
+       // tte2 opens n%5 == 2 frames. It's also the base case of the recursion,
+       // since we can open no fewer than two frames to call debug.Stack().
+       if n < 2 {
+               panic("bad n")
+       }
+       if n == 2 {
+               return string(debug.Stack())
+       }
+       return tte1(n - 1)
+}
+func tte3(n int) string {
+       return tte2(n - 1)
+}
+func tte4(n int) string {
+       return tte3(n - 1)
+}
+
  func TestTracebackArgs(t *testing.T) {
         if *flagQuick {
                 t.Skip("-quick")
@@ -586,37 +695,50 @@ type tbFrame struct {
         funcName string
         args     string
         inlined  bool
+
+       // elided is set to the number of frames elided, and the other fields are
+       // set to the zero value.
+       elided int
+
+       off int // byte offset in the traceback text of this frame
  }
  
  // parseTraceback parses a printed traceback to make it easier for tests to
  // check the result.
  func parseTraceback(t *testing.T, tb string) []*traceback {
-       lines := strings.Split(tb, "\n")
-       nLines := len(lines)
+       //lines := strings.Split(tb, "\n")
+       //nLines := len(lines)
+       off := 0
+       lineNo := 0
         fatal := func(f string, args ...any) {
-               lineNo := nLines - len(lines) + 1
                 msg := fmt.Sprintf(f, args...)
                 t.Fatalf("%s (line %d):\n%s", msg, lineNo, tb)
         }
         parseFrame := func(funcName, args string) *tbFrame {
                 // Consume file/line/etc
-               if len(lines) == 0 || !strings.HasPrefix(lines[0], "\t") {
+               if !strings.HasPrefix(tb, "\t") {
                         fatal("missing source line")
                 }
-               lines = lines[1:]
+               _, tb, _ = strings.Cut(tb, "\n")
+               lineNo++
                 inlined := args == "..."
-               return &tbFrame{funcName, args, inlined}
+               return &tbFrame{funcName: funcName, args: args, inlined: inlined, off: off}
         }
+       var elidedRe = regexp.MustCompile(`^\.\.\.([0-9]+) frames elided\.\.\.$`)
         var tbs []*traceback
         var cur *traceback
-       for len(lines) > 0 {
-               line := lines[0]
-               lines = lines[1:]
+       tbLen := len(tb)
+       for len(tb) > 0 {
+               var line string
+               off = tbLen - len(tb)
+               line, tb, _ = strings.Cut(tb, "\n")
+               lineNo++
                 switch {
                 case strings.HasPrefix(line, "goroutine "):
                         cur = &traceback{}
                         tbs = append(tbs, cur)
                 case line == "":
+                       // Separator between goroutines
                         cur = nil
                 case line[0] == '\t':
                         fatal("unexpected indent")
@@ -631,6 +753,12 @@ func parseTraceback(t *testing.T, tb string) []*traceback {
                         }
                         frame := parseFrame(funcName, args)
                         cur.frames = append(cur.frames, frame)
+               case elidedRe.MatchString(line):
+                       // "...N frames elided..."
+                       nStr := elidedRe.FindStringSubmatch(line)
+                       n, _ := strconv.Atoi(nStr[1])
+                       frame := &tbFrame{elided: n}
+                       cur.frames = append(cur.frames, frame)
                 }
         }
         return tbs
author	Austin Clements <austin@google.com>
	Mon, 13 Mar 2023 18:02:16 +0000 (14:02 -0400)
committer	Austin Clements <austin@google.com>
	Tue, 21 Mar 2023 19:14:14 +0000 (19:14 +0000)
doc/go1.21.html		patch \| blob \| history
src/runtime/export_test.go		patch \| blob \| history
src/runtime/proc.go		patch \| blob \| history
src/runtime/runtime2.go		patch \| blob \| history
src/runtime/traceback.go		patch \| blob \| history
src/runtime/traceback_test.go		patch \| blob \| history